Merge tag 'please-pull-put_kernel_page' of git://git.kernel.org/pub/scm/linux/kernel/git/aegl/linux
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 2 Jul 2015 21:46:15 +0000 (14:46 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 2 Jul 2015 21:46:15 +0000 (14:46 -0700)
Pull ia64 boot noise reduction fix from Tony Luck:
 "Remove some boot noise from a now-invalid check that pages are
  reserved"

* tag 'please-pull-put_kernel_page' of git://git.kernel.org/pub/scm/linux/kernel/git/aegl/linux:
  [IA64] Drop debug test/printk that some special pages are marked reserved

197 files changed:
Documentation/devicetree/bindings/usb/atmel-usb.txt
Documentation/ioctl/ioctl-number.txt
MAINTAINERS
arch/arm/boot/dts/armada-xp.dtsi
arch/arm/boot/dts/at91sam9g45.dtsi
arch/arm/boot/dts/at91sam9x5.dtsi
arch/arm/boot/dts/sama5d3.dtsi
arch/arm/boot/dts/sama5d4.dtsi
arch/arm/configs/multi_v7_defconfig
arch/arm/kernel/entry-armv.S
arch/arm/mach-bcm/Kconfig
arch/arm/mach-dove/include/mach/irqs.h
arch/arm/mach-dove/irq.c
arch/arm/mach-mvebu/headsmp-a9.S
arch/arm/mach-mvebu/platsmp-a9.c
arch/arm/mach-mvebu/pm-board.c
arch/arm/mach-rockchip/platsmp.c
arch/arm/mach-vexpress/spc.c
arch/arm64/boot/dts/apm/apm-storm.dtsi
arch/cris/arch-v10/drivers/eeprom.c
arch/cris/arch-v32/mm/intmem.c
arch/frv/mb93090-mb00/flash.c
arch/ia64/hp/sim/simscsi.c
arch/ia64/sn/kernel/mca.c
arch/mn10300/unit-asb2303/flash.c
arch/parisc/kernel/pdc_cons.c
arch/parisc/kernel/perf.c
arch/powerpc/kernel/time.c
arch/powerpc/mm/hugetlbpage.c
arch/powerpc/platforms/83xx/suspend.c
arch/powerpc/platforms/ps3/time.c
arch/powerpc/sysdev/fsl_lbc.c
arch/sh/boards/mach-highlander/psw.c
arch/sh/boards/mach-landisk/psw.c
arch/tile/kernel/usb.c
arch/x86/kernel/bootflag.c
arch/x86/kernel/cpu/perf_event_intel_bts.c
arch/x86/kernel/cpu/perf_event_intel_pt.c
arch/x86/kernel/devicetree.c
arch/x86/kernel/vsmp_64.c
arch/x86/platform/intel-mid/intel_mid_vrtc.c
arch/xtensa/platforms/iss/network.c
crypto/asymmetric_keys/pkcs7_key_type.c
drivers/block/rbd.c
drivers/char/agp/intel-gtt.c
drivers/clk/clk-max77686.c
drivers/clk/clk-max77802.c
drivers/clk/clk-nomadik.c
drivers/clk/sunxi/clk-mod0.c
drivers/cpufreq/exynos-cpufreq.c
drivers/cpufreq/s5pv210-cpufreq.c
drivers/cpuidle/cpuidle-at91.c
drivers/cpuidle/cpuidle-calxeda.c
drivers/cpuidle/cpuidle-zynq.c
drivers/gpu/drm/amd/amdgpu/amdgpu.h
drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
drivers/gpu/drm/amd/amdgpu/amdgpu_vce.h
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
drivers/gpu/drm/amd/amdgpu/cik.c
drivers/gpu/drm/amd/amdgpu/cikd.h
drivers/gpu/drm/amd/amdgpu/cz_dpm.c
drivers/gpu/drm/amd/amdgpu/cz_dpm.h
drivers/gpu/drm/amd/amdgpu/dce_v8_0.c
drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
drivers/gpu/drm/amd/amdgpu/vi.c
drivers/gpu/drm/i915/i915_gem_gtt.c
drivers/gpu/drm/i915/i915_reg.h
drivers/gpu/drm/i915/intel_display.c
drivers/gpu/drm/i915/intel_dp.c
drivers/gpu/drm/i915/intel_drv.h
drivers/gpu/drm/i915/intel_panel.c
drivers/gpu/drm/radeon/cik.c
drivers/gpu/drm/radeon/cik_sdma.c
drivers/gpu/drm/radeon/radeon_audio.c
drivers/gpu/drm/radeon/radeon_fb.c
drivers/gpu/drm/radeon/radeon_ttm.c
drivers/gpu/drm/radeon/radeon_vm.c
drivers/gpu/drm/rockchip/rockchip_drm_drv.c
drivers/gpu/drm/rockchip/rockchip_drm_vop.c
drivers/hsi/controllers/omap_ssi.h
drivers/mailbox/pl320-ipc.c
drivers/mmc/host/omap_hsmmc.c
drivers/pcmcia/xxs1500_ss.c
drivers/platform/goldfish/pdev_bus.c
drivers/power/reset/syscon-reboot.c
drivers/regulator/max77802.c
drivers/soc/qcom/spm.c
drivers/soc/tegra/pmc.c
drivers/soc/versatile/soc-realview.c
drivers/tty/metag_da.c
drivers/tty/serial/8250/8250_omap.c
drivers/tty/serial/omap-serial.c
fs/ceph/acl.c
fs/ceph/addr.c
fs/ceph/caps.c
fs/ceph/dir.c
fs/ceph/file.c
fs/ceph/inode.c
fs/ceph/mds_client.c
fs/ceph/mds_client.h
fs/ceph/snap.c
fs/ceph/super.c
fs/ceph/super.h
fs/ceph/xattr.c
fs/fuse/cuse.c
fs/fuse/dev.c
fs/fuse/file.c
fs/fuse/fuse_i.h
fs/fuse/inode.c
fs/nfs/callback.c
fs/nfs/callback_proc.c
fs/nfs/callback_xdr.c
fs/nfs/client.c
fs/nfs/dir.c
fs/nfs/file.c
fs/nfs/flexfilelayout/flexfilelayout.c
fs/nfs/flexfilelayout/flexfilelayout.h
fs/nfs/flexfilelayout/flexfilelayoutdev.c
fs/nfs/inode.c
fs/nfs/nfs3xdr.c
fs/nfs/nfs42.h
fs/nfs/nfs42proc.c
fs/nfs/nfs42xdr.c
fs/nfs/nfs4_fs.h
fs/nfs/nfs4client.c
fs/nfs/nfs4file.c
fs/nfs/nfs4getroot.c
fs/nfs/nfs4idmap.c
fs/nfs/nfs4proc.c
fs/nfs/nfs4state.c
fs/nfs/nfs4xdr.c
fs/nfs/pagelist.c
fs/nfs/pnfs.c
fs/nfs/pnfs.h
fs/nfs/write.c
fs/notify/inotify/inotify_user.c
fs/overlayfs/readdir.c
fs/overlayfs/super.c
include/linux/ceph/libceph.h
include/linux/ceph/osd_client.h
include/linux/crush/crush.h
include/linux/crush/hash.h
include/linux/crush/mapper.h
include/linux/device.h
include/linux/init.h
include/linux/nfs4.h
include/linux/nfs_fs.h
include/linux/nfs_fs_sb.h
include/linux/nfs_page.h
include/linux/nfs_xdr.h
include/linux/platform_device.h
include/linux/sunrpc/bc_xprt.h
include/linux/sunrpc/clnt.h
include/linux/sunrpc/sched.h
include/linux/sunrpc/xprt.h
include/linux/sunrpc/xprtrdma.h
include/uapi/drm/amdgpu_drm.h
include/uapi/linux/fuse.h
lib/list_sort.c
mm/nommu.c
mm/page_owner.c
net/ceph/ceph_common.c
net/ceph/crush/crush.c
net/ceph/crush/crush_ln_table.h
net/ceph/crush/hash.c
net/ceph/crush/mapper.c
net/ceph/messenger.c
net/ceph/mon_client.c
net/ceph/osd_client.c
net/ceph/osdmap.c
net/ceph/pagevec.c
net/ipv4/netfilter.c
net/sunrpc/Makefile
net/sunrpc/backchannel_rqst.c
net/sunrpc/bc_svc.c [deleted file]
net/sunrpc/clnt.c
net/sunrpc/debugfs.c
net/sunrpc/svc.c
net/sunrpc/xprt.c
net/sunrpc/xprtrdma/fmr_ops.c
net/sunrpc/xprtrdma/frwr_ops.c
net/sunrpc/xprtrdma/physical_ops.c
net/sunrpc/xprtrdma/rpc_rdma.c
net/sunrpc/xprtrdma/transport.c
net/sunrpc/xprtrdma/verbs.c
net/sunrpc/xprtrdma/xprt_rdma.h
net/sunrpc/xprtsock.c
scripts/tags.sh

Documentation/devicetree/bindings/usb/atmel-usb.txt
index 1be8d7a26c15fff480c9d9644914eeee0bec03e9..5883b73ea1b56053fbafb98f473b1f2c8a349c62 100644 (file)
@@ -79,9 +79,9 @@ Atmel High-Speed USB device controller
 
 Required properties:
  - compatible: Should be one of the following
-              "at91sam9rl-udc"
-              "at91sam9g45-udc"
-              "sama5d3-udc"
+              "atmel,at91sam9rl-udc"
+              "atmel,at91sam9g45-udc"
+              "atmel,sama5d3-udc"
  - reg: Address and length of the register set for the device
  - interrupts: Should contain usba interrupt
  - clocks: Should reference the peripheral and host clocks
Documentation/ioctl/ioctl-number.txt
index 51f4221657bff5b03c9a8ef44116d6e27ef27423..611c52267d24812423821a9f062a407414f86e36 100644 (file)
@@ -321,6 +321,7 @@ Code  Seq#(hex)     Include File            Comments
 0xDB   00-0F   drivers/char/mwave/mwavepub.h
 0xDD   00-3F   ZFCP device driver      see drivers/s390/scsi/
                                        <mailto:aherrman@de.ibm.com>
+0xE5   00-3F   linux/fuse.h
 0xEC   00-01   drivers/platform/chrome/cros_ec_dev.h   ChromeOS EC driver
 0xF3   00-3F   drivers/usb/misc/sisusbvga/sisusb.h     sisfb (in development)
                                        <mailto:thomas@winischhofer.net>
MAINTAINERS
index 993d4cfd5aa01e02e7aa3016a92dc74ec477eff7..86ea2084bc581d3ad7992da40c01c88157e9ce84 100644 (file)
@@ -4430,9 +4430,11 @@ FUSE: FILESYSTEM IN USERSPACE
 M:     Miklos Szeredi <miklos@szeredi.hu>
 L:     fuse-devel@lists.sourceforge.net
 W:     http://fuse.sourceforge.net/
+T:     git git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse.git
 S:     Maintained
 F:     fs/fuse/
 F:     include/uapi/linux/fuse.h
+F:     Documentation/filesystems/fuse.txt
 
 FUTURE DOMAIN TMC-16x0 SCSI DRIVER (16-bit)
 M:     Rik Faith <faith@cs.unc.edu>
arch/arm/boot/dts/armada-xp.dtsi
index 0854d4493da7a8518c241dabe06e91ce22eb5fe9..3de9b761cc1ab0fe7a8d3f0ea9caa7ca72e2a989 100644 (file)
@@ -305,7 +305,7 @@ sdio_pins: sdio-pins {
        spi0_pins: spi0-pins {
                marvell,pins = "mpp36", "mpp37",
                               "mpp38", "mpp39";
-               marvell,function = "spi";
+               marvell,function = "spi0";
        };
 
        uart2_pins: uart2-pins {
arch/arm/boot/dts/at91sam9g45.dtsi
index d260ba779ae53ce671db09142e99c573a23c22dd..18177f5a7464200453c9a82415b08bc73e6a6703 100644 (file)
@@ -1148,7 +1148,7 @@ spi1: spi@fffa8000 {
                        usb2: gadget@fff78000 {
                                #address-cells = <1>;
                                #size-cells = <0>;
-                               compatible = "atmel,at91sam9rl-udc";
+                               compatible = "atmel,at91sam9g45-udc";
                                reg = <0x00600000 0x80000
                                       0xfff78000 0x400>;
                                interrupts = <27 IRQ_TYPE_LEVEL_HIGH 0>;
arch/arm/boot/dts/at91sam9x5.dtsi
index 7521bdf17ef25ab133e61f7c7a71fd0a189004d2..b6c8df8d380ea41c9ed9807fb15dd4708d23a913 100644 (file)
@@ -1108,7 +1108,7 @@ spi1: spi@f0004000 {
                        usb2: gadget@f803c000 {
                                #address-cells = <1>;
                                #size-cells = <0>;
-                               compatible = "atmel,at91sam9rl-udc";
+                               compatible = "atmel,at91sam9g45-udc";
                                reg = <0x00500000 0x80000
                                       0xf803c000 0x400>;
                                interrupts = <23 IRQ_TYPE_LEVEL_HIGH 0>;
arch/arm/boot/dts/sama5d3.dtsi
index 5ab7548e04e1f459eb7105e5808bcc3e54f938b1..9e2444b07bceee0153c69a83e22c8fd7a54c06fe 100644 (file)
@@ -1321,7 +1321,7 @@ rtc@fffffeb0 {
                usb0: gadget@00500000 {
                        #address-cells = <1>;
                        #size-cells = <0>;
-                       compatible = "atmel,at91sam9rl-udc";
+                       compatible = "atmel,sama5d3-udc";
                        reg = <0x00500000 0x100000
                               0xf8030000 0x4000>;
                        interrupts = <33 IRQ_TYPE_LEVEL_HIGH 2>;
arch/arm/boot/dts/sama5d4.dtsi
index 653a1f851f2b8f5bf641cb586ede4f0841105c6b..3ee22ee13c5a899fba199321cb55b6a3a15221b1 100644 (file)
@@ -127,7 +127,7 @@ ahb {
                usb0: gadget@00400000 {
                        #address-cells = <1>;
                        #size-cells = <0>;
-                       compatible = "atmel,at91sam9rl-udc";
+                       compatible = "atmel,sama5d3-udc";
                        reg = <0x00400000 0x100000
                               0xfc02c000 0x4000>;
                        interrupts = <47 IRQ_TYPE_LEVEL_HIGH 2>;
arch/arm/configs/multi_v7_defconfig
index fd6a6d23bc20b0f470c757a1592d88cf84ac5a9c..6d83a1bf0c7494593eb608ddba29f2bfea8d8455 100644 (file)
@@ -169,6 +169,7 @@ CONFIG_MTD_BLOCK=y
 CONFIG_MTD_M25P80=y
 CONFIG_MTD_NAND=y
 CONFIG_MTD_NAND_ATMEL=y
+CONFIG_MTD_NAND_BRCMNAND=y
 CONFIG_MTD_NAND_DAVINCI=y
 CONFIG_MTD_SPI_NOR=y
 CONFIG_MTD_UBI=y
arch/arm/kernel/entry-armv.S
index f8f7398c74c2d355d63a2bf3ef3faec225d2011c..7dac3086e361c8e3680d5209e5b72184c9c48cdb 100644 (file)
@@ -15,6 +15,8 @@
  *  that causes it to save wrong values...  Be aware!
  */
 
+#include <linux/init.h>
+
 #include <asm/assembler.h>
 #include <asm/memory.h>
 #include <asm/glue-df.h>
arch/arm/mach-bcm/Kconfig
index e9184feffc4e5b55d46008be3f3587d2756ea6e3..0ac9e4b3b26525b1ff466ad60a92f6a79a1204c9 100644 (file)
@@ -19,7 +19,6 @@ config ARCH_BCM_IPROC
        select ARCH_REQUIRE_GPIOLIB
        select ARM_AMBA
        select PINCTRL
-       select MTD_NAND_BRCMNAND
        help
          This enables support for systems based on Broadcom IPROC architected SoCs.
          The IPROC complex contains one or more ARM CPUs along with common
arch/arm/mach-dove/include/mach/irqs.h
index 03d401d20453eb447f15ab5dca15cd80967d2baf..3f29e6bca058623504e9c87c1b46fc4afefe7aaf 100644 (file)
 /*
  * Dove Low Interrupt Controller
  */
-#define IRQ_DOVE_BRIDGE                0
-#define IRQ_DOVE_H2C           1
-#define IRQ_DOVE_C2H           2
-#define IRQ_DOVE_NAND          3
-#define IRQ_DOVE_PDMA          4
-#define IRQ_DOVE_SPI1          5
-#define IRQ_DOVE_SPI0          6
-#define IRQ_DOVE_UART_0                7
-#define IRQ_DOVE_UART_1                8
-#define IRQ_DOVE_UART_2                9
-#define IRQ_DOVE_UART_3                10
-#define IRQ_DOVE_I2C           11
-#define IRQ_DOVE_GPIO_0_7      12
-#define IRQ_DOVE_GPIO_8_15     13
-#define IRQ_DOVE_GPIO_16_23    14
-#define IRQ_DOVE_PCIE0_ERR     15
-#define IRQ_DOVE_PCIE0         16
-#define IRQ_DOVE_PCIE1_ERR     17
-#define IRQ_DOVE_PCIE1         18
-#define IRQ_DOVE_I2S0          19
-#define IRQ_DOVE_I2S0_ERR      20
-#define IRQ_DOVE_I2S1          21
-#define IRQ_DOVE_I2S1_ERR      22
-#define IRQ_DOVE_USB_ERR       23
-#define IRQ_DOVE_USB0          24
-#define IRQ_DOVE_USB1          25
-#define IRQ_DOVE_GE00_RX       26
-#define IRQ_DOVE_GE00_TX       27
-#define IRQ_DOVE_GE00_MISC     28
-#define IRQ_DOVE_GE00_SUM      29
-#define IRQ_DOVE_GE00_ERR      30
-#define IRQ_DOVE_CRYPTO                31
+#define IRQ_DOVE_BRIDGE                (1 + 0)
+#define IRQ_DOVE_H2C           (1 + 1)
+#define IRQ_DOVE_C2H           (1 + 2)
+#define IRQ_DOVE_NAND          (1 + 3)
+#define IRQ_DOVE_PDMA          (1 + 4)
+#define IRQ_DOVE_SPI1          (1 + 5)
+#define IRQ_DOVE_SPI0          (1 + 6)
+#define IRQ_DOVE_UART_0                (1 + 7)
+#define IRQ_DOVE_UART_1                (1 + 8)
+#define IRQ_DOVE_UART_2                (1 + 9)
+#define IRQ_DOVE_UART_3                (1 + 10)
+#define IRQ_DOVE_I2C           (1 + 11)
+#define IRQ_DOVE_GPIO_0_7      (1 + 12)
+#define IRQ_DOVE_GPIO_8_15     (1 + 13)
+#define IRQ_DOVE_GPIO_16_23    (1 + 14)
+#define IRQ_DOVE_PCIE0_ERR     (1 + 15)
+#define IRQ_DOVE_PCIE0         (1 + 16)
+#define IRQ_DOVE_PCIE1_ERR     (1 + 17)
+#define IRQ_DOVE_PCIE1         (1 + 18)
+#define IRQ_DOVE_I2S0          (1 + 19)
+#define IRQ_DOVE_I2S0_ERR      (1 + 20)
+#define IRQ_DOVE_I2S1          (1 + 21)
+#define IRQ_DOVE_I2S1_ERR      (1 + 22)
+#define IRQ_DOVE_USB_ERR       (1 + 23)
+#define IRQ_DOVE_USB0          (1 + 24)
+#define IRQ_DOVE_USB1          (1 + 25)
+#define IRQ_DOVE_GE00_RX       (1 + 26)
+#define IRQ_DOVE_GE00_TX       (1 + 27)
+#define IRQ_DOVE_GE00_MISC     (1 + 28)
+#define IRQ_DOVE_GE00_SUM      (1 + 29)
+#define IRQ_DOVE_GE00_ERR      (1 + 30)
+#define IRQ_DOVE_CRYPTO                (1 + 31)
 
 /*
  * Dove High Interrupt Controller
  */
-#define IRQ_DOVE_AC97          32
-#define IRQ_DOVE_PMU           33
-#define IRQ_DOVE_CAM           34
-#define IRQ_DOVE_SDIO0         35
-#define IRQ_DOVE_SDIO1         36
-#define IRQ_DOVE_SDIO0_WAKEUP  37
-#define IRQ_DOVE_SDIO1_WAKEUP  38
-#define IRQ_DOVE_XOR_00                39
-#define IRQ_DOVE_XOR_01                40
-#define IRQ_DOVE_XOR0_ERR      41
-#define IRQ_DOVE_XOR_10                42
-#define IRQ_DOVE_XOR_11                43
-#define IRQ_DOVE_XOR1_ERR      44
-#define IRQ_DOVE_LCD_DCON      45
-#define IRQ_DOVE_LCD1          46
-#define IRQ_DOVE_LCD0          47
-#define IRQ_DOVE_GPU           48
-#define IRQ_DOVE_PERFORM_MNTR  49
-#define IRQ_DOVE_VPRO_DMA1     51
-#define IRQ_DOVE_SSP_TIMER     54
-#define IRQ_DOVE_SSP           55
-#define IRQ_DOVE_MC_L2_ERR     56
-#define IRQ_DOVE_CRYPTO_ERR    59
-#define IRQ_DOVE_GPIO_24_31    60
-#define IRQ_DOVE_HIGH_GPIO     61
-#define IRQ_DOVE_SATA          62
+#define IRQ_DOVE_AC97          (1 + 32)
+#define IRQ_DOVE_PMU           (1 + 33)
+#define IRQ_DOVE_CAM           (1 + 34)
+#define IRQ_DOVE_SDIO0         (1 + 35)
+#define IRQ_DOVE_SDIO1         (1 + 36)
+#define IRQ_DOVE_SDIO0_WAKEUP  (1 + 37)
+#define IRQ_DOVE_SDIO1_WAKEUP  (1 + 38)
+#define IRQ_DOVE_XOR_00                (1 + 39)
+#define IRQ_DOVE_XOR_01                (1 + 40)
+#define IRQ_DOVE_XOR0_ERR      (1 + 41)
+#define IRQ_DOVE_XOR_10                (1 + 42)
+#define IRQ_DOVE_XOR_11                (1 + 43)
+#define IRQ_DOVE_XOR1_ERR      (1 + 44)
+#define IRQ_DOVE_LCD_DCON      (1 + 45)
+#define IRQ_DOVE_LCD1          (1 + 46)
+#define IRQ_DOVE_LCD0          (1 + 47)
+#define IRQ_DOVE_GPU           (1 + 48)
+#define IRQ_DOVE_PERFORM_MNTR  (1 + 49)
+#define IRQ_DOVE_VPRO_DMA1     (1 + 51)
+#define IRQ_DOVE_SSP_TIMER     (1 + 54)
+#define IRQ_DOVE_SSP           (1 + 55)
+#define IRQ_DOVE_MC_L2_ERR     (1 + 56)
+#define IRQ_DOVE_CRYPTO_ERR    (1 + 59)
+#define IRQ_DOVE_GPIO_24_31    (1 + 60)
+#define IRQ_DOVE_HIGH_GPIO     (1 + 61)
+#define IRQ_DOVE_SATA          (1 + 62)
 
 /*
  * DOVE General Purpose Pins
  */
-#define IRQ_DOVE_GPIO_START    64
+#define IRQ_DOVE_GPIO_START    65
 #define NR_GPIO_IRQS           64
 
 /*
arch/arm/mach-dove/irq.c
index 4a5a7aedcb763e9673dd8d51b416c6f1024e3310..df0223f76fa92d8752f31d2ccb1dd40e18d11a4f 100644 (file)
@@ -126,14 +126,14 @@ __exception_irq_entry dove_legacy_handle_irq(struct pt_regs *regs)
        stat = readl_relaxed(dove_irq_base + IRQ_CAUSE_LOW_OFF);
        stat &= readl_relaxed(dove_irq_base + IRQ_MASK_LOW_OFF);
        if (stat) {
-               unsigned int hwirq = __fls(stat);
+               unsigned int hwirq = 1 + __fls(stat);
                handle_IRQ(hwirq, regs);
                return;
        }
        stat = readl_relaxed(dove_irq_base + IRQ_CAUSE_HIGH_OFF);
        stat &= readl_relaxed(dove_irq_base + IRQ_MASK_HIGH_OFF);
        if (stat) {
-               unsigned int hwirq = 32 + __fls(stat);
+               unsigned int hwirq = 33 + __fls(stat);
                handle_IRQ(hwirq, regs);
                return;
        }
@@ -144,8 +144,8 @@ void __init dove_init_irq(void)
 {
        int i;
 
-       orion_irq_init(0, IRQ_VIRT_BASE + IRQ_MASK_LOW_OFF);
-       orion_irq_init(32, IRQ_VIRT_BASE + IRQ_MASK_HIGH_OFF);
+       orion_irq_init(1, IRQ_VIRT_BASE + IRQ_MASK_LOW_OFF);
+       orion_irq_init(33, IRQ_VIRT_BASE + IRQ_MASK_HIGH_OFF);
 
 #ifdef CONFIG_MULTI_IRQ_HANDLER
        set_handle_irq(dove_legacy_handle_irq);
arch/arm/mach-mvebu/headsmp-a9.S
index 48e4c4b3cd1c9a52f6e5580c531088e10aac8662..b093a196e80176d44cc083ee89d51b6fee1318c4 100644 (file)
  */
 
 #include <linux/linkage.h>
-#include <linux/init.h>
 
 #include <asm/assembler.h>
 
-       __CPUINIT
-
 ENTRY(mvebu_cortex_a9_secondary_startup)
 ARM_BE8(setend be)
        bl      armada_38x_scu_power_up
arch/arm/mach-mvebu/platsmp-a9.c
index df0a9cc5da59ad2ce7a95e7d13dc8e2a82937069..3d5000481c112dda6c0ee32a0ce036f3b08c9b53 100644 (file)
@@ -24,7 +24,7 @@
 
 extern void mvebu_cortex_a9_secondary_startup(void);
 
-static int __cpuinit mvebu_cortex_a9_boot_secondary(unsigned int cpu,
+static int mvebu_cortex_a9_boot_secondary(unsigned int cpu,
                                                    struct task_struct *idle)
 {
        int ret, hw_cpu;
arch/arm/mach-mvebu/pm-board.c
index 6dfd4ab97b2aaf2e6982de227dd17e403dce7d24..301ab38d38ba884e194657d3e1173576b7f43ad0 100644 (file)
@@ -43,6 +43,9 @@ static void mvebu_armada_xp_gp_pm_enter(void __iomem *sdram_reg, u32 srcmd)
        for (i = 0; i < ARMADA_XP_GP_PIC_NR_GPIOS; i++)
                ackcmd |= BIT(pic_raw_gpios[i]);
 
+       srcmd = cpu_to_le32(srcmd);
+       ackcmd = cpu_to_le32(ackcmd);
+
        /*
         * Wait a while, the PIC needs quite a bit of time between the
         * two GPIO commands.
arch/arm/mach-rockchip/platsmp.c
index 2e6ab67e2284497f9fc1d8fe323c2daaa4f34809..8fcec1cc101e09be617340f8c7e91b6e004b636f 100644 (file)
@@ -119,8 +119,7 @@ static int pmu_set_power_domain(int pd, bool on)
  * Handling of CPU cores
  */
 
-static int __cpuinit rockchip_boot_secondary(unsigned int cpu,
-                                            struct task_struct *idle)
+static int rockchip_boot_secondary(unsigned int cpu, struct task_struct *idle)
 {
        int ret;
 
arch/arm/mach-vexpress/spc.c
index f61158c6ce7185a3b30ef396116a7fc2d74b3c72..5766ce2be32bbd28452c95ea9e43e92de0f4dcb0 100644 (file)
@@ -589,4 +589,4 @@ static int __init ve_spc_clk_init(void)
        platform_device_register_simple("vexpress-spc-cpufreq", -1, NULL, 0);
        return 0;
 }
-module_init(ve_spc_clk_init);
+device_initcall(ve_spc_clk_init);
arch/arm64/boot/dts/apm/apm-storm.dtsi
index 0bb287ca0a98e8dcaac6ebd6497d762005b27013..0689c3fb56e3d84fe3ed7790f4a6b25835c86555 100644 (file)
@@ -717,6 +717,19 @@ sata3: sata@1a800000 {
                        phy-names = "sata-phy";
                };
 
+               sbgpio: sbgpio@17001000{
+                       compatible = "apm,xgene-gpio-sb";
+                       reg = <0x0 0x17001000 0x0 0x400>;
+                       #gpio-cells = <2>;
+                       gpio-controller;
+                       interrupts =    <0x0 0x28 0x1>,
+                                       <0x0 0x29 0x1>,
+                                       <0x0 0x2a 0x1>,
+                                       <0x0 0x2b 0x1>,
+                                       <0x0 0x2c 0x1>,
+                                       <0x0 0x2d 0x1>;
+               };
+
                rtc: rtc@10510000 {
                        compatible = "apm,xgene-rtc";
                        reg = <0x0 0x10510000 0x0 0x400>;
arch/cris/arch-v10/drivers/eeprom.c
index 5047a33043bdf4bfa4d6ae3226f7385a6c703d37..f679a19dfeb8bbd857d8c577b14baeccf05d3865 100644 (file)
@@ -848,5 +848,4 @@ static void eeprom_disable_write_protect(void)
     /* Write protect disabled */
   }
 }
-
-module_init(eeprom_init);
+device_initcall(eeprom_init);
arch/cris/arch-v32/mm/intmem.c
index 1b17d92cef8ebb98e57b34a63be0f6f8fb6d9508..9ef56092a4c54f9cdd49255ae6683bf33adeb72a 100644 (file)
@@ -145,6 +145,5 @@ unsigned long crisv32_intmem_virt_to_phys(void* addr)
                (unsigned long)intmem_virtual + MEM_INTMEM_START +
                RESERVED_SIZE);
 }
-
-module_init(crisv32_intmem_init);
+device_initcall(crisv32_intmem_init);
 
arch/frv/mb93090-mb00/flash.c
index c0e3707c2299bae3f4cb5cf1a1c7ded959f361e9..e1cf802d1639bb431cef9f27e8bfa29c3c2505f6 100644 (file)
@@ -9,7 +9,7 @@
  * 2 of the Licence, or (at your option) any later version.
  */
 
-#include <linux/init.h>
+#include <linux/module.h>
 #include <linux/platform_device.h>
 #include <linux/mtd/partitions.h>
 #include <linux/mtd/physmap.h>
arch/ia64/hp/sim/simscsi.c
index 3a428f19a00116ad963f7cdae2ddf469ec21a054..085047f3a545b283c41ec546cb2b71e623a4c7fa 100644 (file)
@@ -368,13 +368,4 @@ simscsi_init(void)
        scsi_host_put(host);
        return error;
 }
-
-static void __exit
-simscsi_exit(void)
-{
-       scsi_remove_host(host);
-       scsi_host_put(host);
-}
-
-module_init(simscsi_init);
-module_exit(simscsi_exit);
+device_initcall(simscsi_init);
arch/ia64/sn/kernel/mca.c
index 27793f7aa99c3c406b5547511a59347b05dd0210..5b799d4deb747c481f3ad7a47d1c75374fa92715 100644 (file)
@@ -142,5 +142,4 @@ static int __init sn_salinfo_init(void)
                salinfo_platform_oemdata = &sn_salinfo_platform_oemdata;
        return 0;
 }
-
-module_init(sn_salinfo_init)
+device_initcall(sn_salinfo_init);
arch/mn10300/unit-asb2303/flash.c
index 17fe083fcb6fa985cec4f108036ad490e11b785d..b03d8738d67cd682ef0a96d8770f8bb9ebc4fbc6 100644 (file)
@@ -96,5 +96,4 @@ static int __init asb2303_mtd_init(void)
        platform_device_register(&asb2303_sysflash);
        return 0;
 }
-
-module_init(asb2303_mtd_init);
+device_initcall(asb2303_mtd_init);
arch/parisc/kernel/pdc_cons.c
index d5cae55195ecfd4f108b1431ecf52193b5a8d282..10a5ae9553fd657211524e4af5257958815e134a 100644 (file)
@@ -207,8 +207,7 @@ static int __init pdc_console_tty_driver_init(void)
 
        return 0;
 }
-
-module_init(pdc_console_tty_driver_init);
+device_initcall(pdc_console_tty_driver_init);
 
 static struct tty_driver * pdc_console_device (struct console *c, int *index)
 {
arch/parisc/kernel/perf.c
index ba0c053e25ae9d66cf536ab87ce1cca9eef53dbf..518f4f5f1f43ec6b2dcaceb9b2f5c9536097d59f 100644 (file)
@@ -543,6 +543,7 @@ static int __init perf_init(void)
 
        return 0;
 }
+device_initcall(perf_init);
 
 /*
  * perf_start_counters(void)
@@ -847,5 +848,3 @@ printk("perf_rdr_write\n");
        }
 printk("perf_rdr_write done\n");
 }
-
-module_init(perf_init);
arch/powerpc/kernel/time.c
index 56f44848b044b67c7d3b84caf82a9a9ed6fb493a..43922509a4833e8da175b668e9be588daa123f4f 100644 (file)
@@ -1124,4 +1124,4 @@ static int __init rtc_init(void)
        return PTR_ERR_OR_ZERO(pdev);
 }
 
-module_init(rtc_init);
+device_initcall(rtc_init);
arch/powerpc/mm/hugetlbpage.c
index 1f614d778a8b5ffc2b1cf32465fce3a579839f3a..bb0bd7025cb88f893af04d3f98141860c038ee54 100644 (file)
@@ -928,7 +928,7 @@ static int __init hugetlbpage_init(void)
        return 0;
 }
 #endif
-module_init(hugetlbpage_init);
+arch_initcall(hugetlbpage_init);
 
 void flush_dcache_icache_hugepage(struct page *page)
 {
arch/powerpc/platforms/83xx/suspend.c
index c9adbfb65006e153aa8f6caf0d0fd5b396cf2997..fcbea4b51a7821ac2865622de1440a9aaa766449 100644 (file)
@@ -445,5 +445,4 @@ static int pmc_init(void)
 {
        return platform_driver_register(&pmc_driver);
 }
-
-module_init(pmc_init);
+device_initcall(pmc_init);
arch/powerpc/platforms/ps3/time.c
index ce73ce865613b0521330f8e896f471daf6da9a15..791c6142c4a7bd0185437890350381c529779a54 100644 (file)
@@ -92,5 +92,4 @@ static int __init ps3_rtc_init(void)
 
        return PTR_ERR_OR_ZERO(pdev);
 }
-
-module_init(ps3_rtc_init);
+device_initcall(ps3_rtc_init);
arch/powerpc/sysdev/fsl_lbc.c
index d631022ffb4b3c77a4fe057c3c6da91457a20c64..38138cf8d33e2213804dad3e903c4037a59db605 100644 (file)
@@ -407,4 +407,4 @@ static int __init fsl_lbc_init(void)
 {
        return platform_driver_register(&fsl_lbc_ctrl_driver);
 }
-module_init(fsl_lbc_init);
+subsys_initcall(fsl_lbc_init);
arch/sh/boards/mach-highlander/psw.c
index 522786318d36c4f5dbee20c8cbdee94009bbf201..40e2b585d4887b7bc48361395c0b9881ecf64970 100644 (file)
@@ -10,7 +10,7 @@
  * for more details.
  */
 #include <linux/io.h>
-#include <linux/init.h>
+#include <linux/module.h>
 #include <linux/interrupt.h>
 #include <linux/platform_device.h>
 #include <mach/highlander.h>
arch/sh/boards/mach-landisk/psw.c
index bef83522f958c25ed186dc21187963607757ed89..5192b1f43ada5ce884bc4a8abc3f1e0f53e7cc1d 100644 (file)
@@ -140,4 +140,4 @@ static int __init psw_init(void)
 {
        return platform_add_devices(psw_devices, ARRAY_SIZE(psw_devices));
 }
-module_init(psw_init);
+device_initcall(psw_init);
arch/tile/kernel/usb.c
index 5af8debc6a71aab44a6cf48e63d44f7343f2fca4..f0da5a237e94077ced050b6f3d746c89d44bd341 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/platform_device.h>
 #include <linux/usb/tilegx.h>
+#include <linux/init.h>
 #include <linux/types.h>
 
 static u64 ehci_dmamask = DMA_BIT_MASK(32);
arch/x86/kernel/bootflag.c
index 5de7f4c5697136e180e4a24e153bde77c7112b8c..52c8e3c7789dc81f6f6bd5ae60a6672f500dc8ca 100644 (file)
@@ -98,4 +98,4 @@ static int __init sbf_init(void)
 
        return 0;
 }
-module_init(sbf_init);
+arch_initcall(sbf_init);
arch/x86/kernel/cpu/perf_event_intel_bts.c
index 7795f3f8b1d57198469ded20ac9a1244035428e8..43dd672d788bcc8a07106f8c84c0aeae786cb49e 100644 (file)
@@ -530,5 +530,4 @@ static __init int bts_init(void)
 
        return perf_pmu_register(&bts_pmu, "intel_bts", -1);
 }
-
-module_init(bts_init);
+arch_initcall(bts_init);
arch/x86/kernel/cpu/perf_event_intel_pt.c
index 159887c3a89d66a4aaad415ddc069b25904b5676..183de719628d2a4740d20a173a6c950f35c322fa 100644 (file)
@@ -1106,5 +1106,4 @@ static __init int pt_init(void)
 
        return ret;
 }
-
-module_init(pt_init);
+arch_initcall(pt_init);
arch/x86/kernel/devicetree.c
index 5ee771859b6f6e144090efe8f315e0c7707978b3..1f4acd68b98bccb7bf4032bc6ff2bde3f4efecdb 100644 (file)
@@ -65,7 +65,7 @@ static int __init add_bus_probe(void)
 
        return of_platform_bus_probe(NULL, ce4100_ids, NULL);
 }
-module_init(add_bus_probe);
+device_initcall(add_bus_probe);
 
 #ifdef CONFIG_PCI
 struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus)
arch/x86/kernel/vsmp_64.c
index ee22c1d93ae5c4c5ffe065981d591507a0511a07..b034b1b14b9c66ab77b24d9757ce10de7f0f318e 100644 (file)
@@ -72,7 +72,7 @@ asmlinkage __visible void vsmp_irq_enable(void)
 }
 PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_enable);
 
-static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf,
+static unsigned __init vsmp_patch(u8 type, u16 clobbers, void *ibuf,
                                  unsigned long addr, unsigned len)
 {
        switch (type) {
arch/x86/platform/intel-mid/intel_mid_vrtc.c
index 32947ba0f62dad08eaaf712848bff024ea396a46..ee40fcb6e54dd5f816819bf2f048dcc9177ce95c 100644 (file)
@@ -173,5 +173,4 @@ static int __init intel_mid_device_create(void)
 
        return platform_device_register(&vrtc_device);
 }
-
-module_init(intel_mid_device_create);
+device_initcall(intel_mid_device_create);
arch/xtensa/platforms/iss/network.c
index 17b1ef3232e4833349c2f64ed19e08dede454b52..8ab021b1f14128d49948de51f685a78fd7180fa2 100644 (file)
@@ -681,6 +681,4 @@ static int iss_net_init(void)
 
        return 1;
 }
-
-module_init(iss_net_init);
-
+device_initcall(iss_net_init);
crypto/asymmetric_keys/pkcs7_key_type.c
index 751f8fd7335db2203f7257edc8ad680dc7ea2a14..3d13b042da735823b7c6425f93cce9cff82e0abe 100644 (file)
@@ -12,6 +12,7 @@
 #define pr_fmt(fmt) "PKCS7key: "fmt
 #include <linux/key.h>
 #include <linux/err.h>
+#include <linux/module.h>
 #include <linux/key-type.h>
 #include <crypto/pkcs7.h>
 #include <keys/user-type.h>
drivers/block/rbd.c
index ec6c5c6e1ac94b2bcbe0619a7fe62b9e7d0ce4a5..d94529d5c8e951378eaf62d74b708edf271a550f 100644 (file)
@@ -346,6 +346,7 @@ struct rbd_device {
        struct rbd_image_header header;
        unsigned long           flags;          /* possibly lock protected */
        struct rbd_spec         *spec;
+       struct rbd_options      *opts;
 
        char                    *header_name;
 
@@ -724,34 +725,36 @@ static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
 }
 
 /*
- * mount options
+ * (Per device) rbd map options
  */
 enum {
+       Opt_queue_depth,
        Opt_last_int,
        /* int args above */
        Opt_last_string,
        /* string args above */
        Opt_read_only,
        Opt_read_write,
-       /* Boolean args above */
-       Opt_last_bool,
+       Opt_err
 };
 
 static match_table_t rbd_opts_tokens = {
+       {Opt_queue_depth, "queue_depth=%d"},
        /* int args above */
        /* string args above */
        {Opt_read_only, "read_only"},
        {Opt_read_only, "ro"},          /* Alternate spelling */
        {Opt_read_write, "read_write"},
        {Opt_read_write, "rw"},         /* Alternate spelling */
-       /* Boolean args above */
-       {-1, NULL}
+       {Opt_err, NULL}
 };
 
 struct rbd_options {
+       int     queue_depth;
        bool    read_only;
 };
 
+#define RBD_QUEUE_DEPTH_DEFAULT        BLKDEV_MAX_RQ
 #define RBD_READ_ONLY_DEFAULT  false
 
 static int parse_rbd_opts_token(char *c, void *private)
@@ -761,27 +764,27 @@ static int parse_rbd_opts_token(char *c, void *private)
        int token, intval, ret;
 
        token = match_token(c, rbd_opts_tokens, argstr);
-       if (token < 0)
-               return -EINVAL;
-
        if (token < Opt_last_int) {
                ret = match_int(&argstr[0], &intval);
                if (ret < 0) {
-                       pr_err("bad mount option arg (not int) "
-                              "at '%s'\n", c);
+                       pr_err("bad mount option arg (not int) at '%s'\n", c);
                        return ret;
                }
                dout("got int token %d val %d\n", token, intval);
        } else if (token > Opt_last_int && token < Opt_last_string) {
-               dout("got string token %d val %s\n", token,
-                    argstr[0].from);
-       } else if (token > Opt_last_string && token < Opt_last_bool) {
-               dout("got Boolean token %d\n", token);
+               dout("got string token %d val %s\n", token, argstr[0].from);
        } else {
                dout("got token %d\n", token);
        }
 
        switch (token) {
+       case Opt_queue_depth:
+               if (intval < 1) {
+                       pr_err("queue_depth out of range\n");
+                       return -EINVAL;
+               }
+               rbd_opts->queue_depth = intval;
+               break;
        case Opt_read_only:
                rbd_opts->read_only = true;
                break;
@@ -789,9 +792,10 @@ static int parse_rbd_opts_token(char *c, void *private)
                rbd_opts->read_only = false;
                break;
        default:
-               rbd_assert(false);
-               break;
+               /* libceph prints "bad option" msg */
+               return -EINVAL;
        }
+
        return 0;
 }
 
@@ -1563,22 +1567,39 @@ static void rbd_obj_request_end(struct rbd_obj_request *obj_request)
 /*
  * Wait for an object request to complete.  If interrupted, cancel the
  * underlying osd request.
+ *
+ * @timeout: in jiffies, 0 means "wait forever"
  */
-static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
+static int __rbd_obj_request_wait(struct rbd_obj_request *obj_request,
+                                 unsigned long timeout)
 {
-       int ret;
+       long ret;
 
        dout("%s %p\n", __func__, obj_request);
-
-       ret = wait_for_completion_interruptible(&obj_request->completion);
-       if (ret < 0) {
-               dout("%s %p interrupted\n", __func__, obj_request);
+       ret = wait_for_completion_interruptible_timeout(
+                                       &obj_request->completion,
+                                       ceph_timeout_jiffies(timeout));
+       if (ret <= 0) {
+               if (ret == 0)
+                       ret = -ETIMEDOUT;
                rbd_obj_request_end(obj_request);
-               return ret;
+       } else {
+               ret = 0;
        }
 
-       dout("%s %p done\n", __func__, obj_request);
-       return 0;
+       dout("%s %p ret %d\n", __func__, obj_request, (int)ret);
+       return ret;
+}
+
+static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
+{
+       return __rbd_obj_request_wait(obj_request, 0);
+}
+
+static int rbd_obj_request_wait_timeout(struct rbd_obj_request *obj_request,
+                                       unsigned long timeout)
+{
+       return __rbd_obj_request_wait(obj_request, timeout);
 }
 
 static void rbd_img_request_complete(struct rbd_img_request *img_request)
@@ -2001,11 +2022,11 @@ static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
        rbd_assert(obj_request_type_valid(type));
 
        size = strlen(object_name) + 1;
-       name = kmalloc(size, GFP_KERNEL);
+       name = kmalloc(size, GFP_NOIO);
        if (!name)
                return NULL;
 
-       obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_KERNEL);
+       obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
        if (!obj_request) {
                kfree(name);
                return NULL;
@@ -2376,7 +2397,7 @@ static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request,
        }
 
        if (opcode == CEPH_OSD_OP_DELETE)
-               osd_req_op_init(osd_request, num_ops, opcode);
+               osd_req_op_init(osd_request, num_ops, opcode, 0);
        else
                osd_req_op_extent_init(osd_request, num_ops, opcode,
                                       offset, length, 0, 0);
@@ -2848,7 +2869,7 @@ static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
                goto out;
        stat_request->callback = rbd_img_obj_exists_callback;
 
-       osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
+       osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0);
        osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
                                        false, false);
        rbd_osd_req_format_read(stat_request);
@@ -3122,6 +3143,7 @@ static struct rbd_obj_request *rbd_obj_watch_request_helper(
                                                bool watch)
 {
        struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+       struct ceph_options *opts = osdc->client->options;
        struct rbd_obj_request *obj_request;
        int ret;
 
@@ -3148,7 +3170,7 @@ static struct rbd_obj_request *rbd_obj_watch_request_helper(
        if (ret)
                goto out;
 
-       ret = rbd_obj_request_wait(obj_request);
+       ret = rbd_obj_request_wait_timeout(obj_request, opts->mount_timeout);
        if (ret)
                goto out;
 
@@ -3750,10 +3772,9 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
 
        memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
        rbd_dev->tag_set.ops = &rbd_mq_ops;
-       rbd_dev->tag_set.queue_depth = BLKDEV_MAX_RQ;
+       rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
        rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
-       rbd_dev->tag_set.flags =
-               BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+       rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
        rbd_dev->tag_set.nr_hw_queues = 1;
        rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
 
@@ -3773,6 +3794,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
        /* set io sizes to object size */
        segment_size = rbd_obj_bytes(&rbd_dev->header);
        blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
+       blk_queue_max_segments(q, segment_size / SECTOR_SIZE);
        blk_queue_max_segment_size(q, segment_size);
        blk_queue_io_min(q, segment_size);
        blk_queue_io_opt(q, segment_size);
@@ -4044,7 +4066,8 @@ static void rbd_spec_free(struct kref *kref)
 }
 
 static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
-                               struct rbd_spec *spec)
+                                        struct rbd_spec *spec,
+                                        struct rbd_options *opts)
 {
        struct rbd_device *rbd_dev;
 
@@ -4058,8 +4081,9 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
        INIT_LIST_HEAD(&rbd_dev->node);
        init_rwsem(&rbd_dev->header_rwsem);
 
-       rbd_dev->spec = spec;
        rbd_dev->rbd_client = rbdc;
+       rbd_dev->spec = spec;
+       rbd_dev->opts = opts;
 
        /* Initialize the layout used for all rbd requests */
 
@@ -4075,6 +4099,7 @@ static void rbd_dev_destroy(struct rbd_device *rbd_dev)
 {
        rbd_put_client(rbd_dev->rbd_client);
        rbd_spec_put(rbd_dev->spec);
+       kfree(rbd_dev->opts);
        kfree(rbd_dev);
 }
 
@@ -4933,6 +4958,7 @@ static int rbd_add_parse_args(const char *buf,
                goto out_mem;
 
        rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
+       rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
 
        copts = ceph_parse_options(options, mon_addrs,
                                        mon_addrs + mon_addrs_size - 1,
@@ -4963,8 +4989,8 @@ static int rbd_add_parse_args(const char *buf,
  */
 static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
 {
+       struct ceph_options *opts = rbdc->client->options;
        u64 newest_epoch;
-       unsigned long timeout = rbdc->client->options->mount_timeout * HZ;
        int tries = 0;
        int ret;
 
@@ -4979,7 +5005,8 @@ static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
                if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
                        ceph_monc_request_next_osdmap(&rbdc->client->monc);
                        (void) ceph_monc_wait_osdmap(&rbdc->client->monc,
-                                                    newest_epoch, timeout);
+                                                    newest_epoch,
+                                                    opts->mount_timeout);
                        goto again;
                } else {
                        /* the osdmap we have is new enough */
@@ -5148,7 +5175,7 @@ static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
        rbdc = __rbd_get_client(rbd_dev->rbd_client);
 
        ret = -ENOMEM;
-       parent = rbd_dev_create(rbdc, parent_spec);
+       parent = rbd_dev_create(rbdc, parent_spec, NULL);
        if (!parent)
                goto out_err;
 
@@ -5394,9 +5421,6 @@ static ssize_t do_rbd_add(struct bus_type *bus,
        rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
        if (rc < 0)
                goto err_out_module;
-       read_only = rbd_opts->read_only;
-       kfree(rbd_opts);
-       rbd_opts = NULL;        /* done with this */
 
        rbdc = rbd_get_client(ceph_opts);
        if (IS_ERR(rbdc)) {
@@ -5422,11 +5446,12 @@ static ssize_t do_rbd_add(struct bus_type *bus,
                goto err_out_client;
        }
 
-       rbd_dev = rbd_dev_create(rbdc, spec);
+       rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
        if (!rbd_dev)
                goto err_out_client;
        rbdc = NULL;            /* rbd_dev now owns this */
        spec = NULL;            /* rbd_dev now owns this */
+       rbd_opts = NULL;        /* rbd_dev now owns this */
 
        rc = rbd_dev_image_probe(rbd_dev, true);
        if (rc < 0)
@@ -5434,6 +5459,7 @@ static ssize_t do_rbd_add(struct bus_type *bus,
 
        /* If we are mapping a snapshot it must be marked read-only */
 
+       read_only = rbd_dev->opts->read_only;
        if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
                read_only = true;
        rbd_dev->mapping.read_only = read_only;
@@ -5458,6 +5484,7 @@ static ssize_t do_rbd_add(struct bus_type *bus,
        rbd_put_client(rbdc);
 err_out_args:
        rbd_spec_put(spec);
+       kfree(rbd_opts);
 err_out_module:
        module_put(THIS_MODULE);
 
drivers/char/agp/intel-gtt.c
index 0b4188b9af7cd055571851d9f0e0e25e7d03033c..c6dea3f6917bdcfc144fc70540cbbd26ea1918ee 100644 (file)
@@ -581,7 +581,7 @@ static inline int needs_ilk_vtd_wa(void)
        /* Query intel_iommu to see if we need the workaround. Presumably that
         * was loaded first.
         */
-       if ((gpu_devid == PCI_DEVICE_ID_INTEL_IRONLAKE_M_HB ||
+       if ((gpu_devid == PCI_DEVICE_ID_INTEL_IRONLAKE_D_IG ||
             gpu_devid == PCI_DEVICE_ID_INTEL_IRONLAKE_M_IG) &&
             intel_iommu_gfx_mapped)
                return 1;
drivers/clk/clk-max77686.c
index 86cdb3a28629ab09394292e2b210eb5b689b4504..446c2fe76dc28635b28b0fb116aaf5cd7e79e30e 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/err.h>
+#include <linux/module.h>
 #include <linux/platform_device.h>
 #include <linux/mfd/max77686.h>
 #include <linux/mfd/max77686-private.h>
drivers/clk/clk-max77802.c
index 0729dc723a8ff81099bc9e0071a9784f6fd9e4d3..74c49b93a6eba5c40f28342f7d4ab4225c477795 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/err.h>
+#include <linux/module.h>
 #include <linux/platform_device.h>
 #include <linux/mfd/max77686-private.h>
 #include <linux/clk-provider.h>
drivers/clk/clk-nomadik.c
index 05e04ce0f1488f7301ad5153f687860bf872f8bc..c9487179f25f46d79e511ace153e3a3e7078e49e 100644 (file)
@@ -503,8 +503,7 @@ static int __init nomadik_src_clk_init_debugfs(void)
                            NULL, NULL, &nomadik_src_clk_debugfs_ops);
        return 0;
 }
-
-module_init(nomadik_src_clk_init_debugfs);
+device_initcall(nomadik_src_clk_init_debugfs);
 
 #endif
 
drivers/clk/sunxi/clk-mod0.c
index ec8f5a1fca09f4240c433a1de2ea8207e6e369e5..9d028aec58e5d8addc3a080ced48594db57c1657 100644 (file)
@@ -128,7 +128,7 @@ static struct platform_driver sun4i_a10_mod0_clk_driver = {
        },
        .probe = sun4i_a10_mod0_clk_probe,
 };
-module_platform_driver(sun4i_a10_mod0_clk_driver);
+builtin_platform_driver(sun4i_a10_mod0_clk_driver);
 
 static const struct factors_data sun9i_a80_mod0_data __initconst = {
        .enable = 31,
drivers/cpufreq/exynos-cpufreq.c
index fb24aaf4adcf8099c04a4f2de5eb558a56292707..ae5b2bd3a9785c63646e3e922fbe17330678b481 100644 (file)
@@ -10,6 +10,7 @@
 */
 
 #include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/err.h>
 #include <linux/clk.h>
 #include <linux/io.h>
drivers/cpufreq/s5pv210-cpufreq.c
index b0dac7d6ba31475b9902eefaff8c9dab557f34d2..9e231f52150c404ebd92e6d74ea6a24b5642576a 100644 (file)
@@ -659,4 +659,4 @@ static struct platform_driver s5pv210_cpufreq_platdrv = {
        },
        .probe = s5pv210_cpufreq_probe,
 };
-module_platform_driver(s5pv210_cpufreq_platdrv);
+builtin_platform_driver(s5pv210_cpufreq_platdrv);
drivers/cpuidle/cpuidle-at91.c
index f2446c78d87cf14eda9cec13ab05d9bd0b9198a9..9c5853b6ca4a2f47039dce98ab13e722dfdc05f4 100644 (file)
@@ -62,5 +62,4 @@ static struct platform_driver at91_cpuidle_driver = {
        },
        .probe = at91_cpuidle_probe,
 };
-
-module_platform_driver(at91_cpuidle_driver);
+builtin_platform_driver(at91_cpuidle_driver);
drivers/cpuidle/cpuidle-calxeda.c
index 9445e6cc02be4336b396d8125d95db5577b0ad13..c13feec89ea1d68c9734553110c56aeee2b812c2 100644 (file)
@@ -75,5 +75,4 @@ static struct platform_driver calxeda_cpuidle_plat_driver = {
         },
         .probe = calxeda_cpuidle_probe,
 };
-
-module_platform_driver(calxeda_cpuidle_plat_driver);
+builtin_platform_driver(calxeda_cpuidle_plat_driver);
drivers/cpuidle/cpuidle-zynq.c
index 543292b1d38ea045e9d9504c59d75ba4678a0ce2..6f4257fc56e5192f0b743a00d7bafcbb3b5566c3 100644 (file)
@@ -73,5 +73,4 @@ static struct platform_driver zynq_cpuidle_driver = {
        },
        .probe = zynq_cpuidle_probe,
 };
-
-module_platform_driver(zynq_cpuidle_driver);
+builtin_platform_driver(zynq_cpuidle_driver);
drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 22866d1c3d69c196bdb332f7c056d9e4a2a5005f..01657830b470a49e8209fd39fa829d4a1fbb3610 100644 (file)
@@ -425,6 +425,8 @@ int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring,
                                   unsigned irq_type);
 int amdgpu_fence_emit(struct amdgpu_ring *ring, void *owner,
                      struct amdgpu_fence **fence);
+int amdgpu_fence_recreate(struct amdgpu_ring *ring, void *owner,
+                         uint64_t seq, struct amdgpu_fence **fence);
 void amdgpu_fence_process(struct amdgpu_ring *ring);
 int amdgpu_fence_wait_next(struct amdgpu_ring *ring);
 int amdgpu_fence_wait_empty(struct amdgpu_ring *ring);
@@ -435,9 +437,6 @@ int amdgpu_fence_wait(struct amdgpu_fence *fence, bool interruptible);
 int amdgpu_fence_wait_any(struct amdgpu_device *adev,
                          struct amdgpu_fence **fences,
                          bool intr);
-long amdgpu_fence_wait_seq_timeout(struct amdgpu_device *adev,
-                                  u64 *target_seq, bool intr,
-                                  long timeout);
 struct amdgpu_fence *amdgpu_fence_ref(struct amdgpu_fence *fence);
 void amdgpu_fence_unref(struct amdgpu_fence **fence);
 
@@ -1622,6 +1621,7 @@ struct amdgpu_vce {
        unsigned                fb_version;
        atomic_t                handles[AMDGPU_MAX_VCE_HANDLES];
        struct drm_file         *filp[AMDGPU_MAX_VCE_HANDLES];
+       uint32_t                img_size[AMDGPU_MAX_VCE_HANDLES];
        struct delayed_work     idle_work;
        const struct firmware   *fw;    /* VCE firmware */
        struct amdgpu_ring      ring[AMDGPU_MAX_VCE_RINGS];
drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
index 36d34e0afbc3a5cacb51d590af6506c62a7d58ee..f82a2dd83874dea20c7e7b2a6ddf8aee74e0fe1d 100644 (file)
@@ -30,6 +30,7 @@
 
 #include <drm/drmP.h>
 #include "amdgpu.h"
+#include "amdgpu_trace.h"
 
 static int amdgpu_bo_list_create(struct amdgpu_fpriv *fpriv,
                                 struct amdgpu_bo_list **result,
@@ -124,6 +125,8 @@ static int amdgpu_bo_list_set(struct amdgpu_device *adev,
                        gws_obj = entry->robj;
                if (entry->prefered_domains == AMDGPU_GEM_DOMAIN_OA)
                        oa_obj = entry->robj;
+
+               trace_amdgpu_bo_list_set(list, entry->robj);
        }
 
        for (i = 0; i < list->num_entries; ++i)
drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index f09b2cba40ca505649decf23a27b60850b62407a..d63135bf29c0c258f72025fa6f41f34677576ec4 100644 (file)
@@ -181,8 +181,6 @@ int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data)
                }
                p->chunks[i].chunk_id = user_chunk.chunk_id;
                p->chunks[i].length_dw = user_chunk.length_dw;
-               if (p->chunks[i].chunk_id == AMDGPU_CHUNK_ID_IB)
-                       p->num_ibs++;
 
                size = p->chunks[i].length_dw;
                cdata = (void __user *)(unsigned long)user_chunk.chunk_data;
@@ -199,7 +197,12 @@ int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data)
                        goto out;
                }
 
-               if (p->chunks[i].chunk_id == AMDGPU_CHUNK_ID_FENCE) {
+               switch (p->chunks[i].chunk_id) {
+               case AMDGPU_CHUNK_ID_IB:
+                       p->num_ibs++;
+                       break;
+
+               case AMDGPU_CHUNK_ID_FENCE:
                        size = sizeof(struct drm_amdgpu_cs_chunk_fence);
                        if (p->chunks[i].length_dw * sizeof(uint32_t) >= size) {
                                uint32_t handle;
@@ -221,6 +224,14 @@ int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data)
                                r = -EINVAL;
                                goto out;
                        }
+                       break;
+
+               case AMDGPU_CHUNK_ID_DEPENDENCIES:
+                       break;
+
+               default:
+                       r = -EINVAL;
+                       goto out;
                }
        }
 
@@ -445,8 +456,9 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error, bo
        for (i = 0; i < parser->nchunks; i++)
                drm_free_large(parser->chunks[i].kdata);
        kfree(parser->chunks);
-       for (i = 0; i < parser->num_ibs; i++)
-               amdgpu_ib_free(parser->adev, &parser->ibs[i]);
+       if (parser->ibs)
+               for (i = 0; i < parser->num_ibs; i++)
+                       amdgpu_ib_free(parser->adev, &parser->ibs[i]);
        kfree(parser->ibs);
        if (parser->uf.bo)
                drm_gem_object_unreference_unlocked(&parser->uf.bo->gem_base);
@@ -654,6 +666,55 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
        return 0;
 }
 
+static int amdgpu_cs_dependencies(struct amdgpu_device *adev,
+                                 struct amdgpu_cs_parser *p)
+{
+       struct amdgpu_ib *ib;
+       int i, j, r;
+
+       if (!p->num_ibs)
+               return 0;
+
+       /* Add dependencies to first IB */
+       ib = &p->ibs[0];
+       for (i = 0; i < p->nchunks; ++i) {
+               struct drm_amdgpu_cs_chunk_dep *deps;
+               struct amdgpu_cs_chunk *chunk;
+               unsigned num_deps;
+
+               chunk = &p->chunks[i];
+
+               if (chunk->chunk_id != AMDGPU_CHUNK_ID_DEPENDENCIES)
+                       continue;
+
+               deps = (struct drm_amdgpu_cs_chunk_dep *)chunk->kdata;
+               num_deps = chunk->length_dw * 4 /
+                       sizeof(struct drm_amdgpu_cs_chunk_dep);
+
+               for (j = 0; j < num_deps; ++j) {
+                       struct amdgpu_fence *fence;
+                       struct amdgpu_ring *ring;
+
+                       r = amdgpu_cs_get_ring(adev, deps[j].ip_type,
+                                              deps[j].ip_instance,
+                                              deps[j].ring, &ring);
+                       if (r)
+                               return r;
+
+                       r = amdgpu_fence_recreate(ring, p->filp,
+                                                 deps[j].handle,
+                                                 &fence);
+                       if (r)
+                               return r;
+
+                       amdgpu_sync_fence(&ib->sync, fence);
+                       amdgpu_fence_unref(&fence);
+               }
+       }
+
+       return 0;
+}
+
 int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
 {
        struct amdgpu_device *adev = dev->dev_private;
@@ -688,11 +749,16 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
                        else
                                DRM_ERROR("Failed to process the buffer list %d!\n", r);
                }
-       } else {
+       }
+
+       if (!r) {
                reserved_buffers = true;
                r = amdgpu_cs_ib_fill(adev, &parser);
        }
 
+       if (!r)
+               r = amdgpu_cs_dependencies(adev, &parser);
+
        if (r) {
                amdgpu_cs_parser_fini(&parser, r, reserved_buffers);
                up_read(&adev->exclusive_lock);
@@ -730,9 +796,9 @@ int amdgpu_cs_wait_ioctl(struct drm_device *dev, void *data,
 {
        union drm_amdgpu_wait_cs *wait = data;
        struct amdgpu_device *adev = dev->dev_private;
-       uint64_t seq[AMDGPU_MAX_RINGS] = {0};
-       struct amdgpu_ring *ring = NULL;
        unsigned long timeout = amdgpu_gem_timeout(wait->in.timeout);
+       struct amdgpu_fence *fence = NULL;
+       struct amdgpu_ring *ring = NULL;
        struct amdgpu_ctx *ctx;
        long r;
 
@@ -745,9 +811,12 @@ int amdgpu_cs_wait_ioctl(struct drm_device *dev, void *data,
        if (r)
                return r;
 
-       seq[ring->idx] = wait->in.handle;
+       r = amdgpu_fence_recreate(ring, filp, wait->in.handle, &fence);
+       if (r)
+               return r;
 
-       r = amdgpu_fence_wait_seq_timeout(adev, seq, true, timeout);
+       r = fence_wait_timeout(&fence->base, true, timeout);
+       amdgpu_fence_unref(&fence);
        amdgpu_ctx_put(ctx);
        if (r < 0)
                return r;
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index fec487d1c870ae27ea586ddda2def8b6c45272e4..ba46be361c9b2c9f40bf0acb751e65b156d4b171 100644 (file)
@@ -1191,7 +1191,9 @@ static int amdgpu_early_init(struct amdgpu_device *adev)
                return -EINVAL;
        }
 
-
+       adev->ip_block_enabled = kcalloc(adev->num_ip_blocks, sizeof(bool), GFP_KERNEL);
+       if (adev->ip_block_enabled == NULL)
+               return -ENOMEM;
 
        if (adev->ip_blocks == NULL) {
                DRM_ERROR("No IP blocks found!\n");
@@ -1575,8 +1577,7 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
        amdgpu_fence_driver_fini(adev);
        amdgpu_fbdev_fini(adev);
        r = amdgpu_fini(adev);
-       if (adev->ip_block_enabled)
-               kfree(adev->ip_block_enabled);
+       kfree(adev->ip_block_enabled);
        adev->ip_block_enabled = NULL;
        adev->accel_working = false;
        /* free i2c buses */
@@ -2000,4 +2001,10 @@ int amdgpu_debugfs_init(struct drm_minor *minor)
 void amdgpu_debugfs_cleanup(struct drm_minor *minor)
 {
 }
+#else
+static int amdgpu_debugfs_regs_init(struct amdgpu_device *adev)
+{
+       return 0;
+}
+static void amdgpu_debugfs_regs_cleanup(struct amdgpu_device *adev) { }
 #endif
drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 5c9918d01bf984b75e2fae95daaabf7d46cf6414..a7189a1fa6a17dc308075535c6ddc0fcf3403270 100644 (file)
@@ -135,6 +135,38 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, void *owner,
        return 0;
 }
 
+/**
+ * amdgpu_fence_recreate - recreate a fence from an user fence
+ *
+ * @ring: ring the fence is associated with
+ * @owner: creator of the fence
+ * @seq: user fence sequence number
+ * @fence: resulting amdgpu fence object
+ *
+ * Recreates a fence command from the user fence sequence number (all asics).
+ * Returns 0 on success, -ENOMEM on failure.
+ */
+int amdgpu_fence_recreate(struct amdgpu_ring *ring, void *owner,
+                         uint64_t seq, struct amdgpu_fence **fence)
+{
+       struct amdgpu_device *adev = ring->adev;
+
+       if (seq > ring->fence_drv.sync_seq[ring->idx])
+               return -EINVAL;
+
+       *fence = kmalloc(sizeof(struct amdgpu_fence), GFP_KERNEL);
+       if ((*fence) == NULL)
+               return -ENOMEM;
+
+       (*fence)->seq = seq;
+       (*fence)->ring = ring;
+       (*fence)->owner = owner;
+       fence_init(&(*fence)->base, &amdgpu_fence_ops,
+               &adev->fence_queue.lock, adev->fence_context + ring->idx,
+               (*fence)->seq);
+       return 0;
+}
+
 /**
  * amdgpu_fence_check_signaled - callback from fence_queue
  *
@@ -517,12 +549,14 @@ static bool amdgpu_fence_any_seq_signaled(struct amdgpu_device *adev, u64 *seq)
  * the wait timeout, or an error for all other cases.
  * -EDEADLK is returned when a GPU lockup has been detected.
  */
-long amdgpu_fence_wait_seq_timeout(struct amdgpu_device *adev, u64 *target_seq,
-                                  bool intr, long timeout)
+static long amdgpu_fence_wait_seq_timeout(struct amdgpu_device *adev,
+                                         u64 *target_seq, bool intr,
+                                         long timeout)
 {
        uint64_t last_seq[AMDGPU_MAX_RINGS];
        bool signaled;
-       int i, r;
+       int i;
+       long r;
 
        if (timeout == 0) {
                return amdgpu_fence_any_seq_signaled(adev, target_seq);
@@ -1023,7 +1057,7 @@ static int amdgpu_debugfs_fence_info(struct seq_file *m, void *data)
 
                amdgpu_fence_process(ring);
 
-               seq_printf(m, "--- ring %d ---\n", i);
+               seq_printf(m, "--- ring %d (%s) ---\n", i, ring->name);
                seq_printf(m, "Last signaled fence 0x%016llx\n",
                           (unsigned long long)atomic64_read(&ring->fence_drv.last_seq));
                seq_printf(m, "Last emitted        0x%016llx\n",
@@ -1031,7 +1065,8 @@ static int amdgpu_debugfs_fence_info(struct seq_file *m, void *data)
 
                for (j = 0; j < AMDGPU_MAX_RINGS; ++j) {
                        struct amdgpu_ring *other = adev->rings[j];
-                       if (i != j && other && other->fence_drv.initialized)
+                       if (i != j && other && other->fence_drv.initialized &&
+                           ring->fence_drv.sync_seq[j])
                                seq_printf(m, "Last sync to ring %d 0x%016llx\n",
                                           j, ring->fence_drv.sync_seq[j]);
                }
drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 0ec222295feeb50e2a0798d7c091067ff2c3975b..975edb1000a202e3dcd2b7cbe2cc2c8916aa855f 100644 (file)
@@ -496,7 +496,7 @@ static void amdgpu_gem_va_update_vm(struct amdgpu_device *adev,
 error_free:
        drm_free_large(vm_bos);
 
-       if (r)
+       if (r && r != -ERESTARTSYS)
                DRM_ERROR("Couldn't update BO_VA (%d)\n", r);
 }
 
@@ -525,8 +525,8 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
                return -EINVAL;
        }
 
-       invalid_flags = ~(AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE |
-                       AMDGPU_VM_PAGE_EXECUTABLE);
+       invalid_flags = ~(AMDGPU_VM_DELAY_UPDATE | AMDGPU_VM_PAGE_READABLE |
+                       AMDGPU_VM_PAGE_WRITEABLE | AMDGPU_VM_PAGE_EXECUTABLE);
        if ((args->flags & invalid_flags)) {
                dev_err(&dev->pdev->dev, "invalid flags 0x%08X vs 0x%08X\n",
                        args->flags, invalid_flags);
@@ -579,7 +579,7 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
                break;
        }
 
-       if (!r)
+       if (!r && !(args->flags & AMDGPU_VM_DELAY_UPDATE))
                amdgpu_gem_va_update_vm(adev, bo_va);
 
        drm_gem_object_unreference_unlocked(gobj);
index b56dd64bd4ea78fa4f10dec8ddeb5bdbde0ff3a8..961d7265c286524956e1100b14f084eb6e5341b0 100644 (file)
@@ -30,19 +30,21 @@ TRACE_EVENT(amdgpu_cs,
            TP_PROTO(struct amdgpu_cs_parser *p, int i),
            TP_ARGS(p, i),
            TP_STRUCT__entry(
+                            __field(struct amdgpu_bo_list *, bo_list)
                             __field(u32, ring)
                             __field(u32, dw)
                             __field(u32, fences)
                             ),
 
            TP_fast_assign(
+                          __entry->bo_list = p->bo_list;
                           __entry->ring = p->ibs[i].ring->idx;
                           __entry->dw = p->ibs[i].length_dw;
                           __entry->fences = amdgpu_fence_count_emitted(
                                p->ibs[i].ring);
                           ),
-           TP_printk("ring=%u, dw=%u, fences=%u",
-                     __entry->ring, __entry->dw,
+           TP_printk("bo_list=%p, ring=%u, dw=%u, fences=%u",
+                     __entry->bo_list, __entry->ring, __entry->dw,
                      __entry->fences)
 );
 
@@ -61,6 +63,54 @@ TRACE_EVENT(amdgpu_vm_grab_id,
            TP_printk("vmid=%u, ring=%u", __entry->vmid, __entry->ring)
 );
 
+TRACE_EVENT(amdgpu_vm_bo_map,
+           TP_PROTO(struct amdgpu_bo_va *bo_va,
+                    struct amdgpu_bo_va_mapping *mapping),
+           TP_ARGS(bo_va, mapping),
+           TP_STRUCT__entry(
+                            __field(struct amdgpu_bo *, bo)
+                            __field(long, start)
+                            __field(long, last)
+                            __field(u64, offset)
+                            __field(u32, flags)
+                            ),
+
+           TP_fast_assign(
+                          __entry->bo = bo_va->bo;
+                          __entry->start = mapping->it.start;
+                          __entry->last = mapping->it.last;
+                          __entry->offset = mapping->offset;
+                          __entry->flags = mapping->flags;
+                          ),
+           TP_printk("bo=%p, start=%lx, last=%lx, offset=%010llx, flags=%08x",
+                     __entry->bo, __entry->start, __entry->last,
+                     __entry->offset, __entry->flags)
+);
+
+TRACE_EVENT(amdgpu_vm_bo_unmap,
+           TP_PROTO(struct amdgpu_bo_va *bo_va,
+                    struct amdgpu_bo_va_mapping *mapping),
+           TP_ARGS(bo_va, mapping),
+           TP_STRUCT__entry(
+                            __field(struct amdgpu_bo *, bo)
+                            __field(long, start)
+                            __field(long, last)
+                            __field(u64, offset)
+                            __field(u32, flags)
+                            ),
+
+           TP_fast_assign(
+                          __entry->bo = bo_va->bo;
+                          __entry->start = mapping->it.start;
+                          __entry->last = mapping->it.last;
+                          __entry->offset = mapping->offset;
+                          __entry->flags = mapping->flags;
+                          ),
+           TP_printk("bo=%p, start=%lx, last=%lx, offset=%010llx, flags=%08x",
+                     __entry->bo, __entry->start, __entry->last,
+                     __entry->offset, __entry->flags)
+);
+
 TRACE_EVENT(amdgpu_vm_bo_update,
            TP_PROTO(struct amdgpu_bo_va_mapping *mapping),
            TP_ARGS(mapping),
@@ -121,6 +171,21 @@ TRACE_EVENT(amdgpu_vm_flush,
                      __entry->pd_addr, __entry->ring, __entry->id)
 );
 
+TRACE_EVENT(amdgpu_bo_list_set,
+           TP_PROTO(struct amdgpu_bo_list *list, struct amdgpu_bo *bo),
+           TP_ARGS(list, bo),
+           TP_STRUCT__entry(
+                            __field(struct amdgpu_bo_list *, list)
+                            __field(struct amdgpu_bo *, bo)
+                            ),
+
+           TP_fast_assign(
+                          __entry->list = list;
+                          __entry->bo = bo;
+                          ),
+           TP_printk("list=%p, bo=%p", __entry->list, __entry->bo)
+);
+
 DECLARE_EVENT_CLASS(amdgpu_fence_request,
 
            TP_PROTO(struct drm_device *dev, int ring, u32 seqno),
index d3706a4982933a35d09e36ec946f3bed959430fc..dd3415d2e45dcbb2f3cba5fa1ca6688ef779cfd5 100644 (file)
@@ -674,7 +674,7 @@ static int amdgpu_ttm_tt_populate(struct ttm_tt *ttm)
                return 0;
 
        if (gtt && gtt->userptr) {
-               ttm->sg = kcalloc(1, sizeof(struct sg_table), GFP_KERNEL);
+               ttm->sg = kzalloc(sizeof(struct sg_table), GFP_KERNEL);
                if (!ttm->sg)
                        return -ENOMEM;
 
index 1127a504f11854f421ee2e202a96472094745ad4..d3ca73090e39d94f8eaf0762dcb22b4209a07712 100644 (file)
@@ -464,28 +464,42 @@ int amdgpu_vce_get_destroy_msg(struct amdgpu_ring *ring, uint32_t handle,
  * @p: parser context
  * @lo: address of lower dword
  * @hi: address of higher dword
+ * @size: minimum required buffer size in bytes
  *
  * Patch relocation inside command stream with real buffer address
  */
-int amdgpu_vce_cs_reloc(struct amdgpu_cs_parser *p, uint32_t ib_idx, int lo, int hi)
+static int amdgpu_vce_cs_reloc(struct amdgpu_cs_parser *p, uint32_t ib_idx,
+                              int lo, int hi, unsigned size, uint32_t index)
 {
        struct amdgpu_bo_va_mapping *mapping;
        struct amdgpu_ib *ib = &p->ibs[ib_idx];
        struct amdgpu_bo *bo;
        uint64_t addr;
 
+       if (index == 0xffffffff)
+               index = 0;
+
        addr = ((uint64_t)amdgpu_get_ib_value(p, ib_idx, lo)) |
               ((uint64_t)amdgpu_get_ib_value(p, ib_idx, hi)) << 32;
+       addr += ((uint64_t)size) * ((uint64_t)index);
 
        mapping = amdgpu_cs_find_mapping(p, addr, &bo);
        if (mapping == NULL) {
-               DRM_ERROR("Can't find BO for addr 0x%010Lx %d %d\n",
+               DRM_ERROR("Can't find BO for addr 0x%010Lx %d %d %d %d\n",
+                         addr, lo, hi, size, index);
+               return -EINVAL;
+       }
+
+       if ((addr + (uint64_t)size) >
+           ((uint64_t)mapping->it.last + 1) * AMDGPU_GPU_PAGE_SIZE) {
+               DRM_ERROR("BO too small for addr 0x%010Lx %d %d\n",
                          addr, lo, hi);
                return -EINVAL;
        }
 
        addr -= ((uint64_t)mapping->it.start) * AMDGPU_GPU_PAGE_SIZE;
        addr += amdgpu_bo_gpu_offset(bo);
+       addr -= ((uint64_t)size) * ((uint64_t)index);
 
        ib->ptr[lo] = addr & 0xFFFFFFFF;
        ib->ptr[hi] = addr >> 32;
@@ -493,6 +507,48 @@ int amdgpu_vce_cs_reloc(struct amdgpu_cs_parser *p, uint32_t ib_idx, int lo, int
        return 0;
 }
 
+/**
+ * amdgpu_vce_validate_handle - validate stream handle
+ *
+ * @p: parser context
+ * @handle: handle to validate
+ * @allocated: allocated a new handle?
+ *
+ * Validates the handle and returns the found session index, or -EINVAL
+ * if we don't have another free session index.
+ */
+static int amdgpu_vce_validate_handle(struct amdgpu_cs_parser *p,
+                                     uint32_t handle, bool *allocated)
+{
+       unsigned i;
+
+       *allocated = false;
+
+       /* validate the handle */
+       for (i = 0; i < AMDGPU_MAX_VCE_HANDLES; ++i) {
+               if (atomic_read(&p->adev->vce.handles[i]) == handle) {
+                       if (p->adev->vce.filp[i] != p->filp) {
+                               DRM_ERROR("VCE handle collision detected!\n");
+                               return -EINVAL;
+                       }
+                       return i;
+               }
+       }
+
+       /* handle not found try to alloc a new one */
+       for (i = 0; i < AMDGPU_MAX_VCE_HANDLES; ++i) {
+               if (!atomic_cmpxchg(&p->adev->vce.handles[i], 0, handle)) {
+                       p->adev->vce.filp[i] = p->filp;
+                       p->adev->vce.img_size[i] = 0;
+                       *allocated = true;
+                       return i;
+               }
+       }
+
+       DRM_ERROR("No more free VCE handles!\n");
+       return -EINVAL;
+}
+
 /**
  * amdgpu_vce_cs_parse - parse and validate the command stream
  *
@@ -501,10 +557,15 @@ int amdgpu_vce_cs_reloc(struct amdgpu_cs_parser *p, uint32_t ib_idx, int lo, int
  */
 int amdgpu_vce_ring_parse_cs(struct amdgpu_cs_parser *p, uint32_t ib_idx)
 {
-       uint32_t handle = 0;
-       bool destroy = false;
-       int i, r, idx = 0;
        struct amdgpu_ib *ib = &p->ibs[ib_idx];
+       unsigned fb_idx = 0, bs_idx = 0;
+       int session_idx = -1;
+       bool destroyed = false;
+       bool created = false;
+       bool allocated = false;
+       uint32_t tmp, handle = 0;
+       uint32_t *size = &tmp;
+       int i, r = 0, idx = 0;
 
        amdgpu_vce_note_usage(p->adev);
 
@@ -514,16 +575,44 @@ int amdgpu_vce_ring_parse_cs(struct amdgpu_cs_parser *p, uint32_t ib_idx)
 
                if ((len < 8) || (len & 3)) {
                        DRM_ERROR("invalid VCE command length (%d)!\n", len);
-                       return -EINVAL;
+                       r = -EINVAL;
+                       goto out;
+               }
+
+               if (destroyed) {
+                       DRM_ERROR("No other command allowed after destroy!\n");
+                       r = -EINVAL;
+                       goto out;
                }
 
                switch (cmd) {
                case 0x00000001: // session
                        handle = amdgpu_get_ib_value(p, ib_idx, idx + 2);
+                       session_idx = amdgpu_vce_validate_handle(p, handle,
+                                                                &allocated);
+                       if (session_idx < 0)
+                               return session_idx;
+                       size = &p->adev->vce.img_size[session_idx];
                        break;
 
                case 0x00000002: // task info
+                       fb_idx = amdgpu_get_ib_value(p, ib_idx, idx + 6);
+                       bs_idx = amdgpu_get_ib_value(p, ib_idx, idx + 7);
+                       break;
+
                case 0x01000001: // create
+                       created = true;
+                       if (!allocated) {
+                               DRM_ERROR("Handle already in use!\n");
+                               r = -EINVAL;
+                               goto out;
+                       }
+
+                       *size = amdgpu_get_ib_value(p, ib_idx, idx + 8) *
+                               amdgpu_get_ib_value(p, ib_idx, idx + 10) *
+                               8 * 3 / 2;
+                       break;
+
                case 0x04000001: // config extension
                case 0x04000002: // pic control
                case 0x04000005: // rate control
@@ -534,60 +623,74 @@ int amdgpu_vce_ring_parse_cs(struct amdgpu_cs_parser *p, uint32_t ib_idx)
                        break;
 
                case 0x03000001: // encode
-                       r = amdgpu_vce_cs_reloc(p, ib_idx, idx + 10, idx + 9);
+                       r = amdgpu_vce_cs_reloc(p, ib_idx, idx + 10, idx + 9,
+                                               *size, 0);
                        if (r)
-                               return r;
+                               goto out;
 
-                       r = amdgpu_vce_cs_reloc(p, ib_idx, idx + 12, idx + 11);
+                       r = amdgpu_vce_cs_reloc(p, ib_idx, idx + 12, idx + 11,
+                                               *size / 3, 0);
                        if (r)
-                               return r;
+                               goto out;
                        break;
 
                case 0x02000001: // destroy
-                       destroy = true;
+                       destroyed = true;
                        break;
 
                case 0x05000001: // context buffer
+                       r = amdgpu_vce_cs_reloc(p, ib_idx, idx + 3, idx + 2,
+                                               *size * 2, 0);
+                       if (r)
+                               goto out;
+                       break;
+
                case 0x05000004: // video bitstream buffer
+                       tmp = amdgpu_get_ib_value(p, ib_idx, idx + 4);
+                       r = amdgpu_vce_cs_reloc(p, ib_idx, idx + 3, idx + 2,
+                                               tmp, bs_idx);
+                       if (r)
+                               goto out;
+                       break;
+
                case 0x05000005: // feedback buffer
-                       r = amdgpu_vce_cs_reloc(p, ib_idx, idx + 3, idx + 2);
+                       r = amdgpu_vce_cs_reloc(p, ib_idx, idx + 3, idx + 2,
+                                               4096, fb_idx);
                        if (r)
-                               return r;
+                               goto out;
                        break;
 
                default:
                        DRM_ERROR("invalid VCE command (0x%x)!\n", cmd);
-                       return -EINVAL;
+                       r = -EINVAL;
+                       goto out;
                }
 
-               idx += len / 4;
-       }
-
-       if (destroy) {
-               /* IB contains a destroy msg, free the handle */
-               for (i = 0; i < AMDGPU_MAX_VCE_HANDLES; ++i)
-                       atomic_cmpxchg(&p->adev->vce.handles[i], handle, 0);
+               if (session_idx == -1) {
+                       DRM_ERROR("no session command at start of IB\n");
+                       r = -EINVAL;
+                       goto out;
+               }
 
-               return 0;
+               idx += len / 4;
        }
 
-       /* create or encode, validate the handle */
-       for (i = 0; i < AMDGPU_MAX_VCE_HANDLES; ++i) {
-               if (atomic_read(&p->adev->vce.handles[i]) == handle)
-                       return 0;
+       if (allocated && !created) {
+               DRM_ERROR("New session without create command!\n");
+               r = -ENOENT;
        }
 
-       /* handle not found try to alloc a new one */
-       for (i = 0; i < AMDGPU_MAX_VCE_HANDLES; ++i) {
-               if (!atomic_cmpxchg(&p->adev->vce.handles[i], 0, handle)) {
-                       p->adev->vce.filp[i] = p->filp;
-                       return 0;
-               }
+out:
+       if ((!r && destroyed) || (r && allocated)) {
+               /*
+                * IB contains a destroy msg or we have allocated a
+                * handle and got an error; either way, free the handle
+                */
+               for (i = 0; i < AMDGPU_MAX_VCE_HANDLES; ++i)
+                       atomic_cmpxchg(&p->adev->vce.handles[i], handle, 0);
        }
 
-       DRM_ERROR("No more free VCE handles!\n");
-
-       return -EINVAL;
+       return r;
 }
 
 /**
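The new amdgpu_vce_validate_handle() above uses a two-pass scheme: first scan the table for a slot already owning the handle, then claim a free slot with an atomic compare-and-swap. A minimal userspace sketch of that lookup-then-cmpxchg pattern follows; the table size and all names (validate_handle, MAX_HANDLES) are illustrative assumptions, not the driver's API.

/*
 * Sketch of the two-pass handle table used by amdgpu_vce_validate_handle();
 * slot value 0 means "free", any other value is a registered handle.
 */
#include <stdatomic.h>
#include <stdio.h>

#define MAX_HANDLES 16

static atomic_uint handles[MAX_HANDLES];

/* Return the slot owning @handle, claiming a free slot if needed, or -1. */
static int validate_handle(unsigned int handle, int *allocated)
{
	int i;

	*allocated = 0;

	/* First pass: is the handle already registered? */
	for (i = 0; i < MAX_HANDLES; i++)
		if (atomic_load(&handles[i]) == handle)
			return i;

	/* Second pass: atomically claim an empty slot. */
	for (i = 0; i < MAX_HANDLES; i++) {
		unsigned int expected = 0;

		if (atomic_compare_exchange_strong(&handles[i], &expected, handle)) {
			*allocated = 1;
			return i;
		}
	}

	return -1;	/* table full */
}

int main(void)
{
	int allocated;
	int idx = validate_handle(0x1234, &allocated);

	printf("handle 0x1234 -> slot %d (newly allocated: %d)\n", idx, allocated);
	return 0;
}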
index b6a9d0956c6060befa3bd0b8a9b7dc69db5568c2..7ccdb5927da5ce4bcc7f1db1009c5b5297bca29b 100644 (file)
@@ -33,7 +33,6 @@ int amdgpu_vce_get_create_msg(struct amdgpu_ring *ring, uint32_t handle,
 int amdgpu_vce_get_destroy_msg(struct amdgpu_ring *ring, uint32_t handle,
                               struct amdgpu_fence **fence);
 void amdgpu_vce_free_handles(struct amdgpu_device *adev, struct drm_file *filp);
-int amdgpu_vce_cs_reloc(struct amdgpu_cs_parser *p, uint32_t ib_idx, int lo, int hi);
 int amdgpu_vce_ring_parse_cs(struct amdgpu_cs_parser *p, uint32_t ib_idx);
 bool amdgpu_vce_ring_emit_semaphore(struct amdgpu_ring *ring,
                                    struct amdgpu_semaphore *semaphore,
index 407882b233c7952c99ed3128fbd9c7aa2adcab58..9a4e3b63f1cb4bf7ca9c73e813a0568f320c6574 100644 (file)
@@ -1001,6 +1001,7 @@ int amdgpu_vm_bo_map(struct amdgpu_device *adev,
 
        list_add(&mapping->list, &bo_va->mappings);
        interval_tree_insert(&mapping->it, &vm->va);
+       trace_amdgpu_vm_bo_map(bo_va, mapping);
 
        bo_va->addr = 0;
 
@@ -1058,6 +1059,7 @@ int amdgpu_vm_bo_map(struct amdgpu_device *adev,
        mutex_lock(&vm->mutex);
        list_del(&mapping->list);
        interval_tree_remove(&mapping->it, &vm->va);
+       trace_amdgpu_vm_bo_unmap(bo_va, mapping);
        kfree(mapping);
 
 error_unlock:
@@ -1099,6 +1101,7 @@ int amdgpu_vm_bo_unmap(struct amdgpu_device *adev,
        mutex_lock(&vm->mutex);
        list_del(&mapping->list);
        interval_tree_remove(&mapping->it, &vm->va);
+       trace_amdgpu_vm_bo_unmap(bo_va, mapping);
 
        if (bo_va->addr) {
                /* clear the old address */
@@ -1139,6 +1142,7 @@ void amdgpu_vm_bo_rmv(struct amdgpu_device *adev,
        list_for_each_entry_safe(mapping, next, &bo_va->mappings, list) {
                list_del(&mapping->list);
                interval_tree_remove(&mapping->it, &vm->va);
+               trace_amdgpu_vm_bo_unmap(bo_va, mapping);
                if (bo_va->addr)
                        list_add(&mapping->list, &vm->freed);
                else
index 5dab578d6462ab2949e005b9996612e09f5f1fa5..341c566818419317a0c3d16a3d5b738840e30b46 100644 (file)
@@ -2256,10 +2256,6 @@ int cik_set_ip_blocks(struct amdgpu_device *adev)
                return -EINVAL;
        }
 
-       adev->ip_block_enabled = kcalloc(adev->num_ip_blocks, sizeof(bool), GFP_KERNEL);
-       if (adev->ip_block_enabled == NULL)
-               return -ENOMEM;
-
        return 0;
 }
 
index 220865a44814a59a1934b4de16a87d1d797a3541..d19085a9706489a00a0e13306af0d6275587b88c 100644 (file)
 #define VCE_CMD_IB_AUTO                0x00000005
 #define VCE_CMD_SEMAPHORE      0x00000006
 
+/* valid for both DEFAULT_MTYPE and APE1_MTYPE */
+enum {
+       MTYPE_CACHED = 0,
+       MTYPE_NONCACHED = 3
+};
+
 #endif
index e4936a452bc6981e91dfc4bfa1c8dc4202992a13..f75a31df30bdb704f93e5dd465a3a74d93b524d8 100644 (file)
@@ -425,7 +425,7 @@ static int cz_dpm_init(struct amdgpu_device *adev)
        pi->mgcg_cgtt_local1 = 0x0;
        pi->clock_slow_down_step = 25000;
        pi->skip_clock_slow_down = 1;
-       pi->enable_nb_ps_policy = 1;
+       pi->enable_nb_ps_policy = 0;
        pi->caps_power_containment = true;
        pi->caps_cac = true;
        pi->didt_enabled = false;
index 782a74107664df05a7d7abff6e503d80389fc275..99e1afc896294c90c13f7e3ed0cc2ceff2aae1d7 100644 (file)
@@ -46,7 +46,7 @@
 
 /* Do not change the following, it is also defined in SMU8.h */
 #define SMU_EnabledFeatureScoreboard_AcpDpmOn          0x00000001
-#define SMU_EnabledFeatureScoreboard_SclkDpmOn         0x00100000
+#define SMU_EnabledFeatureScoreboard_SclkDpmOn         0x00200000
 #define SMU_EnabledFeatureScoreboard_UvdDpmOn          0x00800000
 #define SMU_EnabledFeatureScoreboard_VceDpmOn          0x01000000
 
index 72c27ac915f2a8681f35db4d15e232918405eb95..aaca8d663f2c60e97921e0c06a69a1c7a4549322 100644 (file)
@@ -3379,7 +3379,7 @@ static int dce_v8_0_hpd_irq(struct amdgpu_device *adev,
        uint32_t disp_int, mask, int_control, tmp;
        unsigned hpd;
 
-       if (entry->src_data > 6) {
+       if (entry->src_data >= adev->mode_info.num_hpd) {
                DRM_DEBUG("Unhandled interrupt: %d %d\n", entry->src_id, entry->src_data);
                return 0;
        }
index cb7907447b81dd3696312ce65dc58606c52a9ab4..2c188fb9fd22ff1a3528673beb8866639d5ef631 100644 (file)
@@ -2009,6 +2009,46 @@ static void gfx_v7_0_setup_rb(struct amdgpu_device *adev,
        mutex_unlock(&adev->grbm_idx_mutex);
 }
 
+/**
+ * gmc_v7_0_init_compute_vmid - initialize compute vmid sh_mem registers
+ *
+ * @adev: amdgpu_device pointer
+ *
+ * Set up the SH_MEM aperture registers for the compute VMIDs.
+ *
+ */
+#define DEFAULT_SH_MEM_BASES   (0x6000)
+#define FIRST_COMPUTE_VMID     (8)
+#define LAST_COMPUTE_VMID      (16)
+static void gmc_v7_0_init_compute_vmid(struct amdgpu_device *adev)
+{
+       int i;
+       uint32_t sh_mem_config;
+       uint32_t sh_mem_bases;
+
+       /*
+        * Configure apertures:
+        * LDS:         0x60000000'00000000 - 0x60000001'00000000 (4GB)
+        * Scratch:     0x60000001'00000000 - 0x60000002'00000000 (4GB)
+        * GPUVM:       0x60010000'00000000 - 0x60020000'00000000 (1TB)
+       */
+       sh_mem_bases = DEFAULT_SH_MEM_BASES | (DEFAULT_SH_MEM_BASES << 16);
+       sh_mem_config = SH_MEM_ALIGNMENT_MODE_UNALIGNED <<
+                       SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT;
+       sh_mem_config |= MTYPE_NONCACHED << SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT;
+       mutex_lock(&adev->srbm_mutex);
+       for (i = FIRST_COMPUTE_VMID; i < LAST_COMPUTE_VMID; i++) {
+               cik_srbm_select(adev, 0, 0, 0, i);
+               /* CP and shaders */
+               WREG32(mmSH_MEM_CONFIG, sh_mem_config);
+               WREG32(mmSH_MEM_APE1_BASE, 1);
+               WREG32(mmSH_MEM_APE1_LIMIT, 0);
+               WREG32(mmSH_MEM_BASES, sh_mem_bases);
+       }
+       cik_srbm_select(adev, 0, 0, 0, 0);
+       mutex_unlock(&adev->srbm_mutex);
+}
+
 /**
  * gfx_v7_0_gpu_init - setup the 3D engine
  *
@@ -2230,6 +2270,8 @@ static void gfx_v7_0_gpu_init(struct amdgpu_device *adev)
        cik_srbm_select(adev, 0, 0, 0, 0);
        mutex_unlock(&adev->srbm_mutex);
 
+       gmc_v7_0_init_compute_vmid(adev);
+
        WREG32(mmSX_DEBUG_1, 0x20);
 
        WREG32(mmTA_CNTL_AUX, 0x00010000);
index 14242bd33363d3e26b368a1913bb73a2bf931ba9..7b683fb2173c728fff760c926f1204b3897b4eae 100644 (file)
@@ -1894,6 +1894,51 @@ static void gfx_v8_0_setup_rb(struct amdgpu_device *adev,
        mutex_unlock(&adev->grbm_idx_mutex);
 }
 
+/**
+ * gmc_v8_0_init_compute_vmid - initialize compute vmid sh_mem registers
+ *
+ * @adev: amdgpu_device pointer
+ *
+ * Set up the SH_MEM aperture registers for the compute VMIDs.
+ *
+ */
+#define DEFAULT_SH_MEM_BASES   (0x6000)
+#define FIRST_COMPUTE_VMID     (8)
+#define LAST_COMPUTE_VMID      (16)
+static void gmc_v8_0_init_compute_vmid(struct amdgpu_device *adev)
+{
+       int i;
+       uint32_t sh_mem_config;
+       uint32_t sh_mem_bases;
+
+       /*
+        * Configure apertures:
+        * LDS:         0x60000000'00000000 - 0x60000001'00000000 (4GB)
+        * Scratch:     0x60000001'00000000 - 0x60000002'00000000 (4GB)
+        * GPUVM:       0x60010000'00000000 - 0x60020000'00000000 (1TB)
+        */
+       sh_mem_bases = DEFAULT_SH_MEM_BASES | (DEFAULT_SH_MEM_BASES << 16);
+
+       sh_mem_config = SH_MEM_ADDRESS_MODE_HSA64 <<
+                       SH_MEM_CONFIG__ADDRESS_MODE__SHIFT |
+                       SH_MEM_ALIGNMENT_MODE_UNALIGNED <<
+                       SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT |
+                       MTYPE_CC << SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT |
+                       SH_MEM_CONFIG__PRIVATE_ATC_MASK;
+
+       mutex_lock(&adev->srbm_mutex);
+       for (i = FIRST_COMPUTE_VMID; i < LAST_COMPUTE_VMID; i++) {
+               vi_srbm_select(adev, 0, 0, 0, i);
+               /* CP and shaders */
+               WREG32(mmSH_MEM_CONFIG, sh_mem_config);
+               WREG32(mmSH_MEM_APE1_BASE, 1);
+               WREG32(mmSH_MEM_APE1_LIMIT, 0);
+               WREG32(mmSH_MEM_BASES, sh_mem_bases);
+       }
+       vi_srbm_select(adev, 0, 0, 0, 0);
+       mutex_unlock(&adev->srbm_mutex);
+}
+
 static void gfx_v8_0_gpu_init(struct amdgpu_device *adev)
 {
        u32 gb_addr_config;
@@ -2113,6 +2158,8 @@ static void gfx_v8_0_gpu_init(struct amdgpu_device *adev)
        vi_srbm_select(adev, 0, 0, 0, 0);
        mutex_unlock(&adev->srbm_mutex);
 
+       gmc_v8_0_init_compute_vmid(adev);
+
        mutex_lock(&adev->grbm_idx_mutex);
        /*
         * making sure that the following register writes will be broadcasted
@@ -3081,7 +3128,7 @@ static int gfx_v8_0_cp_compute_resume(struct amdgpu_device *adev)
                                WREG32(mmCP_MEC_DOORBELL_RANGE_LOWER,
                                       AMDGPU_DOORBELL_KIQ << 2);
                                WREG32(mmCP_MEC_DOORBELL_RANGE_UPPER,
-                                      AMDGPU_DOORBELL_MEC_RING7 << 2);
+                                               0x7FFFF << 2);
                        }
                        tmp = RREG32(mmCP_HQD_PQ_DOORBELL_CONTROL);
                        tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_DOORBELL_CONTROL,
@@ -3097,6 +3144,12 @@ static int gfx_v8_0_cp_compute_resume(struct amdgpu_device *adev)
                WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL,
                       mqd->cp_hqd_pq_doorbell_control);
 
+               /* reset read and write pointers, similar to CP_RB0_WPTR/_RPTR */
+               ring->wptr = 0;
+               mqd->cp_hqd_pq_wptr = ring->wptr;
+               WREG32(mmCP_HQD_PQ_WPTR, mqd->cp_hqd_pq_wptr);
+               mqd->cp_hqd_pq_rptr = RREG32(mmCP_HQD_PQ_RPTR);
+
                /* set the vmid for the queue */
                mqd->cp_hqd_vmid = 0;
                WREG32(mmCP_HQD_VMID, mqd->cp_hqd_vmid);
index e3c1fde753638de09e9465464bb47339165aa62e..7bb37b93993fb5312eb2d46189bf09bf789c3989 100644 (file)
@@ -438,6 +438,31 @@ static void sdma_v3_0_rlc_stop(struct amdgpu_device *adev)
        /* XXX todo */
 }
 
+/**
+ * sdma_v3_0_ctx_switch_enable - enable/disable the async dma engines context switch
+ *
+ * @adev: amdgpu_device pointer
+ * @enable: enable/disable the DMA MEs context switch.
+ *
+ * Enable or disable automatic context switching for the async dma engines (VI).
+ */
+static void sdma_v3_0_ctx_switch_enable(struct amdgpu_device *adev, bool enable)
+{
+       u32 f32_cntl;
+       int i;
+
+       for (i = 0; i < SDMA_MAX_INSTANCE; i++) {
+               f32_cntl = RREG32(mmSDMA0_CNTL + sdma_offsets[i]);
+               if (enable)
+                       f32_cntl = REG_SET_FIELD(f32_cntl, SDMA0_CNTL,
+                                       AUTO_CTXSW_ENABLE, 1);
+               else
+                       f32_cntl = REG_SET_FIELD(f32_cntl, SDMA0_CNTL,
+                                       AUTO_CTXSW_ENABLE, 0);
+               WREG32(mmSDMA0_CNTL + sdma_offsets[i], f32_cntl);
+       }
+}
+
 /**
  * sdma_v3_0_enable - stop the async dma engines
  *
@@ -648,6 +673,8 @@ static int sdma_v3_0_start(struct amdgpu_device *adev)
 
        /* unhalt the MEs */
        sdma_v3_0_enable(adev, true);
+       /* enable sdma ring preemption */
+       sdma_v3_0_ctx_switch_enable(adev, true);
 
        /* start the gfx rings and rlc compute queues */
        r = sdma_v3_0_gfx_resume(adev);
@@ -1079,6 +1106,7 @@ static int sdma_v3_0_hw_fini(void *handle)
 {
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+       sdma_v3_0_ctx_switch_enable(adev, false);
        sdma_v3_0_enable(adev, false);
 
        return 0;
index 90fc93c2c1d04571e4a694af23e5a7af51267a0a..fa5a4448531dfe9dd307d88b55e051761821a28d 100644 (file)
@@ -1189,10 +1189,6 @@ int vi_set_ip_blocks(struct amdgpu_device *adev)
                return -EINVAL;
        }
 
-       adev->ip_block_enabled = kcalloc(adev->num_ip_blocks, sizeof(bool), GFP_KERNEL);
-       if (adev->ip_block_enabled == NULL)
-               return -ENOMEM;
-
        return 0;
 }
 
index 619dad1b23863716972a23b12226b16af533a068..9daa2883ac186f73c64baa40ba61aadfd0a94c9a 100644 (file)
@@ -516,17 +516,17 @@ static void gen8_ppgtt_clear_range(struct i915_address_space *vm,
                struct page *page_table;
 
                if (WARN_ON(!ppgtt->pdp.page_directory[pdpe]))
-                       continue;
+                       break;
 
                pd = ppgtt->pdp.page_directory[pdpe];
 
                if (WARN_ON(!pd->page_table[pde]))
-                       continue;
+                       break;
 
                pt = pd->page_table[pde];
 
                if (WARN_ON(!pt->page))
-                       continue;
+                       break;
 
                page_table = pt->page;
 
index f5edb3504167ec09b5165d6f54aa760a85f864d1..2030f602cbf8b74366bcb78f9f2ddc4a5f0dd9c6 100644 (file)
@@ -3491,6 +3491,7 @@ enum skl_disp_power_wells {
 #define   BLM_POLARITY_PNV                     (1 << 0) /* pnv only */
 
 #define BLC_HIST_CTL   (dev_priv->info.display_mmio_offset + 0x61260)
+#define  BLM_HISTOGRAM_ENABLE                  (1 << 31)
 
 /* New registers for PCH-split platforms. Safe where new bits show up, the
  * register layout machtes with gen4 BLC_PWM_CTL[12]. */
index dcb1d25d6f051ee88ae84ba7eee7e5fe4f4a2c76..1b61f98103870171e75338595b32e5dd473523e0 100644 (file)
@@ -13303,6 +13303,16 @@ intel_check_primary_plane(struct drm_plane *plane,
                                intel_crtc->atomic.wait_vblank = true;
                }
 
+               /*
+                * FIXME: if any other plane is still enabled on the pipe we
+                * could leave IPS enabled, but for now assume that when the
+                * primary plane is made invisible by setting DSPCNTR to 0 in
+                * the update_primary_plane() path, IPS needs to be
+                * disabled.
+                */
+               if (!state->visible || !fb)
+                       intel_crtc->atomic.disable_ips = true;
+
                intel_crtc->atomic.fb_bits |=
                        INTEL_FRONTBUFFER_PRIMARY(intel_crtc->pipe);
 
@@ -13400,6 +13410,9 @@ static void intel_begin_crtc_commit(struct drm_crtc *crtc)
        if (intel_crtc->atomic.disable_fbc)
                intel_fbc_disable(dev);
 
+       if (intel_crtc->atomic.disable_ips)
+               hsw_disable_ips(intel_crtc);
+
        if (intel_crtc->atomic.pre_disable_primary)
                intel_pre_disable_primary(crtc);
 
index 76afc62373d75bc8c6d9349acd42a38cc71a5a02..6e8faa25379240cab60f57631adf42612f28df33 100644 (file)
@@ -1140,6 +1140,9 @@ skl_edp_set_pll_config(struct intel_crtc_state *pipe_config, int link_clock)
 static void
 hsw_dp_set_ddi_pll_sel(struct intel_crtc_state *pipe_config, int link_bw)
 {
+       memset(&pipe_config->dpll_hw_state, 0,
+              sizeof(pipe_config->dpll_hw_state));
+
        switch (link_bw) {
        case DP_LINK_BW_1_62:
                pipe_config->ddi_pll_sel = PORT_CLK_SEL_LCPLL_810;
index 2afb31a4627573a3f97d0a4530231f27451e0793..105928382e216239043faf4e651d10520c92b678 100644 (file)
@@ -485,6 +485,7 @@ struct intel_crtc_atomic_commit {
        /* Sleepable operations to perform before commit */
        bool wait_for_flips;
        bool disable_fbc;
+       bool disable_ips;
        bool pre_disable_primary;
        bool update_wm;
        unsigned disabled_planes;
index 7d83527f95f797fd3e020227753923c6b029d41b..55aad2322e10ec8e7ea168aa84e0b7d98dcf9b91 100644 (file)
@@ -907,6 +907,14 @@ static void i9xx_enable_backlight(struct intel_connector *connector)
 
        /* XXX: combine this into above write? */
        intel_panel_actually_set_backlight(connector, panel->backlight.level);
+
+       /*
+        * Needed to enable backlight on some 855gm models. BLC_HIST_CTL is
+        * 855gm only, but checking for gen2 is safe, as 855gm is the only gen2
+        * that has backlight.
+        */
+       if (IS_GEN2(dev))
+               I915_WRITE(BLC_HIST_CTL, BLM_HISTOGRAM_ENABLE);
 }
 
 static void i965_enable_backlight(struct intel_connector *connector)
index b0688b0c8908f5ba0704657f6aa39d1b47508a54..4ecf5caa8c6d9745f9421710179aa2f81ef3587f 100644 (file)
@@ -4604,6 +4604,31 @@ void cik_compute_set_wptr(struct radeon_device *rdev,
        WDOORBELL32(ring->doorbell_index, ring->wptr);
 }
 
+static void cik_compute_stop(struct radeon_device *rdev,
+                            struct radeon_ring *ring)
+{
+       u32 j, tmp;
+
+       cik_srbm_select(rdev, ring->me, ring->pipe, ring->queue, 0);
+       /* Disable wptr polling. */
+       tmp = RREG32(CP_PQ_WPTR_POLL_CNTL);
+       tmp &= ~WPTR_POLL_EN;
+       WREG32(CP_PQ_WPTR_POLL_CNTL, tmp);
+       /* Disable HQD. */
+       if (RREG32(CP_HQD_ACTIVE) & 1) {
+               WREG32(CP_HQD_DEQUEUE_REQUEST, 1);
+               for (j = 0; j < rdev->usec_timeout; j++) {
+                       if (!(RREG32(CP_HQD_ACTIVE) & 1))
+                               break;
+                       udelay(1);
+               }
+               WREG32(CP_HQD_DEQUEUE_REQUEST, 0);
+               WREG32(CP_HQD_PQ_RPTR, 0);
+               WREG32(CP_HQD_PQ_WPTR, 0);
+       }
+       cik_srbm_select(rdev, 0, 0, 0, 0);
+}
+
 /**
  * cik_cp_compute_enable - enable/disable the compute CP MEs
  *
@@ -4617,6 +4642,15 @@ static void cik_cp_compute_enable(struct radeon_device *rdev, bool enable)
        if (enable)
                WREG32(CP_MEC_CNTL, 0);
        else {
+               /*
+                * To make hibernation reliable we need to clear compute ring
+                * configuration before halting the compute ring.
+                */
+               mutex_lock(&rdev->srbm_mutex);
+               cik_compute_stop(rdev, &rdev->ring[CAYMAN_RING_TYPE_CP1_INDEX]);
+               cik_compute_stop(rdev, &rdev->ring[CAYMAN_RING_TYPE_CP2_INDEX]);
+               mutex_unlock(&rdev->srbm_mutex);
+
                WREG32(CP_MEC_CNTL, (MEC_ME1_HALT | MEC_ME2_HALT));
                rdev->ring[CAYMAN_RING_TYPE_CP1_INDEX].ready = false;
                rdev->ring[CAYMAN_RING_TYPE_CP2_INDEX].ready = false;
index f86eb54e7763d65341006c4ff25f4e8f8076760c..d16f2eebd95e6b2df5412d072023a89d43d32ae2 100644 (file)
@@ -268,6 +268,17 @@ static void cik_sdma_gfx_stop(struct radeon_device *rdev)
        }
        rdev->ring[R600_RING_TYPE_DMA_INDEX].ready = false;
        rdev->ring[CAYMAN_RING_TYPE_DMA1_INDEX].ready = false;
+
+       /* FIXME: use something gentler than this big hammer; after a few days
+        * we could not find a good combination, so reset the SDMA blocks since
+        * it seems we do not shut them down properly. This fixes hibernation
+        * and does not affect suspend to RAM.
+        */
+       WREG32(SRBM_SOFT_RESET, SOFT_RESET_SDMA | SOFT_RESET_SDMA1);
+       (void)RREG32(SRBM_SOFT_RESET);
+       udelay(50);
+       WREG32(SRBM_SOFT_RESET, 0);
+       (void)RREG32(SRBM_SOFT_RESET);
 }
 
 /**
index c89215275053d3168e6deba468b32fc05ba89bf6..fa719c53449bcd90e009e1b59d1b3e1ed5bff6f3 100644 (file)
@@ -469,22 +469,22 @@ void radeon_audio_detect(struct drm_connector *connector,
        dig = radeon_encoder->enc_priv;
 
        if (status == connector_status_connected) {
-               struct radeon_connector *radeon_connector;
-               int sink_type;
-
                if (!drm_detect_monitor_audio(radeon_connector_edid(connector))) {
                        radeon_encoder->audio = NULL;
                        return;
                }
 
-               radeon_connector = to_radeon_connector(connector);
-               sink_type = radeon_dp_getsinktype(radeon_connector);
+               if (connector->connector_type == DRM_MODE_CONNECTOR_DisplayPort) {
+                       struct radeon_connector *radeon_connector = to_radeon_connector(connector);
 
-               if (connector->connector_type == DRM_MODE_CONNECTOR_DisplayPort &&
-                       sink_type == CONNECTOR_OBJECT_ID_DISPLAYPORT)
-                       radeon_encoder->audio = rdev->audio.dp_funcs;
-               else
+                       if (radeon_dp_getsinktype(radeon_connector) ==
+                           CONNECTOR_OBJECT_ID_DISPLAYPORT)
+                               radeon_encoder->audio = rdev->audio.dp_funcs;
+                       else
+                               radeon_encoder->audio = rdev->audio.hdmi_funcs;
+               } else {
                        radeon_encoder->audio = rdev->audio.hdmi_funcs;
+               }
 
                dig->afmt->pin = radeon_audio_get_pin(connector->encoder);
                radeon_audio_enable(rdev, dig->afmt->pin, 0xf);
index aeb676708e60cfb1871326bfc5a689631bb98741..634793ea841889847ac090c32470548a1cae418d 100644 (file)
@@ -257,7 +257,6 @@ static int radeonfb_create(struct drm_fb_helper *helper,
        }
 
        info->par = rfbdev;
-       info->skip_vt_switch = true;
 
        ret = radeon_framebuffer_init(rdev->ddev, &rfbdev->rfb, &mode_cmd, gobj);
        if (ret) {
index edafd3c2b17028a73ff5128568c73adfaff0f85b..06ac59fe332ab089d21b279aba092f615710c7ca 100644 (file)
@@ -719,7 +719,7 @@ static int radeon_ttm_tt_populate(struct ttm_tt *ttm)
                return 0;
 
        if (gtt && gtt->userptr) {
-               ttm->sg = kcalloc(1, sizeof(struct sg_table), GFP_KERNEL);
+               ttm->sg = kzalloc(sizeof(struct sg_table), GFP_KERNEL);
                if (!ttm->sg)
                        return -ENOMEM;
 
index 3662157c2b1582b54291b607be1667579eef368b..ec10533a49b87a905aa82eda96e48bddae32d87f 100644 (file)
@@ -1129,12 +1129,12 @@ void radeon_vm_bo_rmv(struct radeon_device *rdev,
                interval_tree_remove(&bo_va->it, &vm->va);
 
        spin_lock(&vm->status_lock);
-       if (list_empty(&bo_va->vm_status)) {
+       list_del(&bo_va->vm_status);
+       if (bo_va->it.start || bo_va->it.last) {
                bo_va->bo = radeon_bo_ref(bo_va->bo);
                list_add(&bo_va->vm_status, &vm->freed);
        } else {
                radeon_fence_unref(&bo_va->last_pt_update);
-               list_del(&bo_va->vm_status);
                kfree(bo_va);
        }
        spin_unlock(&vm->status_lock);
index 3962176ee71325ca4434fa34643f117c58b2d217..01b558fe369539f447d36493f3bcd7e1bc4f3ded 100644 (file)
@@ -21,6 +21,7 @@
 #include <drm/drm_fb_helper.h>
 #include <linux/dma-mapping.h>
 #include <linux/pm_runtime.h>
+#include <linux/module.h>
 #include <linux/of_graph.h>
 #include <linux/component.h>
 
index 4557f335a8a56f243ad4aa326a1826572116d3a5..dc65161d7cad20acb2f079bdb6b229cf779b3613 100644 (file)
@@ -19,6 +19,7 @@
 #include <drm/drm_plane_helper.h>
 
 #include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/platform_device.h>
 #include <linux/clk.h>
 #include <linux/of.h>
index 9d056417d88c5ee9d70d018c7341f10d7e92fb96..f9aaf37262be4cb120201313c30b167896f66aaf 100644 (file)
@@ -24,6 +24,7 @@
 #define __LINUX_HSI_OMAP_SSI_H__
 
 #include <linux/device.h>
+#include <linux/module.h>
 #include <linux/platform_device.h>
 #include <linux/hsi/hsi.h>
 #include <linux/gpio.h>
index f3755e0aa935c96a3aa0b0d3e4122b82e8cfe241..f80acb36ff075ca6a25b398861d97993d32fd57b 100644 (file)
@@ -195,4 +195,4 @@ static int __init ipc_init(void)
 {
        return amba_driver_register(&pl320_driver);
 }
-module_init(ipc_init);
+subsys_initcall(ipc_init);
index 9df2b6801f767c9c0da6904b689299c93d031417..b2b411da297b06e73441f8dd51c8bae0b004bcc0 100644 (file)
@@ -43,6 +43,7 @@
 #include <linux/regulator/consumer.h>
 #include <linux/pinctrl/consumer.h>
 #include <linux/pm_runtime.h>
+#include <linux/pm_wakeirq.h>
 #include <linux/platform_data/hsmmc-omap.h>
 
 /* OMAP HSMMC Host Controller Registers */
@@ -218,7 +219,6 @@ struct omap_hsmmc_host {
        unsigned int            flags;
 #define AUTO_CMD23             (1 << 0)        /* Auto CMD23 support */
 #define HSMMC_SDIO_IRQ_ENABLED (1 << 1)        /* SDIO irq enabled */
-#define HSMMC_WAKE_IRQ_ENABLED (1 << 2)
        struct omap_hsmmc_next  next_data;
        struct  omap_hsmmc_platform_data        *pdata;
 
@@ -1117,22 +1117,6 @@ static irqreturn_t omap_hsmmc_irq(int irq, void *dev_id)
        return IRQ_HANDLED;
 }
 
-static irqreturn_t omap_hsmmc_wake_irq(int irq, void *dev_id)
-{
-       struct omap_hsmmc_host *host = dev_id;
-
-       /* cirq is level triggered, disable to avoid infinite loop */
-       spin_lock(&host->irq_lock);
-       if (host->flags & HSMMC_WAKE_IRQ_ENABLED) {
-               disable_irq_nosync(host->wake_irq);
-               host->flags &= ~HSMMC_WAKE_IRQ_ENABLED;
-       }
-       spin_unlock(&host->irq_lock);
-       pm_request_resume(host->dev); /* no use counter */
-
-       return IRQ_HANDLED;
-}
-
 static void set_sd_bus_power(struct omap_hsmmc_host *host)
 {
        unsigned long i;
@@ -1665,7 +1649,6 @@ static void omap_hsmmc_enable_sdio_irq(struct mmc_host *mmc, int enable)
 
 static int omap_hsmmc_configure_wake_irq(struct omap_hsmmc_host *host)
 {
-       struct mmc_host *mmc = host->mmc;
        int ret;
 
        /*
@@ -1677,11 +1660,7 @@ static int omap_hsmmc_configure_wake_irq(struct omap_hsmmc_host *host)
        if (!host->dev->of_node || !host->wake_irq)
                return -ENODEV;
 
-       /* Prevent auto-enabling of IRQ */
-       irq_set_status_flags(host->wake_irq, IRQ_NOAUTOEN);
-       ret = devm_request_irq(host->dev, host->wake_irq, omap_hsmmc_wake_irq,
-                              IRQF_TRIGGER_LOW | IRQF_ONESHOT,
-                              mmc_hostname(mmc), host);
+       ret = dev_pm_set_dedicated_wake_irq(host->dev, host->wake_irq);
        if (ret) {
                dev_err(mmc_dev(host->mmc), "Unable to request wake IRQ\n");
                goto err;
@@ -1718,7 +1697,7 @@ static int omap_hsmmc_configure_wake_irq(struct omap_hsmmc_host *host)
        return 0;
 
 err_free_irq:
-       devm_free_irq(host->dev, host->wake_irq, host);
+       dev_pm_clear_wake_irq(host->dev);
 err:
        dev_warn(host->dev, "no SDIO IRQ support, falling back to polling\n");
        host->wake_irq = 0;
@@ -2007,6 +1986,7 @@ static int omap_hsmmc_probe(struct platform_device *pdev)
                omap_hsmmc_ops.multi_io_quirk = omap_hsmmc_multi_io_quirk;
        }
 
+       device_init_wakeup(&pdev->dev, true);
        pm_runtime_enable(host->dev);
        pm_runtime_get_sync(host->dev);
        pm_runtime_set_autosuspend_delay(host->dev, MMC_AUTOSUSPEND_DELAY);
@@ -2147,6 +2127,7 @@ static int omap_hsmmc_probe(struct platform_device *pdev)
        if (host->use_reg)
                omap_hsmmc_reg_put(host);
 err_irq:
+       device_init_wakeup(&pdev->dev, false);
        if (host->tx_chan)
                dma_release_channel(host->tx_chan);
        if (host->rx_chan)
@@ -2178,6 +2159,7 @@ static int omap_hsmmc_remove(struct platform_device *pdev)
 
        pm_runtime_put_sync(host->dev);
        pm_runtime_disable(host->dev);
+       device_init_wakeup(&pdev->dev, false);
        if (host->dbclk)
                clk_disable_unprepare(host->dbclk);
 
@@ -2204,11 +2186,6 @@ static int omap_hsmmc_suspend(struct device *dev)
                                OMAP_HSMMC_READ(host->base, HCTL) & ~SDBP);
        }
 
-       /* do not wake up due to sdio irq */
-       if ((host->mmc->caps & MMC_CAP_SDIO_IRQ) &&
-           !(host->mmc->pm_flags & MMC_PM_WAKE_SDIO_IRQ))
-               disable_irq(host->wake_irq);
-
        if (host->dbclk)
                clk_disable_unprepare(host->dbclk);
 
@@ -2233,11 +2210,6 @@ static int omap_hsmmc_resume(struct device *dev)
                omap_hsmmc_conf_bus_power(host);
 
        omap_hsmmc_protect_card(host);
-
-       if ((host->mmc->caps & MMC_CAP_SDIO_IRQ) &&
-           !(host->mmc->pm_flags & MMC_PM_WAKE_SDIO_IRQ))
-               enable_irq(host->wake_irq);
-
        pm_runtime_mark_last_busy(host->dev);
        pm_runtime_put_autosuspend(host->dev);
        return 0;
@@ -2277,10 +2249,6 @@ static int omap_hsmmc_runtime_suspend(struct device *dev)
                }
 
                pinctrl_pm_select_idle_state(dev);
-
-               WARN_ON(host->flags & HSMMC_WAKE_IRQ_ENABLED);
-               enable_irq(host->wake_irq);
-               host->flags |= HSMMC_WAKE_IRQ_ENABLED;
        } else {
                pinctrl_pm_select_idle_state(dev);
        }
@@ -2302,11 +2270,6 @@ static int omap_hsmmc_runtime_resume(struct device *dev)
        spin_lock_irqsave(&host->irq_lock, flags);
        if ((host->mmc->caps & MMC_CAP_SDIO_IRQ) &&
            (host->flags & HSMMC_SDIO_IRQ_ENABLED)) {
-               /* sdio irq flag can't change while in runtime suspend */
-               if (host->flags & HSMMC_WAKE_IRQ_ENABLED) {
-                       disable_irq_nosync(host->wake_irq);
-                       host->flags &= ~HSMMC_WAKE_IRQ_ENABLED;
-               }
 
                pinctrl_pm_select_default_state(host->dev);
 
index 4c04360f378bb94037b77c1283bc47bd245b7b7c..b2a189507fc35edfdc3057385df5c750d2ff2e8f 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/io.h>
 #include <linux/ioport.h>
 #include <linux/mm.h>
+#include <linux/module.h>
 #include <linux/platform_device.h>
 #include <linux/pm.h>
 #include <linux/resource.h>
index 8c43589c3edba9248c13760c8e92025b4a5cf599..1f52462f4cdd4b7e1431723ca37b32c267da68a1 100644 (file)
@@ -220,20 +220,10 @@ static int goldfish_pdev_bus_probe(struct platform_device *pdev)
        return ret;
 }
 
-static int goldfish_pdev_bus_remove(struct platform_device *pdev)
-{
-       iounmap(pdev_bus_base);
-       free_irq(pdev_bus_irq, pdev);
-       release_mem_region(pdev_bus_addr, pdev_bus_len);
-       return 0;
-}
-
 static struct platform_driver goldfish_pdev_bus_driver = {
        .probe = goldfish_pdev_bus_probe,
-       .remove = goldfish_pdev_bus_remove,
        .driver = {
                .name = "goldfish_pdev_bus"
        }
 };
-
-module_platform_driver(goldfish_pdev_bus_driver);
+builtin_platform_driver(goldfish_pdev_bus_driver);
index d3c7d245ae63d93c5ec36617e69331b25f9b570c..7d0d269a0837c0b84e9f72eeabbab354159a3b8e 100644 (file)
@@ -88,4 +88,4 @@ static struct platform_driver syscon_reboot_driver = {
                .of_match_table = syscon_reboot_of_match,
        },
 };
-module_platform_driver(syscon_reboot_driver);
+builtin_platform_driver(syscon_reboot_driver);
index 6af41abccacb473921dbeba078147d067ea8e0cd..c07ee13bd47047e1e930a16e1d76229ae0b7b027 100644 (file)
@@ -27,6 +27,7 @@
 #include <linux/gpio.h>
 #include <linux/slab.h>
 #include <linux/gpio/consumer.h>
+#include <linux/module.h>
 #include <linux/platform_device.h>
 #include <linux/regulator/driver.h>
 #include <linux/regulator/machine.h>
index b562af816c0a5cd1fa0fb525b6d99e12e18fb034..b04b05a0904eec086c49250c3dc4e27cb84a3927 100644 (file)
@@ -260,7 +260,7 @@ static int __init qcom_cpuidle_init(struct device_node *cpu_node, int cpu)
                /* We have atleast one power down mode */
                cpumask_clear(&mask);
                cpumask_set_cpu(cpu, &mask);
-               qcom_scm_set_warm_boot_addr(cpu_resume, &mask);
+               qcom_scm_set_warm_boot_addr(cpu_resume_arm, &mask);
        }
 
        per_cpu(qcom_idle_ops, cpu) = fns;
index cc119d15dd1616feeef9821ecf98c94fc19ecfec..75d0457a77b72ade791df16e6193662028e35847 100644 (file)
@@ -1021,7 +1021,7 @@ static struct platform_driver tegra_pmc_driver = {
        },
        .probe = tegra_pmc_probe,
 };
-module_platform_driver(tegra_pmc_driver);
+builtin_platform_driver(tegra_pmc_driver);
 
 /*
  * Early initialization to allow access to registers in the very early boot
index 1a07bf540fecc384ef44112e0798c1afd798e9dd..e642c4540dda123a39b9ff42437f34f64f3bad06 100644 (file)
@@ -142,4 +142,4 @@ static struct platform_driver realview_soc_driver = {
                .of_match_table = realview_soc_of_match,
        },
 };
-module_platform_driver(realview_soc_driver);
+builtin_platform_driver(realview_soc_driver);
index 3774600741d844f7d21ab98d3186d4fde25c0db3..9325262289f9e191a1c3ecb5d8024df56b7e0d21 100644 (file)
@@ -640,25 +640,7 @@ static int __init dashtty_init(void)
        put_tty_driver(channel_driver);
        return ret;
 }
-
-static void dashtty_exit(void)
-{
-       int nport;
-       struct dashtty_port *dport;
-
-       del_timer_sync(&put_timer);
-       kthread_stop(dashtty_thread);
-       del_timer_sync(&poll_timer);
-       tty_unregister_driver(channel_driver);
-       for (nport = 0; nport < NUM_TTY_CHANNELS; nport++) {
-               dport = &dashtty_ports[nport];
-               tty_port_destroy(&dport->port);
-       }
-       put_tty_driver(channel_driver);
-}
-
-module_init(dashtty_init);
-module_exit(dashtty_exit);
+device_initcall(dashtty_init);
 
 #ifdef CONFIG_DA_CONSOLE
 
index 978204333c94b364b843c723c45dff353a5dbdfb..d75a66c7275098184b53344d0729c9dd2ac50a85 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/pm_runtime.h>
 #include <linux/console.h>
 #include <linux/pm_qos.h>
+#include <linux/pm_wakeirq.h>
 #include <linux/dma-mapping.h>
 
 #include "8250.h"
@@ -552,17 +553,6 @@ static void omap8250_uart_qos_work(struct work_struct *work)
        pm_qos_update_request(&priv->pm_qos_request, priv->latency);
 }
 
-static irqreturn_t omap_wake_irq(int irq, void *dev_id)
-{
-       struct uart_port *port = dev_id;
-       int ret;
-
-       ret = port->handle_irq(port);
-       if (ret)
-               return IRQ_HANDLED;
-       return IRQ_NONE;
-}
-
 #ifdef CONFIG_SERIAL_8250_DMA
 static int omap_8250_dma_handle_irq(struct uart_port *port);
 #endif
@@ -596,11 +586,9 @@ static int omap_8250_startup(struct uart_port *port)
        int ret;
 
        if (priv->wakeirq) {
-               ret = request_irq(priv->wakeirq, omap_wake_irq,
-                                 port->irqflags, "uart wakeup irq", port);
+               ret = dev_pm_set_dedicated_wake_irq(port->dev, priv->wakeirq);
                if (ret)
                        return ret;
-               disable_irq(priv->wakeirq);
        }
 
        pm_runtime_get_sync(port->dev);
@@ -649,8 +637,7 @@ static int omap_8250_startup(struct uart_port *port)
 err:
        pm_runtime_mark_last_busy(port->dev);
        pm_runtime_put_autosuspend(port->dev);
-       if (priv->wakeirq)
-               free_irq(priv->wakeirq, port);
+       dev_pm_clear_wake_irq(port->dev);
        return ret;
 }
 
@@ -682,10 +669,8 @@ static void omap_8250_shutdown(struct uart_port *port)
 
        pm_runtime_mark_last_busy(port->dev);
        pm_runtime_put_autosuspend(port->dev);
-
        free_irq(port->irq, port);
-       if (priv->wakeirq)
-               free_irq(priv->wakeirq, port);
+       dev_pm_clear_wake_irq(port->dev);
 }
 
 static void omap_8250_throttle(struct uart_port *port)
@@ -1226,31 +1211,6 @@ static int omap8250_remove(struct platform_device *pdev)
        return 0;
 }
 
-#ifdef CONFIG_PM
-
-static inline void omap8250_enable_wakeirq(struct omap8250_priv *priv,
-                                          bool enable)
-{
-       if (!priv->wakeirq)
-               return;
-
-       if (enable)
-               enable_irq(priv->wakeirq);
-       else
-               disable_irq_nosync(priv->wakeirq);
-}
-
-static void omap8250_enable_wakeup(struct omap8250_priv *priv,
-                                  bool enable)
-{
-       if (enable == priv->wakeups_enabled)
-               return;
-
-       omap8250_enable_wakeirq(priv, enable);
-       priv->wakeups_enabled = enable;
-}
-#endif
-
 #ifdef CONFIG_PM_SLEEP
 static int omap8250_prepare(struct device *dev)
 {
@@ -1277,11 +1237,6 @@ static int omap8250_suspend(struct device *dev)
 
        serial8250_suspend_port(priv->line);
        flush_work(&priv->qos_work);
-
-       if (device_may_wakeup(dev))
-               omap8250_enable_wakeup(priv, true);
-       else
-               omap8250_enable_wakeup(priv, false);
        return 0;
 }
 
@@ -1289,9 +1244,6 @@ static int omap8250_resume(struct device *dev)
 {
        struct omap8250_priv *priv = dev_get_drvdata(dev);
 
-       if (device_may_wakeup(dev))
-               omap8250_enable_wakeup(priv, false);
-
        serial8250_resume_port(priv->line);
        return 0;
 }
@@ -1333,7 +1285,6 @@ static int omap8250_runtime_suspend(struct device *dev)
                        return -EBUSY;
        }
 
-       omap8250_enable_wakeup(priv, true);
        if (up->dma)
                omap_8250_rx_dma(up, UART_IIR_RX_TIMEOUT);
 
@@ -1354,7 +1305,6 @@ static int omap8250_runtime_resume(struct device *dev)
                return 0;
 
        up = serial8250_get_port(priv->line);
-       omap8250_enable_wakeup(priv, false);
        loss_cntx = omap8250_lost_context(up);
 
        if (loss_cntx)
index 7f49172ccd8673b316b3b82ab21282885ef983eb..7a2172b5e93cd296674c9b78a5fd31594db0f492 100644 (file)
@@ -38,6 +38,7 @@
 #include <linux/serial_core.h>
 #include <linux/irq.h>
 #include <linux/pm_runtime.h>
+#include <linux/pm_wakeirq.h>
 #include <linux/of.h>
 #include <linux/of_irq.h>
 #include <linux/gpio.h>
@@ -160,7 +161,6 @@ struct uart_omap_port {
        unsigned long           port_activity;
        int                     context_loss_cnt;
        u32                     errata;
-       u8                      wakeups_enabled;
        u32                     features;
 
        int                     rts_gpio;
@@ -209,28 +209,11 @@ static int serial_omap_get_context_loss_count(struct uart_omap_port *up)
        return pdata->get_context_loss_count(up->dev);
 }
 
-static inline void serial_omap_enable_wakeirq(struct uart_omap_port *up,
-                                      bool enable)
-{
-       if (!up->wakeirq)
-               return;
-
-       if (enable)
-               enable_irq(up->wakeirq);
-       else
-               disable_irq_nosync(up->wakeirq);
-}
-
+/* REVISIT: Remove this when omap3 boots in device tree only mode */
 static void serial_omap_enable_wakeup(struct uart_omap_port *up, bool enable)
 {
        struct omap_uart_port_info *pdata = dev_get_platdata(up->dev);
 
-       if (enable == up->wakeups_enabled)
-               return;
-
-       serial_omap_enable_wakeirq(up, enable);
-       up->wakeups_enabled = enable;
-
        if (!pdata || !pdata->enable_wakeup)
                return;
 
@@ -750,13 +733,11 @@ static int serial_omap_startup(struct uart_port *port)
 
        /* Optional wake-up IRQ */
        if (up->wakeirq) {
-               retval = request_irq(up->wakeirq, serial_omap_irq,
-                                    up->port.irqflags, up->name, up);
+               retval = dev_pm_set_dedicated_wake_irq(up->dev, up->wakeirq);
                if (retval) {
                        free_irq(up->port.irq, up);
                        return retval;
                }
-               disable_irq(up->wakeirq);
        }
 
        dev_dbg(up->port.dev, "serial_omap_startup+%d\n", up->port.line);
@@ -845,8 +826,7 @@ static void serial_omap_shutdown(struct uart_port *port)
        pm_runtime_mark_last_busy(up->dev);
        pm_runtime_put_autosuspend(up->dev);
        free_irq(up->port.irq, up);
-       if (up->wakeirq)
-               free_irq(up->wakeirq, up);
+       dev_pm_clear_wake_irq(up->dev);
 }
 
 static void serial_omap_uart_qos_work(struct work_struct *work)
@@ -1139,13 +1119,6 @@ serial_omap_pm(struct uart_port *port, unsigned int state,
        serial_out(up, UART_EFR, efr);
        serial_out(up, UART_LCR, 0);
 
-       if (!device_may_wakeup(up->dev)) {
-               if (!state)
-                       pm_runtime_forbid(up->dev);
-               else
-                       pm_runtime_allow(up->dev);
-       }
-
        pm_runtime_mark_last_busy(up->dev);
        pm_runtime_put_autosuspend(up->dev);
 }
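The omap_hsmmc, 8250_omap, and omap-serial hunks above all drop their hand-rolled wake IRQ request/enable/disable bookkeeping in favour of the PM core's dedicated wake IRQ support from <linux/pm_wakeirq.h>. A minimal sketch of that pattern for a hypothetical platform driver follows; dev_pm_set_dedicated_wake_irq(), dev_pm_clear_wake_irq() and device_init_wakeup() are the real helpers used by these patches, while the driver name, probe flow and IRQ index are assumptions made only for illustration.

#include <linux/module.h>
#include <linux/platform_device.h>
#include <linux/pm_runtime.h>
#include <linux/pm_wakeirq.h>

static int demo_probe(struct platform_device *pdev)
{
	int wakeirq, ret;

	/* Hypothetical: second IRQ resource is the optional wake IRQ. */
	wakeirq = platform_get_irq(pdev, 1);
	device_init_wakeup(&pdev->dev, true);

	if (wakeirq > 0) {
		/* The PM core now arms/disarms this IRQ around suspend. */
		ret = dev_pm_set_dedicated_wake_irq(&pdev->dev, wakeirq);
		if (ret)
			dev_warn(&pdev->dev, "no wake IRQ, polling only\n");
	}

	pm_runtime_enable(&pdev->dev);
	return 0;
}

static int demo_remove(struct platform_device *pdev)
{
	pm_runtime_disable(&pdev->dev);
	dev_pm_clear_wake_irq(&pdev->dev);
	device_init_wakeup(&pdev->dev, false);
	return 0;
}

static struct platform_driver demo_driver = {
	.probe	= demo_probe,
	.remove	= demo_remove,
	.driver	= {
		.name	= "wakeirq-demo",
	},
};
module_platform_driver(demo_driver);
MODULE_LICENSE("GPL");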
index 64fa248343f65461db232ee4ae0939beff0fc05c..8f84646f10e9560ade100c1dafa180768f1d76de 100644 (file)
@@ -187,10 +187,10 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
                val_size2 = posix_acl_xattr_size(default_acl->a_count);
 
        err = -ENOMEM;
-       tmp_buf = kmalloc(max(val_size1, val_size2), GFP_NOFS);
+       tmp_buf = kmalloc(max(val_size1, val_size2), GFP_KERNEL);
        if (!tmp_buf)
                goto out_err;
-       pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_NOFS);
+       pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_KERNEL);
        if (!pagelist)
                goto out_err;
        ceph_pagelist_init(pagelist);
index e162bcd105ee2d98c2dcd12e80c75ee729a7a951..890c50971a690472f6dc795b00fd0bcfbcdf1a39 100644 (file)
@@ -87,17 +87,21 @@ static int ceph_set_page_dirty(struct page *page)
        inode = mapping->host;
        ci = ceph_inode(inode);
 
-       /*
-        * Note that we're grabbing a snapc ref here without holding
-        * any locks!
-        */
-       snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
-
        /* dirty the head */
        spin_lock(&ci->i_ceph_lock);
-       if (ci->i_head_snapc == NULL)
-               ci->i_head_snapc = ceph_get_snap_context(snapc);
-       ++ci->i_wrbuffer_ref_head;
+       BUG_ON(ci->i_wr_ref == 0); /* caller should hold Fw reference */
+       if (__ceph_have_pending_cap_snap(ci)) {
+               struct ceph_cap_snap *capsnap =
+                               list_last_entry(&ci->i_cap_snaps,
+                                               struct ceph_cap_snap,
+                                               ci_item);
+               snapc = ceph_get_snap_context(capsnap->context);
+               capsnap->dirty_pages++;
+       } else {
+               BUG_ON(!ci->i_head_snapc);
+               snapc = ceph_get_snap_context(ci->i_head_snapc);
+               ++ci->i_wrbuffer_ref_head;
+       }
        if (ci->i_wrbuffer_ref == 0)
                ihold(inode);
        ++ci->i_wrbuffer_ref;
@@ -346,7 +350,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
 
        /* build page vector */
        nr_pages = calc_pages_for(0, len);
-       pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS);
+       pages = kmalloc(sizeof(*pages) * nr_pages, GFP_KERNEL);
        ret = -ENOMEM;
        if (!pages)
                goto out;
@@ -358,7 +362,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
                dout("start_read %p adding %p idx %lu\n", inode, page,
                     page->index);
                if (add_to_page_cache_lru(page, &inode->i_data, page->index,
-                                         GFP_NOFS)) {
+                                         GFP_KERNEL)) {
                        ceph_fscache_uncache_page(inode, page);
                        page_cache_release(page);
                        dout("start_read %p add_to_page_cache failed %p\n",
@@ -436,7 +440,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
  * only snap context we are allowed to write back.
  */
 static struct ceph_snap_context *get_oldest_context(struct inode *inode,
-                                                   u64 *snap_size)
+                                                   loff_t *snap_size)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_snap_context *snapc = NULL;
@@ -476,8 +480,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
        struct ceph_osd_client *osdc;
        struct ceph_snap_context *snapc, *oldest;
        loff_t page_off = page_offset(page);
+       loff_t snap_size = -1;
        long writeback_stat;
-       u64 truncate_size, snap_size = 0;
+       u64 truncate_size;
        u32 truncate_seq;
        int err = 0, len = PAGE_CACHE_SIZE;
 
@@ -512,7 +517,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
        spin_lock(&ci->i_ceph_lock);
        truncate_seq = ci->i_truncate_seq;
        truncate_size = ci->i_truncate_size;
-       if (!snap_size)
+       if (snap_size == -1)
                snap_size = i_size_read(inode);
        spin_unlock(&ci->i_ceph_lock);
 
@@ -695,7 +700,8 @@ static int ceph_writepages_start(struct address_space *mapping,
        unsigned wsize = 1 << inode->i_blkbits;
        struct ceph_osd_request *req = NULL;
        int do_sync = 0;
-       u64 truncate_size, snap_size;
+       loff_t snap_size, i_size;
+       u64 truncate_size;
        u32 truncate_seq;
 
        /*
@@ -741,7 +747,7 @@ static int ceph_writepages_start(struct address_space *mapping,
 retry:
        /* find oldest snap context with dirty data */
        ceph_put_snap_context(snapc);
-       snap_size = 0;
+       snap_size = -1;
        snapc = get_oldest_context(inode, &snap_size);
        if (!snapc) {
                /* hmm, why does writepages get called when there
@@ -749,16 +755,13 @@ static int ceph_writepages_start(struct address_space *mapping,
                dout(" no snap context with dirty data?\n");
                goto out;
        }
-       if (snap_size == 0)
-               snap_size = i_size_read(inode);
        dout(" oldest snapc is %p seq %lld (%d snaps)\n",
             snapc, snapc->seq, snapc->num_snaps);
 
        spin_lock(&ci->i_ceph_lock);
        truncate_seq = ci->i_truncate_seq;
        truncate_size = ci->i_truncate_size;
-       if (!snap_size)
-               snap_size = i_size_read(inode);
+       i_size = i_size_read(inode);
        spin_unlock(&ci->i_ceph_lock);
 
        if (last_snapc && snapc != last_snapc) {
@@ -828,8 +831,10 @@ static int ceph_writepages_start(struct address_space *mapping,
                                dout("waiting on writeback %p\n", page);
                                wait_on_page_writeback(page);
                        }
-                       if (page_offset(page) >= snap_size) {
-                               dout("%p page eof %llu\n", page, snap_size);
+                       if (page_offset(page) >=
+                           (snap_size == -1 ? i_size : snap_size)) {
+                               dout("%p page eof %llu\n", page,
+                                    (snap_size == -1 ? i_size : snap_size));
                                done = 1;
                                unlock_page(page);
                                break;
@@ -884,7 +889,8 @@ static int ceph_writepages_start(struct address_space *mapping,
                                }
 
                                if (do_sync)
-                                       osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC);
+                                       osd_req_op_init(req, 1,
+                                                       CEPH_OSD_OP_STARTSYNC, 0);
 
                                req->r_callback = writepages_finish;
                                req->r_inode = inode;
@@ -944,10 +950,18 @@ static int ceph_writepages_start(struct address_space *mapping,
                }
 
                /* Format the osd request message and submit the write */
-
                offset = page_offset(pages[0]);
-               len = min(snap_size - offset,
-                         (u64)locked_pages << PAGE_CACHE_SHIFT);
+               len = (u64)locked_pages << PAGE_CACHE_SHIFT;
+               if (snap_size == -1) {
+                       len = min(len, (u64)i_size_read(inode) - offset);
+                        /* writepages_finish() clears writeback pages
+                         * according to the data length, so make sure
+                         * data length covers all locked pages */
+                       len = max(len, 1 +
+                               ((u64)(locked_pages - 1) << PAGE_CACHE_SHIFT));
+               } else {
+                       len = min(len, snap_size - offset);
+               }
                dout("writepages got %d pages at %llu~%llu\n",
                     locked_pages, offset, len);
 
@@ -1032,7 +1046,6 @@ static int ceph_update_writeable_page(struct file *file,
 {
        struct inode *inode = file_inode(file);
        struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
        loff_t page_off = pos & PAGE_CACHE_MASK;
        int pos_in_page = pos & ~PAGE_CACHE_MASK;
        int end_in_page = pos_in_page + len;
@@ -1044,10 +1057,6 @@ static int ceph_update_writeable_page(struct file *file,
        /* writepages currently holds page lock, but if we change that later, */
        wait_on_page_writeback(page);
 
-       /* check snap context */
-       BUG_ON(!ci->i_snap_realm);
-       down_read(&mdsc->snap_rwsem);
-       BUG_ON(!ci->i_snap_realm->cached_context);
        snapc = page_snap_context(page);
        if (snapc && snapc != ci->i_head_snapc) {
                /*
@@ -1055,7 +1064,6 @@ static int ceph_update_writeable_page(struct file *file,
                 * context!  is it writeable now?
                 */
                oldest = get_oldest_context(inode, NULL);
-               up_read(&mdsc->snap_rwsem);
 
                if (snapc->seq > oldest->seq) {
                        ceph_put_snap_context(oldest);
@@ -1112,7 +1120,6 @@ static int ceph_update_writeable_page(struct file *file,
        }
 
        /* we need to read it. */
-       up_read(&mdsc->snap_rwsem);
        r = readpage_nounlock(file, page);
        if (r < 0)
                goto fail_nosnap;
@@ -1157,16 +1164,13 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
 
 /*
  * we don't do anything in here that simple_write_end doesn't do
- * except adjust dirty page accounting and drop read lock on
- * mdsc->snap_rwsem.
+ * except adjust dirty page accounting
  */
 static int ceph_write_end(struct file *file, struct address_space *mapping,
                          loff_t pos, unsigned len, unsigned copied,
                          struct page *page, void *fsdata)
 {
        struct inode *inode = file_inode(file);
-       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
-       struct ceph_mds_client *mdsc = fsc->mdsc;
        unsigned from = pos & (PAGE_CACHE_SIZE - 1);
        int check_cap = 0;
 
@@ -1188,7 +1192,6 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
        set_page_dirty(page);
 
        unlock_page(page);
-       up_read(&mdsc->snap_rwsem);
        page_cache_release(page);
 
        if (check_cap)
@@ -1314,13 +1317,17 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        struct inode *inode = file_inode(vma->vm_file);
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_file_info *fi = vma->vm_file->private_data;
-       struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+       struct ceph_cap_flush *prealloc_cf;
        struct page *page = vmf->page;
        loff_t off = page_offset(page);
        loff_t size = i_size_read(inode);
        size_t len;
        int want, got, ret;
 
+       prealloc_cf = ceph_alloc_cap_flush();
+       if (!prealloc_cf)
+               return VM_FAULT_SIGBUS;
+
        if (ci->i_inline_version != CEPH_INLINE_NONE) {
                struct page *locked_page = NULL;
                if (off == 0) {
@@ -1330,8 +1337,10 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
                ret = ceph_uninline_data(vma->vm_file, locked_page);
                if (locked_page)
                        unlock_page(locked_page);
-               if (ret < 0)
-                       return VM_FAULT_SIGBUS;
+               if (ret < 0) {
+                       ret = VM_FAULT_SIGBUS;
+                       goto out_free;
+               }
        }
 
        if (off + PAGE_CACHE_SIZE <= size)
@@ -1353,7 +1362,8 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
                        break;
                if (ret != -ERESTARTSYS) {
                        WARN_ON(1);
-                       return VM_FAULT_SIGBUS;
+                       ret = VM_FAULT_SIGBUS;
+                       goto out_free;
                }
        }
        dout("page_mkwrite %p %llu~%zd got cap refs on %s\n",
@@ -1373,7 +1383,6 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        if (ret == 0) {
                /* success.  we'll keep the page locked. */
                set_page_dirty(page);
-               up_read(&mdsc->snap_rwsem);
                ret = VM_FAULT_LOCKED;
        } else {
                if (ret == -ENOMEM)
@@ -1389,7 +1398,8 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
                int dirty;
                spin_lock(&ci->i_ceph_lock);
                ci->i_inline_version = CEPH_INLINE_NONE;
-               dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+               dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
+                                              &prealloc_cf);
                spin_unlock(&ci->i_ceph_lock);
                if (dirty)
                        __mark_inode_dirty(inode, dirty);
@@ -1398,6 +1408,8 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n",
             inode, off, len, ceph_cap_string(got), ret);
        ceph_put_cap_refs(ci, got);
+out_free:
+       ceph_free_cap_flush(prealloc_cf);
 
        return ret;
 }
@@ -1509,8 +1521,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
                                    ceph_vino(inode), 0, &len, 0, 1,
                                    CEPH_OSD_OP_CREATE,
                                    CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
-                                   ci->i_snap_realm->cached_context,
-                                   0, 0, false);
+                                   ceph_empty_snapc, 0, 0, false);
        if (IS_ERR(req)) {
                err = PTR_ERR(req);
                goto out;
@@ -1528,7 +1539,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
                                    ceph_vino(inode), 0, &len, 1, 3,
                                    CEPH_OSD_OP_WRITE,
                                    CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
-                                   ci->i_snap_realm->cached_context,
+                                   ceph_empty_snapc,
                                    ci->i_truncate_seq, ci->i_truncate_size,
                                    false);
        if (IS_ERR(req)) {
@@ -1597,3 +1608,206 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma)
        vma->vm_ops = &ceph_vmops;
        return 0;
 }
+
+enum {
+       POOL_READ       = 1,
+       POOL_WRITE      = 2,
+};
+
+static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
+{
+       struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
+       struct ceph_mds_client *mdsc = fsc->mdsc;
+       struct ceph_osd_request *rd_req = NULL, *wr_req = NULL;
+       struct rb_node **p, *parent;
+       struct ceph_pool_perm *perm;
+       struct page **pages;
+       int err = 0, err2 = 0, have = 0;
+
+       down_read(&mdsc->pool_perm_rwsem);
+       p = &mdsc->pool_perm_tree.rb_node;
+       while (*p) {
+               perm = rb_entry(*p, struct ceph_pool_perm, node);
+               if (pool < perm->pool)
+                       p = &(*p)->rb_left;
+               else if (pool > perm->pool)
+                       p = &(*p)->rb_right;
+               else {
+                       have = perm->perm;
+                       break;
+               }
+       }
+       up_read(&mdsc->pool_perm_rwsem);
+       if (*p)
+               goto out;
+
+       dout("__ceph_pool_perm_get pool %u no perm cached\n", pool);
+
+       down_write(&mdsc->pool_perm_rwsem);
+       parent = NULL;
+       while (*p) {
+               parent = *p;
+               perm = rb_entry(parent, struct ceph_pool_perm, node);
+               if (pool < perm->pool)
+                       p = &(*p)->rb_left;
+               else if (pool > perm->pool)
+                       p = &(*p)->rb_right;
+               else {
+                       have = perm->perm;
+                       break;
+               }
+       }
+       if (*p) {
+               up_write(&mdsc->pool_perm_rwsem);
+               goto out;
+       }
+
+       rd_req = ceph_osdc_alloc_request(&fsc->client->osdc,
+                                        ceph_empty_snapc,
+                                        1, false, GFP_NOFS);
+       if (!rd_req) {
+               err = -ENOMEM;
+               goto out_unlock;
+       }
+
+       rd_req->r_flags = CEPH_OSD_FLAG_READ;
+       osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0);
+       rd_req->r_base_oloc.pool = pool;
+       snprintf(rd_req->r_base_oid.name, sizeof(rd_req->r_base_oid.name),
+                "%llx.00000000", ci->i_vino.ino);
+       rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name);
+
+       wr_req = ceph_osdc_alloc_request(&fsc->client->osdc,
+                                        ceph_empty_snapc,
+                                        1, false, GFP_NOFS);
+       if (!wr_req) {
+               err = -ENOMEM;
+               goto out_unlock;
+       }
+
+       wr_req->r_flags = CEPH_OSD_FLAG_WRITE |
+                         CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
+       osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
+       wr_req->r_base_oloc.pool = pool;
+       wr_req->r_base_oid = rd_req->r_base_oid;
+
+       /* one page should be large enough for STAT data */
+       pages = ceph_alloc_page_vector(1, GFP_KERNEL);
+       if (IS_ERR(pages)) {
+               err = PTR_ERR(pages);
+               goto out_unlock;
+       }
+
+       osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE,
+                                    0, false, true);
+       ceph_osdc_build_request(rd_req, 0, NULL, CEPH_NOSNAP,
+                               &ci->vfs_inode.i_mtime);
+       err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
+
+       ceph_osdc_build_request(wr_req, 0, NULL, CEPH_NOSNAP,
+                               &ci->vfs_inode.i_mtime);
+       err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
+
+       if (!err)
+               err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req);
+       if (!err2)
+               err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req);
+
+       if (err >= 0 || err == -ENOENT)
+               have |= POOL_READ;
+       else if (err != -EPERM)
+               goto out_unlock;
+
+       if (err2 == 0 || err2 == -EEXIST)
+               have |= POOL_WRITE;
+       else if (err2 != -EPERM) {
+               err = err2;
+               goto out_unlock;
+       }
+
+       perm = kmalloc(sizeof(*perm), GFP_NOFS);
+       if (!perm) {
+               err = -ENOMEM;
+               goto out_unlock;
+       }
+
+       perm->pool = pool;
+       perm->perm = have;
+       rb_link_node(&perm->node, parent, p);
+       rb_insert_color(&perm->node, &mdsc->pool_perm_tree);
+       err = 0;
+out_unlock:
+       up_write(&mdsc->pool_perm_rwsem);
+
+       if (rd_req)
+               ceph_osdc_put_request(rd_req);
+       if (wr_req)
+               ceph_osdc_put_request(wr_req);
+out:
+       if (!err)
+               err = have;
+       dout("__ceph_pool_perm_get pool %u result = %d\n", pool, err);
+       return err;
+}
+
+int ceph_pool_perm_check(struct ceph_inode_info *ci, int need)
+{
+       u32 pool;
+       int ret, flags;
+
+       if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode),
+                               NOPOOLPERM))
+               return 0;
+
+       spin_lock(&ci->i_ceph_lock);
+       flags = ci->i_ceph_flags;
+       pool = ceph_file_layout_pg_pool(ci->i_layout);
+       spin_unlock(&ci->i_ceph_lock);
+check:
+       if (flags & CEPH_I_POOL_PERM) {
+               if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) {
+                       dout("ceph_pool_perm_check pool %u no read perm\n",
+                            pool);
+                       return -EPERM;
+               }
+               if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) {
+                       dout("ceph_pool_perm_check pool %u no write perm\n",
+                            pool);
+                       return -EPERM;
+               }
+               return 0;
+       }
+
+       ret = __ceph_pool_perm_get(ci, pool);
+       if (ret < 0)
+               return ret;
+
+       flags = CEPH_I_POOL_PERM;
+       if (ret & POOL_READ)
+               flags |= CEPH_I_POOL_RD;
+       if (ret & POOL_WRITE)
+               flags |= CEPH_I_POOL_WR;
+
+       spin_lock(&ci->i_ceph_lock);
+       if (pool == ceph_file_layout_pg_pool(ci->i_layout)) {
+               ci->i_ceph_flags = flags;
+       } else {
+               pool = ceph_file_layout_pg_pool(ci->i_layout);
+               flags = ci->i_ceph_flags;
+       }
+       spin_unlock(&ci->i_ceph_lock);
+       goto check;
+}
+
+void ceph_pool_perm_destroy(struct ceph_mds_client *mdsc)
+{
+       struct ceph_pool_perm *perm;
+       struct rb_node *n;
+
+       while (!RB_EMPTY_ROOT(&mdsc->pool_perm_tree)) {
+               n = rb_first(&mdsc->pool_perm_tree);
+               perm = rb_entry(n, struct ceph_pool_perm, node);
+               rb_erase(n, &mdsc->pool_perm_tree);
+               kfree(perm);
+       }
+}
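
A hedged usage sketch for the pool permission cache added above: callers
pass the capability bits they are about to exercise to
ceph_pool_perm_check(), and __ceph_pool_perm_get() probes the data pool
once (a STAT read plus an exclusive CREATE write) and caches the verdict
in mdsc->pool_perm_tree. The wrapper below is hypothetical.

static int example_may_write_to_pool(struct ceph_inode_info *ci)
{
        /* 0 if writes to the inode's data pool are allowed,
         * -EPERM (or another negative errno) otherwise */
        return ceph_pool_perm_check(ci, CEPH_CAP_FILE_WR);
}
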
index be5ea6af8366479b675e81e1d13e96139abed2c4..dc10c9dd36c1a2ac6264ed21d3248e5f62f1e330 100644 (file)
@@ -833,7 +833,9 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
                used |= CEPH_CAP_PIN;
        if (ci->i_rd_ref)
                used |= CEPH_CAP_FILE_RD;
-       if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages)
+       if (ci->i_rdcache_ref ||
+           (!S_ISDIR(ci->vfs_inode.i_mode) && /* ignore readdir cache */
+            ci->vfs_inode.i_data.nrpages))
                used |= CEPH_CAP_FILE_CACHE;
        if (ci->i_wr_ref)
                used |= CEPH_CAP_FILE_WR;
@@ -926,16 +928,6 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
 
        /* remove from session list */
        spin_lock(&session->s_cap_lock);
-       /*
-        * s_cap_reconnect is protected by s_cap_lock. no one changes
-        * s_cap_gen while session is in the reconnect state.
-        */
-       if (queue_release &&
-           (!session->s_cap_reconnect ||
-            cap->cap_gen == session->s_cap_gen))
-               __queue_cap_release(session, ci->i_vino.ino, cap->cap_id,
-                                   cap->mseq, cap->issue_seq);
-
        if (session->s_cap_iterator == cap) {
                /* not yet, we are iterating over this very cap */
                dout("__ceph_remove_cap  delaying %p removal from session %p\n",
@@ -948,6 +940,25 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
        }
        /* protect backpointer with s_cap_lock: see iterate_session_caps */
        cap->ci = NULL;
+
+       /*
+        * s_cap_reconnect is protected by s_cap_lock. no one changes
+        * s_cap_gen while session is in the reconnect state.
+        */
+       if (queue_release &&
+           (!session->s_cap_reconnect || cap->cap_gen == session->s_cap_gen)) {
+               cap->queue_release = 1;
+               if (removed) {
+                       list_add_tail(&cap->session_caps,
+                                     &session->s_cap_releases);
+                       session->s_num_cap_releases++;
+                       removed = 0;
+               }
+       } else {
+               cap->queue_release = 0;
+       }
+       cap->cap_ino = ci->i_vino.ino;
+
        spin_unlock(&session->s_cap_lock);
 
        /* remove from inode list */
@@ -977,8 +988,8 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
 static int send_cap_msg(struct ceph_mds_session *session,
                        u64 ino, u64 cid, int op,
                        int caps, int wanted, int dirty,
-                       u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
-                       u64 size, u64 max_size,
+                       u32 seq, u64 flush_tid, u64 oldest_flush_tid,
+                       u32 issue_seq, u32 mseq, u64 size, u64 max_size,
                        struct timespec *mtime, struct timespec *atime,
                        u64 time_warp_seq,
                        kuid_t uid, kgid_t gid, umode_t mode,
@@ -992,20 +1003,23 @@ static int send_cap_msg(struct ceph_mds_session *session,
        size_t extra_len;
 
        dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
-            " seq %u/%u mseq %u follows %lld size %llu/%llu"
+            " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu"
             " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
             cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
             ceph_cap_string(dirty),
-            seq, issue_seq, mseq, follows, size, max_size,
+            seq, issue_seq, flush_tid, oldest_flush_tid,
+            mseq, follows, size, max_size,
             xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
 
-       /* flock buffer size + inline version + inline data size */
-       extra_len = 4 + 8 + 4;
+       /* flock buffer size + inline version + inline data size +
+        * osd_epoch_barrier + oldest_flush_tid */
+       extra_len = 4 + 8 + 4 + 4 + 8;
        msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len,
                           GFP_NOFS, false);
        if (!msg)
                return -ENOMEM;
 
+       msg->hdr.version = cpu_to_le16(6);
        msg->hdr.tid = cpu_to_le64(flush_tid);
 
        fc = msg->front.iov_base;
@@ -1041,6 +1055,10 @@ static int send_cap_msg(struct ceph_mds_session *session,
        ceph_encode_64(&p, inline_data ? 0 : CEPH_INLINE_NONE);
        /* inline data size */
        ceph_encode_32(&p, 0);
+       /* osd_epoch_barrier */
+       ceph_encode_32(&p, 0);
+       /* oldest_flush_tid */
+       ceph_encode_64(&p, oldest_flush_tid);
 
        fc->xattr_version = cpu_to_le64(xattr_version);
        if (xattrs_buf) {
@@ -1053,44 +1071,6 @@ static int send_cap_msg(struct ceph_mds_session *session,
        return 0;
 }
 
-void __queue_cap_release(struct ceph_mds_session *session,
-                        u64 ino, u64 cap_id, u32 migrate_seq,
-                        u32 issue_seq)
-{
-       struct ceph_msg *msg;
-       struct ceph_mds_cap_release *head;
-       struct ceph_mds_cap_item *item;
-
-       BUG_ON(!session->s_num_cap_releases);
-       msg = list_first_entry(&session->s_cap_releases,
-                              struct ceph_msg, list_head);
-
-       dout(" adding %llx release to mds%d msg %p (%d left)\n",
-            ino, session->s_mds, msg, session->s_num_cap_releases);
-
-       BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
-       head = msg->front.iov_base;
-       le32_add_cpu(&head->num, 1);
-       item = msg->front.iov_base + msg->front.iov_len;
-       item->ino = cpu_to_le64(ino);
-       item->cap_id = cpu_to_le64(cap_id);
-       item->migrate_seq = cpu_to_le32(migrate_seq);
-       item->seq = cpu_to_le32(issue_seq);
-
-       session->s_num_cap_releases--;
-
-       msg->front.iov_len += sizeof(*item);
-       if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
-               dout(" release msg %p full\n", msg);
-               list_move_tail(&msg->list_head, &session->s_cap_releases_done);
-       } else {
-               dout(" release msg %p at %d/%d (%d)\n", msg,
-                    (int)le32_to_cpu(head->num),
-                    (int)CEPH_CAPS_PER_RELEASE,
-                    (int)msg->front.iov_len);
-       }
-}
-
 /*
  * Queue cap releases when an inode is dropped from our cache.  Since
  * inode is about to be destroyed, there is no need for i_ceph_lock.
@@ -1127,7 +1107,7 @@ void ceph_queue_caps_release(struct inode *inode)
  */
 static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
                      int op, int used, int want, int retain, int flushing,
-                     unsigned *pflush_tid)
+                     u64 flush_tid, u64 oldest_flush_tid)
        __releases(cap->ci->i_ceph_lock)
 {
        struct ceph_inode_info *ci = cap->ci;
@@ -1145,8 +1125,6 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
        u64 xattr_version = 0;
        struct ceph_buffer *xattr_blob = NULL;
        int delayed = 0;
-       u64 flush_tid = 0;
-       int i;
        int ret;
        bool inline_data;
 
@@ -1190,26 +1168,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
        cap->implemented &= cap->issued | used;
        cap->mds_wanted = want;
 
-       if (flushing) {
-               /*
-                * assign a tid for flush operations so we can avoid
-                * flush1 -> dirty1 -> flush2 -> flushack1 -> mark
-                * clean type races.  track latest tid for every bit
-                * so we can handle flush AxFw, flush Fw, and have the
-                * first ack clean Ax.
-                */
-               flush_tid = ++ci->i_cap_flush_last_tid;
-               if (pflush_tid)
-                       *pflush_tid = flush_tid;
-               dout(" cap_flush_tid %d\n", (int)flush_tid);
-               for (i = 0; i < CEPH_CAP_BITS; i++)
-                       if (flushing & (1 << i))
-                               ci->i_cap_flush_tid[i] = flush_tid;
-
-               follows = ci->i_head_snapc->seq;
-       } else {
-               follows = 0;
-       }
+       follows = flushing ? ci->i_head_snapc->seq : 0;
 
        keep = cap->implemented;
        seq = cap->seq;
@@ -1237,7 +1196,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
        spin_unlock(&ci->i_ceph_lock);
 
        ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
-               op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
+               op, keep, want, flushing, seq,
+               flush_tid, oldest_flush_tid, issue_seq, mseq,
                size, max_size, &mtime, &atime, time_warp_seq,
                uid, gid, mode, xattr_version, xattr_blob,
                follows, inline_data);
@@ -1259,14 +1219,14 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
  * asynchronously back to the MDS once sync writes complete and dirty
  * data is written out.
  *
- * Unless @again is true, skip cap_snaps that were already sent to
+ * Unless @kick is true, skip cap_snaps that were already sent to
  * the MDS (i.e., during this session).
  *
  * Called under i_ceph_lock.  Takes s_mutex as needed.
  */
 void __ceph_flush_snaps(struct ceph_inode_info *ci,
                        struct ceph_mds_session **psession,
-                       int again)
+                       int kick)
                __releases(ci->i_ceph_lock)
                __acquires(ci->i_ceph_lock)
 {
@@ -1297,11 +1257,8 @@ void __ceph_flush_snaps(struct ceph_inode_info *ci,
                if (capsnap->dirty_pages || capsnap->writing)
                        break;
 
-               /*
-                * if cap writeback already occurred, we should have dropped
-                * the capsnap in ceph_put_wrbuffer_cap_refs.
-                */
-               BUG_ON(capsnap->dirty == 0);
+               /* should be removed by ceph_try_drop_cap_snap() */
+               BUG_ON(!capsnap->need_flush);
 
                /* pick mds, take s_mutex */
                if (ci->i_auth_cap == NULL) {
@@ -1310,7 +1267,7 @@ void __ceph_flush_snaps(struct ceph_inode_info *ci,
                }
 
                /* only flush each capsnap once */
-               if (!again && !list_empty(&capsnap->flushing_item)) {
+               if (!kick && !list_empty(&capsnap->flushing_item)) {
                        dout("already flushed %p, skipping\n", capsnap);
                        continue;
                }
@@ -1320,6 +1277,9 @@ void __ceph_flush_snaps(struct ceph_inode_info *ci,
 
                if (session && session->s_mds != mds) {
                        dout("oops, wrong session %p mutex\n", session);
+                       if (kick)
+                               goto out;
+
                        mutex_unlock(&session->s_mutex);
                        ceph_put_mds_session(session);
                        session = NULL;
@@ -1343,20 +1303,22 @@ void __ceph_flush_snaps(struct ceph_inode_info *ci,
                        goto retry;
                }
 
-               capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
+               spin_lock(&mdsc->cap_dirty_lock);
+               capsnap->flush_tid = ++mdsc->last_cap_flush_tid;
+               spin_unlock(&mdsc->cap_dirty_lock);
+
                atomic_inc(&capsnap->nref);
-               if (!list_empty(&capsnap->flushing_item))
-                       list_del_init(&capsnap->flushing_item);
-               list_add_tail(&capsnap->flushing_item,
-                             &session->s_cap_snaps_flushing);
+               if (list_empty(&capsnap->flushing_item))
+                       list_add_tail(&capsnap->flushing_item,
+                                     &session->s_cap_snaps_flushing);
                spin_unlock(&ci->i_ceph_lock);
 
                dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n",
                     inode, capsnap, capsnap->follows, capsnap->flush_tid);
                send_cap_msg(session, ceph_vino(inode).ino, 0,
                             CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
-                            capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
-                            capsnap->size, 0,
+                            capsnap->dirty, 0, capsnap->flush_tid, 0,
+                            0, mseq, capsnap->size, 0,
                             &capsnap->mtime, &capsnap->atime,
                             capsnap->time_warp_seq,
                             capsnap->uid, capsnap->gid, capsnap->mode,
@@ -1396,7 +1358,8 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
  * Caller is then responsible for calling __mark_inode_dirty with the
  * returned flags value.
  */
-int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
+int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
+                          struct ceph_cap_flush **pcf)
 {
        struct ceph_mds_client *mdsc =
                ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
@@ -1416,9 +1379,14 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
             ceph_cap_string(was | mask));
        ci->i_dirty_caps |= mask;
        if (was == 0) {
-               if (!ci->i_head_snapc)
+               WARN_ON_ONCE(ci->i_prealloc_cap_flush);
+               swap(ci->i_prealloc_cap_flush, *pcf);
+
+               if (!ci->i_head_snapc) {
+                       WARN_ON_ONCE(!rwsem_is_locked(&mdsc->snap_rwsem));
                        ci->i_head_snapc = ceph_get_snap_context(
                                ci->i_snap_realm->cached_context);
+               }
                dout(" inode %p now dirty snapc %p auth cap %p\n",
                     &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
                BUG_ON(!list_empty(&ci->i_dirty_item));
@@ -1429,6 +1397,8 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
                        ihold(inode);
                        dirty |= I_DIRTY_SYNC;
                }
+       } else {
+               WARN_ON_ONCE(!ci->i_prealloc_cap_flush);
        }
        BUG_ON(list_empty(&ci->i_dirty_item));
        if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
@@ -1438,6 +1408,74 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
        return dirty;
 }
 
+static void __add_cap_flushing_to_inode(struct ceph_inode_info *ci,
+                                       struct ceph_cap_flush *cf)
+{
+       struct rb_node **p = &ci->i_cap_flush_tree.rb_node;
+       struct rb_node *parent = NULL;
+       struct ceph_cap_flush *other = NULL;
+
+       while (*p) {
+               parent = *p;
+               other = rb_entry(parent, struct ceph_cap_flush, i_node);
+
+               if (cf->tid < other->tid)
+                       p = &(*p)->rb_left;
+               else if (cf->tid > other->tid)
+                       p = &(*p)->rb_right;
+               else
+                       BUG();
+       }
+
+       rb_link_node(&cf->i_node, parent, p);
+       rb_insert_color(&cf->i_node, &ci->i_cap_flush_tree);
+}
+
+static void __add_cap_flushing_to_mdsc(struct ceph_mds_client *mdsc,
+                                      struct ceph_cap_flush *cf)
+{
+       struct rb_node **p = &mdsc->cap_flush_tree.rb_node;
+       struct rb_node *parent = NULL;
+       struct ceph_cap_flush *other = NULL;
+
+       while (*p) {
+               parent = *p;
+               other = rb_entry(parent, struct ceph_cap_flush, g_node);
+
+               if (cf->tid < other->tid)
+                       p = &(*p)->rb_left;
+               else if (cf->tid > other->tid)
+                       p = &(*p)->rb_right;
+               else
+                       BUG();
+       }
+
+       rb_link_node(&cf->g_node, parent, p);
+       rb_insert_color(&cf->g_node, &mdsc->cap_flush_tree);
+}
+
+struct ceph_cap_flush *ceph_alloc_cap_flush(void)
+{
+       return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
+}
+
+void ceph_free_cap_flush(struct ceph_cap_flush *cf)
+{
+       if (cf)
+               kmem_cache_free(ceph_cap_flush_cachep, cf);
+}
+
+static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc)
+{
+       struct rb_node *n = rb_first(&mdsc->cap_flush_tree);
+       if (n) {
+               struct ceph_cap_flush *cf =
+                       rb_entry(n, struct ceph_cap_flush, g_node);
+               return cf->tid;
+       }
+       return 0;
+}
+
 /*
  * Add dirty inode to the flushing list.  Assigned a seq number so we
  * can wait for caps to flush without starving.
@@ -1445,14 +1483,17 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
  * Called under i_ceph_lock.
  */
 static int __mark_caps_flushing(struct inode *inode,
-                                struct ceph_mds_session *session)
+                               struct ceph_mds_session *session,
+                               u64 *flush_tid, u64 *oldest_flush_tid)
 {
        struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
        struct ceph_inode_info *ci = ceph_inode(inode);
+       struct ceph_cap_flush *cf = NULL;
        int flushing;
 
        BUG_ON(ci->i_dirty_caps == 0);
        BUG_ON(list_empty(&ci->i_dirty_item));
+       BUG_ON(!ci->i_prealloc_cap_flush);
 
        flushing = ci->i_dirty_caps;
        dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
@@ -1463,22 +1504,31 @@ static int __mark_caps_flushing(struct inode *inode,
        ci->i_dirty_caps = 0;
        dout(" inode %p now !dirty\n", inode);
 
+       swap(cf, ci->i_prealloc_cap_flush);
+       cf->caps = flushing;
+       cf->kick = false;
+
        spin_lock(&mdsc->cap_dirty_lock);
        list_del_init(&ci->i_dirty_item);
 
+       cf->tid = ++mdsc->last_cap_flush_tid;
+       __add_cap_flushing_to_mdsc(mdsc, cf);
+       *oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+
        if (list_empty(&ci->i_flushing_item)) {
-               ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
                list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
                mdsc->num_cap_flushing++;
-               dout(" inode %p now flushing seq %lld\n", inode,
-                    ci->i_cap_flush_seq);
+               dout(" inode %p now flushing tid %llu\n", inode, cf->tid);
        } else {
                list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
-               dout(" inode %p now flushing (more) seq %lld\n", inode,
-                    ci->i_cap_flush_seq);
+               dout(" inode %p now flushing (more) tid %llu\n",
+                    inode, cf->tid);
        }
        spin_unlock(&mdsc->cap_dirty_lock);
 
+       __add_cap_flushing_to_inode(ci, cf);
+
+       *flush_tid = cf->tid;
        return flushing;
 }
 
@@ -1524,6 +1574,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct inode *inode = &ci->vfs_inode;
        struct ceph_cap *cap;
+       u64 flush_tid, oldest_flush_tid;
        int file_wanted, used, cap_used;
        int took_snap_rwsem = 0;             /* true if mdsc->snap_rwsem held */
        int issued, implemented, want, retain, revoking, flushing = 0;
@@ -1553,13 +1604,13 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 retry_locked:
        file_wanted = __ceph_caps_file_wanted(ci);
        used = __ceph_caps_used(ci);
-       want = file_wanted | used;
        issued = __ceph_caps_issued(ci, &implemented);
        revoking = implemented & ~issued;
 
-       retain = want | CEPH_CAP_PIN;
+       want = file_wanted;
+       retain = file_wanted | used | CEPH_CAP_PIN;
        if (!mdsc->stopping && inode->i_nlink > 0) {
-               if (want) {
+               if (file_wanted) {
                        retain |= CEPH_CAP_ANY;       /* be greedy */
                } else if (S_ISDIR(inode->i_mode) &&
                           (issued & CEPH_CAP_FILE_SHARED) &&
@@ -1602,9 +1653,10 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
         * If we fail, it's because pages are locked.... try again later.
         */
        if ((!is_delayed || mdsc->stopping) &&
-           ci->i_wrbuffer_ref == 0 &&               /* no dirty pages... */
-           inode->i_data.nrpages &&                 /* have cached pages */
-           (file_wanted == 0 ||                     /* no open files */
+           !S_ISDIR(inode->i_mode) &&          /* ignore readdir cache */
+           ci->i_wrbuffer_ref == 0 &&          /* no dirty pages... */
+           inode->i_data.nrpages &&            /* have cached pages */
+           (file_wanted == 0 ||                /* no open files */
             (revoking & (CEPH_CAP_FILE_CACHE|
                          CEPH_CAP_FILE_LAZYIO))) && /*  or revoking cache */
            !tried_invalidate) {
@@ -1742,17 +1794,25 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
                        took_snap_rwsem = 1;
                }
 
-               if (cap == ci->i_auth_cap && ci->i_dirty_caps)
-                       flushing = __mark_caps_flushing(inode, session);
-               else
+               if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
+                       flushing = __mark_caps_flushing(inode, session,
+                                                       &flush_tid,
+                                                       &oldest_flush_tid);
+               } else {
                        flushing = 0;
+                       flush_tid = 0;
+                       spin_lock(&mdsc->cap_dirty_lock);
+                       oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+                       spin_unlock(&mdsc->cap_dirty_lock);
+               }
 
                mds = cap->mds;  /* remember mds, so we don't repeat */
                sent++;
 
                /* __send_cap drops i_ceph_lock */
                delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used,
-                                     want, retain, flushing, NULL);
+                                     want, retain, flushing,
+                                     flush_tid, oldest_flush_tid);
                goto retry; /* retake i_ceph_lock and restart our cap scan. */
        }
 
@@ -1781,12 +1841,13 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 /*
  * Try to flush dirty caps back to the auth mds.
  */
-static int try_flush_caps(struct inode *inode, unsigned *flush_tid)
+static int try_flush_caps(struct inode *inode, u64 *ptid)
 {
        struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
        struct ceph_inode_info *ci = ceph_inode(inode);
-       int flushing = 0;
        struct ceph_mds_session *session = NULL;
+       int flushing = 0;
+       u64 flush_tid = 0, oldest_flush_tid = 0;
 
 retry:
        spin_lock(&ci->i_ceph_lock);
@@ -1811,42 +1872,54 @@ static int try_flush_caps(struct inode *inode, unsigned *flush_tid)
                if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
                        goto out;
 
-               flushing = __mark_caps_flushing(inode, session);
+               flushing = __mark_caps_flushing(inode, session, &flush_tid,
+                                               &oldest_flush_tid);
 
                /* __send_cap drops i_ceph_lock */
                delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
-                                    cap->issued | cap->implemented, flushing,
-                                    flush_tid);
-               if (!delayed)
-                       goto out_unlocked;
+                                    (cap->issued | cap->implemented),
+                                    flushing, flush_tid, oldest_flush_tid);
 
-               spin_lock(&ci->i_ceph_lock);
-               __cap_delay_requeue(mdsc, ci);
+               if (delayed) {
+                       spin_lock(&ci->i_ceph_lock);
+                       __cap_delay_requeue(mdsc, ci);
+                       spin_unlock(&ci->i_ceph_lock);
+               }
+       } else {
+               struct rb_node *n = rb_last(&ci->i_cap_flush_tree);
+               if (n) {
+                       struct ceph_cap_flush *cf =
+                               rb_entry(n, struct ceph_cap_flush, i_node);
+                       flush_tid = cf->tid;
+               }
+               flushing = ci->i_flushing_caps;
+               spin_unlock(&ci->i_ceph_lock);
        }
 out:
-       spin_unlock(&ci->i_ceph_lock);
-out_unlocked:
        if (session)
                mutex_unlock(&session->s_mutex);
+
+       *ptid = flush_tid;
        return flushing;
 }
 
 /*
  * Return true if we've flushed caps through the given flush_tid.
  */
-static int caps_are_flushed(struct inode *inode, unsigned tid)
+static int caps_are_flushed(struct inode *inode, u64 flush_tid)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
-       int i, ret = 1;
+       struct ceph_cap_flush *cf;
+       struct rb_node *n;
+       int ret = 1;
 
        spin_lock(&ci->i_ceph_lock);
-       for (i = 0; i < CEPH_CAP_BITS; i++)
-               if ((ci->i_flushing_caps & (1 << i)) &&
-                   ci->i_cap_flush_tid[i] <= tid) {
-                       /* still flushing this bit */
+       n = rb_first(&ci->i_cap_flush_tree);
+       if (n) {
+               cf = rb_entry(n, struct ceph_cap_flush, i_node);
+               if (cf->tid <= flush_tid)
                        ret = 0;
-                       break;
-               }
+       }
        spin_unlock(&ci->i_ceph_lock);
        return ret;
 }
@@ -1864,13 +1937,16 @@ static void sync_write_wait(struct inode *inode)
        struct ceph_osd_request *req;
        u64 last_tid;
 
+       if (!S_ISREG(inode->i_mode))
+               return;
+
        spin_lock(&ci->i_unsafe_lock);
        if (list_empty(head))
                goto out;
 
        /* set upper bound as _last_ entry in chain */
-       req = list_entry(head->prev, struct ceph_osd_request,
-                        r_unsafe_item);
+       req = list_last_entry(head, struct ceph_osd_request,
+                             r_unsafe_item);
        last_tid = req->r_tid;
 
        do {
@@ -1888,18 +1964,64 @@ static void sync_write_wait(struct inode *inode)
                 */
                if (list_empty(head))
                        break;
-               req = list_entry(head->next, struct ceph_osd_request,
-                                r_unsafe_item);
+               req = list_first_entry(head, struct ceph_osd_request,
+                                      r_unsafe_item);
        } while (req->r_tid < last_tid);
 out:
        spin_unlock(&ci->i_unsafe_lock);
 }
 
+/*
+ * wait for any uncommitted directory operations to commit.
+ */
+static int unsafe_dirop_wait(struct inode *inode)
+{
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       struct list_head *head = &ci->i_unsafe_dirops;
+       struct ceph_mds_request *req;
+       u64 last_tid;
+       int ret = 0;
+
+       if (!S_ISDIR(inode->i_mode))
+               return 0;
+
+       spin_lock(&ci->i_unsafe_lock);
+       if (list_empty(head))
+               goto out;
+
+       req = list_last_entry(head, struct ceph_mds_request,
+                             r_unsafe_dir_item);
+       last_tid = req->r_tid;
+
+       do {
+               ceph_mdsc_get_request(req);
+               spin_unlock(&ci->i_unsafe_lock);
+
+               dout("unsafe_dirop_wait %p wait on tid %llu (until %llu)\n",
+                    inode, req->r_tid, last_tid);
+               ret = !wait_for_completion_timeout(&req->r_safe_completion,
+                                       ceph_timeout_jiffies(req->r_timeout));
+               if (ret)
+                       ret = -EIO;  /* timed out */
+
+               ceph_mdsc_put_request(req);
+
+               spin_lock(&ci->i_unsafe_lock);
+               if (ret || list_empty(head))
+                       break;
+               req = list_first_entry(head, struct ceph_mds_request,
+                                      r_unsafe_dir_item);
+       } while (req->r_tid < last_tid);
+out:
+       spin_unlock(&ci->i_unsafe_lock);
+       return ret;
+}
+
 int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
        struct inode *inode = file->f_mapping->host;
        struct ceph_inode_info *ci = ceph_inode(inode);
-       unsigned flush_tid;
+       u64 flush_tid;
        int ret;
        int dirty;
 
@@ -1908,25 +2030,30 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 
        ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
        if (ret < 0)
-               return ret;
+               goto out;
+
+       if (datasync)
+               goto out;
+
        mutex_lock(&inode->i_mutex);
 
        dirty = try_flush_caps(inode, &flush_tid);
        dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
 
+       ret = unsafe_dirop_wait(inode);
+
        /*
         * only wait on non-file metadata writeback (the mds
         * can recover size and mtime, so we don't need to
         * wait for that)
         */
-       if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
-               dout("fsync waiting for flush_tid %u\n", flush_tid);
+       if (!ret && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
                ret = wait_event_interruptible(ci->i_cap_wq,
-                                      caps_are_flushed(inode, flush_tid));
+                                       caps_are_flushed(inode, flush_tid));
        }
-
-       dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
        mutex_unlock(&inode->i_mutex);
+out:
+       dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret);
        return ret;
 }
 
@@ -1939,7 +2066,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
-       unsigned flush_tid;
+       u64 flush_tid;
        int err = 0;
        int dirty;
        int wait = wbc->sync_mode == WB_SYNC_ALL;
@@ -1994,6 +2121,104 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
        }
 }
 
+static int __kick_flushing_caps(struct ceph_mds_client *mdsc,
+                               struct ceph_mds_session *session,
+                               struct ceph_inode_info *ci,
+                               bool kick_all)
+{
+       struct inode *inode = &ci->vfs_inode;
+       struct ceph_cap *cap;
+       struct ceph_cap_flush *cf;
+       struct rb_node *n;
+       int delayed = 0;
+       u64 first_tid = 0;
+       u64 oldest_flush_tid;
+
+       spin_lock(&mdsc->cap_dirty_lock);
+       oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+       spin_unlock(&mdsc->cap_dirty_lock);
+
+       while (true) {
+               spin_lock(&ci->i_ceph_lock);
+               cap = ci->i_auth_cap;
+               if (!(cap && cap->session == session)) {
+                       pr_err("%p auth cap %p not mds%d ???\n", inode,
+                                       cap, session->s_mds);
+                       spin_unlock(&ci->i_ceph_lock);
+                       break;
+               }
+
+               for (n = rb_first(&ci->i_cap_flush_tree); n; n = rb_next(n)) {
+                       cf = rb_entry(n, struct ceph_cap_flush, i_node);
+                       if (cf->tid < first_tid)
+                               continue;
+                       if (kick_all || cf->kick)
+                               break;
+               }
+               if (!n) {
+                       spin_unlock(&ci->i_ceph_lock);
+                       break;
+               }
+
+               cf = rb_entry(n, struct ceph_cap_flush, i_node);
+               cf->kick = false;
+
+               first_tid = cf->tid + 1;
+
+               dout("kick_flushing_caps %p cap %p tid %llu %s\n", inode,
+                    cap, cf->tid, ceph_cap_string(cf->caps));
+               delayed |= __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+                                     __ceph_caps_used(ci),
+                                     __ceph_caps_wanted(ci),
+                                     cap->issued | cap->implemented,
+                                     cf->caps, cf->tid, oldest_flush_tid);
+       }
+       return delayed;
+}
+
+void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
+                                  struct ceph_mds_session *session)
+{
+       struct ceph_inode_info *ci;
+       struct ceph_cap *cap;
+       struct ceph_cap_flush *cf;
+       struct rb_node *n;
+
+       dout("early_kick_flushing_caps mds%d\n", session->s_mds);
+       list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
+               spin_lock(&ci->i_ceph_lock);
+               cap = ci->i_auth_cap;
+               if (!(cap && cap->session == session)) {
+                       pr_err("%p auth cap %p not mds%d ???\n",
+                               &ci->vfs_inode, cap, session->s_mds);
+                       spin_unlock(&ci->i_ceph_lock);
+                       continue;
+               }
+
+
+               /*
+                * if flushing caps were revoked, we re-send the cap flush
+                * in client reconnect stage. This guarantees the MDS
+                * processes the cap flush message before issuing the
+                * flushing caps to other clients.
+                */
+               if ((cap->issued & ci->i_flushing_caps) !=
+                   ci->i_flushing_caps) {
+                       spin_unlock(&ci->i_ceph_lock);
+                       if (!__kick_flushing_caps(mdsc, session, ci, true))
+                               continue;
+                       spin_lock(&ci->i_ceph_lock);
+               }
+
+               for (n = rb_first(&ci->i_cap_flush_tree); n; n = rb_next(n)) {
+                       cf = rb_entry(n, struct ceph_cap_flush, i_node);
+                       cf->kick = true;
+               }
+
+               spin_unlock(&ci->i_ceph_lock);
+       }
+}
+
 void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
                             struct ceph_mds_session *session)
 {
@@ -2003,28 +2228,10 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
 
        dout("kick_flushing_caps mds%d\n", session->s_mds);
        list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
-               struct inode *inode = &ci->vfs_inode;
-               struct ceph_cap *cap;
-               int delayed = 0;
-
-               spin_lock(&ci->i_ceph_lock);
-               cap = ci->i_auth_cap;
-               if (cap && cap->session == session) {
-                       dout("kick_flushing_caps %p cap %p %s\n", inode,
-                            cap, ceph_cap_string(ci->i_flushing_caps));
-                       delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
-                                            __ceph_caps_used(ci),
-                                            __ceph_caps_wanted(ci),
-                                            cap->issued | cap->implemented,
-                                            ci->i_flushing_caps, NULL);
-                       if (delayed) {
-                               spin_lock(&ci->i_ceph_lock);
-                               __cap_delay_requeue(mdsc, ci);
-                               spin_unlock(&ci->i_ceph_lock);
-                       }
-               } else {
-                       pr_err("%p auth cap %p not mds%d ???\n", inode,
-                              cap, session->s_mds);
+               int delayed = __kick_flushing_caps(mdsc, session, ci, false);
+               if (delayed) {
+                       spin_lock(&ci->i_ceph_lock);
+                       __cap_delay_requeue(mdsc, ci);
                        spin_unlock(&ci->i_ceph_lock);
                }
        }
@@ -2036,26 +2243,25 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_cap *cap;
-       int delayed = 0;
 
        spin_lock(&ci->i_ceph_lock);
        cap = ci->i_auth_cap;
-       dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
-            ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
+       dout("kick_flushing_inode_caps %p flushing %s\n", inode,
+            ceph_cap_string(ci->i_flushing_caps));
 
        __ceph_flush_snaps(ci, &session, 1);
 
        if (ci->i_flushing_caps) {
+               int delayed;
+
                spin_lock(&mdsc->cap_dirty_lock);
                list_move_tail(&ci->i_flushing_item,
                               &cap->session->s_cap_flushing);
                spin_unlock(&mdsc->cap_dirty_lock);
 
-               delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
-                                    __ceph_caps_used(ci),
-                                    __ceph_caps_wanted(ci),
-                                    cap->issued | cap->implemented,
-                                    ci->i_flushing_caps, NULL);
+               spin_unlock(&ci->i_ceph_lock);
+
+               delayed = __kick_flushing_caps(mdsc, session, ci, true);
                if (delayed) {
                        spin_lock(&ci->i_ceph_lock);
                        __cap_delay_requeue(mdsc, ci);
@@ -2073,7 +2279,8 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
  *
  * Protected by i_ceph_lock.
  */
-static void __take_cap_refs(struct ceph_inode_info *ci, int got)
+static void __take_cap_refs(struct ceph_inode_info *ci, int got,
+                           bool snap_rwsem_locked)
 {
        if (got & CEPH_CAP_PIN)
                ci->i_pin_ref++;
@@ -2081,8 +2288,14 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got)
                ci->i_rd_ref++;
        if (got & CEPH_CAP_FILE_CACHE)
                ci->i_rdcache_ref++;
-       if (got & CEPH_CAP_FILE_WR)
+       if (got & CEPH_CAP_FILE_WR) {
+               if (ci->i_wr_ref == 0 && !ci->i_head_snapc) {
+                       BUG_ON(!snap_rwsem_locked);
+                       ci->i_head_snapc = ceph_get_snap_context(
+                                       ci->i_snap_realm->cached_context);
+               }
                ci->i_wr_ref++;
+       }
        if (got & CEPH_CAP_FILE_BUFFER) {
                if (ci->i_wb_ref == 0)
                        ihold(&ci->vfs_inode);
@@ -2100,16 +2313,19 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got)
  * requested from the MDS.
  */
 static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
-                           loff_t endoff, int *got, int *check_max, int *err)
+                           loff_t endoff, bool nonblock, int *got, int *err)
 {
        struct inode *inode = &ci->vfs_inode;
+       struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
        int ret = 0;
        int have, implemented;
        int file_wanted;
+       bool snap_rwsem_locked = false;
 
        dout("get_cap_refs %p need %s want %s\n", inode,
             ceph_cap_string(need), ceph_cap_string(want));
 
+again:
        spin_lock(&ci->i_ceph_lock);
 
        /* make sure file is actually open */
@@ -2125,6 +2341,10 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
        /* finish pending truncate */
        while (ci->i_truncate_pending) {
                spin_unlock(&ci->i_ceph_lock);
+               if (snap_rwsem_locked) {
+                       up_read(&mdsc->snap_rwsem);
+                       snap_rwsem_locked = false;
+               }
                __ceph_do_pending_vmtruncate(inode);
                spin_lock(&ci->i_ceph_lock);
        }
@@ -2136,7 +2356,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
                        dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
                             inode, endoff, ci->i_max_size);
                        if (endoff > ci->i_requested_max_size) {
-                               *check_max = 1;
+                               *err = -EAGAIN;
                                ret = 1;
                        }
                        goto out_unlock;
@@ -2164,8 +2384,29 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
                     inode, ceph_cap_string(have), ceph_cap_string(not),
                     ceph_cap_string(revoking));
                if ((revoking & not) == 0) {
+                       if (!snap_rwsem_locked &&
+                           !ci->i_head_snapc &&
+                           (need & CEPH_CAP_FILE_WR)) {
+                               if (!down_read_trylock(&mdsc->snap_rwsem)) {
+                                       /*
+                                        * we cannot call down_read() when the
+                                        * task isn't in TASK_RUNNING state
+                                        */
+                                       if (nonblock) {
+                                               *err = -EAGAIN;
+                                               ret = 1;
+                                               goto out_unlock;
+                                       }
+
+                                       spin_unlock(&ci->i_ceph_lock);
+                                       down_read(&mdsc->snap_rwsem);
+                                       snap_rwsem_locked = true;
+                                       goto again;
+                               }
+                               snap_rwsem_locked = true;
+                       }
                        *got = need | (have & want);
-                       __take_cap_refs(ci, *got);
+                       __take_cap_refs(ci, *got, true);
                        ret = 1;
                }
        } else {
@@ -2189,6 +2430,8 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
        }
 out_unlock:
        spin_unlock(&ci->i_ceph_lock);
+       if (snap_rwsem_locked)
+               up_read(&mdsc->snap_rwsem);
 
        dout("get_cap_refs %p ret %d got %s\n", inode,
             ret, ceph_cap_string(*got));
@@ -2231,50 +2474,70 @@ static void check_max_size(struct inode *inode, loff_t endoff)
 int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
                  loff_t endoff, int *got, struct page **pinned_page)
 {
-       int _got, check_max, ret, err = 0;
+       int _got, ret, err = 0;
 
-retry:
-       if (endoff > 0)
-               check_max_size(&ci->vfs_inode, endoff);
-       _got = 0;
-       check_max = 0;
-       ret = wait_event_interruptible(ci->i_cap_wq,
-                               try_get_cap_refs(ci, need, want, endoff,
-                                                &_got, &check_max, &err));
-       if (err)
-               ret = err;
+       ret = ceph_pool_perm_check(ci, need);
        if (ret < 0)
                return ret;
 
-       if (check_max)
-               goto retry;
+       while (true) {
+               if (endoff > 0)
+                       check_max_size(&ci->vfs_inode, endoff);
 
-       if (ci->i_inline_version != CEPH_INLINE_NONE &&
-           (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
-           i_size_read(&ci->vfs_inode) > 0) {
-               struct page *page = find_get_page(ci->vfs_inode.i_mapping, 0);
-               if (page) {
-                       if (PageUptodate(page)) {
-                               *pinned_page = page;
-                               goto out;
-                       }
-                       page_cache_release(page);
-               }
-               /*
-                * drop cap refs first because getattr while holding
-                * caps refs can cause deadlock.
-                */
-               ceph_put_cap_refs(ci, _got);
+               err = 0;
                _got = 0;
+               ret = try_get_cap_refs(ci, need, want, endoff,
+                                      false, &_got, &err);
+               if (ret) {
+                       if (err == -EAGAIN)
+                               continue;
+                       if (err < 0)
+                               return err;
+               } else {
+                       ret = wait_event_interruptible(ci->i_cap_wq,
+                                       try_get_cap_refs(ci, need, want, endoff,
+                                                        true, &_got, &err));
+                       if (err == -EAGAIN)
+                               continue;
+                       if (err < 0)
+                               ret = err;
+                       if (ret < 0)
+                               return ret;
+               }
 
-               /* getattr request will bring inline data into page cache */
-               ret = __ceph_do_getattr(&ci->vfs_inode, NULL,
-                                       CEPH_STAT_CAP_INLINE_DATA, true);
-               if (ret < 0)
-                       return ret;
-               goto retry;
+               if (ci->i_inline_version != CEPH_INLINE_NONE &&
+                   (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
+                   i_size_read(&ci->vfs_inode) > 0) {
+                       struct page *page =
+                               find_get_page(ci->vfs_inode.i_mapping, 0);
+                       if (page) {
+                               if (PageUptodate(page)) {
+                                       *pinned_page = page;
+                                       break;
+                               }
+                               page_cache_release(page);
+                       }
+                       /*
+                        * drop cap refs first because getattr while
+                        * holding caps refs can cause deadlock.
+                        */
+                       ceph_put_cap_refs(ci, _got);
+                       _got = 0;
+
+                       /*
+                        * getattr request will bring inline data into
+                        * page cache
+                        */
+                       ret = __ceph_do_getattr(&ci->vfs_inode, NULL,
+                                               CEPH_STAT_CAP_INLINE_DATA,
+                                               true);
+                       if (ret < 0)
+                               return ret;
+                       continue;
+               }
+               break;
        }
-out:
+
        *got = _got;
        return 0;
 }
@@ -2286,10 +2549,31 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
 void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
 {
        spin_lock(&ci->i_ceph_lock);
-       __take_cap_refs(ci, caps);
+       __take_cap_refs(ci, caps, false);
        spin_unlock(&ci->i_ceph_lock);
 }
 
+
+/*
+ * drop cap_snap that is not associated with any snapshot.
+ * we don't need to send a FLUSHSNAP message for it.
+ */
+static int ceph_try_drop_cap_snap(struct ceph_cap_snap *capsnap)
+{
+       if (!capsnap->need_flush &&
+           !capsnap->writing && !capsnap->dirty_pages) {
+
+               dout("dropping cap_snap %p follows %llu\n",
+                    capsnap, capsnap->follows);
+               ceph_put_snap_context(capsnap->context);
+               list_del(&capsnap->ci_item);
+               list_del(&capsnap->flushing_item);
+               ceph_put_cap_snap(capsnap);
+               return 1;
+       }
+       return 0;
+}
+
 /*
  * Release cap refs.
  *
@@ -2303,7 +2587,6 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
 {
        struct inode *inode = &ci->vfs_inode;
        int last = 0, put = 0, flushsnaps = 0, wake = 0;
-       struct ceph_cap_snap *capsnap;
 
        spin_lock(&ci->i_ceph_lock);
        if (had & CEPH_CAP_PIN)
@@ -2325,17 +2608,24 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
        if (had & CEPH_CAP_FILE_WR)
                if (--ci->i_wr_ref == 0) {
                        last++;
-                       if (!list_empty(&ci->i_cap_snaps)) {
-                               capsnap = list_first_entry(&ci->i_cap_snaps,
-                                                    struct ceph_cap_snap,
-                                                    ci_item);
-                               if (capsnap->writing) {
-                                       capsnap->writing = 0;
-                                       flushsnaps =
-                                               __ceph_finish_cap_snap(ci,
-                                                                      capsnap);
-                                       wake = 1;
-                               }
+                       if (__ceph_have_pending_cap_snap(ci)) {
+                               struct ceph_cap_snap *capsnap =
+                                       list_last_entry(&ci->i_cap_snaps,
+                                                       struct ceph_cap_snap,
+                                                       ci_item);
+                               capsnap->writing = 0;
+                               if (ceph_try_drop_cap_snap(capsnap))
+                                       put++;
+                               else if (__ceph_finish_cap_snap(ci, capsnap))
+                                       flushsnaps = 1;
+                               wake = 1;
+                       }
+                       if (ci->i_wrbuffer_ref_head == 0 &&
+                           ci->i_dirty_caps == 0 &&
+                           ci->i_flushing_caps == 0) {
+                               BUG_ON(!ci->i_head_snapc);
+                               ceph_put_snap_context(ci->i_head_snapc);
+                               ci->i_head_snapc = NULL;
                        }
                        /* see comment in __ceph_remove_cap() */
                        if (!__ceph_is_any_caps(ci) && ci->i_snap_realm)
@@ -2352,7 +2642,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
                ceph_flush_snaps(ci);
        if (wake)
                wake_up_all(&ci->i_cap_wq);
-       if (put)
+       while (put-- > 0)
                iput(inode);
 }
 
@@ -2380,7 +2670,9 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
        if (ci->i_head_snapc == snapc) {
                ci->i_wrbuffer_ref_head -= nr;
                if (ci->i_wrbuffer_ref_head == 0 &&
-                   ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) {
+                   ci->i_wr_ref == 0 &&
+                   ci->i_dirty_caps == 0 &&
+                   ci->i_flushing_caps == 0) {
                        BUG_ON(!ci->i_head_snapc);
                        ceph_put_snap_context(ci->i_head_snapc);
                        ci->i_head_snapc = NULL;
@@ -2401,25 +2693,15 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
                capsnap->dirty_pages -= nr;
                if (capsnap->dirty_pages == 0) {
                        complete_capsnap = 1;
-                       if (capsnap->dirty == 0)
-                               /* cap writeback completed before we created
-                                * the cap_snap; no FLUSHSNAP is needed */
-                               drop_capsnap = 1;
+                       drop_capsnap = ceph_try_drop_cap_snap(capsnap);
                }
                dout("put_wrbuffer_cap_refs on %p cap_snap %p "
-                    " snap %lld %d/%d -> %d/%d %s%s%s\n",
+                    " snap %lld %d/%d -> %d/%d %s%s\n",
                     inode, capsnap, capsnap->context->seq,
                     ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
                     ci->i_wrbuffer_ref, capsnap->dirty_pages,
                     last ? " (wrbuffer last)" : "",
-                    complete_capsnap ? " (complete capsnap)" : "",
-                    drop_capsnap ? " (drop capsnap)" : "");
-               if (drop_capsnap) {
-                       ceph_put_snap_context(capsnap->context);
-                       list_del(&capsnap->ci_item);
-                       list_del(&capsnap->flushing_item);
-                       ceph_put_cap_snap(capsnap);
-               }
+                    complete_capsnap ? " (complete capsnap)" : "");
        }
 
        spin_unlock(&ci->i_ceph_lock);
@@ -2526,7 +2808,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
         * try to invalidate (once).  (If there are dirty buffers, we
         * will invalidate _after_ writeback.)
         */
-       if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
+       if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */
+           ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
            (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
            !ci->i_wrbuffer_ref) {
                if (try_nonblocking_invalidate(inode)) {
@@ -2732,16 +3015,29 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+       struct ceph_cap_flush *cf;
+       struct rb_node *n;
+       LIST_HEAD(to_remove);
        unsigned seq = le32_to_cpu(m->seq);
        int dirty = le32_to_cpu(m->dirty);
        int cleaned = 0;
        int drop = 0;
-       int i;
 
-       for (i = 0; i < CEPH_CAP_BITS; i++)
-               if ((dirty & (1 << i)) &&
-                   (u16)flush_tid == ci->i_cap_flush_tid[i])
-                       cleaned |= 1 << i;
+       n = rb_first(&ci->i_cap_flush_tree);
+       while (n) {
+               cf = rb_entry(n, struct ceph_cap_flush, i_node);
+               n = rb_next(&cf->i_node);
+               if (cf->tid == flush_tid)
+                       cleaned = cf->caps;
+               if (cf->tid <= flush_tid) {
+                       rb_erase(&cf->i_node, &ci->i_cap_flush_tree);
+                       list_add_tail(&cf->list, &to_remove);
+               } else {
+                       cleaned &= ~cf->caps;
+                       if (!cleaned)
+                               break;
+               }
+       }
 
        dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
             " flushing %s -> %s\n",
@@ -2749,12 +3045,23 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
             ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
             ceph_cap_string(ci->i_flushing_caps & ~cleaned));
 
-       if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
+       if (list_empty(&to_remove) && !cleaned)
                goto out;
 
        ci->i_flushing_caps &= ~cleaned;
 
        spin_lock(&mdsc->cap_dirty_lock);
+
+       if (!list_empty(&to_remove)) {
+               list_for_each_entry(cf, &to_remove, list)
+                       rb_erase(&cf->g_node, &mdsc->cap_flush_tree);
+
+               n = rb_first(&mdsc->cap_flush_tree);
+               cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL;
+               if (!cf || cf->tid > flush_tid)
+                       wake_up_all(&mdsc->cap_flushing_wq);
+       }
+
        if (ci->i_flushing_caps == 0) {
                list_del_init(&ci->i_flushing_item);
                if (!list_empty(&session->s_cap_flushing))
@@ -2764,14 +3071,14 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
                                         struct ceph_inode_info,
                                         i_flushing_item)->vfs_inode);
                mdsc->num_cap_flushing--;
-               wake_up_all(&mdsc->cap_flushing_wq);
                dout(" inode %p now !flushing\n", inode);
 
                if (ci->i_dirty_caps == 0) {
                        dout(" inode %p now clean\n", inode);
                        BUG_ON(!list_empty(&ci->i_dirty_item));
                        drop = 1;
-                       if (ci->i_wrbuffer_ref_head == 0) {
+                       if (ci->i_wr_ref == 0 &&
+                           ci->i_wrbuffer_ref_head == 0) {
                                BUG_ON(!ci->i_head_snapc);
                                ceph_put_snap_context(ci->i_head_snapc);
                                ci->i_head_snapc = NULL;
@@ -2785,6 +3092,13 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
 
 out:
        spin_unlock(&ci->i_ceph_lock);
+
+       while (!list_empty(&to_remove)) {
+               cf = list_first_entry(&to_remove,
+                                     struct ceph_cap_flush, list);
+               list_del(&cf->list);
+               ceph_free_cap_flush(cf);
+       }
        if (drop)
                iput(inode);
 }
@@ -2800,6 +3114,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
                                     struct ceph_mds_session *session)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
+       struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
        u64 follows = le64_to_cpu(m->snap_follows);
        struct ceph_cap_snap *capsnap;
        int drop = 0;
@@ -2823,6 +3138,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
                        list_del(&capsnap->ci_item);
                        list_del(&capsnap->flushing_item);
                        ceph_put_cap_snap(capsnap);
+                       wake_up_all(&mdsc->cap_flushing_wq);
                        drop = 1;
                        break;
                } else {
@@ -2971,7 +3287,6 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
                        mutex_lock_nested(&session->s_mutex,
                                          SINGLE_DEPTH_NESTING);
                }
-               ceph_add_cap_releases(mdsc, tsession);
                new_cap = ceph_get_cap(mdsc, NULL);
        } else {
                WARN_ON(1);
@@ -3167,16 +3482,20 @@ void ceph_handle_caps(struct ceph_mds_session *session,
        dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
             (unsigned)seq);
 
-       if (op == CEPH_CAP_OP_IMPORT)
-               ceph_add_cap_releases(mdsc, session);
-
        if (!inode) {
                dout(" i don't have ino %llx\n", vino.ino);
 
                if (op == CEPH_CAP_OP_IMPORT) {
+                       cap = ceph_get_cap(mdsc, NULL);
+                       cap->cap_ino = vino.ino;
+                       cap->queue_release = 1;
+                       cap->cap_id = cap_id;
+                       cap->mseq = mseq;
+                       cap->seq = seq;
                        spin_lock(&session->s_cap_lock);
-                       __queue_cap_release(session, vino.ino, cap_id,
-                                           mseq, seq);
+                       list_add_tail(&cap->session_caps,
+                                       &session->s_cap_releases);
+                       session->s_num_cap_releases++;
                        spin_unlock(&session->s_cap_lock);
                }
                goto flush_cap_releases;
@@ -3252,11 +3571,10 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 
 flush_cap_releases:
        /*
-        * send any full release message to try to move things
+        * send any cap release message to try to move things
         * along for the mds (who clearly thinks we still have this
         * cap).
         */
-       ceph_add_cap_releases(mdsc, session);
        ceph_send_cap_releases(mdsc, session);
 
 done:
index 4248307fea909c6f1555758e536a3f733ad5f95d..9314b4ea2375145647aa16446a1f33ae2c8106b2 100644 (file)
@@ -38,7 +38,7 @@ int ceph_init_dentry(struct dentry *dentry)
        if (dentry->d_fsdata)
                return 0;
 
-       di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
+       di = kmem_cache_alloc(ceph_dentry_cachep, GFP_KERNEL | __GFP_ZERO);
        if (!di)
                return -ENOMEM;          /* oh well */
 
@@ -106,6 +106,27 @@ static int fpos_cmp(loff_t l, loff_t r)
        return (int)(fpos_off(l) - fpos_off(r));
 }
 
+/*
+ * make note of the last dentry we read, so we can
+ * continue at the same lexicographical point,
+ * regardless of what dir changes take place on the
+ * server.
+ */
+static int note_last_dentry(struct ceph_file_info *fi, const char *name,
+                           int len, unsigned next_offset)
+{
+       char *buf = kmalloc(len+1, GFP_KERNEL);
+       if (!buf)
+               return -ENOMEM;
+       kfree(fi->last_name);
+       fi->last_name = buf;
+       memcpy(fi->last_name, name, len);
+       fi->last_name[len] = 0;
+       fi->next_offset = next_offset;
+       dout("note_last_dentry '%s'\n", fi->last_name);
+       return 0;
+}
+
 /*
  * When possible, we try to satisfy a readdir by peeking at the
  * dcache.  We make this work by carefully ordering dentries on
@@ -123,123 +144,113 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
        struct ceph_file_info *fi = file->private_data;
        struct dentry *parent = file->f_path.dentry;
        struct inode *dir = d_inode(parent);
-       struct list_head *p;
-       struct dentry *dentry, *last;
+       struct dentry *dentry, *last = NULL;
        struct ceph_dentry_info *di;
+       unsigned nsize = PAGE_CACHE_SIZE / sizeof(struct dentry *);
        int err = 0;
+       loff_t ptr_pos = 0;
+       struct ceph_readdir_cache_control cache_ctl = {};
 
-       /* claim ref on last dentry we returned */
-       last = fi->dentry;
-       fi->dentry = NULL;
-
-       dout("__dcache_readdir %p v%u at %llu (last %p)\n",
-            dir, shared_gen, ctx->pos, last);
+       dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos);
 
-       spin_lock(&parent->d_lock);
-
-       /* start at beginning? */
-       if (ctx->pos == 2 || last == NULL ||
-           fpos_cmp(ctx->pos, ceph_dentry(last)->offset) < 0) {
-               if (list_empty(&parent->d_subdirs))
-                       goto out_unlock;
-               p = parent->d_subdirs.prev;
-               dout(" initial p %p/%p\n", p->prev, p->next);
-       } else {
-               p = last->d_child.prev;
+       /* we can calculate cache index for the first dirfrag */
+       if (ceph_frag_is_leftmost(fpos_frag(ctx->pos))) {
+               cache_ctl.index = fpos_off(ctx->pos) - 2;
+               BUG_ON(cache_ctl.index < 0);
+               ptr_pos = cache_ctl.index * sizeof(struct dentry *);
        }
 
-more:
-       dentry = list_entry(p, struct dentry, d_child);
-       di = ceph_dentry(dentry);
-       while (1) {
-               dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next,
-                    d_unhashed(dentry) ? "!hashed" : "hashed",
-                    parent->d_subdirs.prev, parent->d_subdirs.next);
-               if (p == &parent->d_subdirs) {
+       while (true) {
+               pgoff_t pgoff;
+               bool emit_dentry;
+
+               if (ptr_pos >= i_size_read(dir)) {
                        fi->flags |= CEPH_F_ATEND;
-                       goto out_unlock;
+                       err = 0;
+                       break;
+               }
+
+               err = -EAGAIN;
+               pgoff = ptr_pos >> PAGE_CACHE_SHIFT;
+               if (!cache_ctl.page || pgoff != page_index(cache_ctl.page)) {
+                       ceph_readdir_cache_release(&cache_ctl);
+                       cache_ctl.page = find_lock_page(&dir->i_data, pgoff);
+                       if (!cache_ctl.page) {
+                               dout(" page %lu not found\n", pgoff);
+                               break;
+                       }
+                       /* reading/filling the cache are serialized by
+                        * i_mutex, no need to use page lock */
+                       unlock_page(cache_ctl.page);
+                       cache_ctl.dentries = kmap(cache_ctl.page);
                }
-               spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+
+               rcu_read_lock();
+               spin_lock(&parent->d_lock);
+               /* check i_size again here, because an empty directory can be
+                * marked as complete while not holding the i_mutex. */
+               if (ceph_dir_is_complete_ordered(dir) &&
+                   ptr_pos < i_size_read(dir))
+                       dentry = cache_ctl.dentries[cache_ctl.index % nsize];
+               else
+                       dentry = NULL;
+               spin_unlock(&parent->d_lock);
+               if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
+                       dentry = NULL;
+               rcu_read_unlock();
+               if (!dentry)
+                       break;
+
+               emit_dentry = false;
+               di = ceph_dentry(dentry);
+               spin_lock(&dentry->d_lock);
                if (di->lease_shared_gen == shared_gen &&
-                   !d_unhashed(dentry) && d_really_is_positive(dentry) &&
+                   d_really_is_positive(dentry) &&
                    ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR &&
                    ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH &&
-                   fpos_cmp(ctx->pos, di->offset) <= 0)
-                       break;
-               dout(" skipping %p %pd at %llu (%llu)%s%s\n", dentry,
-                    dentry, di->offset,
-                    ctx->pos, d_unhashed(dentry) ? " unhashed" : "",
-                    !d_inode(dentry) ? " null" : "");
+                   fpos_cmp(ctx->pos, di->offset) <= 0) {
+                       emit_dentry = true;
+               }
                spin_unlock(&dentry->d_lock);
-               p = p->prev;
-               dentry = list_entry(p, struct dentry, d_child);
-               di = ceph_dentry(dentry);
-       }
-
-       dget_dlock(dentry);
-       spin_unlock(&dentry->d_lock);
-       spin_unlock(&parent->d_lock);
 
-       /* make sure a dentry wasn't dropped while we didn't have parent lock */
-       if (!ceph_dir_is_complete_ordered(dir)) {
-               dout(" lost dir complete on %p; falling back to mds\n", dir);
-               dput(dentry);
-               err = -EAGAIN;
-               goto out;
-       }
+               if (emit_dentry) {
+                       dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos,
+                            dentry, dentry, d_inode(dentry));
+                       ctx->pos = di->offset;
+                       if (!dir_emit(ctx, dentry->d_name.name,
+                                     dentry->d_name.len,
+                                     ceph_translate_ino(dentry->d_sb,
+                                                        d_inode(dentry)->i_ino),
+                                     d_inode(dentry)->i_mode >> 12)) {
+                               dput(dentry);
+                               err = 0;
+                               break;
+                       }
+                       ctx->pos++;
 
-       dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos,
-            dentry, dentry, d_inode(dentry));
-       if (!dir_emit(ctx, dentry->d_name.name,
-                     dentry->d_name.len,
-                     ceph_translate_ino(dentry->d_sb, d_inode(dentry)->i_ino),
-                     d_inode(dentry)->i_mode >> 12)) {
-               if (last) {
-                       /* remember our position */
-                       fi->dentry = last;
-                       fi->next_offset = fpos_off(di->offset);
+                       if (last)
+                               dput(last);
+                       last = dentry;
+               } else {
+                       dput(dentry);
                }
-               dput(dentry);
-               return 0;
-       }
-
-       ctx->pos = di->offset + 1;
 
-       if (last)
-               dput(last);
-       last = dentry;
-
-       spin_lock(&parent->d_lock);
-       p = p->prev;    /* advance to next dentry */
-       goto more;
-
-out_unlock:
-       spin_unlock(&parent->d_lock);
-out:
-       if (last)
+               cache_ctl.index++;
+               ptr_pos += sizeof(struct dentry *);
+       }
+       ceph_readdir_cache_release(&cache_ctl);
+       if (last) {
+               int ret;
+               di = ceph_dentry(last);
+               ret = note_last_dentry(fi, last->d_name.name, last->d_name.len,
+                                      fpos_off(di->offset) + 1);
+               if (ret < 0)
+                       err = ret;
                dput(last);
+       }
        return err;
 }
 
-/*
- * make note of the last dentry we read, so we can
- * continue at the same lexicographical point,
- * regardless of what dir changes take place on the
- * server.
- */
-static int note_last_dentry(struct ceph_file_info *fi, const char *name,
-                           int len)
-{
-       kfree(fi->last_name);
-       fi->last_name = kmalloc(len+1, GFP_NOFS);
-       if (!fi->last_name)
-               return -ENOMEM;
-       memcpy(fi->last_name, name, len);
-       fi->last_name[len] = 0;
-       dout("note_last_dentry '%s'\n", fi->last_name);
-       return 0;
-}
-
 static int ceph_readdir(struct file *file, struct dir_context *ctx)
 {
        struct ceph_file_info *fi = file->private_data;
@@ -280,8 +291,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
 
        /* can we use the dcache? */
        spin_lock(&ci->i_ceph_lock);
-       if ((ctx->pos == 2 || fi->dentry) &&
-           ceph_test_mount_opt(fsc, DCACHE) &&
+       if (ceph_test_mount_opt(fsc, DCACHE) &&
            !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
            ceph_snap(inode) != CEPH_SNAPDIR &&
            __ceph_dir_is_complete_ordered(ci) &&
@@ -296,24 +306,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
        } else {
                spin_unlock(&ci->i_ceph_lock);
        }
-       if (fi->dentry) {
-               err = note_last_dentry(fi, fi->dentry->d_name.name,
-                                      fi->dentry->d_name.len);
-               if (err)
-                       return err;
-               dput(fi->dentry);
-               fi->dentry = NULL;
-       }
 
        /* proceed with a normal readdir */
-
-       if (ctx->pos == 2) {
-               /* note dir version at start of readdir so we can tell
-                * if any dentries get dropped */
-               fi->dir_release_count = atomic_read(&ci->i_release_count);
-               fi->dir_ordered_count = ci->i_ordered_count;
-       }
-
 more:
        /* do we have the correct frag content buffered? */
        if (fi->frag != frag || fi->last_readdir == NULL) {
@@ -342,12 +336,15 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
                req->r_direct_hash = ceph_frag_value(frag);
                req->r_direct_is_hash = true;
                if (fi->last_name) {
-                       req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
+                       req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL);
                        if (!req->r_path2) {
                                ceph_mdsc_put_request(req);
                                return -ENOMEM;
                        }
                }
+               req->r_dir_release_cnt = fi->dir_release_count;
+               req->r_dir_ordered_cnt = fi->dir_ordered_count;
+               req->r_readdir_cache_idx = fi->readdir_cache_idx;
                req->r_readdir_offset = fi->next_offset;
                req->r_args.readdir.frag = cpu_to_le32(frag);
 
@@ -364,26 +361,38 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
                     (int)req->r_reply_info.dir_end,
                     (int)req->r_reply_info.dir_complete);
 
-               if (!req->r_did_prepopulate) {
-                       dout("readdir !did_prepopulate");
-                       /* preclude from marking dir complete */
-                       fi->dir_release_count--;
-               }
 
                /* note next offset and last dentry name */
                rinfo = &req->r_reply_info;
                if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
                        frag = le32_to_cpu(rinfo->dir_dir->frag);
-                       if (ceph_frag_is_leftmost(frag))
-                               fi->next_offset = 2;
-                       else
-                               fi->next_offset = 0;
-                       off = fi->next_offset;
+                       off = req->r_readdir_offset;
+                       fi->next_offset = off;
                }
+
                fi->frag = frag;
                fi->offset = fi->next_offset;
                fi->last_readdir = req;
 
+               if (req->r_did_prepopulate) {
+                       fi->readdir_cache_idx = req->r_readdir_cache_idx;
+                       if (fi->readdir_cache_idx < 0) {
+                               /* preclude from marking dir ordered */
+                               fi->dir_ordered_count = 0;
+                       } else if (ceph_frag_is_leftmost(frag) && off == 2) {
+                               /* note dir version at start of readdir so
+                                * we can tell if any dentries get dropped */
+                               fi->dir_release_count = req->r_dir_release_cnt;
+                               fi->dir_ordered_count = req->r_dir_ordered_cnt;
+                       }
+               } else {
+                       dout("readdir !did_prepopulate");
+                       /* disable readdir cache */
+                       fi->readdir_cache_idx = -1;
+                       /* preclude from marking dir complete */
+                       fi->dir_release_count = 0;
+               }
+
                if (req->r_reply_info.dir_end) {
                        kfree(fi->last_name);
                        fi->last_name = NULL;
@@ -394,10 +403,10 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
                } else {
                        err = note_last_dentry(fi,
                                       rinfo->dir_dname[rinfo->dir_nr-1],
-                                      rinfo->dir_dname_len[rinfo->dir_nr-1]);
+                                      rinfo->dir_dname_len[rinfo->dir_nr-1],
+                                      fi->next_offset + rinfo->dir_nr);
                        if (err)
                                return err;
-                       fi->next_offset += rinfo->dir_nr;
                }
        }
 
@@ -453,16 +462,22 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
         * were released during the whole readdir, and we should have
         * the complete dir contents in our cache.
         */
-       spin_lock(&ci->i_ceph_lock);
-       if (atomic_read(&ci->i_release_count) == fi->dir_release_count) {
-               if (ci->i_ordered_count == fi->dir_ordered_count)
+       if (atomic64_read(&ci->i_release_count) == fi->dir_release_count) {
+               spin_lock(&ci->i_ceph_lock);
+               if (fi->dir_ordered_count == atomic64_read(&ci->i_ordered_count)) {
                        dout(" marking %p complete and ordered\n", inode);
-               else
+                       /* use i_size to track number of entries in
+                        * readdir cache */
+                       BUG_ON(fi->readdir_cache_idx < 0);
+                       i_size_write(inode, fi->readdir_cache_idx *
+                                    sizeof(struct dentry*));
+               } else {
                        dout(" marking %p complete\n", inode);
+               }
                __ceph_dir_set_complete(ci, fi->dir_release_count,
                                        fi->dir_ordered_count);
+               spin_unlock(&ci->i_ceph_lock);
        }
-       spin_unlock(&ci->i_ceph_lock);
 
        dout("readdir %p file %p done.\n", inode, file);
        return 0;
@@ -476,14 +491,12 @@ static void reset_readdir(struct ceph_file_info *fi, unsigned frag)
        }
        kfree(fi->last_name);
        fi->last_name = NULL;
+       fi->dir_release_count = 0;
+       fi->readdir_cache_idx = -1;
        if (ceph_frag_is_leftmost(frag))
                fi->next_offset = 2;  /* compensate for . and .. */
        else
                fi->next_offset = 0;
-       if (fi->dentry) {
-               dput(fi->dentry);
-               fi->dentry = NULL;
-       }
        fi->flags &= ~CEPH_F_ATEND;
 }
 
@@ -497,13 +510,12 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
        mutex_lock(&inode->i_mutex);
        retval = -EINVAL;
        switch (whence) {
-       case SEEK_END:
-               offset += inode->i_size + 2;   /* FIXME */
-               break;
        case SEEK_CUR:
                offset += file->f_pos;
        case SEEK_SET:
                break;
+       case SEEK_END:
+               retval = -EOPNOTSUPP;
        default:
                goto out;
        }
@@ -516,20 +528,18 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
                }
                retval = offset;
 
-               /*
-                * discard buffered readdir content on seekdir(0), or
-                * seek to new frag, or seek prior to current chunk.
-                */
                if (offset == 0 ||
                    fpos_frag(offset) != fi->frag ||
                    fpos_off(offset) < fi->offset) {
+                       /* discard buffered readdir content on seekdir(0), or
+                        * seek to new frag, or seek prior to current chunk */
                        dout("dir_llseek dropping %p content\n", file);
                        reset_readdir(fi, fpos_frag(offset));
+               } else if (fpos_cmp(offset, old_offset) > 0) {
+                       /* reset dir_release_count if we did a forward seek */
+                       fi->dir_release_count = 0;
+                       fi->readdir_cache_idx = -1;
                }
-
-               /* bump dir_release_count if we did a forward seek */
-               if (fpos_cmp(offset, old_offset) > 0)
-                       fi->dir_release_count--;
        }
 out:
        mutex_unlock(&inode->i_mutex);
@@ -764,7 +774,7 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
                err = PTR_ERR(req);
                goto out;
        }
-       req->r_path2 = kstrdup(dest, GFP_NOFS);
+       req->r_path2 = kstrdup(dest, GFP_KERNEL);
        if (!req->r_path2) {
                err = -ENOMEM;
                ceph_mdsc_put_request(req);
@@ -985,16 +995,15 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
                 * to do it here.
                 */
 
+               /* d_move screws up sibling dentries' offsets */
+               ceph_dir_clear_complete(old_dir);
+               ceph_dir_clear_complete(new_dir);
+
                d_move(old_dentry, new_dentry);
 
                /* ensure target dentry is invalidated, despite
                   rehashing bug in vfs_rename_dir */
                ceph_invalidate_dentry_lease(new_dentry);
-
-               /* d_move screws up sibling dentries' offsets */
-               ceph_dir_clear_complete(old_dir);
-               ceph_dir_clear_complete(new_dir);
-
        }
        ceph_mdsc_put_request(req);
        return err;
@@ -1189,7 +1198,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
                return -EISDIR;
 
        if (!cf->dir_info) {
-               cf->dir_info = kmalloc(bufsize, GFP_NOFS);
+               cf->dir_info = kmalloc(bufsize, GFP_KERNEL);
                if (!cf->dir_info)
                        return -ENOMEM;
                cf->dir_info_len =
@@ -1223,66 +1232,6 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
        return size - left;
 }
 
-/*
- * an fsync() on a dir will wait for any uncommitted directory
- * operations to commit.
- */
-static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
-                         int datasync)
-{
-       struct inode *inode = file_inode(file);
-       struct ceph_inode_info *ci = ceph_inode(inode);
-       struct list_head *head = &ci->i_unsafe_dirops;
-       struct ceph_mds_request *req;
-       u64 last_tid;
-       int ret = 0;
-
-       dout("dir_fsync %p\n", inode);
-       ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
-       if (ret)
-               return ret;
-       mutex_lock(&inode->i_mutex);
-
-       spin_lock(&ci->i_unsafe_lock);
-       if (list_empty(head))
-               goto out;
-
-       req = list_entry(head->prev,
-                        struct ceph_mds_request, r_unsafe_dir_item);
-       last_tid = req->r_tid;
-
-       do {
-               ceph_mdsc_get_request(req);
-               spin_unlock(&ci->i_unsafe_lock);
-
-               dout("dir_fsync %p wait on tid %llu (until %llu)\n",
-                    inode, req->r_tid, last_tid);
-               if (req->r_timeout) {
-                       unsigned long time_left = wait_for_completion_timeout(
-                                                       &req->r_safe_completion,
-                                                       req->r_timeout);
-                       if (time_left > 0)
-                               ret = 0;
-                       else
-                               ret = -EIO;  /* timed out */
-               } else {
-                       wait_for_completion(&req->r_safe_completion);
-               }
-               ceph_mdsc_put_request(req);
-
-               spin_lock(&ci->i_unsafe_lock);
-               if (ret || list_empty(head))
-                       break;
-               req = list_entry(head->next,
-                                struct ceph_mds_request, r_unsafe_dir_item);
-       } while (req->r_tid < last_tid);
-out:
-       spin_unlock(&ci->i_unsafe_lock);
-       mutex_unlock(&inode->i_mutex);
-
-       return ret;
-}
-
 /*
  * We maintain a private dentry LRU.
  *
@@ -1353,7 +1302,7 @@ const struct file_operations ceph_dir_fops = {
        .open = ceph_open,
        .release = ceph_release,
        .unlocked_ioctl = ceph_ioctl,
-       .fsync = ceph_dir_fsync,
+       .fsync = ceph_fsync,
 };
 
 const struct file_operations ceph_snapdir_fops = {
index 3b6b522b4b31ed9e2f7193c661894787bfe63627..faf92095e105650617d8e465e898f8d22a803d60 100644 (file)
@@ -89,13 +89,14 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
        case S_IFDIR:
                dout("init_file %p %p 0%o (regular)\n", inode, file,
                     inode->i_mode);
-               cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO);
+               cf = kmem_cache_alloc(ceph_file_cachep, GFP_KERNEL | __GFP_ZERO);
                if (cf == NULL) {
                        ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
                        return -ENOMEM;
                }
                cf->fmode = fmode;
                cf->next_offset = 2;
+               cf->readdir_cache_idx = -1;
                file->private_data = cf;
                BUG_ON(inode->i_fop->release != ceph_release);
                break;
@@ -324,7 +325,6 @@ int ceph_release(struct inode *inode, struct file *file)
                ceph_mdsc_put_request(cf->last_readdir);
        kfree(cf->last_name);
        kfree(cf->dir_info);
-       dput(cf->dentry);
        kmem_cache_free(ceph_file_cachep, cf);
 
        /* wake up anyone waiting for caps on this inode */
@@ -483,7 +483,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
                }
        } else {
                num_pages = calc_pages_for(off, len);
-               pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
+               pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
                if (IS_ERR(pages))
                        return PTR_ERR(pages);
                ret = striped_read(inode, off, len, pages,
@@ -557,13 +557,13 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
  * objects, rollback on failure, etc.)
  */
 static ssize_t
-ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
+ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
+                      struct ceph_snap_context *snapc)
 {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
-       struct ceph_snap_context *snapc;
        struct ceph_vino vino;
        struct ceph_osd_request *req;
        struct page **pages;
@@ -600,7 +600,6 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
                size_t start;
                ssize_t n;
 
-               snapc = ci->i_snap_realm->cached_context;
                vino = ceph_vino(inode);
                req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
                                            vino, pos, &len, 0,
@@ -614,7 +613,7 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
                        break;
                }
 
-               osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC);
+               osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
 
                n = iov_iter_get_pages_alloc(from, &pages, len, &start);
                if (unlikely(n < 0)) {
@@ -674,13 +673,13 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
  * objects, rollback on failure, etc.)
  */
 static ssize_t
-ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
+ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
+               struct ceph_snap_context *snapc)
 {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
-       struct ceph_snap_context *snapc;
        struct ceph_vino vino;
        struct ceph_osd_request *req;
        struct page **pages;
@@ -717,7 +716,6 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
                size_t left;
                int n;
 
-               snapc = ci->i_snap_realm->cached_context;
                vino = ceph_vino(inode);
                req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
                                            vino, pos, &len, 0, 1,
@@ -736,7 +734,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
                 */
                num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
-               pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
+               pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
                if (IS_ERR(pages)) {
                        ret = PTR_ERR(pages);
                        goto out;
@@ -860,7 +858,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
                struct page *page = NULL;
                loff_t i_size;
                if (retry_op == READ_INLINE) {
-                       page = __page_cache_alloc(GFP_NOFS);
+                       page = __page_cache_alloc(GFP_KERNEL);
                        if (!page)
                                return -ENOMEM;
                }
@@ -941,6 +939,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_osd_client *osdc =
                &ceph_sb_to_client(inode->i_sb)->client->osdc;
+       struct ceph_cap_flush *prealloc_cf;
        ssize_t count, written = 0;
        int err, want, got;
        loff_t pos;
@@ -948,6 +947,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
        if (ceph_snap(inode) != CEPH_NOSNAP)
                return -EROFS;
 
+       prealloc_cf = ceph_alloc_cap_flush();
+       if (!prealloc_cf)
+               return -ENOMEM;
+
        mutex_lock(&inode->i_mutex);
 
        /* We can write back this queue in page reclaim */
@@ -996,14 +999,30 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
        if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
            (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) {
+               struct ceph_snap_context *snapc;
                struct iov_iter data;
                mutex_unlock(&inode->i_mutex);
+
+               spin_lock(&ci->i_ceph_lock);
+               if (__ceph_have_pending_cap_snap(ci)) {
+                       struct ceph_cap_snap *capsnap =
+                                       list_last_entry(&ci->i_cap_snaps,
+                                                       struct ceph_cap_snap,
+                                                       ci_item);
+                       snapc = ceph_get_snap_context(capsnap->context);
+               } else {
+                       BUG_ON(!ci->i_head_snapc);
+                       snapc = ceph_get_snap_context(ci->i_head_snapc);
+               }
+               spin_unlock(&ci->i_ceph_lock);
+
                /* we might need to revert back to that point */
                data = *from;
                if (iocb->ki_flags & IOCB_DIRECT)
-                       written = ceph_sync_direct_write(iocb, &data, pos);
+                       written = ceph_sync_direct_write(iocb, &data, pos,
+                                                        snapc);
                else
-                       written = ceph_sync_write(iocb, &data, pos);
+                       written = ceph_sync_write(iocb, &data, pos, snapc);
                if (written == -EOLDSNAPC) {
                        dout("aio_write %p %llx.%llx %llu~%u"
                                "got EOLDSNAPC, retrying\n",
@@ -1014,6 +1033,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
                }
                if (written > 0)
                        iov_iter_advance(from, written);
+               ceph_put_snap_context(snapc);
        } else {
                loff_t old_size = inode->i_size;
                /*
@@ -1035,7 +1055,8 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
                int dirty;
                spin_lock(&ci->i_ceph_lock);
                ci->i_inline_version = CEPH_INLINE_NONE;
-               dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+               dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
+                                              &prealloc_cf);
                spin_unlock(&ci->i_ceph_lock);
                if (dirty)
                        __mark_inode_dirty(inode, dirty);
@@ -1059,6 +1080,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 out:
        mutex_unlock(&inode->i_mutex);
 out_unlocked:
+       ceph_free_cap_flush(prealloc_cf);
        current->backing_dev_info = NULL;
        return written ? written : err;
 }
@@ -1255,6 +1277,7 @@ static long ceph_fallocate(struct file *file, int mode,
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_osd_client *osdc =
                &ceph_inode_to_client(inode)->client->osdc;
+       struct ceph_cap_flush *prealloc_cf;
        int want, got = 0;
        int dirty;
        int ret = 0;
@@ -1267,6 +1290,10 @@ static long ceph_fallocate(struct file *file, int mode,
        if (!S_ISREG(inode->i_mode))
                return -EOPNOTSUPP;
 
+       prealloc_cf = ceph_alloc_cap_flush();
+       if (!prealloc_cf)
+               return -ENOMEM;
+
        mutex_lock(&inode->i_mutex);
 
        if (ceph_snap(inode) != CEPH_NOSNAP) {
@@ -1313,7 +1340,8 @@ static long ceph_fallocate(struct file *file, int mode,
        if (!ret) {
                spin_lock(&ci->i_ceph_lock);
                ci->i_inline_version = CEPH_INLINE_NONE;
-               dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+               dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
+                                              &prealloc_cf);
                spin_unlock(&ci->i_ceph_lock);
                if (dirty)
                        __mark_inode_dirty(inode, dirty);
@@ -1322,6 +1350,7 @@ static long ceph_fallocate(struct file *file, int mode,
        ceph_put_cap_refs(ci, got);
 unlock:
        mutex_unlock(&inode->i_mutex);
+       ceph_free_cap_flush(prealloc_cf);
        return ret;
 }
 
index 571acd88606cfcec3d01fc4a6ef453f0b49e9713..96d2bd8299022e554c8bfdc0b7f1c759be8fc8cd 100644 (file)
@@ -389,9 +389,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
        ci->i_inline_version = 0;
        ci->i_time_warp_seq = 0;
        ci->i_ceph_flags = 0;
-       ci->i_ordered_count = 0;
-       atomic_set(&ci->i_release_count, 1);
-       atomic_set(&ci->i_complete_count, 0);
+       atomic64_set(&ci->i_ordered_count, 1);
+       atomic64_set(&ci->i_release_count, 1);
+       atomic64_set(&ci->i_complete_seq[0], 0);
+       atomic64_set(&ci->i_complete_seq[1], 0);
        ci->i_symlink = NULL;
 
        memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
@@ -415,9 +416,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
        ci->i_flushing_caps = 0;
        INIT_LIST_HEAD(&ci->i_dirty_item);
        INIT_LIST_HEAD(&ci->i_flushing_item);
-       ci->i_cap_flush_seq = 0;
-       ci->i_cap_flush_last_tid = 0;
-       memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid));
+       ci->i_prealloc_cap_flush = NULL;
+       ci->i_cap_flush_tree = RB_ROOT;
        init_waitqueue_head(&ci->i_cap_wq);
        ci->i_hold_caps_min = 0;
        ci->i_hold_caps_max = 0;
@@ -752,7 +752,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 
        if (new_version ||
            (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
+               if (ci->i_layout.fl_pg_pool != info->layout.fl_pg_pool)
+                       ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
                ci->i_layout = info->layout;
+
                queue_trunc = ceph_fill_file_size(inode, issued,
                                        le32_to_cpu(info->truncate_seq),
                                        le64_to_cpu(info->truncate_size),
@@ -858,9 +861,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
                            (issued & CEPH_CAP_FILE_EXCL) == 0 &&
                            !__ceph_dir_is_complete(ci)) {
                                dout(" marking %p complete (empty)\n", inode);
+                               i_size_write(inode, 0);
                                __ceph_dir_set_complete(ci,
-                                       atomic_read(&ci->i_release_count),
-                                       ci->i_ordered_count);
+                                       atomic64_read(&ci->i_release_count),
+                                       atomic64_read(&ci->i_ordered_count));
                        }
 
                        wake = true;
@@ -1212,6 +1216,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
                        dout("fill_trace doing d_move %p -> %p\n",
                             req->r_old_dentry, dn);
 
+                       /* d_move screws up sibling dentries' offsets */
+                       ceph_dir_clear_ordered(dir);
+                       ceph_dir_clear_ordered(olddir);
+
                        d_move(req->r_old_dentry, dn);
                        dout(" src %p '%pd' dst %p '%pd'\n",
                             req->r_old_dentry,
@@ -1222,10 +1230,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
                           rehashing bug in vfs_rename_dir */
                        ceph_invalidate_dentry_lease(dn);
 
-                       /* d_move screws up sibling dentries' offsets */
-                       ceph_dir_clear_ordered(dir);
-                       ceph_dir_clear_ordered(olddir);
-
                        dout("dn %p gets new offset %lld\n", req->r_old_dentry,
                             ceph_dentry(req->r_old_dentry)->offset);
 
@@ -1333,6 +1337,49 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
        return err;
 }
 
+void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl)
+{
+       if (ctl->page) {
+               kunmap(ctl->page);
+               page_cache_release(ctl->page);
+               ctl->page = NULL;
+       }
+}
+
+static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
+                             struct ceph_readdir_cache_control *ctl,
+                             struct ceph_mds_request *req)
+{
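+       /* the readdir cache stores dentry pointers in the directory's
+        * page cache, PAGE_CACHE_SIZE / sizeof(struct dentry *) slots
+        * per page; ctl->index below is split into a page number and a
+        * slot within that page */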
+       struct ceph_inode_info *ci = ceph_inode(dir);
+       unsigned nsize = PAGE_CACHE_SIZE / sizeof(struct dentry*);
+       unsigned idx = ctl->index % nsize;
+       pgoff_t pgoff = ctl->index / nsize;
+
+       if (!ctl->page || pgoff != page_index(ctl->page)) {
+               ceph_readdir_cache_release(ctl);
+               ctl->page = grab_cache_page(&dir->i_data, pgoff);
+               if (!ctl->page) {
+                       ctl->index = -1;
+                       return -ENOMEM;
+               }
+               /* reading/filling the cache are serialized by
+                * i_mutex, no need to use page lock */
+               unlock_page(ctl->page);
+               ctl->dentries = kmap(ctl->page);
+       }
+
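+       /* fill the cache only if no dentry in this directory has been
+        * dropped or reordered since the readdir request was issued */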
+       if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) &&
+           req->r_dir_ordered_cnt == atomic64_read(&ci->i_ordered_count)) {
+               dout("readdir cache dn %p idx %d\n", dn, ctl->index);
+               ctl->dentries[idx] = dn;
+               ctl->index++;
+       } else {
+               dout("disable readdir cache\n");
+               ctl->index = -1;
+       }
+       return 0;
+}
+
 int ceph_readdir_prepopulate(struct ceph_mds_request *req,
                             struct ceph_mds_session *session)
 {
@@ -1345,8 +1392,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
        struct inode *snapdir = NULL;
        struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
        struct ceph_dentry_info *di;
-       u64 r_readdir_offset = req->r_readdir_offset;
        u32 frag = le32_to_cpu(rhead->args.readdir.frag);
+       struct ceph_readdir_cache_control cache_ctl = {};
+
+       if (req->r_aborted)
+               return readdir_prepopulate_inodes_only(req, session);
 
        if (rinfo->dir_dir &&
            le32_to_cpu(rinfo->dir_dir->frag) != frag) {
@@ -1354,14 +1404,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
                     frag, le32_to_cpu(rinfo->dir_dir->frag));
                frag = le32_to_cpu(rinfo->dir_dir->frag);
                if (ceph_frag_is_leftmost(frag))
-                       r_readdir_offset = 2;
+                       req->r_readdir_offset = 2;
                else
-                       r_readdir_offset = 0;
+                       req->r_readdir_offset = 0;
        }
 
-       if (req->r_aborted)
-               return readdir_prepopulate_inodes_only(req, session);
-
        if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
                snapdir = ceph_get_snapdir(d_inode(parent));
                parent = d_find_alias(snapdir);
@@ -1374,6 +1421,17 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
                        ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir);
        }
 
+       if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) {
+               /* note dir version at start of readdir so we can tell
+                * if any dentries get dropped */
+               struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
+               req->r_dir_release_cnt = atomic64_read(&ci->i_release_count);
+               req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count);
+               req->r_readdir_cache_idx = 0;
+       }
+
+       cache_ctl.index = req->r_readdir_cache_idx;
+
        /* FIXME: release caps/leases if error occurs */
        for (i = 0; i < rinfo->dir_nr; i++) {
                struct ceph_vino vino;
@@ -1413,13 +1471,6 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
                        d_delete(dn);
                        dput(dn);
                        goto retry_lookup;
-               } else {
-                       /* reorder parent's d_subdirs */
-                       spin_lock(&parent->d_lock);
-                       spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
-                       list_move(&dn->d_child, &parent->d_subdirs);
-                       spin_unlock(&dn->d_lock);
-                       spin_unlock(&parent->d_lock);
                }
 
                /* inode */
@@ -1436,13 +1487,15 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
                        }
                }
 
-               if (fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session,
-                              req->r_request_started, -1,
-                              &req->r_caps_reservation) < 0) {
+               ret = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session,
+                                req->r_request_started, -1,
+                                &req->r_caps_reservation);
+               if (ret < 0) {
                        pr_err("fill_inode badness on %p\n", in);
                        if (d_really_is_negative(dn))
                                iput(in);
                        d_drop(dn);
+                       err = ret;
                        goto next_item;
                }
 
@@ -1458,19 +1511,28 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
                }
 
                di = dn->d_fsdata;
-               di->offset = ceph_make_fpos(frag, i + r_readdir_offset);
+               di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
 
                update_dentry_lease(dn, rinfo->dir_dlease[i],
                                    req->r_session,
                                    req->r_request_started);
+
+               if (err == 0 && cache_ctl.index >= 0) {
+                       ret = fill_readdir_cache(d_inode(parent), dn,
+                                                &cache_ctl, req);
+                       if (ret < 0)
+                               err = ret;
+               }
 next_item:
                if (dn)
                        dput(dn);
        }
-       if (err == 0)
-               req->r_did_prepopulate = true;
-
 out:
+       if (err == 0) {
+               req->r_did_prepopulate = true;
+               req->r_readdir_cache_idx = cache_ctl.index;
+       }
+       ceph_readdir_cache_release(&cache_ctl);
        if (snapdir) {
                iput(snapdir);
                dput(parent);
@@ -1712,11 +1774,13 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
        const unsigned int ia_valid = attr->ia_valid;
        struct ceph_mds_request *req;
        struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
+       struct ceph_cap_flush *prealloc_cf;
        int issued;
        int release = 0, dirtied = 0;
        int mask = 0;
        int err = 0;
        int inode_dirty_flags = 0;
+       bool lock_snap_rwsem = false;
 
        if (ceph_snap(inode) != CEPH_NOSNAP)
                return -EROFS;
@@ -1725,13 +1789,31 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
        if (err != 0)
                return err;
 
+       prealloc_cf = ceph_alloc_cap_flush();
+       if (!prealloc_cf)
+               return -ENOMEM;
+
        req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
                                       USE_AUTH_MDS);
-       if (IS_ERR(req))
+       if (IS_ERR(req)) {
+               ceph_free_cap_flush(prealloc_cf);
                return PTR_ERR(req);
+       }
 
        spin_lock(&ci->i_ceph_lock);
        issued = __ceph_caps_issued(ci, NULL);
+
+       if (!ci->i_head_snapc &&
+           (issued & (CEPH_CAP_ANY_EXCL | CEPH_CAP_FILE_WR))) {
+               lock_snap_rwsem = true;
+               if (!down_read_trylock(&mdsc->snap_rwsem)) {
+                       spin_unlock(&ci->i_ceph_lock);
+                       down_read(&mdsc->snap_rwsem);
+                       spin_lock(&ci->i_ceph_lock);
+                       issued = __ceph_caps_issued(ci, NULL);
+               }
+       }
+
        dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
 
        if (ia_valid & ATTR_UID) {
@@ -1874,12 +1956,15 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
                dout("setattr %p ATTR_FILE ... hrm!\n", inode);
 
        if (dirtied) {
-               inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied);
+               inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
+                                                          &prealloc_cf);
                inode->i_ctime = CURRENT_TIME;
        }
 
        release &= issued;
        spin_unlock(&ci->i_ceph_lock);
+       if (lock_snap_rwsem)
+               up_read(&mdsc->snap_rwsem);
 
        if (inode_dirty_flags)
                __mark_inode_dirty(inode, inode_dirty_flags);
@@ -1904,9 +1989,11 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
        ceph_mdsc_put_request(req);
        if (mask & CEPH_SETATTR_SIZE)
                __ceph_do_pending_vmtruncate(inode);
+       ceph_free_cap_flush(prealloc_cf);
        return err;
 out_put:
        ceph_mdsc_put_request(req);
+       ceph_free_cap_flush(prealloc_cf);
        return err;
 }
 
index 84f37f34f9aa663952a60e8a3440a81934c083b0..6aa07af67603ada211f49268d3845ea62b625720 100644 (file)
@@ -8,6 +8,7 @@
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 #include <linux/utsname.h>
+#include <linux/ratelimit.h>
 
 #include "super.h"
 #include "mds_client.h"
@@ -458,7 +459,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
        s->s_cap_reconnect = 0;
        s->s_cap_iterator = NULL;
        INIT_LIST_HEAD(&s->s_cap_releases);
-       INIT_LIST_HEAD(&s->s_cap_releases_done);
        INIT_LIST_HEAD(&s->s_cap_flushing);
        INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
 
@@ -629,6 +629,9 @@ static void __register_request(struct ceph_mds_client *mdsc,
        req->r_uid = current_fsuid();
        req->r_gid = current_fsgid();
 
+       if (mdsc->oldest_tid == 0 && req->r_op != CEPH_MDS_OP_SETFILELOCK)
+               mdsc->oldest_tid = req->r_tid;
+
        if (dir) {
                struct ceph_inode_info *ci = ceph_inode(dir);
 
@@ -644,6 +647,21 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
                                 struct ceph_mds_request *req)
 {
        dout("__unregister_request %p tid %lld\n", req, req->r_tid);
+
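+       /* if this was the oldest tracked request, advance oldest_tid to
+        * the next request in the tree, skipping setfilelock requests,
+        * which are excluded from oldest_tid tracking */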
+       if (req->r_tid == mdsc->oldest_tid) {
+               struct rb_node *p = rb_next(&req->r_node);
+               mdsc->oldest_tid = 0;
+               while (p) {
+                       struct ceph_mds_request *next_req =
+                               rb_entry(p, struct ceph_mds_request, r_node);
+                       if (next_req->r_op != CEPH_MDS_OP_SETFILELOCK) {
+                               mdsc->oldest_tid = next_req->r_tid;
+                               break;
+                       }
+                       p = rb_next(p);
+               }
+       }
+
        rb_erase(&req->r_node, &mdsc->request_tree);
        RB_CLEAR_NODE(&req->r_node);
 
@@ -998,27 +1016,25 @@ void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
  * session caps
  */
 
-/*
- * Free preallocated cap messages assigned to this session
- */
-static void cleanup_cap_releases(struct ceph_mds_session *session)
+/* caller holds s_cap_lock, we drop it */
+static void cleanup_cap_releases(struct ceph_mds_client *mdsc,
+                                struct ceph_mds_session *session)
+       __releases(session->s_cap_lock)
 {
-       struct ceph_msg *msg;
+       LIST_HEAD(tmp_list);
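+       /* splice the queued cap releases onto a private list so that
+        * s_cap_lock can be dropped before each cap is put */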
+       list_splice_init(&session->s_cap_releases, &tmp_list);
+       session->s_num_cap_releases = 0;
+       spin_unlock(&session->s_cap_lock);
 
-       spin_lock(&session->s_cap_lock);
-       while (!list_empty(&session->s_cap_releases)) {
-               msg = list_first_entry(&session->s_cap_releases,
-                                      struct ceph_msg, list_head);
-               list_del_init(&msg->list_head);
-               ceph_msg_put(msg);
-       }
-       while (!list_empty(&session->s_cap_releases_done)) {
-               msg = list_first_entry(&session->s_cap_releases_done,
-                                      struct ceph_msg, list_head);
-               list_del_init(&msg->list_head);
-               ceph_msg_put(msg);
+       dout("cleanup_cap_releases mds%d\n", session->s_mds);
+       while (!list_empty(&tmp_list)) {
+               struct ceph_cap *cap;
+               /* put each cap that was queued for a release message */
+               cap = list_first_entry(&tmp_list,
+                                       struct ceph_cap, session_caps);
+               list_del(&cap->session_caps);
+               ceph_put_cap(mdsc, cap);
        }
-       spin_unlock(&session->s_cap_lock);
 }
 
 static void cleanup_session_requests(struct ceph_mds_client *mdsc,
@@ -1033,7 +1049,8 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc,
                req = list_first_entry(&session->s_unsafe,
                                       struct ceph_mds_request, r_unsafe_item);
                list_del_init(&req->r_unsafe_item);
-               pr_info(" dropping unsafe request %llu\n", req->r_tid);
+               pr_warn_ratelimited(" dropping unsafe request %llu\n",
+                                   req->r_tid);
                __unregister_request(mdsc, req);
        }
        /* zero r_attempts, so kick_requests() will re-send requests */
@@ -1095,10 +1112,16 @@ static int iterate_session_caps(struct ceph_mds_session *session,
                        dout("iterate_session_caps  finishing cap %p removal\n",
                             cap);
                        BUG_ON(cap->session != session);
+                       cap->session = NULL;
                        list_del_init(&cap->session_caps);
                        session->s_nr_caps--;
-                       cap->session = NULL;
-                       old_cap = cap;  /* put_cap it w/o locks held */
+                       if (cap->queue_release) {
+                               list_add_tail(&cap->session_caps,
+                                             &session->s_cap_releases);
+                               session->s_num_cap_releases++;
+                       } else {
+                               old_cap = cap;  /* put_cap it w/o locks held */
+                       }
                }
                if (ret < 0)
                        goto out;
@@ -1119,6 +1142,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
                                  void *arg)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
+       LIST_HEAD(to_remove);
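+       /* cap flushes detached below; freed after the locks are dropped */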
        int drop = 0;
 
        dout("removing cap %p, ci is %p, inode is %p\n",
@@ -1126,12 +1150,27 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
        spin_lock(&ci->i_ceph_lock);
        __ceph_remove_cap(cap, false);
        if (!ci->i_auth_cap) {
+               struct ceph_cap_flush *cf;
                struct ceph_mds_client *mdsc =
                        ceph_sb_to_client(inode->i_sb)->mdsc;
 
+               while (true) {
+                       struct rb_node *n = rb_first(&ci->i_cap_flush_tree);
+                       if (!n)
+                               break;
+                       cf = rb_entry(n, struct ceph_cap_flush, i_node);
+                       rb_erase(&cf->i_node, &ci->i_cap_flush_tree);
+                       list_add(&cf->list, &to_remove);
+               }
+
                spin_lock(&mdsc->cap_dirty_lock);
+
+               list_for_each_entry(cf, &to_remove, list)
+                       rb_erase(&cf->g_node, &mdsc->cap_flush_tree);
+
                if (!list_empty(&ci->i_dirty_item)) {
-                       pr_info(" dropping dirty %s state for %p %lld\n",
+                       pr_warn_ratelimited(
+                               " dropping dirty %s state for %p %lld\n",
                                ceph_cap_string(ci->i_dirty_caps),
                                inode, ceph_ino(inode));
                        ci->i_dirty_caps = 0;
@@ -1139,7 +1178,8 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
                        drop = 1;
                }
                if (!list_empty(&ci->i_flushing_item)) {
-                       pr_info(" dropping dirty+flushing %s state for %p %lld\n",
+                       pr_warn_ratelimited(
+                               " dropping dirty+flushing %s state for %p %lld\n",
                                ceph_cap_string(ci->i_flushing_caps),
                                inode, ceph_ino(inode));
                        ci->i_flushing_caps = 0;
@@ -1148,8 +1188,20 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
                        drop = 1;
                }
                spin_unlock(&mdsc->cap_dirty_lock);
+
+               if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
+                       list_add(&ci->i_prealloc_cap_flush->list, &to_remove);
+                       ci->i_prealloc_cap_flush = NULL;
+               }
        }
        spin_unlock(&ci->i_ceph_lock);
+       while (!list_empty(&to_remove)) {
+               struct ceph_cap_flush *cf;
+               cf = list_first_entry(&to_remove,
+                                     struct ceph_cap_flush, list);
+               list_del(&cf->list);
+               ceph_free_cap_flush(cf);
+       }
        while (drop--)
                iput(inode);
        return 0;
@@ -1191,11 +1243,12 @@ static void remove_session_caps(struct ceph_mds_session *session)
                        spin_lock(&session->s_cap_lock);
                }
        }
-       spin_unlock(&session->s_cap_lock);
+
+       /* drop cap expires and unlock s_cap_lock */
+       cleanup_cap_releases(session->s_mdsc, session);
 
        BUG_ON(session->s_nr_caps > 0);
        BUG_ON(!list_empty(&session->s_cap_flushing));
-       cleanup_cap_releases(session);
 }
 
 /*
@@ -1371,7 +1424,8 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
             inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
             ceph_cap_string(used), ceph_cap_string(wanted));
        if (cap == ci->i_auth_cap) {
-               if (ci->i_dirty_caps | ci->i_flushing_caps)
+               if (ci->i_dirty_caps || ci->i_flushing_caps ||
+                   !list_empty(&ci->i_cap_snaps))
                        goto out;
                if ((used | wanted) & CEPH_CAP_ANY_WR)
                        goto out;
@@ -1417,121 +1471,80 @@ static int trim_caps(struct ceph_mds_client *mdsc,
                session->s_trim_caps = 0;
        }
 
-       ceph_add_cap_releases(mdsc, session);
        ceph_send_cap_releases(mdsc, session);
        return 0;
 }
 
-/*
- * Allocate cap_release messages.  If there is a partially full message
- * in the queue, try to allocate enough to cover it's remainder, so that
- * we can send it immediately.
- *
- * Called under s_mutex.
- */
-int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
-                         struct ceph_mds_session *session)
+static int check_capsnap_flush(struct ceph_inode_info *ci,
+                              u64 want_snap_seq)
 {
-       struct ceph_msg *msg, *partial = NULL;
-       struct ceph_mds_cap_release *head;
-       int err = -ENOMEM;
-       int extra = mdsc->fsc->mount_options->cap_release_safety;
-       int num;
-
-       dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds,
-            extra);
-
-       spin_lock(&session->s_cap_lock);
-
-       if (!list_empty(&session->s_cap_releases)) {
-               msg = list_first_entry(&session->s_cap_releases,
-                                      struct ceph_msg,
-                                list_head);
-               head = msg->front.iov_base;
-               num = le32_to_cpu(head->num);
-               if (num) {
-                       dout(" partial %p with (%d/%d)\n", msg, num,
-                            (int)CEPH_CAPS_PER_RELEASE);
-                       extra += CEPH_CAPS_PER_RELEASE - num;
-                       partial = msg;
-               }
-       }
-       while (session->s_num_cap_releases < session->s_nr_caps + extra) {
-               spin_unlock(&session->s_cap_lock);
-               msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
-                                  GFP_NOFS, false);
-               if (!msg)
-                       goto out_unlocked;
-               dout("add_cap_releases %p msg %p now %d\n", session, msg,
-                    (int)msg->front.iov_len);
-               head = msg->front.iov_base;
-               head->num = cpu_to_le32(0);
-               msg->front.iov_len = sizeof(*head);
-               spin_lock(&session->s_cap_lock);
-               list_add(&msg->list_head, &session->s_cap_releases);
-               session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
-       }
-
-       if (partial) {
-               head = partial->front.iov_base;
-               num = le32_to_cpu(head->num);
-               dout(" queueing partial %p with %d/%d\n", partial, num,
-                    (int)CEPH_CAPS_PER_RELEASE);
-               list_move_tail(&partial->list_head,
-                              &session->s_cap_releases_done);
-               session->s_num_cap_releases -= CEPH_CAPS_PER_RELEASE - num;
+       int ret = 1;
+       spin_lock(&ci->i_ceph_lock);
+       if (want_snap_seq > 0 && !list_empty(&ci->i_cap_snaps)) {
+               struct ceph_cap_snap *capsnap =
+                       list_first_entry(&ci->i_cap_snaps,
+                                        struct ceph_cap_snap, ci_item);
+               ret = capsnap->follows >= want_snap_seq;
        }
-       err = 0;
-       spin_unlock(&session->s_cap_lock);
-out_unlocked:
-       return err;
+       spin_unlock(&ci->i_ceph_lock);
+       return ret;
 }
 
-static int check_cap_flush(struct inode *inode, u64 want_flush_seq)
+static int check_caps_flush(struct ceph_mds_client *mdsc,
+                           u64 want_flush_tid)
 {
-       struct ceph_inode_info *ci = ceph_inode(inode);
-       int ret;
-       spin_lock(&ci->i_ceph_lock);
-       if (ci->i_flushing_caps)
-               ret = ci->i_cap_flush_seq >= want_flush_seq;
-       else
-               ret = 1;
-       spin_unlock(&ci->i_ceph_lock);
+       struct rb_node *n;
+       struct ceph_cap_flush *cf;
+       int ret = 1;
+
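+       /* cap_flush_tree is ordered by flush tid: everything up to
+        * want_flush_tid has completed once the smallest pending tid
+        * is greater than want_flush_tid */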
+       spin_lock(&mdsc->cap_dirty_lock);
+       n = rb_first(&mdsc->cap_flush_tree);
+       cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL;
+       if (cf && cf->tid <= want_flush_tid) {
+               dout("check_caps_flush still flushing tid %llu <= %llu\n",
+                    cf->tid, want_flush_tid);
+               ret = 0;
+       }
+       spin_unlock(&mdsc->cap_dirty_lock);
        return ret;
 }
 
 /*
  * flush all dirty inode data to disk.
  *
- * returns true if we've flushed through want_flush_seq
+ * returns true if we've flushed through want_flush_tid
  */
-static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
+static void wait_caps_flush(struct ceph_mds_client *mdsc,
+                           u64 want_flush_tid, u64 want_snap_seq)
 {
        int mds;
 
-       dout("check_cap_flush want %lld\n", want_flush_seq);
+       dout("check_caps_flush want %llu snap want %llu\n",
+            want_flush_tid, want_snap_seq);
        mutex_lock(&mdsc->mutex);
-       for (mds = 0; mds < mdsc->max_sessions; mds++) {
+       for (mds = 0; mds < mdsc->max_sessions; ) {
                struct ceph_mds_session *session = mdsc->sessions[mds];
                struct inode *inode = NULL;
 
-               if (!session)
+               if (!session) {
+                       mds++;
                        continue;
+               }
                get_session(session);
                mutex_unlock(&mdsc->mutex);
 
                mutex_lock(&session->s_mutex);
-               if (!list_empty(&session->s_cap_flushing)) {
-                       struct ceph_inode_info *ci =
-                               list_entry(session->s_cap_flushing.next,
-                                          struct ceph_inode_info,
-                                          i_flushing_item);
-
-                       if (!check_cap_flush(&ci->vfs_inode, want_flush_seq)) {
-                               dout("check_cap_flush still flushing %p "
-                                    "seq %lld <= %lld to mds%d\n",
-                                    &ci->vfs_inode, ci->i_cap_flush_seq,
-                                    want_flush_seq, session->s_mds);
+               if (!list_empty(&session->s_cap_snaps_flushing)) {
+                       struct ceph_cap_snap *capsnap =
+                               list_first_entry(&session->s_cap_snaps_flushing,
+                                                struct ceph_cap_snap,
+                                                flushing_item);
+                       struct ceph_inode_info *ci = capsnap->ci;
+                       if (!check_capsnap_flush(ci, want_snap_seq)) {
+                               dout("check_cap_flush still flushing snap %p "
+                                    "follows %lld <= %lld to mds%d\n",
+                                    &ci->vfs_inode, capsnap->follows,
+                                    want_snap_seq, mds);
                                inode = igrab(&ci->vfs_inode);
                        }
                }
@@ -1540,15 +1553,21 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
 
                if (inode) {
                        wait_event(mdsc->cap_flushing_wq,
-                                  check_cap_flush(inode, want_flush_seq));
+                                  check_capsnap_flush(ceph_inode(inode),
+                                                      want_snap_seq));
                        iput(inode);
+               } else {
+                       mds++;
                }
 
                mutex_lock(&mdsc->mutex);
        }
-
        mutex_unlock(&mdsc->mutex);
-       dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
+
+       wait_event(mdsc->cap_flushing_wq,
+                  check_caps_flush(mdsc, want_flush_tid));
+
+       dout("check_caps_flush ok, flushed thru %llu\n", want_flush_tid);
 }
 
 /*
@@ -1557,60 +1576,74 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
 void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
                            struct ceph_mds_session *session)
 {
-       struct ceph_msg *msg;
+       struct ceph_msg *msg = NULL;
+       struct ceph_mds_cap_release *head;
+       struct ceph_mds_cap_item *item;
+       struct ceph_cap *cap;
+       LIST_HEAD(tmp_list);
+       int num_cap_releases;
 
-       dout("send_cap_releases mds%d\n", session->s_mds);
        spin_lock(&session->s_cap_lock);
-       while (!list_empty(&session->s_cap_releases_done)) {
-               msg = list_first_entry(&session->s_cap_releases_done,
-                                struct ceph_msg, list_head);
-               list_del_init(&msg->list_head);
-               spin_unlock(&session->s_cap_lock);
-               msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
-               dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
-               ceph_con_send(&session->s_con, msg);
-               spin_lock(&session->s_cap_lock);
-       }
+again:
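+       /* splice queued caps to a private list, drop s_cap_lock while
+        * packing them into CAPRELEASE messages, then loop back in case
+        * more caps were queued while the lock was dropped */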
+       list_splice_init(&session->s_cap_releases, &tmp_list);
+       num_cap_releases = session->s_num_cap_releases;
+       session->s_num_cap_releases = 0;
        spin_unlock(&session->s_cap_lock);
-}
 
-static void discard_cap_releases(struct ceph_mds_client *mdsc,
-                                struct ceph_mds_session *session)
-{
-       struct ceph_msg *msg;
-       struct ceph_mds_cap_release *head;
-       unsigned num;
-
-       dout("discard_cap_releases mds%d\n", session->s_mds);
+       while (!list_empty(&tmp_list)) {
+               if (!msg) {
+                       msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE,
+                                       PAGE_CACHE_SIZE, GFP_NOFS, false);
+                       if (!msg)
+                               goto out_err;
+                       head = msg->front.iov_base;
+                       head->num = cpu_to_le32(0);
+                       msg->front.iov_len = sizeof(*head);
+               }
+               cap = list_first_entry(&tmp_list, struct ceph_cap,
+                                       session_caps);
+               list_del(&cap->session_caps);
+               num_cap_releases--;
 
-       if (!list_empty(&session->s_cap_releases)) {
-               /* zero out the in-progress message */
-               msg = list_first_entry(&session->s_cap_releases,
-                                       struct ceph_msg, list_head);
                head = msg->front.iov_base;
-               num = le32_to_cpu(head->num);
-               dout("discard_cap_releases mds%d %p %u\n",
-                    session->s_mds, msg, num);
-               head->num = cpu_to_le32(0);
-               msg->front.iov_len = sizeof(*head);
-               session->s_num_cap_releases += num;
+               le32_add_cpu(&head->num, 1);
+               item = msg->front.iov_base + msg->front.iov_len;
+               item->ino = cpu_to_le64(cap->cap_ino);
+               item->cap_id = cpu_to_le64(cap->cap_id);
+               item->migrate_seq = cpu_to_le32(cap->mseq);
+               item->seq = cpu_to_le32(cap->issue_seq);
+               msg->front.iov_len += sizeof(*item);
+
+               ceph_put_cap(mdsc, cap);
+
+               if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
+                       msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+                       dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
+                       ceph_con_send(&session->s_con, msg);
+                       msg = NULL;
+               }
        }
 
-       /* requeue completed messages */
-       while (!list_empty(&session->s_cap_releases_done)) {
-               msg = list_first_entry(&session->s_cap_releases_done,
-                                struct ceph_msg, list_head);
-               list_del_init(&msg->list_head);
+       BUG_ON(num_cap_releases != 0);
 
-               head = msg->front.iov_base;
-               num = le32_to_cpu(head->num);
-               dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg,
-                    num);
-               session->s_num_cap_releases += num;
-               head->num = cpu_to_le32(0);
-               msg->front.iov_len = sizeof(*head);
-               list_add(&msg->list_head, &session->s_cap_releases);
+       spin_lock(&session->s_cap_lock);
+       if (!list_empty(&session->s_cap_releases))
+               goto again;
+       spin_unlock(&session->s_cap_lock);
+
+       if (msg) {
+               msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+               dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
+               ceph_con_send(&session->s_con, msg);
        }
+       return;
+out_err:
+       pr_err("send_cap_releases mds%d, failed to allocate message\n",
+               session->s_mds);
+       spin_lock(&session->s_cap_lock);
+       list_splice(&tmp_list, &session->s_cap_releases);
+       session->s_num_cap_releases += num_cap_releases;
+       spin_unlock(&session->s_cap_lock);
 }
 
 /*
@@ -1635,7 +1668,8 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
 
        order = get_order(size * num_entries);
        while (order >= 0) {
-               rinfo->dir_in = (void*)__get_free_pages(GFP_NOFS | __GFP_NOWARN,
+               rinfo->dir_in = (void*)__get_free_pages(GFP_KERNEL |
+                                                       __GFP_NOWARN,
                                                        order);
                if (rinfo->dir_in)
                        break;
@@ -1697,13 +1731,9 @@ static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc)
                        struct ceph_mds_request, r_node);
 }
 
-static u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
+static inline u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
 {
-       struct ceph_mds_request *req = __get_oldest_req(mdsc);
-
-       if (req)
-               return req->r_tid;
-       return 0;
+       return mdsc->oldest_tid;
 }
 
 /*
@@ -2267,15 +2297,18 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
        /* wait */
        mutex_unlock(&mdsc->mutex);
        dout("do_request waiting\n");
-       if (req->r_timeout) {
-               err = (long)wait_for_completion_killable_timeout(
-                       &req->r_completion, req->r_timeout);
-               if (err == 0)
-                       err = -EIO;
-       } else if (req->r_wait_for_completion) {
+       if (!req->r_timeout && req->r_wait_for_completion) {
                err = req->r_wait_for_completion(mdsc, req);
        } else {
-               err = wait_for_completion_killable(&req->r_completion);
+               long timeleft = wait_for_completion_killable_timeout(
+                                       &req->r_completion,
+                                       ceph_timeout_jiffies(req->r_timeout));
+               if (timeleft > 0)
+                       err = 0;
+               else if (!timeleft)
+                       err = -EIO;  /* timed out */
+               else
+                       err = timeleft;  /* killed */
        }
        dout("do_request waited, got %d\n", err);
        mutex_lock(&mdsc->mutex);
@@ -2496,7 +2529,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
        }
        mutex_unlock(&mdsc->mutex);
 
-       ceph_add_cap_releases(mdsc, req->r_session);
        mutex_unlock(&session->s_mutex);
 
        /* kick calling process */
@@ -2888,8 +2920,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
         */
        session->s_cap_reconnect = 1;
        /* drop old cap expires; we're about to reestablish that state */
-       discard_cap_releases(mdsc, session);
-       spin_unlock(&session->s_cap_lock);
+       cleanup_cap_releases(mdsc, session);
 
        /* trim unused caps to reduce MDS's cache rejoin time */
        if (mdsc->fsc->sb->s_root)
@@ -2956,6 +2987,9 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
 
        reply->hdr.data_len = cpu_to_le32(pagelist->length);
        ceph_msg_data_add_pagelist(reply, pagelist);
+
+       ceph_early_kick_flushing_caps(mdsc, session);
+
        ceph_con_send(&session->s_con, reply);
 
        mutex_unlock(&session->s_mutex);
@@ -3352,7 +3386,6 @@ static void delayed_work(struct work_struct *work)
                        send_renew_caps(mdsc, s);
                else
                        ceph_con_keepalive(&s->s_con);
-               ceph_add_cap_releases(mdsc, s);
                if (s->s_state == CEPH_MDS_SESSION_OPEN ||
                    s->s_state == CEPH_MDS_SESSION_HUNG)
                        ceph_send_cap_releases(mdsc, s);
@@ -3390,11 +3423,13 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
        atomic_set(&mdsc->num_sessions, 0);
        mdsc->max_sessions = 0;
        mdsc->stopping = 0;
+       mdsc->last_snap_seq = 0;
        init_rwsem(&mdsc->snap_rwsem);
        mdsc->snap_realms = RB_ROOT;
        INIT_LIST_HEAD(&mdsc->snap_empty);
        spin_lock_init(&mdsc->snap_empty_lock);
        mdsc->last_tid = 0;
+       mdsc->oldest_tid = 0;
        mdsc->request_tree = RB_ROOT;
        INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
        mdsc->last_renew_caps = jiffies;
@@ -3402,7 +3437,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
        spin_lock_init(&mdsc->cap_delay_lock);
        INIT_LIST_HEAD(&mdsc->snap_flush_list);
        spin_lock_init(&mdsc->snap_flush_lock);
-       mdsc->cap_flush_seq = 0;
+       mdsc->last_cap_flush_tid = 1;
+       mdsc->cap_flush_tree = RB_ROOT;
        INIT_LIST_HEAD(&mdsc->cap_dirty);
        INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
        mdsc->num_cap_flushing = 0;
@@ -3414,6 +3450,9 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
        ceph_caps_init(mdsc);
        ceph_adjust_min_caps(mdsc, fsc->min_caps);
 
+       init_rwsem(&mdsc->pool_perm_rwsem);
+       mdsc->pool_perm_tree = RB_ROOT;
+
        return 0;
 }
 
@@ -3423,8 +3462,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
  */
 static void wait_requests(struct ceph_mds_client *mdsc)
 {
+       struct ceph_options *opts = mdsc->fsc->client->options;
        struct ceph_mds_request *req;
-       struct ceph_fs_client *fsc = mdsc->fsc;
 
        mutex_lock(&mdsc->mutex);
        if (__get_oldest_req(mdsc)) {
@@ -3432,7 +3471,7 @@ static void wait_requests(struct ceph_mds_client *mdsc)
 
                dout("wait_requests waiting for requests\n");
                wait_for_completion_timeout(&mdsc->safe_umount_waiters,
-                                   fsc->client->options->mount_timeout * HZ);
+                                   ceph_timeout_jiffies(opts->mount_timeout));
 
                /* tear down remaining requests */
                mutex_lock(&mdsc->mutex);
@@ -3485,7 +3524,8 @@ static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
                        nextreq = rb_entry(n, struct ceph_mds_request, r_node);
                else
                        nextreq = NULL;
-               if ((req->r_op & CEPH_MDS_OP_WRITE)) {
+               if (req->r_op != CEPH_MDS_OP_SETFILELOCK &&
+                   (req->r_op & CEPH_MDS_OP_WRITE)) {
                        /* write op */
                        ceph_mdsc_get_request(req);
                        if (nextreq)
@@ -3513,7 +3553,7 @@ static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
 
 void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
 {
-       u64 want_tid, want_flush;
+       u64 want_tid, want_flush, want_snap;
 
        if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
                return;
@@ -3525,13 +3565,18 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
 
        ceph_flush_dirty_caps(mdsc);
        spin_lock(&mdsc->cap_dirty_lock);
-       want_flush = mdsc->cap_flush_seq;
+       want_flush = mdsc->last_cap_flush_tid;
        spin_unlock(&mdsc->cap_dirty_lock);
 
-       dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
+       down_read(&mdsc->snap_rwsem);
+       want_snap = mdsc->last_snap_seq;
+       up_read(&mdsc->snap_rwsem);
+
+       dout("sync want tid %lld flush_seq %lld snap_seq %lld\n",
+            want_tid, want_flush, want_snap);
 
        wait_unsafe_requests(mdsc, want_tid);
-       wait_caps_flush(mdsc, want_flush);
+       wait_caps_flush(mdsc, want_flush, want_snap);
 }
 
 /*
@@ -3549,10 +3594,9 @@ static bool done_closing_sessions(struct ceph_mds_client *mdsc)
  */
 void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 {
+       struct ceph_options *opts = mdsc->fsc->client->options;
        struct ceph_mds_session *session;
        int i;
-       struct ceph_fs_client *fsc = mdsc->fsc;
-       unsigned long timeout = fsc->client->options->mount_timeout * HZ;
 
        dout("close_sessions\n");
 
@@ -3573,7 +3617,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 
        dout("waiting for sessions to close\n");
        wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc),
-                          timeout);
+                          ceph_timeout_jiffies(opts->mount_timeout));
 
        /* tear down remaining sessions */
        mutex_lock(&mdsc->mutex);
@@ -3607,6 +3651,7 @@ static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
                ceph_mdsmap_destroy(mdsc->mdsmap);
        kfree(mdsc->sessions);
        ceph_caps_finalize(mdsc);
+       ceph_pool_perm_destroy(mdsc);
 }
 
 void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
index 1875b5d985c6b0df2fbb38e16f39a78ecc76750d..762757e6cebf95fff324894d1650b8271210acdd 100644 (file)
@@ -139,7 +139,6 @@ struct ceph_mds_session {
        int               s_cap_reconnect;
        int               s_readonly;
        struct list_head  s_cap_releases; /* waiting cap_release messages */
-       struct list_head  s_cap_releases_done; /* ready to send */
        struct ceph_cap  *s_cap_iterator;
 
        /* protected by mutex */
@@ -228,7 +227,7 @@ struct ceph_mds_request {
        int r_err;
        bool r_aborted;
 
-       unsigned long r_timeout;  /* optional.  jiffies */
+       unsigned long r_timeout;  /* optional.  jiffies, 0 is "wait forever" */
        unsigned long r_started;  /* start time to measure timeout against */
        unsigned long r_request_started; /* start time for mds request only,
                                            used to measure lease durations */
@@ -254,12 +253,21 @@ struct ceph_mds_request {
        bool              r_got_unsafe, r_got_safe, r_got_result;
 
        bool              r_did_prepopulate;
+       long long         r_dir_release_cnt;
+       long long         r_dir_ordered_cnt;
+       int               r_readdir_cache_idx;
        u32               r_readdir_offset;
 
        struct ceph_cap_reservation r_caps_reservation;
        int r_num_caps;
 };
 
+struct ceph_pool_perm {
+       struct rb_node node;
+       u32 pool;
+       int perm;
+};
+
 /*
  * mds client state
  */
@@ -284,12 +292,15 @@ struct ceph_mds_client {
         * references (implying they contain no inodes with caps) that
         * should be destroyed.
         */
+       u64                     last_snap_seq;
        struct rw_semaphore     snap_rwsem;
        struct rb_root          snap_realms;
        struct list_head        snap_empty;
        spinlock_t              snap_empty_lock;  /* protect snap_empty */
 
        u64                    last_tid;      /* most recent mds request */
+       u64                    oldest_tid;    /* oldest incomplete mds request,
+                                                excluding setfilelock requests */
        struct rb_root         request_tree;  /* pending mds requests */
        struct delayed_work    delayed_work;  /* delayed work */
        unsigned long    last_renew_caps;  /* last time we renewed our caps */
@@ -298,7 +309,8 @@ struct ceph_mds_client {
        struct list_head snap_flush_list;  /* cap_snaps ready to flush */
        spinlock_t       snap_flush_lock;
 
-       u64               cap_flush_seq;
+       u64               last_cap_flush_tid;
+       struct rb_root    cap_flush_tree;
        struct list_head  cap_dirty;        /* inodes with dirty caps */
        struct list_head  cap_dirty_migrating; /* ...that are migration... */
        int               num_cap_flushing; /* # caps we are flushing */
@@ -328,6 +340,9 @@ struct ceph_mds_client {
        spinlock_t        dentry_lru_lock;
        struct list_head  dentry_lru;
        int               num_dentry;
+
+       struct rw_semaphore     pool_perm_rwsem;
+       struct rb_root          pool_perm_tree;
 };
 
 extern const char *ceph_mds_op_name(int op);
@@ -379,8 +394,6 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
        kref_put(&req->r_kref, ceph_mdsc_release_request);
 }
 
-extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
-                                struct ceph_mds_session *session);
 extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
                                   struct ceph_mds_session *session);
 
index a97e39f09ba683349bb5f97e44f0d229b3a88936..233d906aec02b7c4508fd2488908bdb95a130aa4 100644 (file)
@@ -296,7 +296,7 @@ static int cmpu64_rev(const void *a, const void *b)
 }
 
 
-static struct ceph_snap_context *empty_snapc;
+struct ceph_snap_context *ceph_empty_snapc;
 
 /*
  * build the snap context for a given realm.
@@ -338,9 +338,9 @@ static int build_snap_context(struct ceph_snap_realm *realm)
                return 0;
        }
 
-       if (num == 0 && realm->seq == empty_snapc->seq) {
-               ceph_get_snap_context(empty_snapc);
-               snapc = empty_snapc;
+       if (num == 0 && realm->seq == ceph_empty_snapc->seq) {
+               ceph_get_snap_context(ceph_empty_snapc);
+               snapc = ceph_empty_snapc;
                goto done;
        }
 
@@ -436,6 +436,14 @@ static int dup_array(u64 **dst, __le64 *src, u32 num)
        return 0;
 }
 
+static bool has_new_snaps(struct ceph_snap_context *o,
+                         struct ceph_snap_context *n)
+{
+       if (n->num_snaps == 0)
+               return false;
+       /* snaps are in descending order */
+       return n->snaps[0] > o->seq;
+}
 
 /*
  * When a snapshot is applied, the size/mtime inode metadata is queued
@@ -455,6 +463,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
 {
        struct inode *inode = &ci->vfs_inode;
        struct ceph_cap_snap *capsnap;
+       struct ceph_snap_context *old_snapc, *new_snapc;
        int used, dirty;
 
        capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
@@ -467,6 +476,9 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
        used = __ceph_caps_used(ci);
        dirty = __ceph_caps_dirty(ci);
 
+       old_snapc = ci->i_head_snapc;
+       new_snapc = ci->i_snap_realm->cached_context;
+
        /*
         * If there is a write in progress, treat that as a dirty Fw,
         * even though it hasn't completed yet; by the time we finish
@@ -481,76 +493,95 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
                   writes in progress now were started before the previous
                   cap_snap.  lucky us. */
                dout("queue_cap_snap %p already pending\n", inode);
-               kfree(capsnap);
-       } else if (ci->i_snap_realm->cached_context == empty_snapc) {
-               dout("queue_cap_snap %p empty snapc\n", inode);
-               kfree(capsnap);
-       } else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL|
-                           CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) {
-               struct ceph_snap_context *snapc = ci->i_head_snapc;
-
-               /*
-                * if we are a sync write, we may need to go to the snaprealm
-                * to get the current snapc.
-                */
-               if (!snapc)
-                       snapc = ci->i_snap_realm->cached_context;
+               goto update_snapc;
+       }
+       if (ci->i_wrbuffer_ref_head == 0 &&
+           !(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) {
+               dout("queue_cap_snap %p nothing dirty|writing\n", inode);
+               goto update_snapc;
+       }
 
-               dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n",
-                    inode, capsnap, snapc, ceph_cap_string(dirty));
-               ihold(inode);
+       BUG_ON(!old_snapc);
 
-               atomic_set(&capsnap->nref, 1);
-               capsnap->ci = ci;
-               INIT_LIST_HEAD(&capsnap->ci_item);
-               INIT_LIST_HEAD(&capsnap->flushing_item);
-
-               capsnap->follows = snapc->seq;
-               capsnap->issued = __ceph_caps_issued(ci, NULL);
-               capsnap->dirty = dirty;
-
-               capsnap->mode = inode->i_mode;
-               capsnap->uid = inode->i_uid;
-               capsnap->gid = inode->i_gid;
-
-               if (dirty & CEPH_CAP_XATTR_EXCL) {
-                       __ceph_build_xattrs_blob(ci);
-                       capsnap->xattr_blob =
-                               ceph_buffer_get(ci->i_xattrs.blob);
-                       capsnap->xattr_version = ci->i_xattrs.version;
-               } else {
-                       capsnap->xattr_blob = NULL;
-                       capsnap->xattr_version = 0;
+       /*
+        * There is no need to send FLUSHSNAP message to MDS if there is
+        * no new snapshot. But when there is dirty pages or on-going
+        * writes, we still need to create cap_snap. cap_snap is needed
+        * by the write path and page writeback path.
+        *
+        * also see ceph_try_drop_cap_snap()
+        */
+       if (has_new_snaps(old_snapc, new_snapc)) {
+               if (dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))
+                       capsnap->need_flush = true;
+       } else {
+               if (!(used & CEPH_CAP_FILE_WR) &&
+                   ci->i_wrbuffer_ref_head == 0) {
+                       dout("queue_cap_snap %p "
+                            "no new_snap|dirty_page|writing\n", inode);
+                       goto update_snapc;
                }
+       }
 
-               capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
-
-               /* dirty page count moved from _head to this cap_snap;
-                  all subsequent writes page dirties occur _after_ this
-                  snapshot. */
-               capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
-               ci->i_wrbuffer_ref_head = 0;
-               capsnap->context = snapc;
-               ci->i_head_snapc =
-                       ceph_get_snap_context(ci->i_snap_realm->cached_context);
-               dout(" new snapc is %p\n", ci->i_head_snapc);
-               list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
-
-               if (used & CEPH_CAP_FILE_WR) {
-                       dout("queue_cap_snap %p cap_snap %p snapc %p"
-                            " seq %llu used WR, now pending\n", inode,
-                            capsnap, snapc, snapc->seq);
-                       capsnap->writing = 1;
-               } else {
-                       /* note mtime, size NOW. */
-                       __ceph_finish_cap_snap(ci, capsnap);
-               }
+       dout("queue_cap_snap %p cap_snap %p queuing under %p %s %s\n",
+            inode, capsnap, old_snapc, ceph_cap_string(dirty),
+            capsnap->need_flush ? "" : "no_flush");
+       ihold(inode);
+
+       atomic_set(&capsnap->nref, 1);
+       capsnap->ci = ci;
+       INIT_LIST_HEAD(&capsnap->ci_item);
+       INIT_LIST_HEAD(&capsnap->flushing_item);
+
+       capsnap->follows = old_snapc->seq;
+       capsnap->issued = __ceph_caps_issued(ci, NULL);
+       capsnap->dirty = dirty;
+
+       capsnap->mode = inode->i_mode;
+       capsnap->uid = inode->i_uid;
+       capsnap->gid = inode->i_gid;
+
+       if (dirty & CEPH_CAP_XATTR_EXCL) {
+               __ceph_build_xattrs_blob(ci);
+               capsnap->xattr_blob =
+                       ceph_buffer_get(ci->i_xattrs.blob);
+               capsnap->xattr_version = ci->i_xattrs.version;
        } else {
-               dout("queue_cap_snap %p nothing dirty|writing\n", inode);
-               kfree(capsnap);
+               capsnap->xattr_blob = NULL;
+               capsnap->xattr_version = 0;
        }
 
+       capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
+
+       /* dirty page count moved from _head to this cap_snap;
+          all subsequent writes page dirties occur _after_ this
+          snapshot. */
+       capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
+       ci->i_wrbuffer_ref_head = 0;
+       capsnap->context = old_snapc;
+       list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
+
+       if (used & CEPH_CAP_FILE_WR) {
+               dout("queue_cap_snap %p cap_snap %p snapc %p"
+                    " seq %llu used WR, now pending\n", inode,
+                    capsnap, old_snapc, old_snapc->seq);
+               capsnap->writing = 1;
+       } else {
+               /* note mtime, size NOW. */
+               __ceph_finish_cap_snap(ci, capsnap);
+       }
+       capsnap = NULL;
+       old_snapc = NULL;       /* reference now owned by capsnap->context */
+
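+       /* if i_head_snapc is set, move it to the realm's current snap
+        * context so that subsequent dirty data is attributed to the
+        * new snapshot context */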
+update_snapc:
+       if (ci->i_head_snapc) {
+               ci->i_head_snapc = ceph_get_snap_context(new_snapc);
+               dout(" new snapc is %p\n", new_snapc);
+       }
        spin_unlock(&ci->i_ceph_lock);
+
+       kfree(capsnap);
+       ceph_put_snap_context(old_snapc);
 }
 
 /*
@@ -699,6 +730,8 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
 
                /* queue realm for cap_snap creation */
                list_add(&realm->dirty_item, &dirty_realms);
+               if (realm->seq > mdsc->last_snap_seq)
+                       mdsc->last_snap_seq = realm->seq;
 
                invalidate = 1;
        } else if (!realm->cached_context) {
@@ -964,14 +997,14 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
 
 int __init ceph_snap_init(void)
 {
-       empty_snapc = ceph_create_snap_context(0, GFP_NOFS);
-       if (!empty_snapc)
+       ceph_empty_snapc = ceph_create_snap_context(0, GFP_NOFS);
+       if (!ceph_empty_snapc)
                return -ENOMEM;
-       empty_snapc->seq = 1;
+       ceph_empty_snapc->seq = 1;
        return 0;
 }
 
 void ceph_snap_exit(void)
 {
-       ceph_put_snap_context(empty_snapc);
+       ceph_put_snap_context(ceph_empty_snapc);
 }
index 4e9905374078a228d18762a46a782a925f8f1675..d1c833c321b92eff48d9f35bf7171ef0ac59e7bf 100644 (file)
@@ -134,10 +134,12 @@ enum {
        Opt_noino32,
        Opt_fscache,
        Opt_nofscache,
+       Opt_poolperm,
+       Opt_nopoolperm,
 #ifdef CONFIG_CEPH_FS_POSIX_ACL
        Opt_acl,
 #endif
-       Opt_noacl
+       Opt_noacl,
 };
 
 static match_table_t fsopt_tokens = {
@@ -165,6 +167,8 @@ static match_table_t fsopt_tokens = {
        {Opt_noino32, "noino32"},
        {Opt_fscache, "fsc"},
        {Opt_nofscache, "nofsc"},
+       {Opt_poolperm, "poolperm"},
+       {Opt_nopoolperm, "nopoolperm"},
 #ifdef CONFIG_CEPH_FS_POSIX_ACL
        {Opt_acl, "acl"},
 #endif
@@ -268,6 +272,12 @@ static int parse_fsopt_token(char *c, void *private)
        case Opt_nofscache:
                fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE;
                break;
+       case Opt_poolperm:
+               fsopt->flags &= ~CEPH_MOUNT_OPT_NOPOOLPERM;
+               break;
+       case Opt_nopoolperm:
+               fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM;
+               break;
 #ifdef CONFIG_CEPH_FS_POSIX_ACL
        case Opt_acl:
                fsopt->sb_flags |= MS_POSIXACL;
@@ -436,6 +447,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
                seq_puts(m, ",nodcache");
        if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE)
                seq_puts(m, ",fsc");
+       if (fsopt->flags & CEPH_MOUNT_OPT_NOPOOLPERM)
+               seq_puts(m, ",nopoolperm");
 
 #ifdef CONFIG_CEPH_FS_POSIX_ACL
        if (fsopt->sb_flags & MS_POSIXACL)
@@ -609,6 +622,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
  */
 struct kmem_cache *ceph_inode_cachep;
 struct kmem_cache *ceph_cap_cachep;
+struct kmem_cache *ceph_cap_flush_cachep;
 struct kmem_cache *ceph_dentry_cachep;
 struct kmem_cache *ceph_file_cachep;
 
@@ -634,6 +648,10 @@ static int __init init_caches(void)
                                     SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
        if (ceph_cap_cachep == NULL)
                goto bad_cap;
+       ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush,
+                                          SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+       if (ceph_cap_flush_cachep == NULL)
+               goto bad_cap_flush;
 
        ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
                                        SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
@@ -652,6 +670,8 @@ static int __init init_caches(void)
 bad_file:
        kmem_cache_destroy(ceph_dentry_cachep);
 bad_dentry:
+       kmem_cache_destroy(ceph_cap_flush_cachep);
+bad_cap_flush:
        kmem_cache_destroy(ceph_cap_cachep);
 bad_cap:
        kmem_cache_destroy(ceph_inode_cachep);
@@ -668,6 +688,7 @@ static void destroy_caches(void)
 
        kmem_cache_destroy(ceph_inode_cachep);
        kmem_cache_destroy(ceph_cap_cachep);
+       kmem_cache_destroy(ceph_cap_flush_cachep);
        kmem_cache_destroy(ceph_dentry_cachep);
        kmem_cache_destroy(ceph_file_cachep);
 
@@ -729,7 +750,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
        req->r_ino1.ino = CEPH_INO_ROOT;
        req->r_ino1.snap = CEPH_NOSNAP;
        req->r_started = started;
-       req->r_timeout = fsc->client->options->mount_timeout * HZ;
+       req->r_timeout = fsc->client->options->mount_timeout;
        req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
        req->r_num_caps = 2;
        err = ceph_mdsc_do_request(mdsc, NULL, req);
index fa20e131893956a5360b2f6c37cd1aa7fce7542e..860cc016e70d4ff463c1f7845fc648eaf58269c4 100644 (file)
@@ -35,6 +35,7 @@
 #define CEPH_MOUNT_OPT_INO32           (1<<8) /* 32 bit inos */
 #define CEPH_MOUNT_OPT_DCACHE          (1<<9) /* use dcache for readdir etc */
 #define CEPH_MOUNT_OPT_FSCACHE         (1<<10) /* use fscache */
+#define CEPH_MOUNT_OPT_NOPOOLPERM      (1<<11) /* no pool permission check */
 
 #define CEPH_MOUNT_OPT_DEFAULT    (CEPH_MOUNT_OPT_RBYTES | \
                                   CEPH_MOUNT_OPT_DCACHE)
@@ -121,11 +122,21 @@ struct ceph_cap {
        struct rb_node ci_node;          /* per-ci cap tree */
        struct ceph_mds_session *session;
        struct list_head session_caps;   /* per-session caplist */
-       int mds;
        u64 cap_id;       /* unique cap id (mds provided) */
-       int issued;       /* latest, from the mds */
-       int implemented;  /* implemented superset of issued (for revocation) */
-       int mds_wanted;
+       union {
+               /* in-use caps */
+               struct {
+                       int issued;       /* latest, from the mds */
+                       int implemented;  /* implemented superset of
+                                            issued (for revocation) */
+                       int mds, mds_wanted;
+               };
+               /* caps to release */
+               struct {
+                       u64 cap_ino;
+                       int queue_release;
+               };
+       };
        u32 seq, issue_seq, mseq;
        u32 cap_gen;      /* active/stale cycle */
        unsigned long last_used;
@@ -163,6 +174,7 @@ struct ceph_cap_snap {
        int writing;   /* a sync write is still in progress */
        int dirty_pages;     /* dirty pages awaiting writeback */
        bool inline_data;
+       bool need_flush;
 };
 
 static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
@@ -174,6 +186,17 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
        }
 }
 
+struct ceph_cap_flush {
+       u64 tid;
+       int caps;
+       bool kick;
+       struct rb_node g_node; // global
+       union {
+               struct rb_node i_node; // inode
+               struct list_head list;
+       };
+};
+
 /*
  * The frag tree describes how a directory is fragmented, potentially across
  * multiple metadata servers.  It is also used to indicate points where
@@ -259,9 +282,9 @@ struct ceph_inode_info {
        u32 i_time_warp_seq;
 
        unsigned i_ceph_flags;
-       int i_ordered_count;
-       atomic_t i_release_count;
-       atomic_t i_complete_count;
+       atomic64_t i_release_count;
+       atomic64_t i_ordered_count;
+       atomic64_t i_complete_seq[2];
 
        struct ceph_dir_layout i_dir_layout;
        struct ceph_file_layout i_layout;
@@ -283,11 +306,11 @@ struct ceph_inode_info {
        struct ceph_cap *i_auth_cap;     /* authoritative cap, if any */
        unsigned i_dirty_caps, i_flushing_caps;     /* mask of dirtied fields */
        struct list_head i_dirty_item, i_flushing_item;
-       u64 i_cap_flush_seq;
        /* we need to track cap writeback on a per-cap-bit basis, to allow
         * overlapping, pipelined cap flushes to the mds.  we can probably
         * reduce the tid to 8 bits if we're concerned about inode size. */
-       u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS];
+       struct ceph_cap_flush *i_prealloc_cap_flush;
+       struct rb_root i_cap_flush_tree;
        wait_queue_head_t i_cap_wq;      /* threads waiting on a capability */
        unsigned long i_hold_caps_min; /* jiffies */
        unsigned long i_hold_caps_max; /* jiffies */
@@ -438,36 +461,46 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
 /*
  * Ceph inode.
  */
-#define CEPH_I_DIR_ORDERED     1  /* dentries in dir are ordered */
-#define CEPH_I_NODELAY         4  /* do not delay cap release */
-#define CEPH_I_FLUSH           8  /* do not delay flush of dirty metadata */
-#define CEPH_I_NOFLUSH         16 /* do not flush dirty caps */
+#define CEPH_I_DIR_ORDERED     (1 << 0)  /* dentries in dir are ordered */
+#define CEPH_I_NODELAY         (1 << 1)  /* do not delay cap release */
+#define CEPH_I_FLUSH           (1 << 2)  /* do not delay flush of dirty metadata */
+#define CEPH_I_NOFLUSH         (1 << 3)  /* do not flush dirty caps */
+#define CEPH_I_POOL_PERM       (1 << 4)  /* pool rd/wr bits are valid */
+#define CEPH_I_POOL_RD         (1 << 5)  /* can read from pool */
+#define CEPH_I_POOL_WR         (1 << 6)  /* can write to pool */
+
 
 static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
-                                          int release_count, int ordered_count)
+                                          long long release_count,
+                                          long long ordered_count)
 {
-       atomic_set(&ci->i_complete_count, release_count);
-       if (ci->i_ordered_count == ordered_count)
-               ci->i_ceph_flags |= CEPH_I_DIR_ORDERED;
-       else
-               ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED;
+       smp_mb__before_atomic();
+       atomic64_set(&ci->i_complete_seq[0], release_count);
+       atomic64_set(&ci->i_complete_seq[1], ordered_count);
 }
 
 static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci)
 {
-       atomic_inc(&ci->i_release_count);
+       atomic64_inc(&ci->i_release_count);
+}
+
+static inline void __ceph_dir_clear_ordered(struct ceph_inode_info *ci)
+{
+       atomic64_inc(&ci->i_ordered_count);
 }
 
 static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci)
 {
-       return atomic_read(&ci->i_complete_count) ==
-               atomic_read(&ci->i_release_count);
+       return atomic64_read(&ci->i_complete_seq[0]) ==
+               atomic64_read(&ci->i_release_count);
 }
 
 static inline bool __ceph_dir_is_complete_ordered(struct ceph_inode_info *ci)
 {
-       return __ceph_dir_is_complete(ci) &&
-               (ci->i_ceph_flags & CEPH_I_DIR_ORDERED);
+       return  atomic64_read(&ci->i_complete_seq[0]) ==
+               atomic64_read(&ci->i_release_count) &&
+               atomic64_read(&ci->i_complete_seq[1]) ==
+               atomic64_read(&ci->i_ordered_count);
 }
 
 static inline void ceph_dir_clear_complete(struct inode *inode)
@@ -477,20 +510,13 @@ static inline void ceph_dir_clear_complete(struct inode *inode)
 
 static inline void ceph_dir_clear_ordered(struct inode *inode)
 {
-       struct ceph_inode_info *ci = ceph_inode(inode);
-       spin_lock(&ci->i_ceph_lock);
-       ci->i_ordered_count++;
-       ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED;
-       spin_unlock(&ci->i_ceph_lock);
+       __ceph_dir_clear_ordered(ceph_inode(inode));
 }
 
 static inline bool ceph_dir_is_complete_ordered(struct inode *inode)
 {
-       struct ceph_inode_info *ci = ceph_inode(inode);
-       bool ret;
-       spin_lock(&ci->i_ceph_lock);
-       ret = __ceph_dir_is_complete_ordered(ci);
-       spin_unlock(&ci->i_ceph_lock);
+       bool ret = __ceph_dir_is_complete_ordered(ceph_inode(inode));
+       smp_rmb();
        return ret;
 }
 
@@ -552,7 +578,10 @@ static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
 {
        return ci->i_dirty_caps | ci->i_flushing_caps;
 }
-extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
+extern struct ceph_cap_flush *ceph_alloc_cap_flush(void);
+extern void ceph_free_cap_flush(struct ceph_cap_flush *cf);
+extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
+                                 struct ceph_cap_flush **pcf);
 
 extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
                                      struct ceph_cap *ocap, int mask);
@@ -606,16 +635,20 @@ struct ceph_file_info {
        unsigned offset;       /* offset of last chunk, adjusted for . and .. */
        unsigned next_offset;  /* offset of next chunk (last_name's + 1) */
        char *last_name;       /* last entry in previous chunk */
-       struct dentry *dentry; /* next dentry (for dcache readdir) */
-       int dir_release_count;
-       int dir_ordered_count;
+       long long dir_release_count;
+       long long dir_ordered_count;
+       int readdir_cache_idx;
 
        /* used for -o dirstat read() on directory thing */
        char *dir_info;
        int dir_info_len;
 };
 
-
+struct ceph_readdir_cache_control {
+       struct page  *page;
+       struct dentry **dentries;
+       int index;
+};
 
 /*
  * A "snap realm" describes a subset of the file hierarchy sharing
@@ -687,6 +720,7 @@ static inline int default_congestion_kb(void)
 
 
 /* snap.c */
+extern struct ceph_snap_context *ceph_empty_snapc;
 struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
                                               u64 ino);
 extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
@@ -713,8 +747,8 @@ extern void ceph_snap_exit(void);
 static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
 {
        return !list_empty(&ci->i_cap_snaps) &&
-               list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap,
-                          ci_item)->writing;
+              list_last_entry(&ci->i_cap_snaps, struct ceph_cap_snap,
+                              ci_item)->writing;
 }
 
 /* inode.c */
@@ -838,12 +872,12 @@ extern void ceph_put_cap(struct ceph_mds_client *mdsc,
                         struct ceph_cap *cap);
 extern int ceph_is_any_caps(struct inode *inode);
 
-extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino,
-                               u64 cap_id, u32 migrate_seq, u32 issue_seq);
 extern void ceph_queue_caps_release(struct inode *inode);
 extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
 extern int ceph_fsync(struct file *file, loff_t start, loff_t end,
                      int datasync);
+extern void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
+                                         struct ceph_mds_session *session);
 extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
                                    struct ceph_mds_session *session);
 extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci,
@@ -879,6 +913,9 @@ extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
 /* addr.c */
 extern const struct address_space_operations ceph_aops;
 extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
+extern int ceph_uninline_data(struct file *filp, struct page *locked_page);
+extern int ceph_pool_perm_check(struct ceph_inode_info *ci, int need);
+extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
 
 /* file.c */
 extern const struct file_operations ceph_file_fops;
@@ -890,7 +927,6 @@ extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 extern int ceph_release(struct inode *inode, struct file *filp);
 extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
                                  char *data, size_t len);
-int ceph_uninline_data(struct file *filp, struct page *locked_page);
 /* dir.c */
 extern const struct file_operations ceph_dir_fops;
 extern const struct file_operations ceph_snapdir_fops;
@@ -911,6 +947,7 @@ extern void ceph_dentry_lru_del(struct dentry *dn);
 extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
 extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn);
 extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry);
+extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl);
 
 /*
  * our d_ops vary depending on whether the inode is live,
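
The completeness tracking above becomes lock-free: the readdir filler snapshots i_release_count and i_ordered_count before it starts caching dentries, and __ceph_dir_set_complete() publishes that snapshot into i_complete_seq[] when it finishes; any __ceph_dir_clear_complete() or __ceph_dir_clear_ordered() in between bumps the corresponding counter, so the final comparison quietly rejects a cache that raced with an invalidation. A minimal, self-contained user-space sketch of the same pattern, with hypothetical names and C11 atomics instead of the kernel's atomic64/smp_* primitives:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct dir_cache {
	atomic_llong release_count;	/* bumped whenever an entry is dropped */
	atomic_llong complete_seq;	/* snapshot recorded by the filler     */
};

/* invalidation side (~ __ceph_dir_clear_complete) */
static void cache_invalidate(struct dir_cache *d)
{
	atomic_fetch_add(&d->release_count, 1);
}

/* filler side: snapshot first, fill, then publish the snapshot */
static void cache_fill(struct dir_cache *d)
{
	long long snap = atomic_load(&d->release_count);

	/* ... populate the cache here ... */

	atomic_store(&d->complete_seq, snap);
}

/* reader side (~ __ceph_dir_is_complete): complete only if nothing
 * was invalidated since the filler's snapshot */
static bool cache_is_complete(struct dir_cache *d)
{
	return atomic_load(&d->complete_seq) ==
	       atomic_load(&d->release_count);
}

int main(void)
{
	struct dir_cache d;

	atomic_init(&d.release_count, 0);
	atomic_init(&d.complete_seq, -1);	/* not complete yet */

	cache_fill(&d);
	printf("complete: %d\n", cache_is_complete(&d));	/* 1 */
	cache_invalidate(&d);
	printf("complete: %d\n", cache_is_complete(&d));	/* 0 */
	return 0;
}

The kernel version additionally pairs smp_mb__before_atomic() in the setter with smp_rmb() in ceph_dir_is_complete_ordered(), so the cached dentries themselves are visible before the counters report the cache as complete.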
index cd7ffad4041d81b605bdfbc97a4bc0072c19ca99..819163d8313bb3748765b5c3313dfdfd32ac9472 100644 (file)
@@ -911,6 +911,8 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
        struct inode *inode = d_inode(dentry);
        struct ceph_vxattr *vxattr;
        struct ceph_inode_info *ci = ceph_inode(inode);
+       struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
+       struct ceph_cap_flush *prealloc_cf = NULL;
        int issued;
        int err;
        int dirty = 0;
@@ -920,6 +922,7 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
        char *newval = NULL;
        struct ceph_inode_xattr *xattr = NULL;
        int required_blob_size;
+       bool lock_snap_rwsem = false;
 
        if (!ceph_is_valid_xattr(name))
                return -EOPNOTSUPP;
@@ -948,12 +951,27 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
        if (!xattr)
                goto out;
 
+       prealloc_cf = ceph_alloc_cap_flush();
+       if (!prealloc_cf)
+               goto out;
+
        spin_lock(&ci->i_ceph_lock);
 retry:
        issued = __ceph_caps_issued(ci, NULL);
-       dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
        if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL))
                goto do_sync;
+
+       if (!lock_snap_rwsem && !ci->i_head_snapc) {
+               lock_snap_rwsem = true;
+               if (!down_read_trylock(&mdsc->snap_rwsem)) {
+                       spin_unlock(&ci->i_ceph_lock);
+                       down_read(&mdsc->snap_rwsem);
+                       spin_lock(&ci->i_ceph_lock);
+                       goto retry;
+               }
+       }
+
+       dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
        __build_xattrs(inode);
 
        required_blob_size = __get_required_blob_size(ci, name_len, val_len);
@@ -966,7 +984,7 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
                dout(" preaallocating new blob size=%d\n", required_blob_size);
                blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
                if (!blob)
-                       goto out;
+                       goto do_sync_unlocked;
                spin_lock(&ci->i_ceph_lock);
                if (ci->i_xattrs.prealloc_blob)
                        ceph_buffer_put(ci->i_xattrs.prealloc_blob);
@@ -978,21 +996,28 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
                          flags, value ? 1 : -1, &xattr);
 
        if (!err) {
-               dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
+               dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
+                                              &prealloc_cf);
                ci->i_xattrs.dirty = true;
                inode->i_ctime = CURRENT_TIME;
        }
 
        spin_unlock(&ci->i_ceph_lock);
+       if (lock_snap_rwsem)
+               up_read(&mdsc->snap_rwsem);
        if (dirty)
                __mark_inode_dirty(inode, dirty);
+       ceph_free_cap_flush(prealloc_cf);
        return err;
 
 do_sync:
        spin_unlock(&ci->i_ceph_lock);
 do_sync_unlocked:
+       if (lock_snap_rwsem)
+               up_read(&mdsc->snap_rwsem);
        err = ceph_sync_setxattr(dentry, name, value, size, flags);
 out:
+       ceph_free_cap_flush(prealloc_cf);
        kfree(newname);
        kfree(newval);
        kfree(xattr);
@@ -1044,10 +1069,13 @@ int __ceph_removexattr(struct dentry *dentry, const char *name)
        struct inode *inode = d_inode(dentry);
        struct ceph_vxattr *vxattr;
        struct ceph_inode_info *ci = ceph_inode(inode);
+       struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
+       struct ceph_cap_flush *prealloc_cf = NULL;
        int issued;
        int err;
        int required_blob_size;
        int dirty;
+       bool lock_snap_rwsem = false;
 
        if (!ceph_is_valid_xattr(name))
                return -EOPNOTSUPP;
@@ -1060,14 +1088,29 @@ int __ceph_removexattr(struct dentry *dentry, const char *name)
        if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
                goto do_sync_unlocked;
 
+       prealloc_cf = ceph_alloc_cap_flush();
+       if (!prealloc_cf)
+               return -ENOMEM;
+
        err = -ENOMEM;
        spin_lock(&ci->i_ceph_lock);
 retry:
        issued = __ceph_caps_issued(ci, NULL);
-       dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
-
        if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL))
                goto do_sync;
+
+       if (!lock_snap_rwsem && !ci->i_head_snapc) {
+               lock_snap_rwsem = true;
+               if (!down_read_trylock(&mdsc->snap_rwsem)) {
+                       spin_unlock(&ci->i_ceph_lock);
+                       down_read(&mdsc->snap_rwsem);
+                       spin_lock(&ci->i_ceph_lock);
+                       goto retry;
+               }
+       }
+
+       dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
+
        __build_xattrs(inode);
 
        required_blob_size = __get_required_blob_size(ci, 0, 0);
@@ -1080,7 +1123,7 @@ int __ceph_removexattr(struct dentry *dentry, const char *name)
                dout(" preaallocating new blob size=%d\n", required_blob_size);
                blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
                if (!blob)
-                       goto out;
+                       goto do_sync_unlocked;
                spin_lock(&ci->i_ceph_lock);
                if (ci->i_xattrs.prealloc_blob)
                        ceph_buffer_put(ci->i_xattrs.prealloc_blob);
@@ -1090,18 +1133,24 @@ int __ceph_removexattr(struct dentry *dentry, const char *name)
 
        err = __remove_xattr_by_name(ceph_inode(inode), name);
 
-       dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
+       dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
+                                      &prealloc_cf);
        ci->i_xattrs.dirty = true;
        inode->i_ctime = CURRENT_TIME;
        spin_unlock(&ci->i_ceph_lock);
+       if (lock_snap_rwsem)
+               up_read(&mdsc->snap_rwsem);
        if (dirty)
                __mark_inode_dirty(inode, dirty);
+       ceph_free_cap_flush(prealloc_cf);
        return err;
 do_sync:
        spin_unlock(&ci->i_ceph_lock);
 do_sync_unlocked:
+       if (lock_snap_rwsem)
+               up_read(&mdsc->snap_rwsem);
+       ceph_free_cap_flush(prealloc_cf);
        err = ceph_send_removexattr(dentry, name);
-out:
        return err;
 }
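
Two patterns recur in the xattr hunks above: a ceph_cap_flush is preallocated with ceph_alloc_cap_flush() before i_ceph_lock is taken, so __ceph_mark_dirty_caps() never has to allocate under the spinlock, and snap_rwsem is taken with down_read_trylock() first, falling back to dropping the spinlock, sleeping in down_read(), re-taking the spinlock and retrying. A minimal, self-contained pthreads sketch of the second pattern; the names are hypothetical, and pthread locks stand in for the kernel's spinlock and rwsem, whose real constraint is that a spinlock holder must not sleep:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t  ceph_lock  = PTHREAD_MUTEX_INITIALIZER;  /* ~ ci->i_ceph_lock  */
static pthread_rwlock_t snap_rwsem = PTHREAD_RWLOCK_INITIALIZER; /* ~ mdsc->snap_rwsem */

/* Acquire both locks without ever blocking on the rwsem while the
 * (notionally non-sleepable) ceph_lock is held: try first, and on
 * contention back out, take the rwsem the slow way, then re-take the
 * first lock and re-validate. */
static void lock_both(void)
{
	bool have_rwsem = false;

	pthread_mutex_lock(&ceph_lock);
retry:
	if (!have_rwsem) {
		if (pthread_rwlock_tryrdlock(&snap_rwsem) != 0) {
			pthread_mutex_unlock(&ceph_lock);
			pthread_rwlock_rdlock(&snap_rwsem);	/* may block */
			have_rwsem = true;
			pthread_mutex_lock(&ceph_lock);
			goto retry;	/* state may have changed meanwhile */
		}
		have_rwsem = true;
	}
	/* both held here; the caller does its work and unlocks */
}

int main(void)
{
	lock_both();
	pthread_mutex_unlock(&ceph_lock);
	pthread_rwlock_unlock(&snap_rwsem);
	puts("acquired and released both locks");
	return 0;
}

After the goto retry the issued caps are re-checked under the lock, since they may have changed while the lock was dropped; the hunks above do exactly that by re-evaluating __ceph_caps_issued().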
 
index e5bbf748b6987a922fa211e3edb083346927f3b6..eae2c11268bcb484075cfd08482beeb172dd66bc 100644 (file)
@@ -489,6 +489,7 @@ static void cuse_fc_release(struct fuse_conn *fc)
  */
 static int cuse_channel_open(struct inode *inode, struct file *file)
 {
+       struct fuse_dev *fud;
        struct cuse_conn *cc;
        int rc;
 
@@ -499,17 +500,22 @@ static int cuse_channel_open(struct inode *inode, struct file *file)
 
        fuse_conn_init(&cc->fc);
 
+       fud = fuse_dev_alloc(&cc->fc);
+       if (!fud) {
+               kfree(cc);
+               return -ENOMEM;
+       }
+
        INIT_LIST_HEAD(&cc->list);
        cc->fc.release = cuse_fc_release;
 
-       cc->fc.connected = 1;
        cc->fc.initialized = 1;
        rc = cuse_send_init(cc);
        if (rc) {
-               fuse_conn_put(&cc->fc);
+               fuse_dev_free(fud);
                return rc;
        }
-       file->private_data = &cc->fc;   /* channel owns base reference to cc */
+       file->private_data = fud;
 
        return 0;
 }
@@ -527,7 +533,8 @@ static int cuse_channel_open(struct inode *inode, struct file *file)
  */
 static int cuse_channel_release(struct inode *inode, struct file *file)
 {
-       struct cuse_conn *cc = fc_to_cc(file->private_data);
+       struct fuse_dev *fud = file->private_data;
+       struct cuse_conn *cc = fc_to_cc(fud->fc);
        int rc;
 
        /* remove from the conntbl, no more access from this point on */
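
The fuse device hunks that follow replace the single fc->lock with an input-queue lock (fiq->waitq.lock) and a per-device processing-queue lock (fpq->lock), and turn the request state machine into atomic flag bits (FR_PENDING, FR_SENT, FR_INTERRUPTED, FR_FINISHED, ...). The interrupt handoff relies on a classic pairing: each side sets its own bit, issues a full barrier, then tests the other side's bit, so at least one of them observes both and queues the interrupt. A minimal, self-contained C11 sketch of that pairing with hypothetical names; the kernel uses set_bit/test_bit plus smp_mb__after_atomic(), and its queue_interrupt() deduplicates via list_empty():

#include <stdatomic.h>
#include <stdio.h>

#define FR_SENT        (1u << 0)
#define FR_INTERRUPTED (1u << 1)

static void queue_interrupt(const char *who)
{
	printf("%s queues the interrupt\n", who);
}

/* ~ fuse_dev_do_read(): mark the request sent, then look for an
 * interrupt that may have raced in. */
static void reader_side(atomic_uint *flags)
{
	atomic_fetch_or(flags, FR_SENT);	/* seq_cst RMW acts as the barrier */
	if (atomic_load(flags) & FR_INTERRUPTED)
		queue_interrupt("reader");
}

/* ~ request_wait_answer(): mark the request interrupted, then look for
 * FR_SENT; one of the two sides is guaranteed to see both flags. */
static void waiter_side(atomic_uint *flags)
{
	atomic_fetch_or(flags, FR_INTERRUPTED);
	if (atomic_load(flags) & FR_SENT)
		queue_interrupt("waiter");
}

int main(void)
{
	atomic_uint flags;

	atomic_init(&flags, 0);
	reader_side(&flags);
	waiter_side(&flags);
	return 0;
}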
index c8b68ab2e574a86f13fab97f9ed47b14a4e139d6..80cc1b35d46043c16bc456e0cadf61e76c281d52 100644 (file)
@@ -25,13 +25,13 @@ MODULE_ALIAS("devname:fuse");
 
 static struct kmem_cache *fuse_req_cachep;
 
-static struct fuse_conn *fuse_get_conn(struct file *file)
+static struct fuse_dev *fuse_get_dev(struct file *file)
 {
        /*
         * Lockless access is OK, because file->private_data is set
         * once during mount and is valid until the file is released.
         */
-       return file->private_data;
+       return ACCESS_ONCE(file->private_data);
 }
 
 static void fuse_request_init(struct fuse_req *req, struct page **pages,
@@ -48,6 +48,7 @@ static void fuse_request_init(struct fuse_req *req, struct page **pages,
        req->pages = pages;
        req->page_descs = page_descs;
        req->max_pages = npages;
+       __set_bit(FR_PENDING, &req->flags);
 }
 
 static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags)
@@ -168,6 +169,10 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages,
        if (!fc->connected)
                goto out;
 
+       err = -ECONNREFUSED;
+       if (fc->conn_error)
+               goto out;
+
        req = fuse_request_alloc(npages);
        err = -ENOMEM;
        if (!req) {
@@ -177,8 +182,10 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages,
        }
 
        fuse_req_init_context(req);
-       req->waiting = 1;
-       req->background = for_background;
+       __set_bit(FR_WAITING, &req->flags);
+       if (for_background)
+               __set_bit(FR_BACKGROUND, &req->flags);
+
        return req;
 
  out:
@@ -268,15 +275,15 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
                req = get_reserved_req(fc, file);
 
        fuse_req_init_context(req);
-       req->waiting = 1;
-       req->background = 0;
+       __set_bit(FR_WAITING, &req->flags);
+       __clear_bit(FR_BACKGROUND, &req->flags);
        return req;
 }
 
 void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
 {
        if (atomic_dec_and_test(&req->count)) {
-               if (unlikely(req->background)) {
+               if (test_bit(FR_BACKGROUND, &req->flags)) {
                        /*
                         * We get here in the unlikely case that a background
                         * request was allocated but not sent
@@ -287,8 +294,10 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
                        spin_unlock(&fc->lock);
                }
 
-               if (req->waiting)
+               if (test_bit(FR_WAITING, &req->flags)) {
+                       __clear_bit(FR_WAITING, &req->flags);
                        atomic_dec(&fc->num_waiting);
+               }
 
                if (req->stolen_file)
                        put_reserved_req(fc, req);
@@ -309,46 +318,38 @@ static unsigned len_args(unsigned numargs, struct fuse_arg *args)
        return nbytes;
 }
 
-static u64 fuse_get_unique(struct fuse_conn *fc)
+static u64 fuse_get_unique(struct fuse_iqueue *fiq)
 {
-       fc->reqctr++;
-       /* zero is special */
-       if (fc->reqctr == 0)
-               fc->reqctr = 1;
-
-       return fc->reqctr;
+       return ++fiq->reqctr;
 }
 
-static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
+static void queue_request(struct fuse_iqueue *fiq, struct fuse_req *req)
 {
        req->in.h.len = sizeof(struct fuse_in_header) +
                len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
-       list_add_tail(&req->list, &fc->pending);
-       req->state = FUSE_REQ_PENDING;
-       if (!req->waiting) {
-               req->waiting = 1;
-               atomic_inc(&fc->num_waiting);
-       }
-       wake_up(&fc->waitq);
-       kill_fasync(&fc->fasync, SIGIO, POLL_IN);
+       list_add_tail(&req->list, &fiq->pending);
+       wake_up_locked(&fiq->waitq);
+       kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
 }
 
 void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
                       u64 nodeid, u64 nlookup)
 {
+       struct fuse_iqueue *fiq = &fc->iq;
+
        forget->forget_one.nodeid = nodeid;
        forget->forget_one.nlookup = nlookup;
 
-       spin_lock(&fc->lock);
-       if (fc->connected) {
-               fc->forget_list_tail->next = forget;
-               fc->forget_list_tail = forget;
-               wake_up(&fc->waitq);
-               kill_fasync(&fc->fasync, SIGIO, POLL_IN);
+       spin_lock(&fiq->waitq.lock);
+       if (fiq->connected) {
+               fiq->forget_list_tail->next = forget;
+               fiq->forget_list_tail = forget;
+               wake_up_locked(&fiq->waitq);
+               kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
        } else {
                kfree(forget);
        }
-       spin_unlock(&fc->lock);
+       spin_unlock(&fiq->waitq.lock);
 }
 
 static void flush_bg_queue(struct fuse_conn *fc)
@@ -356,12 +357,15 @@ static void flush_bg_queue(struct fuse_conn *fc)
        while (fc->active_background < fc->max_background &&
               !list_empty(&fc->bg_queue)) {
                struct fuse_req *req;
+               struct fuse_iqueue *fiq = &fc->iq;
 
                req = list_entry(fc->bg_queue.next, struct fuse_req, list);
                list_del(&req->list);
                fc->active_background++;
-               req->in.h.unique = fuse_get_unique(fc);
-               queue_request(fc, req);
+               spin_lock(&fiq->waitq.lock);
+               req->in.h.unique = fuse_get_unique(fiq);
+               queue_request(fiq, req);
+               spin_unlock(&fiq->waitq.lock);
        }
 }
 
@@ -372,20 +376,22 @@ static void flush_bg_queue(struct fuse_conn *fc)
  * was closed.  The requester thread is woken up (if still waiting),
  * the 'end' callback is called if given, else the reference to the
  * request is released
- *
- * Called with fc->lock, unlocks it
  */
 static void request_end(struct fuse_conn *fc, struct fuse_req *req)
-__releases(fc->lock)
 {
-       void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
-       req->end = NULL;
-       list_del(&req->list);
-       list_del(&req->intr_entry);
-       req->state = FUSE_REQ_FINISHED;
-       if (req->background) {
-               req->background = 0;
+       struct fuse_iqueue *fiq = &fc->iq;
+
+       if (test_and_set_bit(FR_FINISHED, &req->flags))
+               return;
 
+       spin_lock(&fiq->waitq.lock);
+       list_del_init(&req->intr_entry);
+       spin_unlock(&fiq->waitq.lock);
+       WARN_ON(test_bit(FR_PENDING, &req->flags));
+       WARN_ON(test_bit(FR_SENT, &req->flags));
+       if (test_bit(FR_BACKGROUND, &req->flags)) {
+               spin_lock(&fc->lock);
+               clear_bit(FR_BACKGROUND, &req->flags);
                if (fc->num_background == fc->max_background)
                        fc->blocked = 0;
 
@@ -401,122 +407,105 @@ __releases(fc->lock)
                fc->num_background--;
                fc->active_background--;
                flush_bg_queue(fc);
+               spin_unlock(&fc->lock);
        }
-       spin_unlock(&fc->lock);
        wake_up(&req->waitq);
-       if (end)
-               end(fc, req);
+       if (req->end)
+               req->end(fc, req);
        fuse_put_request(fc, req);
 }
 
-static void wait_answer_interruptible(struct fuse_conn *fc,
-                                     struct fuse_req *req)
-__releases(fc->lock)
-__acquires(fc->lock)
-{
-       if (signal_pending(current))
-               return;
-
-       spin_unlock(&fc->lock);
-       wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED);
-       spin_lock(&fc->lock);
-}
-
-static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req)
+static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
 {
-       list_add_tail(&req->intr_entry, &fc->interrupts);
-       wake_up(&fc->waitq);
-       kill_fasync(&fc->fasync, SIGIO, POLL_IN);
+       spin_lock(&fiq->waitq.lock);
+       if (list_empty(&req->intr_entry)) {
+               list_add_tail(&req->intr_entry, &fiq->interrupts);
+               wake_up_locked(&fiq->waitq);
+       }
+       spin_unlock(&fiq->waitq.lock);
+       kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
 }
 
 static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
-__releases(fc->lock)
-__acquires(fc->lock)
 {
+       struct fuse_iqueue *fiq = &fc->iq;
+       int err;
+
        if (!fc->no_interrupt) {
                /* Any signal may interrupt this */
-               wait_answer_interruptible(fc, req);
-
-               if (req->aborted)
-                       goto aborted;
-               if (req->state == FUSE_REQ_FINISHED)
+               err = wait_event_interruptible(req->waitq,
+                                       test_bit(FR_FINISHED, &req->flags));
+               if (!err)
                        return;
 
-               req->interrupted = 1;
-               if (req->state == FUSE_REQ_SENT)
-                       queue_interrupt(fc, req);
+               set_bit(FR_INTERRUPTED, &req->flags);
+               /* matches barrier in fuse_dev_do_read() */
+               smp_mb__after_atomic();
+               if (test_bit(FR_SENT, &req->flags))
+                       queue_interrupt(fiq, req);
        }
 
-       if (!req->force) {
+       if (!test_bit(FR_FORCE, &req->flags)) {
                sigset_t oldset;
 
                /* Only fatal signals may interrupt this */
                block_sigs(&oldset);
-               wait_answer_interruptible(fc, req);
+               err = wait_event_interruptible(req->waitq,
+                                       test_bit(FR_FINISHED, &req->flags));
                restore_sigs(&oldset);
 
-               if (req->aborted)
-                       goto aborted;
-               if (req->state == FUSE_REQ_FINISHED)
+               if (!err)
                        return;
 
+               spin_lock(&fiq->waitq.lock);
                /* Request is not yet in userspace, bail out */
-               if (req->state == FUSE_REQ_PENDING) {
+               if (test_bit(FR_PENDING, &req->flags)) {
                        list_del(&req->list);
+                       spin_unlock(&fiq->waitq.lock);
                        __fuse_put_request(req);
                        req->out.h.error = -EINTR;
                        return;
                }
+               spin_unlock(&fiq->waitq.lock);
        }
 
        /*
         * Either request is already in userspace, or it was forced.
         * Wait it out.
         */
-       spin_unlock(&fc->lock);
-       wait_event(req->waitq, req->state == FUSE_REQ_FINISHED);
-       spin_lock(&fc->lock);
-
-       if (!req->aborted)
-               return;
-
- aborted:
-       BUG_ON(req->state != FUSE_REQ_FINISHED);
-       if (req->locked) {
-               /* This is uninterruptible sleep, because data is
-                  being copied to/from the buffers of req.  During
-                  locked state, there mustn't be any filesystem
-                  operation (e.g. page fault), since that could lead
-                  to deadlock */
-               spin_unlock(&fc->lock);
-               wait_event(req->waitq, !req->locked);
-               spin_lock(&fc->lock);
-       }
+       wait_event(req->waitq, test_bit(FR_FINISHED, &req->flags));
 }
 
 static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
 {
-       BUG_ON(req->background);
-       spin_lock(&fc->lock);
-       if (!fc->connected)
+       struct fuse_iqueue *fiq = &fc->iq;
+
+       BUG_ON(test_bit(FR_BACKGROUND, &req->flags));
+       spin_lock(&fiq->waitq.lock);
+       if (!fiq->connected) {
+               spin_unlock(&fiq->waitq.lock);
                req->out.h.error = -ENOTCONN;
-       else if (fc->conn_error)
-               req->out.h.error = -ECONNREFUSED;
-       else {
-               req->in.h.unique = fuse_get_unique(fc);
-               queue_request(fc, req);
+       } else {
+               req->in.h.unique = fuse_get_unique(fiq);
+               queue_request(fiq, req);
                /* acquire extra reference, since request is still needed
                   after request_end() */
                __fuse_get_request(req);
+               spin_unlock(&fiq->waitq.lock);
 
                request_wait_answer(fc, req);
+               /* Pairs with smp_wmb() in request_end() */
+               smp_rmb();
        }
-       spin_unlock(&fc->lock);
 }
 
 void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
 {
-       req->isreply = 1;
+       __set_bit(FR_ISREPLY, &req->flags);
+       if (!test_bit(FR_WAITING, &req->flags)) {
+               __set_bit(FR_WAITING, &req->flags);
+               atomic_inc(&fc->num_waiting);
+       }
        __fuse_request_send(fc, req);
 }
 EXPORT_SYMBOL_GPL(fuse_request_send);
@@ -586,10 +575,20 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args)
        return ret;
 }
 
-static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
-                                           struct fuse_req *req)
+/*
+ * Called under fc->lock
+ *
+ * fc->connected must have been checked previously
+ */
+void fuse_request_send_background_locked(struct fuse_conn *fc,
+                                        struct fuse_req *req)
 {
-       BUG_ON(!req->background);
+       BUG_ON(!test_bit(FR_BACKGROUND, &req->flags));
+       if (!test_bit(FR_WAITING, &req->flags)) {
+               __set_bit(FR_WAITING, &req->flags);
+               atomic_inc(&fc->num_waiting);
+       }
+       __set_bit(FR_ISREPLY, &req->flags);
        fc->num_background++;
        if (fc->num_background == fc->max_background)
                fc->blocked = 1;
@@ -602,54 +601,40 @@ static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
        flush_bg_queue(fc);
 }
 
-static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
 {
+       BUG_ON(!req->end);
        spin_lock(&fc->lock);
        if (fc->connected) {
-               fuse_request_send_nowait_locked(fc, req);
+               fuse_request_send_background_locked(fc, req);
                spin_unlock(&fc->lock);
        } else {
+               spin_unlock(&fc->lock);
                req->out.h.error = -ENOTCONN;
-               request_end(fc, req);
+               req->end(fc, req);
+               fuse_put_request(fc, req);
        }
 }
-
-void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
-{
-       req->isreply = 1;
-       fuse_request_send_nowait(fc, req);
-}
 EXPORT_SYMBOL_GPL(fuse_request_send_background);
 
 static int fuse_request_send_notify_reply(struct fuse_conn *fc,
                                          struct fuse_req *req, u64 unique)
 {
        int err = -ENODEV;
+       struct fuse_iqueue *fiq = &fc->iq;
 
-       req->isreply = 0;
+       __clear_bit(FR_ISREPLY, &req->flags);
        req->in.h.unique = unique;
-       spin_lock(&fc->lock);
-       if (fc->connected) {
-               queue_request(fc, req);
+       spin_lock(&fiq->waitq.lock);
+       if (fiq->connected) {
+               queue_request(fiq, req);
                err = 0;
        }
-       spin_unlock(&fc->lock);
+       spin_unlock(&fiq->waitq.lock);
 
        return err;
 }
 
-/*
- * Called under fc->lock
- *
- * fc->connected must have been checked previously
- */
-void fuse_request_send_background_locked(struct fuse_conn *fc,
-                                        struct fuse_req *req)
-{
-       req->isreply = 1;
-       fuse_request_send_nowait_locked(fc, req);
-}
-
 void fuse_force_forget(struct file *file, u64 nodeid)
 {
        struct inode *inode = file_inode(file);
@@ -665,7 +650,7 @@ void fuse_force_forget(struct file *file, u64 nodeid)
        req->in.numargs = 1;
        req->in.args[0].size = sizeof(inarg);
        req->in.args[0].value = &inarg;
-       req->isreply = 0;
+       __clear_bit(FR_ISREPLY, &req->flags);
        __fuse_request_send(fc, req);
        /* ignore errors */
        fuse_put_request(fc, req);
@@ -676,38 +661,39 @@ void fuse_force_forget(struct file *file, u64 nodeid)
  * anything that could cause a page-fault.  If the request was already
  * aborted bail out.
  */
-static int lock_request(struct fuse_conn *fc, struct fuse_req *req)
+static int lock_request(struct fuse_req *req)
 {
        int err = 0;
        if (req) {
-               spin_lock(&fc->lock);
-               if (req->aborted)
+               spin_lock(&req->waitq.lock);
+               if (test_bit(FR_ABORTED, &req->flags))
                        err = -ENOENT;
                else
-                       req->locked = 1;
-               spin_unlock(&fc->lock);
+                       set_bit(FR_LOCKED, &req->flags);
+               spin_unlock(&req->waitq.lock);
        }
        return err;
 }
 
 /*
- * Unlock request.  If it was aborted during being locked, the
- * requester thread is currently waiting for it to be unlocked, so
- * wake it up.
+ * Unlock request.  If it was aborted while locked, caller is responsible
+ * for unlocking and ending the request.
  */
-static void unlock_request(struct fuse_conn *fc, struct fuse_req *req)
+static int unlock_request(struct fuse_req *req)
 {
+       int err = 0;
        if (req) {
-               spin_lock(&fc->lock);
-               req->locked = 0;
-               if (req->aborted)
-                       wake_up(&req->waitq);
-               spin_unlock(&fc->lock);
+               spin_lock(&req->waitq.lock);
+               if (test_bit(FR_ABORTED, &req->flags))
+                       err = -ENOENT;
+               else
+                       clear_bit(FR_LOCKED, &req->flags);
+               spin_unlock(&req->waitq.lock);
        }
+       return err;
 }
 
 struct fuse_copy_state {
-       struct fuse_conn *fc;
        int write;
        struct fuse_req *req;
        struct iov_iter *iter;
@@ -721,13 +707,10 @@ struct fuse_copy_state {
        unsigned move_pages:1;
 };
 
-static void fuse_copy_init(struct fuse_copy_state *cs,
-                          struct fuse_conn *fc,
-                          int write,
+static void fuse_copy_init(struct fuse_copy_state *cs, int write,
                           struct iov_iter *iter)
 {
        memset(cs, 0, sizeof(*cs));
-       cs->fc = fc;
        cs->write = write;
        cs->iter = iter;
 }
@@ -760,7 +743,10 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
        struct page *page;
        int err;
 
-       unlock_request(cs->fc, cs->req);
+       err = unlock_request(cs->req);
+       if (err)
+               return err;
+
        fuse_copy_finish(cs);
        if (cs->pipebufs) {
                struct pipe_buffer *buf = cs->pipebufs;
@@ -809,7 +795,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
                iov_iter_advance(cs->iter, err);
        }
 
-       return lock_request(cs->fc, cs->req);
+       return lock_request(cs->req);
 }
 
 /* Do as much copy to/from userspace buffer as we can */
@@ -860,7 +846,10 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
        struct page *newpage;
        struct pipe_buffer *buf = cs->pipebufs;
 
-       unlock_request(cs->fc, cs->req);
+       err = unlock_request(cs->req);
+       if (err)
+               return err;
+
        fuse_copy_finish(cs);
 
        err = buf->ops->confirm(cs->pipe, buf);
@@ -914,12 +903,12 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
                lru_cache_add_file(newpage);
 
        err = 0;
-       spin_lock(&cs->fc->lock);
-       if (cs->req->aborted)
+       spin_lock(&cs->req->waitq.lock);
+       if (test_bit(FR_ABORTED, &cs->req->flags))
                err = -ENOENT;
        else
                *pagep = newpage;
-       spin_unlock(&cs->fc->lock);
+       spin_unlock(&cs->req->waitq.lock);
 
        if (err) {
                unlock_page(newpage);
@@ -939,7 +928,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
        cs->pg = buf->page;
        cs->offset = buf->offset;
 
-       err = lock_request(cs->fc, cs->req);
+       err = lock_request(cs->req);
        if (err)
                return err;
 
@@ -950,11 +939,15 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
                         unsigned offset, unsigned count)
 {
        struct pipe_buffer *buf;
+       int err;
 
        if (cs->nr_segs == cs->pipe->buffers)
                return -EIO;
 
-       unlock_request(cs->fc, cs->req);
+       err = unlock_request(cs->req);
+       if (err)
+               return err;
+
        fuse_copy_finish(cs);
 
        buf = cs->pipebufs;
@@ -1065,36 +1058,15 @@ static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
        return err;
 }
 
-static int forget_pending(struct fuse_conn *fc)
+static int forget_pending(struct fuse_iqueue *fiq)
 {
-       return fc->forget_list_head.next != NULL;
+       return fiq->forget_list_head.next != NULL;
 }
 
-static int request_pending(struct fuse_conn *fc)
+static int request_pending(struct fuse_iqueue *fiq)
 {
-       return !list_empty(&fc->pending) || !list_empty(&fc->interrupts) ||
-               forget_pending(fc);
-}
-
-/* Wait until a request is available on the pending list */
-static void request_wait(struct fuse_conn *fc)
-__releases(fc->lock)
-__acquires(fc->lock)
-{
-       DECLARE_WAITQUEUE(wait, current);
-
-       add_wait_queue_exclusive(&fc->waitq, &wait);
-       while (fc->connected && !request_pending(fc)) {
-               set_current_state(TASK_INTERRUPTIBLE);
-               if (signal_pending(current))
-                       break;
-
-               spin_unlock(&fc->lock);
-               schedule();
-               spin_lock(&fc->lock);
-       }
-       set_current_state(TASK_RUNNING);
-       remove_wait_queue(&fc->waitq, &wait);
+       return !list_empty(&fiq->pending) || !list_empty(&fiq->interrupts) ||
+               forget_pending(fiq);
 }
 
 /*
@@ -1103,11 +1075,12 @@ __acquires(fc->lock)
  * Unlike other requests this is assembled on demand, without a need
  * to allocate a separate fuse_req structure.
  *
- * Called with fc->lock held, releases it
+ * Called with fiq->waitq.lock held, releases it
  */
-static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs,
+static int fuse_read_interrupt(struct fuse_iqueue *fiq,
+                              struct fuse_copy_state *cs,
                               size_t nbytes, struct fuse_req *req)
-__releases(fc->lock)
+__releases(fiq->waitq.lock)
 {
        struct fuse_in_header ih;
        struct fuse_interrupt_in arg;
@@ -1115,7 +1088,7 @@ __releases(fc->lock)
        int err;
 
        list_del_init(&req->intr_entry);
-       req->intr_unique = fuse_get_unique(fc);
+       req->intr_unique = fuse_get_unique(fiq);
        memset(&ih, 0, sizeof(ih));
        memset(&arg, 0, sizeof(arg));
        ih.len = reqsize;
@@ -1123,7 +1096,7 @@ __releases(fc->lock)
        ih.unique = req->intr_unique;
        arg.unique = req->in.h.unique;
 
-       spin_unlock(&fc->lock);
+       spin_unlock(&fiq->waitq.lock);
        if (nbytes < reqsize)
                return -EINVAL;
 
@@ -1135,21 +1108,21 @@ __releases(fc->lock)
        return err ? err : reqsize;
 }
 
-static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc,
+static struct fuse_forget_link *dequeue_forget(struct fuse_iqueue *fiq,
                                               unsigned max,
                                               unsigned *countp)
 {
-       struct fuse_forget_link *head = fc->forget_list_head.next;
+       struct fuse_forget_link *head = fiq->forget_list_head.next;
        struct fuse_forget_link **newhead = &head;
        unsigned count;
 
        for (count = 0; *newhead != NULL && count < max; count++)
                newhead = &(*newhead)->next;
 
-       fc->forget_list_head.next = *newhead;
+       fiq->forget_list_head.next = *newhead;
        *newhead = NULL;
-       if (fc->forget_list_head.next == NULL)
-               fc->forget_list_tail = &fc->forget_list_head;
+       if (fiq->forget_list_head.next == NULL)
+               fiq->forget_list_tail = &fiq->forget_list_head;
 
        if (countp != NULL)
                *countp = count;
@@ -1157,24 +1130,24 @@ static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc,
        return head;
 }
 
-static int fuse_read_single_forget(struct fuse_conn *fc,
+static int fuse_read_single_forget(struct fuse_iqueue *fiq,
                                   struct fuse_copy_state *cs,
                                   size_t nbytes)
-__releases(fc->lock)
+__releases(fiq->waitq.lock)
 {
        int err;
-       struct fuse_forget_link *forget = dequeue_forget(fc, 1, NULL);
+       struct fuse_forget_link *forget = dequeue_forget(fiq, 1, NULL);
        struct fuse_forget_in arg = {
                .nlookup = forget->forget_one.nlookup,
        };
        struct fuse_in_header ih = {
                .opcode = FUSE_FORGET,
                .nodeid = forget->forget_one.nodeid,
-               .unique = fuse_get_unique(fc),
+               .unique = fuse_get_unique(fiq),
                .len = sizeof(ih) + sizeof(arg),
        };
 
-       spin_unlock(&fc->lock);
+       spin_unlock(&fiq->waitq.lock);
        kfree(forget);
        if (nbytes < ih.len)
                return -EINVAL;
@@ -1190,9 +1163,9 @@ __releases(fc->lock)
        return ih.len;
 }
 
-static int fuse_read_batch_forget(struct fuse_conn *fc,
+static int fuse_read_batch_forget(struct fuse_iqueue *fiq,
                                   struct fuse_copy_state *cs, size_t nbytes)
-__releases(fc->lock)
+__releases(fiq->waitq.lock)
 {
        int err;
        unsigned max_forgets;
@@ -1201,18 +1174,18 @@ __releases(fc->lock)
        struct fuse_batch_forget_in arg = { .count = 0 };
        struct fuse_in_header ih = {
                .opcode = FUSE_BATCH_FORGET,
-               .unique = fuse_get_unique(fc),
+               .unique = fuse_get_unique(fiq),
                .len = sizeof(ih) + sizeof(arg),
        };
 
        if (nbytes < ih.len) {
-               spin_unlock(&fc->lock);
+               spin_unlock(&fiq->waitq.lock);
                return -EINVAL;
        }
 
        max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one);
-       head = dequeue_forget(fc, max_forgets, &count);
-       spin_unlock(&fc->lock);
+       head = dequeue_forget(fiq, max_forgets, &count);
+       spin_unlock(&fiq->waitq.lock);
 
        arg.count = count;
        ih.len += count * sizeof(struct fuse_forget_one);
@@ -1239,14 +1212,15 @@ __releases(fc->lock)
        return ih.len;
 }
 
-static int fuse_read_forget(struct fuse_conn *fc, struct fuse_copy_state *cs,
+static int fuse_read_forget(struct fuse_conn *fc, struct fuse_iqueue *fiq,
+                           struct fuse_copy_state *cs,
                            size_t nbytes)
-__releases(fc->lock)
+__releases(fiq->waitq.lock)
 {
-       if (fc->minor < 16 || fc->forget_list_head.next->next == NULL)
-               return fuse_read_single_forget(fc, cs, nbytes);
+       if (fc->minor < 16 || fiq->forget_list_head.next->next == NULL)
+               return fuse_read_single_forget(fiq, cs, nbytes);
        else
-               return fuse_read_batch_forget(fc, cs, nbytes);
+               return fuse_read_batch_forget(fiq, cs, nbytes);
 }
 
 /*
@@ -1258,46 +1232,51 @@ __releases(fc->lock)
  * request_end().  Otherwise add it to the processing list, and set
  * the 'sent' flag.
  */
-static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
+static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
                                struct fuse_copy_state *cs, size_t nbytes)
 {
-       int err;
+       ssize_t err;
+       struct fuse_conn *fc = fud->fc;
+       struct fuse_iqueue *fiq = &fc->iq;
+       struct fuse_pqueue *fpq = &fud->pq;
        struct fuse_req *req;
        struct fuse_in *in;
        unsigned reqsize;
 
  restart:
-       spin_lock(&fc->lock);
+       spin_lock(&fiq->waitq.lock);
        err = -EAGAIN;
-       if ((file->f_flags & O_NONBLOCK) && fc->connected &&
-           !request_pending(fc))
+       if ((file->f_flags & O_NONBLOCK) && fiq->connected &&
+           !request_pending(fiq))
                goto err_unlock;
 
-       request_wait(fc);
-       err = -ENODEV;
-       if (!fc->connected)
+       err = wait_event_interruptible_exclusive_locked(fiq->waitq,
+                               !fiq->connected || request_pending(fiq));
+       if (err)
                goto err_unlock;
-       err = -ERESTARTSYS;
-       if (!request_pending(fc))
+
+       err = -ENODEV;
+       if (!fiq->connected)
                goto err_unlock;
 
-       if (!list_empty(&fc->interrupts)) {
-               req = list_entry(fc->interrupts.next, struct fuse_req,
+       if (!list_empty(&fiq->interrupts)) {
+               req = list_entry(fiq->interrupts.next, struct fuse_req,
                                 intr_entry);
-               return fuse_read_interrupt(fc, cs, nbytes, req);
+               return fuse_read_interrupt(fiq, cs, nbytes, req);
        }
 
-       if (forget_pending(fc)) {
-               if (list_empty(&fc->pending) || fc->forget_batch-- > 0)
-                       return fuse_read_forget(fc, cs, nbytes);
+       if (forget_pending(fiq)) {
+               if (list_empty(&fiq->pending) || fiq->forget_batch-- > 0)
+                       return fuse_read_forget(fc, fiq, cs, nbytes);
 
-               if (fc->forget_batch <= -8)
-                       fc->forget_batch = 16;
+               if (fiq->forget_batch <= -8)
+                       fiq->forget_batch = 16;
        }
 
-       req = list_entry(fc->pending.next, struct fuse_req, list);
-       req->state = FUSE_REQ_READING;
-       list_move(&req->list, &fc->io);
+       req = list_entry(fiq->pending.next, struct fuse_req, list);
+       clear_bit(FR_PENDING, &req->flags);
+       list_del_init(&req->list);
+       spin_unlock(&fiq->waitq.lock);
 
        in = &req->in;
        reqsize = in->h.len;
@@ -1310,37 +1289,48 @@ static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
                request_end(fc, req);
                goto restart;
        }
-       spin_unlock(&fc->lock);
+       spin_lock(&fpq->lock);
+       list_add(&req->list, &fpq->io);
+       spin_unlock(&fpq->lock);
        cs->req = req;
        err = fuse_copy_one(cs, &in->h, sizeof(in->h));
        if (!err)
                err = fuse_copy_args(cs, in->numargs, in->argpages,
                                     (struct fuse_arg *) in->args, 0);
        fuse_copy_finish(cs);
-       spin_lock(&fc->lock);
-       req->locked = 0;
-       if (req->aborted) {
-               request_end(fc, req);
-               return -ENODEV;
+       spin_lock(&fpq->lock);
+       clear_bit(FR_LOCKED, &req->flags);
+       if (!fpq->connected) {
+               err = -ENODEV;
+               goto out_end;
        }
        if (err) {
                req->out.h.error = -EIO;
-               request_end(fc, req);
-               return err;
+               goto out_end;
        }
-       if (!req->isreply)
-               request_end(fc, req);
-       else {
-               req->state = FUSE_REQ_SENT;
-               list_move_tail(&req->list, &fc->processing);
-               if (req->interrupted)
-                       queue_interrupt(fc, req);
-               spin_unlock(&fc->lock);
+       if (!test_bit(FR_ISREPLY, &req->flags)) {
+               err = reqsize;
+               goto out_end;
        }
+       list_move_tail(&req->list, &fpq->processing);
+       spin_unlock(&fpq->lock);
+       set_bit(FR_SENT, &req->flags);
+       /* matches barrier in request_wait_answer() */
+       smp_mb__after_atomic();
+       if (test_bit(FR_INTERRUPTED, &req->flags))
+               queue_interrupt(fiq, req);
+
        return reqsize;
 
+out_end:
+       if (!test_bit(FR_PRIVATE, &req->flags))
+               list_del_init(&req->list);
+       spin_unlock(&fpq->lock);
+       request_end(fc, req);
+       return err;
+
  err_unlock:
-       spin_unlock(&fc->lock);
+       spin_unlock(&fiq->waitq.lock);
        return err;
 }
 
@@ -1359,16 +1349,17 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to)
 {
        struct fuse_copy_state cs;
        struct file *file = iocb->ki_filp;
-       struct fuse_conn *fc = fuse_get_conn(file);
-       if (!fc)
+       struct fuse_dev *fud = fuse_get_dev(file);
+
+       if (!fud)
                return -EPERM;
 
        if (!iter_is_iovec(to))
                return -EINVAL;
 
-       fuse_copy_init(&cs, fc, 1, to);
+       fuse_copy_init(&cs, 1, to);
 
-       return fuse_dev_do_read(fc, file, &cs, iov_iter_count(to));
+       return fuse_dev_do_read(fud, file, &cs, iov_iter_count(to));
 }
 
 static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
@@ -1380,18 +1371,19 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
        int do_wakeup = 0;
        struct pipe_buffer *bufs;
        struct fuse_copy_state cs;
-       struct fuse_conn *fc = fuse_get_conn(in);
-       if (!fc)
+       struct fuse_dev *fud = fuse_get_dev(in);
+
+       if (!fud)
                return -EPERM;
 
        bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL);
        if (!bufs)
                return -ENOMEM;
 
-       fuse_copy_init(&cs, fc, 1, NULL);
+       fuse_copy_init(&cs, 1, NULL);
        cs.pipebufs = bufs;
        cs.pipe = pipe;
-       ret = fuse_dev_do_read(fc, in, &cs, len);
+       ret = fuse_dev_do_read(fud, in, &cs, len);
        if (ret < 0)
                goto out;
 
@@ -1830,11 +1822,11 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
 }
 
 /* Look up request on processing list by unique ID */
-static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
+static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique)
 {
        struct fuse_req *req;
 
-       list_for_each_entry(req, &fc->processing, list) {
+       list_for_each_entry(req, &fpq->processing, list) {
                if (req->in.h.unique == unique || req->intr_unique == unique)
                        return req;
        }
@@ -1871,10 +1863,12 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
  * it from the list and copy the rest of the buffer to the request.
  * The request is finished by calling request_end()
  */
-static ssize_t fuse_dev_do_write(struct fuse_conn *fc,
+static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
                                 struct fuse_copy_state *cs, size_t nbytes)
 {
        int err;
+       struct fuse_conn *fc = fud->fc;
+       struct fuse_pqueue *fpq = &fud->pq;
        struct fuse_req *req;
        struct fuse_out_header oh;
 
@@ -1902,63 +1896,60 @@ static ssize_t fuse_dev_do_write(struct fuse_conn *fc,
        if (oh.error <= -1000 || oh.error > 0)
                goto err_finish;
 
-       spin_lock(&fc->lock);
+       spin_lock(&fpq->lock);
        err = -ENOENT;
-       if (!fc->connected)
-               goto err_unlock;
+       if (!fpq->connected)
+               goto err_unlock_pq;
 
-       req = request_find(fc, oh.unique);
+       req = request_find(fpq, oh.unique);
        if (!req)
-               goto err_unlock;
+               goto err_unlock_pq;
 
-       if (req->aborted) {
-               spin_unlock(&fc->lock);
-               fuse_copy_finish(cs);
-               spin_lock(&fc->lock);
-               request_end(fc, req);
-               return -ENOENT;
-       }
        /* Is it an interrupt reply? */
        if (req->intr_unique == oh.unique) {
+               spin_unlock(&fpq->lock);
+
                err = -EINVAL;
                if (nbytes != sizeof(struct fuse_out_header))
-                       goto err_unlock;
+                       goto err_finish;
 
                if (oh.error == -ENOSYS)
                        fc->no_interrupt = 1;
                else if (oh.error == -EAGAIN)
-                       queue_interrupt(fc, req);
+                       queue_interrupt(&fc->iq, req);
 
-               spin_unlock(&fc->lock);
                fuse_copy_finish(cs);
                return nbytes;
        }
 
-       req->state = FUSE_REQ_WRITING;
-       list_move(&req->list, &fc->io);
+       clear_bit(FR_SENT, &req->flags);
+       list_move(&req->list, &fpq->io);
        req->out.h = oh;
-       req->locked = 1;
+       set_bit(FR_LOCKED, &req->flags);
+       spin_unlock(&fpq->lock);
        cs->req = req;
        if (!req->out.page_replace)
                cs->move_pages = 0;
-       spin_unlock(&fc->lock);
 
        err = copy_out_args(cs, &req->out, nbytes);
        fuse_copy_finish(cs);
 
-       spin_lock(&fc->lock);
-       req->locked = 0;
-       if (!err) {
-               if (req->aborted)
-                       err = -ENOENT;
-       } else if (!req->aborted)
+       spin_lock(&fpq->lock);
+       clear_bit(FR_LOCKED, &req->flags);
+       if (!fpq->connected)
+               err = -ENOENT;
+       else if (err)
                req->out.h.error = -EIO;
+       if (!test_bit(FR_PRIVATE, &req->flags))
+               list_del_init(&req->list);
+       spin_unlock(&fpq->lock);
+
        request_end(fc, req);
 
        return err ? err : nbytes;
 
- err_unlock:
-       spin_unlock(&fc->lock);
+ err_unlock_pq:
+       spin_unlock(&fpq->lock);
  err_finish:
        fuse_copy_finish(cs);
        return err;
@@ -1967,16 +1958,17 @@ static ssize_t fuse_dev_do_write(struct fuse_conn *fc,
 static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from)
 {
        struct fuse_copy_state cs;
-       struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp);
-       if (!fc)
+       struct fuse_dev *fud = fuse_get_dev(iocb->ki_filp);
+
+       if (!fud)
                return -EPERM;
 
        if (!iter_is_iovec(from))
                return -EINVAL;
 
-       fuse_copy_init(&cs, fc, 0, from);
+       fuse_copy_init(&cs, 0, from);
 
-       return fuse_dev_do_write(fc, &cs, iov_iter_count(from));
+       return fuse_dev_do_write(fud, &cs, iov_iter_count(from));
 }
 
 static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
@@ -1987,12 +1979,12 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
        unsigned idx;
        struct pipe_buffer *bufs;
        struct fuse_copy_state cs;
-       struct fuse_conn *fc;
+       struct fuse_dev *fud;
        size_t rem;
        ssize_t ret;
 
-       fc = fuse_get_conn(out);
-       if (!fc)
+       fud = fuse_get_dev(out);
+       if (!fud)
                return -EPERM;
 
        bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL);
@@ -2039,7 +2031,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
        }
        pipe_unlock(pipe);
 
-       fuse_copy_init(&cs, fc, 0, NULL);
+       fuse_copy_init(&cs, 0, NULL);
        cs.pipebufs = bufs;
        cs.nr_segs = nbuf;
        cs.pipe = pipe;
@@ -2047,7 +2039,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
        if (flags & SPLICE_F_MOVE)
                cs.move_pages = 1;
 
-       ret = fuse_dev_do_write(fc, &cs, len);
+       ret = fuse_dev_do_write(fud, &cs, len);
 
        for (idx = 0; idx < nbuf; idx++) {
                struct pipe_buffer *buf = &bufs[idx];
@@ -2061,18 +2053,21 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
 static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
 {
        unsigned mask = POLLOUT | POLLWRNORM;
-       struct fuse_conn *fc = fuse_get_conn(file);
-       if (!fc)
+       struct fuse_iqueue *fiq;
+       struct fuse_dev *fud = fuse_get_dev(file);
+
+       if (!fud)
                return POLLERR;
 
-       poll_wait(file, &fc->waitq, wait);
+       fiq = &fud->fc->iq;
+       poll_wait(file, &fiq->waitq, wait);
 
-       spin_lock(&fc->lock);
-       if (!fc->connected)
+       spin_lock(&fiq->waitq.lock);
+       if (!fiq->connected)
                mask = POLLERR;
-       else if (request_pending(fc))
+       else if (request_pending(fiq))
                mask |= POLLIN | POLLRDNORM;
-       spin_unlock(&fc->lock);
+       spin_unlock(&fiq->waitq.lock);
 
        return mask;
 }
@@ -2083,67 +2078,18 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
  * This function releases and reacquires fc->lock
  */
 static void end_requests(struct fuse_conn *fc, struct list_head *head)
-__releases(fc->lock)
-__acquires(fc->lock)
 {
        while (!list_empty(head)) {
                struct fuse_req *req;
                req = list_entry(head->next, struct fuse_req, list);
                req->out.h.error = -ECONNABORTED;
-               request_end(fc, req);
-               spin_lock(&fc->lock);
-       }
-}
-
-/*
- * Abort requests under I/O
- *
- * The requests are set to aborted and finished, and the request
- * waiter is woken up.  This will make request_wait_answer() wait
- * until the request is unlocked and then return.
- *
- * If the request is asynchronous, then the end function needs to be
- * called after waiting for the request to be unlocked (if it was
- * locked).
- */
-static void end_io_requests(struct fuse_conn *fc)
-__releases(fc->lock)
-__acquires(fc->lock)
-{
-       while (!list_empty(&fc->io)) {
-               struct fuse_req *req =
-                       list_entry(fc->io.next, struct fuse_req, list);
-               void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
-
-               req->aborted = 1;
-               req->out.h.error = -ECONNABORTED;
-               req->state = FUSE_REQ_FINISHED;
+               clear_bit(FR_PENDING, &req->flags);
+               clear_bit(FR_SENT, &req->flags);
                list_del_init(&req->list);
-               wake_up(&req->waitq);
-               if (end) {
-                       req->end = NULL;
-                       __fuse_get_request(req);
-                       spin_unlock(&fc->lock);
-                       wait_event(req->waitq, !req->locked);
-                       end(fc, req);
-                       fuse_put_request(fc, req);
-                       spin_lock(&fc->lock);
-               }
+               request_end(fc, req);
        }
 }
 
-static void end_queued_requests(struct fuse_conn *fc)
-__releases(fc->lock)
-__acquires(fc->lock)
-{
-       fc->max_background = UINT_MAX;
-       flush_bg_queue(fc);
-       end_requests(fc, &fc->pending);
-       end_requests(fc, &fc->processing);
-       while (forget_pending(fc))
-               kfree(dequeue_forget(fc, 1, NULL));
-}
-
 static void end_polls(struct fuse_conn *fc)
 {
        struct rb_node *p;
@@ -2162,67 +2108,156 @@ static void end_polls(struct fuse_conn *fc)
 /*
  * Abort all requests.
  *
- * Emergency exit in case of a malicious or accidental deadlock, or
- * just a hung filesystem.
+ * Emergency exit in case of a malicious or accidental deadlock, or just a hung
+ * filesystem.
  *
- * The same effect is usually achievable through killing the
- * filesystem daemon and all users of the filesystem.  The exception
- * is the combination of an asynchronous request and the tricky
- * deadlock (see Documentation/filesystems/fuse.txt).
+ * The same effect is usually achievable through killing the filesystem daemon
+ * and all users of the filesystem.  The exception is the combination of an
+ * asynchronous request and the tricky deadlock (see
+ * Documentation/filesystems/fuse.txt).
  *
- * During the aborting, progression of requests from the pending and
- * processing lists onto the io list, and progression of new requests
- * onto the pending list is prevented by req->connected being false.
- *
- * Progression of requests under I/O to the processing list is
- * prevented by the req->aborted flag being true for these requests.
- * For this reason requests on the io list must be aborted first.
+ * Aborting requests under I/O goes as follows: 1: Separate out unlocked
+ * requests, they should be finished off immediately.  Locked requests will be
+ * finished after unlock; see unlock_request(). 2: Finish off the unlocked
+ * requests.  It is possible that some request will finish before we can.  This
+ * is OK, the request will in that case be removed from the list before we touch
+ * it.
  */
 void fuse_abort_conn(struct fuse_conn *fc)
 {
+       struct fuse_iqueue *fiq = &fc->iq;
+
        spin_lock(&fc->lock);
        if (fc->connected) {
+               struct fuse_dev *fud;
+               struct fuse_req *req, *next;
+               LIST_HEAD(to_end1);
+               LIST_HEAD(to_end2);
+
                fc->connected = 0;
                fc->blocked = 0;
                fuse_set_initialized(fc);
-               end_io_requests(fc);
-               end_queued_requests(fc);
+               list_for_each_entry(fud, &fc->devices, entry) {
+                       struct fuse_pqueue *fpq = &fud->pq;
+
+                       spin_lock(&fpq->lock);
+                       fpq->connected = 0;
+                       list_for_each_entry_safe(req, next, &fpq->io, list) {
+                               req->out.h.error = -ECONNABORTED;
+                               spin_lock(&req->waitq.lock);
+                               set_bit(FR_ABORTED, &req->flags);
+                               if (!test_bit(FR_LOCKED, &req->flags)) {
+                                       set_bit(FR_PRIVATE, &req->flags);
+                                       list_move(&req->list, &to_end1);
+                               }
+                               spin_unlock(&req->waitq.lock);
+                       }
+                       list_splice_init(&fpq->processing, &to_end2);
+                       spin_unlock(&fpq->lock);
+               }
+               fc->max_background = UINT_MAX;
+               flush_bg_queue(fc);
+
+               spin_lock(&fiq->waitq.lock);
+               fiq->connected = 0;
+               list_splice_init(&fiq->pending, &to_end2);
+               while (forget_pending(fiq))
+                       kfree(dequeue_forget(fiq, 1, NULL));
+               wake_up_all_locked(&fiq->waitq);
+               spin_unlock(&fiq->waitq.lock);
+               kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
                end_polls(fc);
-               wake_up_all(&fc->waitq);
                wake_up_all(&fc->blocked_waitq);
-               kill_fasync(&fc->fasync, SIGIO, POLL_IN);
+               spin_unlock(&fc->lock);
+
+               while (!list_empty(&to_end1)) {
+                       req = list_first_entry(&to_end1, struct fuse_req, list);
+                       __fuse_get_request(req);
+                       list_del_init(&req->list);
+                       request_end(fc, req);
+               }
+               end_requests(fc, &to_end2);
+       } else {
+               spin_unlock(&fc->lock);
        }
-       spin_unlock(&fc->lock);
 }
 EXPORT_SYMBOL_GPL(fuse_abort_conn);
 
 int fuse_dev_release(struct inode *inode, struct file *file)
 {
-       struct fuse_conn *fc = fuse_get_conn(file);
-       if (fc) {
-               spin_lock(&fc->lock);
-               fc->connected = 0;
-               fc->blocked = 0;
-               fuse_set_initialized(fc);
-               end_queued_requests(fc);
-               end_polls(fc);
-               wake_up_all(&fc->blocked_waitq);
-               spin_unlock(&fc->lock);
-               fuse_conn_put(fc);
-       }
+       struct fuse_dev *fud = fuse_get_dev(file);
 
+       if (fud) {
+               struct fuse_conn *fc = fud->fc;
+               struct fuse_pqueue *fpq = &fud->pq;
+
+               WARN_ON(!list_empty(&fpq->io));
+               end_requests(fc, &fpq->processing);
+               /* Are we the last open device? */
+               if (atomic_dec_and_test(&fc->dev_count)) {
+                       WARN_ON(fc->iq.fasync != NULL);
+                       fuse_abort_conn(fc);
+               }
+               fuse_dev_free(fud);
+       }
        return 0;
 }
 EXPORT_SYMBOL_GPL(fuse_dev_release);
 
 static int fuse_dev_fasync(int fd, struct file *file, int on)
 {
-       struct fuse_conn *fc = fuse_get_conn(file);
-       if (!fc)
+       struct fuse_dev *fud = fuse_get_dev(file);
+
+       if (!fud)
                return -EPERM;
 
        /* No locking - fasync_helper does its own locking */
-       return fasync_helper(fd, file, on, &fc->fasync);
+       return fasync_helper(fd, file, on, &fud->fc->iq.fasync);
+}
+
+static int fuse_device_clone(struct fuse_conn *fc, struct file *new)
+{
+       struct fuse_dev *fud;
+
+       if (new->private_data)
+               return -EINVAL;
+
+       fud = fuse_dev_alloc(fc);
+       if (!fud)
+               return -ENOMEM;
+
+       new->private_data = fud;
+       atomic_inc(&fc->dev_count);
+
+       return 0;
+}
+
+static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
+                          unsigned long arg)
+{
+       int err = -ENOTTY;
+
+       if (cmd == FUSE_DEV_IOC_CLONE) {
+               int oldfd;
+
+               err = -EFAULT;
+               if (!get_user(oldfd, (__u32 __user *) arg)) {
+                       struct file *old = fget(oldfd);
+
+                       err = -EINVAL;
+                       if (old) {
+                               struct fuse_dev *fud = fuse_get_dev(old);
+
+                               if (fud) {
+                                       mutex_lock(&fuse_mutex);
+                                       err = fuse_device_clone(fud->fc, file);
+                                       mutex_unlock(&fuse_mutex);
+                               }
+                               fput(old);
+                       }
+               }
+       }
+       return err;
 }
 
 const struct file_operations fuse_dev_operations = {
@@ -2236,6 +2271,8 @@ const struct file_operations fuse_dev_operations = {
        .poll           = fuse_dev_poll,
        .release        = fuse_dev_release,
        .fasync         = fuse_dev_fasync,
+       .unlocked_ioctl = fuse_dev_ioctl,
+       .compat_ioctl   = fuse_dev_ioctl,
 };
 EXPORT_SYMBOL_GPL(fuse_dev_operations);
 
index 8c5e2fa68835a07216d250dc9536aafaa1063dc2..014fa8ba2b5189e923557c446be8f150921f9b28 100644 (file)
@@ -96,17 +96,17 @@ static void fuse_file_put(struct fuse_file *ff, bool sync)
                         * Drop the release request when client does not
                         * implement 'open'
                         */
-                       req->background = 0;
+                       __clear_bit(FR_BACKGROUND, &req->flags);
                        iput(req->misc.release.inode);
                        fuse_put_request(ff->fc, req);
                } else if (sync) {
-                       req->background = 0;
+                       __clear_bit(FR_BACKGROUND, &req->flags);
                        fuse_request_send(ff->fc, req);
                        iput(req->misc.release.inode);
                        fuse_put_request(ff->fc, req);
                } else {
                        req->end = fuse_release_end;
-                       req->background = 1;
+                       __set_bit(FR_BACKGROUND, &req->flags);
                        fuse_request_send_background(ff->fc, req);
                }
                kfree(ff);
@@ -299,8 +299,8 @@ void fuse_sync_release(struct fuse_file *ff, int flags)
 {
        WARN_ON(atomic_read(&ff->count) > 1);
        fuse_prepare_release(ff, flags, FUSE_RELEASE);
-       ff->reserved_req->force = 1;
-       ff->reserved_req->background = 0;
+       __set_bit(FR_FORCE, &ff->reserved_req->flags);
+       __clear_bit(FR_BACKGROUND, &ff->reserved_req->flags);
        fuse_request_send(ff->fc, ff->reserved_req);
        fuse_put_request(ff->fc, ff->reserved_req);
        kfree(ff);
@@ -426,7 +426,7 @@ static int fuse_flush(struct file *file, fl_owner_t id)
        req->in.numargs = 1;
        req->in.args[0].size = sizeof(inarg);
        req->in.args[0].value = &inarg;
-       req->force = 1;
+       __set_bit(FR_FORCE, &req->flags);
        fuse_request_send(fc, req);
        err = req->out.h.error;
        fuse_put_request(fc, req);
@@ -1611,7 +1611,8 @@ static int fuse_writepage_locked(struct page *page)
        if (!req)
                goto err;
 
-       req->background = 1; /* writeback always goes to bg_queue */
+       /* writeback always goes to bg_queue */
+       __set_bit(FR_BACKGROUND, &req->flags);
        tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
        if (!tmp_page)
                goto err_free;
@@ -1742,8 +1743,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
                }
        }
 
-       if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT ||
-                                       old_req->state == FUSE_REQ_PENDING)) {
+       if (old_req->num_pages == 1 && test_bit(FR_PENDING, &old_req->flags)) {
                struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host);
 
                copy_highpage(old_req->pages[0], page);
@@ -1830,7 +1830,7 @@ static int fuse_writepages_fill(struct page *page,
                req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
                req->misc.write.next = NULL;
                req->in.argpages = 1;
-               req->background = 1;
+               __set_bit(FR_BACKGROUND, &req->flags);
                req->num_pages = 0;
                req->end = fuse_writepage_end;
                req->inode = inode;
index 7354dc142a50845a62e9a413d82d185afc1f5b0d..405113101db8d868fcb40c34199978be576d0961 100644 (file)
@@ -241,16 +241,6 @@ struct fuse_args {
 
 #define FUSE_ARGS(args) struct fuse_args args = {}
 
-/** The request state */
-enum fuse_req_state {
-       FUSE_REQ_INIT = 0,
-       FUSE_REQ_PENDING,
-       FUSE_REQ_READING,
-       FUSE_REQ_SENT,
-       FUSE_REQ_WRITING,
-       FUSE_REQ_FINISHED
-};
-
 /** The request IO state (for asynchronous processing) */
 struct fuse_io_priv {
        int async;
@@ -266,8 +256,41 @@ struct fuse_io_priv {
        struct completion *done;
 };
 
+/**
+ * Request flags
+ *
+ * FR_ISREPLY:         set if the request has a reply
+ * FR_FORCE:           force sending of the request even if interrupted
+ * FR_BACKGROUND:      request is sent in the background
+ * FR_WAITING:         request is counted as "waiting"
+ * FR_ABORTED:         the request was aborted
+ * FR_INTERRUPTED:     the request has been interrupted
+ * FR_LOCKED:          data is being copied to/from the request
+ * FR_PENDING:         request is not yet in userspace
+ * FR_SENT:            request is in userspace, waiting for an answer
+ * FR_FINISHED:                request is finished
+ * FR_PRIVATE:         request is on private list
+ */
+enum fuse_req_flag {
+       FR_ISREPLY,
+       FR_FORCE,
+       FR_BACKGROUND,
+       FR_WAITING,
+       FR_ABORTED,
+       FR_INTERRUPTED,
+       FR_LOCKED,
+       FR_PENDING,
+       FR_SENT,
+       FR_FINISHED,
+       FR_PRIVATE,
+};
+
 /**
  * A request to the client
+ *
+ * .waitq.lock protects the following fields:
+ *   - FR_ABORTED
+ *   - FR_LOCKED (may also be modified under fc->lock, tested under both)
  */
 struct fuse_req {
        /** This can be on either pending processing or io lists in
@@ -283,35 +306,8 @@ struct fuse_req {
        /** Unique ID for the interrupt request */
        u64 intr_unique;
 
-       /*
-        * The following bitfields are either set once before the
-        * request is queued or setting/clearing them is protected by
-        * fuse_conn->lock
-        */
-
-       /** True if the request has reply */
-       unsigned isreply:1;
-
-       /** Force sending of the request even if interrupted */
-       unsigned force:1;
-
-       /** The request was aborted */
-       unsigned aborted:1;
-
-       /** Request is sent in the background */
-       unsigned background:1;
-
-       /** The request has been interrupted */
-       unsigned interrupted:1;
-
-       /** Data is being copied to/from the request */
-       unsigned locked:1;
-
-       /** Request is counted as "waiting" */
-       unsigned waiting:1;
-
-       /** State of the request */
-       enum fuse_req_state state;
+       /* Request flags, updated with test/set/clear_bit() */
+       unsigned long flags;
 
        /** The request input */
        struct fuse_in in;
@@ -380,6 +376,61 @@ struct fuse_req {
        struct file *stolen_file;
 };
 
+struct fuse_iqueue {
+       /** Connection established */
+       unsigned connected;
+
+       /** Readers of the connection are waiting on this */
+       wait_queue_head_t waitq;
+
+       /** The next unique request id */
+       u64 reqctr;
+
+       /** The list of pending requests */
+       struct list_head pending;
+
+       /** Pending interrupts */
+       struct list_head interrupts;
+
+       /** Queue of pending forgets */
+       struct fuse_forget_link forget_list_head;
+       struct fuse_forget_link *forget_list_tail;
+
+       /** Batching of FORGET requests (positive indicates FORGET batch) */
+       int forget_batch;
+
+       /** O_ASYNC requests */
+       struct fasync_struct *fasync;
+};
+
+struct fuse_pqueue {
+       /** Connection established */
+       unsigned connected;
+
+       /** Lock protecting accesses to members of this structure */
+       spinlock_t lock;
+
+       /** The list of requests being processed */
+       struct list_head processing;
+
+       /** The list of requests under I/O */
+       struct list_head io;
+};
+
+/**
+ * Fuse device instance
+ */
+struct fuse_dev {
+       /** Fuse connection for this device */
+       struct fuse_conn *fc;
+
+       /** Processing queue */
+       struct fuse_pqueue pq;
+
+       /** list entry on fc->devices */
+       struct list_head entry;
+};
+
 /**
  * A Fuse connection.
  *
@@ -394,6 +445,9 @@ struct fuse_conn {
        /** Refcount */
        atomic_t count;
 
+       /** Number of fuse_dev's */
+       atomic_t dev_count;
+
        struct rcu_head rcu;
 
        /** The user id for this mount */
@@ -411,17 +465,8 @@ struct fuse_conn {
        /** Maximum write size */
        unsigned max_write;
 
-       /** Readers of the connection are waiting on this */
-       wait_queue_head_t waitq;
-
-       /** The list of pending requests */
-       struct list_head pending;
-
-       /** The list of requests being processed */
-       struct list_head processing;
-
-       /** The list of requests under I/O */
-       struct list_head io;
+       /** Input queue */
+       struct fuse_iqueue iq;
 
        /** The next unique kernel file handle */
        u64 khctr;
@@ -444,16 +489,6 @@ struct fuse_conn {
        /** The list of background requests set aside for later queuing */
        struct list_head bg_queue;
 
-       /** Pending interrupts */
-       struct list_head interrupts;
-
-       /** Queue of pending forgets */
-       struct fuse_forget_link forget_list_head;
-       struct fuse_forget_link *forget_list_tail;
-
-       /** Batching of FORGET requests (positive indicates FORGET batch) */
-       int forget_batch;
-
        /** Flag indicating that INIT reply has been received. Allocating
         * any fuse request will be suspended until the flag is set */
        int initialized;
@@ -469,9 +504,6 @@ struct fuse_conn {
        /** waitq for reserved requests */
        wait_queue_head_t reserved_req_waitq;
 
-       /** The next unique request id */
-       u64 reqctr;
-
        /** Connection established, cleared on umount, connection
            abort and device release */
        unsigned connected;
@@ -594,9 +626,6 @@ struct fuse_conn {
        /** number of dentries used in the above array */
        int ctl_ndents;
 
-       /** O_ASYNC requests */
-       struct fasync_struct *fasync;
-
        /** Key for lock owner ID scrambling */
        u32 scramble_key[4];
 
@@ -614,6 +643,9 @@ struct fuse_conn {
 
        /** Read/write semaphore to hold when accessing sb. */
        struct rw_semaphore killsb;
+
+       /** List of device instances belonging to this connection */
+       struct list_head devices;
 };
 
 static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
@@ -826,6 +858,9 @@ void fuse_conn_init(struct fuse_conn *fc);
  */
 void fuse_conn_put(struct fuse_conn *fc);
 
+struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc);
+void fuse_dev_free(struct fuse_dev *fud);
+
 /**
  * Add connection to control filesystem
  */
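
The fuse_i.h hunks above replace the per-request bitfields and the fuse_req_state enum with FR_* bit numbers packed into a single unsigned long, updated through the generic atomic bitops. A small illustrative sketch (not part of the patch) of the resulting idiom:

/* Illustrative only: FR_* values are bit indices into req->flags. */
static void example_mark_background(struct fuse_req *req)
{
	/* was: req->background = 1; non-atomic variant is fine before queuing */
	__set_bit(FR_BACKGROUND, &req->flags);
}

static bool example_request_was_sent(struct fuse_req *req)
{
	/* was: req->state == FUSE_REQ_SENT */
	return test_bit(FR_SENT, &req->flags);
}
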
index 082ac1c97f397f7f8014c7896123f8c7a5eebe8f..ac81f48ab2f42e957b532d1162de0e1c9dba9cf6 100644 (file)
@@ -362,8 +362,8 @@ static void fuse_send_destroy(struct fuse_conn *fc)
        if (req && fc->conn_init) {
                fc->destroy_req = NULL;
                req->in.h.opcode = FUSE_DESTROY;
-               req->force = 1;
-               req->background = 0;
+               __set_bit(FR_FORCE, &req->flags);
+               __clear_bit(FR_BACKGROUND, &req->flags);
                fuse_request_send(fc, req);
                fuse_put_request(fc, req);
        }
@@ -567,30 +567,46 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root)
        return 0;
 }
 
+static void fuse_iqueue_init(struct fuse_iqueue *fiq)
+{
+       memset(fiq, 0, sizeof(struct fuse_iqueue));
+       init_waitqueue_head(&fiq->waitq);
+       INIT_LIST_HEAD(&fiq->pending);
+       INIT_LIST_HEAD(&fiq->interrupts);
+       fiq->forget_list_tail = &fiq->forget_list_head;
+       fiq->connected = 1;
+}
+
+static void fuse_pqueue_init(struct fuse_pqueue *fpq)
+{
+       memset(fpq, 0, sizeof(struct fuse_pqueue));
+       spin_lock_init(&fpq->lock);
+       INIT_LIST_HEAD(&fpq->processing);
+       INIT_LIST_HEAD(&fpq->io);
+       fpq->connected = 1;
+}
+
 void fuse_conn_init(struct fuse_conn *fc)
 {
        memset(fc, 0, sizeof(*fc));
        spin_lock_init(&fc->lock);
        init_rwsem(&fc->killsb);
        atomic_set(&fc->count, 1);
-       init_waitqueue_head(&fc->waitq);
+       atomic_set(&fc->dev_count, 1);
        init_waitqueue_head(&fc->blocked_waitq);
        init_waitqueue_head(&fc->reserved_req_waitq);
-       INIT_LIST_HEAD(&fc->pending);
-       INIT_LIST_HEAD(&fc->processing);
-       INIT_LIST_HEAD(&fc->io);
-       INIT_LIST_HEAD(&fc->interrupts);
+       fuse_iqueue_init(&fc->iq);
        INIT_LIST_HEAD(&fc->bg_queue);
        INIT_LIST_HEAD(&fc->entry);
-       fc->forget_list_tail = &fc->forget_list_head;
+       INIT_LIST_HEAD(&fc->devices);
        atomic_set(&fc->num_waiting, 0);
        fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
        fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
        fc->khctr = 0;
        fc->polled_files = RB_ROOT;
-       fc->reqctr = 0;
        fc->blocked = 0;
        fc->initialized = 0;
+       fc->connected = 1;
        fc->attr_version = 1;
        get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
 }
@@ -930,6 +946,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
 
 static void fuse_free_conn(struct fuse_conn *fc)
 {
+       WARN_ON(!list_empty(&fc->devices));
        kfree_rcu(fc, rcu);
 }
 
@@ -975,8 +992,42 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
        return 0;
 }
 
+struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc)
+{
+       struct fuse_dev *fud;
+
+       fud = kzalloc(sizeof(struct fuse_dev), GFP_KERNEL);
+       if (fud) {
+               fud->fc = fuse_conn_get(fc);
+               fuse_pqueue_init(&fud->pq);
+
+               spin_lock(&fc->lock);
+               list_add_tail(&fud->entry, &fc->devices);
+               spin_unlock(&fc->lock);
+       }
+
+       return fud;
+}
+EXPORT_SYMBOL_GPL(fuse_dev_alloc);
+
+void fuse_dev_free(struct fuse_dev *fud)
+{
+       struct fuse_conn *fc = fud->fc;
+
+       if (fc) {
+               spin_lock(&fc->lock);
+               list_del(&fud->entry);
+               spin_unlock(&fc->lock);
+
+               fuse_conn_put(fc);
+       }
+       kfree(fud);
+}
+EXPORT_SYMBOL_GPL(fuse_dev_free);
+
 static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 {
+       struct fuse_dev *fud;
        struct fuse_conn *fc;
        struct inode *root;
        struct fuse_mount_data d;
@@ -1026,12 +1077,17 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
                goto err_fput;
 
        fuse_conn_init(fc);
+       fc->release = fuse_free_conn;
+
+       fud = fuse_dev_alloc(fc);
+       if (!fud)
+               goto err_put_conn;
 
        fc->dev = sb->s_dev;
        fc->sb = sb;
        err = fuse_bdi_init(fc, sb);
        if (err)
-               goto err_put_conn;
+               goto err_dev_free;
 
        sb->s_bdi = &fc->bdi;
 
@@ -1040,7 +1096,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
                fc->dont_mask = 1;
        sb->s_flags |= MS_POSIXACL;
 
-       fc->release = fuse_free_conn;
        fc->flags = d.flags;
        fc->user_id = d.user_id;
        fc->group_id = d.group_id;
@@ -1053,14 +1108,14 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
        root = fuse_get_root_inode(sb, d.rootmode);
        root_dentry = d_make_root(root);
        if (!root_dentry)
-               goto err_put_conn;
+               goto err_dev_free;
        /* only now - we want root dentry with NULL ->d_op */
        sb->s_d_op = &fuse_dentry_operations;
 
        init_req = fuse_request_alloc(0);
        if (!init_req)
                goto err_put_root;
-       init_req->background = 1;
+       __set_bit(FR_BACKGROUND, &init_req->flags);
 
        if (is_bdev) {
                fc->destroy_req = fuse_request_alloc(0);
@@ -1079,8 +1134,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 
        list_add_tail(&fc->entry, &fuse_conn_list);
        sb->s_root = root_dentry;
-       fc->connected = 1;
-       file->private_data = fuse_conn_get(fc);
+       file->private_data = fud;
        mutex_unlock(&fuse_mutex);
        /*
         * atomic_dec_and_test() in fput() provides the necessary
@@ -1099,6 +1153,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
        fuse_request_free(init_req);
  err_put_root:
        dput(root_dentry);
+ err_dev_free:
+       fuse_dev_free(fud);
  err_put_conn:
        fuse_bdi_destroy(fc);
        fuse_conn_put(fc);
index 8d129bb7355afbb2ca7f1904ff0263f604e86dd6..682529c009966b85f986955c04d2b48fb645e981 100644 (file)
@@ -458,7 +458,7 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
  * pg_authenticate method for nfsv4 callback threads.
  *
  * The authflavor has been negotiated, so an incorrect flavor is a server
- * bug. Drop packets with incorrect authflavor.
+ * bug. Deny packets with incorrect authflavor.
  *
  * All other checking done after NFS decoding where the nfs_client can be
  * found in nfs4_callback_compound
@@ -468,12 +468,12 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp)
        switch (rqstp->rq_authop->flavour) {
        case RPC_AUTH_NULL:
                if (rqstp->rq_proc != CB_NULL)
-                       return SVC_DROP;
+                       return SVC_DENIED;
                break;
        case RPC_AUTH_GSS:
                /* No RPC_AUTH_GSS support yet in NFSv4.1 */
                 if (svc_is_backchannel(rqstp))
-                       return SVC_DROP;
+                       return SVC_DENIED;
        }
        return SVC_OK;
 }
index 197806fb87ffb459c19f3c4bbc8da50c58c870dc..29e3c1b011b73e4661f4deb1ef200e2f8d27792b 100644 (file)
@@ -327,10 +327,8 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
        dprintk("%s slot table seqid: %u\n", __func__, slot->seq_nr);
 
        /* Normal */
-       if (likely(args->csa_sequenceid == slot->seq_nr + 1)) {
-               slot->seq_nr++;
+       if (likely(args->csa_sequenceid == slot->seq_nr + 1))
                goto out_ok;
-       }
 
        /* Replay */
        if (args->csa_sequenceid == slot->seq_nr) {
@@ -418,6 +416,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
                              struct cb_process_state *cps)
 {
        struct nfs4_slot_table *tbl;
+       struct nfs4_slot *slot;
        struct nfs_client *clp;
        int i;
        __be32 status = htonl(NFS4ERR_BADSESSION);
@@ -429,25 +428,32 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
 
        if (!(clp->cl_session->flags & SESSION4_BACK_CHAN))
                goto out;
+
        tbl = &clp->cl_session->bc_slot_table;
+       slot = tbl->slots + args->csa_slotid;
 
        spin_lock(&tbl->slot_tbl_lock);
        /* state manager is resetting the session */
        if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) {
-               spin_unlock(&tbl->slot_tbl_lock);
                status = htonl(NFS4ERR_DELAY);
                /* Return NFS4ERR_BADSESSION if we're draining the session
                 * in order to reset it.
                 */
                if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
                        status = htonl(NFS4ERR_BADSESSION);
-               goto out;
+               goto out_unlock;
        }
 
-       status = validate_seqid(&clp->cl_session->bc_slot_table, args);
-       spin_unlock(&tbl->slot_tbl_lock);
+       memcpy(&res->csr_sessionid, &args->csa_sessionid,
+              sizeof(res->csr_sessionid));
+       res->csr_sequenceid = args->csa_sequenceid;
+       res->csr_slotid = args->csa_slotid;
+       res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
+       res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
+
+       status = validate_seqid(tbl, args);
        if (status)
-               goto out;
+               goto out_unlock;
 
        cps->slotid = args->csa_slotid;
 
@@ -458,15 +464,17 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
         */
        if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) {
                status = htonl(NFS4ERR_DELAY);
-               goto out;
+               goto out_unlock;
        }
 
-       memcpy(&res->csr_sessionid, &args->csa_sessionid,
-              sizeof(res->csr_sessionid));
-       res->csr_sequenceid = args->csa_sequenceid;
-       res->csr_slotid = args->csa_slotid;
-       res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
-       res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
+       /*
+        * RFC5661 20.9.3
+        * If CB_SEQUENCE returns an error, then the state of the slot
+        * (sequence ID, cached reply) MUST NOT change.
+        */
+       slot->seq_nr++;
+out_unlock:
+       spin_unlock(&tbl->slot_tbl_lock);
 
 out:
        cps->clp = clp; /* put in nfs4_callback_compound */
index 19ca95cdfd9b0f26aedbbc23f036babf2aeca67a..6b1697a01dde35e1384d72a4a7cd2e02d1afbf84 100644 (file)
@@ -909,7 +909,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
        xdr_init_encode(&xdr_out, &rqstp->rq_res, p);
 
        status = decode_compound_hdr_arg(&xdr_in, &hdr_arg);
-       if (status == __constant_htonl(NFS4ERR_RESOURCE))
+       if (status == htonl(NFS4ERR_RESOURCE))
                return rpc_garbage_args;
 
        if (hdr_arg.minorversion == 0) {
index 892aefff36300a0861f9bf5d43cb7a4b5069d873..ecebb406cc1aec554ce780f2c3680eddc351e8da 100644 (file)
@@ -825,7 +825,6 @@ static int nfs_init_server(struct nfs_server *server,
  * Load up the server record from information gained in an fsinfo record
  */
 static void nfs_server_set_fsinfo(struct nfs_server *server,
-                                 struct nfs_fh *mntfh,
                                  struct nfs_fsinfo *fsinfo)
 {
        unsigned long max_rpc_payload;
@@ -901,7 +900,7 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs
        if (error < 0)
                goto out_error;
 
-       nfs_server_set_fsinfo(server, mntfh, &fsinfo);
+       nfs_server_set_fsinfo(server, &fsinfo);
 
        /* Get some general file system info */
        if (server->namelen == 0) {
@@ -1193,8 +1192,6 @@ void nfs_clients_init(struct net *net)
 }
 
 #ifdef CONFIG_PROC_FS
-static struct proc_dir_entry *proc_fs_nfs;
-
 static int nfs_server_list_open(struct inode *inode, struct file *file);
 static void *nfs_server_list_start(struct seq_file *p, loff_t *pos);
 static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos);
@@ -1364,27 +1361,29 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
 {
        struct nfs_server *server;
        struct nfs_client *clp;
-       char dev[8], fsid[17];
+       char dev[13];   // 8 for 2^24, 1 for ':', 3 for 2^8, 1 for '\0'
+       char fsid[34];  // 2 * 16 for %llx, 1 for ':', 1 for '\0'
        struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
 
        /* display header on line 1 */
        if (v == &nn->nfs_volume_list) {
-               seq_puts(m, "NV SERVER   PORT DEV     FSID              FSC\n");
+               seq_puts(m, "NV SERVER   PORT DEV          FSID"
+                           "                              FSC\n");
                return 0;
        }
        /* display one transport per line on subsequent lines */
        server = list_entry(v, struct nfs_server, master_link);
        clp = server->nfs_client;
 
-       snprintf(dev, 8, "%u:%u",
+       snprintf(dev, sizeof(dev), "%u:%u",
                 MAJOR(server->s_dev), MINOR(server->s_dev));
 
-       snprintf(fsid, 17, "%llx:%llx",
+       snprintf(fsid, sizeof(fsid), "%llx:%llx",
                 (unsigned long long) server->fsid.major,
                 (unsigned long long) server->fsid.minor);
 
        rcu_read_lock();
-       seq_printf(m, "v%u %s %s %-7s %-17s %s\n",
+       seq_printf(m, "v%u %s %s %-12s %-33s %s\n",
                   clp->rpc_ops->version,
                   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
                   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
@@ -1434,27 +1433,20 @@ void nfs_fs_proc_net_exit(struct net *net)
  */
 int __init nfs_fs_proc_init(void)
 {
-       struct proc_dir_entry *p;
-
-       proc_fs_nfs = proc_mkdir("fs/nfsfs", NULL);
-       if (!proc_fs_nfs)
+       if (!proc_mkdir("fs/nfsfs", NULL))
                goto error_0;
 
        /* a file of servers with which we're dealing */
-       p = proc_symlink("servers", proc_fs_nfs, "../../net/nfsfs/servers");
-       if (!p)
+       if (!proc_symlink("fs/nfsfs/servers", NULL, "../../net/nfsfs/servers"))
                goto error_1;
 
        /* a file of volumes that we have mounted */
-       p = proc_symlink("volumes", proc_fs_nfs, "../../net/nfsfs/volumes");
-       if (!p)
-               goto error_2;
-       return 0;
+       if (!proc_symlink("fs/nfsfs/volumes", NULL, "../../net/nfsfs/volumes"))
+               goto error_1;
 
-error_2:
-       remove_proc_entry("servers", proc_fs_nfs);
+       return 0;
 error_1:
-       remove_proc_entry("fs/nfsfs", NULL);
+       remove_proc_subtree("fs/nfsfs", NULL);
 error_0:
        return -ENOMEM;
 }
@@ -1464,9 +1456,7 @@ int __init nfs_fs_proc_init(void)
  */
 void nfs_fs_proc_exit(void)
 {
-       remove_proc_entry("volumes", proc_fs_nfs);
-       remove_proc_entry("servers", proc_fs_nfs);
-       remove_proc_entry("fs/nfsfs", NULL);
+       remove_proc_subtree("fs/nfsfs", NULL);
 }
 
 #endif /* CONFIG_PROC_FS */
index b2c8b31b2be77d9a1d524b230ed2b66e479ad3fe..21457bb0edd62b42af307d5850b336711f41e82f 100644 (file)
@@ -1470,9 +1470,6 @@ static int nfs_finish_open(struct nfs_open_context *ctx,
 {
        int err;
 
-       if ((open_flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
-               *opened |= FILE_CREATED;
-
        err = finish_open(file, dentry, do_open, opened);
        if (err)
                goto out;
index 8b8d83a526ce2366ae974a87761c9c62c22da7f5..cc4fa1ed61fc5bdfe04d1afcaa5f081bb3ba0470 100644 (file)
@@ -555,31 +555,22 @@ static int nfs_launder_page(struct page *page)
        return nfs_wb_page(inode, page);
 }
 
-#ifdef CONFIG_NFS_SWAP
 static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
                                                sector_t *span)
 {
-       int ret;
        struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
 
        *span = sis->pages;
 
-       rcu_read_lock();
-       ret = xs_swapper(rcu_dereference(clnt->cl_xprt), 1);
-       rcu_read_unlock();
-
-       return ret;
+       return rpc_clnt_swap_activate(clnt);
 }
 
 static void nfs_swap_deactivate(struct file *file)
 {
        struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
 
-       rcu_read_lock();
-       xs_swapper(rcu_dereference(clnt->cl_xprt), 0);
-       rcu_read_unlock();
+       rpc_clnt_swap_deactivate(clnt);
 }
-#endif
 
 const struct address_space_operations nfs_file_aops = {
        .readpage = nfs_readpage,
@@ -596,10 +587,8 @@ const struct address_space_operations nfs_file_aops = {
        .launder_page = nfs_launder_page,
        .is_dirty_writeback = nfs_check_dirty_writeback,
        .error_remove_page = generic_error_remove_page,
-#ifdef CONFIG_NFS_SWAP
        .swap_activate = nfs_swap_activate,
        .swap_deactivate = nfs_swap_deactivate,
-#endif
 };
 
 /*
index 7d05089e52d6c8b7a29e92e80011bdee0e06a32c..c12951b9551eab8b0394ed2aa218cf15ce6f2c39 100644 (file)
@@ -20,6 +20,7 @@
 #include "../nfs4trace.h"
 #include "../iostat.h"
 #include "../nfs.h"
+#include "../nfs42.h"
 
 #define NFSDBG_FACILITY         NFSDBG_PNFS_LD
 
@@ -182,17 +183,14 @@ static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
 
 static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
 {
-       struct nfs4_ff_layout_mirror *tmp;
        int i, j;
 
        for (i = 0; i < fls->mirror_array_cnt - 1; i++) {
                for (j = i + 1; j < fls->mirror_array_cnt; j++)
                        if (fls->mirror_array[i]->efficiency <
-                           fls->mirror_array[j]->efficiency) {
-                               tmp = fls->mirror_array[i];
-                               fls->mirror_array[i] = fls->mirror_array[j];
-                               fls->mirror_array[j] = tmp;
-                       }
+                           fls->mirror_array[j]->efficiency)
+                               swap(fls->mirror_array[i],
+                                    fls->mirror_array[j]);
        }
 }
 
@@ -274,6 +272,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
 
                spin_lock_init(&fls->mirror_array[i]->lock);
                fls->mirror_array[i]->ds_count = ds_count;
+               fls->mirror_array[i]->lseg = &fls->generic_hdr;
 
                /* deviceid */
                rc = decode_deviceid(&stream, &devid);
@@ -344,6 +343,10 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
                        fls->mirror_array[i]->gid);
        }
 
+       p = xdr_inline_decode(&stream, 4);
+       if (p)
+               fls->flags = be32_to_cpup(p);
+
        ff_layout_sort_mirrors(fls);
        rc = ff_layout_check_layout(lgr);
        if (rc)
@@ -415,6 +418,146 @@ ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls)
        return 1;
 }
 
+static void
+nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer)
+{
+       /* first IO request? */
+       if (atomic_inc_return(&timer->n_ops) == 1) {
+               timer->start_time = ktime_get();
+       }
+}
+
+static ktime_t
+nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer)
+{
+       ktime_t start, now;
+
+       if (atomic_dec_return(&timer->n_ops) < 0)
+               WARN_ON_ONCE(1);
+
+       now = ktime_get();
+       start = timer->start_time;
+       timer->start_time = now;
+       return ktime_sub(now, start);
+}
+
+static ktime_t
+nfs4_ff_layout_calc_completion_time(struct rpc_task *task)
+{
+       return ktime_sub(ktime_get(), task->tk_start);
+}
+
+static bool
+nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
+                           struct nfs4_ff_layoutstat *layoutstat)
+{
+       static const ktime_t notime = {0};
+       ktime_t now = ktime_get();
+
+       nfs4_ff_start_busy_timer(&layoutstat->busy_timer);
+       if (ktime_equal(mirror->start_time, notime))
+               mirror->start_time = now;
+       if (ktime_equal(mirror->last_report_time, notime))
+               mirror->last_report_time = now;
+       if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >=
+                       FF_LAYOUTSTATS_REPORT_INTERVAL) {
+               mirror->last_report_time = now;
+               return true;
+       }
+
+       return false;
+}
+
+static void
+nfs4_ff_layout_stat_io_update_requested(struct nfs4_ff_layoutstat *layoutstat,
+               __u64 requested)
+{
+       struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat;
+
+       iostat->ops_requested++;
+       iostat->bytes_requested += requested;
+}
+
+static void
+nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat,
+               __u64 requested,
+               __u64 completed,
+               ktime_t time_completed)
+{
+       struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat;
+       ktime_t timer;
+
+       iostat->ops_completed++;
+       iostat->bytes_completed += completed;
+       iostat->bytes_not_delivered += requested - completed;
+
+       timer = nfs4_ff_end_busy_timer(&layoutstat->busy_timer);
+       iostat->total_busy_time =
+                       ktime_add(iostat->total_busy_time, timer);
+       iostat->aggregate_completion_time =
+                       ktime_add(iostat->aggregate_completion_time, time_completed);
+}
+
+static void
+nfs4_ff_layout_stat_io_start_read(struct nfs4_ff_layout_mirror *mirror,
+               __u64 requested)
+{
+       bool report;
+
+       spin_lock(&mirror->lock);
+       report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat);
+       nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested);
+       spin_unlock(&mirror->lock);
+
+       if (report)
+               pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode);
+}
+
+static void
+nfs4_ff_layout_stat_io_end_read(struct rpc_task *task,
+               struct nfs4_ff_layout_mirror *mirror,
+               __u64 requested,
+               __u64 completed)
+{
+       spin_lock(&mirror->lock);
+       nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat,
+                       requested, completed,
+                       nfs4_ff_layout_calc_completion_time(task));
+       spin_unlock(&mirror->lock);
+}
+
+static void
+nfs4_ff_layout_stat_io_start_write(struct nfs4_ff_layout_mirror *mirror,
+               __u64 requested)
+{
+       bool report;
+
+       spin_lock(&mirror->lock);
+       report = nfs4_ff_layoutstat_start_io(mirror, &mirror->write_stat);
+       nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested);
+       spin_unlock(&mirror->lock);
+
+       if (report)
+               pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode);
+}
+
+static void
+nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
+               struct nfs4_ff_layout_mirror *mirror,
+               __u64 requested,
+               __u64 completed,
+               enum nfs3_stable_how committed)
+{
+       if (committed == NFS_UNSTABLE)
+               requested = completed = 0;
+
+       spin_lock(&mirror->lock);
+       nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat,
+                       requested, completed,
+                       nfs4_ff_layout_calc_completion_time(task));
+       spin_unlock(&mirror->lock);
+}
+
 static int
 ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
                            struct nfs_commit_info *cinfo,
@@ -631,7 +774,7 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs)
                        nfs_direct_set_resched_writes(hdr->dreq);
                        /* fake unstable write to let common nfs resend pages */
                        hdr->verf.committed = NFS_UNSTABLE;
-                       hdr->good_bytes = 0;
+                       hdr->good_bytes = hdr->args.count;
                }
                return;
        }
@@ -879,6 +1022,12 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
        return 0;
 }
 
+static bool
+ff_layout_need_layoutcommit(struct pnfs_layout_segment *lseg)
+{
+       return !(FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_LAYOUTCOMMIT);
+}
+
 /*
  * We reference the rpc_cred of the first WRITE that triggers the need for
  * a LAYOUTCOMMIT, and use it to send the layoutcommit compound.
@@ -891,6 +1040,9 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
 static void
 ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr)
 {
+       if (!ff_layout_need_layoutcommit(hdr->lseg))
+               return;
+
        pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
                        hdr->mds_offset + hdr->res.count);
        dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
@@ -909,6 +1061,10 @@ ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
 static int ff_layout_read_prepare_common(struct rpc_task *task,
                                         struct nfs_pgio_header *hdr)
 {
+       nfs4_ff_layout_stat_io_start_read(
+                       FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+                       hdr->args.count);
+
        if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
                rpc_exit(task, -EIO);
                return -EIO;
@@ -962,15 +1118,15 @@ static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data)
 {
        struct nfs_pgio_header *hdr = data;
 
-       if (ff_layout_read_prepare_common(task, hdr))
-               return;
-
        if (ff_layout_setup_sequence(hdr->ds_clp,
                                     &hdr->args.seq_args,
                                     &hdr->res.seq_res,
                                     task))
                return;
 
+       if (ff_layout_read_prepare_common(task, hdr))
+               return;
+
        if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
                        hdr->args.lock_context, FMODE_READ) == -EIO)
                rpc_exit(task, -EIO); /* lost lock, terminate I/O */
@@ -982,6 +1138,10 @@ static void ff_layout_read_call_done(struct rpc_task *task, void *data)
 
        dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
 
+       nfs4_ff_layout_stat_io_end_read(task,
+                       FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+                       hdr->args.count, hdr->res.count);
+
        if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
            task->tk_status == 0) {
                nfs4_sequence_done(task, &hdr->res.seq_res);
@@ -1074,7 +1234,8 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
                return -EAGAIN;
        }
 
-       if (data->verf.committed == NFS_UNSTABLE)
+       if (data->verf.committed == NFS_UNSTABLE
+           && ff_layout_need_layoutcommit(data->lseg))
                pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
 
        return 0;
@@ -1083,6 +1244,10 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
 static int ff_layout_write_prepare_common(struct rpc_task *task,
                                          struct nfs_pgio_header *hdr)
 {
+       nfs4_ff_layout_stat_io_start_write(
+                       FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+                       hdr->args.count);
+
        if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
                rpc_exit(task, -EIO);
                return -EIO;
@@ -1116,15 +1281,15 @@ static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data)
 {
        struct nfs_pgio_header *hdr = data;
 
-       if (ff_layout_write_prepare_common(task, hdr))
-               return;
-
        if (ff_layout_setup_sequence(hdr->ds_clp,
                                     &hdr->args.seq_args,
                                     &hdr->res.seq_res,
                                     task))
                return;
 
+       if (ff_layout_write_prepare_common(task, hdr))
+               return;
+
        if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
                        hdr->args.lock_context, FMODE_WRITE) == -EIO)
                rpc_exit(task, -EIO); /* lost lock, terminate I/O */
@@ -1134,6 +1299,11 @@ static void ff_layout_write_call_done(struct rpc_task *task, void *data)
 {
        struct nfs_pgio_header *hdr = data;
 
+       nfs4_ff_layout_stat_io_end_write(task,
+                       FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+                       hdr->args.count, hdr->res.count,
+                       hdr->res.verf->committed);
+
        if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
            task->tk_status == 0) {
                nfs4_sequence_done(task, &hdr->res.seq_res);
@@ -1152,8 +1322,17 @@ static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
            &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]);
 }
 
+static void ff_layout_commit_prepare_common(struct rpc_task *task,
+               struct nfs_commit_data *cdata)
+{
+       nfs4_ff_layout_stat_io_start_write(
+                       FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
+                       0);
+}
+
 static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
 {
+       ff_layout_commit_prepare_common(task, data);
        rpc_call_start(task);
 }
 
@@ -1161,10 +1340,30 @@ static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data)
 {
        struct nfs_commit_data *wdata = data;
 
-       ff_layout_setup_sequence(wdata->ds_clp,
+       if (ff_layout_setup_sequence(wdata->ds_clp,
                                 &wdata->args.seq_args,
                                 &wdata->res.seq_res,
-                                task);
+                                task))
+               return;
+       ff_layout_commit_prepare_common(task, data);
+}
+
+static void ff_layout_commit_done(struct rpc_task *task, void *data)
+{
+       struct nfs_commit_data *cdata = data;
+       struct nfs_page *req;
+       __u64 count = 0;
+
+       if (task->tk_status == 0) {
+               list_for_each_entry(req, &cdata->pages, wb_list)
+                       count += req->wb_bytes;
+       }
+
+       nfs4_ff_layout_stat_io_end_write(task,
+                       FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
+                       count, count, NFS_FILE_SYNC);
+
+       pnfs_generic_write_commit_done(task, data);
 }
 
 static void ff_layout_commit_count_stats(struct rpc_task *task, void *data)
@@ -1205,14 +1404,14 @@ static const struct rpc_call_ops ff_layout_write_call_ops_v4 = {
 
 static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = {
        .rpc_call_prepare = ff_layout_commit_prepare_v3,
-       .rpc_call_done = pnfs_generic_write_commit_done,
+       .rpc_call_done = ff_layout_commit_done,
        .rpc_count_stats = ff_layout_commit_count_stats,
        .rpc_release = pnfs_generic_commit_release,
 };
 
 static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = {
        .rpc_call_prepare = ff_layout_commit_prepare_v4,
-       .rpc_call_done = pnfs_generic_write_commit_done,
+       .rpc_call_done = ff_layout_commit_done,
        .rpc_count_stats = ff_layout_commit_count_stats,
        .rpc_release = pnfs_generic_commit_release,
 };
@@ -1256,7 +1455,6 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
        fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
        if (fh)
                hdr->args.fh = fh;
-
        /*
         * Note that if we ever decide to split across DSes,
         * then we may need to handle dense-like offsets.
@@ -1385,6 +1583,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
        fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
        if (fh)
                data->args.fh = fh;
+
        return nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops,
                                   vers == 3 ? &ff_layout_commit_call_ops_v3 :
                                               &ff_layout_commit_call_ops_v4,
@@ -1488,6 +1687,247 @@ ff_layout_encode_layoutreturn(struct pnfs_layout_hdr *lo,
        dprintk("%s: Return\n", __func__);
 }
 
+static int
+ff_layout_ntop4(const struct sockaddr *sap, char *buf, const size_t buflen)
+{
+       const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+
+       return snprintf(buf, buflen, "%pI4", &sin->sin_addr);
+}
+
+static size_t
+ff_layout_ntop6_noscopeid(const struct sockaddr *sap, char *buf,
+                         const int buflen)
+{
+       const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+       const struct in6_addr *addr = &sin6->sin6_addr;
+
+       /*
+        * RFC 4291, Section 2.2.2
+        *
+        * Shorthanded ANY address
+        */
+       if (ipv6_addr_any(addr))
+               return snprintf(buf, buflen, "::");
+
+       /*
+        * RFC 4291, Section 2.2.2
+        *
+        * Shorthanded loopback address
+        */
+       if (ipv6_addr_loopback(addr))
+               return snprintf(buf, buflen, "::1");
+
+       /*
+        * RFC 4291, Section 2.2.3
+        *
+        * Special presentation address format for mapped v4
+        * addresses.
+        */
+       if (ipv6_addr_v4mapped(addr))
+               return snprintf(buf, buflen, "::ffff:%pI4",
+                                       &addr->s6_addr32[3]);
+
+       /*
+        * RFC 4291, Section 2.2.1
+        */
+       return snprintf(buf, buflen, "%pI6c", addr);
+}
+
+/* Derived from rpc_sockaddr2uaddr */
+static void
+ff_layout_encode_netaddr(struct xdr_stream *xdr, struct nfs4_pnfs_ds_addr *da)
+{
+       struct sockaddr *sap = (struct sockaddr *)&da->da_addr;
+       char portbuf[RPCBIND_MAXUADDRPLEN];
+       char addrbuf[RPCBIND_MAXUADDRLEN];
+       char *netid;
+       unsigned short port;
+       int len, netid_len;
+       __be32 *p;
+
+       switch (sap->sa_family) {
+       case AF_INET:
+               if (ff_layout_ntop4(sap, addrbuf, sizeof(addrbuf)) == 0)
+                       return;
+               port = ntohs(((struct sockaddr_in *)sap)->sin_port);
+               netid = "tcp";
+               netid_len = 3;
+               break;
+       case AF_INET6:
+               if (ff_layout_ntop6_noscopeid(sap, addrbuf, sizeof(addrbuf)) == 0)
+                       return;
+               port = ntohs(((struct sockaddr_in6 *)sap)->sin6_port);
+               netid = "tcp6";
+               netid_len = 4;
+               break;
+       default:
+               /* we only support tcp and tcp6 */
+               WARN_ON_ONCE(1);
+               return;
+       }
+
+       snprintf(portbuf, sizeof(portbuf), ".%u.%u", port >> 8, port & 0xff);
+       len = strlcat(addrbuf, portbuf, sizeof(addrbuf));
+
+       p = xdr_reserve_space(xdr, 4 + netid_len);
+       xdr_encode_opaque(p, netid, netid_len);
+
+       p = xdr_reserve_space(xdr, 4 + len);
+       xdr_encode_opaque(p, addrbuf, len);
+}
+
+static void
+ff_layout_encode_nfstime(struct xdr_stream *xdr,
+                        ktime_t t)
+{
+       struct timespec64 ts;
+       __be32 *p;
+
+       p = xdr_reserve_space(xdr, 12);
+       ts = ktime_to_timespec64(t);
+       p = xdr_encode_hyper(p, ts.tv_sec);
+       *p++ = cpu_to_be32(ts.tv_nsec);
+}
+
+static void
+ff_layout_encode_io_latency(struct xdr_stream *xdr,
+                           struct nfs4_ff_io_stat *stat)
+{
+       __be32 *p;
+
+       p = xdr_reserve_space(xdr, 5 * 8);
+       p = xdr_encode_hyper(p, stat->ops_requested);
+       p = xdr_encode_hyper(p, stat->bytes_requested);
+       p = xdr_encode_hyper(p, stat->ops_completed);
+       p = xdr_encode_hyper(p, stat->bytes_completed);
+       p = xdr_encode_hyper(p, stat->bytes_not_delivered);
+       ff_layout_encode_nfstime(xdr, stat->total_busy_time);
+       ff_layout_encode_nfstime(xdr, stat->aggregate_completion_time);
+}
+
+static void
+ff_layout_encode_layoutstats(struct xdr_stream *xdr,
+                            struct nfs42_layoutstat_args *args,
+                            struct nfs42_layoutstat_devinfo *devinfo)
+{
+       struct nfs4_ff_layout_mirror *mirror = devinfo->layout_private;
+       struct nfs4_pnfs_ds_addr *da;
+       struct nfs4_pnfs_ds *ds = mirror->mirror_ds->ds;
+       struct nfs_fh *fh = &mirror->fh_versions[0];
+       __be32 *p, *start;
+
+       da = list_first_entry(&ds->ds_addrs, struct nfs4_pnfs_ds_addr, da_node);
+       dprintk("%s: DS %s: encoding address %s\n",
+               __func__, ds->ds_remotestr, da->da_remotestr);
+       /* layoutupdate length */
+       start = xdr_reserve_space(xdr, 4);
+       /* netaddr4 */
+       ff_layout_encode_netaddr(xdr, da);
+       /* nfs_fh4 */
+       p = xdr_reserve_space(xdr, 4 + fh->size);
+       xdr_encode_opaque(p, fh->data, fh->size);
+       /* ff_io_latency4 read */
+       spin_lock(&mirror->lock);
+       ff_layout_encode_io_latency(xdr, &mirror->read_stat.io_stat);
+       /* ff_io_latency4 write */
+       ff_layout_encode_io_latency(xdr, &mirror->write_stat.io_stat);
+       spin_unlock(&mirror->lock);
+       /* nfstime4 */
+       ff_layout_encode_nfstime(xdr, ktime_sub(ktime_get(), mirror->start_time));
+       /* bool */
+       p = xdr_reserve_space(xdr, 4);
+       *p = cpu_to_be32(false);
+
+       *start = cpu_to_be32((xdr->p - start - 1) * 4);
+}
+
+static bool
+ff_layout_mirror_prepare_stats(struct nfs42_layoutstat_args *args,
+                              struct pnfs_layout_segment *pls,
+                              int *dev_count, int dev_limit)
+{
+       struct nfs4_ff_layout_mirror *mirror;
+       struct nfs4_deviceid_node *dev;
+       struct nfs42_layoutstat_devinfo *devinfo;
+       int i;
+
+       for (i = 0; i < FF_LAYOUT_MIRROR_COUNT(pls); i++) {
+               if (*dev_count >= dev_limit)
+                       break;
+               mirror = FF_LAYOUT_COMP(pls, i);
+               if (!mirror || !mirror->mirror_ds)
+                       continue;
+               dev = FF_LAYOUT_DEVID_NODE(pls, i);
+               devinfo = &args->devinfo[*dev_count];
+               memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE);
+               devinfo->offset = pls->pls_range.offset;
+               devinfo->length = pls->pls_range.length;
+               /* well, we don't really know if IO is continuous or not! */
+               devinfo->read_count = mirror->read_stat.io_stat.bytes_completed;
+               devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed;
+               devinfo->write_count = mirror->write_stat.io_stat.bytes_completed;
+               devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed;
+               devinfo->layout_type = LAYOUT_FLEX_FILES;
+               devinfo->layoutstats_encode = ff_layout_encode_layoutstats;
+               devinfo->layout_private = mirror;
+               /* lseg refcount put in cleanup_layoutstats */
+               pnfs_get_lseg(pls);
+
+               ++(*dev_count);
+       }
+
+       return *dev_count < dev_limit;
+}
+
+static int
+ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
+{
+       struct pnfs_layout_segment *pls;
+       int dev_count = 0;
+
+       spin_lock(&args->inode->i_lock);
+       list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) {
+               dev_count += FF_LAYOUT_MIRROR_COUNT(pls);
+       }
+       spin_unlock(&args->inode->i_lock);
+       /* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */
+       if (dev_count > PNFS_LAYOUTSTATS_MAXDEV) {
+               dprintk("%s: truncating devinfo to limit (%d:%d)\n",
+                       __func__, dev_count, PNFS_LAYOUTSTATS_MAXDEV);
+               dev_count = PNFS_LAYOUTSTATS_MAXDEV;
+       }
+       args->devinfo = kmalloc(dev_count * sizeof(*args->devinfo), GFP_KERNEL);
+       if (!args->devinfo)
+               return -ENOMEM;
+
+       dev_count = 0;
+       spin_lock(&args->inode->i_lock);
+       list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) {
+               if (!ff_layout_mirror_prepare_stats(args, pls, &dev_count,
+                                                   PNFS_LAYOUTSTATS_MAXDEV)) {
+                       break;
+               }
+       }
+       spin_unlock(&args->inode->i_lock);
+       args->num_dev = dev_count;
+
+       return 0;
+}
+
+static void
+ff_layout_cleanup_layoutstats(struct nfs42_layoutstat_data *data)
+{
+       struct nfs4_ff_layout_mirror *mirror;
+       int i;
+
+       for (i = 0; i < data->args.num_dev; i++) {
+               mirror = data->args.devinfo[i].layout_private;
+               data->args.devinfo[i].layout_private = NULL;
+               pnfs_put_lseg(mirror->lseg);
+       }
+}
+
 static struct pnfs_layoutdriver_type flexfilelayout_type = {
        .id                     = LAYOUT_FLEX_FILES,
        .name                   = "LAYOUT_FLEX_FILES",
@@ -1510,6 +1950,8 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = {
        .alloc_deviceid_node    = ff_layout_alloc_deviceid_node,
        .encode_layoutreturn    = ff_layout_encode_layoutreturn,
        .sync                   = pnfs_nfs_generic_sync,
+       .prepare_layoutstats    = ff_layout_prepare_layoutstats,
+       .cleanup_layoutstats    = ff_layout_cleanup_layoutstats,
 };
 
 static int __init nfs4flexfilelayout_init(void)
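
The layoutupdate4 body produced by ff_layout_encode_layoutstats() above is length-prefixed, but its size is only known after the netaddr, filehandle, latency counters and runtime have been written, so the 4-byte length word is reserved up front and back-patched from the stream position at the end. A minimal sketch of that reserve-and-patch pattern (the helper name and body are illustrative, not taken from the commit):

#include <linux/sunrpc/xdr.h>

static void encode_length_prefixed_blob(struct xdr_stream *xdr)
{
        __be32 *start, *p;

        /* reserve the 4-byte length, to be filled in later */
        start = xdr_reserve_space(xdr, 4);

        /* ...encode a variable-sized body; one hyper as a stand-in... */
        p = xdr_reserve_space(xdr, 8);
        xdr_encode_hyper(p, 0);

        /* xdr->p now points just past the body: patch in its size in bytes */
        *start = cpu_to_be32((xdr->p - start - 1) * 4);
}
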
index 070f20445b2d33883445038d4888538845555198..f92f9a0a856b3e698c8859923438549d1bffed37 100644 (file)
@@ -9,12 +9,17 @@
 #ifndef FS_NFS_NFS4FLEXFILELAYOUT_H
 #define FS_NFS_NFS4FLEXFILELAYOUT_H
 
+#define FF_FLAGS_NO_LAYOUTCOMMIT 1
+
 #include "../pnfs.h"
 
 /* XXX: Let's filter out insanely large mirror count for now to avoid oom
  * due to network error etc. */
 #define NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT 4096
 
+/* LAYOUTSTATS report interval in ms */
+#define FF_LAYOUTSTATS_REPORT_INTERVAL (60000L)
+
 struct nfs4_ff_ds_version {
        u32                             version;
        u32                             minor_version;
@@ -41,24 +46,48 @@ struct nfs4_ff_layout_ds_err {
        struct nfs4_deviceid            deviceid;
 };
 
+struct nfs4_ff_io_stat {
+       __u64                           ops_requested;
+       __u64                           bytes_requested;
+       __u64                           ops_completed;
+       __u64                           bytes_completed;
+       __u64                           bytes_not_delivered;
+       ktime_t                         total_busy_time;
+       ktime_t                         aggregate_completion_time;
+};
+
+struct nfs4_ff_busy_timer {
+       ktime_t start_time;
+       atomic_t n_ops;
+};
+
+struct nfs4_ff_layoutstat {
+       struct nfs4_ff_io_stat io_stat;
+       struct nfs4_ff_busy_timer busy_timer;
+};
+
 struct nfs4_ff_layout_mirror {
+       struct pnfs_layout_segment      *lseg; /* back pointer */
        u32                             ds_count;
        u32                             efficiency;
        struct nfs4_ff_layout_ds        *mirror_ds;
        u32                             fh_versions_cnt;
        struct nfs_fh                   *fh_versions;
        nfs4_stateid                    stateid;
-       struct nfs4_string              user_name;
-       struct nfs4_string              group_name;
        u32                             uid;
        u32                             gid;
        struct rpc_cred                 *cred;
        spinlock_t                      lock;
+       struct nfs4_ff_layoutstat       read_stat;
+       struct nfs4_ff_layoutstat       write_stat;
+       ktime_t                         start_time;
+       ktime_t                         last_report_time;
 };
 
 struct nfs4_ff_layout_segment {
        struct pnfs_layout_segment      generic_hdr;
        u64                             stripe_unit;
+       u32                             flags;
        u32                             mirror_array_cnt;
        struct nfs4_ff_layout_mirror    **mirror_array;
 };
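
The start_time/last_report_time fields added to nfs4_ff_layout_mirror, together with FF_LAYOUTSTATS_REPORT_INTERVAL, imply that the I/O paths rate-limit how often a LAYOUTSTATS report is triggered per mirror. A rough sketch of such a gate, assuming it runs under mirror->lock (the helper is illustrative and not part of the commit):

#include <linux/ktime.h>

/* returns true if a new report is due; caller is assumed to hold mirror->lock */
static bool ff_layoutstats_report_due(struct nfs4_ff_layout_mirror *mirror,
                                      ktime_t now)
{
        s64 ms_since = ktime_to_ms(ktime_sub(now, mirror->last_report_time));

        if (ms_since < FF_LAYOUTSTATS_REPORT_INTERVAL)
                return false;
        mirror->last_report_time = now;
        return true;
}
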
index 77a2d026aa12b62bdc29dac2345cff0b3237e9c4..f13e1969eedd911bf6a5d9be6af6e4ae403f6c1e 100644 (file)
@@ -324,7 +324,8 @@ static int ff_layout_update_mirror_cred(struct nfs4_ff_layout_mirror *mirror,
                                __func__, PTR_ERR(cred));
                        return PTR_ERR(cred);
                } else {
-                       mirror->cred = cred;
+                       if (cmpxchg(&mirror->cred, NULL, cred))
+                               put_rpccred(cred);
                }
        }
        return 0;
@@ -386,7 +387,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
        /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
        smp_rmb();
        if (ds->ds_clp)
-               goto out;
+               goto out_update_creds;
 
        flavor = nfs4_ff_layout_choose_authflavor(mirror);
 
@@ -430,7 +431,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
                        }
                }
        }
-
+out_update_creds:
        if (ff_layout_update_mirror_cred(mirror, ds))
                ds = NULL;
 out:
index f734562c6d244034cb5036fee8ab0b7d69cc90c5..b77b328a06d74f0124d2a65b51fac0fc21fbd692 100644 (file)
@@ -678,6 +678,8 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
        if (!err) {
                generic_fillattr(inode, stat);
                stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
+               if (S_ISDIR(inode->i_mode))
+                       stat->blksize = NFS_SERVER(inode)->dtsize;
        }
 out:
        trace_nfs_getattr_exit(inode, err);
@@ -2008,17 +2010,15 @@ static int __init init_nfs_fs(void)
        if (err)
                goto out1;
 
-#ifdef CONFIG_PROC_FS
        rpc_proc_register(&init_net, &nfs_rpcstat);
-#endif
-       if ((err = register_nfs_fs()) != 0)
+
+       err = register_nfs_fs();
+       if (err)
                goto out0;
 
        return 0;
 out0:
-#ifdef CONFIG_PROC_FS
        rpc_proc_unregister(&init_net, "nfs");
-#endif
        nfs_destroy_directcache();
 out1:
        nfs_destroy_writepagecache();
@@ -2049,9 +2049,7 @@ static void __exit exit_nfs_fs(void)
        nfs_destroy_nfspagecache();
        nfs_fscache_unregister();
        unregister_pernet_subsys(&nfs_net_ops);
-#ifdef CONFIG_PROC_FS
        rpc_proc_unregister(&init_net, "nfs");
-#endif
        unregister_nfs_fs();
        nfs_fs_proc_exit();
        nfsiod_stop();
index 53852a4bd88be68781bb9dd8a39760fd497d1d61..9b04c2e6fffc3f306f3c598b7c4557beff653c8e 100644 (file)
@@ -1342,7 +1342,7 @@ static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req,
        if (args->npages != 0)
                xdr_write_pages(xdr, args->pages, 0, args->len);
        else
-               xdr_reserve_space(xdr, NFS_ACL_INLINE_BUFSIZE);
+               xdr_reserve_space(xdr, args->len);
 
        error = nfsacl_encode(xdr->buf, base, args->inode,
                            (args->mask & NFS_ACL) ?
index 7afb8947dfdf3e299ee39c6490640f40567a70b7..ff66ae700b8991eeed513e210397f6639c0e5f70 100644 (file)
@@ -5,11 +5,18 @@
 #ifndef __LINUX_FS_NFS_NFS4_2_H
 #define __LINUX_FS_NFS_NFS4_2_H
 
+/*
+ * FIXME: at most four LAYOUTSTATS calls per compound! Do we need to support
+ * more? Be careful not to pre-allocate too much for a single compound.
+ */
+#define PNFS_LAYOUTSTATS_MAXDEV (4)
+
 /* nfs4.2proc.c */
 int nfs42_proc_allocate(struct file *, loff_t, loff_t);
 int nfs42_proc_deallocate(struct file *, loff_t, loff_t);
 loff_t nfs42_proc_llseek(struct file *, loff_t, int);
-
+int nfs42_proc_layoutstats_generic(struct nfs_server *,
+                                  struct nfs42_layoutstat_data *);
 /* nfs4.2xdr.h */
 extern struct rpc_procinfo nfs4_2_procedures[];
 
index 3a9e75235f30e60e5a4ae974864c63fb003b969d..f486b80f927ab7204159852a9740900a6c73aec6 100644 (file)
 #include <linux/nfs_fs.h>
 #include "nfs4_fs.h"
 #include "nfs42.h"
+#include "iostat.h"
+#include "pnfs.h"
+#include "internal.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS
 
 static int nfs42_set_rw_stateid(nfs4_stateid *dst, struct file *file,
                                fmode_t fmode)
@@ -165,3 +170,85 @@ loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
 
        return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes);
 }
+
+static void
+nfs42_layoutstat_prepare(struct rpc_task *task, void *calldata)
+{
+       struct nfs42_layoutstat_data *data = calldata;
+       struct nfs_server *server = NFS_SERVER(data->args.inode);
+
+       nfs41_setup_sequence(nfs4_get_session(server), &data->args.seq_args,
+                            &data->res.seq_res, task);
+}
+
+static void
+nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
+{
+       struct nfs42_layoutstat_data *data = calldata;
+
+       if (!nfs4_sequence_done(task, &data->res.seq_res))
+               return;
+
+       switch (task->tk_status) {
+       case 0:
+               break;
+       case -ENOTSUPP:
+       case -EOPNOTSUPP:
+               NFS_SERVER(data->inode)->caps &= ~NFS_CAP_LAYOUTSTATS;
+       default:
+               dprintk("%s server returns %d\n", __func__, task->tk_status);
+       }
+}
+
+static void
+nfs42_layoutstat_release(void *calldata)
+{
+       struct nfs42_layoutstat_data *data = calldata;
+       struct nfs_server *nfss = NFS_SERVER(data->args.inode);
+
+       if (nfss->pnfs_curr_ld->cleanup_layoutstats)
+               nfss->pnfs_curr_ld->cleanup_layoutstats(data);
+
+       pnfs_put_layout_hdr(NFS_I(data->args.inode)->layout);
+       smp_mb__before_atomic();
+       clear_bit(NFS_INO_LAYOUTSTATS, &NFS_I(data->args.inode)->flags);
+       smp_mb__after_atomic();
+       nfs_iput_and_deactive(data->inode);
+       kfree(data->args.devinfo);
+       kfree(data);
+}
+
+static const struct rpc_call_ops nfs42_layoutstat_ops = {
+       .rpc_call_prepare = nfs42_layoutstat_prepare,
+       .rpc_call_done = nfs42_layoutstat_done,
+       .rpc_release = nfs42_layoutstat_release,
+};
+
+int nfs42_proc_layoutstats_generic(struct nfs_server *server,
+                                  struct nfs42_layoutstat_data *data)
+{
+       struct rpc_message msg = {
+               .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTSTATS],
+               .rpc_argp = &data->args,
+               .rpc_resp = &data->res,
+       };
+       struct rpc_task_setup task_setup = {
+               .rpc_client = server->client,
+               .rpc_message = &msg,
+               .callback_ops = &nfs42_layoutstat_ops,
+               .callback_data = data,
+               .flags = RPC_TASK_ASYNC,
+       };
+       struct rpc_task *task;
+
+       data->inode = nfs_igrab_and_active(data->args.inode);
+       if (!data->inode) {
+               nfs42_layoutstat_release(data);
+               return -EAGAIN;
+       }
+       nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
+       task = rpc_run_task(&task_setup);
+       if (IS_ERR(task))
+               return PTR_ERR(task);
+       return 0;
+}
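
nfs42_layoutstat_release() clears NFS_INO_LAYOUTSTATS only after the devinfo array has been handed back to the layout driver and freed, which keeps at most one LAYOUTSTATS RPC in flight per inode; the submit side takes the bit with test_and_set_bit() (see pnfs_report_layoutstat further down). A condensed sketch of that gate, with illustrative helper names that do not appear in the commit:

#include <linux/nfs_fs.h>

/* submit side: claim the per-inode slot; false means a report is already out */
static bool layoutstats_begin(struct nfs_inode *nfsi)
{
        return !test_and_set_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags);
}

/* release side: drop the slot once the RPC and its devinfo are gone */
static void layoutstats_end(struct nfs_inode *nfsi)
{
        smp_mb__before_atomic();
        clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags);
        smp_mb__after_atomic();
}
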
index 1a25b27248f2ff5fd0026b00a3032589ee8cdff5..a6bd27da6286f9fee14f0b087eddcc1ec437cdde 100644 (file)
@@ -4,6 +4,8 @@
 #ifndef __LINUX_FS_NFS_NFS4_2XDR_H
 #define __LINUX_FS_NFS_NFS4_2XDR_H
 
+#include "nfs42.h"
+
 #define encode_fallocate_maxsz         (encode_stateid_maxsz + \
                                         2 /* offset */ + \
                                         2 /* length */)
                                         1 /* whence */ + \
                                         2 /* offset */ + \
                                         2 /* length */)
+#define encode_io_info_maxsz           4
+#define encode_layoutstats_maxsz       (op_decode_hdr_maxsz + \
+                                       2 /* offset */ + \
+                                       2 /* length */ + \
+                                       encode_stateid_maxsz + \
+                                       encode_io_info_maxsz + \
+                                       encode_io_info_maxsz + \
+                                       1 /* opaque devaddr4 length */ + \
+                                       XDR_QUADLEN(PNFS_LAYOUTSTATS_MAXSIZE))
+#define decode_layoutstats_maxsz       (op_decode_hdr_maxsz)
 
 #define NFS4_enc_allocate_sz           (compound_encode_hdr_maxsz + \
                                         encode_putfh_maxsz + \
 #define NFS4_dec_seek_sz               (compound_decode_hdr_maxsz + \
                                         decode_putfh_maxsz + \
                                         decode_seek_maxsz)
+#define NFS4_enc_layoutstats_sz                (compound_encode_hdr_maxsz + \
+                                        encode_sequence_maxsz + \
+                                        encode_putfh_maxsz + \
+                                        PNFS_LAYOUTSTATS_MAXDEV * encode_layoutstats_maxsz)
+#define NFS4_dec_layoutstats_sz                (compound_decode_hdr_maxsz + \
+                                        decode_sequence_maxsz + \
+                                        decode_putfh_maxsz + \
+                                        PNFS_LAYOUTSTATS_MAXDEV * decode_layoutstats_maxsz)
 
 
 static void encode_fallocate(struct xdr_stream *xdr,
@@ -81,6 +101,33 @@ static void encode_seek(struct xdr_stream *xdr,
        encode_uint32(xdr, args->sa_what);
 }
 
+static void encode_layoutstats(struct xdr_stream *xdr,
+                              struct nfs42_layoutstat_args *args,
+                              struct nfs42_layoutstat_devinfo *devinfo,
+                              struct compound_hdr *hdr)
+{
+       __be32 *p;
+
+       encode_op_hdr(xdr, OP_LAYOUTSTATS, decode_layoutstats_maxsz, hdr);
+       p = reserve_space(xdr, 8 + 8);
+       p = xdr_encode_hyper(p, devinfo->offset);
+       p = xdr_encode_hyper(p, devinfo->length);
+       encode_nfs4_stateid(xdr, &args->stateid);
+       p = reserve_space(xdr, 4*8 + NFS4_DEVICEID4_SIZE + 4);
+       p = xdr_encode_hyper(p, devinfo->read_count);
+       p = xdr_encode_hyper(p, devinfo->read_bytes);
+       p = xdr_encode_hyper(p, devinfo->write_count);
+       p = xdr_encode_hyper(p, devinfo->write_bytes);
+       p = xdr_encode_opaque_fixed(p, devinfo->dev_id.data,
+                       NFS4_DEVICEID4_SIZE);
+       /* Encode layoutupdate4 */
+       *p++ = cpu_to_be32(devinfo->layout_type);
+       if (devinfo->layoutstats_encode != NULL)
+               devinfo->layoutstats_encode(xdr, args, devinfo);
+       else
+               encode_uint32(xdr, 0);
+}
+
 /*
  * Encode ALLOCATE request
  */
@@ -137,6 +184,28 @@ static void nfs4_xdr_enc_seek(struct rpc_rqst *req,
        encode_nops(&hdr);
 }
 
+/*
+ * Encode LAYOUTSTATS request
+ */
+static void nfs4_xdr_enc_layoutstats(struct rpc_rqst *req,
+                                    struct xdr_stream *xdr,
+                                    struct nfs42_layoutstat_args *args)
+{
+       int i;
+
+       struct compound_hdr hdr = {
+               .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+       };
+
+       encode_compound_hdr(xdr, req, &hdr);
+       encode_sequence(xdr, &args->seq_args, &hdr);
+       encode_putfh(xdr, args->fh, &hdr);
+       WARN_ON(args->num_dev > PNFS_LAYOUTSTATS_MAXDEV);
+       for (i = 0; i < args->num_dev; i++)
+               encode_layoutstats(xdr, args, &args->devinfo[i], &hdr);
+       encode_nops(&hdr);
+}
+
 static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
 {
        return decode_op_hdr(xdr, OP_ALLOCATE);
@@ -169,6 +238,12 @@ static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res)
        return -EIO;
 }
 
+static int decode_layoutstats(struct xdr_stream *xdr,
+                             struct nfs42_layoutstat_res *res)
+{
+       return decode_op_hdr(xdr, OP_LAYOUTSTATS);
+}
+
 /*
  * Decode ALLOCATE request
  */
@@ -246,4 +321,35 @@ static int nfs4_xdr_dec_seek(struct rpc_rqst *rqstp,
 out:
        return status;
 }
+
+/*
+ * Decode LAYOUTSTATS request
+ */
+static int nfs4_xdr_dec_layoutstats(struct rpc_rqst *rqstp,
+                                   struct xdr_stream *xdr,
+                                   struct nfs42_layoutstat_res *res)
+{
+       struct compound_hdr hdr;
+       int status, i;
+
+       status = decode_compound_hdr(xdr, &hdr);
+       if (status)
+               goto out;
+       status = decode_sequence(xdr, &res->seq_res, rqstp);
+       if (status)
+               goto out;
+       status = decode_putfh(xdr);
+       if (status)
+               goto out;
+       WARN_ON(res->num_dev > PNFS_LAYOUTSTATS_MAXDEV);
+       for (i = 0; i < res->num_dev; i++) {
+               status = decode_layoutstats(xdr, res);
+               if (status)
+                       goto out;
+       }
+out:
+       res->rpc_status = status;
+       return status;
+}
+
 #endif /* __LINUX_FS_NFS_NFS4_2XDR_H */
index fdef424b0cd3c6120ddc7d9e379e49647b7b24fb..ea3bee919a765840a267f8fc59ccdec4ef61f676 100644 (file)
@@ -233,6 +233,7 @@ extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception
 extern int nfs4_call_sync(struct rpc_clnt *, struct nfs_server *,
                          struct rpc_message *, struct nfs4_sequence_args *,
                          struct nfs4_sequence_res *, int);
+extern void nfs4_init_sequence(struct nfs4_sequence_args *, struct nfs4_sequence_res *, int);
 extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
 extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
 extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *, bool);
index e42be52a8c18d8121c934a5ac79483e77166206c..3aa6a9ba51136f31f30dea29d60dded106b05241 100644 (file)
@@ -676,7 +676,6 @@ int nfs41_walk_client_list(struct nfs_client *new,
                break;
        }
 
-       /* No matching nfs_client found. */
        spin_unlock(&nn->nfs_client_lock);
        dprintk("NFS: <-- %s status = %d\n", __func__, status);
        nfs_put_client(prev);
index f58c17b3b480367c6322359ae7ca4b33b3695348..dcd39d4e2efebd78eed64d4df00fd2745f747027 100644 (file)
@@ -41,6 +41,10 @@ nfs4_file_open(struct inode *inode, struct file *filp)
 
        dprintk("NFS: open file(%pd2)\n", dentry);
 
+       err = nfs_check_flags(openflags);
+       if (err)
+               return err;
+
        if ((openflags & O_ACCMODE) == 3)
                openflags--;
 
index c0b3a16b4a00806f79ea9eb28b6933a8d94bcc52..039b3eb6d83404f33224961465406d52203ff570 100644 (file)
@@ -35,13 +35,6 @@ int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_p
                goto out;
        }
 
-       if (fsinfo.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) {
-               printk(KERN_ERR "nfs4_get_rootfh:"
-                      " getroot obtained referral\n");
-               ret = -EREMOTE;
-               goto out;
-       }
-
        memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid));
 out:
        nfs_free_fattr(fsinfo.fattr);
index 2e1737c40a29837488823e04f27db975e0a51b9b..535dfc69c628f825cc4422339406b1365f66e8d4 100644 (file)
@@ -494,12 +494,7 @@ nfs_idmap_delete(struct nfs_client *clp)
 
 int nfs_idmap_init(void)
 {
-       int ret;
-       ret = nfs_idmap_init_keyring();
-       if (ret != 0)
-               goto out;
-out:
-       return ret;
+       return nfs_idmap_init_keyring();
 }
 
 void nfs_idmap_quit(void)
index 55e1e3af23a3d3f2313f977b185eb8c3f8ccbc6d..6f228b5af819ea576240c40869c1da74d823e460 100644 (file)
@@ -356,6 +356,9 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_
                case 0:
                        return 0;
                case -NFS4ERR_OPENMODE:
+               case -NFS4ERR_DELEG_REVOKED:
+               case -NFS4ERR_ADMIN_REVOKED:
+               case -NFS4ERR_BAD_STATEID:
                        if (inode && nfs4_have_delegation(inode, FMODE_READ)) {
                                nfs4_inode_return_delegation(inode);
                                exception->retry = 1;
@@ -367,15 +370,6 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_
                        if (ret < 0)
                                break;
                        goto wait_on_recovery;
-               case -NFS4ERR_DELEG_REVOKED:
-               case -NFS4ERR_ADMIN_REVOKED:
-               case -NFS4ERR_BAD_STATEID:
-                       if (state == NULL)
-                               break;
-                       ret = nfs4_schedule_stateid_recovery(server, state);
-                       if (ret < 0)
-                               break;
-                       goto wait_on_recovery;
                case -NFS4ERR_EXPIRED:
                        if (state != NULL) {
                                ret = nfs4_schedule_stateid_recovery(server, state);
@@ -482,8 +476,8 @@ struct nfs4_call_sync_data {
        struct nfs4_sequence_res *seq_res;
 };
 
-static void nfs4_init_sequence(struct nfs4_sequence_args *args,
-                              struct nfs4_sequence_res *res, int cache_reply)
+void nfs4_init_sequence(struct nfs4_sequence_args *args,
+                       struct nfs4_sequence_res *res, int cache_reply)
 {
        args->sa_slot = NULL;
        args->sa_cache_this = cache_reply;
@@ -1553,6 +1547,13 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmod
        struct nfs4_state *newstate;
        int ret;
 
+       if ((opendata->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR ||
+            opendata->o_arg.claim == NFS4_OPEN_CLAIM_DELEG_CUR_FH) &&
+           (opendata->o_arg.u.delegation_type & fmode) != fmode)
+               /* This mode can't have been delegated, so we must have
+                * a valid open_stateid to cover it - no need to reclaim.
+                */
+               return 0;
        opendata->o_arg.open_flags = 0;
        opendata->o_arg.fmode = fmode;
        opendata->o_arg.share_access = nfs4_map_atomic_open_share(
@@ -1684,6 +1685,7 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
                                        "%d.\n", __func__, err);
                case 0:
                case -ENOENT:
+               case -EAGAIN:
                case -ESTALE:
                        break;
                case -NFS4ERR_BADSESSION:
@@ -3355,6 +3357,8 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
                        goto out;
                case -NFS4ERR_MOVED:
                        err = nfs4_get_referral(client, dir, name, fattr, fhandle);
+                       if (err == -NFS4ERR_MOVED)
+                               err = nfs4_handle_exception(NFS_SERVER(dir), err, &exception);
                        goto out;
                case -NFS4ERR_WRONGSEC:
                        err = -EPERM;
@@ -4955,49 +4959,128 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp,
        memcpy(bootverf->data, verf, sizeof(bootverf->data));
 }
 
-static unsigned int
-nfs4_init_nonuniform_client_string(struct nfs_client *clp,
-                                  char *buf, size_t len)
+static int
+nfs4_init_nonuniform_client_string(struct nfs_client *clp)
 {
-       unsigned int result;
+       int result;
+       size_t len;
+       char *str;
+       bool retried = false;
 
        if (clp->cl_owner_id != NULL)
-               return strlcpy(buf, clp->cl_owner_id, len);
+               return 0;
+retry:
+       rcu_read_lock();
+       len = 10 + strlen(clp->cl_ipaddr) + 1 +
+               strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)) +
+               1 +
+               strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)) +
+               1;
+       rcu_read_unlock();
+
+       if (len > NFS4_OPAQUE_LIMIT + 1)
+               return -EINVAL;
+
+       /*
+        * Since this string is allocated at mount time, and held until the
+        * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying
+        * about a memory-reclaim deadlock.
+        */
+       str = kmalloc(len, GFP_KERNEL);
+       if (!str)
+               return -ENOMEM;
 
        rcu_read_lock();
-       result = scnprintf(buf, len, "Linux NFSv4.0 %s/%s %s",
-                               clp->cl_ipaddr,
-                               rpc_peeraddr2str(clp->cl_rpcclient,
-                                                       RPC_DISPLAY_ADDR),
-                               rpc_peeraddr2str(clp->cl_rpcclient,
-                                                       RPC_DISPLAY_PROTO));
+       result = scnprintf(str, len, "Linux NFSv4.0 %s/%s %s",
+                       clp->cl_ipaddr,
+                       rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR),
+                       rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO));
        rcu_read_unlock();
-       clp->cl_owner_id = kstrdup(buf, GFP_KERNEL);
-       return result;
+
+       /* Did something change? */
+       if (result >= len) {
+               kfree(str);
+               if (retried)
+                       return -EINVAL;
+               retried = true;
+               goto retry;
+       }
+       clp->cl_owner_id = str;
+       return 0;
 }
 
-static unsigned int
-nfs4_init_uniform_client_string(struct nfs_client *clp,
-                               char *buf, size_t len)
+static int
+nfs4_init_uniquifier_client_string(struct nfs_client *clp)
+{
+       int result;
+       size_t len;
+       char *str;
+
+       len = 10 + 10 + 1 + 10 + 1 +
+               strlen(nfs4_client_id_uniquifier) + 1 +
+               strlen(clp->cl_rpcclient->cl_nodename) + 1;
+
+       if (len > NFS4_OPAQUE_LIMIT + 1)
+               return -EINVAL;
+
+       /*
+        * Since this string is allocated at mount time, and held until the
+        * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying
+        * about a memory-reclaim deadlock.
+        */
+       str = kmalloc(len, GFP_KERNEL);
+       if (!str)
+               return -ENOMEM;
+
+       result = scnprintf(str, len, "Linux NFSv%u.%u %s/%s",
+                       clp->rpc_ops->version, clp->cl_minorversion,
+                       nfs4_client_id_uniquifier,
+                       clp->cl_rpcclient->cl_nodename);
+       if (result >= len) {
+               kfree(str);
+               return -EINVAL;
+       }
+       clp->cl_owner_id = str;
+       return 0;
+}
+
+static int
+nfs4_init_uniform_client_string(struct nfs_client *clp)
 {
-       const char *nodename = clp->cl_rpcclient->cl_nodename;
-       unsigned int result;
+       int result;
+       size_t len;
+       char *str;
 
        if (clp->cl_owner_id != NULL)
-               return strlcpy(buf, clp->cl_owner_id, len);
+               return 0;
 
        if (nfs4_client_id_uniquifier[0] != '\0')
-               result = scnprintf(buf, len, "Linux NFSv%u.%u %s/%s",
-                               clp->rpc_ops->version,
-                               clp->cl_minorversion,
-                               nfs4_client_id_uniquifier,
-                               nodename);
-       else
-               result = scnprintf(buf, len, "Linux NFSv%u.%u %s",
-                               clp->rpc_ops->version, clp->cl_minorversion,
-                               nodename);
-       clp->cl_owner_id = kstrdup(buf, GFP_KERNEL);
-       return result;
+               return nfs4_init_uniquifier_client_string(clp);
+
+       len = 10 + 10 + 1 + 10 + 1 +
+               strlen(clp->cl_rpcclient->cl_nodename) + 1;
+
+       if (len > NFS4_OPAQUE_LIMIT + 1)
+               return -EINVAL;
+
+       /*
+        * Since this string is allocated at mount time, and held until the
+        * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying
+        * about a memory-reclaim deadlock.
+        */
+       str = kmalloc(len, GFP_KERNEL);
+       if (!str)
+               return -ENOMEM;
+
+       result = scnprintf(str, len, "Linux NFSv%u.%u %s",
+                       clp->rpc_ops->version, clp->cl_minorversion,
+                       clp->cl_rpcclient->cl_nodename);
+       if (result >= len) {
+               kfree(str);
+               return -EINVAL;
+       }
+       clp->cl_owner_id = str;
+       return 0;
 }
 
 /*
@@ -5044,7 +5127,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
        struct nfs4_setclientid setclientid = {
                .sc_verifier = &sc_verifier,
                .sc_prog = program,
-               .sc_cb_ident = clp->cl_cb_ident,
+               .sc_clnt = clp,
        };
        struct rpc_message msg = {
                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID],
@@ -5064,16 +5147,15 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 
        /* nfs_client_id4 */
        nfs4_init_boot_verifier(clp, &sc_verifier);
+
        if (test_bit(NFS_CS_MIGRATION, &clp->cl_flags))
-               setclientid.sc_name_len =
-                               nfs4_init_uniform_client_string(clp,
-                                               setclientid.sc_name,
-                                               sizeof(setclientid.sc_name));
+               status = nfs4_init_uniform_client_string(clp);
        else
-               setclientid.sc_name_len =
-                               nfs4_init_nonuniform_client_string(clp,
-                                               setclientid.sc_name,
-                                               sizeof(setclientid.sc_name));
+               status = nfs4_init_nonuniform_client_string(clp);
+
+       if (status)
+               goto out;
+
        /* cb_client4 */
        setclientid.sc_netid_len =
                                nfs4_init_callback_netid(clp,
@@ -5083,9 +5165,9 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
                                sizeof(setclientid.sc_uaddr), "%s.%u.%u",
                                clp->cl_ipaddr, port >> 8, port & 255);
 
-       dprintk("NFS call  setclientid auth=%s, '%.*s'\n",
+       dprintk("NFS call  setclientid auth=%s, '%s'\n",
                clp->cl_rpcclient->cl_auth->au_ops->au_name,
-               setclientid.sc_name_len, setclientid.sc_name);
+               clp->cl_owner_id);
        task = rpc_run_task(&task_setup_data);
        if (IS_ERR(task)) {
                status = PTR_ERR(task);
@@ -5402,6 +5484,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
        atomic_inc(&lsp->ls_count);
        /* Ensure we don't close file until we're done freeing locks! */
        p->ctx = get_nfs_open_context(ctx);
+       get_file(fl->fl_file);
        memcpy(&p->fl, fl, sizeof(p->fl));
        p->server = NFS_SERVER(inode);
        return p;
@@ -5413,6 +5496,7 @@ static void nfs4_locku_release_calldata(void *data)
        nfs_free_seqid(calldata->arg.seqid);
        nfs4_put_lock_state(calldata->lsp);
        put_nfs_open_context(calldata->ctx);
+       fput(calldata->fl.fl_file);
        kfree(calldata);
 }
 
@@ -6846,11 +6930,14 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
        };
 
        nfs4_init_boot_verifier(clp, &verifier);
-       args.id_len = nfs4_init_uniform_client_string(clp, args.id,
-                                                       sizeof(args.id));
-       dprintk("NFS call  exchange_id auth=%s, '%.*s'\n",
+
+       status = nfs4_init_uniform_client_string(clp);
+       if (status)
+               goto out;
+
+       dprintk("NFS call  exchange_id auth=%s, '%s'\n",
                clp->cl_rpcclient->cl_auth->au_ops->au_name,
-               args.id_len, args.id);
+               clp->cl_owner_id);
 
        res.server_owner = kzalloc(sizeof(struct nfs41_server_owner),
                                        GFP_NOFS);
@@ -6885,7 +6972,7 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
                /* unsupported! */
                WARN_ON_ONCE(1);
                status = -EINVAL;
-               goto out_server_scope;
+               goto out_impl_id;
        }
 
        status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
@@ -6913,6 +7000,7 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
                /* use the most recent implementation id */
                kfree(clp->cl_implid);
                clp->cl_implid = res.impl_id;
+               res.impl_id = NULL;
 
                if (clp->cl_serverscope != NULL &&
                    !nfs41_same_server_scope(clp->cl_serverscope,
@@ -6926,15 +7014,16 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
 
                if (clp->cl_serverscope == NULL) {
                        clp->cl_serverscope = res.server_scope;
-                       goto out;
+                       res.server_scope = NULL;
                }
-       } else
-               kfree(res.impl_id);
+       }
 
-out_server_owner:
-       kfree(res.server_owner);
+out_impl_id:
+       kfree(res.impl_id);
 out_server_scope:
        kfree(res.server_scope);
+out_server_owner:
+       kfree(res.server_owner);
 out:
        if (clp->cl_implid != NULL)
                dprintk("NFS reply exchange_id: Server Implementation ID: "
@@ -8061,9 +8150,8 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
        struct rpc_task *task;
        int status = 0;
 
-       dprintk("NFS: %4d initiating layoutcommit call. sync %d "
-               "lbw: %llu inode %lu\n",
-               data->task.tk_pid, sync,
+       dprintk("NFS: initiating layoutcommit call. sync %d "
+               "lbw: %llu inode %lu\n", sync,
                data->args.lastbytewritten,
                data->args.inode->i_ino);
 
@@ -8557,7 +8645,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
                | NFS_CAP_ATOMIC_OPEN_V1
                | NFS_CAP_ALLOCATE
                | NFS_CAP_DEALLOCATE
-               | NFS_CAP_SEEK,
+               | NFS_CAP_SEEK
+               | NFS_CAP_LAYOUTSTATS,
        .init_client = nfs41_init_client,
        .shutdown_client = nfs41_shutdown_client,
        .match_stateid = nfs41_match_stateid,
index 2782cfca22650922e012a4f86a1755e3cca68243..605840dc89cf9e28c173659af201aab109f9328d 100644 (file)
@@ -309,7 +309,6 @@ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
 
        if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state))
                goto do_confirm;
-       nfs4_begin_drain_session(clp);
        status = nfs4_proc_exchange_id(clp, cred);
        if (status != 0)
                goto out;
@@ -1482,6 +1481,8 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs
                                        spin_unlock(&state->state_lock);
                                }
                                nfs4_put_open_state(state);
+                               clear_bit(NFS4CLNT_RECLAIM_NOGRACE,
+                                       &state->flags);
                                spin_lock(&sp->so_lock);
                                goto restart;
                        }
@@ -1830,6 +1831,7 @@ static int nfs4_establish_lease(struct nfs_client *clp)
                clp->cl_mvops->reboot_recovery_ops;
        int status;
 
+       nfs4_begin_drain_session(clp);
        cred = nfs4_get_clid_cred(clp);
        if (cred == NULL)
                return -ENOENT;
index 0aea97841d3038b56056d0d7fcd0dcddb11f584e..558cd65dbdb752d111b5b85649b72bae36fdf040 100644 (file)
@@ -139,7 +139,8 @@ static int nfs4_stat_to_errno(int);
 #define encode_setclientid_maxsz \
                                (op_encode_hdr_maxsz + \
                                XDR_QUADLEN(NFS4_VERIFIER_SIZE) + \
-                               XDR_QUADLEN(NFS4_SETCLIENTID_NAMELEN) + \
+                               /* client name */ \
+                               1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
                                1 /* sc_prog */ + \
                                1 + XDR_QUADLEN(RPCBIND_MAXNETIDLEN) + \
                                1 + XDR_QUADLEN(RPCBIND_MAXUADDRLEN) + \
@@ -288,7 +289,8 @@ static int nfs4_stat_to_errno(int);
 #define encode_exchange_id_maxsz (op_encode_hdr_maxsz + \
                                encode_verifier_maxsz + \
                                1 /* co_ownerid.len */ + \
-                               XDR_QUADLEN(NFS4_EXCHANGE_ID_LEN) + \
+                               /* eia_clientowner */ \
+                               1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
                                1 /* flags */ + \
                                1 /* spa_how */ + \
                                /* max is SP4_MACH_CRED (for now) */ + \
@@ -1667,13 +1669,14 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
        encode_op_hdr(xdr, OP_SETCLIENTID, decode_setclientid_maxsz, hdr);
        encode_nfs4_verifier(xdr, setclientid->sc_verifier);
 
-       encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name);
+       encode_string(xdr, strlen(setclientid->sc_clnt->cl_owner_id),
+                       setclientid->sc_clnt->cl_owner_id);
        p = reserve_space(xdr, 4);
        *p = cpu_to_be32(setclientid->sc_prog);
        encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid);
        encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
        p = reserve_space(xdr, 4);
-       *p = cpu_to_be32(setclientid->sc_cb_ident);
+       *p = cpu_to_be32(setclientid->sc_clnt->cl_cb_ident);
 }
 
 static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr)
@@ -1747,7 +1750,8 @@ static void encode_exchange_id(struct xdr_stream *xdr,
        encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr);
        encode_nfs4_verifier(xdr, args->verifier);
 
-       encode_string(xdr, args->id_len, args->id);
+       encode_string(xdr, strlen(args->client->cl_owner_id),
+                       args->client->cl_owner_id);
 
        encode_uint32(xdr, args->flags);
        encode_uint32(xdr, args->state_protect.how);
@@ -7427,6 +7431,7 @@ struct rpc_procinfo       nfs4_procedures[] = {
        PROC(SEEK,              enc_seek,               dec_seek),
        PROC(ALLOCATE,          enc_allocate,           dec_allocate),
        PROC(DEALLOCATE,        enc_deallocate,         dec_deallocate),
+       PROC(LAYOUTSTATS,       enc_layoutstats,        dec_layoutstats),
 #endif /* CONFIG_NFS_V4_2 */
 };
 
index 282b3936951060a2c8a6216c4c690e166a5fcb8d..1da68d3b1edabdb78c60527f502af5f40d6cf69b 100644 (file)
@@ -636,9 +636,8 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
 
        hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how);
 
-       dprintk("NFS: %5u initiated pgio call "
+       dprintk("NFS: initiated pgio call "
                "(req %s/%llu, %u bytes @ offset %llu)\n",
-               hdr->task.tk_pid,
                hdr->inode->i_sb->s_id,
                (unsigned long long)NFS_FILEID(hdr->inode),
                hdr->args.count,
@@ -690,8 +689,6 @@ static int nfs_pgio_error(struct nfs_pageio_descriptor *desc,
 static void nfs_pgio_release(void *calldata)
 {
        struct nfs_pgio_header *hdr = calldata;
-       if (hdr->rw_ops->rw_release)
-               hdr->rw_ops->rw_release(hdr);
        nfs_pgio_data_destroy(hdr);
        hdr->completion_ops->completion(hdr);
 }
@@ -711,7 +708,9 @@ static void nfs_pageio_mirror_init(struct nfs_pgio_mirror *mirror,
  * nfs_pageio_init - initialise a page io descriptor
  * @desc: pointer to descriptor
  * @inode: pointer to inode
- * @doio: pointer to io function
+ * @pg_ops: pointer to pageio operations
+ * @compl_ops: pointer to pageio completion operations
+ * @rw_ops: pointer to nfs read/write operations
  * @bsize: io block size
  * @io_flags: extra parameters for the io function
  */
@@ -1186,6 +1185,7 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
  * nfs_pageio_complete_mirror - Complete I/O on the current mirror of an
  *                             nfs_pageio_descriptor
  * @desc: pointer to io descriptor
+ * @mirror_idx: index of the mirror to complete I/O on
  */
 static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc,
                                       u32 mirror_idx)
index 230606243be6ad079733e583d173b14d3baeda55..0ba9a02c95664960f8c0f46ea97249bd8653fe16 100644 (file)
@@ -35,6 +35,7 @@
 #include "iostat.h"
 #include "nfs4trace.h"
 #include "delegation.h"
+#include "nfs42.h"
 
 #define NFSDBG_FACILITY                NFSDBG_PNFS
 #define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)
@@ -1821,6 +1822,7 @@ int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr)
        /* Resend all requests through the MDS */
        nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true,
                              hdr->completion_ops);
+       set_bit(NFS_CONTEXT_RESEND_WRITES, &hdr->args.context->flags);
        return nfs_pageio_resend(&pgio, hdr);
 }
 EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);
@@ -1865,6 +1867,7 @@ pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
                mirror->pg_recoalesce = 1;
        }
        nfs_pgio_data_destroy(hdr);
+       hdr->release(hdr);
 }
 
 static enum pnfs_try_status
@@ -1979,6 +1982,7 @@ pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
                mirror->pg_recoalesce = 1;
        }
        nfs_pgio_data_destroy(hdr);
+       hdr->release(hdr);
 }
 
 /*
@@ -2247,3 +2251,63 @@ struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
        }
        return thp;
 }
+
+#if IS_ENABLED(CONFIG_NFS_V4_2)
+int
+pnfs_report_layoutstat(struct inode *inode)
+{
+       struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+       struct nfs_server *server = NFS_SERVER(inode);
+       struct nfs_inode *nfsi = NFS_I(inode);
+       struct nfs42_layoutstat_data *data;
+       struct pnfs_layout_hdr *hdr;
+       int status = 0;
+
+       if (!pnfs_enabled_sb(server) || !ld->prepare_layoutstats)
+               goto out;
+
+       if (!nfs_server_capable(inode, NFS_CAP_LAYOUTSTATS))
+               goto out;
+
+       if (test_and_set_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags))
+               goto out;
+
+       spin_lock(&inode->i_lock);
+       if (!NFS_I(inode)->layout) {
+               spin_unlock(&inode->i_lock);
+               goto out;
+       }
+       hdr = NFS_I(inode)->layout;
+       pnfs_get_layout_hdr(hdr);
+       spin_unlock(&inode->i_lock);
+
+       data = kzalloc(sizeof(*data), GFP_KERNEL);
+       if (!data) {
+               status = -ENOMEM;
+               goto out_put;
+       }
+
+       data->args.fh = NFS_FH(inode);
+       data->args.inode = inode;
+       nfs4_stateid_copy(&data->args.stateid, &hdr->plh_stateid);
+       status = ld->prepare_layoutstats(&data->args);
+       if (status)
+               goto out_free;
+
+       status = nfs42_proc_layoutstats_generic(NFS_SERVER(inode), data);
+
+out:
+       dprintk("%s returns %d\n", __func__, status);
+       return status;
+
+out_free:
+       kfree(data);
+out_put:
+       pnfs_put_layout_hdr(hdr);
+       smp_mb__before_atomic();
+       clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags);
+       smp_mb__after_atomic();
+       goto out;
+}
+EXPORT_SYMBOL_GPL(pnfs_report_layoutstat);
+#endif
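
Putting the pieces together: a read/write or commit completion path could combine a per-mirror interval gate like the one sketched earlier with the new pnfs_report_layoutstat() entry point roughly as below. This wrapper is illustrative only and not part of the commit; errors are deliberately ignored because reporting is best effort.

static void ff_maybe_report_layoutstat(struct inode *inode,
                                       struct nfs4_ff_layout_mirror *mirror)
{
        bool due;

        spin_lock(&mirror->lock);
        due = ff_layoutstats_report_due(mirror, ktime_get());
        spin_unlock(&mirror->lock);

        if (due)
                pnfs_report_layoutstat(inode);  /* async, best effort */
}
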
index 1e6308f82fc3d5887de850e72226233e4f2138d2..3e6ab7bfbabd428425227b6f9d2a94711edd371e 100644 (file)
@@ -178,6 +178,8 @@ struct pnfs_layoutdriver_type {
        void (*encode_layoutcommit) (struct pnfs_layout_hdr *lo,
                                     struct xdr_stream *xdr,
                                     const struct nfs4_layoutcommit_args *args);
+       int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args);
+       void (*cleanup_layoutstats) (struct nfs42_layoutstat_data *data);
 };
 
 struct pnfs_layout_hdr {
@@ -290,7 +292,6 @@ int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
 struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
 void pnfs_error_mark_layout_for_return(struct inode *inode,
                                       struct pnfs_layout_segment *lseg);
-
 /* nfs4_deviceid_flags */
 enum {
        NFS_DEVICEID_INVALID = 0,       /* set when MDS clientid recalled */
@@ -689,4 +690,14 @@ static inline void nfs4_pnfs_v3_ds_connect_unload(void)
 
 #endif /* CONFIG_NFS_V4_1 */
 
+#if IS_ENABLED(CONFIG_NFS_V4_2)
+int pnfs_report_layoutstat(struct inode *inode);
+#else
+static inline int
+pnfs_report_layoutstat(struct inode *inode)
+{
+       return 0;
+}
+#endif
+
 #endif /* FS_NFS_PNFS_H */
index e6c262555e08a62aff65ef3baa04e9666e9f18c2..65869ca9c851dbf4f0b289ca84865a018c2b6e57 100644 (file)
@@ -1290,6 +1290,7 @@ static void nfs_initiate_write(struct nfs_pgio_header *hdr,
 static void nfs_redirty_request(struct nfs_page *req)
 {
        nfs_mark_request_dirty(req);
+       set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags);
        nfs_unlock_request(req);
        nfs_end_page_writeback(req);
        nfs_release_request(req);
@@ -1348,11 +1349,6 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata)
        NFS_PROTO(data->inode)->commit_rpc_prepare(task, data);
 }
 
-static void nfs_writeback_release_common(struct nfs_pgio_header *hdr)
-{
-       /* do nothing! */
-}
-
 /*
  * Special version of should_remove_suid() that ignores capabilities.
  */
@@ -1556,7 +1552,7 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
        /* Set up the initial task struct.  */
        nfs_ops->commit_setup(data, &msg);
 
-       dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
+       dprintk("NFS: initiated commit call\n");
 
        nfs4_state_protect(NFS_SERVER(data->inode)->nfs_client,
                NFS_SP4_MACH_CRED_COMMIT, &task_setup_data.rpc_client, &msg);
@@ -2013,7 +2009,6 @@ static const struct nfs_rw_ops nfs_rw_write_ops = {
        .rw_mode                = FMODE_WRITE,
        .rw_alloc_header        = nfs_writehdr_alloc,
        .rw_free_header         = nfs_writehdr_free,
-       .rw_release             = nfs_writeback_release_common,
        .rw_done                = nfs_writeback_done,
        .rw_result              = nfs_writeback_result,
        .rw_initiate            = nfs_initiate_write,
index 4506486974336d6451abc0658de78c0c06bdec9a..5b1e2a497e5114c26e556830f9f891c36130ffc7 100644 (file)
@@ -26,7 +26,7 @@
 #include <linux/fs.h> /* struct inode */
 #include <linux/fsnotify_backend.h>
 #include <linux/idr.h>
-#include <linux/init.h> /* module_init */
+#include <linux/init.h> /* fs_initcall */
 #include <linux/inotify.h>
 #include <linux/kernel.h> /* roundup() */
 #include <linux/namei.h> /* LOOKUP_FOLLOW */
@@ -812,4 +812,4 @@ static int __init inotify_user_setup(void)
 
        return 0;
 }
-module_init(inotify_user_setup);
+fs_initcall(inotify_user_setup);
index 907870e81a72e36f4c5abb29fd34e23b4307efc5..70e9af5516004d20188ca0757110fe5bf203d19f 100644 (file)
@@ -23,6 +23,7 @@ struct ovl_cache_entry {
        u64 ino;
        struct list_head l_node;
        struct rb_node node;
+       struct ovl_cache_entry *next_maybe_whiteout;
        bool is_whiteout;
        char name[];
 };
@@ -39,7 +40,7 @@ struct ovl_readdir_data {
        struct rb_root root;
        struct list_head *list;
        struct list_head middle;
-       struct dentry *dir;
+       struct ovl_cache_entry *first_maybe_whiteout;
        int count;
        int err;
 };
@@ -79,7 +80,7 @@ static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
        return NULL;
 }
 
-static struct ovl_cache_entry *ovl_cache_entry_new(struct dentry *dir,
+static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
                                                   const char *name, int len,
                                                   u64 ino, unsigned int d_type)
 {
@@ -98,29 +99,8 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct dentry *dir,
        p->is_whiteout = false;
 
        if (d_type == DT_CHR) {
-               struct dentry *dentry;
-               const struct cred *old_cred;
-               struct cred *override_cred;
-
-               override_cred = prepare_creds();
-               if (!override_cred) {
-                       kfree(p);
-                       return NULL;
-               }
-
-               /*
-                * CAP_DAC_OVERRIDE for lookup
-                */
-               cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
-               old_cred = override_creds(override_cred);
-
-               dentry = lookup_one_len(name, dir, len);
-               if (!IS_ERR(dentry)) {
-                       p->is_whiteout = ovl_is_whiteout(dentry);
-                       dput(dentry);
-               }
-               revert_creds(old_cred);
-               put_cred(override_cred);
+               p->next_maybe_whiteout = rdd->first_maybe_whiteout;
+               rdd->first_maybe_whiteout = p;
        }
        return p;
 }
@@ -148,7 +128,7 @@ static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
                        return 0;
        }
 
-       p = ovl_cache_entry_new(rdd->dir, name, len, ino, d_type);
+       p = ovl_cache_entry_new(rdd, name, len, ino, d_type);
        if (p == NULL)
                return -ENOMEM;
 
@@ -169,7 +149,7 @@ static int ovl_fill_lower(struct ovl_readdir_data *rdd,
        if (p) {
                list_move_tail(&p->l_node, &rdd->middle);
        } else {
-               p = ovl_cache_entry_new(rdd->dir, name, namelen, ino, d_type);
+               p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type);
                if (p == NULL)
                        rdd->err = -ENOMEM;
                else
@@ -219,6 +199,43 @@ static int ovl_fill_merge(struct dir_context *ctx, const char *name,
                return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type);
 }
 
+static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
+{
+       int err;
+       struct ovl_cache_entry *p;
+       struct dentry *dentry;
+       const struct cred *old_cred;
+       struct cred *override_cred;
+
+       override_cred = prepare_creds();
+       if (!override_cred)
+               return -ENOMEM;
+
+       /*
+        * CAP_DAC_OVERRIDE for lookup
+        */
+       cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
+       old_cred = override_creds(override_cred);
+
+       err = mutex_lock_killable(&dir->d_inode->i_mutex);
+       if (!err) {
+               while (rdd->first_maybe_whiteout) {
+                       p = rdd->first_maybe_whiteout;
+                       rdd->first_maybe_whiteout = p->next_maybe_whiteout;
+                       dentry = lookup_one_len(p->name, dir, p->len);
+                       if (!IS_ERR(dentry)) {
+                               p->is_whiteout = ovl_is_whiteout(dentry);
+                               dput(dentry);
+                       }
+               }
+               mutex_unlock(&dir->d_inode->i_mutex);
+       }
+       revert_creds(old_cred);
+       put_cred(override_cred);
+
+       return err;
+}
+
 static inline int ovl_dir_read(struct path *realpath,
                               struct ovl_readdir_data *rdd)
 {
@@ -229,7 +246,7 @@ static inline int ovl_dir_read(struct path *realpath,
        if (IS_ERR(realfile))
                return PTR_ERR(realfile);
 
-       rdd->dir = realpath->dentry;
+       rdd->first_maybe_whiteout = NULL;
        rdd->ctx.pos = 0;
        do {
                rdd->count = 0;
@@ -238,6 +255,10 @@ static inline int ovl_dir_read(struct path *realpath,
                if (err >= 0)
                        err = rdd->err;
        } while (!err && rdd->count);
+
+       if (!err && rdd->first_maybe_whiteout)
+               err = ovl_check_whiteouts(realpath->dentry, rdd);
+
        fput(realfile);
 
        return err;
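In the readdir.c change above, entries that might be whiteouts (d_type == DT_CHR) are no longer looked up from inside the iterate_dir() callback; they are pushed onto an intrusive singly linked list (first_maybe_whiteout/next_maybe_whiteout) and resolved in one pass afterwards, so the credential override and the parent's i_mutex are taken once per directory rather than once per entry. The general shape of that deferral, reduced to a sketch (illustrative only, not from the patch):

        struct item {
                struct item *next;
                /* per-entry payload */
        };

        static void defer(struct item **head, struct item *it)
        {
                it->next = *head;       /* push while iterating */
                *head = it;
        }

        /* later, with the lock and credentials taken once:
         *     while (*head) { it = *head; *head = it->next; process(it); }
         */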
index bf8537c7f455207830046a50d67d394f86d37f4a..8a08c582bc22e400a16f395609200ed105d3933f 100644 (file)
@@ -273,10 +273,57 @@ static void ovl_dentry_release(struct dentry *dentry)
        }
 }
 
+static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags)
+{
+       struct ovl_entry *oe = dentry->d_fsdata;
+       unsigned int i;
+       int ret = 1;
+
+       for (i = 0; i < oe->numlower; i++) {
+               struct dentry *d = oe->lowerstack[i].dentry;
+
+               if (d->d_flags & DCACHE_OP_REVALIDATE) {
+                       ret = d->d_op->d_revalidate(d, flags);
+                       if (ret < 0)
+                               return ret;
+                       if (!ret) {
+                               if (!(flags & LOOKUP_RCU))
+                                       d_invalidate(d);
+                               return -ESTALE;
+                       }
+               }
+       }
+       return 1;
+}
+
+static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags)
+{
+       struct ovl_entry *oe = dentry->d_fsdata;
+       unsigned int i;
+       int ret = 1;
+
+       for (i = 0; i < oe->numlower; i++) {
+               struct dentry *d = oe->lowerstack[i].dentry;
+
+               if (d->d_flags & DCACHE_OP_WEAK_REVALIDATE) {
+                       ret = d->d_op->d_weak_revalidate(d, flags);
+                       if (ret <= 0)
+                               break;
+               }
+       }
+       return ret;
+}
+
 static const struct dentry_operations ovl_dentry_operations = {
        .d_release = ovl_dentry_release,
 };
 
+static const struct dentry_operations ovl_reval_dentry_operations = {
+       .d_release = ovl_dentry_release,
+       .d_revalidate = ovl_dentry_revalidate,
+       .d_weak_revalidate = ovl_dentry_weak_revalidate,
+};
+
 static struct ovl_entry *ovl_alloc_entry(unsigned int numlower)
 {
        size_t size = offsetof(struct ovl_entry, lowerstack[numlower]);
@@ -288,6 +335,20 @@ static struct ovl_entry *ovl_alloc_entry(unsigned int numlower)
        return oe;
 }
 
+static bool ovl_dentry_remote(struct dentry *dentry)
+{
+       return dentry->d_flags &
+               (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE);
+}
+
+static bool ovl_dentry_weird(struct dentry *dentry)
+{
+       return dentry->d_flags & (DCACHE_NEED_AUTOMOUNT |
+                                 DCACHE_MANAGE_TRANSIT |
+                                 DCACHE_OP_HASH |
+                                 DCACHE_OP_COMPARE);
+}
+
 static inline struct dentry *ovl_lookup_real(struct dentry *dir,
                                             struct qstr *name)
 {
@@ -303,6 +364,10 @@ static inline struct dentry *ovl_lookup_real(struct dentry *dir,
        } else if (!dentry->d_inode) {
                dput(dentry);
                dentry = NULL;
+       } else if (ovl_dentry_weird(dentry)) {
+               dput(dentry);
+               /* Don't support traversing automounts and other weirdness */
+               dentry = ERR_PTR(-EREMOTE);
        }
        return dentry;
 }
@@ -350,6 +415,11 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
                        goto out;
 
                if (this) {
+                       if (unlikely(ovl_dentry_remote(this))) {
+                               dput(this);
+                               err = -EREMOTE;
+                               goto out;
+                       }
                        if (ovl_is_whiteout(this)) {
                                dput(this);
                                this = NULL;
@@ -694,25 +764,6 @@ static void ovl_unescape(char *s)
        }
 }
 
-static bool ovl_is_allowed_fs_type(struct dentry *root)
-{
-       const struct dentry_operations *dop = root->d_op;
-
-       /*
-        * We don't support:
-        *  - automount filesystems
-        *  - filesystems with revalidate (FIXME for lower layer)
-        *  - filesystems with case insensitive names
-        */
-       if (dop &&
-           (dop->d_manage || dop->d_automount ||
-            dop->d_revalidate || dop->d_weak_revalidate ||
-            dop->d_compare || dop->d_hash)) {
-               return false;
-       }
-       return true;
-}
-
 static int ovl_mount_dir_noesc(const char *name, struct path *path)
 {
        int err = -EINVAL;
@@ -727,7 +778,7 @@ static int ovl_mount_dir_noesc(const char *name, struct path *path)
                goto out;
        }
        err = -EINVAL;
-       if (!ovl_is_allowed_fs_type(path->dentry)) {
+       if (ovl_dentry_weird(path->dentry)) {
                pr_err("overlayfs: filesystem on '%s' not supported\n", name);
                goto out_put;
        }
@@ -751,13 +802,21 @@ static int ovl_mount_dir(const char *name, struct path *path)
        if (tmp) {
                ovl_unescape(tmp);
                err = ovl_mount_dir_noesc(tmp, path);
+
+               if (!err)
+                       if (ovl_dentry_remote(path->dentry)) {
+                               pr_err("overlayfs: filesystem on '%s' not supported as upperdir\n",
+                                      tmp);
+                               path_put(path);
+                               err = -EINVAL;
+                       }
                kfree(tmp);
        }
        return err;
 }
 
 static int ovl_lower_dir(const char *name, struct path *path, long *namelen,
-                        int *stack_depth)
+                        int *stack_depth, bool *remote)
 {
        int err;
        struct kstatfs statfs;
@@ -774,6 +833,9 @@ static int ovl_lower_dir(const char *name, struct path *path, long *namelen,
        *namelen = max(*namelen, statfs.f_namelen);
        *stack_depth = max(*stack_depth, path->mnt->mnt_sb->s_stack_depth);
 
+       if (ovl_dentry_remote(path->dentry))
+               *remote = true;
+
        return 0;
 
 out_put:
@@ -827,6 +889,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
        unsigned int numlower;
        unsigned int stacklen = 0;
        unsigned int i;
+       bool remote = false;
        int err;
 
        err = -ENOMEM;
@@ -900,7 +963,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
        lower = lowertmp;
        for (numlower = 0; numlower < stacklen; numlower++) {
                err = ovl_lower_dir(lower, &stack[numlower],
-                                   &ufs->lower_namelen, &sb->s_stack_depth);
+                                   &ufs->lower_namelen, &sb->s_stack_depth,
+                                   &remote);
                if (err)
                        goto out_put_lowerpath;
 
@@ -958,7 +1022,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
        if (!ufs->upper_mnt)
                sb->s_flags |= MS_RDONLY;
 
-       sb->s_d_op = &ovl_dentry_operations;
+       if (remote)
+               sb->s_d_op = &ovl_reval_dentry_operations;
+       else
+               sb->s_d_op = &ovl_dentry_operations;
 
        err = -ENOMEM;
        oe = ovl_alloc_entry(numlower);
index 30f92cefaa721d040d5f1e2d92ec16429f8c00e3..9ebee53d3bf586ef80690fa778c0a3e030e410f1 100644 (file)
@@ -43,9 +43,9 @@ struct ceph_options {
        int flags;
        struct ceph_fsid fsid;
        struct ceph_entity_addr my_addr;
-       int mount_timeout;
-       int osd_idle_ttl;
-       int osd_keepalive_timeout;
+       unsigned long mount_timeout;            /* jiffies */
+       unsigned long osd_idle_ttl;             /* jiffies */
+       unsigned long osd_keepalive_timeout;    /* jiffies */
 
        /*
         * any type that can't be simply compared or doesn't need need
@@ -63,9 +63,9 @@ struct ceph_options {
 /*
  * defaults
  */
-#define CEPH_MOUNT_TIMEOUT_DEFAULT  60
-#define CEPH_OSD_KEEPALIVE_DEFAULT  5
-#define CEPH_OSD_IDLE_TTL_DEFAULT    60
+#define CEPH_MOUNT_TIMEOUT_DEFAULT     msecs_to_jiffies(60 * 1000)
+#define CEPH_OSD_KEEPALIVE_DEFAULT     msecs_to_jiffies(5 * 1000)
+#define CEPH_OSD_IDLE_TTL_DEFAULT      msecs_to_jiffies(60 * 1000)
 
 #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
 #define CEPH_MSG_MAX_MIDDLE_LEN        (16*1024*1024)
@@ -93,13 +93,9 @@ enum {
        CEPH_MOUNT_SHUTDOWN,
 };
 
-/*
- * subtract jiffies
- */
-static inline unsigned long time_sub(unsigned long a, unsigned long b)
+static inline unsigned long ceph_timeout_jiffies(unsigned long timeout)
 {
-       BUG_ON(time_after(b, a));
-       return (long)a - (long)b;
+       return timeout ?: MAX_SCHEDULE_TIMEOUT;
 }
 
 struct ceph_mds_client;
@@ -178,6 +174,7 @@ static inline int calc_pages_for(u64 off, u64 len)
 
 extern struct kmem_cache *ceph_inode_cachep;
 extern struct kmem_cache *ceph_cap_cachep;
+extern struct kmem_cache *ceph_cap_flush_cachep;
 extern struct kmem_cache *ceph_dentry_cachep;
 extern struct kmem_cache *ceph_file_cachep;
 
index 61b19c46bdb33d5fc2f4752df0c345350fa739e8..7506b485bb6d1d4cee0aff00cbe1132d55396071 100644 (file)
@@ -249,7 +249,7 @@ extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
                                 struct ceph_msg *msg);
 
 extern void osd_req_op_init(struct ceph_osd_request *osd_req,
-                                       unsigned int which, u16 opcode);
+                           unsigned int which, u16 opcode, u32 flags);
 
 extern void osd_req_op_raw_data_in_pages(struct ceph_osd_request *,
                                        unsigned int which,
index 48a1a7d100f190efae067afa7866b75e10eff9fe..48b49305716bd728e69477db6b430c34e7781746 100644 (file)
@@ -1,7 +1,11 @@
 #ifndef CEPH_CRUSH_CRUSH_H
 #define CEPH_CRUSH_CRUSH_H
 
-#include <linux/types.h>
+#ifdef __KERNEL__
+# include <linux/types.h>
+#else
+# include "crush_compat.h"
+#endif
 
 /*
  * CRUSH is a pseudo-random data distribution algorithm that
 #define CRUSH_MAGIC 0x00010000ul   /* for detecting algorithm revisions */
 
 #define CRUSH_MAX_DEPTH 10  /* max crush hierarchy depth */
+#define CRUSH_MAX_RULESET (1<<8)  /* max crush ruleset number */
+#define CRUSH_MAX_RULES CRUSH_MAX_RULESET  /* should be the same as max rulesets */
 
+#define CRUSH_MAX_DEVICE_WEIGHT (100u * 0x10000u)
+#define CRUSH_MAX_BUCKET_WEIGHT (65535u * 0x10000u)
 
 #define CRUSH_ITEM_UNDEF  0x7ffffffe  /* undefined result (internal use only) */
 #define CRUSH_ITEM_NONE   0x7fffffff  /* no result */
@@ -108,6 +116,15 @@ enum {
 };
 extern const char *crush_bucket_alg_name(int alg);
 
+/*
+ * although tree was a legacy algorithm, it has been buggy, so
+ * exclude it.
+ */
+#define CRUSH_LEGACY_ALLOWED_BUCKET_ALGS (     \
+               (1 << CRUSH_BUCKET_UNIFORM) |   \
+               (1 << CRUSH_BUCKET_LIST) |      \
+               (1 << CRUSH_BUCKET_STRAW))
+
 struct crush_bucket {
        __s32 id;        /* this'll be negative */
        __u16 type;      /* non-zero; type=0 is reserved for devices */
@@ -174,7 +191,7 @@ struct crush_map {
        /* choose local attempts using a fallback permutation before
         * re-descent */
        __u32 choose_local_fallback_tries;
-       /* choose attempts before giving up */ 
+       /* choose attempts before giving up */
        __u32 choose_total_tries;
        /* attempt chooseleaf inner descent once for firstn mode; on
         * reject retry outer descent.  Note that this does *not*
@@ -187,6 +204,25 @@ struct crush_map {
         * that want to limit reshuffling, a value of 3 or 4 will make the
         * mappings line up a bit better with previous mappings. */
        __u8 chooseleaf_vary_r;
+
+#ifndef __KERNEL__
+       /*
+        * version 0 (original) of straw_calc has various flaws.  version 1
+        * fixes a few of them.
+        */
+       __u8 straw_calc_version;
+
+       /*
+        * allowed bucket algs is a bitmask, here the bit positions
+        * are CRUSH_BUCKET_*.  note that these are *bits* and
+        * CRUSH_BUCKET_* values are not, so we need to or together (1
+        * << CRUSH_BUCKET_WHATEVER).  The 0th bit is not used to
+        * minimize confusion (bucket type values start at 1).
+        */
+       __u32 allowed_bucket_algs;
+
+       __u32 *choose_tries;
+#endif
 };
 
 
index 91e884230d5db99cbfacc478bf8acbe2d4533908..d1d90258242eef2965eb3e1e1e399132e85c8c34 100644 (file)
@@ -1,6 +1,12 @@
 #ifndef CEPH_CRUSH_HASH_H
 #define CEPH_CRUSH_HASH_H
 
+#ifdef __KERNEL__
+# include <linux/types.h>
+#else
+# include "crush_compat.h"
+#endif
+
 #define CRUSH_HASH_RJENKINS1   0
 
 #define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
index eab367446eea7fa683cb4fd15e74ad3822bb35c8..5dfd5b1125d2b257a4a00d1e77661613ca2227ec 100644 (file)
@@ -8,7 +8,7 @@
  * LGPL2
  */
 
-#include <linux/crush/crush.h>
+#include "crush.h"
 
 extern int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size);
 extern int crush_do_rule(const struct crush_map *map,
index 00ac57c26615103b0983040981cac8012c9e2f99..5a31bf3a40243e1faff078fdc509361568143f01 100644 (file)
@@ -1300,4 +1300,26 @@ static void __exit __driver##_exit(void) \
 } \
 module_exit(__driver##_exit);
 
+/**
+ * builtin_driver() - Helper macro for drivers that don't do anything
+ * special in init and have no exit. This eliminates some boilerplate.
+ * Each driver may only use this macro once, and calling it replaces
+ * device_initcall (or in some cases, the legacy __initcall).  This is
+ * meant to be a direct parallel of module_driver() above but without
+ * the __exit stuff that is not used for builtin cases.
+ *
+ * @__driver: driver name
+ * @__register: register function for this driver type
+ * @...: Additional arguments to be passed to __register
+ *
+ * Use this macro to construct bus specific macros for registering
+ * drivers, and do not use it on its own.
+ */
+#define builtin_driver(__driver, __register, ...) \
+static int __init __driver##_init(void) \
+{ \
+       return __register(&(__driver) , ##__VA_ARGS__); \
+} \
+device_initcall(__driver##_init);
+
 #endif /* _DEVICE_H_ */
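The kernel-doc above says builtin_driver() is meant to be wrapped by bus-specific helpers rather than used directly. That is exactly what the include/linux/platform_device.h hunk later in this diff does for the platform bus; a hypothetical bus "foo" would wrap it the same way:

        /* Hypothetical bus-specific wrapper ("foo" is a made-up bus name),
         * mirroring builtin_platform_driver() added later in this diff: */
        #define builtin_foo_driver(__foo_driver) \
                builtin_driver(__foo_driver, foo_driver_register)

Drivers on that bus can then drop their hand-rolled "__init function + device_initcall()" boilerplate in favour of a single builtin_foo_driver(...) line.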
index 21b6d768edd7a4e0f1aee98ed9f437000fa1b996..7c68c36d3fd88788f043447c5bed889cdd408bb6 100644 (file)
 
 #define __exit          __section(.exit.text) __exitused __cold notrace
 
-/* temporary, until all users are removed */
-#define __cpuinit
-#define __cpuinitdata
-#define __cpuinitconst
-#define __cpuexit
-#define __cpuexitdata
-#define __cpuexitconst
-
 /* Used for MEMORY_HOTPLUG */
 #define __meminit        __section(.meminit.text) __cold notrace
 #define __meminitdata    __section(.meminit.data)
 #define __INITRODATA   .section        ".init.rodata","a",%progbits
 #define __FINITDATA    .previous
 
-/* temporary, until all users are removed */
-#define __CPUINIT
-
 #define __MEMINIT        .section      ".meminit.text", "ax"
 #define __MEMINITDATA    .section      ".meminit.data", "aw"
 #define __MEMINITRODATA  .section      ".meminit.rodata", "a"
index 32201c269890433817f1fac83026d95fa760b996..b8e72aad919cfc72ea6710ac3786dc35bbe47201 100644 (file)
@@ -500,6 +500,7 @@ enum {
        NFSPROC4_CLNT_SEEK,
        NFSPROC4_CLNT_ALLOCATE,
        NFSPROC4_CLNT_DEALLOCATE,
+       NFSPROC4_CLNT_LAYOUTSTATS,
 };
 
 /* nfs41 types */
index b95f914ce083891325b6b8defa3f9995851cf76b..f91b5ade30c98fe8b03d06bc40deb0c434edb633 100644 (file)
@@ -219,6 +219,7 @@ struct nfs_inode {
 #define NFS_INO_COMMIT         (7)             /* inode is committing unstable writes */
 #define NFS_INO_LAYOUTCOMMIT   (9)             /* layoutcommit required */
 #define NFS_INO_LAYOUTCOMMITTING (10)          /* layoutcommit inflight */
+#define NFS_INO_LAYOUTSTATS    (11)            /* layoutstats inflight */
 
 static inline struct nfs_inode *NFS_I(const struct inode *inode)
 {
index 5e1273d4de14064198489a7aaccb73a88f36e7b8..a2ea1491d3dfc487611445490fb10adf9972d777 100644 (file)
@@ -237,5 +237,6 @@ struct nfs_server {
 #define NFS_CAP_SEEK           (1U << 19)
 #define NFS_CAP_ALLOCATE       (1U << 20)
 #define NFS_CAP_DEALLOCATE     (1U << 21)
+#define NFS_CAP_LAYOUTSTATS    (1U << 22)
 
 #endif
index 3eb072dbce833dd268b0189b9ff98a2aa85cc7bc..f2f650f136ee6fe181dfa27b71d7b944fc1f9357 100644 (file)
@@ -67,7 +67,6 @@ struct nfs_rw_ops {
        const fmode_t rw_mode;
        struct nfs_pgio_header *(*rw_alloc_header)(void);
        void (*rw_free_header)(struct nfs_pgio_header *);
-       void (*rw_release)(struct nfs_pgio_header *);
        int  (*rw_done)(struct rpc_task *, struct nfs_pgio_header *,
                        struct inode *);
        void (*rw_result)(struct rpc_task *, struct nfs_pgio_header *);
index 93ab6071bbe967b56ea44c56111157f8a9135e3f..7bbe50504211d65cc096baa8bc6d45e2e9449125 100644 (file)
@@ -316,6 +316,49 @@ struct nfs4_layoutreturn {
        int rpc_status;
 };
 
+#define PNFS_LAYOUTSTATS_MAXSIZE 256
+
+struct nfs42_layoutstat_args;
+struct nfs42_layoutstat_devinfo;
+typedef        void (*layoutstats_encode_t)(struct xdr_stream *,
+               struct nfs42_layoutstat_args *,
+               struct nfs42_layoutstat_devinfo *);
+
+/* Per file per deviceid layoutstats */
+struct nfs42_layoutstat_devinfo {
+       struct nfs4_deviceid dev_id;
+       __u64 offset;
+       __u64 length;
+       __u64 read_count;
+       __u64 read_bytes;
+       __u64 write_count;
+       __u64 write_bytes;
+       __u32 layout_type;
+       layoutstats_encode_t layoutstats_encode;
+       void *layout_private;
+};
+
+struct nfs42_layoutstat_args {
+       struct nfs4_sequence_args seq_args;
+       struct nfs_fh *fh;
+       struct inode *inode;
+       nfs4_stateid stateid;
+       int num_dev;
+       struct nfs42_layoutstat_devinfo *devinfo;
+};
+
+struct nfs42_layoutstat_res {
+       struct nfs4_sequence_res seq_res;
+       int num_dev;
+       int rpc_status;
+};
+
+struct nfs42_layoutstat_data {
+       struct inode *inode;
+       struct nfs42_layoutstat_args args;
+       struct nfs42_layoutstat_res res;
+};
+
 struct stateowner_id {
        __u64   create_time;
        __u32   uniquifier;
@@ -984,17 +1027,14 @@ struct nfs4_readlink_res {
        struct nfs4_sequence_res        seq_res;
 };
 
-#define NFS4_SETCLIENTID_NAMELEN       (127)
 struct nfs4_setclientid {
        const nfs4_verifier *           sc_verifier;
-       unsigned int                    sc_name_len;
-       char                            sc_name[NFS4_SETCLIENTID_NAMELEN + 1];
        u32                             sc_prog;
        unsigned int                    sc_netid_len;
        char                            sc_netid[RPCBIND_MAXNETIDLEN + 1];
        unsigned int                    sc_uaddr_len;
        char                            sc_uaddr[RPCBIND_MAXUADDRLEN + 1];
-       u32                             sc_cb_ident;
+       struct nfs_client               *sc_clnt;
        struct rpc_cred                 *sc_cred;
 };
 
@@ -1142,12 +1182,9 @@ struct nfs41_state_protection {
        struct nfs4_op_map allow;
 };
 
-#define NFS4_EXCHANGE_ID_LEN   (48)
 struct nfs41_exchange_id_args {
        struct nfs_client               *client;
        nfs4_verifier                   *verifier;
-       unsigned int                    id_len;
-       char                            id[NFS4_EXCHANGE_ID_LEN];
        u32                             flags;
        struct nfs41_state_protection   state_protect;
 };
index 58f1e75ba105ca9096f8fbf1d60b808e08087c03..bba08f44cc97da8f881fba53a9d20c4e80ee2969 100644 (file)
@@ -222,6 +222,15 @@ static inline void platform_set_drvdata(struct platform_device *pdev,
        module_driver(__platform_driver, platform_driver_register, \
                        platform_driver_unregister)
 
+/* builtin_platform_driver() - Helper macro for builtin drivers that
+ * don't do anything special in driver init.  This eliminates some
+ * boilerplate.  Each driver may only use this macro once, and
+ * calling it replaces device_initcall().  Note this is meant to be
+ * a parallel of module_platform_driver() above, but w/o _exit stuff.
+ */
+#define builtin_platform_driver(__platform_driver) \
+       builtin_driver(__platform_driver, platform_driver_register)
+
 /* module_platform_driver_probe() - Helper macro for drivers that don't do
  * anything special in module init/exit.  This eliminates a lot of
  * boilerplate.  Each module may only use this macro once, and
@@ -240,6 +249,20 @@ static void __exit __platform_driver##_exit(void) \
 } \
 module_exit(__platform_driver##_exit);
 
+/* builtin_platform_driver_probe() - Helper macro for drivers that don't do
+ * anything special in device init.  This eliminates some boilerplate.  Each
+ * driver may only use this macro once, and using it replaces device_initcall.
+ * This is meant to be a parallel of module_platform_driver_probe above, but
+ * without the __exit parts.
+ */
+#define builtin_platform_driver_probe(__platform_driver, __platform_probe) \
+static int __init __platform_driver##_init(void) \
+{ \
+       return platform_driver_probe(&(__platform_driver), \
+                                    __platform_probe);    \
+} \
+device_initcall(__platform_driver##_init); \
+
 #define platform_create_bundle(driver, probe, res, n_res, data, size) \
        __platform_create_bundle(driver, probe, res, n_res, data, size, THIS_MODULE)
 extern struct platform_device *__platform_create_bundle(
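For completeness, this is how a builtin-only platform driver would use the new macro (names are illustrative, not from the patch; the one-liner replaces the old __init/device_initcall() pair):

        static int foo_probe(struct platform_device *pdev)
        {
                return 0;       /* real probing would go here */
        }

        static struct platform_driver foo_driver = {
                .probe  = foo_probe,
                .driver = {
                        .name = "foo",
                },
        };
        builtin_platform_driver(foo_driver);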
index 2ca67b55e0fe2f0abf7e432ee8402608b6d57201..8df43c9f11dc295889639364101f324150d769ff 100644 (file)
@@ -37,7 +37,6 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied);
 void xprt_free_bc_request(struct rpc_rqst *req);
 int xprt_setup_backchannel(struct rpc_xprt *, unsigned int min_reqs);
 void xprt_destroy_backchannel(struct rpc_xprt *, unsigned int max_reqs);
-int bc_send(struct rpc_rqst *req);
 
 /*
  * Determine if a shared backchannel is in use
index 598ba80ec30c974f02477a216a21077213a25aa5..131032f15cc187e0c7cbd0df80a08f4c44220800 100644 (file)
@@ -56,6 +56,7 @@ struct rpc_clnt {
        struct rpc_rtt *        cl_rtt;         /* RTO estimator data */
        const struct rpc_timeout *cl_timeout;   /* Timeout strategy */
 
+       atomic_t                cl_swapper;     /* swapfile count */
        int                     cl_nodelen;     /* nodename length */
        char                    cl_nodename[UNX_MAXNODENAME+1];
        struct rpc_pipe_dir_head cl_pipedir_objects;
index 5f1e6bd4c316d143751d19aac15af77aa7c1ac21..d703f0ef37d8f87436310247c19ca37f60ea692b 100644 (file)
@@ -205,8 +205,7 @@ struct rpc_wait_queue {
  */
 struct rpc_task *rpc_new_task(const struct rpc_task_setup *);
 struct rpc_task *rpc_run_task(const struct rpc_task_setup *);
-struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req,
-                               const struct rpc_call_ops *ops);
+struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req);
 void           rpc_put_task(struct rpc_task *);
 void           rpc_put_task_async(struct rpc_task *);
 void           rpc_exit_task(struct rpc_task *);
@@ -269,4 +268,20 @@ static inline void rpc_assign_waitqueue_name(struct rpc_wait_queue *q,
 }
 #endif
 
+#if IS_ENABLED(CONFIG_SUNRPC_SWAP)
+int rpc_clnt_swap_activate(struct rpc_clnt *clnt);
+void rpc_clnt_swap_deactivate(struct rpc_clnt *clnt);
+#else
+static inline int
+rpc_clnt_swap_activate(struct rpc_clnt *clnt)
+{
+       return -EINVAL;
+}
+
+static inline void
+rpc_clnt_swap_deactivate(struct rpc_clnt *clnt)
+{
+}
+#endif /* CONFIG_SUNRPC_SWAP */
+
 #endif /* _LINUX_SUNRPC_SCHED_H_ */
index 8b93ef53df3c95df08625bef6ba58341d583e1e0..0fb9acbb478095b576445d74d264ad5e5193a55e 100644 (file)
@@ -133,6 +133,9 @@ struct rpc_xprt_ops {
        void            (*close)(struct rpc_xprt *xprt);
        void            (*destroy)(struct rpc_xprt *xprt);
        void            (*print_stats)(struct rpc_xprt *xprt, struct seq_file *seq);
+       int             (*enable_swap)(struct rpc_xprt *xprt);
+       void            (*disable_swap)(struct rpc_xprt *xprt);
+       void            (*inject_disconnect)(struct rpc_xprt *xprt);
 };
 
 /*
@@ -180,7 +183,7 @@ struct rpc_xprt {
        atomic_t                num_reqs;       /* total slots */
        unsigned long           state;          /* transport state */
        unsigned char           resvport   : 1; /* use a reserved port */
-       unsigned int            swapper;        /* we're swapping over this
+       atomic_t                swapper;        /* we're swapping over this
                                                   transport */
        unsigned int            bind_index;     /* bind function index */
 
@@ -212,7 +215,8 @@ struct rpc_xprt {
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
        struct svc_serv         *bc_serv;       /* The RPC service which will */
                                                /* process the callback */
-       unsigned int            bc_alloc_count; /* Total number of preallocs */
+       int                     bc_alloc_count; /* Total number of preallocs */
+       atomic_t                bc_free_slots;
        spinlock_t              bc_pa_lock;     /* Protects the preallocated
                                                 * items */
        struct list_head        bc_pa_list;     /* List of preallocated
@@ -241,6 +245,7 @@ struct rpc_xprt {
        const char              *address_strings[RPC_DISPLAY_MAX];
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
        struct dentry           *debugfs;               /* debugfs directory */
+       atomic_t                inject_disconnect;
 #endif
 };
 
@@ -327,6 +332,18 @@ static inline __be32 *xprt_skip_transport_header(struct rpc_xprt *xprt, __be32 *
        return p + xprt->tsh_size;
 }
 
+static inline int
+xprt_enable_swap(struct rpc_xprt *xprt)
+{
+       return xprt->ops->enable_swap(xprt);
+}
+
+static inline void
+xprt_disable_swap(struct rpc_xprt *xprt)
+{
+       xprt->ops->disable_swap(xprt);
+}
+
 /*
  * Transport switch helper functions
  */
@@ -345,7 +362,6 @@ void                        xprt_release_rqst_cong(struct rpc_task *task);
 void                   xprt_disconnect_done(struct rpc_xprt *xprt);
 void                   xprt_force_disconnect(struct rpc_xprt *xprt);
 void                   xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie);
-int                    xs_swapper(struct rpc_xprt *xprt, int enable);
 
 bool                   xprt_lock_connect(struct rpc_xprt *, struct rpc_task *, void *);
 void                   xprt_unlock_connect(struct rpc_xprt *, void *);
@@ -431,6 +447,23 @@ static inline int xprt_test_and_set_binding(struct rpc_xprt *xprt)
        return test_and_set_bit(XPRT_BINDING, &xprt->state);
 }
 
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+extern unsigned int rpc_inject_disconnect;
+static inline void xprt_inject_disconnect(struct rpc_xprt *xprt)
+{
+       if (!rpc_inject_disconnect)
+               return;
+       if (atomic_dec_return(&xprt->inject_disconnect))
+               return;
+       atomic_set(&xprt->inject_disconnect, rpc_inject_disconnect);
+       xprt->ops->inject_disconnect(xprt);
+}
+#else
+static inline void xprt_inject_disconnect(struct rpc_xprt *xprt)
+{
+}
+#endif
+
 #endif /* __KERNEL__*/
 
 #endif /* _LINUX_SUNRPC_XPRT_H */
index c984c85981eae2881ebdac43ffdbfba9b55b70b4..b17613052cc3fd9d8827ede1944d3489bdd2285d 100644 (file)
@@ -56,7 +56,8 @@
 
 #define RPCRDMA_INLINE_PAD_THRESH  (512)/* payload threshold to pad (bytes) */
 
-/* memory registration strategies */
+/* Memory registration strategies, by number.
+ * This is part of a kernel / user space API. Do not remove. */
 enum rpcrdma_memreg {
        RPCRDMA_BOUNCEBUFFERS = 0,
        RPCRDMA_REGISTER,
index d3f4832db289b5da2787427360e7b933677d9fd5..b6fce900a8334613f45f169a0c28802327222f56 100644 (file)
@@ -313,6 +313,9 @@ struct drm_amdgpu_gem_op {
 #define AMDGPU_VA_OP_MAP                       1
 #define AMDGPU_VA_OP_UNMAP                     2
 
+/* Delay the page table update till the next CS */
+#define AMDGPU_VM_DELAY_UPDATE         (1 << 0)
+
 /* Mapping flags */
 /* readable mapping */
 #define AMDGPU_VM_PAGE_READABLE                (1 << 1)
@@ -348,6 +351,7 @@ struct drm_amdgpu_gem_va {
 
 #define AMDGPU_CHUNK_ID_IB             0x01
 #define AMDGPU_CHUNK_ID_FENCE          0x02
+#define AMDGPU_CHUNK_ID_DEPENDENCIES   0x03
 
 struct drm_amdgpu_cs_chunk {
        uint32_t                chunk_id;
@@ -399,6 +403,14 @@ struct drm_amdgpu_cs_chunk_ib {
        uint32_t ring;
 };
 
+struct drm_amdgpu_cs_chunk_dep {
+       uint32_t ip_type;
+       uint32_t ip_instance;
+       uint32_t ring;
+       uint32_t ctx_id;
+       uint64_t handle;
+};
+
 struct drm_amdgpu_cs_chunk_fence {
        uint32_t handle;
        uint32_t offset;
index 25084a052a1eff964d19a9683d5d6470590e4c7e..c9aca042e61d197927409531af4f8cdd88218adc 100644 (file)
@@ -755,4 +755,7 @@ struct fuse_notify_retrieve_in {
        uint64_t        dummy4;
 };
 
+/* Device ioctls: */
+#define FUSE_DEV_IOC_CLONE     _IOR(229, 0, uint32_t)
+
 #endif /* _LINUX_FUSE_H */
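FUSE_DEV_IOC_CLONE is a new device ioctl; judging from the _IOR(229, 0, uint32_t) definition above, user space passes the fd of an already-mounted /dev/fuse session and has the connection cloned onto a second fd. A hedged userspace sketch of that usage (assumed, not taken from the patch):

        #include <fcntl.h>
        #include <stdint.h>
        #include <sys/ioctl.h>
        #include <unistd.h>
        #include <linux/fuse.h>

        /* Assumed usage: clone an existing session so another thread can
         * service requests from its own fd. */
        int fuse_clone_fd(int session_fd)
        {
                int clonefd = open("/dev/fuse", O_RDWR | O_CLOEXEC);
                uint32_t src = session_fd;

                if (clonefd < 0)
                        return -1;
                if (ioctl(clonefd, FUSE_DEV_IOC_CLONE, &src) < 0) {
                        close(clonefd);
                        return -1;
                }
                return clonefd;
        }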
index b29015102698b393e0fdb84e2f7e6170d1e2a0a9..3fe401067e20ba81c762fb00ae00730de6b9502f 100644 (file)
@@ -289,5 +289,5 @@ static int __init list_sort_test(void)
        kfree(elts);
        return err;
 }
-module_init(list_sort_test);
+late_initcall(list_sort_test);
 #endif /* CONFIG_TEST_LIST_SORT */
index 05e7447d960b0628d9dea7ea22a02a0f15488b9d..58ea3643b9e9968a723f498d7df55b34c51a179e 100644 (file)
@@ -2085,7 +2085,7 @@ static int __meminit init_user_reserve(void)
        sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
        return 0;
 }
-module_init(init_user_reserve)
+subsys_initcall(init_user_reserve);
 
 /*
  * Initialise sysctl_admin_reserve_kbytes.
@@ -2106,4 +2106,4 @@ static int __meminit init_admin_reserve(void)
        sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
        return 0;
 }
-module_init(init_admin_reserve)
+subsys_initcall(init_admin_reserve);
index 0993f5f36b011ec567e3b559f50bb4dafde90479..bd5f842b56d26aca3cf24225884d5a2eacd1580c 100644 (file)
@@ -310,4 +310,4 @@ static int __init pageowner_init(void)
 
        return 0;
 }
-module_init(pageowner_init)
+late_initcall(pageowner_init)
index 79e8f71aef5be312ab7aa547293fd047cf42e97c..cb7db320dd276aa0107d8e484270e3e8fb61dd6a 100644 (file)
@@ -352,8 +352,8 @@ ceph_parse_options(char *options, const char *dev_name,
        /* start with defaults */
        opt->flags = CEPH_OPT_DEFAULT;
        opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
-       opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
-       opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;   /* seconds */
+       opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
+       opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
 
        /* get mon ip(s) */
        /* ip1[:port1][,ip2[:port2]...] */
@@ -439,13 +439,32 @@ ceph_parse_options(char *options, const char *dev_name,
                        pr_warn("ignoring deprecated osdtimeout option\n");
                        break;
                case Opt_osdkeepalivetimeout:
-                       opt->osd_keepalive_timeout = intval;
+                       /* 0 isn't well defined right now, reject it */
+                       if (intval < 1 || intval > INT_MAX / 1000) {
+                               pr_err("osdkeepalive out of range\n");
+                               err = -EINVAL;
+                               goto out;
+                       }
+                       opt->osd_keepalive_timeout =
+                                       msecs_to_jiffies(intval * 1000);
                        break;
                case Opt_osd_idle_ttl:
-                       opt->osd_idle_ttl = intval;
+                       /* 0 isn't well defined right now, reject it */
+                       if (intval < 1 || intval > INT_MAX / 1000) {
+                               pr_err("osd_idle_ttl out of range\n");
+                               err = -EINVAL;
+                               goto out;
+                       }
+                       opt->osd_idle_ttl = msecs_to_jiffies(intval * 1000);
                        break;
                case Opt_mount_timeout:
-                       opt->mount_timeout = intval;
+                       /* 0 is "wait forever" (i.e. infinite timeout) */
+                       if (intval < 0 || intval > INT_MAX / 1000) {
+                               pr_err("mount_timeout out of range\n");
+                               err = -EINVAL;
+                               goto out;
+                       }
+                       opt->mount_timeout = msecs_to_jiffies(intval * 1000);
                        break;
 
                case Opt_share:
@@ -512,12 +531,14 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
                seq_puts(m, "notcp_nodelay,");
 
        if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
-               seq_printf(m, "mount_timeout=%d,", opt->mount_timeout);
+               seq_printf(m, "mount_timeout=%d,",
+                          jiffies_to_msecs(opt->mount_timeout) / 1000);
        if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
-               seq_printf(m, "osd_idle_ttl=%d,", opt->osd_idle_ttl);
+               seq_printf(m, "osd_idle_ttl=%d,",
+                          jiffies_to_msecs(opt->osd_idle_ttl) / 1000);
        if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
                seq_printf(m, "osdkeepalivetimeout=%d,",
-                          opt->osd_keepalive_timeout);
+                   jiffies_to_msecs(opt->osd_keepalive_timeout) / 1000);
 
        /* drop redundant comma */
        if (m->count != pos)
@@ -626,8 +647,8 @@ static int have_mon_and_osd_map(struct ceph_client *client)
  */
 int __ceph_open_session(struct ceph_client *client, unsigned long started)
 {
-       int err;
-       unsigned long timeout = client->options->mount_timeout * HZ;
+       unsigned long timeout = client->options->mount_timeout;
+       long err;
 
        /* open session, and wait for mon and osd maps */
        err = ceph_monc_open_session(&client->monc);
@@ -635,16 +656,15 @@ int __ceph_open_session(struct ceph_client *client, unsigned long started)
                return err;
 
        while (!have_mon_and_osd_map(client)) {
-               err = -EIO;
                if (timeout && time_after_eq(jiffies, started + timeout))
-                       return err;
+                       return -ETIMEDOUT;
 
                /* wait */
                dout("mount waiting for mon_map\n");
                err = wait_event_interruptible_timeout(client->auth_wq,
                        have_mon_and_osd_map(client) || (client->auth_err < 0),
-                       timeout);
-               if (err == -EINTR || err == -ERESTARTSYS)
+                       ceph_timeout_jiffies(timeout));
+               if (err < 0)
                        return err;
                if (client->auth_err < 0)
                        return client->auth_err;
@@ -721,5 +741,5 @@ module_exit(exit_ceph_lib);
 MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
 MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
 MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
-MODULE_DESCRIPTION("Ceph filesystem for Linux");
+MODULE_DESCRIPTION("Ceph core library");
 MODULE_LICENSE("GPL");
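A note on the range checks added in the option parsing above: the values are given in seconds but converted through milliseconds, so intval * 1000 has to stay within a 32-bit int before msecs_to_jiffies() sees it. The "intval > INT_MAX / 1000" guard is exact:

        INT_MAX / 1000     = 2,147,483
        2,147,483 * 1000   = 2,147,483,000  <= INT_MAX (2,147,483,647)

so the largest accepted value is about 2.1 million seconds (roughly 24.8 days), while 0 is either rejected (osdkeepalive, osd_idle_ttl) or kept to mean "wait forever" (mount_timeout).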
index 9d84ce4ea0dfa8928cc498b9f0515735666b7857..80d7c3a97cb84355e82e9d8f4c83fbf5b0d82893 100644 (file)
@@ -1,15 +1,11 @@
-
 #ifdef __KERNEL__
 # include <linux/slab.h>
+# include <linux/crush/crush.h>
 #else
-# include <stdlib.h>
-# include <assert.h>
-# define kfree(x) do { if (x) free(x); } while (0)
-# define BUG_ON(x) assert(!(x))
+# include "crush_compat.h"
+# include "crush.h"
 #endif
 
-#include <linux/crush/crush.h>
-
 const char *crush_bucket_alg_name(int alg)
 {
        switch (alg) {
@@ -134,6 +130,9 @@ void crush_destroy(struct crush_map *map)
                kfree(map->rules);
        }
 
+#ifndef __KERNEL__
+       kfree(map->choose_tries);
+#endif
        kfree(map);
 }
 
index 6192c7fc958ce84f6dc19b4005cf3580add5f88a..aae534c901a43169d73a8dc8c043dab47cc69941 100644 (file)
  *
  */
 
-#if defined(__linux__)
-#include <linux/types.h>
-#elif defined(__FreeBSD__)
-#include <sys/types.h>
-#endif
-
 #ifndef CEPH_CRUSH_LN_H
 #define CEPH_CRUSH_LN_H
 
+#ifdef __KERNEL__
+# include <linux/types.h>
+#else
+# include "crush_compat.h"
+#endif
 
-// RH_LH_tbl[2*k] = 2^48/(1.0+k/128.0)
-// RH_LH_tbl[2*k+1] = 2^48*log2(1.0+k/128.0)
-
-static int64_t __RH_LH_tbl[128*2+2] = {
+/*
+ * RH_LH_tbl[2*k] = 2^48/(1.0+k/128.0)
+ * RH_LH_tbl[2*k+1] = 2^48*log2(1.0+k/128.0)
+ */
+static __s64 __RH_LH_tbl[128*2+2] = {
   0x0001000000000000ll, 0x0000000000000000ll, 0x0000fe03f80fe040ll, 0x000002dfca16dde1ll,
   0x0000fc0fc0fc0fc1ll, 0x000005b9e5a170b4ll, 0x0000fa232cf25214ll, 0x0000088e68ea899all,
   0x0000f83e0f83e0f9ll, 0x00000b5d69bac77ell, 0x0000f6603d980f67ll, 0x00000e26fd5c8555ll,
@@ -89,11 +89,12 @@ static int64_t __RH_LH_tbl[128*2+2] = {
   0x0000820820820821ll, 0x0000fa2f045e7832ll, 0x000081848da8faf1ll, 0x0000fba577877d7dll,
   0x0000810204081021ll, 0x0000fd1a708bbe11ll, 0x0000808080808081ll, 0x0000fe8df263f957ll,
   0x0000800000000000ll, 0x0000ffff00000000ll,
-  };
-
+};
 
-    // LL_tbl[k] = 2^48*log2(1.0+k/2^15);
-static int64_t __LL_tbl[256] = {
+/*
+ * LL_tbl[k] = 2^48*log2(1.0+k/2^15)
+ */
+static __s64 __LL_tbl[256] = {
   0x0000000000000000ull, 0x00000002e2a60a00ull, 0x000000070cb64ec5ull, 0x00000009ef50ce67ull,
   0x0000000cd1e588fdull, 0x0000000fb4747e9cull, 0x0000001296fdaf5eull, 0x0000001579811b58ull,
   0x000000185bfec2a1ull, 0x0000001b3e76a552ull, 0x0000001e20e8c380ull, 0x0000002103551d43ull,
@@ -160,7 +161,4 @@ static int64_t __LL_tbl[256] = {
   0x000002d4562d2ec6ull, 0x000002d73330209dull, 0x000002da102d63b0ull, 0x000002dced24f814ull,
 };
 
-
-
-
 #endif
index 5bb63e37a8a10f3419a399b32339cf5baf279762..ed123af49eba563a004d5b69e8b08c648480cb59 100644 (file)
@@ -1,6 +1,8 @@
-
-#include <linux/types.h>
-#include <linux/crush/hash.h>
+#ifdef __KERNEL__
+# include <linux/crush/hash.h>
+#else
+# include "hash.h"
+#endif
 
 /*
  * Robert Jenkins' function for mixing 32-bit values
index 5b47736d27d94584bf33b2e60af56b5e76fc141c..393bfb22d5bbafd83c20f5953b98c6709b637452 100644 (file)
@@ -1,27 +1,31 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Intel Corporation All Rights Reserved
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
 
 #ifdef __KERNEL__
 # include <linux/string.h>
 # include <linux/slab.h>
 # include <linux/bug.h>
 # include <linux/kernel.h>
-# ifndef dprintk
-#  define dprintk(args...)
-# endif
+# include <linux/crush/crush.h>
+# include <linux/crush/hash.h>
 #else
-# include <string.h>
-# include <stdio.h>
-# include <stdlib.h>
-# include <assert.h>
-# define BUG_ON(x) assert(!(x))
-# define dprintk(args...) /* printf(args) */
-# define kmalloc(x, f) malloc(x)
-# define kfree(x) free(x)
+# include "crush_compat.h"
+# include "crush.h"
+# include "hash.h"
 #endif
-
-#include <linux/crush/crush.h>
-#include <linux/crush/hash.h>
 #include "crush_ln_table.h"
 
+#define dprintk(args...) /* printf(args) */
+
 /*
  * Implement the core CRUSH mapping algorithm.
  */
@@ -139,7 +143,7 @@ static int bucket_list_choose(struct crush_bucket_list *bucket,
        int i;
 
        for (i = bucket->h.size-1; i >= 0; i--) {
-               __u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i],
+               __u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i],
                                         r, bucket->h.id);
                w &= 0xffff;
                dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
@@ -238,43 +242,46 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket,
        return bucket->h.items[high];
 }
 
-// compute 2^44*log2(input+1)
-uint64_t crush_ln(unsigned xin)
+/* compute 2^44*log2(input+1) */
+static __u64 crush_ln(unsigned int xin)
 {
-    unsigned x=xin, x1;
-    int iexpon, index1, index2;
-    uint64_t RH, LH, LL, xl64, result;
+       unsigned int x = xin, x1;
+       int iexpon, index1, index2;
+       __u64 RH, LH, LL, xl64, result;
 
-    x++;
+       x++;
 
-    // normalize input
-    iexpon = 15;
-    while(!(x&0x18000)) { x<<=1; iexpon--; }
+       /* normalize input */
+       iexpon = 15;
+       while (!(x & 0x18000)) {
+               x <<= 1;
+               iexpon--;
+       }
 
-    index1 = (x>>8)<<1;
-    // RH ~ 2^56/index1
-    RH = __RH_LH_tbl[index1 - 256];
-    // LH ~ 2^48 * log2(index1/256)
-    LH = __RH_LH_tbl[index1 + 1 - 256];
+       index1 = (x >> 8) << 1;
+       /* RH ~ 2^56/index1 */
+       RH = __RH_LH_tbl[index1 - 256];
+       /* LH ~ 2^48 * log2(index1/256) */
+       LH = __RH_LH_tbl[index1 + 1 - 256];
 
-    // RH*x ~ 2^48 * (2^15 + xf), xf<2^8
-    xl64 = (int64_t)x * RH;
-    xl64 >>= 48;
-    x1 = xl64;
+       /* RH*x ~ 2^48 * (2^15 + xf), xf<2^8 */
+       xl64 = (__s64)x * RH;
+       xl64 >>= 48;
+       x1 = xl64;
 
-    result = iexpon;
-    result <<= (12 + 32);
+       result = iexpon;
+       result <<= (12 + 32);
 
-    index2 = x1 & 0xff;
-    // LL ~ 2^48*log2(1.0+index2/2^15)
-    LL = __LL_tbl[index2];
+       index2 = x1 & 0xff;
+       /* LL ~ 2^48*log2(1.0+index2/2^15) */
+       LL = __LL_tbl[index2];
 
-    LH = LH + LL;
+       LH = LH + LL;
 
-    LH >>= (48-12 - 32);
-    result += LH;
+       LH >>= (48 - 12 - 32);
+       result += LH;
 
-    return result;
+       return result;
 }
 
 
@@ -290,9 +297,9 @@ uint64_t crush_ln(unsigned xin)
 static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket,
                                int x, int r)
 {
-       unsigned i, high = 0;
-       unsigned u;
-       unsigned w;
+       unsigned int i, high = 0;
+       unsigned int u;
+       unsigned int w;
        __s64 ln, draw, high_draw = 0;
 
        for (i = 0; i < bucket->h.size; i++) {
@@ -567,6 +574,10 @@ static int crush_choose_firstn(const struct crush_map *map,
                out[outpos] = item;
                outpos++;
                count--;
+#ifndef __KERNEL__
+               if (map->choose_tries && ftotal <= map->choose_total_tries)
+                       map->choose_tries[ftotal]++;
+#endif
        }
 
        dprintk("CHOOSE returns %d\n", outpos);
@@ -610,6 +621,20 @@ static void crush_choose_indep(const struct crush_map *map,
        }
 
        for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) {
+#ifdef DEBUG_INDEP
+               if (out2 && ftotal) {
+                       dprintk("%u %d a: ", ftotal, left);
+                       for (rep = outpos; rep < endpos; rep++) {
+                               dprintk(" %d", out[rep]);
+                       }
+                       dprintk("\n");
+                       dprintk("%u %d b: ", ftotal, left);
+                       for (rep = outpos; rep < endpos; rep++) {
+                               dprintk(" %d", out2[rep]);
+                       }
+                       dprintk("\n");
+               }
+#endif
                for (rep = outpos; rep < endpos; rep++) {
                        if (out[rep] != CRUSH_ITEM_UNDEF)
                                continue;
@@ -726,6 +751,24 @@ static void crush_choose_indep(const struct crush_map *map,
                        out2[rep] = CRUSH_ITEM_NONE;
                }
        }
+#ifndef __KERNEL__
+       if (map->choose_tries && ftotal <= map->choose_total_tries)
+               map->choose_tries[ftotal]++;
+#endif
+#ifdef DEBUG_INDEP
+       if (out2) {
+               dprintk("%u %d a: ", ftotal, left);
+               for (rep = outpos; rep < endpos; rep++) {
+                       dprintk(" %d", out[rep]);
+               }
+               dprintk("\n");
+               dprintk("%u %d b: ", ftotal, left);
+               for (rep = outpos; rep < endpos; rep++) {
+                       dprintk(" %d", out2[rep]);
+               }
+               dprintk("\n");
+       }
+#endif
 }
 
 /**
@@ -790,8 +833,15 @@ int crush_do_rule(const struct crush_map *map,
 
                switch (curstep->op) {
                case CRUSH_RULE_TAKE:
-                       w[0] = curstep->arg1;
-                       wsize = 1;
+                       if ((curstep->arg1 >= 0 &&
+                            curstep->arg1 < map->max_devices) ||
+                           (-1-curstep->arg1 < map->max_buckets &&
+                            map->buckets[-1-curstep->arg1])) {
+                               w[0] = curstep->arg1;
+                               wsize = 1;
+                       } else {
+                               dprintk(" bad take value %d\n", curstep->arg1);
+                       }
                        break;
 
                case CRUSH_RULE_SET_CHOOSE_TRIES:
@@ -877,7 +927,7 @@ int crush_do_rule(const struct crush_map *map,
                                                0);
                                } else {
                                        out_size = ((numrep < (result_max-osize)) ?
-                                                    numrep : (result_max-osize));
+                                                   numrep : (result_max-osize));
                                        crush_choose_indep(
                                                map,
                                                map->buckets[-1-w[i]],
@@ -923,5 +973,3 @@ int crush_do_rule(const struct crush_map *map,
        }
        return result_len;
 }
-
-
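The restyled crush_ln() above computes a fixed-point base-2 logarithm (scaled by 2^44) using only shifts and two lookup tables. Roughly speaking it leans on nothing more than the product rule for logarithms: the normalization loop pulls out the power-of-two exponent, __RH_LH_tbl supplies log2 of the high-order mantissa term and __LL_tbl the low-order correction, so that schematically, per the table comments,

        log2(x) = e + log2(1 + k/128) + log2(1 + j/2^15)
                  where x ~ 2^e * (1 + k/128) * (1 + j/2^15)

and the table terms (stored at 2^48 scale) are shifted down by four bits to the common 2^44 scale before being added to the exponent term.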
index 073262fea6ddab4acb4cd128ee402ab54b870552..1679f47280e2678202238e667c1b617c7665d5fa 100644 (file)
@@ -278,7 +278,6 @@ static void _ceph_msgr_exit(void)
        ceph_msgr_slab_exit();
 
        BUG_ON(zero_page == NULL);
-       kunmap(zero_page);
        page_cache_release(zero_page);
        zero_page = NULL;
 }
@@ -1545,7 +1544,7 @@ static int write_partial_message_data(struct ceph_connection *con)
                page = ceph_msg_data_next(&msg->cursor, &page_offset, &length,
                                                        &last_piece);
                ret = ceph_tcp_sendpage(con->sock, page, page_offset,
-                                     length, last_piece);
+                                       length, !last_piece);
                if (ret <= 0) {
                        if (do_datacrc)
                                msg->footer.data_crc = cpu_to_le32(crc);
index 2b3cf05e87b0fc44a150f1c314f2db98f1ab7dfc..9d6ff1215928cb69787a85421fdbab9e2a1c8bf5 100644 (file)
@@ -298,21 +298,28 @@ void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
 }
 EXPORT_SYMBOL(ceph_monc_request_next_osdmap);
 
+/*
+ * Wait for an osdmap with a given epoch.
+ *
+ * @epoch: epoch to wait for
+ * @timeout: in jiffies, 0 means "wait forever"
+ */
 int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
                          unsigned long timeout)
 {
        unsigned long started = jiffies;
-       int ret;
+       long ret;
 
        mutex_lock(&monc->mutex);
        while (monc->have_osdmap < epoch) {
                mutex_unlock(&monc->mutex);
 
-               if (timeout != 0 && time_after_eq(jiffies, started + timeout))
+               if (timeout && time_after_eq(jiffies, started + timeout))
                        return -ETIMEDOUT;
 
                ret = wait_event_interruptible_timeout(monc->client->auth_wq,
-                                        monc->have_osdmap >= epoch, timeout);
+                                               monc->have_osdmap >= epoch,
+                                               ceph_timeout_jiffies(timeout));
                if (ret < 0)
                        return ret;
 
index c4ec9239249ae6541a8ee378f95230a42c2f3a3d..50033677c0fa5134d540fba82cc8e298ce1367a0 100644 (file)
@@ -296,6 +296,9 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
        case CEPH_OSD_OP_CMPXATTR:
                ceph_osd_data_release(&op->xattr.osd_data);
                break;
+       case CEPH_OSD_OP_STAT:
+               ceph_osd_data_release(&op->raw_data_in);
+               break;
        default:
                break;
        }
@@ -450,7 +453,7 @@ __CEPH_FORALL_OSD_OPS(GENERATE_CASE)
  */
 static struct ceph_osd_req_op *
 _osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which,
-                               u16 opcode)
+                u16 opcode, u32 flags)
 {
        struct ceph_osd_req_op *op;
 
@@ -460,14 +463,15 @@ _osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which,
        op = &osd_req->r_ops[which];
        memset(op, 0, sizeof (*op));
        op->op = opcode;
+       op->flags = flags;
 
        return op;
 }
 
 void osd_req_op_init(struct ceph_osd_request *osd_req,
-                               unsigned int which, u16 opcode)
+                    unsigned int which, u16 opcode, u32 flags)
 {
-       (void)_osd_req_op_init(osd_req, which, opcode);
+       (void)_osd_req_op_init(osd_req, which, opcode, flags);
 }
 EXPORT_SYMBOL(osd_req_op_init);
 
@@ -476,7 +480,8 @@ void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
                                u64 offset, u64 length,
                                u64 truncate_size, u32 truncate_seq)
 {
-       struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
+       struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
+                                                     opcode, 0);
        size_t payload_len = 0;
 
        BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE &&
@@ -515,7 +520,8 @@ EXPORT_SYMBOL(osd_req_op_extent_update);
 void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
                        u16 opcode, const char *class, const char *method)
 {
-       struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
+       struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
+                                                     opcode, 0);
        struct ceph_pagelist *pagelist;
        size_t payload_len = 0;
        size_t size;
@@ -552,7 +558,8 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
                          u16 opcode, const char *name, const void *value,
                          size_t size, u8 cmp_op, u8 cmp_mode)
 {
-       struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
+       struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
+                                                     opcode, 0);
        struct ceph_pagelist *pagelist;
        size_t payload_len;
 
@@ -585,7 +592,8 @@ void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
                                unsigned int which, u16 opcode,
                                u64 cookie, u64 version, int flag)
 {
-       struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
+       struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
+                                                     opcode, 0);
 
        BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH);
 
@@ -602,7 +610,8 @@ void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
                                u64 expected_write_size)
 {
        struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
-                                                     CEPH_OSD_OP_SETALLOCHINT);
+                                                     CEPH_OSD_OP_SETALLOCHINT,
+                                                     0);
 
        op->alloc_hint.expected_object_size = expected_object_size;
        op->alloc_hint.expected_write_size = expected_write_size;
@@ -786,7 +795,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
        }
 
        if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) {
-               osd_req_op_init(req, which, opcode);
+               osd_req_op_init(req, which, opcode, 0);
        } else {
                u32 object_size = le32_to_cpu(layout->fl_object_size);
                u32 object_base = off - objoff;
@@ -1088,7 +1097,7 @@ static void __move_osd_to_lru(struct ceph_osd_client *osdc,
        BUG_ON(!list_empty(&osd->o_osd_lru));
 
        list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
-       osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ;
+       osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl;
 }
 
 static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc,
@@ -1199,7 +1208,7 @@ static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
 static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
 {
        schedule_delayed_work(&osdc->timeout_work,
-                       osdc->client->options->osd_keepalive_timeout * HZ);
+                             osdc->client->options->osd_keepalive_timeout);
 }
 
 static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
@@ -1567,10 +1576,9 @@ static void handle_timeout(struct work_struct *work)
 {
        struct ceph_osd_client *osdc =
                container_of(work, struct ceph_osd_client, timeout_work.work);
+       struct ceph_options *opts = osdc->client->options;
        struct ceph_osd_request *req;
        struct ceph_osd *osd;
-       unsigned long keepalive =
-               osdc->client->options->osd_keepalive_timeout * HZ;
        struct list_head slow_osds;
        dout("timeout\n");
        down_read(&osdc->map_sem);
@@ -1586,7 +1594,8 @@ static void handle_timeout(struct work_struct *work)
         */
        INIT_LIST_HEAD(&slow_osds);
        list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
-               if (time_before(jiffies, req->r_stamp + keepalive))
+               if (time_before(jiffies,
+                               req->r_stamp + opts->osd_keepalive_timeout))
                        break;
 
                osd = req->r_osd;
@@ -1613,8 +1622,7 @@ static void handle_osds_timeout(struct work_struct *work)
        struct ceph_osd_client *osdc =
                container_of(work, struct ceph_osd_client,
                             osds_timeout_work.work);
-       unsigned long delay =
-               osdc->client->options->osd_idle_ttl * HZ >> 2;
+       unsigned long delay = osdc->client->options->osd_idle_ttl / 4;
 
        dout("osds timeout\n");
        down_read(&osdc->map_sem);
@@ -2619,7 +2627,7 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
        osdc->event_count = 0;
 
        schedule_delayed_work(&osdc->osds_timeout_work,
-          round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ));
+           round_jiffies_relative(osdc->client->options->osd_idle_ttl));
 
        err = -ENOMEM;
        osdc->req_mempool = mempool_create_kmalloc_pool(10,
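
The ceph hunks above thread a new flags argument through osd_req_op_init() and its internal helper so a caller can attach per-op flags at init time; every caller touched in this patch still passes 0. A minimal caller-side sketch, purely for illustration (the flag is assumed to be one of the CEPH_OSD_OP_FLAG_* constants from rados.h, and "want_dontneed" is a hypothetical condition, not code from this patch):

	u32 op_flags = want_dontneed ? CEPH_OSD_OP_FLAG_FADVISE_DONTNEED : 0;

	/* hypothetical caller: the flag now travels with the op from init */
	osd_req_op_init(osd_req, which, CEPH_OSD_OP_STAT, op_flags);
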
index 15796696d64ede6b6a3118f5077972bed0c64350..4a3125836b64a0e5264e005badb7d108ddf9c47b 100644 (file)
@@ -89,7 +89,7 @@ static int crush_decode_tree_bucket(void **p, void *end,
 {
        int j;
        dout("crush_decode_tree_bucket %p to %p\n", *p, end);
-       ceph_decode_32_safe(p, end, b->num_nodes, bad);
+       ceph_decode_8_safe(p, end, b->num_nodes, bad);
        b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
        if (b->node_weights == NULL)
                return -ENOMEM;
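
The 32-bit to 8-bit decode change above matches the CRUSH tree bucket layout, where the node count is a single byte: decoding 32 bits consumed four bytes of the encoding where only one was present and truncated the value on assignment. For reference, the relevant structure as best remembered from crush/crush.h (check the header rather than relying on this sketch):

	struct crush_bucket_tree {
		struct crush_bucket h;	/* generic bucket header */
		__u8 num_nodes;		/* decoded by ceph_decode_8_safe above */
		__u32 *node_weights;
	};
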
index 096d91447e06e8a759ab8daf5283ddd4d27922b9..d4f5f220a8e55e063db6d3f3923a32be1ac33223 100644 (file)
@@ -51,10 +51,7 @@ void ceph_put_page_vector(struct page **pages, int num_pages, bool dirty)
                        set_page_dirty_lock(pages[i]);
                put_page(pages[i]);
        }
-       if (is_vmalloc_addr(pages))
-               vfree(pages);
-       else
-               kfree(pages);
+       kvfree(pages);
 }
 EXPORT_SYMBOL(ceph_put_page_vector);
 
index 65de0684e22a17862663096da407eee16bc33a31..61eafc9b4545d5a9e1ea405e196753b3608c5fa0 100644 (file)
@@ -197,11 +197,4 @@ static int __init ipv4_netfilter_init(void)
 {
        return nf_register_afinfo(&nf_ip_afinfo);
 }
-
-static void __exit ipv4_netfilter_fini(void)
-{
-       nf_unregister_afinfo(&nf_ip_afinfo);
-}
-
-module_init(ipv4_netfilter_init);
-module_exit(ipv4_netfilter_fini);
+subsys_initcall(ipv4_netfilter_init);
index 936ad0a15371ac4f1e49b8785ef645fa65583e83..b512fbd9d79a403ee980d40c1a21fc3fde47f215 100644 (file)
@@ -14,6 +14,6 @@ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
            sunrpc_syms.o cache.o rpc_pipe.o \
            svc_xprt.o
 sunrpc-$(CONFIG_SUNRPC_DEBUG) += debugfs.o
-sunrpc-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel_rqst.o bc_svc.o
+sunrpc-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel_rqst.o
 sunrpc-$(CONFIG_PROC_FS) += stats.o
 sunrpc-$(CONFIG_SYSCTL) += sysctl.o
index 9dd0ea8db463acc9daba0c51be89b1f17ec8f17d..9825ff0f91d6c0bde819105f639cae21883bbfad 100644 (file)
@@ -37,16 +37,18 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 static inline int xprt_need_to_requeue(struct rpc_xprt *xprt)
 {
-       return xprt->bc_alloc_count > 0;
+       return xprt->bc_alloc_count < atomic_read(&xprt->bc_free_slots);
 }
 
 static inline void xprt_inc_alloc_count(struct rpc_xprt *xprt, unsigned int n)
 {
+       atomic_add(n, &xprt->bc_free_slots);
        xprt->bc_alloc_count += n;
 }
 
 static inline int xprt_dec_alloc_count(struct rpc_xprt *xprt, unsigned int n)
 {
+       atomic_sub(n, &xprt->bc_free_slots);
        return xprt->bc_alloc_count -= n;
 }
 
@@ -60,13 +62,62 @@ static void xprt_free_allocation(struct rpc_rqst *req)
 
        dprintk("RPC:        free allocations for req= %p\n", req);
        WARN_ON_ONCE(test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state));
-       xbufp = &req->rq_private_buf;
+       xbufp = &req->rq_rcv_buf;
        free_page((unsigned long)xbufp->head[0].iov_base);
        xbufp = &req->rq_snd_buf;
        free_page((unsigned long)xbufp->head[0].iov_base);
        kfree(req);
 }
 
+static int xprt_alloc_xdr_buf(struct xdr_buf *buf, gfp_t gfp_flags)
+{
+       struct page *page;
+       /* Preallocate one XDR receive buffer */
+       page = alloc_page(gfp_flags);
+       if (page == NULL)
+               return -ENOMEM;
+       buf->head[0].iov_base = page_address(page);
+       buf->head[0].iov_len = PAGE_SIZE;
+       buf->tail[0].iov_base = NULL;
+       buf->tail[0].iov_len = 0;
+       buf->page_len = 0;
+       buf->len = 0;
+       buf->buflen = PAGE_SIZE;
+       return 0;
+}
+
+static
+struct rpc_rqst *xprt_alloc_bc_req(struct rpc_xprt *xprt, gfp_t gfp_flags)
+{
+       struct rpc_rqst *req;
+
+       /* Pre-allocate one backchannel rpc_rqst */
+       req = kzalloc(sizeof(*req), gfp_flags);
+       if (req == NULL)
+               return NULL;
+
+       req->rq_xprt = xprt;
+       INIT_LIST_HEAD(&req->rq_list);
+       INIT_LIST_HEAD(&req->rq_bc_list);
+
+       /* Preallocate one XDR receive buffer */
+       if (xprt_alloc_xdr_buf(&req->rq_rcv_buf, gfp_flags) < 0) {
+               printk(KERN_ERR "Failed to create bc receive xbuf\n");
+               goto out_free;
+       }
+       req->rq_rcv_buf.len = PAGE_SIZE;
+
+       /* Preallocate one XDR send buffer */
+       if (xprt_alloc_xdr_buf(&req->rq_snd_buf, gfp_flags) < 0) {
+               printk(KERN_ERR "Failed to create bc snd xbuf\n");
+               goto out_free;
+       }
+       return req;
+out_free:
+       xprt_free_allocation(req);
+       return NULL;
+}
+
 /*
  * Preallocate up to min_reqs structures and related buffers for use
  * by the backchannel.  This function can be called multiple times
@@ -87,9 +138,7 @@ static void xprt_free_allocation(struct rpc_rqst *req)
  */
 int xprt_setup_backchannel(struct rpc_xprt *xprt, unsigned int min_reqs)
 {
-       struct page *page_rcv = NULL, *page_snd = NULL;
-       struct xdr_buf *xbufp = NULL;
-       struct rpc_rqst *req, *tmp;
+       struct rpc_rqst *req;
        struct list_head tmp_list;
        int i;
 
@@ -106,7 +155,7 @@ int xprt_setup_backchannel(struct rpc_xprt *xprt, unsigned int min_reqs)
        INIT_LIST_HEAD(&tmp_list);
        for (i = 0; i < min_reqs; i++) {
                /* Pre-allocate one backchannel rpc_rqst */
-               req = kzalloc(sizeof(struct rpc_rqst), GFP_KERNEL);
+               req = xprt_alloc_bc_req(xprt, GFP_KERNEL);
                if (req == NULL) {
                        printk(KERN_ERR "Failed to create bc rpc_rqst\n");
                        goto out_free;
@@ -115,41 +164,6 @@ int xprt_setup_backchannel(struct rpc_xprt *xprt, unsigned int min_reqs)
                /* Add the allocated buffer to the tmp list */
                dprintk("RPC:       adding req= %p\n", req);
                list_add(&req->rq_bc_pa_list, &tmp_list);
-
-               req->rq_xprt = xprt;
-               INIT_LIST_HEAD(&req->rq_list);
-               INIT_LIST_HEAD(&req->rq_bc_list);
-
-               /* Preallocate one XDR receive buffer */
-               page_rcv = alloc_page(GFP_KERNEL);
-               if (page_rcv == NULL) {
-                       printk(KERN_ERR "Failed to create bc receive xbuf\n");
-                       goto out_free;
-               }
-               xbufp = &req->rq_rcv_buf;
-               xbufp->head[0].iov_base = page_address(page_rcv);
-               xbufp->head[0].iov_len = PAGE_SIZE;
-               xbufp->tail[0].iov_base = NULL;
-               xbufp->tail[0].iov_len = 0;
-               xbufp->page_len = 0;
-               xbufp->len = PAGE_SIZE;
-               xbufp->buflen = PAGE_SIZE;
-
-               /* Preallocate one XDR send buffer */
-               page_snd = alloc_page(GFP_KERNEL);
-               if (page_snd == NULL) {
-                       printk(KERN_ERR "Failed to create bc snd xbuf\n");
-                       goto out_free;
-               }
-
-               xbufp = &req->rq_snd_buf;
-               xbufp->head[0].iov_base = page_address(page_snd);
-               xbufp->head[0].iov_len = 0;
-               xbufp->tail[0].iov_base = NULL;
-               xbufp->tail[0].iov_len = 0;
-               xbufp->page_len = 0;
-               xbufp->len = 0;
-               xbufp->buflen = PAGE_SIZE;
        }
 
        /*
@@ -167,7 +181,10 @@ int xprt_setup_backchannel(struct rpc_xprt *xprt, unsigned int min_reqs)
        /*
         * Memory allocation failed, free the temporary list
         */
-       list_for_each_entry_safe(req, tmp, &tmp_list, rq_bc_pa_list) {
+       while (!list_empty(&tmp_list)) {
+               req = list_first_entry(&tmp_list,
+                               struct rpc_rqst,
+                               rq_bc_pa_list);
                list_del(&req->rq_bc_pa_list);
                xprt_free_allocation(req);
        }
@@ -217,9 +234,15 @@ static struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt, __be32 xid)
        struct rpc_rqst *req = NULL;
 
        dprintk("RPC:       allocate a backchannel request\n");
-       if (list_empty(&xprt->bc_pa_list))
+       if (atomic_read(&xprt->bc_free_slots) <= 0)
                goto not_found;
-
+       if (list_empty(&xprt->bc_pa_list)) {
+               req = xprt_alloc_bc_req(xprt, GFP_ATOMIC);
+               if (!req)
+                       goto not_found;
+               /* Note: this 'free' request adds it to xprt->bc_pa_list */
+               xprt_free_bc_request(req);
+       }
        req = list_first_entry(&xprt->bc_pa_list, struct rpc_rqst,
                                rq_bc_pa_list);
        req->rq_reply_bytes_recvd = 0;
@@ -245,11 +268,21 @@ void xprt_free_bc_request(struct rpc_rqst *req)
 
        req->rq_connect_cookie = xprt->connect_cookie - 1;
        smp_mb__before_atomic();
-       WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state));
        clear_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state);
        smp_mb__after_atomic();
 
-       if (!xprt_need_to_requeue(xprt)) {
+       /*
+        * Return it to the list of preallocations so that it
+        * may be reused by a new callback request.
+        */
+       spin_lock_bh(&xprt->bc_pa_lock);
+       if (xprt_need_to_requeue(xprt)) {
+               list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list);
+               xprt->bc_alloc_count++;
+               req = NULL;
+       }
+       spin_unlock_bh(&xprt->bc_pa_lock);
+       if (req != NULL) {
                /*
                 * The last remaining session was destroyed while this
                 * entry was in use.  Free the entry and don't attempt
@@ -260,14 +293,6 @@ void xprt_free_bc_request(struct rpc_rqst *req)
                xprt_free_allocation(req);
                return;
        }
-
-       /*
-        * Return it to the list of preallocations so that it
-        * may be reused by a new callback request.
-        */
-       spin_lock_bh(&xprt->bc_pa_lock);
-       list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list);
-       spin_unlock_bh(&xprt->bc_pa_lock);
 }
 
 /*
@@ -311,6 +336,7 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied)
 
        spin_lock(&xprt->bc_pa_lock);
        list_del(&req->rq_bc_pa_list);
+       xprt->bc_alloc_count--;
        spin_unlock(&xprt->bc_pa_lock);
 
        req->rq_private_buf.len = copied;

diff --git a/net/sunrpc/bc_svc.c b/net/sunrpc/bc_svc.c
deleted file mode 100644 (file)
index 15c7a8a..0000000
+++ /dev/null
@@ -1,63 +0,0 @@
-/******************************************************************************
-
-(c) 2007 Network Appliance, Inc.  All Rights Reserved.
-(c) 2009 NetApp.  All Rights Reserved.
-
-NetApp provides this source code under the GPL v2 License.
-The GPL v2 license is available at
-http://opensource.org/licenses/gpl-license.php.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-******************************************************************************/
-
-/*
- * The NFSv4.1 callback service helper routines.
- * They implement the transport level processing required to send the
- * reply over an existing open connection previously established by the client.
- */
-
-#include <linux/module.h>
-
-#include <linux/sunrpc/xprt.h>
-#include <linux/sunrpc/sched.h>
-#include <linux/sunrpc/bc_xprt.h>
-
-#define RPCDBG_FACILITY        RPCDBG_SVCDSP
-
-/* Empty callback ops */
-static const struct rpc_call_ops nfs41_callback_ops = {
-};
-
-
-/*
- * Send the callback reply
- */
-int bc_send(struct rpc_rqst *req)
-{
-       struct rpc_task *task;
-       int ret;
-
-       dprintk("RPC:       bc_send req= %p\n", req);
-       task = rpc_run_bc_task(req, &nfs41_callback_ops);
-       if (IS_ERR(task))
-               ret = PTR_ERR(task);
-       else {
-               WARN_ON_ONCE(atomic_read(&task->tk_count) != 1);
-               ret = task->tk_status;
-               rpc_put_task(task);
-       }
-       dprintk("RPC:       bc_send ret= %d\n", ret);
-       return ret;
-}
-
index e6ce1517367f884608640b2532080ab6566b9379..cbc6af923dd1cb0baabc95161989133150269d4f 100644 (file)
@@ -891,15 +891,8 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
                        task->tk_flags |= RPC_TASK_SOFT;
                if (clnt->cl_noretranstimeo)
                        task->tk_flags |= RPC_TASK_NO_RETRANS_TIMEOUT;
-               if (sk_memalloc_socks()) {
-                       struct rpc_xprt *xprt;
-
-                       rcu_read_lock();
-                       xprt = rcu_dereference(clnt->cl_xprt);
-                       if (xprt->swapper)
-                               task->tk_flags |= RPC_TASK_SWAPPER;
-                       rcu_read_unlock();
-               }
+               if (atomic_read(&clnt->cl_swapper))
+                       task->tk_flags |= RPC_TASK_SWAPPER;
                /* Add to the client's list of all tasks */
                spin_lock(&clnt->cl_lock);
                list_add_tail(&task->tk_task, &clnt->cl_tasks);
@@ -1031,15 +1024,14 @@ EXPORT_SYMBOL_GPL(rpc_call_async);
  * rpc_run_bc_task - Allocate a new RPC task for backchannel use, then run
  * rpc_execute against it
  * @req: RPC request
- * @tk_ops: RPC call ops
  */
-struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req,
-                               const struct rpc_call_ops *tk_ops)
+struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req)
 {
        struct rpc_task *task;
        struct xdr_buf *xbufp = &req->rq_snd_buf;
        struct rpc_task_setup task_setup_data = {
-               .callback_ops = tk_ops,
+               .callback_ops = &rpc_default_ops,
+               .flags = RPC_TASK_SOFTCONN,
        };
 
        dprintk("RPC: rpc_run_bc_task req= %p\n", req);
@@ -1614,6 +1606,7 @@ call_allocate(struct rpc_task *task)
                                        req->rq_callsize + req->rq_rcvsize);
        if (req->rq_buffer != NULL)
                return;
+       xprt_inject_disconnect(xprt);
 
        dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid);
 
@@ -1951,33 +1944,36 @@ call_bc_transmit(struct rpc_task *task)
 {
        struct rpc_rqst *req = task->tk_rqstp;
 
-       if (!xprt_prepare_transmit(task)) {
-               /*
-                * Could not reserve the transport. Try again after the
-                * transport is released.
-                */
-               task->tk_status = 0;
-               task->tk_action = call_bc_transmit;
-               return;
-       }
+       if (!xprt_prepare_transmit(task))
+               goto out_retry;
 
-       task->tk_action = rpc_exit_task;
        if (task->tk_status < 0) {
                printk(KERN_NOTICE "RPC: Could not send backchannel reply "
                        "error: %d\n", task->tk_status);
-               return;
+               goto out_done;
        }
+       if (req->rq_connect_cookie != req->rq_xprt->connect_cookie)
+               req->rq_bytes_sent = 0;
 
        xprt_transmit(task);
+
+       if (task->tk_status == -EAGAIN)
+               goto out_nospace;
+
        xprt_end_transmit(task);
        dprint_status(task);
        switch (task->tk_status) {
        case 0:
                /* Success */
-               break;
        case -EHOSTDOWN:
        case -EHOSTUNREACH:
        case -ENETUNREACH:
+       case -ECONNRESET:
+       case -ECONNREFUSED:
+       case -EADDRINUSE:
+       case -ENOTCONN:
+       case -EPIPE:
+               break;
        case -ETIMEDOUT:
                /*
                 * Problem reaching the server.  Disconnect and let the
@@ -2002,6 +1998,13 @@ call_bc_transmit(struct rpc_task *task)
                break;
        }
        rpc_wake_up_queued_task(&req->rq_xprt->pending, task);
+out_done:
+       task->tk_action = rpc_exit_task;
+       return;
+out_nospace:
+       req->rq_connect_cookie = req->rq_xprt->connect_cookie;
+out_retry:
+       task->tk_status = 0;
 }
 #endif /* CONFIG_SUNRPC_BACKCHANNEL */
 
@@ -2476,3 +2479,59 @@ void rpc_show_tasks(struct net *net)
        spin_unlock(&sn->rpc_client_lock);
 }
 #endif
+
+#if IS_ENABLED(CONFIG_SUNRPC_SWAP)
+int
+rpc_clnt_swap_activate(struct rpc_clnt *clnt)
+{
+       int ret = 0;
+       struct rpc_xprt *xprt;
+
+       if (atomic_inc_return(&clnt->cl_swapper) == 1) {
+retry:
+               rcu_read_lock();
+               xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
+               rcu_read_unlock();
+               if (!xprt) {
+                       /*
+                        * If we didn't get a reference, then we likely are
+                        * racing with a migration event. Wait for a grace
+                        * period and try again.
+                        */
+                       synchronize_rcu();
+                       goto retry;
+               }
+
+               ret = xprt_enable_swap(xprt);
+               xprt_put(xprt);
+       }
+       return ret;
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_swap_activate);
+
+void
+rpc_clnt_swap_deactivate(struct rpc_clnt *clnt)
+{
+       struct rpc_xprt *xprt;
+
+       if (atomic_dec_if_positive(&clnt->cl_swapper) == 0) {
+retry:
+               rcu_read_lock();
+               xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
+               rcu_read_unlock();
+               if (!xprt) {
+                       /*
+                        * If we didn't get a reference, then we likely are
+                        * racing with a migration event. Wait for a grace
+                        * period and try again.
+                        */
+                       synchronize_rcu();
+                       goto retry;
+               }
+
+               xprt_disable_swap(xprt);
+               xprt_put(xprt);
+       }
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_swap_deactivate);
+#endif /* CONFIG_SUNRPC_SWAP */
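
rpc_clnt_swap_activate() and rpc_clnt_swap_deactivate(), added above, replace the old per-socket xprt->swapper test in rpc_task_set_client() with the cl_swapper counter plus the transport's enable_swap/disable_swap ops. A hedged usage sketch; the function names below are hypothetical stand-ins for whatever code turns swap-over-RPC on and off for a client:

	/* Illustrative only: bracket the lifetime of a swapfile on this client. */
	static int example_enable_swap(struct rpc_clnt *clnt)
	{
		return rpc_clnt_swap_activate(clnt);	/* first activation enables swap on the xprt */
	}

	static void example_disable_swap(struct rpc_clnt *clnt)
	{
		rpc_clnt_swap_deactivate(clnt);		/* last deactivation disables it again */
	}
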
index 82962f7e6e888f619ad79754f038732d5d5b6333..e7b4d93566df42dfa5ecf985152235e539ed9933 100644 (file)
 #include "netns.h"
 
 static struct dentry *topdir;
+static struct dentry *rpc_fault_dir;
 static struct dentry *rpc_clnt_dir;
 static struct dentry *rpc_xprt_dir;
 
+unsigned int rpc_inject_disconnect;
+
 struct rpc_clnt_iter {
        struct rpc_clnt *clnt;
        loff_t          pos;
@@ -257,6 +260,8 @@ rpc_xprt_debugfs_register(struct rpc_xprt *xprt)
                debugfs_remove_recursive(xprt->debugfs);
                xprt->debugfs = NULL;
        }
+
+       atomic_set(&xprt->inject_disconnect, rpc_inject_disconnect);
 }
 
 void
@@ -266,11 +271,79 @@ rpc_xprt_debugfs_unregister(struct rpc_xprt *xprt)
        xprt->debugfs = NULL;
 }
 
+static int
+fault_open(struct inode *inode, struct file *filp)
+{
+       filp->private_data = kmalloc(128, GFP_KERNEL);
+       if (!filp->private_data)
+               return -ENOMEM;
+       return 0;
+}
+
+static int
+fault_release(struct inode *inode, struct file *filp)
+{
+       kfree(filp->private_data);
+       return 0;
+}
+
+static ssize_t
+fault_disconnect_read(struct file *filp, char __user *user_buf,
+                     size_t len, loff_t *offset)
+{
+       char *buffer = (char *)filp->private_data;
+       size_t size;
+
+       size = sprintf(buffer, "%u\n", rpc_inject_disconnect);
+       return simple_read_from_buffer(user_buf, len, offset, buffer, size);
+}
+
+static ssize_t
+fault_disconnect_write(struct file *filp, const char __user *user_buf,
+                      size_t len, loff_t *offset)
+{
+       char buffer[16];
+
+       if (len >= sizeof(buffer))
+               len = sizeof(buffer) - 1;
+       if (copy_from_user(buffer, user_buf, len))
+               return -EFAULT;
+       buffer[len] = '\0';
+       if (kstrtouint(buffer, 10, &rpc_inject_disconnect))
+               return -EINVAL;
+       return len;
+}
+
+static const struct file_operations fault_disconnect_fops = {
+       .owner          = THIS_MODULE,
+       .open           = fault_open,
+       .read           = fault_disconnect_read,
+       .write          = fault_disconnect_write,
+       .release        = fault_release,
+};
+
+static struct dentry *
+inject_fault_dir(struct dentry *topdir)
+{
+       struct dentry *faultdir;
+
+       faultdir = debugfs_create_dir("inject_fault", topdir);
+       if (!faultdir)
+               return NULL;
+
+       if (!debugfs_create_file("disconnect", S_IFREG | S_IRUSR, faultdir,
+                                NULL, &fault_disconnect_fops))
+               return NULL;
+
+       return faultdir;
+}
+
 void __exit
 sunrpc_debugfs_exit(void)
 {
        debugfs_remove_recursive(topdir);
        topdir = NULL;
+       rpc_fault_dir = NULL;
        rpc_clnt_dir = NULL;
        rpc_xprt_dir = NULL;
 }
@@ -282,6 +355,10 @@ sunrpc_debugfs_init(void)
        if (!topdir)
                return;
 
+       rpc_fault_dir = inject_fault_dir(topdir);
+       if (!rpc_fault_dir)
+               goto out_remove;
+
        rpc_clnt_dir = debugfs_create_dir("rpc_clnt", topdir);
        if (!rpc_clnt_dir)
                goto out_remove;
@@ -294,5 +371,6 @@ sunrpc_debugfs_init(void)
 out_remove:
        debugfs_remove_recursive(topdir);
        topdir = NULL;
+       rpc_fault_dir = NULL;
        rpc_clnt_dir = NULL;
 }
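
The debugfs file created above, sunrpc/inject_fault/disconnect, only stores a count in rpc_inject_disconnect; the xprt_inject_disconnect() calls added in clnt.c and xprt.c are what consume it. The helper's definition is not shown on this page, so the following is a hedged reconstruction of its likely shape (a per-transport countdown, re-armed from the global value, that forces a disconnect through the new ->inject_disconnect op), not the patch's actual code:

	static inline void xprt_inject_disconnect(struct rpc_xprt *xprt)
	{
		if (!rpc_inject_disconnect)
			return;				/* fault injection disabled */
		if (atomic_dec_return(&xprt->inject_disconnect) != 0)
			return;				/* not this call's turn to fault */
		/* re-arm the per-transport counter and force a disconnect */
		atomic_set(&xprt->inject_disconnect, rpc_inject_disconnect);
		xprt->ops->inject_disconnect(xprt);
	}
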
index 852ae606b02a37760a5a4dc1fd1860b8a45b89a6..5a16d8d8c831c4ad2805f5958b9ccef63449af82 100644 (file)
@@ -1350,6 +1350,11 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
 {
        struct kvec     *argv = &rqstp->rq_arg.head[0];
        struct kvec     *resv = &rqstp->rq_res.head[0];
+       struct rpc_task *task;
+       int proc_error;
+       int error;
+
+       dprintk("svc: %s(%p)\n", __func__, req);
 
        /* Build the svc_rqst used by the common processing routine */
        rqstp->rq_xprt = serv->sv_bc_xprt;
@@ -1372,21 +1377,36 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
 
        /*
         * Skip the next two words because they've already been
-        * processed in the trasport
+        * processed in the transport
         */
        svc_getu32(argv);       /* XID */
        svc_getnl(argv);        /* CALLDIR */
 
-       /* Returns 1 for send, 0 for drop */
-       if (svc_process_common(rqstp, argv, resv)) {
-               memcpy(&req->rq_snd_buf, &rqstp->rq_res,
-                                               sizeof(req->rq_snd_buf));
-               return bc_send(req);
-       } else {
-               /* drop request */
+       /* Parse and execute the bc call */
+       proc_error = svc_process_common(rqstp, argv, resv);
+
+       atomic_inc(&req->rq_xprt->bc_free_slots);
+       if (!proc_error) {
+               /* Processing error: drop the request */
                xprt_free_bc_request(req);
                return 0;
        }
+
+       /* Finally, send the reply synchronously */
+       memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf));
+       task = rpc_run_bc_task(req);
+       if (IS_ERR(task)) {
+               error = PTR_ERR(task);
+               goto out;
+       }
+
+       WARN_ON_ONCE(atomic_read(&task->tk_count) != 1);
+       error = task->tk_status;
+       rpc_put_task(task);
+
+out:
+       dprintk("svc: %s(), error=%d\n", __func__, error);
+       return error;
 }
 EXPORT_SYMBOL_GPL(bc_svc_process);
 #endif /* CONFIG_SUNRPC_BACKCHANNEL */
index 1d4fe24af06a1115bd80538c5346ae2f843f1eb8..ab5dd621ae0c0795a0d86e1a9fb83c5cc2812486 100644 (file)
@@ -68,6 +68,7 @@ static void    xprt_init(struct rpc_xprt *xprt, struct net *net);
 static void    xprt_request_init(struct rpc_task *, struct rpc_xprt *);
 static void    xprt_connect_status(struct rpc_task *task);
 static int      __xprt_get_cong(struct rpc_xprt *, struct rpc_task *);
+static void     __xprt_put_cong(struct rpc_xprt *, struct rpc_rqst *);
 static void     xprt_destroy(struct rpc_xprt *xprt);
 
 static DEFINE_SPINLOCK(xprt_list_lock);
@@ -250,6 +251,8 @@ int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
        }
        xprt_clear_locked(xprt);
 out_sleep:
+       if (req)
+               __xprt_put_cong(xprt, req);
        dprintk("RPC: %5u failed to lock transport %p\n", task->tk_pid, xprt);
        task->tk_timeout = 0;
        task->tk_status = -EAGAIN;
@@ -608,8 +611,8 @@ static void xprt_autoclose(struct work_struct *work)
        struct rpc_xprt *xprt =
                container_of(work, struct rpc_xprt, task_cleanup);
 
-       xprt->ops->close(xprt);
        clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
+       xprt->ops->close(xprt);
        xprt_release_write(xprt, NULL);
 }
 
@@ -967,6 +970,7 @@ void xprt_transmit(struct rpc_task *task)
                task->tk_status = status;
                return;
        }
+       xprt_inject_disconnect(xprt);
 
        dprintk("RPC: %5u xmit complete\n", task->tk_pid);
        task->tk_flags |= RPC_TASK_SENT;
@@ -1285,6 +1289,7 @@ void xprt_release(struct rpc_task *task)
        spin_unlock_bh(&xprt->transport_lock);
        if (req->rq_buffer)
                xprt->ops->buf_free(req->rq_buffer);
+       xprt_inject_disconnect(xprt);
        if (req->rq_cred != NULL)
                put_rpccred(req->rq_cred);
        task->tk_rqstp = NULL;
index 302d4ebf6fbfb2a2c15780ef966b52bb473b91e4..f1e8dafbd5079b3406a769ba4854ecba229edca6 100644 (file)
  * can take tens of usecs to complete.
  */
 
+/* Normal operation
+ *
+ * A Memory Region is prepared for RDMA READ or WRITE using the
+ * ib_map_phys_fmr verb (fmr_op_map). When the RDMA operation is
+ * finished, the Memory Region is unmapped using the ib_unmap_fmr
+ * verb (fmr_op_unmap).
+ */
+
+/* Transport recovery
+ *
+ * After a transport reconnect, fmr_op_map re-uses the MR already
+ * allocated for the RPC, but generates a fresh rkey then maps the
+ * MR again. This process is synchronous.
+ */
+
 #include "xprt_rdma.h"
 
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
@@ -50,19 +65,28 @@ fmr_op_init(struct rpcrdma_xprt *r_xprt)
        struct rpcrdma_mw *r;
        int i, rc;
 
+       spin_lock_init(&buf->rb_mwlock);
        INIT_LIST_HEAD(&buf->rb_mws);
        INIT_LIST_HEAD(&buf->rb_all);
 
-       i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
-       dprintk("RPC:       %s: initializing %d FMRs\n", __func__, i);
+       i = max_t(int, RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES, 1);
+       i += 2;                         /* head + tail */
+       i *= buf->rb_max_requests;      /* one set for each RPC slot */
+       dprintk("RPC:       %s: initalizing %d FMRs\n", __func__, i);
 
+       rc = -ENOMEM;
        while (i--) {
                r = kzalloc(sizeof(*r), GFP_KERNEL);
                if (!r)
-                       return -ENOMEM;
+                       goto out;
 
-               r->r.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr);
-               if (IS_ERR(r->r.fmr))
+               r->r.fmr.physaddrs = kmalloc(RPCRDMA_MAX_FMR_SGES *
+                                            sizeof(u64), GFP_KERNEL);
+               if (!r->r.fmr.physaddrs)
+                       goto out_free;
+
+               r->r.fmr.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr);
+               if (IS_ERR(r->r.fmr.fmr))
                        goto out_fmr_err;
 
                list_add(&r->mw_list, &buf->rb_mws);
@@ -71,12 +95,24 @@ fmr_op_init(struct rpcrdma_xprt *r_xprt)
        return 0;
 
 out_fmr_err:
-       rc = PTR_ERR(r->r.fmr);
+       rc = PTR_ERR(r->r.fmr.fmr);
        dprintk("RPC:       %s: ib_alloc_fmr status %i\n", __func__, rc);
+       kfree(r->r.fmr.physaddrs);
+out_free:
        kfree(r);
+out:
        return rc;
 }
 
+static int
+__fmr_unmap(struct rpcrdma_mw *r)
+{
+       LIST_HEAD(l);
+
+       list_add(&r->r.fmr.fmr->list, &l);
+       return ib_unmap_fmr(&l);
+}
+
 /* Use the ib_map_phys_fmr() verb to register a memory region
  * for remote access via RDMA READ or RDMA WRITE.
  */
@@ -85,12 +121,24 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
           int nsegs, bool writing)
 {
        struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-       struct ib_device *device = ia->ri_id->device;
+       struct ib_device *device = ia->ri_device;
        enum dma_data_direction direction = rpcrdma_data_dir(writing);
        struct rpcrdma_mr_seg *seg1 = seg;
-       struct rpcrdma_mw *mw = seg1->rl_mw;
-       u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
        int len, pageoff, i, rc;
+       struct rpcrdma_mw *mw;
+
+       mw = seg1->rl_mw;
+       seg1->rl_mw = NULL;
+       if (!mw) {
+               mw = rpcrdma_get_mw(r_xprt);
+               if (!mw)
+                       return -ENOMEM;
+       } else {
+               /* this is a retransmit; generate a fresh rkey */
+               rc = __fmr_unmap(mw);
+               if (rc)
+                       return rc;
+       }
 
        pageoff = offset_in_page(seg1->mr_offset);
        seg1->mr_offset -= pageoff;     /* start of page */
@@ -100,7 +148,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
                nsegs = RPCRDMA_MAX_FMR_SGES;
        for (i = 0; i < nsegs;) {
                rpcrdma_map_one(device, seg, direction);
-               physaddrs[i] = seg->mr_dma;
+               mw->r.fmr.physaddrs[i] = seg->mr_dma;
                len += seg->mr_len;
                ++seg;
                ++i;
@@ -110,11 +158,13 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
                        break;
        }
 
-       rc = ib_map_phys_fmr(mw->r.fmr, physaddrs, i, seg1->mr_dma);
+       rc = ib_map_phys_fmr(mw->r.fmr.fmr, mw->r.fmr.physaddrs,
+                            i, seg1->mr_dma);
        if (rc)
                goto out_maperr;
 
-       seg1->mr_rkey = mw->r.fmr->rkey;
+       seg1->rl_mw = mw;
+       seg1->mr_rkey = mw->r.fmr.fmr->rkey;
        seg1->mr_base = seg1->mr_dma + pageoff;
        seg1->mr_nsegs = i;
        seg1->mr_len = len;
@@ -137,48 +187,28 @@ fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
 {
        struct rpcrdma_ia *ia = &r_xprt->rx_ia;
        struct rpcrdma_mr_seg *seg1 = seg;
-       struct ib_device *device;
+       struct rpcrdma_mw *mw = seg1->rl_mw;
        int rc, nsegs = seg->mr_nsegs;
-       LIST_HEAD(l);
 
-       list_add(&seg1->rl_mw->r.fmr->list, &l);
-       rc = ib_unmap_fmr(&l);
-       read_lock(&ia->ri_qplock);
-       device = ia->ri_id->device;
+       dprintk("RPC:       %s: FMR %p\n", __func__, mw);
+
+       seg1->rl_mw = NULL;
        while (seg1->mr_nsegs--)
-               rpcrdma_unmap_one(device, seg++);
-       read_unlock(&ia->ri_qplock);
+               rpcrdma_unmap_one(ia->ri_device, seg++);
+       rc = __fmr_unmap(mw);
        if (rc)
                goto out_err;
+       rpcrdma_put_mw(r_xprt, mw);
        return nsegs;
 
 out_err:
+       /* The FMR is abandoned, but remains in rb_all. fmr_op_destroy
+        * will attempt to release it when the transport is destroyed.
+        */
        dprintk("RPC:       %s: ib_unmap_fmr status %i\n", __func__, rc);
        return nsegs;
 }
 
-/* After a disconnect, unmap all FMRs.
- *
- * This is invoked only in the transport connect worker in order
- * to serialize with rpcrdma_register_fmr_external().
- */
-static void
-fmr_op_reset(struct rpcrdma_xprt *r_xprt)
-{
-       struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
-       struct rpcrdma_mw *r;
-       LIST_HEAD(list);
-       int rc;
-
-       list_for_each_entry(r, &buf->rb_all, mw_all)
-               list_add(&r->r.fmr->list, &list);
-
-       rc = ib_unmap_fmr(&list);
-       if (rc)
-               dprintk("RPC:       %s: ib_unmap_fmr failed %i\n",
-                       __func__, rc);
-}
-
 static void
 fmr_op_destroy(struct rpcrdma_buffer *buf)
 {
@@ -188,10 +218,13 @@ fmr_op_destroy(struct rpcrdma_buffer *buf)
        while (!list_empty(&buf->rb_all)) {
                r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
                list_del(&r->mw_all);
-               rc = ib_dealloc_fmr(r->r.fmr);
+               kfree(r->r.fmr.physaddrs);
+
+               rc = ib_dealloc_fmr(r->r.fmr.fmr);
                if (rc)
                        dprintk("RPC:       %s: ib_dealloc_fmr failed %i\n",
                                __func__, rc);
+
                kfree(r);
        }
 }
@@ -202,7 +235,6 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
        .ro_open                        = fmr_op_open,
        .ro_maxpages                    = fmr_op_maxpages,
        .ro_init                        = fmr_op_init,
-       .ro_reset                       = fmr_op_reset,
        .ro_destroy                     = fmr_op_destroy,
        .ro_displayname                 = "fmr",
 };
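
The new FMR pool sizing above allocates (max(RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES, 1) + 2) MWs per RPC slot instead of a flat (rb_max_requests + 1) * RPCRDMA_MAX_SEGS. As a worked example under assumed values (RPCRDMA_MAX_DATA_SEGS = 64, RPCRDMA_MAX_FMR_SGES = 64, and, say, 32 RPC slots; check xprt_rdma.h for the real constants): max(64 / 64, 1) + 2 = 3 MWs per slot, so 3 * 32 = 96 FMRs for the whole buffer pool.
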
index d234521320a4bb45a9bbecd40e5e58bbb42d77d8..04ea914201b237cc6f42ce68caa6b5dbc7b29d59 100644 (file)
  * but most complex memory registration mode.
  */
 
+/* Normal operation
+ *
+ * A Memory Region is prepared for RDMA READ or WRITE using a FAST_REG
+ * Work Request (frmr_op_map). When the RDMA operation is finished, this
+ * Memory Region is invalidated using a LOCAL_INV Work Request
+ * (frmr_op_unmap).
+ *
+ * Typically these Work Requests are not signaled, and neither are RDMA
+ * SEND Work Requests (with the exception of signaling occasionally to
+ * prevent provider work queue overflows). This greatly reduces HCA
+ * interrupt workload.
+ *
+ * As an optimization, frwr_op_unmap marks MRs INVALID before the
+ * LOCAL_INV WR is posted. If posting succeeds, the MR is placed on
+ * rb_mws immediately so that no work (like managing a linked list
+ * under a spinlock) is needed in the completion upcall.
+ *
+ * But this means that frwr_op_map() can occasionally encounter an MR
+ * that is INVALID but the LOCAL_INV WR has not completed. Work Queue
+ * ordering prevents a subsequent FAST_REG WR from executing against
+ * that MR while it is still being invalidated.
+ */
+
+/* Transport recovery
+ *
+ * ->op_map and the transport connect worker cannot run at the same
+ * time, but ->op_unmap can fire while the transport connect worker
+ * is running. Thus MR recovery is handled in ->op_map, to guarantee
+ * that recovered MRs are owned by a sending RPC, and not one where
+ * ->op_unmap could fire at the same time transport reconnect is
+ * being done.
+ *
+ * When the underlying transport disconnects, MRs are left in one of
+ * three states:
+ *
+ * INVALID:    The MR was not in use before the QP entered ERROR state.
+ *             (Or, the LOCAL_INV WR has not completed or flushed yet).
+ *
+ * STALE:      The MR was being registered or unregistered when the QP
+ *             entered ERROR state, and the pending WR was flushed.
+ *
+ * VALID:      The MR was registered before the QP entered ERROR state.
+ *
+ * When frwr_op_map encounters STALE and VALID MRs, they are recovered
+ * with ib_dereg_mr and then are re-initialized. Because MR recovery
+ * allocates fresh resources, it is deferred to a workqueue, and the
+ * recovered MRs are placed back on the rb_mws list when recovery is
+ * complete. frwr_op_map allocates another MR for the current RPC while
+ * the broken MR is reset.
+ *
+ * To ensure that frwr_op_map doesn't encounter an MR that is marked
+ * INVALID but that is about to be flushed due to a previous transport
+ * disconnect, the transport connect worker attempts to drain all
+ * pending send queue WRs before the transport is reconnected.
+ */
+
 #include "xprt_rdma.h"
 
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 # define RPCDBG_FACILITY       RPCDBG_TRANS
 #endif
 
+static struct workqueue_struct *frwr_recovery_wq;
+
+#define FRWR_RECOVERY_WQ_FLAGS         (WQ_UNBOUND | WQ_MEM_RECLAIM)
+
+int
+frwr_alloc_recovery_wq(void)
+{
+       frwr_recovery_wq = alloc_workqueue("frwr_recovery",
+                                          FRWR_RECOVERY_WQ_FLAGS, 0);
+       return !frwr_recovery_wq ? -ENOMEM : 0;
+}
+
+void
+frwr_destroy_recovery_wq(void)
+{
+       struct workqueue_struct *wq;
+
+       if (!frwr_recovery_wq)
+               return;
+
+       wq = frwr_recovery_wq;
+       frwr_recovery_wq = NULL;
+       destroy_workqueue(wq);
+}
+
+/* Deferred reset of a single FRMR. Generate a fresh rkey by
+ * replacing the MR.
+ *
+ * There's no recovery if this fails. The FRMR is abandoned, but
+ * remains in rb_all. It will be cleaned up when the transport is
+ * destroyed.
+ */
+static void
+__frwr_recovery_worker(struct work_struct *work)
+{
+       struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw,
+                                           r.frmr.fr_work);
+       struct rpcrdma_xprt *r_xprt = r->r.frmr.fr_xprt;
+       unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
+       struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
+
+       if (ib_dereg_mr(r->r.frmr.fr_mr))
+               goto out_fail;
+
+       r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(pd, depth);
+       if (IS_ERR(r->r.frmr.fr_mr))
+               goto out_fail;
+
+       dprintk("RPC:       %s: recovered FRMR %p\n", __func__, r);
+       r->r.frmr.fr_state = FRMR_IS_INVALID;
+       rpcrdma_put_mw(r_xprt, r);
+       return;
+
+out_fail:
+       pr_warn("RPC:       %s: FRMR %p unrecovered\n",
+               __func__, r);
+}
+
+/* A broken MR was discovered in a context that can't sleep.
+ * Defer recovery to the recovery worker.
+ */
+static void
+__frwr_queue_recovery(struct rpcrdma_mw *r)
+{
+       INIT_WORK(&r->r.frmr.fr_work, __frwr_recovery_worker);
+       queue_work(frwr_recovery_wq, &r->r.frmr.fr_work);
+}
+
 static int
 __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
            unsigned int depth)
@@ -128,7 +252,7 @@ frwr_sendcompletion(struct ib_wc *wc)
 
        /* WARNING: Only wr_id and status are reliable at this point */
        r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
-       dprintk("RPC:       %s: frmr %p (stale), status %s (%d)\n",
+       pr_warn("RPC:       %s: frmr %p flushed, status %s (%d)\n",
                __func__, r, ib_wc_status_msg(wc->status), wc->status);
        r->r.frmr.fr_state = FRMR_IS_STALE;
 }
@@ -137,16 +261,19 @@ static int
 frwr_op_init(struct rpcrdma_xprt *r_xprt)
 {
        struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
-       struct ib_device *device = r_xprt->rx_ia.ri_id->device;
+       struct ib_device *device = r_xprt->rx_ia.ri_device;
        unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
        struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
        int i;
 
+       spin_lock_init(&buf->rb_mwlock);
        INIT_LIST_HEAD(&buf->rb_mws);
        INIT_LIST_HEAD(&buf->rb_all);
 
-       i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
-       dprintk("RPC:       %s: initializing %d FRMRs\n", __func__, i);
+       i = max_t(int, RPCRDMA_MAX_DATA_SEGS / depth, 1);
+       i += 2;                         /* head + tail */
+       i *= buf->rb_max_requests;      /* one set for each RPC slot */
+       dprintk("RPC:       %s: initalizing %d FRMRs\n", __func__, i);
 
        while (i--) {
                struct rpcrdma_mw *r;
@@ -165,6 +292,7 @@ frwr_op_init(struct rpcrdma_xprt *r_xprt)
                list_add(&r->mw_list, &buf->rb_mws);
                list_add(&r->mw_all, &buf->rb_all);
                r->mw_sendcompletion = frwr_sendcompletion;
+               r->r.frmr.fr_xprt = r_xprt;
        }
 
        return 0;
@@ -178,12 +306,12 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
            int nsegs, bool writing)
 {
        struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-       struct ib_device *device = ia->ri_id->device;
+       struct ib_device *device = ia->ri_device;
        enum dma_data_direction direction = rpcrdma_data_dir(writing);
        struct rpcrdma_mr_seg *seg1 = seg;
-       struct rpcrdma_mw *mw = seg1->rl_mw;
-       struct rpcrdma_frmr *frmr = &mw->r.frmr;
-       struct ib_mr *mr = frmr->fr_mr;
+       struct rpcrdma_mw *mw;
+       struct rpcrdma_frmr *frmr;
+       struct ib_mr *mr;
        struct ib_send_wr fastreg_wr, *bad_wr;
        u8 key;
        int len, pageoff;
@@ -192,12 +320,25 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
        u64 pa;
        int page_no;
 
+       mw = seg1->rl_mw;
+       seg1->rl_mw = NULL;
+       do {
+               if (mw)
+                       __frwr_queue_recovery(mw);
+               mw = rpcrdma_get_mw(r_xprt);
+               if (!mw)
+                       return -ENOMEM;
+       } while (mw->r.frmr.fr_state != FRMR_IS_INVALID);
+       frmr = &mw->r.frmr;
+       frmr->fr_state = FRMR_IS_VALID;
+
        pageoff = offset_in_page(seg1->mr_offset);
        seg1->mr_offset -= pageoff;     /* start of page */
        seg1->mr_len += pageoff;
        len = -pageoff;
        if (nsegs > ia->ri_max_frmr_depth)
                nsegs = ia->ri_max_frmr_depth;
+
        for (page_no = i = 0; i < nsegs;) {
                rpcrdma_map_one(device, seg, direction);
                pa = seg->mr_dma;
@@ -216,8 +357,6 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
        dprintk("RPC:       %s: Using frmr %p to map %d segments (%d bytes)\n",
                __func__, mw, i, len);
 
-       frmr->fr_state = FRMR_IS_VALID;
-
        memset(&fastreg_wr, 0, sizeof(fastreg_wr));
        fastreg_wr.wr_id = (unsigned long)(void *)mw;
        fastreg_wr.opcode = IB_WR_FAST_REG_MR;
@@ -229,6 +368,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
        fastreg_wr.wr.fast_reg.access_flags = writing ?
                                IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
                                IB_ACCESS_REMOTE_READ;
+       mr = frmr->fr_mr;
        key = (u8)(mr->rkey & 0x000000FF);
        ib_update_fast_reg_key(mr, ++key);
        fastreg_wr.wr.fast_reg.rkey = mr->rkey;
@@ -238,6 +378,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
        if (rc)
                goto out_senderr;
 
+       seg1->rl_mw = mw;
        seg1->mr_rkey = mr->rkey;
        seg1->mr_base = seg1->mr_dma + pageoff;
        seg1->mr_nsegs = i;
@@ -246,10 +387,9 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 
 out_senderr:
        dprintk("RPC:       %s: ib_post_send status %i\n", __func__, rc);
-       ib_update_fast_reg_key(mr, --key);
-       frmr->fr_state = FRMR_IS_INVALID;
        while (i--)
                rpcrdma_unmap_one(device, --seg);
+       __frwr_queue_recovery(mw);
        return rc;
 }
 
@@ -261,78 +401,46 @@ frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
 {
        struct rpcrdma_mr_seg *seg1 = seg;
        struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+       struct rpcrdma_mw *mw = seg1->rl_mw;
        struct ib_send_wr invalidate_wr, *bad_wr;
        int rc, nsegs = seg->mr_nsegs;
-       struct ib_device *device;
 
-       seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
+       dprintk("RPC:       %s: FRMR %p\n", __func__, mw);
+
+       seg1->rl_mw = NULL;
+       mw->r.frmr.fr_state = FRMR_IS_INVALID;
 
        memset(&invalidate_wr, 0, sizeof(invalidate_wr));
-       invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw;
+       invalidate_wr.wr_id = (unsigned long)(void *)mw;
        invalidate_wr.opcode = IB_WR_LOCAL_INV;
-       invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey;
+       invalidate_wr.ex.invalidate_rkey = mw->r.frmr.fr_mr->rkey;
        DECR_CQCOUNT(&r_xprt->rx_ep);
 
-       read_lock(&ia->ri_qplock);
-       device = ia->ri_id->device;
        while (seg1->mr_nsegs--)
-               rpcrdma_unmap_one(device, seg++);
+               rpcrdma_unmap_one(ia->ri_device, seg++);
+       read_lock(&ia->ri_qplock);
        rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
        read_unlock(&ia->ri_qplock);
        if (rc)
                goto out_err;
+
+       rpcrdma_put_mw(r_xprt, mw);
        return nsegs;
 
 out_err:
-       /* Force rpcrdma_buffer_get() to retry */
-       seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
        dprintk("RPC:       %s: ib_post_send status %i\n", __func__, rc);
+       __frwr_queue_recovery(mw);
        return nsegs;
 }
 
-/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
- * an unusable state. Find FRMRs in this state and dereg / reg
- * each.  FRMRs that are VALID and attached to an rpcrdma_req are
- * also torn down.
- *
- * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
- *
- * This is invoked only in the transport connect worker in order
- * to serialize with rpcrdma_register_frmr_external().
- */
-static void
-frwr_op_reset(struct rpcrdma_xprt *r_xprt)
-{
-       struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
-       struct ib_device *device = r_xprt->rx_ia.ri_id->device;
-       unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
-       struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
-       struct rpcrdma_mw *r;
-       int rc;
-
-       list_for_each_entry(r, &buf->rb_all, mw_all) {
-               if (r->r.frmr.fr_state == FRMR_IS_INVALID)
-                       continue;
-
-               __frwr_release(r);
-               rc = __frwr_init(r, pd, device, depth);
-               if (rc) {
-                       dprintk("RPC:       %s: mw %p left %s\n",
-                               __func__, r,
-                               (r->r.frmr.fr_state == FRMR_IS_STALE ?
-                                       "stale" : "valid"));
-                       continue;
-               }
-
-               r->r.frmr.fr_state = FRMR_IS_INVALID;
-       }
-}
-
 static void
 frwr_op_destroy(struct rpcrdma_buffer *buf)
 {
        struct rpcrdma_mw *r;
 
+       /* Ensure stale MWs for "buf" are no longer in flight */
+       flush_workqueue(frwr_recovery_wq);
+
        while (!list_empty(&buf->rb_all)) {
                r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
                list_del(&r->mw_all);
@@ -347,7 +455,6 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
        .ro_open                        = frwr_op_open,
        .ro_maxpages                    = frwr_op_maxpages,
        .ro_init                        = frwr_op_init,
-       .ro_reset                       = frwr_op_reset,
        .ro_destroy                     = frwr_op_destroy,
        .ro_displayname                 = "frwr",
 };
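
To make the long "Normal operation" / "Transport recovery" comments above easier to map onto the code, here is a short recap of the three FRMR states as this file now handles them; this is editorial annotation, not part of the patch:

	/*
	 * FRMR_IS_INVALID: free; frwr_op_map may post a FAST_REG WR against it
	 * FRMR_IS_VALID:   registered; owned by an in-flight RPC
	 * FRMR_IS_STALE:   its WR was flushed by a disconnect; __frwr_queue_recovery
	 *                  hands it to __frwr_recovery_worker, which calls
	 *                  ib_dereg_mr + ib_alloc_fast_reg_mr to mint a fresh rkey
	 */
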
index ba518af167873dfe2e9c1f5f6723665bda2bd2e7..41985d07fdb744b5d9523b7c34af93c30f70522d 100644 (file)
@@ -50,8 +50,7 @@ physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 {
        struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 
-       rpcrdma_map_one(ia->ri_id->device, seg,
-                       rpcrdma_data_dir(writing));
+       rpcrdma_map_one(ia->ri_device, seg, rpcrdma_data_dir(writing));
        seg->mr_rkey = ia->ri_bind_mem->rkey;
        seg->mr_base = seg->mr_dma;
        seg->mr_nsegs = 1;
@@ -65,18 +64,10 @@ physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
 {
        struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 
-       read_lock(&ia->ri_qplock);
-       rpcrdma_unmap_one(ia->ri_id->device, seg);
-       read_unlock(&ia->ri_qplock);
-
+       rpcrdma_unmap_one(ia->ri_device, seg);
        return 1;
 }
 
-static void
-physical_op_reset(struct rpcrdma_xprt *r_xprt)
-{
-}
-
 static void
 physical_op_destroy(struct rpcrdma_buffer *buf)
 {
@@ -88,7 +79,6 @@ const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
        .ro_open                        = physical_op_open,
        .ro_maxpages                    = physical_op_maxpages,
        .ro_init                        = physical_op_init,
-       .ro_reset                       = physical_op_reset,
        .ro_destroy                     = physical_op_destroy,
        .ro_displayname                 = "physical",
 };
index 2c53ea9e1b83dae01ebdd1aa22d256174dfbae08..84ea37daef36b0aa885c27e5eda950dda818949a 100644 (file)
@@ -284,9 +284,6 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
        return (unsigned char *)iptr - (unsigned char *)headerp;
 
 out:
-       if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
-               return n;
-
        for (pos = 0; nchunks--;)
                pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
                                                      &req->rl_segments[pos]);
@@ -732,8 +729,8 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
        struct rpcrdma_msg *headerp;
        struct rpcrdma_req *req;
        struct rpc_rqst *rqst;
-       struct rpc_xprt *xprt = rep->rr_xprt;
-       struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+       struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
+       struct rpc_xprt *xprt = &r_xprt->rx_xprt;
        __be32 *iptr;
        int rdmalen, status;
        unsigned long cwnd;
@@ -770,7 +767,6 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
                        rep->rr_len);
 repost:
                r_xprt->rx_stats.bad_reply_count++;
-               rep->rr_func = rpcrdma_reply_handler;
                if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
                        rpcrdma_recv_buffer_put(rep);
 
index 436da2caec955ded2b2f4f6a2056cb1191594373..680f888a9ddd045314b305ef772385c7c6d5624e 100644 (file)
@@ -240,6 +240,16 @@ xprt_rdma_connect_worker(struct work_struct *work)
        xprt_clear_connecting(xprt);
 }
 
+static void
+xprt_rdma_inject_disconnect(struct rpc_xprt *xprt)
+{
+       struct rpcrdma_xprt *r_xprt = container_of(xprt, struct rpcrdma_xprt,
+                                                  rx_xprt);
+
+       pr_info("rpcrdma: injecting transport disconnect on xprt=%p\n", xprt);
+       rdma_disconnect(r_xprt->rx_ia.ri_id);
+}
+
 /*
  * xprt_rdma_destroy
  *
@@ -612,12 +622,6 @@ xprt_rdma_send_request(struct rpc_task *task)
        if (req->rl_reply == NULL)              /* e.g. reconnection */
                rpcrdma_recv_buffer_get(req);
 
-       if (req->rl_reply) {
-               req->rl_reply->rr_func = rpcrdma_reply_handler;
-               /* this need only be done once, but... */
-               req->rl_reply->rr_xprt = xprt;
-       }
-
        /* Must suppress retransmit to maintain credits */
        if (req->rl_connect_cookie == xprt->connect_cookie)
                goto drop_connection;
@@ -676,6 +680,17 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
           r_xprt->rx_stats.bad_reply_count);
 }
 
+static int
+xprt_rdma_enable_swap(struct rpc_xprt *xprt)
+{
+       return -EINVAL;
+}
+
+static void
+xprt_rdma_disable_swap(struct rpc_xprt *xprt)
+{
+}
+
 /*
  * Plumbing for rpc transport switch and kernel module
  */
@@ -694,7 +709,10 @@ static struct rpc_xprt_ops xprt_rdma_procs = {
        .send_request           = xprt_rdma_send_request,
        .close                  = xprt_rdma_close,
        .destroy                = xprt_rdma_destroy,
-       .print_stats            = xprt_rdma_print_stats
+       .print_stats            = xprt_rdma_print_stats,
+       .enable_swap            = xprt_rdma_enable_swap,
+       .disable_swap           = xprt_rdma_disable_swap,
+       .inject_disconnect      = xprt_rdma_inject_disconnect
 };
 
 static struct xprt_class xprt_rdma = {
@@ -720,17 +738,24 @@ void xprt_rdma_cleanup(void)
        if (rc)
                dprintk("RPC:       %s: xprt_unregister returned %i\n",
                        __func__, rc);
+
+       frwr_destroy_recovery_wq();
 }
 
 int xprt_rdma_init(void)
 {
        int rc;
 
-       rc = xprt_register_transport(&xprt_rdma);
-
+       rc = frwr_alloc_recovery_wq();
        if (rc)
                return rc;
 
+       rc = xprt_register_transport(&xprt_rdma);
+       if (rc) {
+               frwr_destroy_recovery_wq();
+               return rc;
+       }
+
        dprintk("RPCRDMA Module Init, register RPC RDMA transport\n");
 
        dprintk("Defaults:\n");
index 52df265b472a9b2b79574c7d9363acba26ea5d8b..891c4ede2c20ea8d8c6bc79ee080f353d4df13d7 100644 (file)
@@ -80,7 +80,6 @@ static void
 rpcrdma_run_tasklet(unsigned long data)
 {
        struct rpcrdma_rep *rep;
-       void (*func)(struct rpcrdma_rep *);
        unsigned long flags;
 
        data = data;
@@ -89,14 +88,9 @@ rpcrdma_run_tasklet(unsigned long data)
                rep = list_entry(rpcrdma_tasklets_g.next,
                                 struct rpcrdma_rep, rr_list);
                list_del(&rep->rr_list);
-               func = rep->rr_func;
-               rep->rr_func = NULL;
                spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
 
-               if (func)
-                       func(rep);
-               else
-                       rpcrdma_recv_buffer_put(rep);
+               rpcrdma_reply_handler(rep);
 
                spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
        }
@@ -236,7 +230,7 @@ rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
                __func__, rep, wc->byte_len);
 
        rep->rr_len = wc->byte_len;
-       ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
+       ib_dma_sync_single_for_cpu(rep->rr_device,
                                   rdmab_addr(rep->rr_rdmabuf),
                                   rep->rr_len, DMA_FROM_DEVICE);
        prefetch(rdmab_to_msg(rep->rr_rdmabuf));
@@ -407,7 +401,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
 
                pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n",
                        sap, rpc_get_port(sap),
-                       ia->ri_id->device->name,
+                       ia->ri_device->name,
                        ia->ri_ops->ro_displayname,
                        xprt->rx_buf.rb_max_requests,
                        ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
@@ -508,8 +502,9 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
                rc = PTR_ERR(ia->ri_id);
                goto out1;
        }
+       ia->ri_device = ia->ri_id->device;
 
-       ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
+       ia->ri_pd = ib_alloc_pd(ia->ri_device);
        if (IS_ERR(ia->ri_pd)) {
                rc = PTR_ERR(ia->ri_pd);
                dprintk("RPC:       %s: ib_alloc_pd() failed %i\n",
@@ -517,7 +512,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
                goto out2;
        }
 
-       rc = ib_query_device(ia->ri_id->device, devattr);
+       rc = ib_query_device(ia->ri_device, devattr);
        if (rc) {
                dprintk("RPC:       %s: ib_query_device failed %d\n",
                        __func__, rc);
@@ -526,7 +521,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 
        if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
                ia->ri_have_dma_lkey = 1;
-               ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
+               ia->ri_dma_lkey = ia->ri_device->local_dma_lkey;
        }
 
        if (memreg == RPCRDMA_FRMR) {
@@ -541,7 +536,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
                }
        }
        if (memreg == RPCRDMA_MTHCAFMR) {
-               if (!ia->ri_id->device->alloc_fmr) {
+               if (!ia->ri_device->alloc_fmr) {
                        dprintk("RPC:       %s: MTHCAFMR registration "
                                "not supported by HCA\n", __func__);
                        memreg = RPCRDMA_ALLPHYSICAL;
@@ -590,9 +585,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
        dprintk("RPC:       %s: memory registration strategy is '%s'\n",
                __func__, ia->ri_ops->ro_displayname);
 
-       /* Else will do memory reg/dereg for each chunk */
-       ia->ri_memreg_strategy = memreg;
-
        rwlock_init(&ia->ri_qplock);
        return 0;
 
@@ -622,17 +614,17 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia)
                dprintk("RPC:       %s: ib_dereg_mr returned %i\n",
                        __func__, rc);
        }
+
        if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
                if (ia->ri_id->qp)
                        rdma_destroy_qp(ia->ri_id);
                rdma_destroy_id(ia->ri_id);
                ia->ri_id = NULL;
        }
-       if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
-               rc = ib_dealloc_pd(ia->ri_pd);
-               dprintk("RPC:       %s: ib_dealloc_pd returned %i\n",
-                       __func__, rc);
-       }
+
+       /* If the pd is still busy, xprtrdma missed freeing a resource */
+       if (ia->ri_pd && !IS_ERR(ia->ri_pd))
+               WARN_ON(ib_dealloc_pd(ia->ri_pd));
 }
 
 /*
@@ -693,8 +685,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
        INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
 
        cq_attr.cqe = ep->rep_attr.cap.max_send_wr + 1;
-       sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
-                                 rpcrdma_cq_async_error_upcall, ep, &cq_attr);
+       sendcq = ib_create_cq(ia->ri_device, rpcrdma_sendcq_upcall,
+                             rpcrdma_cq_async_error_upcall, ep, &cq_attr);
        if (IS_ERR(sendcq)) {
                rc = PTR_ERR(sendcq);
                dprintk("RPC:       %s: failed to create send CQ: %i\n",
@@ -710,8 +702,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
        }
 
        cq_attr.cqe = ep->rep_attr.cap.max_recv_wr + 1;
-       recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
-                                 rpcrdma_cq_async_error_upcall, ep, &cq_attr);
+       recvcq = ib_create_cq(ia->ri_device, rpcrdma_recvcq_upcall,
+                             rpcrdma_cq_async_error_upcall, ep, &cq_attr);
        if (IS_ERR(recvcq)) {
                rc = PTR_ERR(recvcq);
                dprintk("RPC:       %s: failed to create recv CQ: %i\n",
@@ -817,8 +809,6 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
                rpcrdma_flush_cqs(ep);
 
                xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
-               ia->ri_ops->ro_reset(xprt);
-
                id = rpcrdma_create_id(xprt, ia,
                                (struct sockaddr *)&xprt->rx_data.addr);
                if (IS_ERR(id)) {
@@ -832,7 +822,7 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
                 * More stuff I haven't thought of!
                 * Rrrgh!
                 */
-               if (ia->ri_id->device != id->device) {
+               if (ia->ri_device != id->device) {
                        printk("RPC:       %s: can't reconnect on "
                                "different device!\n", __func__);
                        rdma_destroy_id(id);
@@ -974,7 +964,8 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
                goto out_free;
        }
 
-       rep->rr_buffer = &r_xprt->rx_buf;
+       rep->rr_device = ia->ri_device;
+       rep->rr_rxprt = r_xprt;
        return rep;
 
 out_free:
@@ -1098,31 +1089,33 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
        kfree(buf->rb_pool);
 }
 
-/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
- * some req segments uninitialized.
- */
-static void
-rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf)
+struct rpcrdma_mw *
+rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt)
 {
-       if (*mw) {
-               list_add_tail(&(*mw)->mw_list, &buf->rb_mws);
-               *mw = NULL;
+       struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+       struct rpcrdma_mw *mw = NULL;
+
+       spin_lock(&buf->rb_mwlock);
+       if (!list_empty(&buf->rb_mws)) {
+               mw = list_first_entry(&buf->rb_mws,
+                                     struct rpcrdma_mw, mw_list);
+               list_del_init(&mw->mw_list);
        }
+       spin_unlock(&buf->rb_mwlock);
+
+       if (!mw)
+               pr_err("RPC:       %s: no MWs available\n", __func__);
+       return mw;
 }
 
-/* Cycle mw's back in reverse order, and "spin" them.
- * This delays and scrambles reuse as much as possible.
- */
-static void
-rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
+void
+rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
 {
-       struct rpcrdma_mr_seg *seg = req->rl_segments;
-       struct rpcrdma_mr_seg *seg1 = seg;
-       int i;
+       struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
 
-       for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++)
-               rpcrdma_buffer_put_mr(&seg->rl_mw, buf);
-       rpcrdma_buffer_put_mr(&seg1->rl_mw, buf);
+       spin_lock(&buf->rb_mwlock);
+       list_add_tail(&mw->mw_list, &buf->rb_mws);
+       spin_unlock(&buf->rb_mwlock);
 }
 
 static void
@@ -1132,115 +1125,10 @@ rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
        req->rl_niovs = 0;
        if (req->rl_reply) {
                buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
-               req->rl_reply->rr_func = NULL;
                req->rl_reply = NULL;
        }
 }
 
-/* rpcrdma_unmap_one() was already done during deregistration.
- * Redo only the ib_post_send().
- */
-static void
-rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia)
-{
-       struct rpcrdma_xprt *r_xprt =
-                               container_of(ia, struct rpcrdma_xprt, rx_ia);
-       struct ib_send_wr invalidate_wr, *bad_wr;
-       int rc;
-
-       dprintk("RPC:       %s: FRMR %p is stale\n", __func__, r);
-
-       /* When this FRMR is re-inserted into rb_mws, it is no longer stale */
-       r->r.frmr.fr_state = FRMR_IS_INVALID;
-
-       memset(&invalidate_wr, 0, sizeof(invalidate_wr));
-       invalidate_wr.wr_id = (unsigned long)(void *)r;
-       invalidate_wr.opcode = IB_WR_LOCAL_INV;
-       invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey;
-       DECR_CQCOUNT(&r_xprt->rx_ep);
-
-       dprintk("RPC:       %s: frmr %p invalidating rkey %08x\n",
-               __func__, r, r->r.frmr.fr_mr->rkey);
-
-       read_lock(&ia->ri_qplock);
-       rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
-       read_unlock(&ia->ri_qplock);
-       if (rc) {
-               /* Force rpcrdma_buffer_get() to retry */
-               r->r.frmr.fr_state = FRMR_IS_STALE;
-               dprintk("RPC:       %s: ib_post_send failed, %i\n",
-                       __func__, rc);
-       }
-}
-
-static void
-rpcrdma_retry_flushed_linv(struct list_head *stale,
-                          struct rpcrdma_buffer *buf)
-{
-       struct rpcrdma_ia *ia = rdmab_to_ia(buf);
-       struct list_head *pos;
-       struct rpcrdma_mw *r;
-       unsigned long flags;
-
-       list_for_each(pos, stale) {
-               r = list_entry(pos, struct rpcrdma_mw, mw_list);
-               rpcrdma_retry_local_inv(r, ia);
-       }
-
-       spin_lock_irqsave(&buf->rb_lock, flags);
-       list_splice_tail(stale, &buf->rb_mws);
-       spin_unlock_irqrestore(&buf->rb_lock, flags);
-}
-
-static struct rpcrdma_req *
-rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
-                        struct list_head *stale)
-{
-       struct rpcrdma_mw *r;
-       int i;
-
-       i = RPCRDMA_MAX_SEGS - 1;
-       while (!list_empty(&buf->rb_mws)) {
-               r = list_entry(buf->rb_mws.next,
-                              struct rpcrdma_mw, mw_list);
-               list_del(&r->mw_list);
-               if (r->r.frmr.fr_state == FRMR_IS_STALE) {
-                       list_add(&r->mw_list, stale);
-                       continue;
-               }
-               req->rl_segments[i].rl_mw = r;
-               if (unlikely(i-- == 0))
-                       return req;     /* Success */
-       }
-
-       /* Not enough entries on rb_mws for this req */
-       rpcrdma_buffer_put_sendbuf(req, buf);
-       rpcrdma_buffer_put_mrs(req, buf);
-       return NULL;
-}
-
-static struct rpcrdma_req *
-rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
-{
-       struct rpcrdma_mw *r;
-       int i;
-
-       i = RPCRDMA_MAX_SEGS - 1;
-       while (!list_empty(&buf->rb_mws)) {
-               r = list_entry(buf->rb_mws.next,
-                              struct rpcrdma_mw, mw_list);
-               list_del(&r->mw_list);
-               req->rl_segments[i].rl_mw = r;
-               if (unlikely(i-- == 0))
-                       return req;     /* Success */
-       }
-
-       /* Not enough entries on rb_mws for this req */
-       rpcrdma_buffer_put_sendbuf(req, buf);
-       rpcrdma_buffer_put_mrs(req, buf);
-       return NULL;
-}
-
 /*
  * Get a set of request/reply buffers.
  *
@@ -1253,12 +1141,11 @@ rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
 struct rpcrdma_req *
 rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
 {
-       struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
-       struct list_head stale;
        struct rpcrdma_req *req;
        unsigned long flags;
 
        spin_lock_irqsave(&buffers->rb_lock, flags);
+
        if (buffers->rb_send_index == buffers->rb_max_requests) {
                spin_unlock_irqrestore(&buffers->rb_lock, flags);
                dprintk("RPC:       %s: out of request buffers\n", __func__);
@@ -1277,20 +1164,7 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
        }
        buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
 
-       INIT_LIST_HEAD(&stale);
-       switch (ia->ri_memreg_strategy) {
-       case RPCRDMA_FRMR:
-               req = rpcrdma_buffer_get_frmrs(req, buffers, &stale);
-               break;
-       case RPCRDMA_MTHCAFMR:
-               req = rpcrdma_buffer_get_fmrs(req, buffers);
-               break;
-       default:
-               break;
-       }
        spin_unlock_irqrestore(&buffers->rb_lock, flags);
-       if (!list_empty(&stale))
-               rpcrdma_retry_flushed_linv(&stale, buffers);
        return req;
 }
 
@@ -1302,19 +1176,10 @@ void
 rpcrdma_buffer_put(struct rpcrdma_req *req)
 {
        struct rpcrdma_buffer *buffers = req->rl_buffer;
-       struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
        unsigned long flags;
 
        spin_lock_irqsave(&buffers->rb_lock, flags);
        rpcrdma_buffer_put_sendbuf(req, buffers);
-       switch (ia->ri_memreg_strategy) {
-       case RPCRDMA_FRMR:
-       case RPCRDMA_MTHCAFMR:
-               rpcrdma_buffer_put_mrs(req, buffers);
-               break;
-       default:
-               break;
-       }
        spin_unlock_irqrestore(&buffers->rb_lock, flags);
 }
 
@@ -1344,10 +1209,9 @@ rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
 void
 rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
 {
-       struct rpcrdma_buffer *buffers = rep->rr_buffer;
+       struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf;
        unsigned long flags;
 
-       rep->rr_func = NULL;
        spin_lock_irqsave(&buffers->rb_lock, flags);
        buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
        spin_unlock_irqrestore(&buffers->rb_lock, flags);
@@ -1376,9 +1240,9 @@ rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
        /*
         * All memory passed here was kmalloc'ed, therefore phys-contiguous.
         */
-       iov->addr = ib_dma_map_single(ia->ri_id->device,
+       iov->addr = ib_dma_map_single(ia->ri_device,
                        va, len, DMA_BIDIRECTIONAL);
-       if (ib_dma_mapping_error(ia->ri_id->device, iov->addr))
+       if (ib_dma_mapping_error(ia->ri_device, iov->addr))
                return -ENOMEM;
 
        iov->length = len;
@@ -1422,8 +1286,8 @@ rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
 {
        int rc;
 
-       ib_dma_unmap_single(ia->ri_id->device,
-                       iov->addr, iov->length, DMA_BIDIRECTIONAL);
+       ib_dma_unmap_single(ia->ri_device,
+                           iov->addr, iov->length, DMA_BIDIRECTIONAL);
 
        if (NULL == mr)
                return 0;
@@ -1516,15 +1380,18 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
        send_wr.num_sge = req->rl_niovs;
        send_wr.opcode = IB_WR_SEND;
        if (send_wr.num_sge == 4)       /* no need to sync any pad (constant) */
-               ib_dma_sync_single_for_device(ia->ri_id->device,
-                       req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
-                       DMA_TO_DEVICE);
-       ib_dma_sync_single_for_device(ia->ri_id->device,
-               req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
-               DMA_TO_DEVICE);
-       ib_dma_sync_single_for_device(ia->ri_id->device,
-               req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
-               DMA_TO_DEVICE);
+               ib_dma_sync_single_for_device(ia->ri_device,
+                                             req->rl_send_iov[3].addr,
+                                             req->rl_send_iov[3].length,
+                                             DMA_TO_DEVICE);
+       ib_dma_sync_single_for_device(ia->ri_device,
+                                     req->rl_send_iov[1].addr,
+                                     req->rl_send_iov[1].length,
+                                     DMA_TO_DEVICE);
+       ib_dma_sync_single_for_device(ia->ri_device,
+                                     req->rl_send_iov[0].addr,
+                                     req->rl_send_iov[0].length,
+                                     DMA_TO_DEVICE);
 
        if (DECR_CQCOUNT(ep) > 0)
                send_wr.send_flags = 0;
@@ -1557,7 +1424,7 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
        recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
        recv_wr.num_sge = 1;
 
-       ib_dma_sync_single_for_cpu(ia->ri_id->device,
+       ib_dma_sync_single_for_cpu(ia->ri_device,
                                   rdmab_addr(rep->rr_rdmabuf),
                                   rdmab_length(rep->rr_rdmabuf),
                                   DMA_BIDIRECTIONAL);
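
Among the verbs.c changes above, rpcrdma_get_mw() and rpcrdma_put_mw() replace the per-request MR shuffling with a plain free list of memory windows guarded by rb_mwlock. A minimal userspace sketch of that get/put pattern, with a pthread mutex standing in for the spinlock, invented names, and a LIFO stack instead of the kernel's FIFO list_add_tail() ordering:

/* Editorial sketch, not kernel code: lock-protected free-list get/put. */
#include <pthread.h>
#include <stdio.h>

struct mw {
	struct mw *next;
	int id;
};

static struct mw *free_list;
static pthread_mutex_t free_lock = PTHREAD_MUTEX_INITIALIZER;

static struct mw *get_mw(void)
{
	struct mw *mw = NULL;

	pthread_mutex_lock(&free_lock);
	if (free_list) {
		mw = free_list;          /* pop the first free entry */
		free_list = mw->next;
	}
	pthread_mutex_unlock(&free_lock);

	if (!mw)
		fprintf(stderr, "no MWs available\n");
	return mw;
}

static void put_mw(struct mw *mw)
{
	pthread_mutex_lock(&free_lock);
	mw->next = free_list;            /* push it back onto the free list */
	free_list = mw;
	pthread_mutex_unlock(&free_lock);
}

int main(void)
{
	struct mw a = { NULL, 1 };

	put_mw(&a);
	struct mw *got = get_mw();
	printf("got mw %d\n", got ? got->id : -1);
	return 0;
}
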
index 58163b88738c2363c559be28d3f3807043065826..f49dd8b381221dceaef4847e4ae28d397ebcdf27 100644 (file)
@@ -62,6 +62,7 @@
 struct rpcrdma_ia {
        const struct rpcrdma_memreg_ops *ri_ops;
        rwlock_t                ri_qplock;
+       struct ib_device        *ri_device;
        struct rdma_cm_id       *ri_id;
        struct ib_pd            *ri_pd;
        struct ib_mr            *ri_bind_mem;
@@ -69,7 +70,6 @@ struct rpcrdma_ia {
        int                     ri_have_dma_lkey;
        struct completion       ri_done;
        int                     ri_async_rc;
-       enum rpcrdma_memreg     ri_memreg_strategy;
        unsigned int            ri_max_frmr_depth;
        struct ib_device_attr   ri_devattr;
        struct ib_qp_attr       ri_qp_attr;
@@ -173,9 +173,8 @@ struct rpcrdma_buffer;
 
 struct rpcrdma_rep {
        unsigned int            rr_len;
-       struct rpcrdma_buffer   *rr_buffer;
-       struct rpc_xprt         *rr_xprt;
-       void                    (*rr_func)(struct rpcrdma_rep *);
+       struct ib_device        *rr_device;
+       struct rpcrdma_xprt     *rr_rxprt;
        struct list_head        rr_list;
        struct rpcrdma_regbuf   *rr_rdmabuf;
 };
@@ -203,11 +202,18 @@ struct rpcrdma_frmr {
        struct ib_fast_reg_page_list    *fr_pgl;
        struct ib_mr                    *fr_mr;
        enum rpcrdma_frmr_state         fr_state;
+       struct work_struct              fr_work;
+       struct rpcrdma_xprt             *fr_xprt;
+};
+
+struct rpcrdma_fmr {
+       struct ib_fmr           *fmr;
+       u64                     *physaddrs;
 };
 
 struct rpcrdma_mw {
        union {
-               struct ib_fmr           *fmr;
+               struct rpcrdma_fmr      fmr;
                struct rpcrdma_frmr     frmr;
        } r;
        void                    (*mw_sendcompletion)(struct ib_wc *);
@@ -281,15 +287,17 @@ rpcr_to_rdmar(struct rpc_rqst *rqst)
  * One of these is associated with a transport instance
  */
 struct rpcrdma_buffer {
-       spinlock_t      rb_lock;        /* protects indexes */
-       u32             rb_max_requests;/* client max requests */
-       struct list_head rb_mws;        /* optional memory windows/fmrs/frmrs */
-       struct list_head rb_all;
-       int             rb_send_index;
+       spinlock_t              rb_mwlock;      /* protect rb_mws list */
+       struct list_head        rb_mws;
+       struct list_head        rb_all;
+       char                    *rb_pool;
+
+       spinlock_t              rb_lock;        /* protect buf arrays */
+       u32                     rb_max_requests;
+       int                     rb_send_index;
+       int                     rb_recv_index;
        struct rpcrdma_req      **rb_send_bufs;
-       int             rb_recv_index;
        struct rpcrdma_rep      **rb_recv_bufs;
-       char            *rb_pool;
 };
 #define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)
 
@@ -350,7 +358,6 @@ struct rpcrdma_memreg_ops {
                                   struct rpcrdma_create_data_internal *);
        size_t          (*ro_maxpages)(struct rpcrdma_xprt *);
        int             (*ro_init)(struct rpcrdma_xprt *);
-       void            (*ro_reset)(struct rpcrdma_xprt *);
        void            (*ro_destroy)(struct rpcrdma_buffer *);
        const char      *ro_displayname;
 };
@@ -413,6 +420,8 @@ int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *,
 int rpcrdma_buffer_create(struct rpcrdma_xprt *);
 void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
 
+struct rpcrdma_mw *rpcrdma_get_mw(struct rpcrdma_xprt *);
+void rpcrdma_put_mw(struct rpcrdma_xprt *, struct rpcrdma_mw *);
 struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
 void rpcrdma_buffer_put(struct rpcrdma_req *);
 void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
@@ -425,6 +434,9 @@ void rpcrdma_free_regbuf(struct rpcrdma_ia *,
 
 unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *);
 
+int frwr_alloc_recovery_wq(void);
+void frwr_destroy_recovery_wq(void);
+
 /*
  * Wrappers for chunk registration, shared by read/write chunk code.
  */
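
The header above splits rpcrdma_buffer locking in two: rb_mwlock now guards only the MW free list, while rb_lock keeps guarding the send/receive buffer arrays. A small compile-and-run sketch of that one-lock-per-field-group layout, using invented userspace names rather than the kernel structures:

/* Editorial sketch, not kernel code: two locks, each owning its own fields. */
#include <pthread.h>

struct mw_entry {
	struct mw_entry *next;
};

struct buffer_pool {
	pthread_mutex_t mw_lock;         /* guards only the free-MW list */
	struct mw_entry *mws;

	pthread_mutex_t buf_lock;        /* guards only the ring indexes */
	int send_index;
	int recv_index;
};

int main(void)
{
	struct buffer_pool pool = {
		.mw_lock  = PTHREAD_MUTEX_INITIALIZER,
		.buf_lock = PTHREAD_MUTEX_INITIALIZER,
	};

	pthread_mutex_lock(&pool.mw_lock);   /* MW traffic ... */
	pthread_mutex_unlock(&pool.mw_lock);

	pthread_mutex_lock(&pool.buf_lock);  /* ... never contends with buffer traffic */
	pool.send_index++;
	pthread_mutex_unlock(&pool.buf_lock);
	return 0;
}
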
index b0517287075b2753101bbde70262c99c2a58f35b..e193c2b5476b3a83973e9799e2e826fdcd2b842c 100644 (file)
@@ -622,24 +622,6 @@ static int xs_udp_send_request(struct rpc_task *task)
        return status;
 }
 
-/**
- * xs_tcp_shutdown - gracefully shut down a TCP socket
- * @xprt: transport
- *
- * Initiates a graceful shutdown of the TCP socket by calling the
- * equivalent of shutdown(SHUT_RDWR);
- */
-static void xs_tcp_shutdown(struct rpc_xprt *xprt)
-{
-       struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
-       struct socket *sock = transport->sock;
-
-       if (sock != NULL) {
-               kernel_sock_shutdown(sock, SHUT_RDWR);
-               trace_rpc_socket_shutdown(xprt, sock);
-       }
-}
-
 /**
  * xs_tcp_send_request - write an RPC request to a TCP socket
  * @task: address of RPC task that manages the state of an RPC request
@@ -786,6 +768,7 @@ static void xs_sock_mark_closed(struct rpc_xprt *xprt)
        xs_sock_reset_connection_flags(xprt);
        /* Mark transport as closed and wake up all pending tasks */
        xprt_disconnect_done(xprt);
+       xprt_force_disconnect(xprt);
 }
 
 /**
@@ -827,6 +810,9 @@ static void xs_reset_transport(struct sock_xprt *transport)
        if (sk == NULL)
                return;
 
+       if (atomic_read(&transport->xprt.swapper))
+               sk_clear_memalloc(sk);
+
        write_lock_bh(&sk->sk_callback_lock);
        transport->inet = NULL;
        transport->sock = NULL;
@@ -863,6 +849,13 @@ static void xs_close(struct rpc_xprt *xprt)
        xprt_disconnect_done(xprt);
 }
 
+static void xs_inject_disconnect(struct rpc_xprt *xprt)
+{
+       dprintk("RPC:       injecting transport disconnect on xprt=%p\n",
+               xprt);
+       xprt_disconnect_done(xprt);
+}
+
 static void xs_xprt_free(struct rpc_xprt *xprt)
 {
        xs_free_peer_addresses(xprt);
@@ -901,7 +894,6 @@ static int xs_local_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
 /**
  * xs_local_data_ready - "data ready" callback for AF_LOCAL sockets
  * @sk: socket with data to read
- * @len: how much data to read
  *
  * Currently this assumes we can read the whole reply in a single gulp.
  */
@@ -965,7 +957,6 @@ static void xs_local_data_ready(struct sock *sk)
 /**
  * xs_udp_data_ready - "data ready" callback for UDP sockets
  * @sk: socket with data to read
- * @len: how much data to read
  *
  */
 static void xs_udp_data_ready(struct sock *sk)
@@ -1389,7 +1380,6 @@ static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, uns
 /**
  * xs_tcp_data_ready - "data ready" callback for TCP sockets
  * @sk: socket with data to read
- * @bytes: how much data to read
  *
  */
 static void xs_tcp_data_ready(struct sock *sk)
@@ -1886,9 +1876,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt,
 
 /**
  * xs_local_setup_socket - create AF_LOCAL socket, connect to a local endpoint
- * @xprt: RPC transport to connect
  * @transport: socket transport to connect
- * @create_sock: function to create a socket of the correct type
  */
 static int xs_local_setup_socket(struct sock_xprt *transport)
 {
@@ -1960,43 +1948,84 @@ static void xs_local_connect(struct rpc_xprt *xprt, struct rpc_task *task)
                msleep_interruptible(15000);
 }
 
-#ifdef CONFIG_SUNRPC_SWAP
+#if IS_ENABLED(CONFIG_SUNRPC_SWAP)
+/*
+ * Note that this should be called with XPRT_LOCKED held (or when we otherwise
+ * know that we have exclusive access to the socket), to guard against
+ * races with xs_reset_transport.
+ */
 static void xs_set_memalloc(struct rpc_xprt *xprt)
 {
        struct sock_xprt *transport = container_of(xprt, struct sock_xprt,
                        xprt);
 
-       if (xprt->swapper)
+       /*
+        * If there's no sock, then we have nothing to set. The
+        * reconnecting process will get it for us.
+        */
+       if (!transport->inet)
+               return;
+       if (atomic_read(&xprt->swapper))
                sk_set_memalloc(transport->inet);
 }
 
 /**
- * xs_swapper - Tag this transport as being used for swap.
+ * xs_enable_swap - Tag this transport as being used for swap.
  * @xprt: transport to tag
- * @enable: enable/disable
  *
+ * Take a reference to this transport on behalf of the rpc_clnt, and
+ * optionally mark it for swapping if it wasn't already.
  */
-int xs_swapper(struct rpc_xprt *xprt, int enable)
+static int
+xs_enable_swap(struct rpc_xprt *xprt)
 {
-       struct sock_xprt *transport = container_of(xprt, struct sock_xprt,
-                       xprt);
-       int err = 0;
+       struct sock_xprt *xs = container_of(xprt, struct sock_xprt, xprt);
 
-       if (enable) {
-               xprt->swapper++;
-               xs_set_memalloc(xprt);
-       } else if (xprt->swapper) {
-               xprt->swapper--;
-               sk_clear_memalloc(transport->inet);
-       }
+       if (atomic_inc_return(&xprt->swapper) != 1)
+               return 0;
+       if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE))
+               return -ERESTARTSYS;
+       if (xs->inet)
+               sk_set_memalloc(xs->inet);
+       xprt_release_xprt(xprt, NULL);
+       return 0;
+}
 
-       return err;
+/**
+ * xs_disable_swap - Untag this transport as being used for swap.
+ * @xprt: transport to tag
+ *
+ * Drop a "swapper" reference to this xprt on behalf of the rpc_clnt. If the
+ * swapper refcount goes to 0, untag the socket as a memalloc socket.
+ */
+static void
+xs_disable_swap(struct rpc_xprt *xprt)
+{
+       struct sock_xprt *xs = container_of(xprt, struct sock_xprt, xprt);
+
+       if (!atomic_dec_and_test(&xprt->swapper))
+               return;
+       if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE))
+               return;
+       if (xs->inet)
+               sk_clear_memalloc(xs->inet);
+       xprt_release_xprt(xprt, NULL);
 }
-EXPORT_SYMBOL_GPL(xs_swapper);
 #else
 static void xs_set_memalloc(struct rpc_xprt *xprt)
 {
 }
+
+static int
+xs_enable_swap(struct rpc_xprt *xprt)
+{
+       return -EINVAL;
+}
+
+static void
+xs_disable_swap(struct rpc_xprt *xprt)
+{
+}
 #endif
 
 static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
@@ -2057,6 +2086,27 @@ static void xs_udp_setup_socket(struct work_struct *work)
        xprt_wake_pending_tasks(xprt, status);
 }
 
+/**
+ * xs_tcp_shutdown - gracefully shut down a TCP socket
+ * @xprt: transport
+ *
+ * Initiates a graceful shutdown of the TCP socket by calling the
+ * equivalent of shutdown(SHUT_RDWR);
+ */
+static void xs_tcp_shutdown(struct rpc_xprt *xprt)
+{
+       struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+       struct socket *sock = transport->sock;
+
+       if (sock == NULL)
+               return;
+       if (xprt_connected(xprt)) {
+               kernel_sock_shutdown(sock, SHUT_RDWR);
+               trace_rpc_socket_shutdown(xprt, sock);
+       } else
+               xs_reset_transport(transport);
+}
+
 static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 {
        struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
@@ -2067,6 +2117,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
                unsigned int keepidle = xprt->timeout->to_initval / HZ;
                unsigned int keepcnt = xprt->timeout->to_retries + 1;
                unsigned int opt_on = 1;
+               unsigned int timeo;
 
                /* TCP Keepalive options */
                kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
@@ -2078,6 +2129,12 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
                kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT,
                                (char *)&keepcnt, sizeof(keepcnt));
 
+               /* TCP user timeout (see RFC5482) */
+               timeo = jiffies_to_msecs(xprt->timeout->to_initval) *
+                       (xprt->timeout->to_retries + 1);
+               kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT,
+                               (char *)&timeo, sizeof(timeo));
+
                write_lock_bh(&sk->sk_callback_lock);
 
                xs_save_old_callbacks(transport, sk);
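
The hunk above arms TCP_USER_TIMEOUT so a stalled connection is aborted once transmitted data has stayed unacknowledged for the transport's whole retry budget. TCP_USER_TIMEOUT is an ordinary Linux socket option, so the same idea can be shown on a plain userspace socket; the 30-second/5-retry figures below are made up for illustration:

/* Editorial sketch, not kernel code: setting TCP_USER_TIMEOUT on a socket. */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	/* Abort the connection if unacked data is outstanding this long (ms). */
	unsigned int timeout_ms = 30000 * (5 + 1);
	if (setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT,
		       &timeout_ms, sizeof(timeout_ms)))
		perror("setsockopt(TCP_USER_TIMEOUT)");

	close(fd);
	return 0;
}
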
@@ -2125,9 +2182,6 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 
 /**
  * xs_tcp_setup_socket - create a TCP socket and connect to a remote endpoint
- * @xprt: RPC transport to connect
- * @transport: socket transport to connect
- * @create_sock: function to create a socket of the correct type
  *
  * Invoked by a work queue tasklet.
  */
@@ -2463,6 +2517,8 @@ static struct rpc_xprt_ops xs_local_ops = {
        .close                  = xs_close,
        .destroy                = xs_destroy,
        .print_stats            = xs_local_print_stats,
+       .enable_swap            = xs_enable_swap,
+       .disable_swap           = xs_disable_swap,
 };
 
 static struct rpc_xprt_ops xs_udp_ops = {
@@ -2482,6 +2538,9 @@ static struct rpc_xprt_ops xs_udp_ops = {
        .close                  = xs_close,
        .destroy                = xs_destroy,
        .print_stats            = xs_udp_print_stats,
+       .enable_swap            = xs_enable_swap,
+       .disable_swap           = xs_disable_swap,
+       .inject_disconnect      = xs_inject_disconnect,
 };
 
 static struct rpc_xprt_ops xs_tcp_ops = {
@@ -2498,6 +2557,9 @@ static struct rpc_xprt_ops xs_tcp_ops = {
        .close                  = xs_tcp_shutdown,
        .destroy                = xs_destroy,
        .print_stats            = xs_tcp_print_stats,
+       .enable_swap            = xs_enable_swap,
+       .disable_swap           = xs_disable_swap,
+       .inject_disconnect      = xs_inject_disconnect,
 };
 
 /*
@@ -2515,6 +2577,9 @@ static struct rpc_xprt_ops bc_tcp_ops = {
        .close                  = bc_close,
        .destroy                = bc_destroy,
        .print_stats            = xs_tcp_print_stats,
+       .enable_swap            = xs_enable_swap,
+       .disable_swap           = xs_disable_swap,
+       .inject_disconnect      = xs_inject_disconnect,
 };
 
 static int xs_init_anyaddr(const int family, struct sockaddr *sap)
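
Elsewhere in the xprtsock.c changes above, xs_enable_swap() and xs_disable_swap() turn the old xs_swapper() flag into a reference count: only the first enable marks the socket MEMALLOC and only the last disable clears it (the real code also serializes on XPRT_LOCKED, which this sketch omits). A hedged userspace analogue using a C11 atomic counter and hypothetical mark/unmark hooks:

/* Editorial sketch, not kernel code: act on the first reference, undo on the last. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int swapper;

static void mark_memalloc(void)   { puts("mark socket MEMALLOC"); }
static void unmark_memalloc(void) { puts("clear socket MEMALLOC"); }

static int enable_swap(void)
{
	/* Only the 0 -> 1 transition does the real work. */
	if (atomic_fetch_add(&swapper, 1) == 0)
		mark_memalloc();
	return 0;
}

static void disable_swap(void)
{
	/* Only the 1 -> 0 transition undoes it. */
	if (atomic_fetch_sub(&swapper, 1) == 1)
		unmark_memalloc();
}

int main(void)
{
	enable_swap();
	enable_swap();    /* second user just bumps the count */
	disable_swap();
	disable_swap();   /* last user drops the MEMALLOC marking */
	return 0;
}
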
index cdb491d845035e59bff19a5cde7f1ca84c28c17f..c0a932dff3290c4675688f5ce865c1f933a88b0d 100755 (executable)
@@ -154,7 +154,7 @@ exuberant()
 {
        all_target_sources | xargs $1 -a                        \
        -I __initdata,__exitdata,__initconst,                   \
-       -I __cpuinitdata,__initdata_memblock                    \
+       -I __initdata_memblock                                  \
        -I __refdata,__attribute,__maybe_unused,__always_unused \
        -I __acquires,__releases,__deprecated                   \
        -I __read_mostly,__aligned,____cacheline_aligned        \