asedeno.scripts.mit.edu Git - linux.git/blobdiff - drivers/gpu/drm/i915/intel_pm.c
drm/i915: Fix the g4x watermark TLB miss workaround
[linux.git] / drivers / gpu / drm / i915 / intel_pm.c
index 249623d45be0caa3e891e8a272706dff84dbc4be..09d4676419fba473877fae7df2bbbf141604aa5c 100644 (file)
@@ -65,12 +65,12 @@ static void gen9_init_clock_gating(struct drm_i915_private *dev_priv)
        I915_WRITE(GEN8_CONFIG0,
                   I915_READ(GEN8_CONFIG0) | GEN9_DEFAULT_FIXES);
 
-       /* WaEnableChickenDCPR:skl,bxt,kbl */
+       /* WaEnableChickenDCPR:skl,bxt,kbl,glk */
        I915_WRITE(GEN8_CHICKEN_DCPR_1,
                   I915_READ(GEN8_CHICKEN_DCPR_1) | MASK_WAKEMEM);
 
        /* WaFbcTurnOffFbcWatermark:skl,bxt,kbl */
-       /* WaFbcWakeMemOn:skl,bxt,kbl */
+       /* WaFbcWakeMemOn:skl,bxt,kbl,glk */
        I915_WRITE(DISP_ARB_CTL, I915_READ(DISP_ARB_CTL) |
                   DISP_FBC_WM_DIS |
                   DISP_FBC_MEMORY_WAKE);
@@ -99,9 +99,31 @@ static void bxt_init_clock_gating(struct drm_i915_private *dev_priv)
         * Wa: Backlight PWM may stop in the asserted state, causing backlight
         * to stay fully on.
         */
-       if (IS_BXT_REVID(dev_priv, BXT_REVID_B0, REVID_FOREVER))
-               I915_WRITE(GEN9_CLKGATE_DIS_0, I915_READ(GEN9_CLKGATE_DIS_0) |
-                          PWM1_GATING_DIS | PWM2_GATING_DIS);
+       I915_WRITE(GEN9_CLKGATE_DIS_0, I915_READ(GEN9_CLKGATE_DIS_0) |
+                  PWM1_GATING_DIS | PWM2_GATING_DIS);
+}
+
+static void glk_init_clock_gating(struct drm_i915_private *dev_priv)
+{
+       gen9_init_clock_gating(dev_priv);
+
+       /*
+        * WaDisablePWMClockGating:glk
+        * Backlight PWM may stop in the asserted state, causing backlight
+        * to stay fully on.
+        */
+       I915_WRITE(GEN9_CLKGATE_DIS_0, I915_READ(GEN9_CLKGATE_DIS_0) |
+                  PWM1_GATING_DIS | PWM2_GATING_DIS);
+
+       /* WaDDIIOTimeout:glk */
+       if (IS_GLK_REVID(dev_priv, 0, GLK_REVID_A1)) {
+               u32 val = I915_READ(CHICKEN_MISC_2);
+               val &= ~(GLK_CL0_PWR_DOWN |
+                        GLK_CL1_PWR_DOWN |
+                        GLK_CL2_PWR_DOWN);
+               I915_WRITE(CHICKEN_MISC_2, val);
+       }
+
 }
 
 static void i915_pineview_get_mem_freq(struct drm_i915_private *dev_priv)
@@ -355,6 +377,8 @@ static bool _intel_set_memory_cxsr(struct drm_i915_private *dev_priv, bool enabl
                return false;
        }
 
+       trace_intel_memory_cxsr(dev_priv, was_enabled, enable);
+
        DRM_DEBUG_KMS("memory self-refresh is %s (was %s)\n",
                      enableddisabled(enable),
                      enableddisabled(was_enabled));
@@ -362,6 +386,43 @@ static bool _intel_set_memory_cxsr(struct drm_i915_private *dev_priv, bool enabl
        return was_enabled;
 }
 
+/**
+ * intel_set_memory_cxsr - Configure CxSR state
+ * @dev_priv: i915 device
+ * @enable: Allow vs. disallow CxSR
+ *
+ * Allow or disallow the system to enter a special CxSR
+ * (C-state self refresh) state. What typically happens in CxSR mode
+ * is that several display FIFOs may get combined into a single larger
+ * FIFO for a particular plane (so called max FIFO mode) to allow the
+ * system to defer memory fetches longer, and the memory will enter
+ * self refresh.
+ *
+ * Note that enabling CxSR does not guarantee that the system enters
+ * this special mode, nor does it guarantee that the system stays
+ * in that mode once entered. So this just allows/disallows the system
+ * to autonomously utilize the CxSR mode. Other factors such as core
+ * C-states will affect when/if the system actually enters/exits the
+ * CxSR mode.
+ *
+ * Note that on VLV/CHV this actually only controls the max FIFO mode,
+ * and the system is free to enter/exit memory self refresh at any time
+ * even when the use of CxSR has been disallowed.
+ *
+ * While the system is actually in the CxSR/max FIFO mode, some plane
+ * control registers will not get latched on vblank. Thus in order to
+ * guarantee the system will respond to changes in the plane registers
+ * we must always disallow CxSR prior to making changes to those registers.
+ * Unfortunately the system will re-evaluate the CxSR conditions at
+ * frame start which happens after vblank start (which is when the plane
+ * registers would get latched), so we can't proceed with the plane update
+ * during the same frame where we disallowed CxSR.
+ *
+ * Certain platforms also have a deeper HPLL SR mode. Fortunately the
+ * HPLL SR mode depends on CxSR itself, so we don't have to hand hold
+ * the hardware w.r.t. HPLL SR when writing to plane registers.
+ * Disallowing just CxSR is sufficient.
+ */
 bool intel_set_memory_cxsr(struct drm_i915_private *dev_priv, bool enable)
 {
        bool ret;
@@ -393,15 +454,15 @@ static const int pessimal_latency_ns = 5000;
 #define VLV_FIFO_START(dsparb, dsparb2, lo_shift, hi_shift) \
        ((((dsparb) >> (lo_shift)) & 0xff) | ((((dsparb2) >> (hi_shift)) & 0x1) << 8))
 
-static int vlv_get_fifo_size(struct intel_plane *plane)
+static void vlv_get_fifo_size(struct intel_crtc_state *crtc_state)
 {
-       struct drm_i915_private *dev_priv = to_i915(plane->base.dev);
-       int sprite0_start, sprite1_start, size;
-
-       if (plane->id == PLANE_CURSOR)
-               return 63;
+       struct intel_crtc *crtc = to_intel_crtc(crtc_state->base.crtc);
+       struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
+       struct vlv_fifo_state *fifo_state = &crtc_state->wm.vlv.fifo_state;
+       enum pipe pipe = crtc->pipe;
+       int sprite0_start, sprite1_start;
 
-       switch (plane->pipe) {
+       switch (pipe) {
                uint32_t dsparb, dsparb2, dsparb3;
        case PIPE_A:
                dsparb = I915_READ(DSPARB);
@@ -422,26 +483,14 @@ static int vlv_get_fifo_size(struct intel_plane *plane)
                sprite1_start = VLV_FIFO_START(dsparb3, dsparb2, 8, 20);
                break;
        default:
-               return 0;
-       }
-
-       switch (plane->id) {
-       case PLANE_PRIMARY:
-               size = sprite0_start;
-               break;
-       case PLANE_SPRITE0:
-               size = sprite1_start - sprite0_start;
-               break;
-       case PLANE_SPRITE1:
-               size = 512 - 1 - sprite1_start;
-               break;
-       default:
-               return 0;
+               MISSING_CASE(pipe);
+               return;
        }
 
-       DRM_DEBUG_KMS("%s FIFO size: %d\n", plane->base.name, size);
-
-       return size;
+       fifo_state->plane[PLANE_PRIMARY] = sprite0_start;
+       fifo_state->plane[PLANE_SPRITE0] = sprite1_start - sprite0_start;
+       fifo_state->plane[PLANE_SPRITE1] = 511 - sprite1_start;
+       fifo_state->plane[PLANE_CURSOR] = 63;
 }
 
 static int i9xx_get_fifo_size(struct drm_i915_private *dev_priv, int plane)
@@ -636,6 +685,34 @@ static unsigned long intel_calculate_wm(unsigned long clock_in_khz,
        return wm_size;
 }
 
+static int intel_wm_num_levels(struct drm_i915_private *dev_priv)
+{
+       return dev_priv->wm.max_level + 1;
+}
+
+static bool intel_wm_plane_visible(const struct intel_crtc_state *crtc_state,
+                                  const struct intel_plane_state *plane_state)
+{
+       struct intel_plane *plane = to_intel_plane(plane_state->base.plane);
+
+       /* FIXME check the 'enable' instead */
+       if (!crtc_state->base.active)
+               return false;
+
+       /*
+        * Treat cursor with fb as always visible since cursor updates
+        * can happen faster than the vrefresh rate, and the current
+        * watermark code doesn't handle that correctly. Cursor updates
+        * which set/clear the fb or change the cursor size are going
+        * to get throttled by intel_legacy_cursor_update() to work
+        * around this problem with the watermark code.
+        */
+       if (plane->id == PLANE_CURSOR)
+               return plane_state->base.fb != NULL;
+       else
+               return plane_state->base.visible;
+}
+
 static struct intel_crtc *single_enabled_crtc(struct drm_i915_private *dev_priv)
 {
        struct intel_crtc *crtc, *enabled = NULL;
@@ -691,7 +768,7 @@ static void pineview_update_wm(struct intel_crtc *unused_crtc)
                /* cursor SR */
                wm = intel_calculate_wm(clock, &pineview_cursor_wm,
                                        pineview_display_wm.fifo_size,
-                                       cpp, latency->cursor_sr);
+                                       4, latency->cursor_sr);
                reg = I915_READ(DSPFW3);
                reg &= ~DSPFW_CURSOR_SR_MASK;
                reg |= FW_WM(wm, CURSOR_SR);
@@ -709,7 +786,7 @@ static void pineview_update_wm(struct intel_crtc *unused_crtc)
                /* cursor HPLL off SR */
                wm = intel_calculate_wm(clock, &pineview_cursor_hplloff_wm,
                                        pineview_display_hplloff_wm.fifo_size,
-                                       cpp, latency->cursor_hpll_disable);
+                                       4, latency->cursor_hpll_disable);
                reg = I915_READ(DSPFW3);
                reg &= ~DSPFW_HPLL_CURSOR_MASK;
                reg |= FW_WM(wm, HPLL_CURSOR);
@@ -734,7 +811,7 @@ static bool g4x_compute_wm0(struct drm_i915_private *dev_priv,
        struct intel_crtc *crtc;
        const struct drm_display_mode *adjusted_mode;
        const struct drm_framebuffer *fb;
-       int htotal, hdisplay, clock, cpp;
+       int htotal, plane_width, cursor_width, clock, cpp;
        int line_time_us, line_count;
        int entries, tlb_miss;
 
@@ -749,12 +826,13 @@ static bool g4x_compute_wm0(struct drm_i915_private *dev_priv,
        fb = crtc->base.primary->state->fb;
        clock = adjusted_mode->crtc_clock;
        htotal = adjusted_mode->crtc_htotal;
-       hdisplay = crtc->config->pipe_src_w;
+       plane_width = crtc->config->pipe_src_w;
+       cursor_width = crtc->base.cursor->state->crtc_w;
        cpp = fb->format->cpp[0];
 
        /* Use the small buffer method to calculate plane watermark */
        entries = ((clock * cpp / 1000) * display_latency_ns) / 1000;
-       tlb_miss = display->fifo_size*display->cacheline_size - hdisplay * 8;
+       tlb_miss = display->fifo_size*display->cacheline_size - plane_width * cpp * 8;
        if (tlb_miss > 0)
                entries += tlb_miss;
        entries = DIV_ROUND_UP(entries, display->cacheline_size);
@@ -765,8 +843,8 @@ static bool g4x_compute_wm0(struct drm_i915_private *dev_priv,
        /* Use the large buffer method to calculate cursor watermark */
        line_time_us = max(htotal * 1000 / clock, 1);
        line_count = (cursor_latency_ns / line_time_us + 1000) / 1000;
-       entries = line_count * crtc->base.cursor->state->crtc_w * cpp;
-       tlb_miss = cursor->fifo_size*cursor->cacheline_size - hdisplay * 8;
+       entries = line_count * cursor_width * 4;
+       tlb_miss = cursor->fifo_size*cursor->cacheline_size - cursor_width * 4 * 8;
        if (tlb_miss > 0)
                entries += tlb_miss;
        entries = DIV_ROUND_UP(entries, cursor->cacheline_size);
@@ -853,7 +931,7 @@ static bool g4x_compute_srwm(struct drm_i915_private *dev_priv,
        *display_wm = entries + display->guard_size;
 
        /* calculate the self-refresh watermark for display cursor */
-       entries = line_count * cpp * crtc->base.cursor->state->crtc_w;
+       entries = line_count * 4 * crtc->base.cursor->state->crtc_w;
        entries = DIV_ROUND_UP(entries, cursor->cacheline_size);
        *cursor_wm = entries + cursor->guard_size;
 
@@ -871,6 +949,8 @@ static void vlv_write_wm_values(struct drm_i915_private *dev_priv,
        enum pipe pipe;
 
        for_each_pipe(dev_priv, pipe) {
+               trace_vlv_wm(intel_get_crtc_for_pipe(dev_priv, pipe), wm);
+
                I915_WRITE(VLV_DDL(pipe),
                           (wm->ddl[pipe].plane[PLANE_CURSOR] << DDL_CURSOR_SHIFT) |
                           (wm->ddl[pipe].plane[PLANE_SPRITE1] << DDL_SPRITE_SHIFT(1)) |
@@ -941,12 +1021,6 @@ static void vlv_write_wm_values(struct drm_i915_private *dev_priv,
 
 #undef FW_WM_VLV
 
-enum vlv_wm_level {
-       VLV_WM_LEVEL_PM2,
-       VLV_WM_LEVEL_PM5,
-       VLV_WM_LEVEL_DDR_DVFS,
-};
-
 /* latency must be in 0.1us units. */
 static unsigned int vlv_wm_method2(unsigned int pixel_rate,
                                   unsigned int pipe_htotal,
@@ -991,7 +1065,7 @@ static uint16_t vlv_compute_wm_level(const struct intel_crtc_state *crtc_state,
        if (dev_priv->wm.pri_latency[level] == 0)
                return USHRT_MAX;
 
-       if (!plane_state->base.visible)
+       if (!intel_wm_plane_visible(crtc_state, plane_state))
                return 0;
 
        cpp = plane_state->base.fb->format->cpp[0];
@@ -1001,7 +1075,7 @@ static uint16_t vlv_compute_wm_level(const struct intel_crtc_state *crtc_state,
        if (WARN_ON(htotal == 0))
                htotal = 1;
 
-       if (plane->base.type == DRM_PLANE_TYPE_CURSOR) {
+       if (plane->id == PLANE_CURSOR) {
                /*
                 * FIXME the formula gives values that are
                 * too big for the cursor FIFO, and hence we
@@ -1017,71 +1091,109 @@ static uint16_t vlv_compute_wm_level(const struct intel_crtc_state *crtc_state,
        return min_t(int, wm, USHRT_MAX);
 }
 
-static void vlv_compute_fifo(struct intel_crtc *crtc)
+static bool vlv_need_sprite0_fifo_workaround(unsigned int active_planes)
 {
-       struct drm_device *dev = crtc->base.dev;
-       struct vlv_wm_state *wm_state = &crtc->wm_state;
-       struct intel_plane *plane;
-       unsigned int total_rate = 0;
-       const int fifo_size = 512 - 1;
+       return (active_planes & (BIT(PLANE_SPRITE0) |
+                                BIT(PLANE_SPRITE1))) == BIT(PLANE_SPRITE1);
+}
+
+static int vlv_compute_fifo(struct intel_crtc_state *crtc_state)
+{
+       struct intel_crtc *crtc = to_intel_crtc(crtc_state->base.crtc);
+       const struct g4x_pipe_wm *raw =
+               &crtc_state->wm.vlv.raw[VLV_WM_LEVEL_PM2];
+       struct vlv_fifo_state *fifo_state = &crtc_state->wm.vlv.fifo_state;
+       unsigned int active_planes = crtc_state->active_planes & ~BIT(PLANE_CURSOR);
+       int num_active_planes = hweight32(active_planes);
+       const int fifo_size = 511;
        int fifo_extra, fifo_left = fifo_size;
+       int sprite0_fifo_extra = 0;
+       unsigned int total_rate;
+       enum plane_id plane_id;
 
-       for_each_intel_plane_on_crtc(dev, crtc, plane) {
-               struct intel_plane_state *state =
-                       to_intel_plane_state(plane->base.state);
+       /*
+        * When enabling sprite0 after sprite1 has already been enabled
+        * we tend to get an underrun unless sprite0 already has some
+        * FIFO space allocated. Hence we always allocate at least one
+        * cacheline for sprite0 whenever sprite1 is enabled.
+        *
+        * All other plane enable sequences appear immune to this problem.
+        */
+       if (vlv_need_sprite0_fifo_workaround(active_planes))
+               sprite0_fifo_extra = 1;
 
-               if (plane->base.type == DRM_PLANE_TYPE_CURSOR)
-                       continue;
+       total_rate = raw->plane[PLANE_PRIMARY] +
+               raw->plane[PLANE_SPRITE0] +
+               raw->plane[PLANE_SPRITE1] +
+               sprite0_fifo_extra;
 
-               if (state->base.visible) {
-                       wm_state->num_active_planes++;
-                       total_rate += state->base.fb->format->cpp[0];
-               }
-       }
+       if (total_rate > fifo_size)
+               return -EINVAL;
 
-       for_each_intel_plane_on_crtc(dev, crtc, plane) {
-               struct intel_plane_state *state =
-                       to_intel_plane_state(plane->base.state);
-               unsigned int rate;
+       if (total_rate == 0)
+               total_rate = 1;
 
-               if (plane->base.type == DRM_PLANE_TYPE_CURSOR) {
-                       plane->wm.fifo_size = 63;
-                       continue;
-               }
+       for_each_plane_id_on_crtc(crtc, plane_id) {
+               unsigned int rate;
 
-               if (!state->base.visible) {
-                       plane->wm.fifo_size = 0;
+               if ((active_planes & BIT(plane_id)) == 0) {
+                       fifo_state->plane[plane_id] = 0;
                        continue;
                }
 
-               rate = state->base.fb->format->cpp[0];
-               plane->wm.fifo_size = fifo_size * rate / total_rate;
-               fifo_left -= plane->wm.fifo_size;
+               rate = raw->plane[plane_id];
+               fifo_state->plane[plane_id] = fifo_size * rate / total_rate;
+               fifo_left -= fifo_state->plane[plane_id];
        }
 
-       fifo_extra = DIV_ROUND_UP(fifo_left, wm_state->num_active_planes ?: 1);
+       fifo_state->plane[PLANE_SPRITE0] += sprite0_fifo_extra;
+       fifo_left -= sprite0_fifo_extra;
+
+       fifo_state->plane[PLANE_CURSOR] = 63;
+
+       fifo_extra = DIV_ROUND_UP(fifo_left, num_active_planes ?: 1);
 
        /* spread the remainder evenly */
-       for_each_intel_plane_on_crtc(dev, crtc, plane) {
+       for_each_plane_id_on_crtc(crtc, plane_id) {
                int plane_extra;
 
                if (fifo_left == 0)
                        break;
 
-               if (plane->base.type == DRM_PLANE_TYPE_CURSOR)
-                       continue;
-
-               /* give it all to the first plane if none are active */
-               if (plane->wm.fifo_size == 0 &&
-                   wm_state->num_active_planes)
+               if ((active_planes & BIT(plane_id)) == 0)
                        continue;
 
                plane_extra = min(fifo_extra, fifo_left);
-               plane->wm.fifo_size += plane_extra;
+               fifo_state->plane[plane_id] += plane_extra;
                fifo_left -= plane_extra;
        }
 
-       WARN_ON(fifo_left != 0);
+       WARN_ON(active_planes != 0 && fifo_left != 0);
+
+       /* give it all to the first plane if none are active */
+       if (active_planes == 0) {
+               WARN_ON(fifo_left != fifo_size);
+               fifo_state->plane[PLANE_PRIMARY] = fifo_left;
+       }
+
+       return 0;
+}
+
+/* mark all levels starting from 'level' as invalid */
+static void vlv_invalidate_wms(struct intel_crtc *crtc,
+                              struct vlv_wm_state *wm_state, int level)
+{
+       struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
+
+       for (; level < intel_wm_num_levels(dev_priv); level++) {
+               enum plane_id plane_id;
+
+               for_each_plane_id_on_crtc(crtc, plane_id)
+                       wm_state->wm[level].plane[plane_id] = USHRT_MAX;
+
+               wm_state->sr[level].cursor = USHRT_MAX;
+               wm_state->sr[level].plane = USHRT_MAX;
+       }
 }
 
 static u16 vlv_invert_wm_value(u16 wm, u16 fifo_size)
@@ -1092,144 +1204,230 @@ static u16 vlv_invert_wm_value(u16 wm, u16 fifo_size)
                return fifo_size - wm;
 }
 
-static void vlv_invert_wms(struct intel_crtc *crtc)
+/*
+ * Starting from 'level' (inclusive), set all
+ * levels to 'value' in the "raw" watermarks.
+ */
+static bool vlv_raw_plane_wm_set(struct intel_crtc_state *crtc_state,
+                                int level, enum plane_id plane_id, u16 value)
 {
-       struct vlv_wm_state *wm_state = &crtc->wm_state;
-       int level;
-
-       for (level = 0; level < wm_state->num_levels; level++) {
-               struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
-               const int sr_fifo_size =
-                       INTEL_INFO(dev_priv)->num_pipes * 512 - 1;
-               struct intel_plane *plane;
+       struct drm_i915_private *dev_priv = to_i915(crtc_state->base.crtc->dev);
+       int num_levels = intel_wm_num_levels(dev_priv);
+       bool dirty = false;
 
-               wm_state->sr[level].plane =
-                       vlv_invert_wm_value(wm_state->sr[level].plane,
-                                           sr_fifo_size);
-               wm_state->sr[level].cursor =
-                       vlv_invert_wm_value(wm_state->sr[level].cursor,
-                                           63);
+       for (; level < num_levels; level++) {
+               struct g4x_pipe_wm *raw = &crtc_state->wm.vlv.raw[level];
 
-               for_each_intel_plane_on_crtc(&dev_priv->drm, crtc, plane) {
-                       wm_state->wm[level].plane[plane->id] =
-                               vlv_invert_wm_value(wm_state->wm[level].plane[plane->id],
-                                                   plane->wm.fifo_size);
-               }
+               dirty |= raw->plane[plane_id] != value;
+               raw->plane[plane_id] = value;
        }
+
+       return dirty;
 }
 
-static void vlv_compute_wm(struct intel_crtc *crtc)
+static bool vlv_raw_plane_wm_compute(struct intel_crtc_state *crtc_state,
+                                    const struct intel_plane_state *plane_state)
 {
-       struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
-       struct vlv_wm_state *wm_state = &crtc->wm_state;
-       struct intel_plane *plane;
+       struct intel_plane *plane = to_intel_plane(plane_state->base.plane);
+       enum plane_id plane_id = plane->id;
+       int num_levels = intel_wm_num_levels(to_i915(plane->base.dev));
        int level;
+       bool dirty = false;
+
+       if (!intel_wm_plane_visible(crtc_state, plane_state)) {
+               dirty |= vlv_raw_plane_wm_set(crtc_state, 0, plane_id, 0);
+               goto out;
+       }
+
+       for (level = 0; level < num_levels; level++) {
+               struct g4x_pipe_wm *raw = &crtc_state->wm.vlv.raw[level];
+               int wm = vlv_compute_wm_level(crtc_state, plane_state, level);
+               int max_wm = plane_id == PLANE_CURSOR ? 63 : 511;
 
-       memset(wm_state, 0, sizeof(*wm_state));
+               if (wm > max_wm)
+                       break;
 
-       wm_state->cxsr = crtc->pipe != PIPE_C && crtc->wm.cxsr_allowed;
-       wm_state->num_levels = dev_priv->wm.max_level + 1;
+               dirty |= raw->plane[plane_id] != wm;
+               raw->plane[plane_id] = wm;
+       }
 
-       wm_state->num_active_planes = 0;
+       /* mark all higher levels as invalid */
+       dirty |= vlv_raw_plane_wm_set(crtc_state, level, plane_id, USHRT_MAX);
 
-       vlv_compute_fifo(crtc);
+out:
+       if (dirty)
+               DRM_DEBUG_KMS("%s watermarks: PM2=%d, PM5=%d, DDR DVFS=%d\n",
+                             plane->base.name,
+                             crtc_state->wm.vlv.raw[VLV_WM_LEVEL_PM2].plane[plane_id],
+                             crtc_state->wm.vlv.raw[VLV_WM_LEVEL_PM5].plane[plane_id],
+                             crtc_state->wm.vlv.raw[VLV_WM_LEVEL_DDR_DVFS].plane[plane_id]);
 
-       if (wm_state->num_active_planes != 1)
-               wm_state->cxsr = false;
+       return dirty;
+}
 
-       for_each_intel_plane_on_crtc(&dev_priv->drm, crtc, plane) {
-               struct intel_plane_state *state =
+static bool vlv_raw_plane_wm_is_valid(const struct intel_crtc_state *crtc_state,
+                                     enum plane_id plane_id, int level)
+{
+       const struct g4x_pipe_wm *raw =
+               &crtc_state->wm.vlv.raw[level];
+       const struct vlv_fifo_state *fifo_state =
+               &crtc_state->wm.vlv.fifo_state;
+
+       return raw->plane[plane_id] <= fifo_state->plane[plane_id];
+}
+
+static bool vlv_raw_crtc_wm_is_valid(const struct intel_crtc_state *crtc_state, int level)
+{
+       return vlv_raw_plane_wm_is_valid(crtc_state, PLANE_PRIMARY, level) &&
+               vlv_raw_plane_wm_is_valid(crtc_state, PLANE_SPRITE0, level) &&
+               vlv_raw_plane_wm_is_valid(crtc_state, PLANE_SPRITE1, level) &&
+               vlv_raw_plane_wm_is_valid(crtc_state, PLANE_CURSOR, level);
+}
+
+static int vlv_compute_pipe_wm(struct intel_crtc_state *crtc_state)
+{
+       struct intel_crtc *crtc = to_intel_crtc(crtc_state->base.crtc);
+       struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
+       struct intel_atomic_state *state =
+               to_intel_atomic_state(crtc_state->base.state);
+       struct vlv_wm_state *wm_state = &crtc_state->wm.vlv.optimal;
+       const struct vlv_fifo_state *fifo_state =
+               &crtc_state->wm.vlv.fifo_state;
+       int num_active_planes = hweight32(crtc_state->active_planes &
+                                         ~BIT(PLANE_CURSOR));
+       bool needs_modeset = drm_atomic_crtc_needs_modeset(&crtc_state->base);
+       struct intel_plane_state *plane_state;
+       struct intel_plane *plane;
+       enum plane_id plane_id;
+       int level, ret, i;
+       unsigned int dirty = 0;
+
+       for_each_intel_plane_in_state(state, plane, plane_state, i) {
+               const struct intel_plane_state *old_plane_state =
                        to_intel_plane_state(plane->base.state);
-               int level;
 
-               if (!state->base.visible)
+               if (plane_state->base.crtc != &crtc->base &&
+                   old_plane_state->base.crtc != &crtc->base)
                        continue;
 
-               /* normal watermarks */
-               for (level = 0; level < wm_state->num_levels; level++) {
-                       int wm = vlv_compute_wm_level(crtc->config, state, level);
-                       int max_wm = plane->wm.fifo_size;
+               if (vlv_raw_plane_wm_compute(crtc_state, plane_state))
+                       dirty |= BIT(plane->id);
+       }
 
-                       /* hack */
-                       if (WARN_ON(level == 0 && wm > max_wm))
-                               wm = max_wm;
+       /*
+        * DSPARB registers may have been reset due to the
+        * power well being turned off. Make sure we restore
+        * them to a consistent state even if no primary/sprite
+        * planes are initially active.
+        */
+       if (needs_modeset)
+               crtc_state->fifo_changed = true;
 
-                       if (wm > max_wm)
-                               break;
+       if (!dirty)
+               return 0;
 
-                       wm_state->wm[level].plane[plane->id] = wm;
-               }
+       /* cursor changes don't warrant a FIFO recompute */
+       if (dirty & ~BIT(PLANE_CURSOR)) {
+               const struct intel_crtc_state *old_crtc_state =
+                       to_intel_crtc_state(crtc->base.state);
+               const struct vlv_fifo_state *old_fifo_state =
+                       &old_crtc_state->wm.vlv.fifo_state;
+
+               ret = vlv_compute_fifo(crtc_state);
+               if (ret)
+                       return ret;
 
-               wm_state->num_levels = level;
+               if (needs_modeset ||
+                   memcmp(old_fifo_state, fifo_state,
+                          sizeof(*fifo_state)) != 0)
+                       crtc_state->fifo_changed = true;
+       }
 
-               if (!wm_state->cxsr)
-                       continue;
+       /* initially allow all levels */
+       wm_state->num_levels = intel_wm_num_levels(dev_priv);
+       /*
+        * Note that enabling cxsr with no primary/sprite planes
+        * enabled can wedge the pipe. Hence we only allow cxsr
+        * with exactly one enabled primary/sprite plane.
+        */
+       wm_state->cxsr = crtc->pipe != PIPE_C && num_active_planes == 1;
 
-               /* maxfifo watermarks */
-               if (plane->id == PLANE_CURSOR) {
-                       for (level = 0; level < wm_state->num_levels; level++)
-                               wm_state->sr[level].cursor =
-                                       wm_state->wm[level].plane[PLANE_CURSOR];
-               } else {
-                       for (level = 0; level < wm_state->num_levels; level++)
-                               wm_state->sr[level].plane =
-                                       max(wm_state->sr[level].plane,
-                                           wm_state->wm[level].plane[plane->id]);
+       for (level = 0; level < wm_state->num_levels; level++) {
+               const struct g4x_pipe_wm *raw = &crtc_state->wm.vlv.raw[level];
+               const int sr_fifo_size = INTEL_INFO(dev_priv)->num_pipes * 512 - 1;
+
+               if (!vlv_raw_crtc_wm_is_valid(crtc_state, level))
+                       break;
+
+               for_each_plane_id_on_crtc(crtc, plane_id) {
+                       wm_state->wm[level].plane[plane_id] =
+                               vlv_invert_wm_value(raw->plane[plane_id],
+                                                   fifo_state->plane[plane_id]);
                }
-       }
 
-       /* clear any (partially) filled invalid levels */
-       for (level = wm_state->num_levels; level < dev_priv->wm.max_level + 1; level++) {
-               memset(&wm_state->wm[level], 0, sizeof(wm_state->wm[level]));
-               memset(&wm_state->sr[level], 0, sizeof(wm_state->sr[level]));
+               wm_state->sr[level].plane =
+                       vlv_invert_wm_value(max3(raw->plane[PLANE_PRIMARY],
+                                                raw->plane[PLANE_SPRITE0],
+                                                raw->plane[PLANE_SPRITE1]),
+                                           sr_fifo_size);
+
+               wm_state->sr[level].cursor =
+                       vlv_invert_wm_value(raw->plane[PLANE_CURSOR],
+                                           63);
        }
 
-       vlv_invert_wms(crtc);
+       if (level == 0)
+               return -EINVAL;
+
+       /* limit to only levels we can actually handle */
+       wm_state->num_levels = level;
+
+       /* invalidate the higher levels */
+       vlv_invalidate_wms(crtc, wm_state, level);
+
+       return 0;
 }
 
 #define VLV_FIFO(plane, value) \
        (((value) << DSPARB_ ## plane ## _SHIFT_VLV) & DSPARB_ ## plane ## _MASK_VLV)
 
-static void vlv_pipe_set_fifo_size(struct intel_crtc *crtc)
+static void vlv_atomic_update_fifo(struct intel_atomic_state *state,
+                                  struct intel_crtc_state *crtc_state)
 {
-       struct drm_device *dev = crtc->base.dev;
-       struct drm_i915_private *dev_priv = to_i915(dev);
-       struct intel_plane *plane;
-       int sprite0_start = 0, sprite1_start = 0, fifo_size = 0;
+       struct intel_crtc *crtc = to_intel_crtc(crtc_state->base.crtc);
+       struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
+       const struct vlv_fifo_state *fifo_state =
+               &crtc_state->wm.vlv.fifo_state;
+       int sprite0_start, sprite1_start, fifo_size;
 
-       for_each_intel_plane_on_crtc(dev, crtc, plane) {
-               switch (plane->id) {
-               case PLANE_PRIMARY:
-                       sprite0_start = plane->wm.fifo_size;
-                       break;
-               case PLANE_SPRITE0:
-                       sprite1_start = sprite0_start + plane->wm.fifo_size;
-                       break;
-               case PLANE_SPRITE1:
-                       fifo_size = sprite1_start + plane->wm.fifo_size;
-                       break;
-               case PLANE_CURSOR:
-                       WARN_ON(plane->wm.fifo_size != 63);
-                       break;
-               default:
-                       MISSING_CASE(plane->id);
-                       break;
-               }
-       }
+       if (!crtc_state->fifo_changed)
+               return;
+
+       sprite0_start = fifo_state->plane[PLANE_PRIMARY];
+       sprite1_start = fifo_state->plane[PLANE_SPRITE0] + sprite0_start;
+       fifo_size = fifo_state->plane[PLANE_SPRITE1] + sprite1_start;
 
-       WARN_ON(fifo_size != 512 - 1);
+       WARN_ON(fifo_state->plane[PLANE_CURSOR] != 63);
+       WARN_ON(fifo_size != 511);
 
-       DRM_DEBUG_KMS("Pipe %c FIFO split %d / %d / %d\n",
-                     pipe_name(crtc->pipe), sprite0_start,
-                     sprite1_start, fifo_size);
+       trace_vlv_fifo_size(crtc, sprite0_start, sprite1_start, fifo_size);
 
-       spin_lock(&dev_priv->wm.dsparb_lock);
+       /*
+        * uncore.lock serves a double purpose here. It allows us to
+        * use the less expensive I915_{READ,WRITE}_FW() functions, and
+        * it protects the DSPARB registers from getting clobbered by
+        * parallel updates from multiple pipes.
+        *
+        * intel_pipe_update_start() has already disabled interrupts
+        * for us, so a plain spin_lock() is sufficient here.
+        */
+       spin_lock(&dev_priv->uncore.lock);
 
        switch (crtc->pipe) {
                uint32_t dsparb, dsparb2, dsparb3;
        case PIPE_A:
-               dsparb = I915_READ(DSPARB);
-               dsparb2 = I915_READ(DSPARB2);
+               dsparb = I915_READ_FW(DSPARB);
+               dsparb2 = I915_READ_FW(DSPARB2);
 
                dsparb &= ~(VLV_FIFO(SPRITEA, 0xff) |
                            VLV_FIFO(SPRITEB, 0xff));
@@ -1241,12 +1439,12 @@ static void vlv_pipe_set_fifo_size(struct intel_crtc *crtc)
                dsparb2 |= (VLV_FIFO(SPRITEA_HI, sprite0_start >> 8) |
                           VLV_FIFO(SPRITEB_HI, sprite1_start >> 8));
 
-               I915_WRITE(DSPARB, dsparb);
-               I915_WRITE(DSPARB2, dsparb2);
+               I915_WRITE_FW(DSPARB, dsparb);
+               I915_WRITE_FW(DSPARB2, dsparb2);
                break;
        case PIPE_B:
-               dsparb = I915_READ(DSPARB);
-               dsparb2 = I915_READ(DSPARB2);
+               dsparb = I915_READ_FW(DSPARB);
+               dsparb2 = I915_READ_FW(DSPARB2);
 
                dsparb &= ~(VLV_FIFO(SPRITEC, 0xff) |
                            VLV_FIFO(SPRITED, 0xff));
@@ -1258,12 +1456,12 @@ static void vlv_pipe_set_fifo_size(struct intel_crtc *crtc)
                dsparb2 |= (VLV_FIFO(SPRITEC_HI, sprite0_start >> 8) |
                           VLV_FIFO(SPRITED_HI, sprite1_start >> 8));
 
-               I915_WRITE(DSPARB, dsparb);
-               I915_WRITE(DSPARB2, dsparb2);
+               I915_WRITE_FW(DSPARB, dsparb);
+               I915_WRITE_FW(DSPARB2, dsparb2);
                break;
        case PIPE_C:
-               dsparb3 = I915_READ(DSPARB3);
-               dsparb2 = I915_READ(DSPARB2);
+               dsparb3 = I915_READ_FW(DSPARB3);
+               dsparb2 = I915_READ_FW(DSPARB2);
 
                dsparb3 &= ~(VLV_FIFO(SPRITEE, 0xff) |
                             VLV_FIFO(SPRITEF, 0xff));
@@ -1275,20 +1473,60 @@ static void vlv_pipe_set_fifo_size(struct intel_crtc *crtc)
                dsparb2 |= (VLV_FIFO(SPRITEE_HI, sprite0_start >> 8) |
                           VLV_FIFO(SPRITEF_HI, sprite1_start >> 8));
 
-               I915_WRITE(DSPARB3, dsparb3);
-               I915_WRITE(DSPARB2, dsparb2);
+               I915_WRITE_FW(DSPARB3, dsparb3);
+               I915_WRITE_FW(DSPARB2, dsparb2);
                break;
        default:
                break;
        }
 
-       POSTING_READ(DSPARB);
+       POSTING_READ_FW(DSPARB);
 
-       spin_unlock(&dev_priv->wm.dsparb_lock);
+       spin_unlock(&dev_priv->uncore.lock);
 }
 
 #undef VLV_FIFO
 
+static int vlv_compute_intermediate_wm(struct drm_device *dev,
+                                      struct intel_crtc *crtc,
+                                      struct intel_crtc_state *crtc_state)
+{
+       struct vlv_wm_state *intermediate = &crtc_state->wm.vlv.intermediate;
+       const struct vlv_wm_state *optimal = &crtc_state->wm.vlv.optimal;
+       const struct vlv_wm_state *active = &crtc->wm.active.vlv;
+       int level;
+
+       intermediate->num_levels = min(optimal->num_levels, active->num_levels);
+       intermediate->cxsr = optimal->cxsr && active->cxsr &&
+               !crtc_state->disable_cxsr;
+
+       for (level = 0; level < intermediate->num_levels; level++) {
+               enum plane_id plane_id;
+
+               for_each_plane_id_on_crtc(crtc, plane_id) {
+                       intermediate->wm[level].plane[plane_id] =
+                               min(optimal->wm[level].plane[plane_id],
+                                   active->wm[level].plane[plane_id]);
+               }
+
+               intermediate->sr[level].plane = min(optimal->sr[level].plane,
+                                                   active->sr[level].plane);
+               intermediate->sr[level].cursor = min(optimal->sr[level].cursor,
+                                                    active->sr[level].cursor);
+       }
+
+       vlv_invalidate_wms(crtc, intermediate, level);
+
+       /*
+        * If our intermediate WM are identical to the final WM, then we can
+        * omit the post-vblank programming; only update if it's different.
+        */
+       if (memcmp(intermediate, optimal, sizeof(*intermediate)) != 0)
+               crtc_state->wm.need_postvbl_update = true;
+
+       return 0;
+}
+
 static void vlv_merge_wm(struct drm_i915_private *dev_priv,
                         struct vlv_wm_values *wm)
 {
@@ -1299,7 +1537,7 @@ static void vlv_merge_wm(struct drm_i915_private *dev_priv,
        wm->cxsr = true;
 
        for_each_intel_crtc(&dev_priv->drm, crtc) {
-               const struct vlv_wm_state *wm_state = &crtc->wm_state;
+               const struct vlv_wm_state *wm_state = &crtc->wm.active.vlv;
 
                if (!crtc->active)
                        continue;
@@ -1318,14 +1556,11 @@ static void vlv_merge_wm(struct drm_i915_private *dev_priv,
                wm->level = VLV_WM_LEVEL_PM2;
 
        for_each_intel_crtc(&dev_priv->drm, crtc) {
-               struct vlv_wm_state *wm_state = &crtc->wm_state;
+               const struct vlv_wm_state *wm_state = &crtc->wm.active.vlv;
                enum pipe pipe = crtc->pipe;
 
-               if (!crtc->active)
-                       continue;
-
                wm->pipe[pipe] = wm_state->wm[wm->level];
-               if (wm->cxsr)
+               if (crtc->active && wm->cxsr)
                        wm->sr = wm_state->sr[wm->level];
 
                wm->ddl[pipe].plane[PLANE_PRIMARY] = DDL_PRECISION_HIGH | 2;
@@ -1345,22 +1580,15 @@ static bool is_enabling(int old, int new, int threshold)
        return old < threshold && new >= threshold;
 }
 
-static void vlv_update_wm(struct intel_crtc *crtc)
+static void vlv_program_watermarks(struct drm_i915_private *dev_priv)
 {
-       struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
-       enum pipe pipe = crtc->pipe;
        struct vlv_wm_values *old_wm = &dev_priv->wm.vlv;
        struct vlv_wm_values new_wm = {};
 
-       vlv_compute_wm(crtc);
        vlv_merge_wm(dev_priv, &new_wm);
 
-       if (memcmp(old_wm, &new_wm, sizeof(new_wm)) == 0) {
-               /* FIXME should be part of crtc atomic commit */
-               vlv_pipe_set_fifo_size(crtc);
-
+       if (memcmp(old_wm, &new_wm, sizeof(new_wm)) == 0)
                return;
-       }
 
        if (is_disabling(old_wm->level, new_wm.level, VLV_WM_LEVEL_DDR_DVFS))
                chv_set_memory_dvfs(dev_priv, false);
@@ -1371,17 +1599,8 @@ static void vlv_update_wm(struct intel_crtc *crtc)
        if (is_disabling(old_wm->cxsr, new_wm.cxsr, true))
                _intel_set_memory_cxsr(dev_priv, false);
 
-       /* FIXME should be part of crtc atomic commit */
-       vlv_pipe_set_fifo_size(crtc);
-
        vlv_write_wm_values(dev_priv, &new_wm);
 
-       DRM_DEBUG_KMS("Setting FIFO watermarks - %c: plane=%d, cursor=%d, "
-                     "sprite0=%d, sprite1=%d, SR: plane=%d, cursor=%d level=%d cxsr=%d\n",
-                     pipe_name(pipe), new_wm.pipe[pipe].plane[PLANE_PRIMARY], new_wm.pipe[pipe].plane[PLANE_CURSOR],
-                     new_wm.pipe[pipe].plane[PLANE_SPRITE0], new_wm.pipe[pipe].plane[PLANE_SPRITE1],
-                     new_wm.sr.plane, new_wm.sr.cursor, new_wm.level, new_wm.cxsr);
-
        if (is_enabling(old_wm->cxsr, new_wm.cxsr, true))
                _intel_set_memory_cxsr(dev_priv, true);
 
@@ -1394,6 +1613,33 @@ static void vlv_update_wm(struct intel_crtc *crtc)
        *old_wm = new_wm;
 }
 
+static void vlv_initial_watermarks(struct intel_atomic_state *state,
+                                  struct intel_crtc_state *crtc_state)
+{
+       struct drm_i915_private *dev_priv = to_i915(crtc_state->base.crtc->dev);
+       struct intel_crtc *crtc = to_intel_crtc(crtc_state->base.crtc);
+
+       mutex_lock(&dev_priv->wm.wm_mutex);
+       crtc->wm.active.vlv = crtc_state->wm.vlv.intermediate;
+       vlv_program_watermarks(dev_priv);
+       mutex_unlock(&dev_priv->wm.wm_mutex);
+}
+
+static void vlv_optimize_watermarks(struct intel_atomic_state *state,
+                                   struct intel_crtc_state *crtc_state)
+{
+       struct drm_i915_private *dev_priv = to_i915(crtc_state->base.crtc->dev);
+       struct intel_crtc *intel_crtc = to_intel_crtc(crtc_state->base.crtc);
+
+       if (!crtc_state->wm.need_postvbl_update)
+               return;
+
+       mutex_lock(&dev_priv->wm.wm_mutex);
+       intel_crtc->wm.active.vlv = crtc_state->wm.vlv.optimal;
+       vlv_program_watermarks(dev_priv);
+       mutex_unlock(&dev_priv->wm.wm_mutex);
+}
+
 #define single_plane_enabled(mask) is_power_of_2(mask)
 
 static void g4x_update_wm(struct intel_crtc *crtc)
@@ -1491,7 +1737,7 @@ static void i965_update_wm(struct intel_crtc *unused_crtc)
                              entries, srwm);
 
                entries = (((sr_latency_ns / line_time_us) + 1000) / 1000) *
-                       cpp * crtc->base.cursor->state->crtc_w;
+                       4 * crtc->base.cursor->state->crtc_w;
                entries = DIV_ROUND_UP(entries,
                                          i965_cursor_wm_info.cacheline_size);
                cursor_sr = i965_cursor_wm_info.fifo_size -
@@ -1701,39 +1947,6 @@ static void i845_update_wm(struct intel_crtc *unused_crtc)
        I915_WRITE(FW_BLC, fwater_lo);
 }
 
-uint32_t ilk_pipe_pixel_rate(const struct intel_crtc_state *pipe_config)
-{
-       uint32_t pixel_rate;
-
-       pixel_rate = pipe_config->base.adjusted_mode.crtc_clock;
-
-       /* We only use IF-ID interlacing. If we ever use PF-ID we'll need to
-        * adjust the pixel_rate here. */
-
-       if (pipe_config->pch_pfit.enabled) {
-               uint64_t pipe_w, pipe_h, pfit_w, pfit_h;
-               uint32_t pfit_size = pipe_config->pch_pfit.size;
-
-               pipe_w = pipe_config->pipe_src_w;
-               pipe_h = pipe_config->pipe_src_h;
-
-               pfit_w = (pfit_size >> 16) & 0xFFFF;
-               pfit_h = pfit_size & 0xFFFF;
-               if (pipe_w < pfit_w)
-                       pipe_w = pfit_w;
-               if (pipe_h < pfit_h)
-                       pipe_h = pfit_h;
-
-               if (WARN_ON(!pfit_w || !pfit_h))
-                       return pixel_rate;
-
-               pixel_rate = div_u64((uint64_t) pixel_rate * pipe_w * pipe_h,
-                                    pfit_w * pfit_h);
-       }
-
-       return pixel_rate;
-}
-
 /* latency must be in 0.1us units. */
 static uint32_t ilk_wm_method1(uint32_t pixel_rate, uint8_t cpp, uint32_t latency)
 {
@@ -1802,17 +2015,17 @@ static uint32_t ilk_compute_pri_wm(const struct intel_crtc_state *cstate,
        uint32_t method1, method2;
        int cpp;
 
-       if (!cstate->base.active || !pstate->base.visible)
+       if (!intel_wm_plane_visible(cstate, pstate))
                return 0;
 
        cpp = pstate->base.fb->format->cpp[0];
 
-       method1 = ilk_wm_method1(ilk_pipe_pixel_rate(cstate), cpp, mem_value);
+       method1 = ilk_wm_method1(cstate->pixel_rate, cpp, mem_value);
 
        if (!is_lp)
                return method1;
 
-       method2 = ilk_wm_method2(ilk_pipe_pixel_rate(cstate),
+       method2 = ilk_wm_method2(cstate->pixel_rate,
                                 cstate->base.adjusted_mode.crtc_htotal,
                                 drm_rect_width(&pstate->base.dst),
                                 cpp, mem_value);
@@ -1831,13 +2044,13 @@ static uint32_t ilk_compute_spr_wm(const struct intel_crtc_state *cstate,
        uint32_t method1, method2;
        int cpp;
 
-       if (!cstate->base.active || !pstate->base.visible)
+       if (!intel_wm_plane_visible(cstate, pstate))
                return 0;
 
        cpp = pstate->base.fb->format->cpp[0];
 
-       method1 = ilk_wm_method1(ilk_pipe_pixel_rate(cstate), cpp, mem_value);
-       method2 = ilk_wm_method2(ilk_pipe_pixel_rate(cstate),
+       method1 = ilk_wm_method1(cstate->pixel_rate, cpp, mem_value);
+       method2 = ilk_wm_method2(cstate->pixel_rate,
                                 cstate->base.adjusted_mode.crtc_htotal,
                                 drm_rect_width(&pstate->base.dst),
                                 cpp, mem_value);
@@ -1852,20 +2065,16 @@ static uint32_t ilk_compute_cur_wm(const struct intel_crtc_state *cstate,
                                   const struct intel_plane_state *pstate,
                                   uint32_t mem_value)
 {
-       /*
-        * We treat the cursor plane as always-on for the purposes of watermark
-        * calculation.  Until we have two-stage watermark programming merged,
-        * this is necessary to avoid flickering.
-        */
-       int cpp = 4;
-       int width = pstate->base.visible ? pstate->base.crtc_w : 64;
+       int cpp;
 
-       if (!cstate->base.active)
+       if (!intel_wm_plane_visible(cstate, pstate))
                return 0;
 
-       return ilk_wm_method2(ilk_pipe_pixel_rate(cstate),
+       cpp = pstate->base.fb->format->cpp[0];
+
+       return ilk_wm_method2(cstate->pixel_rate,
                              cstate->base.adjusted_mode.crtc_htotal,
-                             width, cpp, mem_value);
+                             pstate->base.crtc_w, cpp, mem_value);
 }
 
 /* Only for WM_LP. */
@@ -1875,7 +2084,7 @@ static uint32_t ilk_compute_fbc_wm(const struct intel_crtc_state *cstate,
 {
        int cpp;
 
-       if (!cstate->base.active || !pstate->base.visible)
+       if (!intel_wm_plane_visible(cstate, pstate))
                return 0;
 
        cpp = pstate->base.fb->format->cpp[0];
@@ -2095,7 +2304,7 @@ hsw_compute_linetime_wm(const struct intel_crtc_state *cstate)
                return 0;
        if (WARN_ON(adjusted_mode->crtc_clock == 0))
                return 0;
-       if (WARN_ON(intel_state->cdclk == 0))
+       if (WARN_ON(intel_state->cdclk.logical.cdclk == 0))
                return 0;
 
        /* The WM are computed with base on how long it takes to fill a single
@@ -2104,7 +2313,7 @@ hsw_compute_linetime_wm(const struct intel_crtc_state *cstate)
        linetime = DIV_ROUND_CLOSEST(adjusted_mode->crtc_htotal * 1000 * 8,
                                     adjusted_mode->crtc_clock);
        ips_linetime = DIV_ROUND_CLOSEST(adjusted_mode->crtc_htotal * 1000 * 8,
-                                        intel_state->cdclk);
+                                        intel_state->cdclk.logical.cdclk);
 
        return PIPE_WM_LINETIME_IPS_LINETIME(ips_linetime) |
               PIPE_WM_LINETIME_TIME(linetime);
@@ -2173,7 +2382,7 @@ static void intel_read_wm_latency(struct drm_i915_private *dev_priv,
                }
 
                /*
-                * WaWmMemoryReadLatency:skl
+                * WaWmMemoryReadLatency:skl,glk
                 *
                 * punit doesn't take into account the read latency so we need
                 * to add 2us to the various latency levels we retrieve from the
@@ -2498,8 +2707,8 @@ static int ilk_compute_intermediate_wm(struct drm_device *dev,
         * If our intermediate WM are identical to the final WM, then we can
         * omit the post-vblank programming; only update if it's different.
         */
-       if (memcmp(a, &newstate->wm.ilk.optimal, sizeof(*a)) == 0)
-               newstate->wm.need_postvbl_update = false;
+       if (memcmp(a, &newstate->wm.ilk.optimal, sizeof(*a)) != 0)
+               newstate->wm.need_postvbl_update = true;
 
        return 0;
 }
@@ -2895,8 +3104,7 @@ static bool skl_needs_memory_bw_wa(struct intel_atomic_state *state)
 {
        struct drm_i915_private *dev_priv = to_i915(state->base.dev);
 
-       if (IS_SKYLAKE(dev_priv) || IS_BROXTON(dev_priv) ||
-           IS_KABYLAKE(dev_priv))
+       if (IS_GEN9_BC(dev_priv) || IS_BROXTON(dev_priv))
                return true;
 
        return false;
@@ -3184,19 +3392,29 @@ void skl_ddb_get_hw_state(struct drm_i915_private *dev_priv,
  * Caller should take care of dividing & rounding off the value.
  */
 static uint32_t
-skl_plane_downscale_amount(const struct intel_plane_state *pstate)
+skl_plane_downscale_amount(const struct intel_crtc_state *cstate,
+                          const struct intel_plane_state *pstate)
 {
+       struct intel_plane *plane = to_intel_plane(pstate->base.plane);
        uint32_t downscale_h, downscale_w;
        uint32_t src_w, src_h, dst_w, dst_h;
 
-       if (WARN_ON(!pstate->base.visible))
+       if (WARN_ON(!intel_wm_plane_visible(cstate, pstate)))
                return DRM_PLANE_HELPER_NO_SCALING;
 
        /* n.b., src is 16.16 fixed point, dst is whole integer */
-       src_w = drm_rect_width(&pstate->base.src);
-       src_h = drm_rect_height(&pstate->base.src);
-       dst_w = drm_rect_width(&pstate->base.dst);
-       dst_h = drm_rect_height(&pstate->base.dst);
+       if (plane->id == PLANE_CURSOR) {
+               src_w = pstate->base.src_w;
+               src_h = pstate->base.src_h;
+               dst_w = pstate->base.crtc_w;
+               dst_h = pstate->base.crtc_h;
+       } else {
+               src_w = drm_rect_width(&pstate->base.src);
+               src_h = drm_rect_height(&pstate->base.src);
+               dst_w = drm_rect_width(&pstate->base.dst);
+               dst_h = drm_rect_height(&pstate->base.dst);
+       }
+
        if (drm_rotation_90_or_270(pstate->base.rotation))
                swap(dst_w, dst_h);
 
@@ -3212,6 +3430,7 @@ skl_plane_relative_data_rate(const struct intel_crtc_state *cstate,
                             const struct drm_plane_state *pstate,
                             int y)
 {
+       struct intel_plane *plane = to_intel_plane(pstate->plane);
        struct intel_plane_state *intel_pstate = to_intel_plane_state(pstate);
        uint32_t down_scale_amount, data_rate;
        uint32_t width = 0, height = 0;
@@ -3224,7 +3443,7 @@ skl_plane_relative_data_rate(const struct intel_crtc_state *cstate,
        fb = pstate->fb;
        format = fb->format->format;
 
-       if (pstate->plane->type == DRM_PLANE_TYPE_CURSOR)
+       if (plane->id == PLANE_CURSOR)
                return 0;
        if (y && format != DRM_FORMAT_NV12)
                return 0;
@@ -3248,7 +3467,7 @@ skl_plane_relative_data_rate(const struct intel_crtc_state *cstate,
                data_rate = width * height * fb->format->cpp[0];
        }
 
-       down_scale_amount = skl_plane_downscale_amount(intel_pstate);
+       down_scale_amount = skl_plane_downscale_amount(cstate, intel_pstate);
 
        return (uint64_t)data_rate * down_scale_amount >> 16;
 }
@@ -3540,15 +3759,15 @@ static uint32_t skl_adjusted_plane_pixel_rate(const struct intel_crtc_state *cst
        uint64_t pixel_rate;
 
        /* Shouldn't reach here on disabled planes... */
-       if (WARN_ON(!pstate->base.visible))
+       if (WARN_ON(!intel_wm_plane_visible(cstate, pstate)))
                return 0;
 
        /*
         * Adjusted plane pixel rate is just the pipe's adjusted pixel rate
         * with additional adjustments for plane-specific scaling.
         */
-       adjusted_pixel_rate = ilk_pipe_pixel_rate(cstate);
-       downscale_amount = skl_plane_downscale_amount(pstate);
+       adjusted_pixel_rate = cstate->pixel_rate;
+       downscale_amount = skl_plane_downscale_amount(cstate, pstate);
 
        pixel_rate = adjusted_pixel_rate * downscale_amount >> 16;
        WARN_ON(pixel_rate != clamp_t(uint32_t, pixel_rate, 0, ~0));
@@ -3565,6 +3784,7 @@ static int skl_compute_plane_wm(const struct drm_i915_private *dev_priv,
                                uint8_t *out_lines, /* out */
                                bool *enabled /* out */)
 {
+       struct intel_plane *plane = to_intel_plane(intel_pstate->base.plane);
        struct drm_plane_state *pstate = &intel_pstate->base;
        struct drm_framebuffer *fb = pstate->fb;
        uint32_t latency = dev_priv->wm.skl_latency[level];
@@ -3584,7 +3804,8 @@ static int skl_compute_plane_wm(const struct drm_i915_private *dev_priv,
        bool apply_memory_bw_wa = skl_needs_memory_bw_wa(state);
        bool y_tiled, x_tiled;
 
-       if (latency == 0 || !cstate->base.active || !intel_pstate->base.visible) {
+       if (latency == 0 ||
+           !intel_wm_plane_visible(cstate, intel_pstate)) {
                *enabled = false;
                return 0;
        }
@@ -3600,8 +3821,13 @@ static int skl_compute_plane_wm(const struct drm_i915_private *dev_priv,
        if (apply_memory_bw_wa && x_tiled)
                latency += 15;
 
-       width = drm_rect_width(&intel_pstate->base.src) >> 16;
-       height = drm_rect_height(&intel_pstate->base.src) >> 16;
+       if (plane->id == PLANE_CURSOR) {
+               width = intel_pstate->base.crtc_w;
+               height = intel_pstate->base.crtc_h;
+       } else {
+               width = drm_rect_width(&intel_pstate->base.src) >> 16;
+               height = drm_rect_height(&intel_pstate->base.src) >> 16;
+       }
 
        if (drm_rotation_90_or_270(pstate->rotation))
                swap(width, height);
@@ -3775,7 +4001,7 @@ skl_compute_linetime_wm(struct intel_crtc_state *cstate)
        if (!cstate->base.active)
                return 0;
 
-       pixel_rate = ilk_pipe_pixel_rate(cstate);
+       pixel_rate = cstate->pixel_rate;
 
        if (WARN_ON(pixel_rate == 0))
                return 0;
@@ -3967,7 +4193,7 @@ pipes_modified(struct drm_atomic_state *state)
        struct drm_crtc_state *cstate;
        uint32_t i, ret = 0;
 
-       for_each_crtc_in_state(state, crtc, cstate, i)
+       for_each_new_crtc_in_state(state, crtc, cstate, i)
                ret |= drm_crtc_mask(crtc);
 
        return ret;
@@ -4110,7 +4336,7 @@ skl_print_wm_changes(const struct drm_atomic_state *state)
        const struct skl_ddb_allocation *new_ddb = &intel_state->wm_results.ddb;
        int i;
 
-       for_each_crtc_in_state(state, crtc, cstate, i) {
+       for_each_new_crtc_in_state(state, crtc, cstate, i) {
                const struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
                enum pipe pipe = intel_crtc->pipe;
 
@@ -4152,7 +4378,7 @@ skl_compute_wm(struct drm_atomic_state *state)
         * since any racing commits that want to update them would need to
         * hold _all_ CRTC state mutexes.
         */
-       for_each_crtc_in_state(state, crtc, cstate, i)
+       for_each_new_crtc_in_state(state, crtc, cstate, i)
                changed = true;
        if (!changed)
                return 0;
@@ -4174,7 +4400,7 @@ skl_compute_wm(struct drm_atomic_state *state)
         * should allow skl_update_pipe_wm() to return failure in cases where
         * no suitable watermark values can be found.
         */
-       for_each_crtc_in_state(state, crtc, cstate, i) {
+       for_each_new_crtc_in_state(state, crtc, cstate, i) {
                struct intel_crtc_state *intel_cstate =
                        to_intel_crtc_state(cstate);
                const struct skl_pipe_wm *old_pipe_wm =
@@ -4539,15 +4765,11 @@ void vlv_wm_get_hw_state(struct drm_device *dev)
 {
        struct drm_i915_private *dev_priv = to_i915(dev);
        struct vlv_wm_values *wm = &dev_priv->wm.vlv;
-       struct intel_plane *plane;
-       enum pipe pipe;
+       struct intel_crtc *crtc;
        u32 val;
 
        vlv_read_wm_values(dev_priv, wm);
 
-       for_each_intel_plane(dev, plane)
-               plane->wm.fifo_size = vlv_get_fifo_size(plane);
-
        wm->cxsr = I915_READ(FW_BLC_SELF_VLV) & FW_CSPWRDWNEN;
        wm->level = VLV_WM_LEVEL_PM2;
 
@@ -4585,18 +4807,107 @@ void vlv_wm_get_hw_state(struct drm_device *dev)
                mutex_unlock(&dev_priv->rps.hw_lock);
        }
 
-       for_each_pipe(dev_priv, pipe)
+       for_each_intel_crtc(dev, crtc) {
+               struct intel_crtc_state *crtc_state =
+                       to_intel_crtc_state(crtc->base.state);
+               struct vlv_wm_state *active = &crtc->wm.active.vlv;
+               const struct vlv_fifo_state *fifo_state =
+                       &crtc_state->wm.vlv.fifo_state;
+               enum pipe pipe = crtc->pipe;
+               enum plane_id plane_id;
+               int level;
+
+               vlv_get_fifo_size(crtc_state);
+
+               active->num_levels = wm->level + 1;
+               active->cxsr = wm->cxsr;
+
+               for (level = 0; level < active->num_levels; level++) {
+                       struct g4x_pipe_wm *raw =
+                               &crtc_state->wm.vlv.raw[level];
+
+                       active->sr[level].plane = wm->sr.plane;
+                       active->sr[level].cursor = wm->sr.cursor;
+
+                       for_each_plane_id_on_crtc(crtc, plane_id) {
+                               active->wm[level].plane[plane_id] =
+                                       wm->pipe[pipe].plane[plane_id];
+
+                               raw->plane[plane_id] =
+                                       vlv_invert_wm_value(active->wm[level].plane[plane_id],
+                                                           fifo_state->plane[plane_id]);
+                       }
+               }
+
+               for_each_plane_id_on_crtc(crtc, plane_id)
+                       vlv_raw_plane_wm_set(crtc_state, level,
+                                            plane_id, USHRT_MAX);
+               vlv_invalidate_wms(crtc, active, level);
+
+               crtc_state->wm.vlv.optimal = *active;
+               crtc_state->wm.vlv.intermediate = *active;
+
                DRM_DEBUG_KMS("Initial watermarks: pipe %c, plane=%d, cursor=%d, sprite0=%d, sprite1=%d\n",
                              pipe_name(pipe),
                              wm->pipe[pipe].plane[PLANE_PRIMARY],
                              wm->pipe[pipe].plane[PLANE_CURSOR],
                              wm->pipe[pipe].plane[PLANE_SPRITE0],
                              wm->pipe[pipe].plane[PLANE_SPRITE1]);
+       }
 
        DRM_DEBUG_KMS("Initial watermarks: SR plane=%d, SR cursor=%d level=%d cxsr=%d\n",
                      wm->sr.plane, wm->sr.cursor, wm->level, wm->cxsr);
 }
 
+void vlv_wm_sanitize(struct drm_i915_private *dev_priv)
+{
+       struct intel_plane *plane;
+       struct intel_crtc *crtc;
+
+       mutex_lock(&dev_priv->wm.wm_mutex);
+
+       for_each_intel_plane(&dev_priv->drm, plane) {
+               struct intel_crtc *crtc =
+                       intel_get_crtc_for_pipe(dev_priv, plane->pipe);
+               struct intel_crtc_state *crtc_state =
+                       to_intel_crtc_state(crtc->base.state);
+               struct intel_plane_state *plane_state =
+                       to_intel_plane_state(plane->base.state);
+               struct vlv_wm_state *wm_state = &crtc_state->wm.vlv.optimal;
+               const struct vlv_fifo_state *fifo_state =
+                       &crtc_state->wm.vlv.fifo_state;
+               enum plane_id plane_id = plane->id;
+               int level;
+
+               if (plane_state->base.visible)
+                       continue;
+
+               for (level = 0; level < wm_state->num_levels; level++) {
+                       struct g4x_pipe_wm *raw =
+                               &crtc_state->wm.vlv.raw[level];
+
+                       raw->plane[plane_id] = 0;
+
+                       wm_state->wm[level].plane[plane_id] =
+                               vlv_invert_wm_value(raw->plane[plane_id],
+                                                   fifo_state->plane[plane_id]);
+               }
+       }
+
+       for_each_intel_crtc(&dev_priv->drm, crtc) {
+               struct intel_crtc_state *crtc_state =
+                       to_intel_crtc_state(crtc->base.state);
+
+               crtc_state->wm.vlv.intermediate =
+                       crtc_state->wm.vlv.optimal;
+               crtc->wm.active.vlv = crtc_state->wm.vlv.optimal;
+       }
+
+       vlv_program_watermarks(dev_priv);
+
+       mutex_unlock(&dev_priv->wm.wm_mutex);
+}
+
 void ilk_wm_get_hw_state(struct drm_device *dev)
 {
        struct drm_i915_private *dev_priv = to_i915(dev);
@@ -4680,7 +4991,7 @@ bool ironlake_set_drps(struct drm_i915_private *dev_priv, u8 val)
 {
        u16 rgvswctl;
 
-       assert_spin_locked(&mchdev_lock);
+       lockdep_assert_held(&mchdev_lock);
 
        rgvswctl = I915_READ16(MEMSWCTL);
        if (rgvswctl & MEMCTL_CMD_STS) {
@@ -4891,6 +5202,12 @@ static void gen6_set_rps_thresholds(struct drm_i915_private *dev_priv, u8 val)
                break;
        }
 
+       /* When byt can survive without system hang with dynamic
+        * sw freq adjustments, this restriction can be lifted.
+        */
+       if (IS_VALLEYVIEW(dev_priv))
+               goto skip_hw_write;
+
        I915_WRITE(GEN6_RP_UP_EI,
                   GT_INTERVAL_FROM_US(dev_priv, ei_up));
        I915_WRITE(GEN6_RP_UP_THRESHOLD,
@@ -4911,6 +5228,7 @@ static void gen6_set_rps_thresholds(struct drm_i915_private *dev_priv, u8 val)
                   GEN6_RP_UP_BUSY_AVG |
                   GEN6_RP_DOWN_IDLE_AVG);
 
+skip_hw_write:
        dev_priv->rps.power = new_power;
        dev_priv->rps.up_threshold = threshold_up;
        dev_priv->rps.down_threshold = threshold_down;
@@ -4921,8 +5239,9 @@ static u32 gen6_rps_pm_mask(struct drm_i915_private *dev_priv, u8 val)
 {
        u32 mask = 0;
 
+       /* We use UP_EI_EXPIRED interrupts for both up/down in manual mode */
        if (val > dev_priv->rps.min_freq_softlimit)
-               mask |= GEN6_PM_RP_DOWN_EI_EXPIRED | GEN6_PM_RP_DOWN_THRESHOLD | GEN6_PM_RP_DOWN_TIMEOUT;
+               mask |= GEN6_PM_RP_UP_EI_EXPIRED | GEN6_PM_RP_DOWN_THRESHOLD | GEN6_PM_RP_DOWN_TIMEOUT;
        if (val < dev_priv->rps.max_freq_softlimit)
                mask |= GEN6_PM_RP_UP_EI_EXPIRED | GEN6_PM_RP_UP_THRESHOLD;
 
@@ -4934,16 +5253,8 @@ static u32 gen6_rps_pm_mask(struct drm_i915_private *dev_priv, u8 val)
 /* gen6_set_rps is called to update the frequency request, but should also be
  * called when the range (min_delay and max_delay) is modified so that we can
  * update the GEN6_RP_INTERRUPT_LIMITS register accordingly. */
-static void gen6_set_rps(struct drm_i915_private *dev_priv, u8 val)
+static int gen6_set_rps(struct drm_i915_private *dev_priv, u8 val)
 {
-       /* WaGsvDisableTurbo: Workaround to disable turbo on BXT A* */
-       if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1))
-               return;
-
-       WARN_ON(!mutex_is_locked(&dev_priv->rps.hw_lock));
-       WARN_ON(val > dev_priv->rps.max_freq);
-       WARN_ON(val < dev_priv->rps.min_freq);
-
        /* min/max delay may still have been modified so be sure to
         * write the limits value.
         */
@@ -4969,17 +5280,15 @@ static void gen6_set_rps(struct drm_i915_private *dev_priv, u8 val)
        I915_WRITE(GEN6_RP_INTERRUPT_LIMITS, intel_rps_limits(dev_priv, val));
        I915_WRITE(GEN6_PMINTRMSK, gen6_rps_pm_mask(dev_priv, val));
 
-       POSTING_READ(GEN6_RPNSWREQ);
-
        dev_priv->rps.cur_freq = val;
        trace_intel_gpu_freq_change(intel_gpu_freq(dev_priv, val));
+
+       return 0;
 }
 
-static void valleyview_set_rps(struct drm_i915_private *dev_priv, u8 val)
+static int valleyview_set_rps(struct drm_i915_private *dev_priv, u8 val)
 {
-       WARN_ON(!mutex_is_locked(&dev_priv->rps.hw_lock));
-       WARN_ON(val > dev_priv->rps.max_freq);
-       WARN_ON(val < dev_priv->rps.min_freq);
+       int err;
 
        if (WARN_ONCE(IS_CHERRYVIEW(dev_priv) && (val & 1),
                      "Odd GPU freq value\n"))
@@ -4988,13 +5297,17 @@ static void valleyview_set_rps(struct drm_i915_private *dev_priv, u8 val)
        I915_WRITE(GEN6_PMINTRMSK, gen6_rps_pm_mask(dev_priv, val));
 
        if (val != dev_priv->rps.cur_freq) {
-               vlv_punit_write(dev_priv, PUNIT_REG_GPU_FREQ_REQ, val);
-               if (!IS_CHERRYVIEW(dev_priv))
-                       gen6_set_rps_thresholds(dev_priv, val);
+               err = vlv_punit_write(dev_priv, PUNIT_REG_GPU_FREQ_REQ, val);
+               if (err)
+                       return err;
+
+               gen6_set_rps_thresholds(dev_priv, val);
        }
 
        dev_priv->rps.cur_freq = val;
        trace_intel_gpu_freq_change(intel_gpu_freq(dev_priv, val));
+
+       return 0;
 }
 
 /* vlv_set_rps_idle: Set the frequency to idle, if Gfx clocks are down
@@ -5007,6 +5320,7 @@ static void valleyview_set_rps(struct drm_i915_private *dev_priv, u8 val)
 static void vlv_set_rps_idle(struct drm_i915_private *dev_priv)
 {
        u32 val = dev_priv->rps.idle_freq;
+       int err;
 
        if (dev_priv->rps.cur_freq <= val)
                return;
@@ -5024,26 +5338,37 @@ static void vlv_set_rps_idle(struct drm_i915_private *dev_priv)
         * power than the render powerwell.
         */
        intel_uncore_forcewake_get(dev_priv, FORCEWAKE_MEDIA);
-       valleyview_set_rps(dev_priv, val);
+       err = valleyview_set_rps(dev_priv, val);
        intel_uncore_forcewake_put(dev_priv, FORCEWAKE_MEDIA);
+
+       if (err)
+               DRM_ERROR("Failed to set RPS for idle\n");
 }
 
 void gen6_rps_busy(struct drm_i915_private *dev_priv)
 {
        mutex_lock(&dev_priv->rps.hw_lock);
        if (dev_priv->rps.enabled) {
-               if (dev_priv->pm_rps_events & (GEN6_PM_RP_DOWN_EI_EXPIRED | GEN6_PM_RP_UP_EI_EXPIRED))
+               u8 freq;
+
+               if (dev_priv->pm_rps_events & GEN6_PM_RP_UP_EI_EXPIRED)
                        gen6_rps_reset_ei(dev_priv);
                I915_WRITE(GEN6_PMINTRMSK,
                           gen6_rps_pm_mask(dev_priv, dev_priv->rps.cur_freq));
 
                gen6_enable_rps_interrupts(dev_priv);
 
-               /* Ensure we start at the user's desired frequency */
-               intel_set_rps(dev_priv,
-                             clamp(dev_priv->rps.cur_freq,
-                                   dev_priv->rps.min_freq_softlimit,
-                                   dev_priv->rps.max_freq_softlimit));
+               /* Use the user's desired frequency as a guide, but for better
+                * performance, jump directly to RPe as our starting frequency.
+                */
+               freq = max(dev_priv->rps.cur_freq,
+                          dev_priv->rps.efficient_freq);
+
+               if (intel_set_rps(dev_priv,
+                                 clamp(freq,
+                                       dev_priv->rps.min_freq_softlimit,
+                                       dev_priv->rps.max_freq_softlimit)))
+                       DRM_DEBUG_DRIVER("Failed to set idle frequency\n");
        }
        mutex_unlock(&dev_priv->rps.hw_lock);
 }
@@ -5111,12 +5436,25 @@ void gen6_rps_boost(struct drm_i915_private *dev_priv,
        spin_unlock(&dev_priv->rps.client_lock);
 }
 
-void intel_set_rps(struct drm_i915_private *dev_priv, u8 val)
+int intel_set_rps(struct drm_i915_private *dev_priv, u8 val)
 {
+       int err;
+
+       lockdep_assert_held(&dev_priv->rps.hw_lock);
+       GEM_BUG_ON(val > dev_priv->rps.max_freq);
+       GEM_BUG_ON(val < dev_priv->rps.min_freq);
+
+       if (!dev_priv->rps.enabled) {
+               dev_priv->rps.cur_freq = val;
+               return 0;
+       }
+
        if (IS_VALLEYVIEW(dev_priv) || IS_CHERRYVIEW(dev_priv))
-               valleyview_set_rps(dev_priv, val);
+               err = valleyview_set_rps(dev_priv, val);
        else
-               gen6_set_rps(dev_priv, val);
+               err = gen6_set_rps(dev_priv, val);
+
+       return err;
 }
 
 static void gen9_disable_rc6(struct drm_i915_private *dev_priv)
@@ -5294,7 +5632,7 @@ static void gen6_init_rps_frequencies(struct drm_i915_private *dev_priv)
 
        dev_priv->rps.efficient_freq = dev_priv->rps.rp1_freq;
        if (IS_HASWELL(dev_priv) || IS_BROADWELL(dev_priv) ||
-           IS_SKYLAKE(dev_priv) || IS_KABYLAKE(dev_priv)) {
+           IS_GEN9_BC(dev_priv)) {
                u32 ddcc_status = 0;
 
                if (sandybridge_pcode_read(dev_priv,
@@ -5307,7 +5645,7 @@ static void gen6_init_rps_frequencies(struct drm_i915_private *dev_priv)
                                        dev_priv->rps.max_freq);
        }
 
-       if (IS_SKYLAKE(dev_priv) || IS_KABYLAKE(dev_priv)) {
+       if (IS_GEN9_BC(dev_priv)) {
                /* Store the frequency values in 16.66 MHZ units, which is
                 * the natural hardware unit for SKL
                 */
@@ -5320,7 +5658,7 @@ static void gen6_init_rps_frequencies(struct drm_i915_private *dev_priv)
 }
 
 static void reset_rps(struct drm_i915_private *dev_priv,
-                     void (*set)(struct drm_i915_private *, u8))
+                     int (*set)(struct drm_i915_private *, u8))
 {
        u8 freq = dev_priv->rps.cur_freq;
 
@@ -5328,7 +5666,8 @@ static void reset_rps(struct drm_i915_private *dev_priv,
        dev_priv->rps.power = -1;
        dev_priv->rps.cur_freq = -1;
 
-       set(dev_priv, freq);
+       if (set(dev_priv, freq))
+               DRM_ERROR("Failed to reset RPS to initial values\n");
 }
 
 /* See the Gen9_GT_PM_Programming_Guide doc for the below */
@@ -5336,22 +5675,6 @@ static void gen9_enable_rps(struct drm_i915_private *dev_priv)
 {
        intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
 
-       /* WaGsvDisableTurbo: Workaround to disable turbo on BXT A* */
-       if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1)) {
-               /*
-                * BIOS could leave the Hw Turbo enabled, so need to explicitly
-                * clear out the Control register just to avoid inconsitency
-                * with debugfs interface, which will show  Turbo as enabled
-                * only and that is not expected by the User after adding the
-                * WaGsvDisableTurbo. Apart from this there is no problem even
-                * if the Turbo is left enabled in the Control register, as the
-                * Up/Down interrupts would remain masked.
-                */
-               gen9_disable_rps(dev_priv);
-               intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
-               return;
-       }
-
        /* Program defaults and thresholds for RPS*/
        I915_WRITE(GEN6_RC_VIDEO_FREQ,
                GEN9_FREQUENCY(dev_priv->rps.rp1_freq));
@@ -5411,18 +5734,9 @@ static void gen9_enable_rc6(struct drm_i915_private *dev_priv)
        if (intel_enable_rc6() & INTEL_RC6_ENABLE)
                rc6_mask = GEN6_RC_CTL_RC6_ENABLE;
        DRM_INFO("RC6 %s\n", onoff(rc6_mask & GEN6_RC_CTL_RC6_ENABLE));
-       /* WaRsUseTimeoutMode:bxt */
-       if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1)) {
-               I915_WRITE(GEN6_RC6_THRESHOLD, 625); /* 800us */
-               I915_WRITE(GEN6_RC_CONTROL, GEN6_RC_CTL_HW_ENABLE |
-                          GEN7_RC_CTL_TO_MODE |
-                          rc6_mask);
-       } else {
-               I915_WRITE(GEN6_RC6_THRESHOLD, 37500); /* 37.5/125ms per EI */
-               I915_WRITE(GEN6_RC_CONTROL, GEN6_RC_CTL_HW_ENABLE |
-                          GEN6_RC_CTL_EI_MODE(1) |
-                          rc6_mask);
-       }
+       I915_WRITE(GEN6_RC6_THRESHOLD, 37500); /* 37.5/125ms per EI */
+       I915_WRITE(GEN6_RC_CONTROL,
+                  GEN6_RC_CTL_HW_ENABLE | GEN6_RC_CTL_EI_MODE(1) | rc6_mask);
 
        /*
         * 3b: Enable Coarse Power Gating only when RC6 is enabled.
@@ -5637,7 +5951,7 @@ static void gen6_update_ring_freq(struct drm_i915_private *dev_priv)
        /* convert DDR frequency from units of 266.6MHz to bandwidth */
        min_ring_freq = mult_frac(min_ring_freq, 8, 3);
 
-       if (IS_SKYLAKE(dev_priv) || IS_KABYLAKE(dev_priv)) {
+       if (IS_GEN9_BC(dev_priv)) {
                /* Convert GT frequency to 50 HZ units */
                min_gpu_freq = dev_priv->rps.min_freq / GEN9_FREQ_SCALER;
                max_gpu_freq = dev_priv->rps.max_freq / GEN9_FREQ_SCALER;
@@ -5655,7 +5969,7 @@ static void gen6_update_ring_freq(struct drm_i915_private *dev_priv)
                int diff = max_gpu_freq - gpu_freq;
                unsigned int ia_freq = 0, ring_freq = 0;
 
-               if (IS_SKYLAKE(dev_priv) || IS_KABYLAKE(dev_priv)) {
+               if (IS_GEN9_BC(dev_priv)) {
                        /*
                         * ring_freq = 2 * GT. ring_freq is in 100MHz units
                         * No floor required for ring frequency on SKL.
@@ -5739,6 +6053,17 @@ static int cherryview_rps_guar_freq(struct drm_i915_private *dev_priv)
        return rp1;
 }
 
+static u32 cherryview_rps_min_freq(struct drm_i915_private *dev_priv)
+{
+       u32 val, rpn;   /* val: raw Punit read; rpn: field extracted from it */
+
+       val = vlv_punit_read(dev_priv, FB_GFX_FMIN_AT_VMIN_FUSE);
+       rpn = ((val >> FB_GFX_FMIN_AT_VMIN_FUSE_SHIFT) &
+                      FB_GFX_FREQ_FUSE_MASK);
+
+       return rpn;     /* caller stores this as rps.min_freq */
+}
+
 static int valleyview_rps_guar_freq(struct drm_i915_private *dev_priv)
 {
        u32 val, rp1;
@@ -5975,8 +6300,7 @@ static void cherryview_init_gt_powersave(struct drm_i915_private *dev_priv)
                         intel_gpu_freq(dev_priv, dev_priv->rps.rp1_freq),
                         dev_priv->rps.rp1_freq);
 
-       /* PUnit validated range is only [RPe, RP0] */
-       dev_priv->rps.min_freq = dev_priv->rps.efficient_freq;
+       dev_priv->rps.min_freq = cherryview_rps_min_freq(dev_priv);
        DRM_DEBUG_DRIVER("min GPU freq: %d MHz (%u)\n",
                         intel_gpu_freq(dev_priv, dev_priv->rps.min_freq),
                         dev_priv->rps.min_freq);
@@ -6132,7 +6456,8 @@ static void valleyview_enable_rps(struct drm_i915_private *dev_priv)
 
        /* allows RC6 residency counter to work */
        I915_WRITE(VLV_COUNTER_CONTROL,
-                  _MASKED_BIT_ENABLE(VLV_MEDIA_RC0_COUNT_EN |
+                  _MASKED_BIT_ENABLE(VLV_COUNT_RANGE_HIGH |
+                                     VLV_MEDIA_RC0_COUNT_EN |
                                      VLV_RENDER_RC0_COUNT_EN |
                                      VLV_MEDIA_RC6_COUNT_EN |
                                      VLV_RENDER_RC6_COUNT_EN));
@@ -6199,7 +6524,7 @@ static unsigned long __i915_chipset_val(struct drm_i915_private *dev_priv)
        unsigned long now = jiffies_to_msecs(jiffies), diff1;
        int i;
 
-       assert_spin_locked(&mchdev_lock);
+       lockdep_assert_held(&mchdev_lock);
 
        diff1 = now - dev_priv->ips.last_time1;
 
@@ -6304,7 +6629,7 @@ static void __i915_update_gfx_val(struct drm_i915_private *dev_priv)
        u64 now, diff, diffms;
        u32 count;
 
-       assert_spin_locked(&mchdev_lock);
+       lockdep_assert_held(&mchdev_lock);
 
        now = ktime_get_raw_ns();
        diffms = now - dev_priv->ips.last_time2;
@@ -6349,7 +6674,7 @@ static unsigned long __i915_gfx_val(struct drm_i915_private *dev_priv)
        unsigned long t, corr, state1, corr2, state2;
        u32 pxvid, ext_v;
 
-       assert_spin_locked(&mchdev_lock);
+       lockdep_assert_held(&mchdev_lock);
 
        pxvid = I915_READ(PXVFREQ(dev_priv->rps.cur_freq));
        pxvid = (pxvid >> 24) & 0x7f;
@@ -6775,7 +7100,7 @@ void intel_enable_gt_powersave(struct drm_i915_private *dev_priv)
        } else if (INTEL_GEN(dev_priv) >= 9) {
                gen9_enable_rc6(dev_priv);
                gen9_enable_rps(dev_priv);
-               if (IS_SKYLAKE(dev_priv) || IS_KABYLAKE(dev_priv))
+               if (IS_GEN9_BC(dev_priv))
                        gen6_update_ring_freq(dev_priv);
        } else if (IS_BROADWELL(dev_priv)) {
                gen8_enable_rps(dev_priv);
@@ -6825,7 +7150,7 @@ static void __intel_autoenable_gt_powersave(struct work_struct *work)
                rcs->init_context(req);
 
        /* Mark the device busy, calling intel_enable_gt_powersave() */
-       i915_add_request_no_flush(req);
+       i915_add_request(req);
 
 unlock:
        mutex_unlock(&dev_priv->drm.struct_mutex);
@@ -7260,6 +7585,14 @@ static void broadwell_init_clock_gating(struct drm_i915_private *dev_priv)
                   | KVM_CONFIG_CHANGE_NOTIFICATION_SELECT);
 
        lpt_init_clock_gating(dev_priv);
+
+       /* WaDisableDopClockGating:bdw
+        *
+        * Also see the CHICKEN2 write in bdw_init_workarounds() to disable DOP
+        * clock gating.
+        */
+       I915_WRITE(GEN6_UCGCTL1,
+                  I915_READ(GEN6_UCGCTL1) | GEN6_EU_TCUNIT_CLOCK_GATE_DISABLE);
 }
 
 static void haswell_init_clock_gating(struct drm_i915_private *dev_priv)
@@ -7656,8 +7989,10 @@ void intel_init_clock_gating_hooks(struct drm_i915_private *dev_priv)
                dev_priv->display.init_clock_gating = skylake_init_clock_gating;
        else if (IS_KABYLAKE(dev_priv))
                dev_priv->display.init_clock_gating = kabylake_init_clock_gating;
-       else if (IS_GEN9_LP(dev_priv))
+       else if (IS_BROXTON(dev_priv))
                dev_priv->display.init_clock_gating = bxt_init_clock_gating;
+       else if (IS_GEMINILAKE(dev_priv))
+               dev_priv->display.init_clock_gating = glk_init_clock_gating;
        else if (IS_BROADWELL(dev_priv))
                dev_priv->display.init_clock_gating = broadwell_init_clock_gating;
        else if (IS_CHERRYVIEW(dev_priv))
@@ -7727,7 +8062,11 @@ void intel_init_pm(struct drm_i915_private *dev_priv)
                }
        } else if (IS_VALLEYVIEW(dev_priv) || IS_CHERRYVIEW(dev_priv)) {
                vlv_setup_wm_latency(dev_priv);
-               dev_priv->display.update_wm = vlv_update_wm;
+               dev_priv->display.compute_pipe_wm = vlv_compute_pipe_wm;
+               dev_priv->display.compute_intermediate_wm = vlv_compute_intermediate_wm;
+               dev_priv->display.initial_watermarks = vlv_initial_watermarks;
+               dev_priv->display.optimize_watermarks = vlv_optimize_watermarks;
+               dev_priv->display.atomic_update_watermarks = vlv_atomic_update_fifo;
        } else if (IS_PINEVIEW(dev_priv)) {
                if (!intel_get_cxsr_latency(IS_PINEVIEW_G(dev_priv),
                                            dev_priv->is_ddr3,
@@ -7780,7 +8119,7 @@ static inline int gen6_check_mailbox_status(struct drm_i915_private *dev_priv)
        case GEN6_PCODE_TIMEOUT:
                return -ETIMEDOUT;
        default:
-               MISSING_CASE(flags)
+               MISSING_CASE(flags);
                return 0;
        }
 }
@@ -7827,9 +8166,9 @@ int sandybridge_pcode_read(struct drm_i915_private *dev_priv, u32 mbox, u32 *val
        I915_WRITE_FW(GEN6_PCODE_DATA1, 0);
        I915_WRITE_FW(GEN6_PCODE_MAILBOX, GEN6_PCODE_READY | mbox);
 
-       if (intel_wait_for_register_fw(dev_priv,
-                                      GEN6_PCODE_MAILBOX, GEN6_PCODE_READY, 0,
-                                      500)) {
+       if (__intel_wait_for_register_fw(dev_priv,
+                                        GEN6_PCODE_MAILBOX, GEN6_PCODE_READY, 0,
+                                        500, 0, NULL)) {
                DRM_ERROR("timeout waiting for pcode read (%d) to finish\n", mbox);
                return -ETIMEDOUT;
        }
@@ -7872,9 +8211,9 @@ int sandybridge_pcode_write(struct drm_i915_private *dev_priv,
        I915_WRITE_FW(GEN6_PCODE_DATA1, 0);
        I915_WRITE_FW(GEN6_PCODE_MAILBOX, GEN6_PCODE_READY | mbox);
 
-       if (intel_wait_for_register_fw(dev_priv,
-                                      GEN6_PCODE_MAILBOX, GEN6_PCODE_READY, 0,
-                                      500)) {
+       if (__intel_wait_for_register_fw(dev_priv,
+                                        GEN6_PCODE_MAILBOX, GEN6_PCODE_READY, 0,
+                                        500, 0, NULL)) {
                DRM_ERROR("timeout waiting for pcode write (%d) to finish\n", mbox);
                return -ETIMEDOUT;
        }
@@ -7916,10 +8255,10 @@ static bool skl_pcode_try_request(struct drm_i915_private *dev_priv, u32 mbox,
  * @timeout_base_ms: timeout for polling with preemption enabled
  *
  * Keep resending the @request to @mbox until PCODE acknowledges it, PCODE
- * reports an error or an overall timeout of @timeout_base_ms+10 ms expires.
+ * reports an error or an overall timeout of @timeout_base_ms+50 ms expires.
  * The request is acknowledged once the PCODE reply dword equals @reply after
  * applying @reply_mask. Polling is first attempted with preemption enabled
- * for @timeout_base_ms and if this times out for another 10 ms with
+ * for @timeout_base_ms and if this times out for another 50 ms with
  * preemption disabled.
  *
  * Returns 0 on success, %-ETIMEDOUT in case of a timeout, <0 in case of some
@@ -7955,14 +8294,15 @@ int skl_pcode_request(struct drm_i915_private *dev_priv, u32 mbox, u32 request,
         * worst case) _and_ PCODE was busy for some reason even after a
         * (queued) request and @timeout_base_ms delay. As a workaround retry
         * the poll with preemption disabled to maximize the number of
-        * requests. Increase the timeout from @timeout_base_ms to 10ms to
+        * requests. Increase the timeout from @timeout_base_ms to 50ms to
         * account for interrupts that could reduce the number of these
-        * requests.
+        * requests, and for any quirks of the PCODE firmware that delays
+        * the request completion.
         */
        DRM_DEBUG_KMS("PCODE timeout, retrying with preemption disabled\n");
        WARN_ON_ONCE(timeout_base_ms > 3);
        preempt_disable();
-       ret = wait_for_atomic(COND, 10);
+       ret = wait_for_atomic(COND, 50);
        preempt_enable();
 
 out:
@@ -8074,3 +8414,80 @@ void intel_pm_setup(struct drm_i915_private *dev_priv)
        dev_priv->pm.suspended = false;
        atomic_set(&dev_priv->pm.wakeref_count, 0);
 }
+
+static u64 vlv_residency_raw(struct drm_i915_private *dev_priv,
+                            const i915_reg_t reg)
+{
+       u32 lower, upper, tmp;
+       int loop = 2;   /* upper bound on re-read attempts, see loop below */
+
+       /* The registers accessed do not need forcewake. We borrow the
+        * uncore lock to prevent concurrent access to the range reg.
+        */
+       spin_lock_irq(&dev_priv->uncore.lock);
+
+       /* vlv and chv residency counters are 40 bits in width.
+        * With a control bit, we can choose between the upper or lower
+        * 32bit window into this counter.
+        *
+        * Although we always use the counter in high-range mode elsewhere,
+        * userspace may attempt to read the value before rc6 is initialised,
+        * before we have set the default VLV_COUNTER_CONTROL value. So always
+        * set the high bit to be safe.
+        */
+       I915_WRITE_FW(VLV_COUNTER_CONTROL,
+                     _MASKED_BIT_ENABLE(VLV_COUNT_RANGE_HIGH));
+       upper = I915_READ_FW(reg);
+       do {
+               tmp = upper;
+
+               I915_WRITE_FW(VLV_COUNTER_CONTROL,
+                             _MASKED_BIT_DISABLE(VLV_COUNT_RANGE_HIGH));
+               lower = I915_READ_FW(reg);
+
+               I915_WRITE_FW(VLV_COUNTER_CONTROL,
+                             _MASKED_BIT_ENABLE(VLV_COUNT_RANGE_HIGH));
+               upper = I915_READ_FW(reg);
+       } while (upper != tmp && --loop); /* retry if upper moved mid-read */
+
+       /* Everywhere else we always use VLV_COUNTER_CONTROL with the
+        * VLV_COUNT_RANGE_HIGH bit set - so it is safe to leave it set
+        * now.
+        */
+
+       spin_unlock_irq(&dev_priv->uncore.lock);
+
+       return lower | (u64)upper << 8; /* high window starts at bit 8 */
+}
+
+u64 intel_rc6_residency_us(struct drm_i915_private *dev_priv,
+                          const i915_reg_t reg)
+{
+       u64 time_hw, units, div;
+
+       if (!intel_enable_rc6())
+               return 0;
+
+       intel_runtime_pm_get(dev_priv);
+
+       /* On VLV and CHV, residency time is in CZ units rather than 1.28us */
+       if (IS_VALLEYVIEW(dev_priv) || IS_CHERRYVIEW(dev_priv)) {
+               units = 1000;
+               div = dev_priv->czclk_freq;
+
+               time_hw = vlv_residency_raw(dev_priv, reg);
+       } else if (IS_GEN9_LP(dev_priv)) {
+               units = 1000;
+               div = 1200;             /* 833.33ns per raw count */
+
+               time_hw = I915_READ(reg);
+       } else {
+               units = 128000; /* 1.28us per raw count */
+               div = 100000;
+
+               time_hw = I915_READ(reg);
+       }
+
+       intel_runtime_pm_put(dev_priv);
+       return DIV_ROUND_UP_ULL(time_hw * units, div); /* raw counts -> us */
+}