asedeno.scripts.mit.edu Git - linux.git/blobdiff - drivers/gpu/drm/i915/intel_ringbuffer.c
drm/i915: Flush GPU relocs harder for gen3
[linux.git] / drivers / gpu / drm / i915 / intel_ringbuffer.c
index d0ef50bf930ad747abe7b4510521f8ad79923ba5..1f8d2a66c791fee7a4e279942324015d6add1371 100644 (file)
@@ -69,19 +69,28 @@ unsigned int intel_ring_update_space(struct intel_ring *ring)
 static int
 gen2_render_ring_flush(struct i915_request *rq, u32 mode)
 {
+       unsigned int num_store_dw;
        u32 cmd, *cs;
 
        cmd = MI_FLUSH;
-
+       num_store_dw = 0;
        if (mode & EMIT_INVALIDATE)
                cmd |= MI_READ_FLUSH;
+       if (mode & EMIT_FLUSH)
+               num_store_dw = 4;
 
-       cs = intel_ring_begin(rq, 2);
+       cs = intel_ring_begin(rq, 2 + 3 * num_store_dw);
        if (IS_ERR(cs))
                return PTR_ERR(cs);
 
        *cs++ = cmd;
-       *cs++ = MI_NOOP;
+       while (num_store_dw--) {
+               *cs++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
+               *cs++ = i915_scratch_offset(rq->i915);
+               *cs++ = 0;
+       }
+       *cs++ = MI_FLUSH | MI_NO_WRITE_FLUSH;
+
        intel_ring_advance(rq, cs);
 
        return 0;
@@ -91,6 +100,7 @@ static int
 gen4_render_ring_flush(struct i915_request *rq, u32 mode)
 {
        u32 cmd, *cs;
+       int i;
 
        /*
         * read/write caches:
@@ -127,12 +137,43 @@ gen4_render_ring_flush(struct i915_request *rq, u32 mode)
                        cmd |= MI_INVALIDATE_ISP;
        }
 
-       cs = intel_ring_begin(rq, 2);
+       i = 2;
+       if (mode & EMIT_INVALIDATE)
+               i += 20;
+
+       cs = intel_ring_begin(rq, i);
        if (IS_ERR(cs))
                return PTR_ERR(cs);
 
        *cs++ = cmd;
-       *cs++ = MI_NOOP;
+
+       /*
+        * A random delay to let the CS invalidate take effect? Without this
+        * delay, the GPU relocation path fails as the CS does not see
+        * the updated contents. Just as important, if we apply the flushes
+        * to the EMIT_FLUSH branch (i.e. immediately after the relocation
+        * write and before the invalidate on the next batch), the relocations
+        * still fail. This implies that is a delay following invalidation
+        * that is required to reset the caches as opposed to a delay to
+        * ensure the memory is written.
+        */
+       if (mode & EMIT_INVALIDATE) {
+               *cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE;
+               *cs++ = i915_scratch_offset(rq->i915) | PIPE_CONTROL_GLOBAL_GTT;
+               *cs++ = 0;
+               *cs++ = 0;
+
+               for (i = 0; i < 12; i++)
+                       *cs++ = MI_FLUSH;
+
+               *cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE;
+               *cs++ = i915_scratch_offset(rq->i915) | PIPE_CONTROL_GLOBAL_GTT;
+               *cs++ = 0;
+               *cs++ = 0;
+       }
+
+       *cs++ = cmd;
+
        intel_ring_advance(rq, cs);
 
        return 0;
@@ -178,8 +219,7 @@ gen4_render_ring_flush(struct i915_request *rq, u32 mode)
 static int
 intel_emit_post_sync_nonzero_flush(struct i915_request *rq)
 {
-       u32 scratch_addr =
-               i915_ggtt_offset(rq->engine->scratch) + 2 * CACHELINE_BYTES;
+       u32 scratch_addr = i915_scratch_offset(rq->i915) + 2 * CACHELINE_BYTES;
        u32 *cs;
 
        cs = intel_ring_begin(rq, 6);
@@ -212,8 +252,7 @@ intel_emit_post_sync_nonzero_flush(struct i915_request *rq)
 static int
 gen6_render_ring_flush(struct i915_request *rq, u32 mode)
 {
-       u32 scratch_addr =
-               i915_ggtt_offset(rq->engine->scratch) + 2 * CACHELINE_BYTES;
+       u32 scratch_addr = i915_scratch_offset(rq->i915) + 2 * CACHELINE_BYTES;
        u32 *cs, flags = 0;
        int ret;
 
@@ -282,8 +321,7 @@ gen7_render_ring_cs_stall_wa(struct i915_request *rq)
 static int
 gen7_render_ring_flush(struct i915_request *rq, u32 mode)
 {
-       u32 scratch_addr =
-               i915_ggtt_offset(rq->engine->scratch) + 2 * CACHELINE_BYTES;
+       u32 scratch_addr = i915_scratch_offset(rq->i915) + 2 * CACHELINE_BYTES;
        u32 *cs, flags = 0;
 
        /*
@@ -937,7 +975,7 @@ i965_emit_bb_start(struct i915_request *rq,
 }
 
 /* Just userspace ABI convention to limit the wa batch bo to a resonable size */
-#define I830_BATCH_LIMIT (256*1024)
+#define I830_BATCH_LIMIT SZ_256K
 #define I830_TLB_ENTRIES (2)
 #define I830_WA_SIZE max(I830_TLB_ENTRIES*4096, I830_BATCH_LIMIT)
 static int
@@ -945,7 +983,9 @@ i830_emit_bb_start(struct i915_request *rq,
                   u64 offset, u32 len,
                   unsigned int dispatch_flags)
 {
-       u32 *cs, cs_offset = i915_ggtt_offset(rq->engine->scratch);
+       u32 *cs, cs_offset = i915_scratch_offset(rq->i915);
+
+       GEM_BUG_ON(rq->i915->gt.scratch->size < I830_WA_SIZE);
 
        cs = intel_ring_begin(rq, 6);
        if (IS_ERR(cs))
@@ -1403,7 +1443,6 @@ static int intel_init_ring_buffer(struct intel_engine_cs *engine)
 {
        struct i915_timeline *timeline;
        struct intel_ring *ring;
-       unsigned int size;
        int err;
 
        intel_engine_setup_common(engine);
@@ -1428,21 +1467,12 @@ static int intel_init_ring_buffer(struct intel_engine_cs *engine)
        GEM_BUG_ON(engine->buffer);
        engine->buffer = ring;
 
-       size = PAGE_SIZE;
-       if (HAS_BROKEN_CS_TLB(engine->i915))
-               size = I830_WA_SIZE;
-       err = intel_engine_create_scratch(engine, size);
-       if (err)
-               goto err_unpin;
-
        err = intel_engine_init_common(engine);
        if (err)
-               goto err_scratch;
+               goto err_unpin;
 
        return 0;
 
-err_scratch:
-       intel_engine_cleanup_scratch(engine);
 err_unpin:
        intel_ring_unpin(ring);
 err_ring:
@@ -1516,7 +1546,7 @@ static int flush_pd_dir(struct i915_request *rq)
        /* Stall until the page table load is complete */
        *cs++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
        *cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine));
-       *cs++ = i915_ggtt_offset(engine->scratch);
+       *cs++ = i915_scratch_offset(rq->i915);
        *cs++ = MI_NOOP;
 
        intel_ring_advance(rq, cs);
@@ -1625,7 +1655,7 @@ static inline int mi_set_context(struct i915_request *rq, u32 flags)
                        /* Insert a delay before the next switch! */
                        *cs++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
                        *cs++ = i915_mmio_reg_offset(last_reg);
-                       *cs++ = i915_ggtt_offset(engine->scratch);
+                       *cs++ = i915_scratch_offset(rq->i915);
                        *cs++ = MI_NOOP;
                }
                *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;