asedeno.scripts.mit.edu Git - linux.git/blobdiff - drivers/gpu/drm/i915/intel_ringbuffer.c
drm/i915: Flush GPU relocs harder for gen3
[linux.git] / drivers / gpu / drm / i915 / intel_ringbuffer.c
index d0ef50bf930ad747abe7b4510521f8ad79923ba5..1f8d2a66c791fee7a4e279942324015d6add1371 100644 (file)
@@ -69,19 +69,28 @@ unsigned int intel_ring_update_space(struct intel_ring *ring)
 static int
 gen2_render_ring_flush(struct i915_request *rq, u32 mode)
 {
+       unsigned int num_store_dw;
        u32 cmd, *cs;
 
        cmd = MI_FLUSH;
-
+       num_store_dw = 0;
        if (mode & EMIT_INVALIDATE)
                cmd |= MI_READ_FLUSH;
+       if (mode & EMIT_FLUSH)
+               num_store_dw = 4;
 
-       cs = intel_ring_begin(rq, 2);
+       cs = intel_ring_begin(rq, 2 + 3 * num_store_dw);
        if (IS_ERR(cs))
                return PTR_ERR(cs);
 
        *cs++ = cmd;
-       *cs++ = MI_NOOP;
+       while (num_store_dw--) {
+               *cs++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
+               *cs++ = i915_scratch_offset(rq->i915);
+               *cs++ = 0;
+       }
+       *cs++ = MI_FLUSH | MI_NO_WRITE_FLUSH;
+
        intel_ring_advance(rq, cs);
 
        return 0;
@@ -91,6 +100,7 @@ static int
 gen4_render_ring_flush(struct i915_request *rq, u32 mode)
 {
        u32 cmd, *cs;
+       int i;
 
        /*
         * read/write caches:
@@ -127,12 +137,43 @@ gen4_render_ring_flush(struct i915_request *rq, u32 mode)
                        cmd |= MI_INVALIDATE_ISP;
        }
 
-       cs = intel_ring_begin(rq, 2);
+       i = 2;
+       if (mode & EMIT_INVALIDATE)
+               i += 20;
+
+       cs = intel_ring_begin(rq, i);
        if (IS_ERR(cs))
                return PTR_ERR(cs);
 
        *cs++ = cmd;
-       *cs++ = MI_NOOP;
+
+       /*
+        * A random delay to let the CS invalidate take effect? Without this
+        * delay, the GPU relocation path fails as the CS does not see
+        * the updated contents. Just as important, if we apply the flushes
+        * to the EMIT_FLUSH branch (i.e. immediately after the relocation
+        * write and before the invalidate on the next batch), the relocations
+        * still fail. This implies that is a delay following invalidation
+        * that is required to reset the caches as opposed to a delay to
+        * ensure the memory is written.
+        */
+       if (mode & EMIT_INVALIDATE) {
+               *cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE;
+               *cs++ = i915_scratch_offset(rq->i915) | PIPE_CONTROL_GLOBAL_GTT;
+               *cs++ = 0;
+               *cs++ = 0;
+
+               for (i = 0; i < 12; i++)
+                       *cs++ = MI_FLUSH;
+
+               *cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE;
+               *cs++ = i915_scratch_offset(rq->i915) | PIPE_CONTROL_GLOBAL_GTT;
+               *cs++ = 0;
+               *cs++ = 0;
+       }
+
+       *cs++ = cmd;
+
        intel_ring_advance(rq, cs);
 
        return 0;
@@ -178,8 +219,7 @@ gen4_render_ring_flush(struct i915_request *rq, u32 mode)
 static int
 intel_emit_post_sync_nonzero_flush(struct i915_request *rq)
 {
-       u32 scratch_addr =
-               i915_ggtt_offset(rq->engine->scratch) + 2 * CACHELINE_BYTES;
+       u32 scratch_addr = i915_scratch_offset(rq->i915) + 2 * CACHELINE_BYTES;
        u32 *cs;
 
        cs = intel_ring_begin(rq, 6);
@@ -212,8 +252,7 @@ intel_emit_post_sync_nonzero_flush(struct i915_request *rq)
 static int
 gen6_render_ring_flush(struct i915_request *rq, u32 mode)
 {
-       u32 scratch_addr =
-               i915_ggtt_offset(rq->engine->scratch) + 2 * CACHELINE_BYTES;
+       u32 scratch_addr = i915_scratch_offset(rq->i915) + 2 * CACHELINE_BYTES;
        u32 *cs, flags = 0;
        int ret;
 
@@ -282,8 +321,7 @@ gen7_render_ring_cs_stall_wa(struct i915_request *rq)
 static int
 gen7_render_ring_flush(struct i915_request *rq, u32 mode)
 {
-       u32 scratch_addr =
-               i915_ggtt_offset(rq->engine->scratch) + 2 * CACHELINE_BYTES;
+       u32 scratch_addr = i915_scratch_offset(rq->i915) + 2 * CACHELINE_BYTES;
        u32 *cs, flags = 0;
 
        /*
@@ -937,7 +975,7 @@ i965_emit_bb_start(struct i915_request *rq,
 }
 
 /* Just userspace ABI convention to limit the wa batch bo to a resonable size */
-#define I830_BATCH_LIMIT (256*1024)
+#define I830_BATCH_LIMIT SZ_256K
 #define I830_TLB_ENTRIES (2)
 #define I830_WA_SIZE max(I830_TLB_ENTRIES*4096, I830_BATCH_LIMIT)
 static int
@@ -945,7 +983,9 @@ i830_emit_bb_start(struct i915_request *rq,
                   u64 offset, u32 len,
                   unsigned int dispatch_flags)
 {
-       u32 *cs, cs_offset = i915_ggtt_offset(rq->engine->scratch);
+       u32 *cs, cs_offset = i915_scratch_offset(rq->i915);
+
+       GEM_BUG_ON(rq->i915->gt.scratch->size < I830_WA_SIZE);
 
        cs = intel_ring_begin(rq, 6);
        if (IS_ERR(cs))
@@ -1403,7 +1443,6 @@ static int intel_init_ring_buffer(struct intel_engine_cs *engine)
 {
        struct i915_timeline *timeline;
        struct intel_ring *ring;
-       unsigned int size;
        int err;
 
        intel_engine_setup_common(engine);
@@ -1428,21 +1467,12 @@ static int intel_init_ring_buffer(struct intel_engine_cs *engine)
        GEM_BUG_ON(engine->buffer);
        engine->buffer = ring;
 
-       size = PAGE_SIZE;
-       if (HAS_BROKEN_CS_TLB(engine->i915))
-               size = I830_WA_SIZE;
-       err = intel_engine_create_scratch(engine, size);
-       if (err)
-               goto err_unpin;
-
        err = intel_engine_init_common(engine);
        if (err)
-               goto err_scratch;
+               goto err_unpin;
 
        return 0;
 
-err_scratch:
-       intel_engine_cleanup_scratch(engine);
 err_unpin:
        intel_ring_unpin(ring);
 err_ring:
@@ -1516,7 +1546,7 @@ static int flush_pd_dir(struct i915_request *rq)
        /* Stall until the page table load is complete */
        *cs++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
        *cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine));
-       *cs++ = i915_ggtt_offset(engine->scratch);
+       *cs++ = i915_scratch_offset(rq->i915);
        *cs++ = MI_NOOP;
 
        intel_ring_advance(rq, cs);
@@ -1625,7 +1655,7 @@ static inline int mi_set_context(struct i915_request *rq, u32 flags)
                        /* Insert a delay before the next switch! */
                        *cs++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
                        *cs++ = i915_mmio_reg_offset(last_reg);
-                       *cs++ = i915_ggtt_offset(engine->scratch);
+                       *cs++ = i915_scratch_offset(rq->i915);
                        *cs++ = MI_NOOP;
                }
                *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;