 * SPDX-License-Identifier: MIT
 * Copyright © 2014-2016 Intel Corporation

#include "display/intel_frontbuffer.h"

#include "i915_gem_clflush.h"
#include "i915_gem_gtt.h"
#include "i915_gem_ioctls.h"
#include "i915_gem_object.h"
static void __i915_gem_object_flush_for_display(struct drm_i915_gem_object *obj)
         * We manually flush the CPU domain so that we can override and
         * force the flush for the display, and perform it asynchronously.
        i915_gem_object_flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);

        i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
        obj->write_domain = 0;
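        /*
         * Both callers in this file hold the object lock around this helper
         * (i915_gem_object_flush_if_display() takes it explicitly, the
         * display pin path asserts it), so clearing obj->write_domain here
         * is serialised against other domain updates.
         */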
void i915_gem_object_flush_if_display(struct drm_i915_gem_object *obj)
        if (!READ_ONCE(obj->pin_global))

        i915_gem_object_lock(obj);
        __i915_gem_object_flush_for_display(obj);
        i915_gem_object_unlock(obj);

 * Moves a single object to the WC read, and possibly write domain.
 * @obj: object to act on
 * @write: ask for write access or read only
 * This function returns when the move is complete, including waiting on
i915_gem_object_set_to_wc_domain(struct drm_i915_gem_object *obj, bool write)
        assert_object_held(obj);

        ret = i915_gem_object_wait(obj,
                                   I915_WAIT_INTERRUPTIBLE |
                                   (write ? I915_WAIT_ALL : 0),
                                   MAX_SCHEDULE_TIMEOUT);
        if (obj->write_domain == I915_GEM_DOMAIN_WC)

        /* Flush and acquire obj->pages so that we are coherent through
         * direct access in memory with previous cached writes through
         * shmemfs and that our cache domain tracking remains valid.
         * For example, if the obj->filp was moved to swap without us
         * being notified and releasing the pages, we would mistakenly
         * continue to assume that the obj remained out of the CPU cached
        ret = i915_gem_object_pin_pages(obj);

        i915_gem_object_flush_write_domain(obj, ~I915_GEM_DOMAIN_WC);

        /* Serialise direct access to this object with the barriers for
         * coherent writes from the GPU, by effectively invalidating the
         * WC domain upon first access.
        if ((obj->read_domains & I915_GEM_DOMAIN_WC) == 0)

        /* It should now be out of any other write domains, and we can update
         * the domain values for our changes.
        GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_WC) != 0);
        obj->read_domains |= I915_GEM_DOMAIN_WC;
                obj->read_domains = I915_GEM_DOMAIN_WC;
                obj->write_domain = I915_GEM_DOMAIN_WC;

        i915_gem_object_unpin_pages(obj);
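/*
 * A minimal usage sketch, assuming the caller already owns a reference to
 * obj; the object lock checked by assert_object_held() above must be held
 * across the call:
 *
 *	i915_gem_object_lock(obj);
 *	err = i915_gem_object_set_to_wc_domain(obj, true);
 *	i915_gem_object_unlock(obj);
 */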
 * Moves a single object to the GTT read, and possibly write domain.
 * @obj: object to act on
 * @write: ask for write access or read only
 * This function returns when the move is complete, including waiting on
i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
        assert_object_held(obj);

        ret = i915_gem_object_wait(obj,
                                   I915_WAIT_INTERRUPTIBLE |
                                   (write ? I915_WAIT_ALL : 0),
                                   MAX_SCHEDULE_TIMEOUT);

        if (obj->write_domain == I915_GEM_DOMAIN_GTT)

        /* Flush and acquire obj->pages so that we are coherent through
         * direct access in memory with previous cached writes through
         * shmemfs and that our cache domain tracking remains valid.
         * For example, if the obj->filp was moved to swap without us
         * being notified and releasing the pages, we would mistakenly
         * continue to assume that the obj remained out of the CPU cached
        ret = i915_gem_object_pin_pages(obj);
        i915_gem_object_flush_write_domain(obj, ~I915_GEM_DOMAIN_GTT);

        /* Serialise direct access to this object with the barriers for
         * coherent writes from the GPU, by effectively invalidating the
         * GTT domain upon first access.
        if ((obj->read_domains & I915_GEM_DOMAIN_GTT) == 0)

        /* It should now be out of any other write domains, and we can update
         * the domain values for our changes.
        GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_GTT) != 0);
        obj->read_domains |= I915_GEM_DOMAIN_GTT;
                obj->read_domains = I915_GEM_DOMAIN_GTT;
                obj->write_domain = I915_GEM_DOMAIN_GTT;
                obj->mm.dirty = true;

        i915_gem_object_unpin_pages(obj);
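/*
 * Domain-tracking note: obj->read_domains is a bitmask and may carry
 * several I915_GEM_DOMAIN_* bits at once, while obj->write_domain tracks
 * at most a single domain at a time (hence the GEM_BUG_ON above before
 * the GTT bit is asserted).
 */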
 * Changes the cache-level of an object across all VMA.
 * @obj: object to act on
 * @cache_level: new cache level to set for the object
 * After this function returns, the object will be in the new cache-level
 * across all GTT and the contents of the backing storage will be coherent
 * with respect to the new cache-level. In order to keep the backing storage
 * coherent for all users, we only allow a single cache level to be set
 * globally on the object and prevent it from being changed whilst the
 * hardware is reading from the object. That is, if the object is currently
 * on the scanout it will be set to uncached (or equivalent display
 * cache coherency) and all non-MOCS GPU access will also be uncached so
 * that all direct access to the scanout remains coherent.
int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
                                    enum i915_cache_level cache_level)
        struct i915_vma *vma;

        assert_object_held(obj);

        if (obj->cache_level == cache_level)

        /* Inspect the list of currently bound VMA and unbind any that would
         * be invalid given the new cache-level. This is principally to
         * catch the issue of the CS prefetch crossing page boundaries and
         * reading an invalid PTE on older architectures.
        list_for_each_entry(vma, &obj->vma.list, obj_link) {
                if (!drm_mm_node_allocated(&vma->node))

                if (i915_vma_is_pinned(vma)) {
                        DRM_DEBUG("can not change the cache level of pinned objects\n");

                if (!i915_vma_is_closed(vma) &&
                    i915_gem_valid_gtt_space(vma, cache_level))

                ret = i915_vma_unbind(vma);

                /* As unbinding may affect other elements in the
                 * obj->vma_list (due to side-effects from retiring
                 * an active vma), play safe and restart the iterator.
        /* We can reuse the existing drm_mm nodes but need to change the
         * cache-level on the PTE. We could simply unbind them all and
         * rebind with the correct cache-level on next use. However since
         * we already have a valid slot, dma mapping, pages etc, we may as well
         * rewrite the PTE in the belief that doing so tramples upon less
         * state and so involves less work.
        if (atomic_read(&obj->bind_count)) {
                /* Before we change the PTE, the GPU must not be accessing it.
                 * If we wait upon the object, we know that all the bound
                 * VMA are no longer active.
                ret = i915_gem_object_wait(obj,
                                           I915_WAIT_INTERRUPTIBLE |
                                           MAX_SCHEDULE_TIMEOUT);
                if (!HAS_LLC(to_i915(obj->base.dev)) &&
                    cache_level != I915_CACHE_NONE) {
                        /* Access to snoopable pages through the GTT is
                         * incoherent and on some machines causes a hard
                         * lockup. Relinquish the CPU mmapping to force
                         * userspace to refault in the pages and we can
                         * then double check if the GTT mapping is still
                         * valid for that pointer access.
                        i915_gem_object_release_mmap(obj);

                        /* As we no longer need a fence for GTT access,
                         * we can relinquish it now (and so prevent having
                         * to steal a fence from someone else on the next
                         * fence request). Note GPU activity would have
                         * dropped the fence as all snoopable access is
                         * supposed to be linear.
                        for_each_ggtt_vma(vma, obj) {
                                ret = i915_vma_put_fence(vma);
                        /* We either have incoherent backing store and
                         * so no GTT access or the architecture is fully
                         * coherent. In such cases, existing GTT mmaps
                         * ignore the cache bit in the PTE and we can
                         * rewrite it without confusing the GPU or having
                         * to force userspace to fault back in its mmaps.

                list_for_each_entry(vma, &obj->vma.list, obj_link) {
                        if (!drm_mm_node_allocated(&vma->node))

                        ret = i915_vma_bind(vma, cache_level, PIN_UPDATE);

        list_for_each_entry(vma, &obj->vma.list, obj_link)
                vma->node.color = cache_level;
        i915_gem_object_set_cache_coherency(obj, cache_level);
        obj->cache_dirty = true; /* Always invalidate stale cachelines */
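/*
 * A sketched caller, mirroring the display path later in this file; the
 * object lock must already be held, as asserted at the top of the
 * function:
 *
 *	ret = i915_gem_object_set_cache_level(obj, I915_CACHE_NONE);
 */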
int i915_gem_get_caching_ioctl(struct drm_device *dev, void *data,
                               struct drm_file *file)
        struct drm_i915_gem_caching *args = data;
        struct drm_i915_gem_object *obj;

        obj = i915_gem_object_lookup_rcu(file, args->handle);

        switch (obj->cache_level) {
        case I915_CACHE_L3_LLC:
                args->caching = I915_CACHING_CACHED;

                args->caching = I915_CACHING_DISPLAY;

                args->caching = I915_CACHING_NONE;
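/*
 * Userspace view, as a rough sketch (using libdrm's drmIoctl wrapper):
 *
 *	struct drm_i915_gem_caching arg = { .handle = handle };
 *
 *	drmIoctl(fd, DRM_IOCTL_I915_GEM_GET_CACHING, &arg);
 *
 * On success arg.caching holds one of the I915_CACHING_* values mapped
 * above from the kernel-internal I915_CACHE_* levels.
 */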
int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
                               struct drm_file *file)
        struct drm_i915_private *i915 = to_i915(dev);
        struct drm_i915_gem_caching *args = data;
        struct drm_i915_gem_object *obj;
        enum i915_cache_level level;

        switch (args->caching) {
        case I915_CACHING_NONE:
                level = I915_CACHE_NONE;
        case I915_CACHING_CACHED:
                 * Due to a HW issue on BXT A stepping, GPU stores via a
                 * snooped mapping may leave stale data in a corresponding CPU
                 * cacheline, whereas normally such cachelines would get
                if (!HAS_LLC(i915) && !HAS_SNOOP(i915))

                level = I915_CACHE_LLC;
        case I915_CACHING_DISPLAY:
                level = HAS_WT(i915) ? I915_CACHE_WT : I915_CACHE_NONE;
        obj = i915_gem_object_lookup(file, args->handle);

         * The caching mode of a proxy object is handled by its generator, and
         * not allowed to be changed by userspace.
        if (i915_gem_object_is_proxy(obj)) {

        if (obj->cache_level == level)

        ret = i915_gem_object_wait(obj,
                                   I915_WAIT_INTERRUPTIBLE,
                                   MAX_SCHEDULE_TIMEOUT);

        ret = mutex_lock_interruptible(&i915->drm.struct_mutex);

        ret = i915_gem_object_lock_interruptible(obj);
                ret = i915_gem_object_set_cache_level(obj, level);
                i915_gem_object_unlock(obj);
        mutex_unlock(&i915->drm.struct_mutex);

        i915_gem_object_put(obj);
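/*
 * Userspace counterpart, as a rough sketch:
 *
 *	struct drm_i915_gem_caching arg = {
 *		.handle = handle,
 *		.caching = I915_CACHING_CACHED,
 *	};
 *
 *	drmIoctl(fd, DRM_IOCTL_I915_GEM_SET_CACHING, &arg);
 *
 * which reaches i915_gem_object_set_cache_level() through the path above.
 */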
 * Prepare buffer for display plane (scanout, cursors, etc). Can be called from
 * an uninterruptible phase (modesetting) and allows any flushes to be pipelined
 * (for pageflips). We only flush the caches while preparing the buffer for
 * display; the callers are responsible for the frontbuffer flush.
i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
                                     const struct i915_ggtt_view *view,
        struct i915_vma *vma;

        assert_object_held(obj);

        /* Mark the global pin early so that we account for the
         * display coherency whilst setting up the cache domains.

        /* The display engine is not coherent with the LLC cache on gen6. As
         * a result, we make sure that the pinning that is about to occur is
         * done with uncached PTEs. This is the lowest common denominator for all
         * However for gen6+, we could do better by using the GFDT bit instead
         * of uncaching, which would allow us to flush all the LLC-cached data
         * with that bit in the PTE to main memory with just one PIPE_CONTROL.
        ret = i915_gem_object_set_cache_level(obj,
                                              HAS_WT(to_i915(obj->base.dev)) ?
                                              I915_CACHE_WT : I915_CACHE_NONE);
                goto err_unpin_global;
        /* As the user may map the buffer once pinned in the display plane
         * (e.g. libkms for the bootup splash), we have to ensure that we
         * always use map_and_fenceable for all scanout buffers. However,
         * it may simply be too big to fit into mappable, in which case
         * put it anyway and hope that userspace can cope (but always first
         * try to preserve the existing ABI).
        vma = ERR_PTR(-ENOSPC);
        if ((flags & PIN_MAPPABLE) == 0 &&
            (!view || view->type == I915_GGTT_VIEW_NORMAL))
                vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment,
                vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment, flags);
                goto err_unpin_global;

        vma->display_alignment = max_t(u64, vma->display_alignment, alignment);

        __i915_gem_object_flush_for_display(obj);

        /* It should now be out of any other write domains, and we can update
         * the domain values for our changes.
        obj->read_domains |= I915_GEM_DOMAIN_GTT;
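/*
 * Sketch of the expected pairing: the vma returned here is released by
 * the display code with i915_gem_object_unpin_from_display_plane(vma)
 * (below) once the framebuffer is no longer being scanned out.
 */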
static void i915_gem_object_bump_inactive_ggtt(struct drm_i915_gem_object *obj)
        struct drm_i915_private *i915 = to_i915(obj->base.dev);
        struct i915_vma *vma;

        GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));

        mutex_lock(&i915->ggtt.vm.mutex);
        for_each_ggtt_vma(vma, obj) {
                if (!drm_mm_node_allocated(&vma->node))

                list_move_tail(&vma->vm_link, &vma->vm->bound_list);
        mutex_unlock(&i915->ggtt.vm.mutex);

        if (i915_gem_object_is_shrinkable(obj)) {
                spin_lock_irqsave(&i915->mm.obj_lock, flags);

                if (obj->mm.madv == I915_MADV_WILLNEED)
                        list_move_tail(&obj->mm.link, &i915->mm.shrink_list);

                spin_unlock_irqrestore(&i915->mm.obj_lock, flags);
i915_gem_object_unpin_from_display_plane(struct i915_vma *vma)
        struct drm_i915_gem_object *obj = vma->obj;

        assert_object_held(obj);

        if (WARN_ON(obj->pin_global == 0))

        if (--obj->pin_global == 0)
                vma->display_alignment = I915_GTT_MIN_ALIGNMENT;

        /* Bump the LRU to try and avoid premature eviction whilst flipping */
        i915_gem_object_bump_inactive_ggtt(obj);
 * Moves a single object to the CPU read, and possibly write domain.
 * @obj: object to act on
 * @write: requesting write or read-only access
 * This function returns when the move is complete, including waiting on
i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
        assert_object_held(obj);

        ret = i915_gem_object_wait(obj,
                                   I915_WAIT_INTERRUPTIBLE |
                                   (write ? I915_WAIT_ALL : 0),
                                   MAX_SCHEDULE_TIMEOUT);

        i915_gem_object_flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);

        /* Flush the CPU cache if it's still invalid. */
        if ((obj->read_domains & I915_GEM_DOMAIN_CPU) == 0) {
                i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
                obj->read_domains |= I915_GEM_DOMAIN_CPU;

        /* It should now be out of any other write domains, and we can update
         * the domain values for our changes.
        GEM_BUG_ON(obj->write_domain & ~I915_GEM_DOMAIN_CPU);

        /* If we're writing through the CPU, then the GPU read domains will
         * need to be invalidated at next use.
                __start_cpu_write(obj);
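/*
 * This is also the fallback taken by the set-domain ioctl below when
 * userspace asks for neither the WC nor the GTT domain; see the domain
 * selection in i915_gem_set_domain_ioctl().
 */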
static inline enum fb_op_origin
fb_write_origin(struct drm_i915_gem_object *obj, unsigned int domain)
        return (domain == I915_GEM_DOMAIN_GTT ?
                obj->frontbuffer_ggtt_origin : ORIGIN_CPU);
 * Called when user space prepares to use an object with the CPU, either
 * through the mmap ioctl's mapping or a GTT mapping.
 * @data: ioctl data blob
i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
                          struct drm_file *file)
        struct drm_i915_gem_set_domain *args = data;
        struct drm_i915_gem_object *obj;
        u32 read_domains = args->read_domains;
        u32 write_domain = args->write_domain;

        /* Only handle setting domains to types used by the CPU. */
        if ((write_domain | read_domains) & I915_GEM_GPU_DOMAINS)

         * Having something in the write domain implies it's in the read
         * domain, and only that read domain. Enforce that in the request.
        if (write_domain && read_domains != write_domain)
        obj = i915_gem_object_lookup(file, args->handle);

         * Already in the desired write domain? Nothing for us to do!
         * We apply a little bit of cunning here to catch a broader set of
         * no-ops. If obj->write_domain is set, we must be in the same
         * obj->read_domains, and only that domain. Therefore, if that
         * obj->write_domain matches the request read_domains, we are
         * already in the same read/write domain and can skip the operation,
         * without having to further check the requested write_domain.
        if (READ_ONCE(obj->write_domain) == read_domains) {

         * Try to flush the object off the GPU without holding the lock.
         * We will repeat the flush holding the lock in the normal manner
         * to catch cases where we are gazumped.
        err = i915_gem_object_wait(obj,
                                   I915_WAIT_INTERRUPTIBLE |
                                   (write_domain ? I915_WAIT_ALL : 0),
                                   MAX_SCHEDULE_TIMEOUT);
         * Proxy objects do not control access to the backing storage, ergo
         * they cannot be used as a means to manipulate the cache domain
         * tracking for that backing storage. The proxy object is always
         * considered to be outside of any cache domain.
        if (i915_gem_object_is_proxy(obj)) {

         * Flush and acquire obj->pages so that we are coherent through
         * direct access in memory with previous cached writes through
         * shmemfs and that our cache domain tracking remains valid.
         * For example, if the obj->filp was moved to swap without us
         * being notified and releasing the pages, we would mistakenly
         * continue to assume that the obj remained out of the CPU cached
        err = i915_gem_object_pin_pages(obj);

        err = i915_gem_object_lock_interruptible(obj);
        if (read_domains & I915_GEM_DOMAIN_WC)
                err = i915_gem_object_set_to_wc_domain(obj, write_domain);
        else if (read_domains & I915_GEM_DOMAIN_GTT)
                err = i915_gem_object_set_to_gtt_domain(obj, write_domain);
                err = i915_gem_object_set_to_cpu_domain(obj, write_domain);

        /* And bump the LRU for this access */
        i915_gem_object_bump_inactive_ggtt(obj);

        i915_gem_object_unlock(obj);

        if (write_domain != 0)
                intel_fb_obj_invalidate(obj,
                                        fb_write_origin(obj, write_domain));

        i915_gem_object_unpin_pages(obj);
        i915_gem_object_put(obj);
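/*
 * Userspace view, as a rough sketch: before accessing a CPU mmap of the
 * object, callers typically issue
 *
 *	struct drm_i915_gem_set_domain arg = {
 *		.handle = handle,
 *		.read_domains = I915_GEM_DOMAIN_CPU,
 *		.write_domain = I915_GEM_DOMAIN_CPU,
 *	};
 *
 *	drmIoctl(fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &arg);
 *
 * which lands in this function and takes the CPU-domain branch above.
 */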
 * Pins the specified object's pages and synchronizes the object with
 * GPU accesses. Sets needs_clflush to non-zero if the caller should
 * flush the object from the CPU cache.
int i915_gem_object_prepare_read(struct drm_i915_gem_object *obj,
                                 unsigned int *needs_clflush)
        if (!i915_gem_object_has_struct_page(obj))

        ret = i915_gem_object_lock_interruptible(obj);

        ret = i915_gem_object_wait(obj,
                                   I915_WAIT_INTERRUPTIBLE,
                                   MAX_SCHEDULE_TIMEOUT);
        ret = i915_gem_object_pin_pages(obj);

        if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ ||
            !static_cpu_has(X86_FEATURE_CLFLUSH)) {
                ret = i915_gem_object_set_to_cpu_domain(obj, false);

        i915_gem_object_flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);

        /* If we're not in the cpu read domain, set ourselves into the gtt
         * read domain and manually flush cachelines (if required). This
         * optimizes for the case when the gpu will dirty the data
         * anyway again before the next pread happens.
        if (!obj->cache_dirty &&
            !(obj->read_domains & I915_GEM_DOMAIN_CPU))
                *needs_clflush = CLFLUSH_BEFORE;

        /* return with the pages pinned */

        i915_gem_object_unpin_pages(obj);
        i915_gem_object_unlock(obj);
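/*
 * A successful i915_gem_object_prepare_read() returns with the pages
 * pinned (see the comment above); the caller drops that pin, and the
 * object lock taken at the top, once the access is complete (in this
 * tree that is wrapped up by i915_gem_object_finish_access(), assuming
 * that helper), after honouring any CLFLUSH_BEFORE request returned in
 * *needs_clflush.
 */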
int i915_gem_object_prepare_write(struct drm_i915_gem_object *obj,
                                  unsigned int *needs_clflush)
        if (!i915_gem_object_has_struct_page(obj))

        ret = i915_gem_object_lock_interruptible(obj);

        ret = i915_gem_object_wait(obj,
                                   I915_WAIT_INTERRUPTIBLE |
                                   MAX_SCHEDULE_TIMEOUT);

        ret = i915_gem_object_pin_pages(obj);

        if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE ||
            !static_cpu_has(X86_FEATURE_CLFLUSH)) {
                ret = i915_gem_object_set_to_cpu_domain(obj, true);

        i915_gem_object_flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
        /* If we're not in the cpu write domain, set ourselves into the
         * gtt write domain and manually flush cachelines (as required).
         * This optimizes for the case when the gpu will use the data
         * right away and we therefore have to clflush anyway.
        if (!obj->cache_dirty) {
                *needs_clflush |= CLFLUSH_AFTER;

                 * Same trick applies to invalidate partially written
                 * cachelines read before writing.
                if (!(obj->read_domains & I915_GEM_DOMAIN_CPU))
                        *needs_clflush |= CLFLUSH_BEFORE;

        intel_fb_obj_invalidate(obj, ORIGIN_CPU);
        obj->mm.dirty = true;
        /* return with the pages pinned */

        i915_gem_object_unpin_pages(obj);
        i915_gem_object_unlock(obj);
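/*
 * A sketched caller of the write path (assuming the object header's
 * i915_gem_object_finish_access() helper is used for the unwind):
 *
 *	unsigned int needs_clflush;
 *
 *	ret = i915_gem_object_prepare_write(obj, &needs_clflush);
 *	if (ret == 0) {
 *		... CPU writes, honouring CLFLUSH_BEFORE/CLFLUSH_AFTER ...
 *		i915_gem_object_finish_access(obj);
 *	}
 */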