drivers/gpu/drm/i915/i915_gem.c

   1 /*
   2  * Copyright © 2008-2015 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *    Eric Anholt <eric@anholt.net>
  25  *
  26  */
  27
  28 #include <drm/drm_vma_manager.h>
  29 #include <drm/i915_drm.h>
  30 #include <linux/dma-fence-array.h>
  31 #include <linux/kthread.h>
  32 #include <linux/reservation.h>
  33 #include <linux/shmem_fs.h>
  34 #include <linux/slab.h>
  35 #include <linux/stop_machine.h>
  36 #include <linux/swap.h>
  37 #include <linux/pci.h>
  38 #include <linux/dma-buf.h>
  39
  40 #include "i915_drv.h"
  41 #include "i915_gem_clflush.h"
  42 #include "i915_gemfs.h"
  43 #include "i915_reset.h"
  44 #include "i915_trace.h"
  45 #include "i915_vgpu.h"
  46
  47 #include "intel_drv.h"
  48 #include "intel_frontbuffer.h"
  49 #include "intel_mocs.h"
  50 #include "intel_workarounds.h"
  51
  52 static void i915_gem_flush_free_objects(struct drm_i915_private *i915);
  53
  54 static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
  55 {
  56         if (obj->cache_dirty)
  57                 return false;
  58
  59         if (!(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE))
  60                 return true;
  61
  62         return obj->pin_global; /* currently in use by HW, keep flushed */
  63 }
  64
  65 static int
  66 insert_mappable_node(struct i915_ggtt *ggtt,
  67                      struct drm_mm_node *node, u32 size)
  68 {
  69         memset(node, 0, sizeof(*node));
  70         return drm_mm_insert_node_in_range(&ggtt->vm.mm, node,
  71                                            size, 0, I915_COLOR_UNEVICTABLE,
  72                                            0, ggtt->mappable_end,
  73                                            DRM_MM_INSERT_LOW);
  74 }
  75
  76 static void
  77 remove_mappable_node(struct drm_mm_node *node)
  78 {
  79         drm_mm_remove_node(node);
  80 }
  81
  82 /* some bookkeeping */
  83 static void i915_gem_info_add_obj(struct drm_i915_private *dev_priv,
  84                                   u64 size)
  85 {
  86         spin_lock(&dev_priv->mm.object_stat_lock);
  87         dev_priv->mm.object_count++;
  88         dev_priv->mm.object_memory += size;
  89         spin_unlock(&dev_priv->mm.object_stat_lock);
  90 }
  91
  92 static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
  93                                      u64 size)
  94 {
  95         spin_lock(&dev_priv->mm.object_stat_lock);
  96         dev_priv->mm.object_count--;
  97         dev_priv->mm.object_memory -= size;
  98         spin_unlock(&dev_priv->mm.object_stat_lock);
  99 }
 100
 101 static int
 102 i915_gem_wait_for_error(struct i915_gpu_error *error)
 103 {
 104         int ret;
 105
 106         might_sleep();
 107
 108         /*
 109          * Only wait 10 seconds for the gpu reset to complete to avoid hanging
 110          * userspace. If it takes that long something really bad is going on and
 111          * we should simply try to bail out and fail as gracefully as possible.
 112          */
 113         ret = wait_event_interruptible_timeout(error->reset_queue,
 114                                                !i915_reset_backoff(error),
 115                                                I915_RESET_TIMEOUT);
 116         if (ret == 0) {
 117                 DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
 118                 return -EIO;
 119         } else if (ret < 0) {
 120                 return ret;
 121         } else {
 122                 return 0;
 123         }
 124 }
 125
 126 int i915_mutex_lock_interruptible(struct drm_device *dev)
 127 {
 128         struct drm_i915_private *dev_priv = to_i915(dev);
 129         int ret;
 130
 131         ret = i915_gem_wait_for_error(&dev_priv->gpu_error);
 132         if (ret)
 133                 return ret;
 134
 135         ret = mutex_lock_interruptible(&dev->struct_mutex);
 136         if (ret)
 137                 return ret;
 138
 139         return 0;
 140 }
 141
 142 static u32 __i915_gem_park(struct drm_i915_private *i915)
 143 {
 144         intel_wakeref_t wakeref;
 145
 146         GEM_TRACE("\n");
 147
 148         lockdep_assert_held(&i915->drm.struct_mutex);
 149         GEM_BUG_ON(i915->gt.active_requests);
 150         GEM_BUG_ON(!list_empty(&i915->gt.active_rings));
 151
 152         if (!i915->gt.awake)
 153                 return I915_EPOCH_INVALID;
 154
 155         GEM_BUG_ON(i915->gt.epoch == I915_EPOCH_INVALID);
 156
 157         /*
 158          * Be paranoid and flush a concurrent interrupt to make sure
 159          * we don't reactivate any irq tasklets after parking.
 160          *
 161          * FIXME: Note that even though we have waited for execlists to be idle,
 162          * there may still be an in-flight interrupt even though the CSB
 163          * is now empty. synchronize_irq() makes sure that a residual interrupt
 164          * is completed before we continue, but it doesn't prevent the HW from
 165          * raising a spurious interrupt later. To complete the shield we should
 166          * coordinate disabling the CS irq with flushing the interrupts.
 167          */
 168         synchronize_irq(i915->drm.irq);
 169
 170         intel_engines_park(i915);
 171         i915_timelines_park(i915);
 172
 173         i915_pmu_gt_parked(i915);
 174         i915_vma_parked(i915);
 175
 176         wakeref = fetch_and_zero(&i915->gt.awake);
 177         GEM_BUG_ON(!wakeref);
 178
 179         if (INTEL_GEN(i915) >= 6)
 180                 gen6_rps_idle(i915);
 181
 182         intel_display_power_put(i915, POWER_DOMAIN_GT_IRQ, wakeref);
 183
 184         return i915->gt.epoch;
 185 }
 186
 187 void i915_gem_park(struct drm_i915_private *i915)
 188 {
 189         GEM_TRACE("\n");
 190
 191         lockdep_assert_held(&i915->drm.struct_mutex);
 192         GEM_BUG_ON(i915->gt.active_requests);
 193
 194         if (!i915->gt.awake)
 195                 return;
 196
 197         /* Defer the actual call to __i915_gem_park() to prevent ping-pongs */
 198         mod_delayed_work(i915->wq, &i915->gt.idle_work, msecs_to_jiffies(100));
 199 }
 200
 201 void i915_gem_unpark(struct drm_i915_private *i915)
 202 {
 203         GEM_TRACE("\n");
 204
 205         lockdep_assert_held(&i915->drm.struct_mutex);
 206         GEM_BUG_ON(!i915->gt.active_requests);
 207         assert_rpm_wakelock_held(i915);
 208
 209         if (i915->gt.awake)
 210                 return;
 211
 212         /*
 213          * It seems that the DMC likes to transition between the DC states a lot
 214          * when there are no connected displays (no active power domains) during
 215          * command submission.
 216          *
 217          * This activity has negative impact on the performance of the chip with
 218          * huge latencies observed in the interrupt handler and elsewhere.
 219          *
 220          * Work around it by grabbing a GT IRQ power domain whilst there is any
 221          * GT activity, preventing any DC state transitions.
 222          */
 223         i915->gt.awake = intel_display_power_get(i915, POWER_DOMAIN_GT_IRQ);
 224         GEM_BUG_ON(!i915->gt.awake);
 225
 226         if (unlikely(++i915->gt.epoch == 0)) /* keep 0 as invalid */
 227                 i915->gt.epoch = 1;
 228
 229         intel_enable_gt_powersave(i915);
 230         i915_update_gfx_val(i915);
 231         if (INTEL_GEN(i915) >= 6)
 232                 gen6_rps_busy(i915);
 233         i915_pmu_gt_unparked(i915);
 234
 235         intel_engines_unpark(i915);
 236
 237         i915_queue_hangcheck(i915);
 238
 239         queue_delayed_work(i915->wq,
 240                            &i915->gt.retire_work,
 241                            round_jiffies_up_relative(HZ));
 242 }
 243
 244 int
 245 i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data,
 246                             struct drm_file *file)
 247 {
 248         struct i915_ggtt *ggtt = &to_i915(dev)->ggtt;
 249         struct drm_i915_gem_get_aperture *args = data;
 250         struct i915_vma *vma;
 251         u64 pinned;
 252
 253         mutex_lock(&ggtt->vm.mutex);
 254
 255         pinned = ggtt->vm.reserved;
 256         list_for_each_entry(vma, &ggtt->vm.bound_list, vm_link)
 257                 if (i915_vma_is_pinned(vma))
 258                         pinned += vma->node.size;
 259
 260         mutex_unlock(&ggtt->vm.mutex);
 261
 262         args->aper_size = ggtt->vm.total;
 263         args->aper_available_size = args->aper_size - pinned;
 264
 265         return 0;
 266 }
 267
 268 static int i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
 269 {
 270         struct address_space *mapping = obj->base.filp->f_mapping;
 271         drm_dma_handle_t *phys;
 272         struct sg_table *st;
 273         struct scatterlist *sg;
 274         char *vaddr;
 275         int i;
 276         int err;
 277
 278         if (WARN_ON(i915_gem_object_needs_bit17_swizzle(obj)))
 279                 return -EINVAL;
 280
 281         /* Always aligning to the object size, allows a single allocation
 282          * to handle all possible callers, and given typical object sizes,
 283          * the alignment of the buddy allocation will naturally match.
 284          */
 285         phys = drm_pci_alloc(obj->base.dev,
 286                              roundup_pow_of_two(obj->base.size),
 287                              roundup_pow_of_two(obj->base.size));
 288         if (!phys)
 289                 return -ENOMEM;
 290
 291         vaddr = phys->vaddr;
 292         for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
 293                 struct page *page;
 294                 char *src;
 295
 296                 page = shmem_read_mapping_page(mapping, i);
 297                 if (IS_ERR(page)) {
 298                         err = PTR_ERR(page);
 299                         goto err_phys;
 300                 }
 301
 302                 src = kmap_atomic(page);
 303                 memcpy(vaddr, src, PAGE_SIZE);
 304                 drm_clflush_virt_range(vaddr, PAGE_SIZE);
 305                 kunmap_atomic(src);
 306
 307                 put_page(page);
 308                 vaddr += PAGE_SIZE;
 309         }
 310
 311         i915_gem_chipset_flush(to_i915(obj->base.dev));
 312
 313         st = kmalloc(sizeof(*st), GFP_KERNEL);
 314         if (!st) {
 315                 err = -ENOMEM;
 316                 goto err_phys;
 317         }
 318
 319         if (sg_alloc_table(st, 1, GFP_KERNEL)) {
 320                 kfree(st);
 321                 err = -ENOMEM;
 322                 goto err_phys;
 323         }
 324
 325         sg = st->sgl;
 326         sg->offset = 0;
 327         sg->length = obj->base.size;
 328
 329         sg_dma_address(sg) = phys->busaddr;
 330         sg_dma_len(sg) = obj->base.size;
 331
 332         obj->phys_handle = phys;
 333
 334         __i915_gem_object_set_pages(obj, st, sg->length);
 335
 336         return 0;
 337
 338 err_phys:
 339         drm_pci_free(obj->base.dev, phys);
 340
 341         return err;
 342 }
 343
 344 static void __start_cpu_write(struct drm_i915_gem_object *obj)
 345 {
 346         obj->read_domains = I915_GEM_DOMAIN_CPU;
 347         obj->write_domain = I915_GEM_DOMAIN_CPU;
 348         if (cpu_write_needs_clflush(obj))
 349                 obj->cache_dirty = true;
 350 }
 351
 352 static void
 353 __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
 354                                 struct sg_table *pages,
 355                                 bool needs_clflush)
 356 {
 357         GEM_BUG_ON(obj->mm.madv == __I915_MADV_PURGED);
 358
 359         if (obj->mm.madv == I915_MADV_DONTNEED)
 360                 obj->mm.dirty = false;
 361
 362         if (needs_clflush &&
 363             (obj->read_domains & I915_GEM_DOMAIN_CPU) == 0 &&
 364             !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ))
 365                 drm_clflush_sg(pages);
 366
 367         __start_cpu_write(obj);
 368 }
 369
 370 static void
 371 i915_gem_object_put_pages_phys(struct drm_i915_gem_object *obj,
 372                                struct sg_table *pages)
 373 {
 374         __i915_gem_object_release_shmem(obj, pages, false);
 375
 376         if (obj->mm.dirty) {
 377                 struct address_space *mapping = obj->base.filp->f_mapping;
 378                 char *vaddr = obj->phys_handle->vaddr;
 379                 int i;
 380
 381                 for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
 382                         struct page *page;
 383                         char *dst;
 384
 385                         page = shmem_read_mapping_page(mapping, i);
 386                         if (IS_ERR(page))
 387                                 continue;
 388
 389                         dst = kmap_atomic(page);
 390                         drm_clflush_virt_range(vaddr, PAGE_SIZE);
 391                         memcpy(dst, vaddr, PAGE_SIZE);
 392                         kunmap_atomic(dst);
 393
 394                         set_page_dirty(page);
 395                         if (obj->mm.madv == I915_MADV_WILLNEED)
 396                                 mark_page_accessed(page);
 397                         put_page(page);
 398                         vaddr += PAGE_SIZE;
 399                 }
 400                 obj->mm.dirty = false;
 401         }
 402
 403         sg_free_table(pages);
 404         kfree(pages);
 405
 406         drm_pci_free(obj->base.dev, obj->phys_handle);
 407 }
 408
 409 static void
 410 i915_gem_object_release_phys(struct drm_i915_gem_object *obj)
 411 {
 412         i915_gem_object_unpin_pages(obj);
 413 }
 414
 415 static const struct drm_i915_gem_object_ops i915_gem_phys_ops = {
 416         .get_pages = i915_gem_object_get_pages_phys,
 417         .put_pages = i915_gem_object_put_pages_phys,
 418         .release = i915_gem_object_release_phys,
 419 };
 420
 421 static const struct drm_i915_gem_object_ops i915_gem_object_ops;
 422
 423 int i915_gem_object_unbind(struct drm_i915_gem_object *obj)
 424 {
 425         struct i915_vma *vma;
 426         LIST_HEAD(still_in_list);
 427         int ret;
 428
 429         lockdep_assert_held(&obj->base.dev->struct_mutex);
 430
 431         /* Closed vma are removed from the obj->vma_list - but they may
 432          * still have an active binding on the object. To remove those we
 433          * must wait for all rendering to complete to the object (as unbinding
 434          * must anyway), and retire the requests.
 435          */
 436         ret = i915_gem_object_set_to_cpu_domain(obj, false);
 437         if (ret)
 438                 return ret;
 439
 440         spin_lock(&obj->vma.lock);
 441         while (!ret && (vma = list_first_entry_or_null(&obj->vma.list,
 442                                                        struct i915_vma,
 443                                                        obj_link))) {
 444                 list_move_tail(&vma->obj_link, &still_in_list);
 445                 spin_unlock(&obj->vma.lock);
 446
 447                 ret = i915_vma_unbind(vma);
 448
 449                 spin_lock(&obj->vma.lock);
 450         }
 451         list_splice(&still_in_list, &obj->vma.list);
 452         spin_unlock(&obj->vma.lock);
 453
 454         return ret;
 455 }
 456
 457 static long
 458 i915_gem_object_wait_fence(struct dma_fence *fence,
 459                            unsigned int flags,
 460                            long timeout,
 461                            struct intel_rps_client *rps_client)
 462 {
 463         struct i915_request *rq;
 464
 465         BUILD_BUG_ON(I915_WAIT_INTERRUPTIBLE != 0x1);
 466
 467         if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
 468                 return timeout;
 469
 470         if (!dma_fence_is_i915(fence))
 471                 return dma_fence_wait_timeout(fence,
 472                                               flags & I915_WAIT_INTERRUPTIBLE,
 473                                               timeout);
 474
 475         rq = to_request(fence);
 476         if (i915_request_completed(rq))
 477                 goto out;
 478
 479         /*
 480          * This client is about to stall waiting for the GPU. In many cases
 481          * this is undesirable and limits the throughput of the system, as
 482          * many clients cannot continue processing user input/output whilst
 483          * blocked. RPS autotuning may take tens of milliseconds to respond
 484          * to the GPU load and thus incurs additional latency for the client.
 485          * We can circumvent that by promoting the GPU frequency to maximum
 486          * before we wait. This makes the GPU throttle up much more quickly
 487          * (good for benchmarks and user experience, e.g. window animations),
 488          * but at a cost of spending more power processing the workload
 489          * (bad for battery). Not all clients even want their results
 490          * immediately and for them we should just let the GPU select its own
 491          * frequency to maximise efficiency. To prevent a single client from
 492          * forcing the clocks too high for the whole system, we only allow
 493          * each client to waitboost once in a busy period.
 494          */
 495         if (rps_client && !i915_request_started(rq)) {
 496                 if (INTEL_GEN(rq->i915) >= 6)
 497                         gen6_rps_boost(rq, rps_client);
 498         }
 499
 500         timeout = i915_request_wait(rq, flags, timeout);
 501
 502 out:
 503         if (flags & I915_WAIT_LOCKED && i915_request_completed(rq))
 504                 i915_request_retire_upto(rq);
 505
 506         return timeout;
 507 }
 508
 509 static long
 510 i915_gem_object_wait_reservation(struct reservation_object *resv,
 511                                  unsigned int flags,
 512                                  long timeout,
 513                                  struct intel_rps_client *rps_client)
 514 {
 515         unsigned int seq = __read_seqcount_begin(&resv->seq);
 516         struct dma_fence *excl;
 517         bool prune_fences = false;
 518
 519         if (flags & I915_WAIT_ALL) {
 520                 struct dma_fence **shared;
 521                 unsigned int count, i;
 522                 int ret;
 523
 524                 ret = reservation_object_get_fences_rcu(resv,
 525                                                         &excl, &count, &shared);
 526                 if (ret)
 527                         return ret;
 528
 529                 for (i = 0; i < count; i++) {
 530                         timeout = i915_gem_object_wait_fence(shared[i],
 531                                                              flags, timeout,
 532                                                              rps_client);
 533                         if (timeout < 0)
 534                                 break;
 535
 536                         dma_fence_put(shared[i]);
 537                 }
 538
 539                 for (; i < count; i++)
 540                         dma_fence_put(shared[i]);
 541                 kfree(shared);
 542
 543                 /*
 544                  * If both shared fences and an exclusive fence exist,
 545                  * then by construction the shared fences must be later
 546                  * than the exclusive fence. If we successfully wait for
 547                  * all the shared fences, we know that the exclusive fence
 548                  * must all be signaled. If all the shared fences are
 549                  * signaled, we can prune the array and recover the
 550                  * floating references on the fences/requests.
 551                  */
 552                 prune_fences = count && timeout >= 0;
 553         } else {
 554                 excl = reservation_object_get_excl_rcu(resv);
 555         }
 556
 557         if (excl && timeout >= 0)
 558                 timeout = i915_gem_object_wait_fence(excl, flags, timeout,
 559                                                      rps_client);
 560
 561         dma_fence_put(excl);
 562
 563         /*
 564          * Opportunistically prune the fences iff we know they have *all* been
 565          * signaled and that the reservation object has not been changed (i.e.
 566          * no new fences have been added).
 567          */
 568         if (prune_fences && !__read_seqcount_retry(&resv->seq, seq)) {
 569                 if (reservation_object_trylock(resv)) {
 570                         if (!__read_seqcount_retry(&resv->seq, seq))
 571                                 reservation_object_add_excl_fence(resv, NULL);
 572                         reservation_object_unlock(resv);
 573                 }
 574         }
 575
 576         return timeout;
 577 }
 578
 579 static void __fence_set_priority(struct dma_fence *fence,
 580                                  const struct i915_sched_attr *attr)
 581 {
 582         struct i915_request *rq;
 583         struct intel_engine_cs *engine;
 584
 585         if (dma_fence_is_signaled(fence) || !dma_fence_is_i915(fence))
 586                 return;
 587
 588         rq = to_request(fence);
 589         engine = rq->engine;
 590
 591         local_bh_disable();
 592         rcu_read_lock(); /* RCU serialisation for set-wedged protection */
 593         if (engine->schedule)
 594                 engine->schedule(rq, attr);
 595         rcu_read_unlock();
 596         local_bh_enable(); /* kick the tasklets if queues were reprioritised */
 597 }
 598
 599 static void fence_set_priority(struct dma_fence *fence,
 600                                const struct i915_sched_attr *attr)
 601 {
 602         /* Recurse once into a fence-array */
 603         if (dma_fence_is_array(fence)) {
 604                 struct dma_fence_array *array = to_dma_fence_array(fence);
 605                 int i;
 606
 607                 for (i = 0; i < array->num_fences; i++)
 608                         __fence_set_priority(array->fences[i], attr);
 609         } else {
 610                 __fence_set_priority(fence, attr);
 611         }
 612 }
 613
 614 int
 615 i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
 616                               unsigned int flags,
 617                               const struct i915_sched_attr *attr)
 618 {
 619         struct dma_fence *excl;
 620
 621         if (flags & I915_WAIT_ALL) {
 622                 struct dma_fence **shared;
 623                 unsigned int count, i;
 624                 int ret;
 625
 626                 ret = reservation_object_get_fences_rcu(obj->resv,
 627                                                         &excl, &count, &shared);
 628                 if (ret)
 629                         return ret;
 630
 631                 for (i = 0; i < count; i++) {
 632                         fence_set_priority(shared[i], attr);
 633                         dma_fence_put(shared[i]);
 634                 }
 635
 636                 kfree(shared);
 637         } else {
 638                 excl = reservation_object_get_excl_rcu(obj->resv);
 639         }
 640
 641         if (excl) {
 642                 fence_set_priority(excl, attr);
 643                 dma_fence_put(excl);
 644         }
 645         return 0;
 646 }
 647
 648 /**
 649  * Waits for rendering to the object to be completed
 650  * @obj: i915 gem object
 651  * @flags: how to wait (under a lock, for all rendering or just for writes etc)
 652  * @timeout: how long to wait
 653  * @rps_client: client (user process) to charge for any waitboosting
 654  */
 655 int
 656 i915_gem_object_wait(struct drm_i915_gem_object *obj,
 657                      unsigned int flags,
 658                      long timeout,
 659                      struct intel_rps_client *rps_client)
 660 {
 661         might_sleep();
 662         GEM_BUG_ON(timeout < 0);
 663
 664         timeout = i915_gem_object_wait_reservation(obj->resv,
 665                                                    flags, timeout,
 666                                                    rps_client);
 667         return timeout < 0 ? timeout : 0;
 668 }
 669
 670 static struct intel_rps_client *to_rps_client(struct drm_file *file)
 671 {
 672         struct drm_i915_file_private *fpriv = file->driver_priv;
 673
 674         return &fpriv->rps_client;
 675 }
 676
 677 static int
 678 i915_gem_phys_pwrite(struct drm_i915_gem_object *obj,
 679                      struct drm_i915_gem_pwrite *args,
 680                      struct drm_file *file)
 681 {
 682         void *vaddr = obj->phys_handle->vaddr + args->offset;
 683         char __user *user_data = u64_to_user_ptr(args->data_ptr);
 684
 685         /* We manually control the domain here and pretend that it
 686          * remains coherent i.e. in the GTT domain, like shmem_pwrite.
 687          */
 688         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
 689         if (copy_from_user(vaddr, user_data, args->size))
 690                 return -EFAULT;
 691
 692         drm_clflush_virt_range(vaddr, args->size);
 693         i915_gem_chipset_flush(to_i915(obj->base.dev));
 694
 695         intel_fb_obj_flush(obj, ORIGIN_CPU);
 696         return 0;
 697 }
 698
 699 void *i915_gem_object_alloc(struct drm_i915_private *dev_priv)
 700 {
 701         return kmem_cache_zalloc(dev_priv->objects, GFP_KERNEL);
 702 }
 703
 704 void i915_gem_object_free(struct drm_i915_gem_object *obj)
 705 {
 706         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
 707         kmem_cache_free(dev_priv->objects, obj);
 708 }
 709
 710 static int
 711 i915_gem_create(struct drm_file *file,
 712                 struct drm_i915_private *dev_priv,
 713                 u64 size,
 714                 u32 *handle_p)
 715 {
 716         struct drm_i915_gem_object *obj;
 717         int ret;
 718         u32 handle;
 719
 720         size = roundup(size, PAGE_SIZE);
 721         if (size == 0)
 722                 return -EINVAL;
 723
 724         /* Allocate the new object */
 725         obj = i915_gem_object_create(dev_priv, size);
 726         if (IS_ERR(obj))
 727                 return PTR_ERR(obj);
 728
 729         ret = drm_gem_handle_create(file, &obj->base, &handle);
 730         /* drop reference from allocate - handle holds it now */
 731         i915_gem_object_put(obj);
 732         if (ret)
 733                 return ret;
 734
 735         *handle_p = handle;
 736         return 0;
 737 }
 738
 739 int
 740 i915_gem_dumb_create(struct drm_file *file,
 741                      struct drm_device *dev,
 742                      struct drm_mode_create_dumb *args)
 743 {
 744         /* have to work out size/pitch and return them */
 745         args->pitch = ALIGN(args->width * DIV_ROUND_UP(args->bpp, 8), 64);
 746         args->size = args->pitch * args->height;
 747         return i915_gem_create(file, to_i915(dev),
 748                                args->size, &args->handle);
 749 }
 750
 751 static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
 752 {
 753         return !(obj->cache_level == I915_CACHE_NONE ||
 754                  obj->cache_level == I915_CACHE_WT);
 755 }
 756
 757 /**
 758  * Creates a new mm object and returns a handle to it.
 759  * @dev: drm device pointer
 760  * @data: ioctl data blob
 761  * @file: drm file pointer
 762  */
 763 int
 764 i915_gem_create_ioctl(struct drm_device *dev, void *data,
 765                       struct drm_file *file)
 766 {
 767         struct drm_i915_private *dev_priv = to_i915(dev);
 768         struct drm_i915_gem_create *args = data;
 769
 770         i915_gem_flush_free_objects(dev_priv);
 771
 772         return i915_gem_create(file, dev_priv,
 773                                args->size, &args->handle);
 774 }
 775
 776 static inline enum fb_op_origin
 777 fb_write_origin(struct drm_i915_gem_object *obj, unsigned int domain)
 778 {
 779         return (domain == I915_GEM_DOMAIN_GTT ?
 780                 obj->frontbuffer_ggtt_origin : ORIGIN_CPU);
 781 }
 782
 783 void i915_gem_flush_ggtt_writes(struct drm_i915_private *dev_priv)
 784 {
 785         intel_wakeref_t wakeref;
 786
 787         /*
 788          * No actual flushing is required for the GTT write domain for reads
 789          * from the GTT domain. Writes to it "immediately" go to main memory
 790          * as far as we know, so there's no chipset flush. It also doesn't
 791          * land in the GPU render cache.
 792          *
 793          * However, we do have to enforce the order so that all writes through
 794          * the GTT land before any writes to the device, such as updates to
 795          * the GATT itself.
 796          *
 797          * We also have to wait a bit for the writes to land from the GTT.
 798          * An uncached read (i.e. mmio) seems to be ideal for the round-trip
 799          * timing. This issue has only been observed when switching quickly
 800          * between GTT writes and CPU reads from inside the kernel on recent hw,
 801          * and it appears to only affect discrete GTT blocks (i.e. on LLC
 802          * system agents we cannot reproduce this behaviour, until Cannonlake
 803          * that was!).
 804          */
 805
 806         wmb();
 807
 808         if (INTEL_INFO(dev_priv)->has_coherent_ggtt)
 809                 return;
 810
 811         i915_gem_chipset_flush(dev_priv);
 812
 813         with_intel_runtime_pm(dev_priv, wakeref) {
 814                 spin_lock_irq(&dev_priv->uncore.lock);
 815
 816                 POSTING_READ_FW(RING_HEAD(RENDER_RING_BASE));
 817
 818                 spin_unlock_irq(&dev_priv->uncore.lock);
 819         }
 820 }
 821
 822 static void
 823 flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
 824 {
 825         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
 826         struct i915_vma *vma;
 827
 828         if (!(obj->write_domain & flush_domains))
 829                 return;
 830
 831         switch (obj->write_domain) {
 832         case I915_GEM_DOMAIN_GTT:
 833                 i915_gem_flush_ggtt_writes(dev_priv);
 834
 835                 intel_fb_obj_flush(obj,
 836                                    fb_write_origin(obj, I915_GEM_DOMAIN_GTT));
 837
 838                 for_each_ggtt_vma(vma, obj) {
 839                         if (vma->iomap)
 840                                 continue;
 841
 842                         i915_vma_unset_ggtt_write(vma);
 843                 }
 844                 break;
 845
 846         case I915_GEM_DOMAIN_WC:
 847                 wmb();
 848                 break;
 849
 850         case I915_GEM_DOMAIN_CPU:
 851                 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
 852                 break;
 853
 854         case I915_GEM_DOMAIN_RENDER:
 855                 if (gpu_write_needs_clflush(obj))
 856                         obj->cache_dirty = true;
 857                 break;
 858         }
 859
 860         obj->write_domain = 0;
 861 }
 862
 863 /*
 864  * Pins the specified object's pages and synchronizes the object with
 865  * GPU accesses. Sets needs_clflush to non-zero if the caller should
 866  * flush the object from the CPU cache.
 867  */
 868 int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
 869                                     unsigned int *needs_clflush)
 870 {
 871         int ret;
 872
 873         lockdep_assert_held(&obj->base.dev->struct_mutex);
 874
 875         *needs_clflush = 0;
 876         if (!i915_gem_object_has_struct_page(obj))
 877                 return -ENODEV;
 878
 879         ret = i915_gem_object_wait(obj,
 880                                    I915_WAIT_INTERRUPTIBLE |
 881                                    I915_WAIT_LOCKED,
 882                                    MAX_SCHEDULE_TIMEOUT,
 883                                    NULL);
 884         if (ret)
 885                 return ret;
 886
 887         ret = i915_gem_object_pin_pages(obj);
 888         if (ret)
 889                 return ret;
 890
 891         if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ ||
 892             !static_cpu_has(X86_FEATURE_CLFLUSH)) {
 893                 ret = i915_gem_object_set_to_cpu_domain(obj, false);
 894                 if (ret)
 895                         goto err_unpin;
 896                 else
 897                         goto out;
 898         }
 899
 900         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
 901
 902         /* If we're not in the cpu read domain, set ourself into the gtt
 903          * read domain and manually flush cachelines (if required). This
 904          * optimizes for the case when the gpu will dirty the data
 905          * anyway again before the next pread happens.
 906          */
 907         if (!obj->cache_dirty &&
 908             !(obj->read_domains & I915_GEM_DOMAIN_CPU))
 909                 *needs_clflush = CLFLUSH_BEFORE;
 910
 911 out:
 912         /* return with the pages pinned */
 913         return 0;
 914
 915 err_unpin:
 916         i915_gem_object_unpin_pages(obj);
 917         return ret;
 918 }
 919
 920 int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
 921                                      unsigned int *needs_clflush)
 922 {
 923         int ret;
 924
 925         lockdep_assert_held(&obj->base.dev->struct_mutex);
 926
 927         *needs_clflush = 0;
 928         if (!i915_gem_object_has_struct_page(obj))
 929                 return -ENODEV;
 930
 931         ret = i915_gem_object_wait(obj,
 932                                    I915_WAIT_INTERRUPTIBLE |
 933                                    I915_WAIT_LOCKED |
 934                                    I915_WAIT_ALL,
 935                                    MAX_SCHEDULE_TIMEOUT,
 936                                    NULL);
 937         if (ret)
 938                 return ret;
 939
 940         ret = i915_gem_object_pin_pages(obj);
 941         if (ret)
 942                 return ret;
 943
 944         if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE ||
 945             !static_cpu_has(X86_FEATURE_CLFLUSH)) {
 946                 ret = i915_gem_object_set_to_cpu_domain(obj, true);
 947                 if (ret)
 948                         goto err_unpin;
 949                 else
 950                         goto out;
 951         }
 952
 953         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
 954
 955         /* If we're not in the cpu write domain, set ourself into the
 956          * gtt write domain and manually flush cachelines (as required).
 957          * This optimizes for the case when the gpu will use the data
 958          * right away and we therefore have to clflush anyway.
 959          */
 960         if (!obj->cache_dirty) {
 961                 *needs_clflush |= CLFLUSH_AFTER;
 962
 963                 /*
 964                  * Same trick applies to invalidate partially written
 965                  * cachelines read before writing.
 966                  */
 967                 if (!(obj->read_domains & I915_GEM_DOMAIN_CPU))
 968                         *needs_clflush |= CLFLUSH_BEFORE;
 969         }
 970
 971 out:
 972         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
 973         obj->mm.dirty = true;
 974         /* return with the pages pinned */
 975         return 0;
 976
 977 err_unpin:
 978         i915_gem_object_unpin_pages(obj);
 979         return ret;
 980 }
 981
 982 static int
 983 shmem_pread(struct page *page, int offset, int len, char __user *user_data,
 984             bool needs_clflush)
 985 {
 986         char *vaddr;
 987         int ret;
 988
 989         vaddr = kmap(page);
 990
 991         if (needs_clflush)
 992                 drm_clflush_virt_range(vaddr + offset, len);
 993
 994         ret = __copy_to_user(user_data, vaddr + offset, len);
 995
 996         kunmap(page);
 997
 998         return ret ? -EFAULT : 0;
 999 }
1000
1001 static int
1002 i915_gem_shmem_pread(struct drm_i915_gem_object *obj,
1003                      struct drm_i915_gem_pread *args)
1004 {
1005         char __user *user_data;
1006         u64 remain;
1007         unsigned int needs_clflush;
1008         unsigned int idx, offset;
1009         int ret;
1010
1011         ret = mutex_lock_interruptible(&obj->base.dev->struct_mutex);
1012         if (ret)
1013                 return ret;
1014
1015         ret = i915_gem_obj_prepare_shmem_read(obj, &needs_clflush);
1016         mutex_unlock(&obj->base.dev->struct_mutex);
1017         if (ret)
1018                 return ret;
1019
1020         remain = args->size;
1021         user_data = u64_to_user_ptr(args->data_ptr);
1022         offset = offset_in_page(args->offset);
1023         for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
1024                 struct page *page = i915_gem_object_get_page(obj, idx);
1025                 unsigned int length = min_t(u64, remain, PAGE_SIZE - offset);
1026
1027                 ret = shmem_pread(page, offset, length, user_data,
1028                                   needs_clflush);
1029                 if (ret)
1030                         break;
1031
1032                 remain -= length;
1033                 user_data += length;
1034                 offset = 0;
1035         }
1036
1037         i915_gem_obj_finish_shmem_access(obj);
1038         return ret;
1039 }
1040
1041 static inline bool
1042 gtt_user_read(struct io_mapping *mapping,
1043               loff_t base, int offset,
1044               char __user *user_data, int length)
1045 {
1046         void __iomem *vaddr;
1047         unsigned long unwritten;
1048
1049         /* We can use the cpu mem copy function because this is X86. */
1050         vaddr = io_mapping_map_atomic_wc(mapping, base);
1051         unwritten = __copy_to_user_inatomic(user_data,
1052                                             (void __force *)vaddr + offset,
1053                                             length);
1054         io_mapping_unmap_atomic(vaddr);
1055         if (unwritten) {
1056                 vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
1057                 unwritten = copy_to_user(user_data,
1058                                          (void __force *)vaddr + offset,
1059                                          length);
1060                 io_mapping_unmap(vaddr);
1061         }
1062         return unwritten;
1063 }
1064
1065 static int
1066 i915_gem_gtt_pread(struct drm_i915_gem_object *obj,
1067                    const struct drm_i915_gem_pread *args)
1068 {
1069         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1070         struct i915_ggtt *ggtt = &i915->ggtt;
1071         intel_wakeref_t wakeref;
1072         struct drm_mm_node node;
1073         struct i915_vma *vma;
1074         void __user *user_data;
1075         u64 remain, offset;
1076         int ret;
1077
1078         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1079         if (ret)
1080                 return ret;
1081
1082         wakeref = intel_runtime_pm_get(i915);
1083         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1084                                        PIN_MAPPABLE |
1085                                        PIN_NONFAULT |
1086                                        PIN_NONBLOCK);
1087         if (!IS_ERR(vma)) {
1088                 node.start = i915_ggtt_offset(vma);
1089                 node.allocated = false;
1090                 ret = i915_vma_put_fence(vma);
1091                 if (ret) {
1092                         i915_vma_unpin(vma);
1093                         vma = ERR_PTR(ret);
1094                 }
1095         }
1096         if (IS_ERR(vma)) {
1097                 ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1098                 if (ret)
1099                         goto out_unlock;
1100                 GEM_BUG_ON(!node.allocated);
1101         }
1102
1103         ret = i915_gem_object_set_to_gtt_domain(obj, false);
1104         if (ret)
1105                 goto out_unpin;
1106
1107         mutex_unlock(&i915->drm.struct_mutex);
1108
1109         user_data = u64_to_user_ptr(args->data_ptr);
1110         remain = args->size;
1111         offset = args->offset;
1112
1113         while (remain > 0) {
1114                 /* Operation in this page
1115                  *
1116                  * page_base = page offset within aperture
1117                  * page_offset = offset within page
1118                  * page_length = bytes to copy for this page
1119                  */
1120                 u32 page_base = node.start;
1121                 unsigned page_offset = offset_in_page(offset);
1122                 unsigned page_length = PAGE_SIZE - page_offset;
1123                 page_length = remain < page_length ? remain : page_length;
1124                 if (node.allocated) {
1125                         wmb();
1126                         ggtt->vm.insert_page(&ggtt->vm,
1127                                              i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1128                                              node.start, I915_CACHE_NONE, 0);
1129                         wmb();
1130                 } else {
1131                         page_base += offset & PAGE_MASK;
1132                 }
1133
1134                 if (gtt_user_read(&ggtt->iomap, page_base, page_offset,
1135                                   user_data, page_length)) {
1136                         ret = -EFAULT;
1137                         break;
1138                 }
1139
1140                 remain -= page_length;
1141                 user_data += page_length;
1142                 offset += page_length;
1143         }
1144
1145         mutex_lock(&i915->drm.struct_mutex);
1146 out_unpin:
1147         if (node.allocated) {
1148                 wmb();
1149                 ggtt->vm.clear_range(&ggtt->vm, node.start, node.size);
1150                 remove_mappable_node(&node);
1151         } else {
1152                 i915_vma_unpin(vma);
1153         }
1154 out_unlock:
1155         intel_runtime_pm_put(i915, wakeref);
1156         mutex_unlock(&i915->drm.struct_mutex);
1157
1158         return ret;
1159 }
1160
1161 /**
1162  * Reads data from the object referenced by handle.
1163  * @dev: drm device pointer
1164  * @data: ioctl data blob
1165  * @file: drm file pointer
1166  *
1167  * On error, the contents of *data are undefined.
1168  */
1169 int
1170 i915_gem_pread_ioctl(struct drm_device *dev, void *data,
1171                      struct drm_file *file)
1172 {
1173         struct drm_i915_gem_pread *args = data;
1174         struct drm_i915_gem_object *obj;
1175         int ret;
1176
1177         if (args->size == 0)
1178                 return 0;
1179
1180         if (!access_ok(u64_to_user_ptr(args->data_ptr),
1181                        args->size))
1182                 return -EFAULT;
1183
1184         obj = i915_gem_object_lookup(file, args->handle);
1185         if (!obj)
1186                 return -ENOENT;
1187
1188         /* Bounds check source.  */
1189         if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1190                 ret = -EINVAL;
1191                 goto out;
1192         }
1193
1194         trace_i915_gem_object_pread(obj, args->offset, args->size);
1195
1196         ret = i915_gem_object_wait(obj,
1197                                    I915_WAIT_INTERRUPTIBLE,
1198                                    MAX_SCHEDULE_TIMEOUT,
1199                                    to_rps_client(file));
1200         if (ret)
1201                 goto out;
1202
1203         ret = i915_gem_object_pin_pages(obj);
1204         if (ret)
1205                 goto out;
1206
1207         ret = i915_gem_shmem_pread(obj, args);
1208         if (ret == -EFAULT || ret == -ENODEV)
1209                 ret = i915_gem_gtt_pread(obj, args);
1210
1211         i915_gem_object_unpin_pages(obj);
1212 out:
1213         i915_gem_object_put(obj);
1214         return ret;
1215 }
1216
1217 /* This is the fast write path which cannot handle
1218  * page faults in the source data
1219  */
1220
1221 static inline bool
1222 ggtt_write(struct io_mapping *mapping,
1223            loff_t base, int offset,
1224            char __user *user_data, int length)
1225 {
1226         void __iomem *vaddr;
1227         unsigned long unwritten;
1228
1229         /* We can use the cpu mem copy function because this is X86. */
1230         vaddr = io_mapping_map_atomic_wc(mapping, base);
1231         unwritten = __copy_from_user_inatomic_nocache((void __force *)vaddr + offset,
1232                                                       user_data, length);
1233         io_mapping_unmap_atomic(vaddr);
1234         if (unwritten) {
1235                 vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
1236                 unwritten = copy_from_user((void __force *)vaddr + offset,
1237                                            user_data, length);
1238                 io_mapping_unmap(vaddr);
1239         }
1240
1241         return unwritten;
1242 }
1243
1244 /**
1245  * This is the fast pwrite path, where we copy the data directly from the
1246  * user into the GTT, uncached.
1247  * @obj: i915 GEM object
1248  * @args: pwrite arguments structure
1249  */
1250 static int
1251 i915_gem_gtt_pwrite_fast(struct drm_i915_gem_object *obj,
1252                          const struct drm_i915_gem_pwrite *args)
1253 {
1254         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1255         struct i915_ggtt *ggtt = &i915->ggtt;
1256         intel_wakeref_t wakeref;
1257         struct drm_mm_node node;
1258         struct i915_vma *vma;
1259         u64 remain, offset;
1260         void __user *user_data;
1261         int ret;
1262
1263         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1264         if (ret)
1265                 return ret;
1266
1267         if (i915_gem_object_has_struct_page(obj)) {
1268                 /*
1269                  * Avoid waking the device up if we can fallback, as
1270                  * waking/resuming is very slow (worst-case 10-100 ms
1271                  * depending on PCI sleeps and our own resume time).
1272                  * This easily dwarfs any performance advantage from
1273                  * using the cache bypass of indirect GGTT access.
1274                  */
1275                 wakeref = intel_runtime_pm_get_if_in_use(i915);
1276                 if (!wakeref) {
1277                         ret = -EFAULT;
1278                         goto out_unlock;
1279                 }
1280         } else {
1281                 /* No backing pages, no fallback, we must force GGTT access */
1282                 wakeref = intel_runtime_pm_get(i915);
1283         }
1284
1285         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1286                                        PIN_MAPPABLE |
1287                                        PIN_NONFAULT |
1288                                        PIN_NONBLOCK);
1289         if (!IS_ERR(vma)) {
1290                 node.start = i915_ggtt_offset(vma);
1291                 node.allocated = false;
1292                 ret = i915_vma_put_fence(vma);
1293                 if (ret) {
1294                         i915_vma_unpin(vma);
1295                         vma = ERR_PTR(ret);
1296                 }
1297         }
1298         if (IS_ERR(vma)) {
1299                 ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1300                 if (ret)
1301                         goto out_rpm;
1302                 GEM_BUG_ON(!node.allocated);
1303         }
1304
1305         ret = i915_gem_object_set_to_gtt_domain(obj, true);
1306         if (ret)
1307                 goto out_unpin;
1308
1309         mutex_unlock(&i915->drm.struct_mutex);
1310
1311         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
1312
1313         user_data = u64_to_user_ptr(args->data_ptr);
1314         offset = args->offset;
1315         remain = args->size;
1316         while (remain) {
1317                 /* Operation in this page
1318                  *
1319                  * page_base = page offset within aperture
1320                  * page_offset = offset within page
1321                  * page_length = bytes to copy for this page
1322                  */
1323                 u32 page_base = node.start;
1324                 unsigned int page_offset = offset_in_page(offset);
1325                 unsigned int page_length = PAGE_SIZE - page_offset;
1326                 page_length = remain < page_length ? remain : page_length;
1327                 if (node.allocated) {
1328                         wmb(); /* flush the write before we modify the GGTT */
1329                         ggtt->vm.insert_page(&ggtt->vm,
1330                                              i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1331                                              node.start, I915_CACHE_NONE, 0);
1332                         wmb(); /* flush modifications to the GGTT (insert_page) */
1333                 } else {
1334                         page_base += offset & PAGE_MASK;
1335                 }
1336                 /* If we get a fault while copying data, then (presumably) our
1337                  * source page isn't available.  Return the error and we'll
1338                  * retry in the slow path.
1339                  * If the object is non-shmem backed, we retry again with the
1340                  * path that handles page fault.
1341                  */
1342                 if (ggtt_write(&ggtt->iomap, page_base, page_offset,
1343                                user_data, page_length)) {
1344                         ret = -EFAULT;
1345                         break;
1346                 }
1347
1348                 remain -= page_length;
1349                 user_data += page_length;
1350                 offset += page_length;
1351         }
1352         intel_fb_obj_flush(obj, ORIGIN_CPU);
1353
1354         mutex_lock(&i915->drm.struct_mutex);
1355 out_unpin:
1356         if (node.allocated) {
1357                 wmb();
1358                 ggtt->vm.clear_range(&ggtt->vm, node.start, node.size);
1359                 remove_mappable_node(&node);
1360         } else {
1361                 i915_vma_unpin(vma);
1362         }
1363 out_rpm:
1364         intel_runtime_pm_put(i915, wakeref);
1365 out_unlock:
1366         mutex_unlock(&i915->drm.struct_mutex);
1367         return ret;
1368 }
1369
1370 /* Per-page copy function for the shmem pwrite fastpath.
1371  * Flushes invalid cachelines before writing to the target if
1372  * needs_clflush_before is set and flushes out any written cachelines after
1373  * writing if needs_clflush is set.
1374  */
1375 static int
1376 shmem_pwrite(struct page *page, int offset, int len, char __user *user_data,
1377              bool needs_clflush_before,
1378              bool needs_clflush_after)
1379 {
1380         char *vaddr;
1381         int ret;
1382
1383         vaddr = kmap(page);
1384
1385         if (needs_clflush_before)
1386                 drm_clflush_virt_range(vaddr + offset, len);
1387
1388         ret = __copy_from_user(vaddr + offset, user_data, len);
1389         if (!ret && needs_clflush_after)
1390                 drm_clflush_virt_range(vaddr + offset, len);
1391
1392         kunmap(page);
1393
1394         return ret ? -EFAULT : 0;
1395 }
1396
1397 static int
1398 i915_gem_shmem_pwrite(struct drm_i915_gem_object *obj,
1399                       const struct drm_i915_gem_pwrite *args)
1400 {
1401         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1402         void __user *user_data;
1403         u64 remain;
1404         unsigned int partial_cacheline_write;
1405         unsigned int needs_clflush;
1406         unsigned int offset, idx;
1407         int ret;
1408
1409         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1410         if (ret)
1411                 return ret;
1412
1413         ret = i915_gem_obj_prepare_shmem_write(obj, &needs_clflush);
1414         mutex_unlock(&i915->drm.struct_mutex);
1415         if (ret)
1416                 return ret;
1417
1418         /* If we don't overwrite a cacheline completely we need to be
1419          * careful to have up-to-date data by first clflushing. Don't
1420          * overcomplicate things and flush the entire patch.
1421          */
1422         partial_cacheline_write = 0;
1423         if (needs_clflush & CLFLUSH_BEFORE)
1424                 partial_cacheline_write = boot_cpu_data.x86_clflush_size - 1;
1425
1426         user_data = u64_to_user_ptr(args->data_ptr);
1427         remain = args->size;
1428         offset = offset_in_page(args->offset);
1429         for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
1430                 struct page *page = i915_gem_object_get_page(obj, idx);
1431                 unsigned int length = min_t(u64, remain, PAGE_SIZE - offset);
1432
1433                 ret = shmem_pwrite(page, offset, length, user_data,
1434                                    (offset | length) & partial_cacheline_write,
1435                                    needs_clflush & CLFLUSH_AFTER);
1436                 if (ret)
1437                         break;
1438
1439                 remain -= length;
1440                 user_data += length;
1441                 offset = 0;
1442         }
1443
1444         intel_fb_obj_flush(obj, ORIGIN_CPU);
1445         i915_gem_obj_finish_shmem_access(obj);
1446         return ret;
1447 }
1448
1449 /**
1450  * Writes data to the object referenced by handle.
1451  * @dev: drm device
1452  * @data: ioctl data blob
1453  * @file: drm file
1454  *
1455  * On error, the contents of the buffer that were to be modified are undefined.
1456  */
1457 int
1458 i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
1459                       struct drm_file *file)
1460 {
1461         struct drm_i915_gem_pwrite *args = data;
1462         struct drm_i915_gem_object *obj;
1463         int ret;
1464
1465         if (args->size == 0)
1466                 return 0;
1467
1468         if (!access_ok(u64_to_user_ptr(args->data_ptr), args->size))
1469                 return -EFAULT;
1470
1471         obj = i915_gem_object_lookup(file, args->handle);
1472         if (!obj)
1473                 return -ENOENT;
1474
1475         /* Bounds check destination. */
1476         if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1477                 ret = -EINVAL;
1478                 goto err;
1479         }
1480
1481         /* Writes not allowed into this read-only object */
1482         if (i915_gem_object_is_readonly(obj)) {
1483                 ret = -EINVAL;
1484                 goto err;
1485         }
1486
1487         trace_i915_gem_object_pwrite(obj, args->offset, args->size);
1488
1489         ret = -ENODEV;
1490         if (obj->ops->pwrite)
1491                 ret = obj->ops->pwrite(obj, args);
1492         if (ret != -ENODEV)
1493                 goto err;
1494
1495         ret = i915_gem_object_wait(obj,
1496                                    I915_WAIT_INTERRUPTIBLE |
1497                                    I915_WAIT_ALL,
1498                                    MAX_SCHEDULE_TIMEOUT,
1499                                    to_rps_client(file));
1500         if (ret)
1501                 goto err;
1502
1503         ret = i915_gem_object_pin_pages(obj);
1504         if (ret)
1505                 goto err;
1506
1507         ret = -EFAULT;
1508         /* We can only do the GTT pwrite on untiled buffers, as otherwise
1509          * it would end up going through the fenced access, and we'll get
1510          * different detiling behavior between reading and writing.
1511          * pread/pwrite currently are reading and writing from the CPU
1512          * perspective, requiring manual detiling by the client.
1513          */
1514         if (!i915_gem_object_has_struct_page(obj) ||
1515             cpu_write_needs_clflush(obj))
1516                 /* Note that the gtt paths might fail with non-page-backed user
1517                  * pointers (e.g. gtt mappings when moving data between
1518                  * textures). Fallback to the shmem path in that case.
1519                  */
1520                 ret = i915_gem_gtt_pwrite_fast(obj, args);
1521
1522         if (ret == -EFAULT || ret == -ENOSPC) {
1523                 if (obj->phys_handle)
1524                         ret = i915_gem_phys_pwrite(obj, args, file);
1525                 else
1526                         ret = i915_gem_shmem_pwrite(obj, args);
1527         }
1528
1529         i915_gem_object_unpin_pages(obj);
1530 err:
1531         i915_gem_object_put(obj);
1532         return ret;
1533 }
1534
1535 static void i915_gem_object_bump_inactive_ggtt(struct drm_i915_gem_object *obj)
1536 {
1537         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1538         struct list_head *list;
1539         struct i915_vma *vma;
1540
1541         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
1542
1543         mutex_lock(&i915->ggtt.vm.mutex);
1544         for_each_ggtt_vma(vma, obj) {
1545                 if (!drm_mm_node_allocated(&vma->node))
1546                         continue;
1547
1548                 list_move_tail(&vma->vm_link, &vma->vm->bound_list);
1549         }
1550         mutex_unlock(&i915->ggtt.vm.mutex);
1551
1552         spin_lock(&i915->mm.obj_lock);
1553         list = obj->bind_count ? &i915->mm.bound_list : &i915->mm.unbound_list;
1554         list_move_tail(&obj->mm.link, list);
1555         spin_unlock(&i915->mm.obj_lock);
1556 }
1557
1558 /**
1559  * Called when user space prepares to use an object with the CPU, either
1560  * through the mmap ioctl's mapping or a GTT mapping.
1561  * @dev: drm device
1562  * @data: ioctl data blob
1563  * @file: drm file
1564  */
1565 int
1566 i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
1567                           struct drm_file *file)
1568 {
1569         struct drm_i915_gem_set_domain *args = data;
1570         struct drm_i915_gem_object *obj;
1571         u32 read_domains = args->read_domains;
1572         u32 write_domain = args->write_domain;
1573         int err;
1574
1575         /* Only handle setting domains to types used by the CPU. */
1576         if ((write_domain | read_domains) & I915_GEM_GPU_DOMAINS)
1577                 return -EINVAL;
1578
1579         /* Having something in the write domain implies it's in the read
1580          * domain, and only that read domain.  Enforce that in the request.
1581          */
1582         if (write_domain != 0 && read_domains != write_domain)
1583                 return -EINVAL;
1584
1585         obj = i915_gem_object_lookup(file, args->handle);
1586         if (!obj)
1587                 return -ENOENT;
1588
1589         /* Try to flush the object off the GPU without holding the lock.
1590          * We will repeat the flush holding the lock in the normal manner
1591          * to catch cases where we are gazumped.
1592          */
1593         err = i915_gem_object_wait(obj,
1594                                    I915_WAIT_INTERRUPTIBLE |
1595                                    I915_WAIT_PRIORITY |
1596                                    (write_domain ? I915_WAIT_ALL : 0),
1597                                    MAX_SCHEDULE_TIMEOUT,
1598                                    to_rps_client(file));
1599         if (err)
1600                 goto out;
1601
1602         /*
1603          * Proxy objects do not control access to the backing storage, ergo
1604          * they cannot be used as a means to manipulate the cache domain
1605          * tracking for that backing storage. The proxy object is always
1606          * considered to be outside of any cache domain.
1607          */
1608         if (i915_gem_object_is_proxy(obj)) {
1609                 err = -ENXIO;
1610                 goto out;
1611         }
1612
1613         /*
1614          * Flush and acquire obj->pages so that we are coherent through
1615          * direct access in memory with previous cached writes through
1616          * shmemfs and that our cache domain tracking remains valid.
1617          * For example, if the obj->filp was moved to swap without us
1618          * being notified and releasing the pages, we would mistakenly
1619          * continue to assume that the obj remained out of the CPU cached
1620          * domain.
1621          */
1622         err = i915_gem_object_pin_pages(obj);
1623         if (err)
1624                 goto out;
1625
1626         err = i915_mutex_lock_interruptible(dev);
1627         if (err)
1628                 goto out_unpin;
1629
1630         if (read_domains & I915_GEM_DOMAIN_WC)
1631                 err = i915_gem_object_set_to_wc_domain(obj, write_domain);
1632         else if (read_domains & I915_GEM_DOMAIN_GTT)
1633                 err = i915_gem_object_set_to_gtt_domain(obj, write_domain);
1634         else
1635                 err = i915_gem_object_set_to_cpu_domain(obj, write_domain);
1636
1637         /* And bump the LRU for this access */
1638         i915_gem_object_bump_inactive_ggtt(obj);
1639
1640         mutex_unlock(&dev->struct_mutex);
1641
1642         if (write_domain != 0)
1643                 intel_fb_obj_invalidate(obj,
1644                                         fb_write_origin(obj, write_domain));
1645
1646 out_unpin:
1647         i915_gem_object_unpin_pages(obj);
1648 out:
1649         i915_gem_object_put(obj);
1650         return err;
1651 }
1652
1653 /**
1654  * Called when user space has done writes to this buffer
1655  * @dev: drm device
1656  * @data: ioctl data blob
1657  * @file: drm file
1658  */
1659 int
1660 i915_gem_sw_finish_ioctl(struct drm_device *dev, void *data,
1661                          struct drm_file *file)
1662 {
1663         struct drm_i915_gem_sw_finish *args = data;
1664         struct drm_i915_gem_object *obj;
1665
1666         obj = i915_gem_object_lookup(file, args->handle);
1667         if (!obj)
1668                 return -ENOENT;
1669
1670         /*
1671          * Proxy objects are barred from CPU access, so there is no
1672          * need to ban sw_finish as it is a nop.
1673          */
1674
1675         /* Pinned buffers may be scanout, so flush the cache */
1676         i915_gem_object_flush_if_display(obj);
1677         i915_gem_object_put(obj);
1678
1679         return 0;
1680 }
1681
1682 static inline bool
1683 __vma_matches(struct vm_area_struct *vma, struct file *filp,
1684               unsigned long addr, unsigned long size)
1685 {
1686         if (vma->vm_file != filp)
1687                 return false;
1688
1689         return vma->vm_start == addr && (vma->vm_end - vma->vm_start) == size;
1690 }
1691
1692 /**
1693  * i915_gem_mmap_ioctl - Maps the contents of an object, returning the address
1694  *                       it is mapped to.
1695  * @dev: drm device
1696  * @data: ioctl data blob
1697  * @file: drm file
1698  *
1699  * While the mapping holds a reference on the contents of the object, it doesn't
1700  * imply a ref on the object itself.
1701  *
1702  * IMPORTANT:
1703  *
1704  * DRM driver writers who look a this function as an example for how to do GEM
1705  * mmap support, please don't implement mmap support like here. The modern way
1706  * to implement DRM mmap support is with an mmap offset ioctl (like
1707  * i915_gem_mmap_gtt) and then using the mmap syscall on the DRM fd directly.
1708  * That way debug tooling like valgrind will understand what's going on, hiding
1709  * the mmap call in a driver private ioctl will break that. The i915 driver only
1710  * does cpu mmaps this way because we didn't know better.
1711  */
1712 int
1713 i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
1714                     struct drm_file *file)
1715 {
1716         struct drm_i915_gem_mmap *args = data;
1717         struct drm_i915_gem_object *obj;
1718         unsigned long addr;
1719
1720         if (args->flags & ~(I915_MMAP_WC))
1721                 return -EINVAL;
1722
1723         if (args->flags & I915_MMAP_WC && !boot_cpu_has(X86_FEATURE_PAT))
1724                 return -ENODEV;
1725
1726         obj = i915_gem_object_lookup(file, args->handle);
1727         if (!obj)
1728                 return -ENOENT;
1729
1730         /* prime objects have no backing filp to GEM mmap
1731          * pages from.
1732          */
1733         if (!obj->base.filp) {
1734                 i915_gem_object_put(obj);
1735                 return -ENXIO;
1736         }
1737
1738         addr = vm_mmap(obj->base.filp, 0, args->size,
1739                        PROT_READ | PROT_WRITE, MAP_SHARED,
1740                        args->offset);
1741         if (IS_ERR_VALUE(addr))
1742                 goto err;
1743
1744         if (args->flags & I915_MMAP_WC) {
1745                 struct mm_struct *mm = current->mm;
1746                 struct vm_area_struct *vma;
1747
1748                 if (down_write_killable(&mm->mmap_sem)) {
1749                         i915_gem_object_put(obj);
1750                         return -EINTR;
1751                 }
1752                 vma = find_vma(mm, addr);
1753                 if (vma && __vma_matches(vma, obj->base.filp, addr, args->size))
1754                         vma->vm_page_prot =
1755                                 pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
1756                 else
1757                         addr = -ENOMEM;
1758                 up_write(&mm->mmap_sem);
1759                 if (IS_ERR_VALUE(addr))
1760                         goto err;
1761
1762                 /* This may race, but that's ok, it only gets set */
1763                 WRITE_ONCE(obj->frontbuffer_ggtt_origin, ORIGIN_CPU);
1764         }
1765         i915_gem_object_put(obj);
1766
1767         args->addr_ptr = (u64)addr;
1768
1769         return 0;
1770
1771 err:
1772         i915_gem_object_put(obj);
1773
1774         return addr;
1775 }
1776
1777 static unsigned int tile_row_pages(const struct drm_i915_gem_object *obj)
1778 {
1779         return i915_gem_object_get_tile_row_size(obj) >> PAGE_SHIFT;
1780 }
1781
1782 /**
1783  * i915_gem_mmap_gtt_version - report the current feature set for GTT mmaps
1784  *
1785  * A history of the GTT mmap interface:
1786  *
1787  * 0 - Everything had to fit into the GTT. Both parties of a memcpy had to
1788  *     aligned and suitable for fencing, and still fit into the available
1789  *     mappable space left by the pinned display objects. A classic problem
1790  *     we called the page-fault-of-doom where we would ping-pong between
1791  *     two objects that could not fit inside the GTT and so the memcpy
1792  *     would page one object in at the expense of the other between every
1793  *     single byte.
1794  *
1795  * 1 - Objects can be any size, and have any compatible fencing (X Y, or none
1796  *     as set via i915_gem_set_tiling() [DRM_I915_GEM_SET_TILING]). If the
1797  *     object is too large for the available space (or simply too large
1798  *     for the mappable aperture!), a view is created instead and faulted
1799  *     into userspace. (This view is aligned and sized appropriately for
1800  *     fenced access.)
1801  *
1802  * 2 - Recognise WC as a separate cache domain so that we can flush the
1803  *     delayed writes via GTT before performing direct access via WC.
1804  *
1805  * Restrictions:
1806  *
1807  *  * snoopable objects cannot be accessed via the GTT. It can cause machine
1808  *    hangs on some architectures, corruption on others. An attempt to service
1809  *    a GTT page fault from a snoopable object will generate a SIGBUS.
1810  *
1811  *  * the object must be able to fit into RAM (physical memory, though no
1812  *    limited to the mappable aperture).
1813  *
1814  *
1815  * Caveats:
1816  *
1817  *  * a new GTT page fault will synchronize rendering from the GPU and flush
1818  *    all data to system memory. Subsequent access will not be synchronized.
1819  *
1820  *  * all mappings are revoked on runtime device suspend.
1821  *
1822  *  * there are only 8, 16 or 32 fence registers to share between all users
1823  *    (older machines require fence register for display and blitter access
1824  *    as well). Contention of the fence registers will cause the previous users
1825  *    to be unmapped and any new access will generate new page faults.
1826  *
1827  *  * running out of memory while servicing a fault may generate a SIGBUS,
1828  *    rather than the expected SIGSEGV.
1829  */
1830 int i915_gem_mmap_gtt_version(void)
1831 {
1832         return 2;
1833 }
1834
1835 static inline struct i915_ggtt_view
1836 compute_partial_view(const struct drm_i915_gem_object *obj,
1837                      pgoff_t page_offset,
1838                      unsigned int chunk)
1839 {
1840         struct i915_ggtt_view view;
1841
1842         if (i915_gem_object_is_tiled(obj))
1843                 chunk = roundup(chunk, tile_row_pages(obj));
1844
1845         view.type = I915_GGTT_VIEW_PARTIAL;
1846         view.partial.offset = rounddown(page_offset, chunk);
1847         view.partial.size =
1848                 min_t(unsigned int, chunk,
1849                       (obj->base.size >> PAGE_SHIFT) - view.partial.offset);
1850
1851         /* If the partial covers the entire object, just create a normal VMA. */
1852         if (chunk >= obj->base.size >> PAGE_SHIFT)
1853                 view.type = I915_GGTT_VIEW_NORMAL;
1854
1855         return view;
1856 }
1857
1858 /**
1859  * i915_gem_fault - fault a page into the GTT
1860  * @vmf: fault info
1861  *
1862  * The fault handler is set up by drm_gem_mmap() when a object is GTT mapped
1863  * from userspace.  The fault handler takes care of binding the object to
1864  * the GTT (if needed), allocating and programming a fence register (again,
1865  * only if needed based on whether the old reg is still valid or the object
1866  * is tiled) and inserting a new PTE into the faulting process.
1867  *
1868  * Note that the faulting process may involve evicting existing objects
1869  * from the GTT and/or fence registers to make room.  So performance may
1870  * suffer if the GTT working set is large or there are few fence registers
1871  * left.
1872  *
1873  * The current feature set supported by i915_gem_fault() and thus GTT mmaps
1874  * is exposed via I915_PARAM_MMAP_GTT_VERSION (see i915_gem_mmap_gtt_version).
1875  */
1876 vm_fault_t i915_gem_fault(struct vm_fault *vmf)
1877 {
1878 #define MIN_CHUNK_PAGES (SZ_1M >> PAGE_SHIFT)
1879         struct vm_area_struct *area = vmf->vma;
1880         struct drm_i915_gem_object *obj = to_intel_bo(area->vm_private_data);
1881         struct drm_device *dev = obj->base.dev;
1882         struct drm_i915_private *dev_priv = to_i915(dev);
1883         struct i915_ggtt *ggtt = &dev_priv->ggtt;
1884         bool write = area->vm_flags & VM_WRITE;
1885         intel_wakeref_t wakeref;
1886         struct i915_vma *vma;
1887         pgoff_t page_offset;
1888         int ret;
1889
1890         /* Sanity check that we allow writing into this object */
1891         if (i915_gem_object_is_readonly(obj) && write)
1892                 return VM_FAULT_SIGBUS;
1893
1894         /* We don't use vmf->pgoff since that has the fake offset */
1895         page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT;
1896
1897         trace_i915_gem_object_fault(obj, page_offset, true, write);
1898
1899         /* Try to flush the object off the GPU first without holding the lock.
1900          * Upon acquiring the lock, we will perform our sanity checks and then
1901          * repeat the flush holding the lock in the normal manner to catch cases
1902          * where we are gazumped.
1903          */
1904         ret = i915_gem_object_wait(obj,
1905                                    I915_WAIT_INTERRUPTIBLE,
1906                                    MAX_SCHEDULE_TIMEOUT,
1907                                    NULL);
1908         if (ret)
1909                 goto err;
1910
1911         ret = i915_gem_object_pin_pages(obj);
1912         if (ret)
1913                 goto err;
1914
1915         wakeref = intel_runtime_pm_get(dev_priv);
1916
1917         ret = i915_mutex_lock_interruptible(dev);
1918         if (ret)
1919                 goto err_rpm;
1920
1921         /* Access to snoopable pages through the GTT is incoherent. */
1922         if (obj->cache_level != I915_CACHE_NONE && !HAS_LLC(dev_priv)) {
1923                 ret = -EFAULT;
1924                 goto err_unlock;
1925         }
1926
1927
1928         /* Now pin it into the GTT as needed */
1929         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1930                                        PIN_MAPPABLE |
1931                                        PIN_NONBLOCK |
1932                                        PIN_NONFAULT);
1933         if (IS_ERR(vma)) {
1934                 /* Use a partial view if it is bigger than available space */
1935                 struct i915_ggtt_view view =
1936                         compute_partial_view(obj, page_offset, MIN_CHUNK_PAGES);
1937                 unsigned int flags;
1938
1939                 flags = PIN_MAPPABLE;
1940                 if (view.type == I915_GGTT_VIEW_NORMAL)
1941                         flags |= PIN_NONBLOCK; /* avoid warnings for pinned */
1942
1943                 /*
1944                  * Userspace is now writing through an untracked VMA, abandon
1945                  * all hope that the hardware is able to track future writes.
1946                  */
1947                 obj->frontbuffer_ggtt_origin = ORIGIN_CPU;
1948
1949                 vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, flags);
1950                 if (IS_ERR(vma) && !view.type) {
1951                         flags = PIN_MAPPABLE;
1952                         view.type = I915_GGTT_VIEW_PARTIAL;
1953                         vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, flags);
1954                 }
1955         }
1956         if (IS_ERR(vma)) {
1957                 ret = PTR_ERR(vma);
1958                 goto err_unlock;
1959         }
1960
1961         ret = i915_gem_object_set_to_gtt_domain(obj, write);
1962         if (ret)
1963                 goto err_unpin;
1964
1965         ret = i915_vma_pin_fence(vma);
1966         if (ret)
1967                 goto err_unpin;
1968
1969         /* Finally, remap it using the new GTT offset */
1970         ret = remap_io_mapping(area,
1971                                area->vm_start + (vma->ggtt_view.partial.offset << PAGE_SHIFT),
1972                                (ggtt->gmadr.start + vma->node.start) >> PAGE_SHIFT,
1973                                min_t(u64, vma->size, area->vm_end - area->vm_start),
1974                                &ggtt->iomap);
1975         if (ret)
1976                 goto err_fence;
1977
1978         /* Mark as being mmapped into userspace for later revocation */
1979         assert_rpm_wakelock_held(dev_priv);
1980         if (!i915_vma_set_userfault(vma) && !obj->userfault_count++)
1981                 list_add(&obj->userfault_link, &dev_priv->mm.userfault_list);
1982         GEM_BUG_ON(!obj->userfault_count);
1983
1984         i915_vma_set_ggtt_write(vma);
1985
1986 err_fence:
1987         i915_vma_unpin_fence(vma);
1988 err_unpin:
1989         __i915_vma_unpin(vma);
1990 err_unlock:
1991         mutex_unlock(&dev->struct_mutex);
1992 err_rpm:
1993         intel_runtime_pm_put(dev_priv, wakeref);
1994         i915_gem_object_unpin_pages(obj);
1995 err:
1996         switch (ret) {
1997         case -EIO:
1998                 /*
1999                  * We eat errors when the gpu is terminally wedged to avoid
2000                  * userspace unduly crashing (gl has no provisions for mmaps to
2001                  * fail). But any other -EIO isn't ours (e.g. swap in failure)
2002                  * and so needs to be reported.
2003                  */
2004                 if (!i915_terminally_wedged(&dev_priv->gpu_error))
2005                         return VM_FAULT_SIGBUS;
2006                 /* else: fall through */
2007         case -EAGAIN:
2008                 /*
2009                  * EAGAIN means the gpu is hung and we'll wait for the error
2010                  * handler to reset everything when re-faulting in
2011                  * i915_mutex_lock_interruptible.
2012                  */
2013         case 0:
2014         case -ERESTARTSYS:
2015         case -EINTR:
2016         case -EBUSY:
2017                 /*
2018                  * EBUSY is ok: this just means that another thread
2019                  * already did the job.
2020                  */
2021                 return VM_FAULT_NOPAGE;
2022         case -ENOMEM:
2023                 return VM_FAULT_OOM;
2024         case -ENOSPC:
2025         case -EFAULT:
2026                 return VM_FAULT_SIGBUS;
2027         default:
2028                 WARN_ONCE(ret, "unhandled error in i915_gem_fault: %i\n", ret);
2029                 return VM_FAULT_SIGBUS;
2030         }
2031 }
2032
2033 static void __i915_gem_object_release_mmap(struct drm_i915_gem_object *obj)
2034 {
2035         struct i915_vma *vma;
2036
2037         GEM_BUG_ON(!obj->userfault_count);
2038
2039         obj->userfault_count = 0;
2040         list_del(&obj->userfault_link);
2041         drm_vma_node_unmap(&obj->base.vma_node,
2042                            obj->base.dev->anon_inode->i_mapping);
2043
2044         for_each_ggtt_vma(vma, obj)
2045                 i915_vma_unset_userfault(vma);
2046 }
2047
2048 /**
2049  * i915_gem_release_mmap - remove physical page mappings
2050  * @obj: obj in question
2051  *
2052  * Preserve the reservation of the mmapping with the DRM core code, but
2053  * relinquish ownership of the pages back to the system.
2054  *
2055  * It is vital that we remove the page mapping if we have mapped a tiled
2056  * object through the GTT and then lose the fence register due to
2057  * resource pressure. Similarly if the object has been moved out of the
2058  * aperture, than pages mapped into userspace must be revoked. Removing the
2059  * mapping will then trigger a page fault on the next user access, allowing
2060  * fixup by i915_gem_fault().
2061  */
2062 void
2063 i915_gem_release_mmap(struct drm_i915_gem_object *obj)
2064 {
2065         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2066         intel_wakeref_t wakeref;
2067
2068         /* Serialisation between user GTT access and our code depends upon
2069          * revoking the CPU's PTE whilst the mutex is held. The next user
2070          * pagefault then has to wait until we release the mutex.
2071          *
2072          * Note that RPM complicates somewhat by adding an additional
2073          * requirement that operations to the GGTT be made holding the RPM
2074          * wakeref.
2075          */
2076         lockdep_assert_held(&i915->drm.struct_mutex);
2077         wakeref = intel_runtime_pm_get(i915);
2078
2079         if (!obj->userfault_count)
2080                 goto out;
2081
2082         __i915_gem_object_release_mmap(obj);
2083
2084         /* Ensure that the CPU's PTE are revoked and there are not outstanding
2085          * memory transactions from userspace before we return. The TLB
2086          * flushing implied above by changing the PTE above *should* be
2087          * sufficient, an extra barrier here just provides us with a bit
2088          * of paranoid documentation about our requirement to serialise
2089          * memory writes before touching registers / GSM.
2090          */
2091         wmb();
2092
2093 out:
2094         intel_runtime_pm_put(i915, wakeref);
2095 }
2096
2097 void i915_gem_runtime_suspend(struct drm_i915_private *dev_priv)
2098 {
2099         struct drm_i915_gem_object *obj, *on;
2100         int i;
2101
2102         /*
2103          * Only called during RPM suspend. All users of the userfault_list
2104          * must be holding an RPM wakeref to ensure that this can not
2105          * run concurrently with themselves (and use the struct_mutex for
2106          * protection between themselves).
2107          */
2108
2109         list_for_each_entry_safe(obj, on,
2110                                  &dev_priv->mm.userfault_list, userfault_link)
2111                 __i915_gem_object_release_mmap(obj);
2112
2113         /* The fence will be lost when the device powers down. If any were
2114          * in use by hardware (i.e. they are pinned), we should not be powering
2115          * down! All other fences will be reacquired by the user upon waking.
2116          */
2117         for (i = 0; i < dev_priv->num_fence_regs; i++) {
2118                 struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i];
2119
2120                 /* Ideally we want to assert that the fence register is not
2121                  * live at this point (i.e. that no piece of code will be
2122                  * trying to write through fence + GTT, as that both violates
2123                  * our tracking of activity and associated locking/barriers,
2124                  * but also is illegal given that the hw is powered down).
2125                  *
2126                  * Previously we used reg->pin_count as a "liveness" indicator.
2127                  * That is not sufficient, and we need a more fine-grained
2128                  * tool if we want to have a sanity check here.
2129                  */
2130
2131                 if (!reg->vma)
2132                         continue;
2133
2134                 GEM_BUG_ON(i915_vma_has_userfault(reg->vma));
2135                 reg->dirty = true;
2136         }
2137 }
2138
2139 static int i915_gem_object_create_mmap_offset(struct drm_i915_gem_object *obj)
2140 {
2141         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2142         int err;
2143
2144         err = drm_gem_create_mmap_offset(&obj->base);
2145         if (likely(!err))
2146                 return 0;
2147
2148         /* Attempt to reap some mmap space from dead objects */
2149         do {
2150                 err = i915_gem_wait_for_idle(dev_priv,
2151                                              I915_WAIT_INTERRUPTIBLE,
2152                                              MAX_SCHEDULE_TIMEOUT);
2153                 if (err)
2154                         break;
2155
2156                 i915_gem_drain_freed_objects(dev_priv);
2157                 err = drm_gem_create_mmap_offset(&obj->base);
2158                 if (!err)
2159                         break;
2160
2161         } while (flush_delayed_work(&dev_priv->gt.retire_work));
2162
2163         return err;
2164 }
2165
2166 static void i915_gem_object_free_mmap_offset(struct drm_i915_gem_object *obj)
2167 {
2168         drm_gem_free_mmap_offset(&obj->base);
2169 }
2170
2171 int
2172 i915_gem_mmap_gtt(struct drm_file *file,
2173                   struct drm_device *dev,
2174                   u32 handle,
2175                   u64 *offset)
2176 {
2177         struct drm_i915_gem_object *obj;
2178         int ret;
2179
2180         obj = i915_gem_object_lookup(file, handle);
2181         if (!obj)
2182                 return -ENOENT;
2183
2184         ret = i915_gem_object_create_mmap_offset(obj);
2185         if (ret == 0)
2186                 *offset = drm_vma_node_offset_addr(&obj->base.vma_node);
2187
2188         i915_gem_object_put(obj);
2189         return ret;
2190 }
2191
2192 /**
2193  * i915_gem_mmap_gtt_ioctl - prepare an object for GTT mmap'ing
2194  * @dev: DRM device
2195  * @data: GTT mapping ioctl data
2196  * @file: GEM object info
2197  *
2198  * Simply returns the fake offset to userspace so it can mmap it.
2199  * The mmap call will end up in drm_gem_mmap(), which will set things
2200  * up so we can get faults in the handler above.
2201  *
2202  * The fault handler will take care of binding the object into the GTT
2203  * (since it may have been evicted to make room for something), allocating
2204  * a fence register, and mapping the appropriate aperture address into
2205  * userspace.
2206  */
2207 int
2208 i915_gem_mmap_gtt_ioctl(struct drm_device *dev, void *data,
2209                         struct drm_file *file)
2210 {
2211         struct drm_i915_gem_mmap_gtt *args = data;
2212
2213         return i915_gem_mmap_gtt(file, dev, args->handle, &args->offset);
2214 }
2215
2216 /* Immediately discard the backing storage */
2217 static void
2218 i915_gem_object_truncate(struct drm_i915_gem_object *obj)
2219 {
2220         i915_gem_object_free_mmap_offset(obj);
2221
2222         if (obj->base.filp == NULL)
2223                 return;
2224
2225         /* Our goal here is to return as much of the memory as
2226          * is possible back to the system as we are called from OOM.
2227          * To do this we must instruct the shmfs to drop all of its
2228          * backing pages, *now*.
2229          */
2230         shmem_truncate_range(file_inode(obj->base.filp), 0, (loff_t)-1);
2231         obj->mm.madv = __I915_MADV_PURGED;
2232         obj->mm.pages = ERR_PTR(-EFAULT);
2233 }
2234
2235 /* Try to discard unwanted pages */
2236 void __i915_gem_object_invalidate(struct drm_i915_gem_object *obj)
2237 {
2238         struct address_space *mapping;
2239
2240         lockdep_assert_held(&obj->mm.lock);
2241         GEM_BUG_ON(i915_gem_object_has_pages(obj));
2242
2243         switch (obj->mm.madv) {
2244         case I915_MADV_DONTNEED:
2245                 i915_gem_object_truncate(obj);
2246         case __I915_MADV_PURGED:
2247                 return;
2248         }
2249
2250         if (obj->base.filp == NULL)
2251                 return;
2252
2253         mapping = obj->base.filp->f_mapping,
2254         invalidate_mapping_pages(mapping, 0, (loff_t)-1);
2255 }
2256
2257 /*
2258  * Move pages to appropriate lru and release the pagevec, decrementing the
2259  * ref count of those pages.
2260  */
2261 static void check_release_pagevec(struct pagevec *pvec)
2262 {
2263         check_move_unevictable_pages(pvec);
2264         __pagevec_release(pvec);
2265         cond_resched();
2266 }
2267
2268 static void
2269 i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj,
2270                               struct sg_table *pages)
2271 {
2272         struct sgt_iter sgt_iter;
2273         struct pagevec pvec;
2274         struct page *page;
2275
2276         __i915_gem_object_release_shmem(obj, pages, true);
2277
2278         i915_gem_gtt_finish_pages(obj, pages);
2279
2280         if (i915_gem_object_needs_bit17_swizzle(obj))
2281                 i915_gem_object_save_bit_17_swizzle(obj, pages);
2282
2283         mapping_clear_unevictable(file_inode(obj->base.filp)->i_mapping);
2284
2285         pagevec_init(&pvec);
2286         for_each_sgt_page(page, sgt_iter, pages) {
2287                 if (obj->mm.dirty)
2288                         set_page_dirty(page);
2289
2290                 if (obj->mm.madv == I915_MADV_WILLNEED)
2291                         mark_page_accessed(page);
2292
2293                 if (!pagevec_add(&pvec, page))
2294                         check_release_pagevec(&pvec);
2295         }
2296         if (pagevec_count(&pvec))
2297                 check_release_pagevec(&pvec);
2298         obj->mm.dirty = false;
2299
2300         sg_free_table(pages);
2301         kfree(pages);
2302 }
2303
2304 static void __i915_gem_object_reset_page_iter(struct drm_i915_gem_object *obj)
2305 {
2306         struct radix_tree_iter iter;
2307         void __rcu **slot;
2308
2309         rcu_read_lock();
2310         radix_tree_for_each_slot(slot, &obj->mm.get_page.radix, &iter, 0)
2311                 radix_tree_delete(&obj->mm.get_page.radix, iter.index);
2312         rcu_read_unlock();
2313 }
2314
2315 static struct sg_table *
2316 __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
2317 {
2318         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2319         struct sg_table *pages;
2320
2321         pages = fetch_and_zero(&obj->mm.pages);
2322         if (IS_ERR_OR_NULL(pages))
2323                 return pages;
2324
2325         spin_lock(&i915->mm.obj_lock);
2326         list_del(&obj->mm.link);
2327         spin_unlock(&i915->mm.obj_lock);
2328
2329         if (obj->mm.mapping) {
2330                 void *ptr;
2331
2332                 ptr = page_mask_bits(obj->mm.mapping);
2333                 if (is_vmalloc_addr(ptr))
2334                         vunmap(ptr);
2335                 else
2336                         kunmap(kmap_to_page(ptr));
2337
2338                 obj->mm.mapping = NULL;
2339         }
2340
2341         __i915_gem_object_reset_page_iter(obj);
2342         obj->mm.page_sizes.phys = obj->mm.page_sizes.sg = 0;
2343
2344         return pages;
2345 }
2346
2347 int __i915_gem_object_put_pages(struct drm_i915_gem_object *obj,
2348                                 enum i915_mm_subclass subclass)
2349 {
2350         struct sg_table *pages;
2351         int ret;
2352
2353         if (i915_gem_object_has_pinned_pages(obj))
2354                 return -EBUSY;
2355
2356         GEM_BUG_ON(obj->bind_count);
2357
2358         /* May be called by shrinker from within get_pages() (on another bo) */
2359         mutex_lock_nested(&obj->mm.lock, subclass);
2360         if (unlikely(atomic_read(&obj->mm.pages_pin_count))) {
2361                 ret = -EBUSY;
2362                 goto unlock;
2363         }
2364
2365         /*
2366          * ->put_pages might need to allocate memory for the bit17 swizzle
2367          * array, hence protect them from being reaped by removing them from gtt
2368          * lists early.
2369          */
2370         pages = __i915_gem_object_unset_pages(obj);
2371
2372         /*
2373          * XXX Temporary hijinx to avoid updating all backends to handle
2374          * NULL pages. In the future, when we have more asynchronous
2375          * get_pages backends we should be better able to handle the
2376          * cancellation of the async task in a more uniform manner.
2377          */
2378         if (!pages && !i915_gem_object_needs_async_cancel(obj))
2379                 pages = ERR_PTR(-EINVAL);
2380
2381         if (!IS_ERR(pages))
2382                 obj->ops->put_pages(obj, pages);
2383
2384         ret = 0;
2385 unlock:
2386         mutex_unlock(&obj->mm.lock);
2387
2388         return ret;
2389 }
2390
2391 bool i915_sg_trim(struct sg_table *orig_st)
2392 {
2393         struct sg_table new_st;
2394         struct scatterlist *sg, *new_sg;
2395         unsigned int i;
2396
2397         if (orig_st->nents == orig_st->orig_nents)
2398                 return false;
2399
2400         if (sg_alloc_table(&new_st, orig_st->nents, GFP_KERNEL | __GFP_NOWARN))
2401                 return false;
2402
2403         new_sg = new_st.sgl;
2404         for_each_sg(orig_st->sgl, sg, orig_st->nents, i) {
2405                 sg_set_page(new_sg, sg_page(sg), sg->length, 0);
2406                 sg_dma_address(new_sg) = sg_dma_address(sg);
2407                 sg_dma_len(new_sg) = sg_dma_len(sg);
2408
2409                 new_sg = sg_next(new_sg);
2410         }
2411         GEM_BUG_ON(new_sg); /* Should walk exactly nents and hit the end */
2412
2413         sg_free_table(orig_st);
2414
2415         *orig_st = new_st;
2416         return true;
2417 }
2418
2419 static int i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj)
2420 {
2421         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2422         const unsigned long page_count = obj->base.size / PAGE_SIZE;
2423         unsigned long i;
2424         struct address_space *mapping;
2425         struct sg_table *st;
2426         struct scatterlist *sg;
2427         struct sgt_iter sgt_iter;
2428         struct page *page;
2429         unsigned long last_pfn = 0;     /* suppress gcc warning */
2430         unsigned int max_segment = i915_sg_segment_size();
2431         unsigned int sg_page_sizes;
2432         struct pagevec pvec;
2433         gfp_t noreclaim;
2434         int ret;
2435
2436         /*
2437          * Assert that the object is not currently in any GPU domain. As it
2438          * wasn't in the GTT, there shouldn't be any way it could have been in
2439          * a GPU cache
2440          */
2441         GEM_BUG_ON(obj->read_domains & I915_GEM_GPU_DOMAINS);
2442         GEM_BUG_ON(obj->write_domain & I915_GEM_GPU_DOMAINS);
2443
2444         /*
2445          * If there's no chance of allocating enough pages for the whole
2446          * object, bail early.
2447          */
2448         if (page_count > totalram_pages())
2449                 return -ENOMEM;
2450
2451         st = kmalloc(sizeof(*st), GFP_KERNEL);
2452         if (st == NULL)
2453                 return -ENOMEM;
2454
2455 rebuild_st:
2456         if (sg_alloc_table(st, page_count, GFP_KERNEL)) {
2457                 kfree(st);
2458                 return -ENOMEM;
2459         }
2460
2461         /*
2462          * Get the list of pages out of our struct file.  They'll be pinned
2463          * at this point until we release them.
2464          *
2465          * Fail silently without starting the shrinker
2466          */
2467         mapping = obj->base.filp->f_mapping;
2468         mapping_set_unevictable(mapping);
2469         noreclaim = mapping_gfp_constraint(mapping, ~__GFP_RECLAIM);
2470         noreclaim |= __GFP_NORETRY | __GFP_NOWARN;
2471
2472         sg = st->sgl;
2473         st->nents = 0;
2474         sg_page_sizes = 0;
2475         for (i = 0; i < page_count; i++) {
2476                 const unsigned int shrink[] = {
2477                         I915_SHRINK_BOUND | I915_SHRINK_UNBOUND | I915_SHRINK_PURGEABLE,
2478                         0,
2479                 }, *s = shrink;
2480                 gfp_t gfp = noreclaim;
2481
2482                 do {
2483                         cond_resched();
2484                         page = shmem_read_mapping_page_gfp(mapping, i, gfp);
2485                         if (likely(!IS_ERR(page)))
2486                                 break;
2487
2488                         if (!*s) {
2489                                 ret = PTR_ERR(page);
2490                                 goto err_sg;
2491                         }
2492
2493                         i915_gem_shrink(dev_priv, 2 * page_count, NULL, *s++);
2494
2495                         /*
2496                          * We've tried hard to allocate the memory by reaping
2497                          * our own buffer, now let the real VM do its job and
2498                          * go down in flames if truly OOM.
2499                          *
2500                          * However, since graphics tend to be disposable,
2501                          * defer the oom here by reporting the ENOMEM back
2502                          * to userspace.
2503                          */
2504                         if (!*s) {
2505                                 /* reclaim and warn, but no oom */
2506                                 gfp = mapping_gfp_mask(mapping);
2507
2508                                 /*
2509                                  * Our bo are always dirty and so we require
2510                                  * kswapd to reclaim our pages (direct reclaim
2511                                  * does not effectively begin pageout of our
2512                                  * buffers on its own). However, direct reclaim
2513                                  * only waits for kswapd when under allocation
2514                                  * congestion. So as a result __GFP_RECLAIM is
2515                                  * unreliable and fails to actually reclaim our
2516                                  * dirty pages -- unless you try over and over
2517                                  * again with !__GFP_NORETRY. However, we still
2518                                  * want to fail this allocation rather than
2519                                  * trigger the out-of-memory killer and for
2520                                  * this we want __GFP_RETRY_MAYFAIL.
2521                                  */
2522                                 gfp |= __GFP_RETRY_MAYFAIL;
2523                         }
2524                 } while (1);
2525
2526                 if (!i ||
2527                     sg->length >= max_segment ||
2528                     page_to_pfn(page) != last_pfn + 1) {
2529                         if (i) {
2530                                 sg_page_sizes |= sg->length;
2531                                 sg = sg_next(sg);
2532                         }
2533                         st->nents++;
2534                         sg_set_page(sg, page, PAGE_SIZE, 0);
2535                 } else {
2536                         sg->length += PAGE_SIZE;
2537                 }
2538                 last_pfn = page_to_pfn(page);
2539
2540                 /* Check that the i965g/gm workaround works. */
2541                 WARN_ON((gfp & __GFP_DMA32) && (last_pfn >= 0x00100000UL));
2542         }
2543         if (sg) { /* loop terminated early; short sg table */
2544                 sg_page_sizes |= sg->length;
2545                 sg_mark_end(sg);
2546         }
2547
2548         /* Trim unused sg entries to avoid wasting memory. */
2549         i915_sg_trim(st);
2550
2551         ret = i915_gem_gtt_prepare_pages(obj, st);
2552         if (ret) {
2553                 /*
2554                  * DMA remapping failed? One possible cause is that
2555                  * it could not reserve enough large entries, asking
2556                  * for PAGE_SIZE chunks instead may be helpful.
2557                  */
2558                 if (max_segment > PAGE_SIZE) {
2559                         for_each_sgt_page(page, sgt_iter, st)
2560                                 put_page(page);
2561                         sg_free_table(st);
2562
2563                         max_segment = PAGE_SIZE;
2564                         goto rebuild_st;
2565                 } else {
2566                         dev_warn(&dev_priv->drm.pdev->dev,
2567                                  "Failed to DMA remap %lu pages\n",
2568                                  page_count);
2569                         goto err_pages;
2570                 }
2571         }
2572
2573         if (i915_gem_object_needs_bit17_swizzle(obj))
2574                 i915_gem_object_do_bit_17_swizzle(obj, st);
2575
2576         __i915_gem_object_set_pages(obj, st, sg_page_sizes);
2577
2578         return 0;
2579
2580 err_sg:
2581         sg_mark_end(sg);
2582 err_pages:
2583         mapping_clear_unevictable(mapping);
2584         pagevec_init(&pvec);
2585         for_each_sgt_page(page, sgt_iter, st) {
2586                 if (!pagevec_add(&pvec, page))
2587                         check_release_pagevec(&pvec);
2588         }
2589         if (pagevec_count(&pvec))
2590                 check_release_pagevec(&pvec);
2591         sg_free_table(st);
2592         kfree(st);
2593
2594         /*
2595          * shmemfs first checks if there is enough memory to allocate the page
2596          * and reports ENOSPC should there be insufficient, along with the usual
2597          * ENOMEM for a genuine allocation failure.
2598          *
2599          * We use ENOSPC in our driver to mean that we have run out of aperture
2600          * space and so want to translate the error from shmemfs back to our
2601          * usual understanding of ENOMEM.
2602          */
2603         if (ret == -ENOSPC)
2604                 ret = -ENOMEM;
2605
2606         return ret;
2607 }
2608
2609 void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
2610                                  struct sg_table *pages,
2611                                  unsigned int sg_page_sizes)
2612 {
2613         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2614         unsigned long supported = INTEL_INFO(i915)->page_sizes;
2615         int i;
2616
2617         lockdep_assert_held(&obj->mm.lock);
2618
2619         obj->mm.get_page.sg_pos = pages->sgl;
2620         obj->mm.get_page.sg_idx = 0;
2621
2622         obj->mm.pages = pages;
2623
2624         if (i915_gem_object_is_tiled(obj) &&
2625             i915->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
2626                 GEM_BUG_ON(obj->mm.quirked);
2627                 __i915_gem_object_pin_pages(obj);
2628                 obj->mm.quirked = true;
2629         }
2630
2631         GEM_BUG_ON(!sg_page_sizes);
2632         obj->mm.page_sizes.phys = sg_page_sizes;
2633
2634         /*
2635          * Calculate the supported page-sizes which fit into the given
2636          * sg_page_sizes. This will give us the page-sizes which we may be able
2637          * to use opportunistically when later inserting into the GTT. For
2638          * example if phys=2G, then in theory we should be able to use 1G, 2M,
2639          * 64K or 4K pages, although in practice this will depend on a number of
2640          * other factors.
2641          */
2642         obj->mm.page_sizes.sg = 0;
2643         for_each_set_bit(i, &supported, ilog2(I915_GTT_MAX_PAGE_SIZE) + 1) {
2644                 if (obj->mm.page_sizes.phys & ~0u << i)
2645                         obj->mm.page_sizes.sg |= BIT(i);
2646         }
2647         GEM_BUG_ON(!HAS_PAGE_SIZES(i915, obj->mm.page_sizes.sg));
2648
2649         spin_lock(&i915->mm.obj_lock);
2650         list_add(&obj->mm.link, &i915->mm.unbound_list);
2651         spin_unlock(&i915->mm.obj_lock);
2652 }
2653
2654 static int ____i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2655 {
2656         int err;
2657
2658         if (unlikely(obj->mm.madv != I915_MADV_WILLNEED)) {
2659                 DRM_DEBUG("Attempting to obtain a purgeable object\n");
2660                 return -EFAULT;
2661         }
2662
2663         err = obj->ops->get_pages(obj);
2664         GEM_BUG_ON(!err && !i915_gem_object_has_pages(obj));
2665
2666         return err;
2667 }
2668
2669 /* Ensure that the associated pages are gathered from the backing storage
2670  * and pinned into our object. i915_gem_object_pin_pages() may be called
2671  * multiple times before they are released by a single call to
2672  * i915_gem_object_unpin_pages() - once the pages are no longer referenced
2673  * either as a result of memory pressure (reaping pages under the shrinker)
2674  * or as the object is itself released.
2675  */
2676 int __i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2677 {
2678         int err;
2679
2680         err = mutex_lock_interruptible(&obj->mm.lock);
2681         if (err)
2682                 return err;
2683
2684         if (unlikely(!i915_gem_object_has_pages(obj))) {
2685                 GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2686
2687                 err = ____i915_gem_object_get_pages(obj);
2688                 if (err)
2689                         goto unlock;
2690
2691                 smp_mb__before_atomic();
2692         }
2693         atomic_inc(&obj->mm.pages_pin_count);
2694
2695 unlock:
2696         mutex_unlock(&obj->mm.lock);
2697         return err;
2698 }
2699
2700 /* The 'mapping' part of i915_gem_object_pin_map() below */
2701 static void *i915_gem_object_map(const struct drm_i915_gem_object *obj,
2702                                  enum i915_map_type type)
2703 {
2704         unsigned long n_pages = obj->base.size >> PAGE_SHIFT;
2705         struct sg_table *sgt = obj->mm.pages;
2706         struct sgt_iter sgt_iter;
2707         struct page *page;
2708         struct page *stack_pages[32];
2709         struct page **pages = stack_pages;
2710         unsigned long i = 0;
2711         pgprot_t pgprot;
2712         void *addr;
2713
2714         /* A single page can always be kmapped */
2715         if (n_pages == 1 && type == I915_MAP_WB)
2716                 return kmap(sg_page(sgt->sgl));
2717
2718         if (n_pages > ARRAY_SIZE(stack_pages)) {
2719                 /* Too big for stack -- allocate temporary array instead */
2720                 pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_KERNEL);
2721                 if (!pages)
2722                         return NULL;
2723         }
2724
2725         for_each_sgt_page(page, sgt_iter, sgt)
2726                 pages[i++] = page;
2727
2728         /* Check that we have the expected number of pages */
2729         GEM_BUG_ON(i != n_pages);
2730
2731         switch (type) {
2732         default:
2733                 MISSING_CASE(type);
2734                 /* fallthrough to use PAGE_KERNEL anyway */
2735         case I915_MAP_WB:
2736                 pgprot = PAGE_KERNEL;
2737                 break;
2738         case I915_MAP_WC:
2739                 pgprot = pgprot_writecombine(PAGE_KERNEL_IO);
2740                 break;
2741         }
2742         addr = vmap(pages, n_pages, 0, pgprot);
2743
2744         if (pages != stack_pages)
2745                 kvfree(pages);
2746
2747         return addr;
2748 }
2749
2750 /* get, pin, and map the pages of the object into kernel space */
2751 void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
2752                               enum i915_map_type type)
2753 {
2754         enum i915_map_type has_type;
2755         bool pinned;
2756         void *ptr;
2757         int ret;
2758
2759         if (unlikely(!i915_gem_object_has_struct_page(obj)))
2760                 return ERR_PTR(-ENXIO);
2761
2762         ret = mutex_lock_interruptible(&obj->mm.lock);
2763         if (ret)
2764                 return ERR_PTR(ret);
2765
2766         pinned = !(type & I915_MAP_OVERRIDE);
2767         type &= ~I915_MAP_OVERRIDE;
2768
2769         if (!atomic_inc_not_zero(&obj->mm.pages_pin_count)) {
2770                 if (unlikely(!i915_gem_object_has_pages(obj))) {
2771                         GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2772
2773                         ret = ____i915_gem_object_get_pages(obj);
2774                         if (ret)
2775                                 goto err_unlock;
2776
2777                         smp_mb__before_atomic();
2778                 }
2779                 atomic_inc(&obj->mm.pages_pin_count);
2780                 pinned = false;
2781         }
2782         GEM_BUG_ON(!i915_gem_object_has_pages(obj));
2783
2784         ptr = page_unpack_bits(obj->mm.mapping, &has_type);
2785         if (ptr && has_type != type) {
2786                 if (pinned) {
2787                         ret = -EBUSY;
2788                         goto err_unpin;
2789                 }
2790
2791                 if (is_vmalloc_addr(ptr))
2792                         vunmap(ptr);
2793                 else
2794                         kunmap(kmap_to_page(ptr));
2795
2796                 ptr = obj->mm.mapping = NULL;
2797         }
2798
2799         if (!ptr) {
2800                 ptr = i915_gem_object_map(obj, type);
2801                 if (!ptr) {
2802                         ret = -ENOMEM;
2803                         goto err_unpin;
2804                 }
2805
2806                 obj->mm.mapping = page_pack_bits(ptr, type);
2807         }
2808
2809 out_unlock:
2810         mutex_unlock(&obj->mm.lock);
2811         return ptr;
2812
2813 err_unpin:
2814         atomic_dec(&obj->mm.pages_pin_count);
2815 err_unlock:
2816         ptr = ERR_PTR(ret);
2817         goto out_unlock;
2818 }
2819
2820 static int
2821 i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj,
2822                            const struct drm_i915_gem_pwrite *arg)
2823 {
2824         struct address_space *mapping = obj->base.filp->f_mapping;
2825         char __user *user_data = u64_to_user_ptr(arg->data_ptr);
2826         u64 remain, offset;
2827         unsigned int pg;
2828
2829         /* Before we instantiate/pin the backing store for our use, we
2830          * can prepopulate the shmemfs filp efficiently using a write into
2831          * the pagecache. We avoid the penalty of instantiating all the
2832          * pages, important if the user is just writing to a few and never
2833          * uses the object on the GPU, and using a direct write into shmemfs
2834          * allows it to avoid the cost of retrieving a page (either swapin
2835          * or clearing-before-use) before it is overwritten.
2836          */
2837         if (i915_gem_object_has_pages(obj))
2838                 return -ENODEV;
2839
2840         if (obj->mm.madv != I915_MADV_WILLNEED)
2841                 return -EFAULT;
2842
2843         /* Before the pages are instantiated the object is treated as being
2844          * in the CPU domain. The pages will be clflushed as required before
2845          * use, and we can freely write into the pages directly. If userspace
2846          * races pwrite with any other operation; corruption will ensue -
2847          * that is userspace's prerogative!
2848          */
2849
2850         remain = arg->size;
2851         offset = arg->offset;
2852         pg = offset_in_page(offset);
2853
2854         do {
2855                 unsigned int len, unwritten;
2856                 struct page *page;
2857                 void *data, *vaddr;
2858                 int err;
2859
2860                 len = PAGE_SIZE - pg;
2861                 if (len > remain)
2862                         len = remain;
2863
2864                 err = pagecache_write_begin(obj->base.filp, mapping,
2865                                             offset, len, 0,
2866                                             &page, &data);
2867                 if (err < 0)
2868                         return err;
2869
2870                 vaddr = kmap(page);
2871                 unwritten = copy_from_user(vaddr + pg, user_data, len);
2872                 kunmap(page);
2873
2874                 err = pagecache_write_end(obj->base.filp, mapping,
2875                                           offset, len, len - unwritten,
2876                                           page, data);
2877                 if (err < 0)
2878                         return err;
2879
2880                 if (unwritten)
2881                         return -EFAULT;
2882
2883                 remain -= len;
2884                 user_data += len;
2885                 offset += len;
2886                 pg = 0;
2887         } while (remain);
2888
2889         return 0;
2890 }
2891
2892 static bool match_ring(struct i915_request *rq)
2893 {
2894         struct drm_i915_private *dev_priv = rq->i915;
2895         u32 ring = I915_READ(RING_START(rq->engine->mmio_base));
2896
2897         return ring == i915_ggtt_offset(rq->ring->vma);
2898 }
2899
2900 struct i915_request *
2901 i915_gem_find_active_request(struct intel_engine_cs *engine)
2902 {
2903         struct i915_request *request, *active = NULL;
2904         unsigned long flags;
2905
2906         /*
2907          * We are called by the error capture, reset and to dump engine
2908          * state at random points in time. In particular, note that neither is
2909          * crucially ordered with an interrupt. After a hang, the GPU is dead
2910          * and we assume that no more writes can happen (we waited long enough
2911          * for all writes that were in transaction to be flushed) - adding an
2912          * extra delay for a recent interrupt is pointless. Hence, we do
2913          * not need an engine->irq_seqno_barrier() before the seqno reads.
2914          * At all other times, we must assume the GPU is still running, but
2915          * we only care about the snapshot of this moment.
2916          */
2917         spin_lock_irqsave(&engine->timeline.lock, flags);
2918         list_for_each_entry(request, &engine->timeline.requests, link) {
2919                 if (i915_request_completed(request))
2920                         continue;
2921
2922                 if (!i915_request_started(request))
2923                         break;
2924
2925                 /* More than one preemptible request may match! */
2926                 if (!match_ring(request))
2927                         break;
2928
2929                 active = request;
2930                 break;
2931         }
2932         spin_unlock_irqrestore(&engine->timeline.lock, flags);
2933
2934         return active;
2935 }
2936
2937 static void
2938 i915_gem_retire_work_handler(struct work_struct *work)
2939 {
2940         struct drm_i915_private *dev_priv =
2941                 container_of(work, typeof(*dev_priv), gt.retire_work.work);
2942         struct drm_device *dev = &dev_priv->drm;
2943
2944         /* Come back later if the device is busy... */
2945         if (mutex_trylock(&dev->struct_mutex)) {
2946                 i915_retire_requests(dev_priv);
2947                 mutex_unlock(&dev->struct_mutex);
2948         }
2949
2950         /*
2951          * Keep the retire handler running until we are finally idle.
2952          * We do not need to do this test under locking as in the worst-case
2953          * we queue the retire worker once too often.
2954          */
2955         if (READ_ONCE(dev_priv->gt.awake))
2956                 queue_delayed_work(dev_priv->wq,
2957                                    &dev_priv->gt.retire_work,
2958                                    round_jiffies_up_relative(HZ));
2959 }
2960
2961 static void shrink_caches(struct drm_i915_private *i915)
2962 {
2963         /*
2964          * kmem_cache_shrink() discards empty slabs and reorders partially
2965          * filled slabs to prioritise allocating from the mostly full slabs,
2966          * with the aim of reducing fragmentation.
2967          */
2968         kmem_cache_shrink(i915->priorities);
2969         kmem_cache_shrink(i915->dependencies);
2970         kmem_cache_shrink(i915->requests);
2971         kmem_cache_shrink(i915->luts);
2972         kmem_cache_shrink(i915->vmas);
2973         kmem_cache_shrink(i915->objects);
2974 }
2975
2976 struct sleep_rcu_work {
2977         union {
2978                 struct rcu_head rcu;
2979                 struct work_struct work;
2980         };
2981         struct drm_i915_private *i915;
2982         unsigned int epoch;
2983 };
2984
2985 static inline bool
2986 same_epoch(struct drm_i915_private *i915, unsigned int epoch)
2987 {
2988         /*
2989          * There is a small chance that the epoch wrapped since we started
2990          * sleeping. If we assume that epoch is at least a u32, then it will
2991          * take at least 2^32 * 100ms for it to wrap, or about 326 years.
2992          */
2993         return epoch == READ_ONCE(i915->gt.epoch);
2994 }
2995
2996 static void __sleep_work(struct work_struct *work)
2997 {
2998         struct sleep_rcu_work *s = container_of(work, typeof(*s), work);
2999         struct drm_i915_private *i915 = s->i915;
3000         unsigned int epoch = s->epoch;
3001
3002         kfree(s);
3003         if (same_epoch(i915, epoch))
3004                 shrink_caches(i915);
3005 }
3006
3007 static void __sleep_rcu(struct rcu_head *rcu)
3008 {
3009         struct sleep_rcu_work *s = container_of(rcu, typeof(*s), rcu);
3010         struct drm_i915_private *i915 = s->i915;
3011
3012         destroy_rcu_head(&s->rcu);
3013
3014         if (same_epoch(i915, s->epoch)) {
3015                 INIT_WORK(&s->work, __sleep_work);
3016                 queue_work(i915->wq, &s->work);
3017         } else {
3018                 kfree(s);
3019         }
3020 }
3021
3022 static inline bool
3023 new_requests_since_last_retire(const struct drm_i915_private *i915)
3024 {
3025         return (READ_ONCE(i915->gt.active_requests) ||
3026                 work_pending(&i915->gt.idle_work.work));
3027 }
3028
3029 static void assert_kernel_context_is_current(struct drm_i915_private *i915)
3030 {
3031         struct intel_engine_cs *engine;
3032         enum intel_engine_id id;
3033
3034         if (i915_terminally_wedged(&i915->gpu_error))
3035                 return;
3036
3037         GEM_BUG_ON(i915->gt.active_requests);
3038         for_each_engine(engine, i915, id) {
3039                 GEM_BUG_ON(__i915_active_request_peek(&engine->timeline.last_request));
3040                 GEM_BUG_ON(engine->last_retired_context !=
3041                            to_intel_context(i915->kernel_context, engine));
3042         }
3043 }
3044
3045 static void
3046 i915_gem_idle_work_handler(struct work_struct *work)
3047 {
3048         struct drm_i915_private *dev_priv =
3049                 container_of(work, typeof(*dev_priv), gt.idle_work.work);
3050         unsigned int epoch = I915_EPOCH_INVALID;
3051         bool rearm_hangcheck;
3052
3053         if (!READ_ONCE(dev_priv->gt.awake))
3054                 return;
3055
3056         if (READ_ONCE(dev_priv->gt.active_requests))
3057                 return;
3058
3059         /*
3060          * Flush out the last user context, leaving only the pinned
3061          * kernel context resident. When we are idling on the kernel_context,
3062          * no more new requests (with a context switch) are emitted and we
3063          * can finally rest. A consequence is that the idle work handler is
3064          * always called at least twice before idling (and if the system is
3065          * idle that implies a round trip through the retire worker).
3066          */
3067         mutex_lock(&dev_priv->drm.struct_mutex);
3068         i915_gem_switch_to_kernel_context(dev_priv);
3069         mutex_unlock(&dev_priv->drm.struct_mutex);
3070
3071         GEM_TRACE("active_requests=%d (after switch-to-kernel-context)\n",
3072                   READ_ONCE(dev_priv->gt.active_requests));
3073
3074         /*
3075          * Wait for last execlists context complete, but bail out in case a
3076          * new request is submitted. As we don't trust the hardware, we
3077          * continue on if the wait times out. This is necessary to allow
3078          * the machine to suspend even if the hardware dies, and we will
3079          * try to recover in resume (after depriving the hardware of power,
3080          * it may be in a better mmod).
3081          */
3082         __wait_for(if (new_requests_since_last_retire(dev_priv)) return,
3083                    intel_engines_are_idle(dev_priv),
3084                    I915_IDLE_ENGINES_TIMEOUT * 1000,
3085                    10, 500);
3086
3087         rearm_hangcheck =
3088                 cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
3089
3090         if (!mutex_trylock(&dev_priv->drm.struct_mutex)) {
3091                 /* Currently busy, come back later */
3092                 mod_delayed_work(dev_priv->wq,
3093                                  &dev_priv->gt.idle_work,
3094                                  msecs_to_jiffies(50));
3095                 goto out_rearm;
3096         }
3097
3098         /*
3099          * New request retired after this work handler started, extend active
3100          * period until next instance of the work.
3101          */
3102         if (new_requests_since_last_retire(dev_priv))
3103                 goto out_unlock;
3104
3105         epoch = __i915_gem_park(dev_priv);
3106
3107         assert_kernel_context_is_current(dev_priv);
3108
3109         rearm_hangcheck = false;
3110 out_unlock:
3111         mutex_unlock(&dev_priv->drm.struct_mutex);
3112
3113 out_rearm:
3114         if (rearm_hangcheck) {
3115                 GEM_BUG_ON(!dev_priv->gt.awake);
3116                 i915_queue_hangcheck(dev_priv);
3117         }
3118
3119         /*
3120          * When we are idle, it is an opportune time to reap our caches.
3121          * However, we have many objects that utilise RCU and the ordered
3122          * i915->wq that this work is executing on. To try and flush any
3123          * pending frees now we are idle, we first wait for an RCU grace
3124          * period, and then queue a task (that will run last on the wq) to
3125          * shrink and re-optimize the caches.
3126          */
3127         if (same_epoch(dev_priv, epoch)) {
3128                 struct sleep_rcu_work *s = kmalloc(sizeof(*s), GFP_KERNEL);
3129                 if (s) {
3130                         init_rcu_head(&s->rcu);
3131                         s->i915 = dev_priv;
3132                         s->epoch = epoch;
3133                         call_rcu(&s->rcu, __sleep_rcu);
3134                 }
3135         }
3136 }
3137
3138 void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file)
3139 {
3140         struct drm_i915_private *i915 = to_i915(gem->dev);
3141         struct drm_i915_gem_object *obj = to_intel_bo(gem);
3142         struct drm_i915_file_private *fpriv = file->driver_priv;
3143         struct i915_lut_handle *lut, *ln;
3144
3145         mutex_lock(&i915->drm.struct_mutex);
3146
3147         list_for_each_entry_safe(lut, ln, &obj->lut_list, obj_link) {
3148                 struct i915_gem_context *ctx = lut->ctx;
3149                 struct i915_vma *vma;
3150
3151                 GEM_BUG_ON(ctx->file_priv == ERR_PTR(-EBADF));
3152                 if (ctx->file_priv != fpriv)
3153                         continue;
3154
3155                 vma = radix_tree_delete(&ctx->handles_vma, lut->handle);
3156                 GEM_BUG_ON(vma->obj != obj);
3157
3158                 /* We allow the process to have multiple handles to the same
3159                  * vma, in the same fd namespace, by virtue of flink/open.
3160                  */
3161                 GEM_BUG_ON(!vma->open_count);
3162                 if (!--vma->open_count && !i915_vma_is_ggtt(vma))
3163                         i915_vma_close(vma);
3164
3165                 list_del(&lut->obj_link);
3166                 list_del(&lut->ctx_link);
3167
3168                 kmem_cache_free(i915->luts, lut);
3169                 __i915_gem_object_release_unless_active(obj);
3170         }
3171
3172         mutex_unlock(&i915->drm.struct_mutex);
3173 }
3174
3175 static unsigned long to_wait_timeout(s64 timeout_ns)
3176 {
3177         if (timeout_ns < 0)
3178                 return MAX_SCHEDULE_TIMEOUT;
3179
3180         if (timeout_ns == 0)
3181                 return 0;
3182
3183         return nsecs_to_jiffies_timeout(timeout_ns);
3184 }
3185
3186 /**
3187  * i915_gem_wait_ioctl - implements DRM_IOCTL_I915_GEM_WAIT
3188  * @dev: drm device pointer
3189  * @data: ioctl data blob
3190  * @file: drm file pointer
3191  *
3192  * Returns 0 if successful, else an error is returned with the remaining time in
3193  * the timeout parameter.
3194  *  -ETIME: object is still busy after timeout
3195  *  -ERESTARTSYS: signal interrupted the wait
3196  *  -ENONENT: object doesn't exist
3197  * Also possible, but rare:
3198  *  -EAGAIN: incomplete, restart syscall
3199  *  -ENOMEM: damn
3200  *  -ENODEV: Internal IRQ fail
3201  *  -E?: The add request failed
3202  *
3203  * The wait ioctl with a timeout of 0 reimplements the busy ioctl. With any
3204  * non-zero timeout parameter the wait ioctl will wait for the given number of
3205  * nanoseconds on an object becoming unbusy. Since the wait itself does so
3206  * without holding struct_mutex the object may become re-busied before this
3207  * function completes. A similar but shorter * race condition exists in the busy
3208  * ioctl
3209  */
3210 int
3211 i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
3212 {
3213         struct drm_i915_gem_wait *args = data;
3214         struct drm_i915_gem_object *obj;
3215         ktime_t start;
3216         long ret;
3217
3218         if (args->flags != 0)
3219                 return -EINVAL;
3220
3221         obj = i915_gem_object_lookup(file, args->bo_handle);
3222         if (!obj)
3223                 return -ENOENT;
3224
3225         start = ktime_get();
3226
3227         ret = i915_gem_object_wait(obj,
3228                                    I915_WAIT_INTERRUPTIBLE |
3229                                    I915_WAIT_PRIORITY |
3230                                    I915_WAIT_ALL,
3231                                    to_wait_timeout(args->timeout_ns),
3232                                    to_rps_client(file));
3233
3234         if (args->timeout_ns > 0) {
3235                 args->timeout_ns -= ktime_to_ns(ktime_sub(ktime_get(), start));
3236                 if (args->timeout_ns < 0)
3237                         args->timeout_ns = 0;
3238
3239                 /*
3240                  * Apparently ktime isn't accurate enough and occasionally has a
3241                  * bit of mismatch in the jiffies<->nsecs<->ktime loop. So patch
3242                  * things up to make the test happy. We allow up to 1 jiffy.
3243                  *
3244                  * This is a regression from the timespec->ktime conversion.
3245                  */
3246                 if (ret == -ETIME && !nsecs_to_jiffies(args->timeout_ns))
3247                         args->timeout_ns = 0;
3248
3249                 /* Asked to wait beyond the jiffie/scheduler precision? */
3250                 if (ret == -ETIME && args->timeout_ns)
3251                         ret = -EAGAIN;
3252         }
3253
3254         i915_gem_object_put(obj);
3255         return ret;
3256 }
3257
3258 static int wait_for_engines(struct drm_i915_private *i915)
3259 {
3260         if (wait_for(intel_engines_are_idle(i915), I915_IDLE_ENGINES_TIMEOUT)) {
3261                 dev_err(i915->drm.dev,
3262                         "Failed to idle engines, declaring wedged!\n");
3263                 GEM_TRACE_DUMP();
3264                 i915_gem_set_wedged(i915);
3265                 return -EIO;
3266         }
3267
3268         return 0;
3269 }
3270
3271 static long
3272 wait_for_timelines(struct drm_i915_private *i915,
3273                    unsigned int flags, long timeout)
3274 {
3275         struct i915_gt_timelines *gt = &i915->gt.timelines;
3276         struct i915_timeline *tl;
3277
3278         if (!READ_ONCE(i915->gt.active_requests))
3279                 return timeout;
3280
3281         mutex_lock(&gt->mutex);
3282         list_for_each_entry(tl, &gt->active_list, link) {
3283                 struct i915_request *rq;
3284
3285                 rq = i915_active_request_get_unlocked(&tl->last_request);
3286                 if (!rq)
3287                         continue;
3288
3289                 mutex_unlock(&gt->mutex);
3290
3291                 /*
3292                  * "Race-to-idle".
3293                  *
3294                  * Switching to the kernel context is often used a synchronous
3295                  * step prior to idling, e.g. in suspend for flushing all
3296                  * current operations to memory before sleeping. These we
3297                  * want to complete as quickly as possible to avoid prolonged
3298                  * stalls, so allow the gpu to boost to maximum clocks.
3299                  */
3300                 if (flags & I915_WAIT_FOR_IDLE_BOOST)
3301                         gen6_rps_boost(rq, NULL);
3302
3303                 timeout = i915_request_wait(rq, flags, timeout);
3304                 i915_request_put(rq);
3305                 if (timeout < 0)
3306                         return timeout;
3307
3308                 /* restart after reacquiring the lock */
3309                 mutex_lock(&gt->mutex);
3310                 tl = list_entry(&gt->active_list, typeof(*tl), link);
3311         }
3312         mutex_unlock(&gt->mutex);
3313
3314         return timeout;
3315 }
3316
3317 int i915_gem_wait_for_idle(struct drm_i915_private *i915,
3318                            unsigned int flags, long timeout)
3319 {
3320         GEM_TRACE("flags=%x (%s), timeout=%ld%s\n",
3321                   flags, flags & I915_WAIT_LOCKED ? "locked" : "unlocked",
3322                   timeout, timeout == MAX_SCHEDULE_TIMEOUT ? " (forever)" : "");
3323
3324         /* If the device is asleep, we have no requests outstanding */
3325         if (!READ_ONCE(i915->gt.awake))
3326                 return 0;
3327
3328         timeout = wait_for_timelines(i915, flags, timeout);
3329         if (timeout < 0)
3330                 return timeout;
3331
3332         if (flags & I915_WAIT_LOCKED) {
3333                 int err;
3334
3335                 lockdep_assert_held(&i915->drm.struct_mutex);
3336
3337                 if (GEM_SHOW_DEBUG() && !timeout) {
3338                         /* Presume that timeout was non-zero to begin with! */
3339                         dev_warn(&i915->drm.pdev->dev,
3340                                  "Missed idle-completion interrupt!\n");
3341                         GEM_TRACE_DUMP();
3342                 }
3343
3344                 err = wait_for_engines(i915);
3345                 if (err)
3346                         return err;
3347
3348                 i915_retire_requests(i915);
3349                 GEM_BUG_ON(i915->gt.active_requests);
3350         }
3351
3352         return 0;
3353 }
3354
3355 static void __i915_gem_object_flush_for_display(struct drm_i915_gem_object *obj)
3356 {
3357         /*
3358          * We manually flush the CPU domain so that we can override and
3359          * force the flush for the display, and perform it asyncrhonously.
3360          */
3361         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
3362         if (obj->cache_dirty)
3363                 i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
3364         obj->write_domain = 0;
3365 }
3366
3367 void i915_gem_object_flush_if_display(struct drm_i915_gem_object *obj)
3368 {
3369         if (!READ_ONCE(obj->pin_global))
3370                 return;
3371
3372         mutex_lock(&obj->base.dev->struct_mutex);
3373         __i915_gem_object_flush_for_display(obj);
3374         mutex_unlock(&obj->base.dev->struct_mutex);
3375 }
3376
3377 /**
3378  * Moves a single object to the WC read, and possibly write domain.
3379  * @obj: object to act on
3380  * @write: ask for write access or read only
3381  *
3382  * This function returns when the move is complete, including waiting on
3383  * flushes to occur.
3384  */
3385 int
3386 i915_gem_object_set_to_wc_domain(struct drm_i915_gem_object *obj, bool write)
3387 {
3388         int ret;
3389
3390         lockdep_assert_held(&obj->base.dev->struct_mutex);
3391
3392         ret = i915_gem_object_wait(obj,
3393                                    I915_WAIT_INTERRUPTIBLE |
3394                                    I915_WAIT_LOCKED |
3395                                    (write ? I915_WAIT_ALL : 0),
3396                                    MAX_SCHEDULE_TIMEOUT,
3397                                    NULL);
3398         if (ret)
3399                 return ret;
3400
3401         if (obj->write_domain == I915_GEM_DOMAIN_WC)
3402                 return 0;
3403
3404         /* Flush and acquire obj->pages so that we are coherent through
3405          * direct access in memory with previous cached writes through
3406          * shmemfs and that our cache domain tracking remains valid.
3407          * For example, if the obj->filp was moved to swap without us
3408          * being notified and releasing the pages, we would mistakenly
3409          * continue to assume that the obj remained out of the CPU cached
3410          * domain.
3411          */
3412         ret = i915_gem_object_pin_pages(obj);
3413         if (ret)
3414                 return ret;
3415
3416         flush_write_domain(obj, ~I915_GEM_DOMAIN_WC);
3417
3418         /* Serialise direct access to this object with the barriers for
3419          * coherent writes from the GPU, by effectively invalidating the
3420          * WC domain upon first access.
3421          */
3422         if ((obj->read_domains & I915_GEM_DOMAIN_WC) == 0)
3423                 mb();
3424
3425         /* It should now be out of any other write domains, and we can update
3426          * the domain values for our changes.
3427          */
3428         GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_WC) != 0);
3429         obj->read_domains |= I915_GEM_DOMAIN_WC;
3430         if (write) {
3431                 obj->read_domains = I915_GEM_DOMAIN_WC;
3432                 obj->write_domain = I915_GEM_DOMAIN_WC;
3433                 obj->mm.dirty = true;
3434         }
3435
3436         i915_gem_object_unpin_pages(obj);
3437         return 0;
3438 }
3439
3440 /**
3441  * Moves a single object to the GTT read, and possibly write domain.
3442  * @obj: object to act on
3443  * @write: ask for write access or read only
3444  *
3445  * This function returns when the move is complete, including waiting on
3446  * flushes to occur.
3447  */
3448 int
3449 i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
3450 {
3451         int ret;
3452
3453         lockdep_assert_held(&obj->base.dev->struct_mutex);
3454
3455         ret = i915_gem_object_wait(obj,
3456                                    I915_WAIT_INTERRUPTIBLE |
3457                                    I915_WAIT_LOCKED |
3458                                    (write ? I915_WAIT_ALL : 0),
3459                                    MAX_SCHEDULE_TIMEOUT,
3460                                    NULL);
3461         if (ret)
3462                 return ret;
3463
3464         if (obj->write_domain == I915_GEM_DOMAIN_GTT)
3465                 return 0;
3466
3467         /* Flush and acquire obj->pages so that we are coherent through
3468          * direct access in memory with previous cached writes through
3469          * shmemfs and that our cache domain tracking remains valid.
3470          * For example, if the obj->filp was moved to swap without us
3471          * being notified and releasing the pages, we would mistakenly
3472          * continue to assume that the obj remained out of the CPU cached
3473          * domain.
3474          */
3475         ret = i915_gem_object_pin_pages(obj);
3476         if (ret)
3477                 return ret;
3478
3479         flush_write_domain(obj, ~I915_GEM_DOMAIN_GTT);
3480
3481         /* Serialise direct access to this object with the barriers for
3482          * coherent writes from the GPU, by effectively invalidating the
3483          * GTT domain upon first access.
3484          */
3485         if ((obj->read_domains & I915_GEM_DOMAIN_GTT) == 0)
3486                 mb();
3487
3488         /* It should now be out of any other write domains, and we can update
3489          * the domain values for our changes.
3490          */
3491         GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_GTT) != 0);
3492         obj->read_domains |= I915_GEM_DOMAIN_GTT;
3493         if (write) {
3494                 obj->read_domains = I915_GEM_DOMAIN_GTT;
3495                 obj->write_domain = I915_GEM_DOMAIN_GTT;
3496                 obj->mm.dirty = true;
3497         }
3498
3499         i915_gem_object_unpin_pages(obj);
3500         return 0;
3501 }
3502
3503 /**
3504  * Changes the cache-level of an object across all VMA.
3505  * @obj: object to act on
3506  * @cache_level: new cache level to set for the object
3507  *
3508  * After this function returns, the object will be in the new cache-level
3509  * across all GTT and the contents of the backing storage will be coherent,
3510  * with respect to the new cache-level. In order to keep the backing storage
3511  * coherent for all users, we only allow a single cache level to be set
3512  * globally on the object and prevent it from being changed whilst the
3513  * hardware is reading from the object. That is if the object is currently
3514  * on the scanout it will be set to uncached (or equivalent display
3515  * cache coherency) and all non-MOCS GPU access will also be uncached so
3516  * that all direct access to the scanout remains coherent.
3517  */
3518 int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
3519                                     enum i915_cache_level cache_level)
3520 {
3521         struct i915_vma *vma;
3522         int ret;
3523
3524         lockdep_assert_held(&obj->base.dev->struct_mutex);
3525
3526         if (obj->cache_level == cache_level)
3527                 return 0;
3528
3529         /* Inspect the list of currently bound VMA and unbind any that would
3530          * be invalid given the new cache-level. This is principally to
3531          * catch the issue of the CS prefetch crossing page boundaries and
3532          * reading an invalid PTE on older architectures.
3533          */
3534 restart:
3535         list_for_each_entry(vma, &obj->vma.list, obj_link) {
3536                 if (!drm_mm_node_allocated(&vma->node))
3537                         continue;
3538
3539                 if (i915_vma_is_pinned(vma)) {
3540                         DRM_DEBUG("can not change the cache level of pinned objects\n");
3541                         return -EBUSY;
3542                 }
3543
3544                 if (!i915_vma_is_closed(vma) &&
3545                     i915_gem_valid_gtt_space(vma, cache_level))
3546                         continue;
3547
3548                 ret = i915_vma_unbind(vma);
3549                 if (ret)
3550                         return ret;
3551
3552                 /* As unbinding may affect other elements in the
3553                  * obj->vma_list (due to side-effects from retiring
3554                  * an active vma), play safe and restart the iterator.
3555                  */
3556                 goto restart;
3557         }
3558
3559         /* We can reuse the existing drm_mm nodes but need to change the
3560          * cache-level on the PTE. We could simply unbind them all and
3561          * rebind with the correct cache-level on next use. However since
3562          * we already have a valid slot, dma mapping, pages etc, we may as
3563          * rewrite the PTE in the belief that doing so tramples upon less
3564          * state and so involves less work.
3565          */
3566         if (obj->bind_count) {
3567                 /* Before we change the PTE, the GPU must not be accessing it.
3568                  * If we wait upon the object, we know that all the bound
3569                  * VMA are no longer active.
3570                  */
3571                 ret = i915_gem_object_wait(obj,
3572                                            I915_WAIT_INTERRUPTIBLE |
3573                                            I915_WAIT_LOCKED |
3574                                            I915_WAIT_ALL,
3575                                            MAX_SCHEDULE_TIMEOUT,
3576                                            NULL);
3577                 if (ret)
3578                         return ret;
3579
3580                 if (!HAS_LLC(to_i915(obj->base.dev)) &&
3581                     cache_level != I915_CACHE_NONE) {
3582                         /* Access to snoopable pages through the GTT is
3583                          * incoherent and on some machines causes a hard
3584                          * lockup. Relinquish the CPU mmaping to force
3585                          * userspace to refault in the pages and we can
3586                          * then double check if the GTT mapping is still
3587                          * valid for that pointer access.
3588                          */
3589                         i915_gem_release_mmap(obj);
3590
3591                         /* As we no longer need a fence for GTT access,
3592                          * we can relinquish it now (and so prevent having
3593                          * to steal a fence from someone else on the next
3594                          * fence request). Note GPU activity would have
3595                          * dropped the fence as all snoopable access is
3596                          * supposed to be linear.
3597                          */
3598                         for_each_ggtt_vma(vma, obj) {
3599                                 ret = i915_vma_put_fence(vma);
3600                                 if (ret)
3601                                         return ret;
3602                         }
3603                 } else {
3604                         /* We either have incoherent backing store and
3605                          * so no GTT access or the architecture is fully
3606                          * coherent. In such cases, existing GTT mmaps
3607                          * ignore the cache bit in the PTE and we can
3608                          * rewrite it without confusing the GPU or having
3609                          * to force userspace to fault back in its mmaps.
3610                          */
3611                 }
3612
3613                 list_for_each_entry(vma, &obj->vma.list, obj_link) {
3614                         if (!drm_mm_node_allocated(&vma->node))
3615                                 continue;
3616
3617                         ret = i915_vma_bind(vma, cache_level, PIN_UPDATE);
3618                         if (ret)
3619                                 return ret;
3620                 }
3621         }
3622
3623         list_for_each_entry(vma, &obj->vma.list, obj_link)
3624                 vma->node.color = cache_level;
3625         i915_gem_object_set_cache_coherency(obj, cache_level);
3626         obj->cache_dirty = true; /* Always invalidate stale cachelines */
3627
3628         return 0;
3629 }
3630
3631 int i915_gem_get_caching_ioctl(struct drm_device *dev, void *data,
3632                                struct drm_file *file)
3633 {
3634         struct drm_i915_gem_caching *args = data;
3635         struct drm_i915_gem_object *obj;
3636         int err = 0;
3637
3638         rcu_read_lock();
3639         obj = i915_gem_object_lookup_rcu(file, args->handle);
3640         if (!obj) {
3641                 err = -ENOENT;
3642                 goto out;
3643         }
3644
3645         switch (obj->cache_level) {
3646         case I915_CACHE_LLC:
3647         case I915_CACHE_L3_LLC:
3648                 args->caching = I915_CACHING_CACHED;
3649                 break;
3650
3651         case I915_CACHE_WT:
3652                 args->caching = I915_CACHING_DISPLAY;
3653                 break;
3654
3655         default:
3656                 args->caching = I915_CACHING_NONE;
3657                 break;
3658         }
3659 out:
3660         rcu_read_unlock();
3661         return err;
3662 }
3663
3664 int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
3665                                struct drm_file *file)
3666 {
3667         struct drm_i915_private *i915 = to_i915(dev);
3668         struct drm_i915_gem_caching *args = data;
3669         struct drm_i915_gem_object *obj;
3670         enum i915_cache_level level;
3671         int ret = 0;
3672
3673         switch (args->caching) {
3674         case I915_CACHING_NONE:
3675                 level = I915_CACHE_NONE;
3676                 break;
3677         case I915_CACHING_CACHED:
3678                 /*
3679                  * Due to a HW issue on BXT A stepping, GPU stores via a
3680                  * snooped mapping may leave stale data in a corresponding CPU
3681                  * cacheline, whereas normally such cachelines would get
3682                  * invalidated.
3683                  */
3684                 if (!HAS_LLC(i915) && !HAS_SNOOP(i915))
3685                         return -ENODEV;
3686
3687                 level = I915_CACHE_LLC;
3688                 break;
3689         case I915_CACHING_DISPLAY:
3690                 level = HAS_WT(i915) ? I915_CACHE_WT : I915_CACHE_NONE;
3691                 break;
3692         default:
3693                 return -EINVAL;
3694         }
3695
3696         obj = i915_gem_object_lookup(file, args->handle);
3697         if (!obj)
3698                 return -ENOENT;
3699
3700         /*
3701          * The caching mode of proxy object is handled by its generator, and
3702          * not allowed to be changed by userspace.
3703          */
3704         if (i915_gem_object_is_proxy(obj)) {
3705                 ret = -ENXIO;
3706                 goto out;
3707         }
3708
3709         if (obj->cache_level == level)
3710                 goto out;
3711
3712         ret = i915_gem_object_wait(obj,
3713                                    I915_WAIT_INTERRUPTIBLE,
3714                                    MAX_SCHEDULE_TIMEOUT,
3715                                    to_rps_client(file));
3716         if (ret)
3717                 goto out;
3718
3719         ret = i915_mutex_lock_interruptible(dev);
3720         if (ret)
3721                 goto out;
3722
3723         ret = i915_gem_object_set_cache_level(obj, level);
3724         mutex_unlock(&dev->struct_mutex);
3725
3726 out:
3727         i915_gem_object_put(obj);
3728         return ret;
3729 }
3730
3731 /*
3732  * Prepare buffer for display plane (scanout, cursors, etc). Can be called from
3733  * an uninterruptible phase (modesetting) and allows any flushes to be pipelined
3734  * (for pageflips). We only flush the caches while preparing the buffer for
3735  * display, the callers are responsible for frontbuffer flush.
3736  */
3737 struct i915_vma *
3738 i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
3739                                      u32 alignment,
3740                                      const struct i915_ggtt_view *view,
3741                                      unsigned int flags)
3742 {
3743         struct i915_vma *vma;
3744         int ret;
3745
3746         lockdep_assert_held(&obj->base.dev->struct_mutex);
3747
3748         /* Mark the global pin early so that we account for the
3749          * display coherency whilst setting up the cache domains.
3750          */
3751         obj->pin_global++;
3752
3753         /* The display engine is not coherent with the LLC cache on gen6.  As
3754          * a result, we make sure that the pinning that is about to occur is
3755          * done with uncached PTEs. This is lowest common denominator for all
3756          * chipsets.
3757          *
3758          * However for gen6+, we could do better by using the GFDT bit instead
3759          * of uncaching, which would allow us to flush all the LLC-cached data
3760          * with that bit in the PTE to main memory with just one PIPE_CONTROL.
3761          */
3762         ret = i915_gem_object_set_cache_level(obj,
3763                                               HAS_WT(to_i915(obj->base.dev)) ?
3764                                               I915_CACHE_WT : I915_CACHE_NONE);
3765         if (ret) {
3766                 vma = ERR_PTR(ret);
3767                 goto err_unpin_global;
3768         }
3769
3770         /* As the user may map the buffer once pinned in the display plane
3771          * (e.g. libkms for the bootup splash), we have to ensure that we
3772          * always use map_and_fenceable for all scanout buffers. However,
3773          * it may simply be too big to fit into mappable, in which case
3774          * put it anyway and hope that userspace can cope (but always first
3775          * try to preserve the existing ABI).
3776          */
3777         vma = ERR_PTR(-ENOSPC);
3778         if ((flags & PIN_MAPPABLE) == 0 &&
3779             (!view || view->type == I915_GGTT_VIEW_NORMAL))
3780                 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment,
3781                                                flags |
3782                                                PIN_MAPPABLE |
3783                                                PIN_NONBLOCK);
3784         if (IS_ERR(vma))
3785                 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment, flags);
3786         if (IS_ERR(vma))
3787                 goto err_unpin_global;
3788
3789         vma->display_alignment = max_t(u64, vma->display_alignment, alignment);
3790
3791         __i915_gem_object_flush_for_display(obj);
3792
3793         /* It should now be out of any other write domains, and we can update
3794          * the domain values for our changes.
3795          */
3796         obj->read_domains |= I915_GEM_DOMAIN_GTT;
3797
3798         return vma;
3799
3800 err_unpin_global:
3801         obj->pin_global--;
3802         return vma;
3803 }
3804
3805 void
3806 i915_gem_object_unpin_from_display_plane(struct i915_vma *vma)
3807 {
3808         lockdep_assert_held(&vma->vm->i915->drm.struct_mutex);
3809
3810         if (WARN_ON(vma->obj->pin_global == 0))
3811                 return;
3812
3813         if (--vma->obj->pin_global == 0)
3814                 vma->display_alignment = I915_GTT_MIN_ALIGNMENT;
3815
3816         /* Bump the LRU to try and avoid premature eviction whilst flipping  */
3817         i915_gem_object_bump_inactive_ggtt(vma->obj);
3818
3819         i915_vma_unpin(vma);
3820 }
3821
3822 /**
3823  * Moves a single object to the CPU read, and possibly write domain.
3824  * @obj: object to act on
3825  * @write: requesting write or read-only access
3826  *
3827  * This function returns when the move is complete, including waiting on
3828  * flushes to occur.
3829  */
3830 int
3831 i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
3832 {
3833         int ret;
3834
3835         lockdep_assert_held(&obj->base.dev->struct_mutex);
3836
3837         ret = i915_gem_object_wait(obj,
3838                                    I915_WAIT_INTERRUPTIBLE |
3839                                    I915_WAIT_LOCKED |
3840                                    (write ? I915_WAIT_ALL : 0),
3841                                    MAX_SCHEDULE_TIMEOUT,
3842                                    NULL);
3843         if (ret)
3844                 return ret;
3845
3846         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
3847
3848         /* Flush the CPU cache if it's still invalid. */
3849         if ((obj->read_domains & I915_GEM_DOMAIN_CPU) == 0) {
3850                 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
3851                 obj->read_domains |= I915_GEM_DOMAIN_CPU;
3852         }
3853
3854         /* It should now be out of any other write domains, and we can update
3855          * the domain values for our changes.
3856          */
3857         GEM_BUG_ON(obj->write_domain & ~I915_GEM_DOMAIN_CPU);
3858
3859         /* If we're writing through the CPU, then the GPU read domains will
3860          * need to be invalidated at next use.
3861          */
3862         if (write)
3863                 __start_cpu_write(obj);
3864
3865         return 0;
3866 }
3867
3868 /* Throttle our rendering by waiting until the ring has completed our requests
3869  * emitted over 20 msec ago.
3870  *
3871  * Note that if we were to use the current jiffies each time around the loop,
3872  * we wouldn't escape the function with any frames outstanding if the time to
3873  * render a frame was over 20ms.
3874  *
3875  * This should get us reasonable parallelism between CPU and GPU but also
3876  * relatively low latency when blocking on a particular request to finish.
3877  */
3878 static int
3879 i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
3880 {
3881         struct drm_i915_private *dev_priv = to_i915(dev);
3882         struct drm_i915_file_private *file_priv = file->driver_priv;
3883         unsigned long recent_enough = jiffies - DRM_I915_THROTTLE_JIFFIES;
3884         struct i915_request *request, *target = NULL;
3885         long ret;
3886
3887         /* ABI: return -EIO if already wedged */
3888         if (i915_terminally_wedged(&dev_priv->gpu_error))
3889                 return -EIO;
3890
3891         spin_lock(&file_priv->mm.lock);
3892         list_for_each_entry(request, &file_priv->mm.request_list, client_link) {
3893                 if (time_after_eq(request->emitted_jiffies, recent_enough))
3894                         break;
3895
3896                 if (target) {
3897                         list_del(&target->client_link);
3898                         target->file_priv = NULL;
3899                 }
3900
3901                 target = request;
3902         }
3903         if (target)
3904                 i915_request_get(target);
3905         spin_unlock(&file_priv->mm.lock);
3906
3907         if (target == NULL)
3908                 return 0;
3909
3910         ret = i915_request_wait(target,
3911                                 I915_WAIT_INTERRUPTIBLE,
3912                                 MAX_SCHEDULE_TIMEOUT);
3913         i915_request_put(target);
3914
3915         return ret < 0 ? ret : 0;
3916 }
3917
3918 struct i915_vma *
3919 i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj,
3920                          const struct i915_ggtt_view *view,
3921                          u64 size,
3922                          u64 alignment,
3923                          u64 flags)
3924 {
3925         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
3926         struct i915_address_space *vm = &dev_priv->ggtt.vm;
3927         struct i915_vma *vma;
3928         int ret;
3929
3930         lockdep_assert_held(&obj->base.dev->struct_mutex);
3931
3932         if (flags & PIN_MAPPABLE &&
3933             (!view || view->type == I915_GGTT_VIEW_NORMAL)) {
3934                 /* If the required space is larger than the available
3935                  * aperture, we will not able to find a slot for the
3936                  * object and unbinding the object now will be in
3937                  * vain. Worse, doing so may cause us to ping-pong
3938                  * the object in and out of the Global GTT and
3939                  * waste a lot of cycles under the mutex.
3940                  */
3941                 if (obj->base.size > dev_priv->ggtt.mappable_end)
3942                         return ERR_PTR(-E2BIG);
3943
3944                 /* If NONBLOCK is set the caller is optimistically
3945                  * trying to cache the full object within the mappable
3946                  * aperture, and *must* have a fallback in place for
3947                  * situations where we cannot bind the object. We
3948                  * can be a little more lax here and use the fallback
3949                  * more often to avoid costly migrations of ourselves
3950                  * and other objects within the aperture.
3951                  *
3952                  * Half-the-aperture is used as a simple heuristic.
3953                  * More interesting would to do search for a free
3954                  * block prior to making the commitment to unbind.
3955                  * That caters for the self-harm case, and with a
3956                  * little more heuristics (e.g. NOFAULT, NOEVICT)
3957                  * we could try to minimise harm to others.
3958                  */
3959                 if (flags & PIN_NONBLOCK &&
3960                     obj->base.size > dev_priv->ggtt.mappable_end / 2)
3961                         return ERR_PTR(-ENOSPC);
3962         }
3963
3964         vma = i915_vma_instance(obj, vm, view);
3965         if (unlikely(IS_ERR(vma)))
3966                 return vma;
3967
3968         if (i915_vma_misplaced(vma, size, alignment, flags)) {
3969                 if (flags & PIN_NONBLOCK) {
3970                         if (i915_vma_is_pinned(vma) || i915_vma_is_active(vma))
3971                                 return ERR_PTR(-ENOSPC);
3972
3973                         if (flags & PIN_MAPPABLE &&
3974                             vma->fence_size > dev_priv->ggtt.mappable_end / 2)
3975                                 return ERR_PTR(-ENOSPC);
3976                 }
3977
3978                 WARN(i915_vma_is_pinned(vma),
3979                      "bo is already pinned in ggtt with incorrect alignment:"
3980                      " offset=%08x, req.alignment=%llx,"
3981                      " req.map_and_fenceable=%d, vma->map_and_fenceable=%d\n",
3982                      i915_ggtt_offset(vma), alignment,
3983                      !!(flags & PIN_MAPPABLE),
3984                      i915_vma_is_map_and_fenceable(vma));
3985                 ret = i915_vma_unbind(vma);
3986                 if (ret)
3987                         return ERR_PTR(ret);
3988         }
3989
3990         ret = i915_vma_pin(vma, size, alignment, flags | PIN_GLOBAL);
3991         if (ret)
3992                 return ERR_PTR(ret);
3993
3994         return vma;
3995 }
3996
3997 static __always_inline unsigned int __busy_read_flag(unsigned int id)
3998 {
3999         /* Note that we could alias engines in the execbuf API, but
4000          * that would be very unwise as it prevents userspace from
4001          * fine control over engine selection. Ahem.
4002          *
4003          * This should be something like EXEC_MAX_ENGINE instead of
4004          * I915_NUM_ENGINES.
4005          */
4006         BUILD_BUG_ON(I915_NUM_ENGINES > 16);
4007         return 0x10000 << id;
4008 }
4009
4010 static __always_inline unsigned int __busy_write_id(unsigned int id)
4011 {
4012         /* The uABI guarantees an active writer is also amongst the read
4013          * engines. This would be true if we accessed the activity tracking
4014          * under the lock, but as we perform the lookup of the object and
4015          * its activity locklessly we can not guarantee that the last_write
4016          * being active implies that we have set the same engine flag from
4017          * last_read - hence we always set both read and write busy for
4018          * last_write.
4019          */
4020         return id | __busy_read_flag(id);
4021 }
4022
4023 static __always_inline unsigned int
4024 __busy_set_if_active(const struct dma_fence *fence,
4025                      unsigned int (*flag)(unsigned int id))
4026 {
4027         struct i915_request *rq;
4028
4029         /* We have to check the current hw status of the fence as the uABI
4030          * guarantees forward progress. We could rely on the idle worker
4031          * to eventually flush us, but to minimise latency just ask the
4032          * hardware.
4033          *
4034          * Note we only report on the status of native fences.
4035          */
4036         if (!dma_fence_is_i915(fence))
4037                 return 0;
4038
4039         /* opencode to_request() in order to avoid const warnings */
4040         rq = container_of(fence, struct i915_request, fence);
4041         if (i915_request_completed(rq))
4042                 return 0;
4043
4044         return flag(rq->engine->uabi_id);
4045 }
4046
4047 static __always_inline unsigned int
4048 busy_check_reader(const struct dma_fence *fence)
4049 {
4050         return __busy_set_if_active(fence, __busy_read_flag);
4051 }
4052
4053 static __always_inline unsigned int
4054 busy_check_writer(const struct dma_fence *fence)
4055 {
4056         if (!fence)
4057                 return 0;
4058
4059         return __busy_set_if_active(fence, __busy_write_id);
4060 }
4061
4062 int
4063 i915_gem_busy_ioctl(struct drm_device *dev, void *data,
4064                     struct drm_file *file)
4065 {
4066         struct drm_i915_gem_busy *args = data;
4067         struct drm_i915_gem_object *obj;
4068         struct reservation_object_list *list;
4069         unsigned int seq;
4070         int err;
4071
4072         err = -ENOENT;
4073         rcu_read_lock();
4074         obj = i915_gem_object_lookup_rcu(file, args->handle);
4075         if (!obj)
4076                 goto out;
4077
4078         /* A discrepancy here is that we do not report the status of
4079          * non-i915 fences, i.e. even though we may report the object as idle,
4080          * a call to set-domain may still stall waiting for foreign rendering.
4081          * This also means that wait-ioctl may report an object as busy,
4082          * where busy-ioctl considers it idle.
4083          *
4084          * We trade the ability to warn of foreign fences to report on which
4085          * i915 engines are active for the object.
4086          *
4087          * Alternatively, we can trade that extra information on read/write
4088          * activity with
4089          *      args->busy =
4090          *              !reservation_object_test_signaled_rcu(obj->resv, true);
4091          * to report the overall busyness. This is what the wait-ioctl does.
4092          *
4093          */
4094 retry:
4095         seq = raw_read_seqcount(&obj->resv->seq);
4096
4097         /* Translate the exclusive fence to the READ *and* WRITE engine */
4098         args->busy = busy_check_writer(rcu_dereference(obj->resv->fence_excl));
4099
4100         /* Translate shared fences to READ set of engines */
4101         list = rcu_dereference(obj->resv->fence);
4102         if (list) {
4103                 unsigned int shared_count = list->shared_count, i;
4104
4105                 for (i = 0; i < shared_count; ++i) {
4106                         struct dma_fence *fence =
4107                                 rcu_dereference(list->shared[i]);
4108
4109                         args->busy |= busy_check_reader(fence);
4110                 }
4111         }
4112
4113         if (args->busy && read_seqcount_retry(&obj->resv->seq, seq))
4114                 goto retry;
4115
4116         err = 0;
4117 out:
4118         rcu_read_unlock();
4119         return err;
4120 }
4121
4122 int
4123 i915_gem_throttle_ioctl(struct drm_device *dev, void *data,
4124                         struct drm_file *file_priv)
4125 {
4126         return i915_gem_ring_throttle(dev, file_priv);
4127 }
4128
4129 int
4130 i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
4131                        struct drm_file *file_priv)
4132 {
4133         struct drm_i915_private *dev_priv = to_i915(dev);
4134         struct drm_i915_gem_madvise *args = data;
4135         struct drm_i915_gem_object *obj;
4136         int err;
4137
4138         switch (args->madv) {
4139         case I915_MADV_DONTNEED:
4140         case I915_MADV_WILLNEED:
4141             break;
4142         default:
4143             return -EINVAL;
4144         }
4145
4146         obj = i915_gem_object_lookup(file_priv, args->handle);
4147         if (!obj)
4148                 return -ENOENT;
4149
4150         err = mutex_lock_interruptible(&obj->mm.lock);
4151         if (err)
4152                 goto out;
4153
4154         if (i915_gem_object_has_pages(obj) &&
4155             i915_gem_object_is_tiled(obj) &&
4156             dev_priv->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
4157                 if (obj->mm.madv == I915_MADV_WILLNEED) {
4158                         GEM_BUG_ON(!obj->mm.quirked);
4159                         __i915_gem_object_unpin_pages(obj);
4160                         obj->mm.quirked = false;
4161                 }
4162                 if (args->madv == I915_MADV_WILLNEED) {
4163                         GEM_BUG_ON(obj->mm.quirked);
4164                         __i915_gem_object_pin_pages(obj);
4165                         obj->mm.quirked = true;
4166                 }
4167         }
4168
4169         if (obj->mm.madv != __I915_MADV_PURGED)
4170                 obj->mm.madv = args->madv;
4171
4172         /* if the object is no longer attached, discard its backing storage */
4173         if (obj->mm.madv == I915_MADV_DONTNEED &&
4174             !i915_gem_object_has_pages(obj))
4175                 i915_gem_object_truncate(obj);
4176
4177         args->retained = obj->mm.madv != __I915_MADV_PURGED;
4178         mutex_unlock(&obj->mm.lock);
4179
4180 out:
4181         i915_gem_object_put(obj);
4182         return err;
4183 }
4184
4185 static void
4186 frontbuffer_retire(struct i915_active_request *active,
4187                    struct i915_request *request)
4188 {
4189         struct drm_i915_gem_object *obj =
4190                 container_of(active, typeof(*obj), frontbuffer_write);
4191
4192         intel_fb_obj_flush(obj, ORIGIN_CS);
4193 }
4194
4195 void i915_gem_object_init(struct drm_i915_gem_object *obj,
4196                           const struct drm_i915_gem_object_ops *ops)
4197 {
4198         mutex_init(&obj->mm.lock);
4199
4200         spin_lock_init(&obj->vma.lock);
4201         INIT_LIST_HEAD(&obj->vma.list);
4202
4203         INIT_LIST_HEAD(&obj->lut_list);
4204         INIT_LIST_HEAD(&obj->batch_pool_link);
4205
4206         init_rcu_head(&obj->rcu);
4207
4208         obj->ops = ops;
4209
4210         reservation_object_init(&obj->__builtin_resv);
4211         obj->resv = &obj->__builtin_resv;
4212
4213         obj->frontbuffer_ggtt_origin = ORIGIN_GTT;
4214         i915_active_request_init(&obj->frontbuffer_write,
4215                                  NULL, frontbuffer_retire);
4216
4217         obj->mm.madv = I915_MADV_WILLNEED;
4218         INIT_RADIX_TREE(&obj->mm.get_page.radix, GFP_KERNEL | __GFP_NOWARN);
4219         mutex_init(&obj->mm.get_page.lock);
4220
4221         i915_gem_info_add_obj(to_i915(obj->base.dev), obj->base.size);
4222 }
4223
4224 static const struct drm_i915_gem_object_ops i915_gem_object_ops = {
4225         .flags = I915_GEM_OBJECT_HAS_STRUCT_PAGE |
4226                  I915_GEM_OBJECT_IS_SHRINKABLE,
4227
4228         .get_pages = i915_gem_object_get_pages_gtt,
4229         .put_pages = i915_gem_object_put_pages_gtt,
4230
4231         .pwrite = i915_gem_object_pwrite_gtt,
4232 };
4233
4234 static int i915_gem_object_create_shmem(struct drm_device *dev,
4235                                         struct drm_gem_object *obj,
4236                                         size_t size)
4237 {
4238         struct drm_i915_private *i915 = to_i915(dev);
4239         unsigned long flags = VM_NORESERVE;
4240         struct file *filp;
4241
4242         drm_gem_private_object_init(dev, obj, size);
4243
4244         if (i915->mm.gemfs)
4245                 filp = shmem_file_setup_with_mnt(i915->mm.gemfs, "i915", size,
4246                                                  flags);
4247         else
4248                 filp = shmem_file_setup("i915", size, flags);
4249
4250         if (IS_ERR(filp))
4251                 return PTR_ERR(filp);
4252
4253         obj->filp = filp;
4254
4255         return 0;
4256 }
4257
4258 struct drm_i915_gem_object *
4259 i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
4260 {
4261         struct drm_i915_gem_object *obj;
4262         struct address_space *mapping;
4263         unsigned int cache_level;
4264         gfp_t mask;
4265         int ret;
4266
4267         /* There is a prevalence of the assumption that we fit the object's
4268          * page count inside a 32bit _signed_ variable. Let's document this and
4269          * catch if we ever need to fix it. In the meantime, if you do spot
4270          * such a local variable, please consider fixing!
4271          */
4272         if (size >> PAGE_SHIFT > INT_MAX)
4273                 return ERR_PTR(-E2BIG);
4274
4275         if (overflows_type(size, obj->base.size))
4276                 return ERR_PTR(-E2BIG);
4277
4278         obj = i915_gem_object_alloc(dev_priv);
4279         if (obj == NULL)
4280                 return ERR_PTR(-ENOMEM);
4281
4282         ret = i915_gem_object_create_shmem(&dev_priv->drm, &obj->base, size);
4283         if (ret)
4284                 goto fail;
4285
4286         mask = GFP_HIGHUSER | __GFP_RECLAIMABLE;
4287         if (IS_I965GM(dev_priv) || IS_I965G(dev_priv)) {
4288                 /* 965gm cannot relocate objects above 4GiB. */
4289                 mask &= ~__GFP_HIGHMEM;
4290                 mask |= __GFP_DMA32;
4291         }
4292
4293         mapping = obj->base.filp->f_mapping;
4294         mapping_set_gfp_mask(mapping, mask);
4295         GEM_BUG_ON(!(mapping_gfp_mask(mapping) & __GFP_RECLAIM));
4296
4297         i915_gem_object_init(obj, &i915_gem_object_ops);
4298
4299         obj->write_domain = I915_GEM_DOMAIN_CPU;
4300         obj->read_domains = I915_GEM_DOMAIN_CPU;
4301
4302         if (HAS_LLC(dev_priv))
4303                 /* On some devices, we can have the GPU use the LLC (the CPU
4304                  * cache) for about a 10% performance improvement
4305                  * compared to uncached.  Graphics requests other than
4306                  * display scanout are coherent with the CPU in
4307                  * accessing this cache.  This means in this mode we
4308                  * don't need to clflush on the CPU side, and on the
4309                  * GPU side we only need to flush internal caches to
4310                  * get data visible to the CPU.
4311                  *
4312                  * However, we maintain the display planes as UC, and so
4313                  * need to rebind when first used as such.
4314                  */
4315                 cache_level = I915_CACHE_LLC;
4316         else
4317                 cache_level = I915_CACHE_NONE;
4318
4319         i915_gem_object_set_cache_coherency(obj, cache_level);
4320
4321         trace_i915_gem_object_create(obj);
4322
4323         return obj;
4324
4325 fail:
4326         i915_gem_object_free(obj);
4327         return ERR_PTR(ret);
4328 }
4329
4330 static bool discard_backing_storage(struct drm_i915_gem_object *obj)
4331 {
4332         /* If we are the last user of the backing storage (be it shmemfs
4333          * pages or stolen etc), we know that the pages are going to be
4334          * immediately released. In this case, we can then skip copying
4335          * back the contents from the GPU.
4336          */
4337
4338         if (obj->mm.madv != I915_MADV_WILLNEED)
4339                 return false;
4340
4341         if (obj->base.filp == NULL)
4342                 return true;
4343
4344         /* At first glance, this looks racy, but then again so would be
4345          * userspace racing mmap against close. However, the first external
4346          * reference to the filp can only be obtained through the
4347          * i915_gem_mmap_ioctl() which safeguards us against the user
4348          * acquiring such a reference whilst we are in the middle of
4349          * freeing the object.
4350          */
4351         return atomic_long_read(&obj->base.filp->f_count) == 1;
4352 }
4353
4354 static void __i915_gem_free_objects(struct drm_i915_private *i915,
4355                                     struct llist_node *freed)
4356 {
4357         struct drm_i915_gem_object *obj, *on;
4358         intel_wakeref_t wakeref;
4359
4360         wakeref = intel_runtime_pm_get(i915);
4361         llist_for_each_entry_safe(obj, on, freed, freed) {
4362                 struct i915_vma *vma, *vn;
4363
4364                 trace_i915_gem_object_destroy(obj);
4365
4366                 mutex_lock(&i915->drm.struct_mutex);
4367
4368                 GEM_BUG_ON(i915_gem_object_is_active(obj));
4369                 list_for_each_entry_safe(vma, vn, &obj->vma.list, obj_link) {
4370                         GEM_BUG_ON(i915_vma_is_active(vma));
4371                         vma->flags &= ~I915_VMA_PIN_MASK;
4372                         i915_vma_destroy(vma);
4373                 }
4374                 GEM_BUG_ON(!list_empty(&obj->vma.list));
4375                 GEM_BUG_ON(!RB_EMPTY_ROOT(&obj->vma.tree));
4376
4377                 /* This serializes freeing with the shrinker. Since the free
4378                  * is delayed, first by RCU then by the workqueue, we want the
4379                  * shrinker to be able to free pages of unreferenced objects,
4380                  * or else we may oom whilst there are plenty of deferred
4381                  * freed objects.
4382                  */
4383                 if (i915_gem_object_has_pages(obj)) {
4384                         spin_lock(&i915->mm.obj_lock);
4385                         list_del_init(&obj->mm.link);
4386                         spin_unlock(&i915->mm.obj_lock);
4387                 }
4388
4389                 mutex_unlock(&i915->drm.struct_mutex);
4390
4391                 GEM_BUG_ON(obj->bind_count);
4392                 GEM_BUG_ON(obj->userfault_count);
4393                 GEM_BUG_ON(atomic_read(&obj->frontbuffer_bits));
4394                 GEM_BUG_ON(!list_empty(&obj->lut_list));
4395
4396                 if (obj->ops->release)
4397                         obj->ops->release(obj);
4398
4399                 if (WARN_ON(i915_gem_object_has_pinned_pages(obj)))
4400                         atomic_set(&obj->mm.pages_pin_count, 0);
4401                 __i915_gem_object_put_pages(obj, I915_MM_NORMAL);
4402                 GEM_BUG_ON(i915_gem_object_has_pages(obj));
4403
4404                 if (obj->base.import_attach)
4405                         drm_prime_gem_destroy(&obj->base, NULL);
4406
4407                 reservation_object_fini(&obj->__builtin_resv);
4408                 drm_gem_object_release(&obj->base);
4409                 i915_gem_info_remove_obj(i915, obj->base.size);
4410
4411                 kfree(obj->bit_17);
4412                 i915_gem_object_free(obj);
4413
4414                 GEM_BUG_ON(!atomic_read(&i915->mm.free_count));
4415                 atomic_dec(&i915->mm.free_count);
4416
4417                 if (on)
4418                         cond_resched();
4419         }
4420         intel_runtime_pm_put(i915, wakeref);
4421 }
4422
4423 static void i915_gem_flush_free_objects(struct drm_i915_private *i915)
4424 {
4425         struct llist_node *freed;
4426
4427         /* Free the oldest, most stale object to keep the free_list short */
4428         freed = NULL;
4429         if (!llist_empty(&i915->mm.free_list)) { /* quick test for hotpath */
4430                 /* Only one consumer of llist_del_first() allowed */
4431                 spin_lock(&i915->mm.free_lock);
4432                 freed = llist_del_first(&i915->mm.free_list);
4433                 spin_unlock(&i915->mm.free_lock);
4434         }
4435         if (unlikely(freed)) {
4436                 freed->next = NULL;
4437                 __i915_gem_free_objects(i915, freed);
4438         }
4439 }
4440
4441 static void __i915_gem_free_work(struct work_struct *work)
4442 {
4443         struct drm_i915_private *i915 =
4444                 container_of(work, struct drm_i915_private, mm.free_work);
4445         struct llist_node *freed;
4446
4447         /*
4448          * All file-owned VMA should have been released by this point through
4449          * i915_gem_close_object(), or earlier by i915_gem_context_close().
4450          * However, the object may also be bound into the global GTT (e.g.
4451          * older GPUs without per-process support, or for direct access through
4452          * the GTT either for the user or for scanout). Those VMA still need to
4453          * unbound now.
4454          */
4455
4456         spin_lock(&i915->mm.free_lock);
4457         while ((freed = llist_del_all(&i915->mm.free_list))) {
4458                 spin_unlock(&i915->mm.free_lock);
4459
4460                 __i915_gem_free_objects(i915, freed);
4461                 if (need_resched())
4462                         return;
4463
4464                 spin_lock(&i915->mm.free_lock);
4465         }
4466         spin_unlock(&i915->mm.free_lock);
4467 }
4468
4469 static void __i915_gem_free_object_rcu(struct rcu_head *head)
4470 {
4471         struct drm_i915_gem_object *obj =
4472                 container_of(head, typeof(*obj), rcu);
4473         struct drm_i915_private *i915 = to_i915(obj->base.dev);
4474
4475         /*
4476          * We reuse obj->rcu for the freed list, so we had better not treat
4477          * it like a rcu_head from this point forwards. And we expect all
4478          * objects to be freed via this path.
4479          */
4480         destroy_rcu_head(&obj->rcu);
4481
4482         /*
4483          * Since we require blocking on struct_mutex to unbind the freed
4484          * object from the GPU before releasing resources back to the
4485          * system, we can not do that directly from the RCU callback (which may
4486          * be a softirq context), but must instead then defer that work onto a
4487          * kthread. We use the RCU callback rather than move the freed object
4488          * directly onto the work queue so that we can mix between using the
4489          * worker and performing frees directly from subsequent allocations for
4490          * crude but effective memory throttling.
4491          */
4492         if (llist_add(&obj->freed, &i915->mm.free_list))
4493                 queue_work(i915->wq, &i915->mm.free_work);
4494 }
4495
4496 void i915_gem_free_object(struct drm_gem_object *gem_obj)
4497 {
4498         struct drm_i915_gem_object *obj = to_intel_bo(gem_obj);
4499
4500         if (obj->mm.quirked)
4501                 __i915_gem_object_unpin_pages(obj);
4502
4503         if (discard_backing_storage(obj))
4504                 obj->mm.madv = I915_MADV_DONTNEED;
4505
4506         /*
4507          * Before we free the object, make sure any pure RCU-only
4508          * read-side critical sections are complete, e.g.
4509          * i915_gem_busy_ioctl(). For the corresponding synchronized
4510          * lookup see i915_gem_object_lookup_rcu().
4511          */
4512         atomic_inc(&to_i915(obj->base.dev)->mm.free_count);
4513         call_rcu(&obj->rcu, __i915_gem_free_object_rcu);
4514 }
4515
4516 void __i915_gem_object_release_unless_active(struct drm_i915_gem_object *obj)
4517 {
4518         lockdep_assert_held(&obj->base.dev->struct_mutex);
4519
4520         if (!i915_gem_object_has_active_reference(obj) &&
4521             i915_gem_object_is_active(obj))
4522                 i915_gem_object_set_active_reference(obj);
4523         else
4524                 i915_gem_object_put(obj);
4525 }
4526
4527 void i915_gem_sanitize(struct drm_i915_private *i915)
4528 {
4529         intel_wakeref_t wakeref;
4530
4531         GEM_TRACE("\n");
4532
4533         wakeref = intel_runtime_pm_get(i915);
4534         intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
4535
4536         /*
4537          * As we have just resumed the machine and woken the device up from
4538          * deep PCI sleep (presumably D3_cold), assume the HW has been reset
4539          * back to defaults, recovering from whatever wedged state we left it
4540          * in and so worth trying to use the device once more.
4541          */
4542         if (i915_terminally_wedged(&i915->gpu_error))
4543                 i915_gem_unset_wedged(i915);
4544
4545         /*
4546          * If we inherit context state from the BIOS or earlier occupants
4547          * of the GPU, the GPU may be in an inconsistent state when we
4548          * try to take over. The only way to remove the earlier state
4549          * is by resetting. However, resetting on earlier gen is tricky as
4550          * it may impact the display and we are uncertain about the stability
4551          * of the reset, so this could be applied to even earlier gen.
4552          */
4553         intel_engines_sanitize(i915, false);
4554
4555         intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
4556         intel_runtime_pm_put(i915, wakeref);
4557
4558         mutex_lock(&i915->drm.struct_mutex);
4559         i915_gem_contexts_lost(i915);
4560         mutex_unlock(&i915->drm.struct_mutex);
4561 }
4562
4563 int i915_gem_suspend(struct drm_i915_private *i915)
4564 {
4565         intel_wakeref_t wakeref;
4566         int ret;
4567
4568         GEM_TRACE("\n");
4569
4570         wakeref = intel_runtime_pm_get(i915);
4571         intel_suspend_gt_powersave(i915);
4572
4573         flush_workqueue(i915->wq);
4574
4575         mutex_lock(&i915->drm.struct_mutex);
4576
4577         /*
4578          * We have to flush all the executing contexts to main memory so
4579          * that they can saved in the hibernation image. To ensure the last
4580          * context image is coherent, we have to switch away from it. That
4581          * leaves the i915->kernel_context still active when
4582          * we actually suspend, and its image in memory may not match the GPU
4583          * state. Fortunately, the kernel_context is disposable and we do
4584          * not rely on its state.
4585          */
4586         if (!i915_terminally_wedged(&i915->gpu_error)) {
4587                 ret = i915_gem_switch_to_kernel_context(i915);
4588                 if (ret)
4589                         goto err_unlock;
4590
4591                 ret = i915_gem_wait_for_idle(i915,
4592                                              I915_WAIT_INTERRUPTIBLE |
4593                                              I915_WAIT_LOCKED |
4594                                              I915_WAIT_FOR_IDLE_BOOST,
4595                                              MAX_SCHEDULE_TIMEOUT);
4596                 if (ret && ret != -EIO)
4597                         goto err_unlock;
4598
4599                 assert_kernel_context_is_current(i915);
4600         }
4601         i915_retire_requests(i915); /* ensure we flush after wedging */
4602
4603         mutex_unlock(&i915->drm.struct_mutex);
4604         i915_reset_flush(i915);
4605
4606         drain_delayed_work(&i915->gt.retire_work);
4607
4608         /*
4609          * As the idle_work is rearming if it detects a race, play safe and
4610          * repeat the flush until it is definitely idle.
4611          */
4612         drain_delayed_work(&i915->gt.idle_work);
4613
4614         intel_uc_suspend(i915);
4615
4616         /*
4617          * Assert that we successfully flushed all the work and
4618          * reset the GPU back to its idle, low power state.
4619          */
4620         WARN_ON(i915->gt.awake);
4621         if (WARN_ON(!intel_engines_are_idle(i915)))
4622                 i915_gem_set_wedged(i915); /* no hope, discard everything */
4623
4624         intel_runtime_pm_put(i915, wakeref);
4625         return 0;
4626
4627 err_unlock:
4628         mutex_unlock(&i915->drm.struct_mutex);
4629         intel_runtime_pm_put(i915, wakeref);
4630         return ret;
4631 }
4632
4633 void i915_gem_suspend_late(struct drm_i915_private *i915)
4634 {
4635         struct drm_i915_gem_object *obj;
4636         struct list_head *phases[] = {
4637                 &i915->mm.unbound_list,
4638                 &i915->mm.bound_list,
4639                 NULL
4640         }, **phase;
4641
4642         /*
4643          * Neither the BIOS, ourselves or any other kernel
4644          * expects the system to be in execlists mode on startup,
4645          * so we need to reset the GPU back to legacy mode. And the only
4646          * known way to disable logical contexts is through a GPU reset.
4647          *
4648          * So in order to leave the system in a known default configuration,
4649          * always reset the GPU upon unload and suspend. Afterwards we then
4650          * clean up the GEM state tracking, flushing off the requests and
4651          * leaving the system in a known idle state.
4652          *
4653          * Note that is of the upmost importance that the GPU is idle and
4654          * all stray writes are flushed *before* we dismantle the backing
4655          * storage for the pinned objects.
4656          *
4657          * However, since we are uncertain that resetting the GPU on older
4658          * machines is a good idea, we don't - just in case it leaves the
4659          * machine in an unusable condition.
4660          */
4661
4662         mutex_lock(&i915->drm.struct_mutex);
4663         for (phase = phases; *phase; phase++) {
4664                 list_for_each_entry(obj, *phase, mm.link)
4665                         WARN_ON(i915_gem_object_set_to_gtt_domain(obj, false));
4666         }
4667         mutex_unlock(&i915->drm.struct_mutex);
4668
4669         intel_uc_sanitize(i915);
4670         i915_gem_sanitize(i915);
4671 }
4672
4673 void i915_gem_resume(struct drm_i915_private *i915)
4674 {
4675         GEM_TRACE("\n");
4676
4677         WARN_ON(i915->gt.awake);
4678
4679         mutex_lock(&i915->drm.struct_mutex);
4680         intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
4681
4682         i915_gem_restore_gtt_mappings(i915);
4683         i915_gem_restore_fences(i915);
4684
4685         /*
4686          * As we didn't flush the kernel context before suspend, we cannot
4687          * guarantee that the context image is complete. So let's just reset
4688          * it and start again.
4689          */
4690         i915->gt.resume(i915);
4691
4692         if (i915_gem_init_hw(i915))
4693                 goto err_wedged;
4694
4695         intel_uc_resume(i915);
4696
4697         /* Always reload a context for powersaving. */
4698         if (i915_gem_switch_to_kernel_context(i915))
4699                 goto err_wedged;
4700
4701 out_unlock:
4702         intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
4703         mutex_unlock(&i915->drm.struct_mutex);
4704         return;
4705
4706 err_wedged:
4707         if (!i915_terminally_wedged(&i915->gpu_error)) {
4708                 DRM_ERROR("failed to re-initialize GPU, declaring wedged!\n");
4709                 i915_gem_set_wedged(i915);
4710         }
4711         goto out_unlock;
4712 }
4713
4714 void i915_gem_init_swizzling(struct drm_i915_private *dev_priv)
4715 {
4716         if (INTEL_GEN(dev_priv) < 5 ||
4717             dev_priv->mm.bit_6_swizzle_x == I915_BIT_6_SWIZZLE_NONE)
4718                 return;
4719
4720         I915_WRITE(DISP_ARB_CTL, I915_READ(DISP_ARB_CTL) |
4721                                  DISP_TILE_SURFACE_SWIZZLING);
4722
4723         if (IS_GEN(dev_priv, 5))
4724                 return;
4725
4726         I915_WRITE(TILECTL, I915_READ(TILECTL) | TILECTL_SWZCTL);
4727         if (IS_GEN(dev_priv, 6))
4728                 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_SNB));
4729         else if (IS_GEN(dev_priv, 7))
4730                 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_IVB));
4731         else if (IS_GEN(dev_priv, 8))
4732                 I915_WRITE(GAMTARBMODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_BDW));
4733         else
4734                 BUG();
4735 }
4736
4737 static void init_unused_ring(struct drm_i915_private *dev_priv, u32 base)
4738 {
4739         I915_WRITE(RING_CTL(base), 0);
4740         I915_WRITE(RING_HEAD(base), 0);
4741         I915_WRITE(RING_TAIL(base), 0);
4742         I915_WRITE(RING_START(base), 0);
4743 }
4744
4745 static void init_unused_rings(struct drm_i915_private *dev_priv)
4746 {
4747         if (IS_I830(dev_priv)) {
4748                 init_unused_ring(dev_priv, PRB1_BASE);
4749                 init_unused_ring(dev_priv, SRB0_BASE);
4750                 init_unused_ring(dev_priv, SRB1_BASE);
4751                 init_unused_ring(dev_priv, SRB2_BASE);
4752                 init_unused_ring(dev_priv, SRB3_BASE);
4753         } else if (IS_GEN(dev_priv, 2)) {
4754                 init_unused_ring(dev_priv, SRB0_BASE);
4755                 init_unused_ring(dev_priv, SRB1_BASE);
4756         } else if (IS_GEN(dev_priv, 3)) {
4757                 init_unused_ring(dev_priv, PRB1_BASE);
4758                 init_unused_ring(dev_priv, PRB2_BASE);
4759         }
4760 }
4761
4762 static int __i915_gem_restart_engines(void *data)
4763 {
4764         struct drm_i915_private *i915 = data;
4765         struct intel_engine_cs *engine;
4766         enum intel_engine_id id;
4767         int err;
4768
4769         for_each_engine(engine, i915, id) {
4770                 err = engine->init_hw(engine);
4771                 if (err) {
4772                         DRM_ERROR("Failed to restart %s (%d)\n",
4773                                   engine->name, err);
4774                         return err;
4775                 }
4776         }
4777
4778         return 0;
4779 }
4780
4781 int i915_gem_init_hw(struct drm_i915_private *dev_priv)
4782 {
4783         int ret;
4784
4785         dev_priv->gt.last_init_time = ktime_get();
4786
4787         /* Double layer security blanket, see i915_gem_init() */
4788         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
4789
4790         if (HAS_EDRAM(dev_priv) && INTEL_GEN(dev_priv) < 9)
4791                 I915_WRITE(HSW_IDICR, I915_READ(HSW_IDICR) | IDIHASHMSK(0xf));
4792
4793         if (IS_HASWELL(dev_priv))
4794                 I915_WRITE(MI_PREDICATE_RESULT_2, IS_HSW_GT3(dev_priv) ?
4795                            LOWER_SLICE_ENABLED : LOWER_SLICE_DISABLED);
4796
4797         /* Apply the GT workarounds... */
4798         intel_gt_apply_workarounds(dev_priv);
4799         /* ...and determine whether they are sticking. */
4800         intel_gt_verify_workarounds(dev_priv, "init");
4801
4802         i915_gem_init_swizzling(dev_priv);
4803
4804         /*
4805          * At least 830 can leave some of the unused rings
4806          * "active" (ie. head != tail) after resume which
4807          * will prevent c3 entry. Makes sure all unused rings
4808          * are totally idle.
4809          */
4810         init_unused_rings(dev_priv);
4811
4812         BUG_ON(!dev_priv->kernel_context);
4813         if (i915_terminally_wedged(&dev_priv->gpu_error)) {
4814                 ret = -EIO;
4815                 goto out;
4816         }
4817
4818         ret = i915_ppgtt_init_hw(dev_priv);
4819         if (ret) {
4820                 DRM_ERROR("Enabling PPGTT failed (%d)\n", ret);
4821                 goto out;
4822         }
4823
4824         ret = intel_wopcm_init_hw(&dev_priv->wopcm);
4825         if (ret) {
4826                 DRM_ERROR("Enabling WOPCM failed (%d)\n", ret);
4827                 goto out;
4828         }
4829
4830         /* We can't enable contexts until all firmware is loaded */
4831         ret = intel_uc_init_hw(dev_priv);
4832         if (ret) {
4833                 DRM_ERROR("Enabling uc failed (%d)\n", ret);
4834                 goto out;
4835         }
4836
4837         intel_mocs_init_l3cc_table(dev_priv);
4838
4839         /* Only when the HW is re-initialised, can we replay the requests */
4840         ret = __i915_gem_restart_engines(dev_priv);
4841         if (ret)
4842                 goto cleanup_uc;
4843
4844         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
4845
4846         return 0;
4847
4848 cleanup_uc:
4849         intel_uc_fini_hw(dev_priv);
4850 out:
4851         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
4852
4853         return ret;
4854 }
4855
4856 static int __intel_engines_record_defaults(struct drm_i915_private *i915)
4857 {
4858         struct i915_gem_context *ctx;
4859         struct intel_engine_cs *engine;
4860         enum intel_engine_id id;
4861         int err;
4862
4863         /*
4864          * As we reset the gpu during very early sanitisation, the current
4865          * register state on the GPU should reflect its defaults values.
4866          * We load a context onto the hw (with restore-inhibit), then switch
4867          * over to a second context to save that default register state. We
4868          * can then prime every new context with that state so they all start
4869          * from the same default HW values.
4870          */
4871
4872         ctx = i915_gem_context_create_kernel(i915, 0);
4873         if (IS_ERR(ctx))
4874                 return PTR_ERR(ctx);
4875
4876         for_each_engine(engine, i915, id) {
4877                 struct i915_request *rq;
4878
4879                 rq = i915_request_alloc(engine, ctx);
4880                 if (IS_ERR(rq)) {
4881                         err = PTR_ERR(rq);
4882                         goto out_ctx;
4883                 }
4884
4885                 err = 0;
4886                 if (engine->init_context)
4887                         err = engine->init_context(rq);
4888
4889                 i915_request_add(rq);
4890                 if (err)
4891                         goto err_active;
4892         }
4893
4894         err = i915_gem_switch_to_kernel_context(i915);
4895         if (err)
4896                 goto err_active;
4897
4898         if (i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED, HZ / 5)) {
4899                 i915_gem_set_wedged(i915);
4900                 err = -EIO; /* Caller will declare us wedged */
4901                 goto err_active;
4902         }
4903
4904         assert_kernel_context_is_current(i915);
4905
4906         /*
4907          * Immediately park the GPU so that we enable powersaving and
4908          * treat it as idle. The next time we issue a request, we will
4909          * unpark and start using the engine->pinned_default_state, otherwise
4910          * it is in limbo and an early reset may fail.
4911          */
4912         __i915_gem_park(i915);
4913
4914         for_each_engine(engine, i915, id) {
4915                 struct i915_vma *state;
4916                 void *vaddr;
4917
4918                 GEM_BUG_ON(to_intel_context(ctx, engine)->pin_count);
4919
4920                 state = to_intel_context(ctx, engine)->state;
4921                 if (!state)
4922                         continue;
4923
4924                 /*
4925                  * As we will hold a reference to the logical state, it will
4926                  * not be torn down with the context, and importantly the
4927                  * object will hold onto its vma (making it possible for a
4928                  * stray GTT write to corrupt our defaults). Unmap the vma
4929                  * from the GTT to prevent such accidents and reclaim the
4930                  * space.
4931                  */
4932                 err = i915_vma_unbind(state);
4933                 if (err)
4934                         goto err_active;
4935
4936                 err = i915_gem_object_set_to_cpu_domain(state->obj, false);
4937                 if (err)
4938                         goto err_active;
4939
4940                 engine->default_state = i915_gem_object_get(state->obj);
4941
4942                 /* Check we can acquire the image of the context state */
4943                 vaddr = i915_gem_object_pin_map(engine->default_state,
4944                                                 I915_MAP_FORCE_WB);
4945                 if (IS_ERR(vaddr)) {
4946                         err = PTR_ERR(vaddr);
4947                         goto err_active;
4948                 }
4949
4950                 i915_gem_object_unpin_map(engine->default_state);
4951         }
4952
4953         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
4954                 unsigned int found = intel_engines_has_context_isolation(i915);
4955
4956                 /*
4957                  * Make sure that classes with multiple engine instances all
4958                  * share the same basic configuration.
4959                  */
4960                 for_each_engine(engine, i915, id) {
4961                         unsigned int bit = BIT(engine->uabi_class);
4962                         unsigned int expected = engine->default_state ? bit : 0;
4963
4964                         if ((found & bit) != expected) {
4965                                 DRM_ERROR("mismatching default context state for class %d on engine %s\n",
4966                                           engine->uabi_class, engine->name);
4967                         }
4968                 }
4969         }
4970
4971 out_ctx:
4972         i915_gem_context_set_closed(ctx);
4973         i915_gem_context_put(ctx);
4974         return err;
4975
4976 err_active:
4977         /*
4978          * If we have to abandon now, we expect the engines to be idle
4979          * and ready to be torn-down. First try to flush any remaining
4980          * request, ensure we are pointing at the kernel context and
4981          * then remove it.
4982          */
4983         if (WARN_ON(i915_gem_switch_to_kernel_context(i915)))
4984                 goto out_ctx;
4985
4986         if (WARN_ON(i915_gem_wait_for_idle(i915,
4987                                            I915_WAIT_LOCKED,
4988                                            MAX_SCHEDULE_TIMEOUT)))
4989                 goto out_ctx;
4990
4991         i915_gem_contexts_lost(i915);
4992         goto out_ctx;
4993 }
4994
4995 static int
4996 i915_gem_init_scratch(struct drm_i915_private *i915, unsigned int size)
4997 {
4998         struct drm_i915_gem_object *obj;
4999         struct i915_vma *vma;
5000         int ret;
5001
5002         obj = i915_gem_object_create_stolen(i915, size);
5003         if (!obj)
5004                 obj = i915_gem_object_create_internal(i915, size);
5005         if (IS_ERR(obj)) {
5006                 DRM_ERROR("Failed to allocate scratch page\n");
5007                 return PTR_ERR(obj);
5008         }
5009
5010         vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
5011         if (IS_ERR(vma)) {
5012                 ret = PTR_ERR(vma);
5013                 goto err_unref;
5014         }
5015
5016         ret = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
5017         if (ret)
5018                 goto err_unref;
5019
5020         i915->gt.scratch = vma;
5021         return 0;
5022
5023 err_unref:
5024         i915_gem_object_put(obj);
5025         return ret;
5026 }
5027
5028 static void i915_gem_fini_scratch(struct drm_i915_private *i915)
5029 {
5030         i915_vma_unpin_and_release(&i915->gt.scratch, 0);
5031 }
5032
5033 int i915_gem_init(struct drm_i915_private *dev_priv)
5034 {
5035         int ret;
5036
5037         /* We need to fallback to 4K pages if host doesn't support huge gtt. */
5038         if (intel_vgpu_active(dev_priv) && !intel_vgpu_has_huge_gtt(dev_priv))
5039                 mkwrite_device_info(dev_priv)->page_sizes =
5040                         I915_GTT_PAGE_SIZE_4K;
5041
5042         dev_priv->mm.unordered_timeline = dma_fence_context_alloc(1);
5043
5044         if (HAS_LOGICAL_RING_CONTEXTS(dev_priv)) {
5045                 dev_priv->gt.resume = intel_lr_context_resume;
5046                 dev_priv->gt.cleanup_engine = intel_logical_ring_cleanup;
5047         } else {
5048                 dev_priv->gt.resume = intel_legacy_submission_resume;
5049                 dev_priv->gt.cleanup_engine = intel_engine_cleanup;
5050         }
5051
5052         i915_timelines_init(dev_priv);
5053
5054         ret = i915_gem_init_userptr(dev_priv);
5055         if (ret)
5056                 return ret;
5057
5058         ret = intel_uc_init_misc(dev_priv);
5059         if (ret)
5060                 return ret;
5061
5062         ret = intel_wopcm_init(&dev_priv->wopcm);
5063         if (ret)
5064                 goto err_uc_misc;
5065
5066         /* This is just a security blanket to placate dragons.
5067          * On some systems, we very sporadically observe that the first TLBs
5068          * used by the CS may be stale, despite us poking the TLB reset. If
5069          * we hold the forcewake during initialisation these problems
5070          * just magically go away.
5071          */
5072         mutex_lock(&dev_priv->drm.struct_mutex);
5073         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
5074
5075         ret = i915_gem_init_ggtt(dev_priv);
5076         if (ret) {
5077                 GEM_BUG_ON(ret == -EIO);
5078                 goto err_unlock;
5079         }
5080
5081         ret = i915_gem_init_scratch(dev_priv,
5082                                     IS_GEN(dev_priv, 2) ? SZ_256K : PAGE_SIZE);
5083         if (ret) {
5084                 GEM_BUG_ON(ret == -EIO);
5085                 goto err_ggtt;
5086         }
5087
5088         ret = i915_gem_contexts_init(dev_priv);
5089         if (ret) {
5090                 GEM_BUG_ON(ret == -EIO);
5091                 goto err_scratch;
5092         }
5093
5094         ret = intel_engines_init(dev_priv);
5095         if (ret) {
5096                 GEM_BUG_ON(ret == -EIO);
5097                 goto err_context;
5098         }
5099
5100         intel_init_gt_powersave(dev_priv);
5101
5102         ret = intel_uc_init(dev_priv);
5103         if (ret)
5104                 goto err_pm;
5105
5106         ret = i915_gem_init_hw(dev_priv);
5107         if (ret)
5108                 goto err_uc_init;
5109
5110         /*
5111          * Despite its name intel_init_clock_gating applies both display
5112          * clock gating workarounds; GT mmio workarounds and the occasional
5113          * GT power context workaround. Worse, sometimes it includes a context
5114          * register workaround which we need to apply before we record the
5115          * default HW state for all contexts.
5116          *
5117          * FIXME: break up the workarounds and apply them at the right time!
5118          */
5119         intel_init_clock_gating(dev_priv);
5120
5121         ret = __intel_engines_record_defaults(dev_priv);
5122         if (ret)
5123                 goto err_init_hw;
5124
5125         if (i915_inject_load_failure()) {
5126                 ret = -ENODEV;
5127                 goto err_init_hw;
5128         }
5129
5130         if (i915_inject_load_failure()) {
5131                 ret = -EIO;
5132                 goto err_init_hw;
5133         }
5134
5135         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5136         mutex_unlock(&dev_priv->drm.struct_mutex);
5137
5138         return 0;
5139
5140         /*
5141          * Unwinding is complicated by that we want to handle -EIO to mean
5142          * disable GPU submission but keep KMS alive. We want to mark the
5143          * HW as irrevisibly wedged, but keep enough state around that the
5144          * driver doesn't explode during runtime.
5145          */
5146 err_init_hw:
5147         mutex_unlock(&dev_priv->drm.struct_mutex);
5148
5149         WARN_ON(i915_gem_suspend(dev_priv));
5150         i915_gem_suspend_late(dev_priv);
5151
5152         i915_gem_drain_workqueue(dev_priv);
5153
5154         mutex_lock(&dev_priv->drm.struct_mutex);
5155         intel_uc_fini_hw(dev_priv);
5156 err_uc_init:
5157         intel_uc_fini(dev_priv);
5158 err_pm:
5159         if (ret != -EIO) {
5160                 intel_cleanup_gt_powersave(dev_priv);
5161                 i915_gem_cleanup_engines(dev_priv);
5162         }
5163 err_context:
5164         if (ret != -EIO)
5165                 i915_gem_contexts_fini(dev_priv);
5166 err_scratch:
5167         i915_gem_fini_scratch(dev_priv);
5168 err_ggtt:
5169 err_unlock:
5170         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5171         mutex_unlock(&dev_priv->drm.struct_mutex);
5172
5173 err_uc_misc:
5174         intel_uc_fini_misc(dev_priv);
5175
5176         if (ret != -EIO) {
5177                 i915_gem_cleanup_userptr(dev_priv);
5178                 i915_timelines_fini(dev_priv);
5179         }
5180
5181         if (ret == -EIO) {
5182                 mutex_lock(&dev_priv->drm.struct_mutex);
5183
5184                 /*
5185                  * Allow engine initialisation to fail by marking the GPU as
5186                  * wedged. But we only want to do this where the GPU is angry,
5187                  * for all other failure, such as an allocation failure, bail.
5188                  */
5189                 if (!i915_terminally_wedged(&dev_priv->gpu_error)) {
5190                         i915_load_error(dev_priv,
5191                                         "Failed to initialize GPU, declaring it wedged!\n");
5192                         i915_gem_set_wedged(dev_priv);
5193                 }
5194
5195                 /* Minimal basic recovery for KMS */
5196                 ret = i915_ggtt_enable_hw(dev_priv);
5197                 i915_gem_restore_gtt_mappings(dev_priv);
5198                 i915_gem_restore_fences(dev_priv);
5199                 intel_init_clock_gating(dev_priv);
5200
5201                 mutex_unlock(&dev_priv->drm.struct_mutex);
5202         }
5203
5204         i915_gem_drain_freed_objects(dev_priv);
5205         return ret;
5206 }
5207
5208 void i915_gem_fini(struct drm_i915_private *dev_priv)
5209 {
5210         i915_gem_suspend_late(dev_priv);
5211         intel_disable_gt_powersave(dev_priv);
5212
5213         /* Flush any outstanding unpin_work. */
5214         i915_gem_drain_workqueue(dev_priv);
5215
5216         mutex_lock(&dev_priv->drm.struct_mutex);
5217         intel_uc_fini_hw(dev_priv);
5218         intel_uc_fini(dev_priv);
5219         i915_gem_cleanup_engines(dev_priv);
5220         i915_gem_contexts_fini(dev_priv);
5221         i915_gem_fini_scratch(dev_priv);
5222         mutex_unlock(&dev_priv->drm.struct_mutex);
5223
5224         intel_wa_list_free(&dev_priv->gt_wa_list);
5225
5226         intel_cleanup_gt_powersave(dev_priv);
5227
5228         intel_uc_fini_misc(dev_priv);
5229         i915_gem_cleanup_userptr(dev_priv);
5230         i915_timelines_fini(dev_priv);
5231
5232         i915_gem_drain_freed_objects(dev_priv);
5233
5234         WARN_ON(!list_empty(&dev_priv->contexts.list));
5235 }
5236
5237 void i915_gem_init_mmio(struct drm_i915_private *i915)
5238 {
5239         i915_gem_sanitize(i915);
5240 }
5241
5242 void
5243 i915_gem_cleanup_engines(struct drm_i915_private *dev_priv)
5244 {
5245         struct intel_engine_cs *engine;
5246         enum intel_engine_id id;
5247
5248         for_each_engine(engine, dev_priv, id)
5249                 dev_priv->gt.cleanup_engine(engine);
5250 }
5251
5252 void
5253 i915_gem_load_init_fences(struct drm_i915_private *dev_priv)
5254 {
5255         int i;
5256
5257         if (INTEL_GEN(dev_priv) >= 7 && !IS_VALLEYVIEW(dev_priv) &&
5258             !IS_CHERRYVIEW(dev_priv))
5259                 dev_priv->num_fence_regs = 32;
5260         else if (INTEL_GEN(dev_priv) >= 4 ||
5261                  IS_I945G(dev_priv) || IS_I945GM(dev_priv) ||
5262                  IS_G33(dev_priv) || IS_PINEVIEW(dev_priv))
5263                 dev_priv->num_fence_regs = 16;
5264         else
5265                 dev_priv->num_fence_regs = 8;
5266
5267         if (intel_vgpu_active(dev_priv))
5268                 dev_priv->num_fence_regs =
5269                                 I915_READ(vgtif_reg(avail_rs.fence_num));
5270
5271         /* Initialize fence registers to zero */
5272         for (i = 0; i < dev_priv->num_fence_regs; i++) {
5273                 struct drm_i915_fence_reg *fence = &dev_priv->fence_regs[i];
5274
5275                 fence->i915 = dev_priv;
5276                 fence->id = i;
5277                 list_add_tail(&fence->link, &dev_priv->mm.fence_list);
5278         }
5279         i915_gem_restore_fences(dev_priv);
5280
5281         i915_gem_detect_bit_6_swizzle(dev_priv);
5282 }
5283
5284 static void i915_gem_init__mm(struct drm_i915_private *i915)
5285 {
5286         spin_lock_init(&i915->mm.object_stat_lock);
5287         spin_lock_init(&i915->mm.obj_lock);
5288         spin_lock_init(&i915->mm.free_lock);
5289
5290         init_llist_head(&i915->mm.free_list);
5291
5292         INIT_LIST_HEAD(&i915->mm.unbound_list);
5293         INIT_LIST_HEAD(&i915->mm.bound_list);
5294         INIT_LIST_HEAD(&i915->mm.fence_list);
5295         INIT_LIST_HEAD(&i915->mm.userfault_list);
5296
5297         INIT_WORK(&i915->mm.free_work, __i915_gem_free_work);
5298 }
5299
5300 int i915_gem_init_early(struct drm_i915_private *dev_priv)
5301 {
5302         int err = -ENOMEM;
5303
5304         dev_priv->objects = KMEM_CACHE(drm_i915_gem_object, SLAB_HWCACHE_ALIGN);
5305         if (!dev_priv->objects)
5306                 goto err_out;
5307
5308         dev_priv->vmas = KMEM_CACHE(i915_vma, SLAB_HWCACHE_ALIGN);
5309         if (!dev_priv->vmas)
5310                 goto err_objects;
5311
5312         dev_priv->luts = KMEM_CACHE(i915_lut_handle, 0);
5313         if (!dev_priv->luts)
5314                 goto err_vmas;
5315
5316         dev_priv->requests = KMEM_CACHE(i915_request,
5317                                         SLAB_HWCACHE_ALIGN |
5318                                         SLAB_RECLAIM_ACCOUNT |
5319                                         SLAB_TYPESAFE_BY_RCU);
5320         if (!dev_priv->requests)
5321                 goto err_luts;
5322
5323         dev_priv->dependencies = KMEM_CACHE(i915_dependency,
5324                                             SLAB_HWCACHE_ALIGN |
5325                                             SLAB_RECLAIM_ACCOUNT);
5326         if (!dev_priv->dependencies)
5327                 goto err_requests;
5328
5329         dev_priv->priorities = KMEM_CACHE(i915_priolist, SLAB_HWCACHE_ALIGN);
5330         if (!dev_priv->priorities)
5331                 goto err_dependencies;
5332
5333         INIT_LIST_HEAD(&dev_priv->gt.active_rings);
5334         INIT_LIST_HEAD(&dev_priv->gt.closed_vma);
5335
5336         i915_gem_init__mm(dev_priv);
5337
5338         INIT_DELAYED_WORK(&dev_priv->gt.retire_work,
5339                           i915_gem_retire_work_handler);
5340         INIT_DELAYED_WORK(&dev_priv->gt.idle_work,
5341                           i915_gem_idle_work_handler);
5342         init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
5343         init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
5344         mutex_init(&dev_priv->gpu_error.wedge_mutex);
5345
5346         atomic_set(&dev_priv->mm.bsd_engine_dispatch_index, 0);
5347
5348         spin_lock_init(&dev_priv->fb_tracking.lock);
5349
5350         err = i915_gemfs_init(dev_priv);
5351         if (err)
5352                 DRM_NOTE("Unable to create a private tmpfs mount, hugepage support will be disabled(%d).\n", err);
5353
5354         return 0;
5355
5356 err_dependencies:
5357         kmem_cache_destroy(dev_priv->dependencies);
5358 err_requests:
5359         kmem_cache_destroy(dev_priv->requests);
5360 err_luts:
5361         kmem_cache_destroy(dev_priv->luts);
5362 err_vmas:
5363         kmem_cache_destroy(dev_priv->vmas);
5364 err_objects:
5365         kmem_cache_destroy(dev_priv->objects);
5366 err_out:
5367         return err;
5368 }
5369
5370 void i915_gem_cleanup_early(struct drm_i915_private *dev_priv)
5371 {
5372         i915_gem_drain_freed_objects(dev_priv);
5373         GEM_BUG_ON(!llist_empty(&dev_priv->mm.free_list));
5374         GEM_BUG_ON(atomic_read(&dev_priv->mm.free_count));
5375         WARN_ON(dev_priv->mm.object_count);
5376
5377         kmem_cache_destroy(dev_priv->priorities);
5378         kmem_cache_destroy(dev_priv->dependencies);
5379         kmem_cache_destroy(dev_priv->requests);
5380         kmem_cache_destroy(dev_priv->luts);
5381         kmem_cache_destroy(dev_priv->vmas);
5382         kmem_cache_destroy(dev_priv->objects);
5383
5384         /* And ensure that our DESTROY_BY_RCU slabs are truly destroyed */
5385         rcu_barrier();
5386
5387         i915_gemfs_fini(dev_priv);
5388 }
5389
5390 int i915_gem_freeze(struct drm_i915_private *dev_priv)
5391 {
5392         /* Discard all purgeable objects, let userspace recover those as
5393          * required after resuming.
5394          */
5395         i915_gem_shrink_all(dev_priv);
5396
5397         return 0;
5398 }
5399
5400 int i915_gem_freeze_late(struct drm_i915_private *i915)
5401 {
5402         struct drm_i915_gem_object *obj;
5403         struct list_head *phases[] = {
5404                 &i915->mm.unbound_list,
5405                 &i915->mm.bound_list,
5406                 NULL
5407         }, **phase;
5408
5409         /*
5410          * Called just before we write the hibernation image.
5411          *
5412          * We need to update the domain tracking to reflect that the CPU
5413          * will be accessing all the pages to create and restore from the
5414          * hibernation, and so upon restoration those pages will be in the
5415          * CPU domain.
5416          *
5417          * To make sure the hibernation image contains the latest state,
5418          * we update that state just before writing out the image.
5419          *
5420          * To try and reduce the hibernation image, we manually shrink
5421          * the objects as well, see i915_gem_freeze()
5422          */
5423
5424         i915_gem_shrink(i915, -1UL, NULL, I915_SHRINK_UNBOUND);
5425         i915_gem_drain_freed_objects(i915);
5426
5427         mutex_lock(&i915->drm.struct_mutex);
5428         for (phase = phases; *phase; phase++) {
5429                 list_for_each_entry(obj, *phase, mm.link)
5430                         WARN_ON(i915_gem_object_set_to_cpu_domain(obj, true));
5431         }
5432         mutex_unlock(&i915->drm.struct_mutex);
5433
5434         return 0;
5435 }
5436
5437 void i915_gem_release(struct drm_device *dev, struct drm_file *file)
5438 {
5439         struct drm_i915_file_private *file_priv = file->driver_priv;
5440         struct i915_request *request;
5441
5442         /* Clean up our request list when the client is going away, so that
5443          * later retire_requests won't dereference our soon-to-be-gone
5444          * file_priv.
5445          */
5446         spin_lock(&file_priv->mm.lock);
5447         list_for_each_entry(request, &file_priv->mm.request_list, client_link)
5448                 request->file_priv = NULL;
5449         spin_unlock(&file_priv->mm.lock);
5450 }
5451
5452 int i915_gem_open(struct drm_i915_private *i915, struct drm_file *file)
5453 {
5454         struct drm_i915_file_private *file_priv;
5455         int ret;
5456
5457         DRM_DEBUG("\n");
5458
5459         file_priv = kzalloc(sizeof(*file_priv), GFP_KERNEL);
5460         if (!file_priv)
5461                 return -ENOMEM;
5462
5463         file->driver_priv = file_priv;
5464         file_priv->dev_priv = i915;
5465         file_priv->file = file;
5466
5467         spin_lock_init(&file_priv->mm.lock);
5468         INIT_LIST_HEAD(&file_priv->mm.request_list);
5469
5470         file_priv->bsd_engine = -1;
5471         file_priv->hang_timestamp = jiffies;
5472
5473         ret = i915_gem_context_open(i915, file);
5474         if (ret)
5475                 kfree(file_priv);
5476
5477         return ret;
5478 }
5479
5480 /**
5481  * i915_gem_track_fb - update frontbuffer tracking
5482  * @old: current GEM buffer for the frontbuffer slots
5483  * @new: new GEM buffer for the frontbuffer slots
5484  * @frontbuffer_bits: bitmask of frontbuffer slots
5485  *
5486  * This updates the frontbuffer tracking bits @frontbuffer_bits by clearing them
5487  * from @old and setting them in @new. Both @old and @new can be NULL.
5488  */
5489 void i915_gem_track_fb(struct drm_i915_gem_object *old,
5490                        struct drm_i915_gem_object *new,
5491                        unsigned frontbuffer_bits)
5492 {
5493         /* Control of individual bits within the mask are guarded by
5494          * the owning plane->mutex, i.e. we can never see concurrent
5495          * manipulation of individual bits. But since the bitfield as a whole
5496          * is updated using RMW, we need to use atomics in order to update
5497          * the bits.
5498          */
5499         BUILD_BUG_ON(INTEL_FRONTBUFFER_BITS_PER_PIPE * I915_MAX_PIPES >
5500                      BITS_PER_TYPE(atomic_t));
5501
5502         if (old) {
5503                 WARN_ON(!(atomic_read(&old->frontbuffer_bits) & frontbuffer_bits));
5504                 atomic_andnot(frontbuffer_bits, &old->frontbuffer_bits);
5505         }
5506
5507         if (new) {
5508                 WARN_ON(atomic_read(&new->frontbuffer_bits) & frontbuffer_bits);
5509                 atomic_or(frontbuffer_bits, &new->frontbuffer_bits);
5510         }
5511 }
5512
5513 /* Allocate a new GEM object and fill it with the supplied data */
5514 struct drm_i915_gem_object *
5515 i915_gem_object_create_from_data(struct drm_i915_private *dev_priv,
5516                                  const void *data, size_t size)
5517 {
5518         struct drm_i915_gem_object *obj;
5519         struct file *file;
5520         size_t offset;
5521         int err;
5522
5523         obj = i915_gem_object_create(dev_priv, round_up(size, PAGE_SIZE));
5524         if (IS_ERR(obj))
5525                 return obj;
5526
5527         GEM_BUG_ON(obj->write_domain != I915_GEM_DOMAIN_CPU);
5528
5529         file = obj->base.filp;
5530         offset = 0;
5531         do {
5532                 unsigned int len = min_t(typeof(size), size, PAGE_SIZE);
5533                 struct page *page;
5534                 void *pgdata, *vaddr;
5535
5536                 err = pagecache_write_begin(file, file->f_mapping,
5537                                             offset, len, 0,
5538                                             &page, &pgdata);
5539                 if (err < 0)
5540                         goto fail;
5541
5542                 vaddr = kmap(page);
5543                 memcpy(vaddr, data, len);
5544                 kunmap(page);
5545
5546                 err = pagecache_write_end(file, file->f_mapping,
5547                                           offset, len, len,
5548                                           page, pgdata);
5549                 if (err < 0)
5550                         goto fail;
5551
5552                 size -= len;
5553                 data += len;
5554                 offset += len;
5555         } while (size);
5556
5557         return obj;
5558
5559 fail:
5560         i915_gem_object_put(obj);
5561         return ERR_PTR(err);
5562 }
5563
5564 struct scatterlist *
5565 i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
5566                        unsigned int n,
5567                        unsigned int *offset)
5568 {
5569         struct i915_gem_object_page_iter *iter = &obj->mm.get_page;
5570         struct scatterlist *sg;
5571         unsigned int idx, count;
5572
5573         might_sleep();
5574         GEM_BUG_ON(n >= obj->base.size >> PAGE_SHIFT);
5575         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
5576
5577         /* As we iterate forward through the sg, we record each entry in a
5578          * radixtree for quick repeated (backwards) lookups. If we have seen
5579          * this index previously, we will have an entry for it.
5580          *
5581          * Initial lookup is O(N), but this is amortized to O(1) for
5582          * sequential page access (where each new request is consecutive
5583          * to the previous one). Repeated lookups are O(lg(obj->base.size)),
5584          * i.e. O(1) with a large constant!
5585          */
5586         if (n < READ_ONCE(iter->sg_idx))
5587                 goto lookup;
5588
5589         mutex_lock(&iter->lock);
5590
5591         /* We prefer to reuse the last sg so that repeated lookup of this
5592          * (or the subsequent) sg are fast - comparing against the last
5593          * sg is faster than going through the radixtree.
5594          */
5595
5596         sg = iter->sg_pos;
5597         idx = iter->sg_idx;
5598         count = __sg_page_count(sg);
5599
5600         while (idx + count <= n) {
5601                 void *entry;
5602                 unsigned long i;
5603                 int ret;
5604
5605                 /* If we cannot allocate and insert this entry, or the
5606                  * individual pages from this range, cancel updating the
5607                  * sg_idx so that on this lookup we are forced to linearly
5608                  * scan onwards, but on future lookups we will try the
5609                  * insertion again (in which case we need to be careful of
5610                  * the error return reporting that we have already inserted
5611                  * this index).
5612                  */
5613                 ret = radix_tree_insert(&iter->radix, idx, sg);
5614                 if (ret && ret != -EEXIST)
5615                         goto scan;
5616
5617                 entry = xa_mk_value(idx);
5618                 for (i = 1; i < count; i++) {
5619                         ret = radix_tree_insert(&iter->radix, idx + i, entry);
5620                         if (ret && ret != -EEXIST)
5621                                 goto scan;
5622                 }
5623
5624                 idx += count;
5625                 sg = ____sg_next(sg);
5626                 count = __sg_page_count(sg);
5627         }
5628
5629 scan:
5630         iter->sg_pos = sg;
5631         iter->sg_idx = idx;
5632
5633         mutex_unlock(&iter->lock);
5634
5635         if (unlikely(n < idx)) /* insertion completed by another thread */
5636                 goto lookup;
5637
5638         /* In case we failed to insert the entry into the radixtree, we need
5639          * to look beyond the current sg.
5640          */
5641         while (idx + count <= n) {
5642                 idx += count;
5643                 sg = ____sg_next(sg);
5644                 count = __sg_page_count(sg);
5645         }
5646
5647         *offset = n - idx;
5648         return sg;
5649
5650 lookup:
5651         rcu_read_lock();
5652
5653         sg = radix_tree_lookup(&iter->radix, n);
5654         GEM_BUG_ON(!sg);
5655
5656         /* If this index is in the middle of multi-page sg entry,
5657          * the radix tree will contain a value entry that points
5658          * to the start of that range. We will return the pointer to
5659          * the base page and the offset of this page within the
5660          * sg entry's range.
5661          */
5662         *offset = 0;
5663         if (unlikely(xa_is_value(sg))) {
5664                 unsigned long base = xa_to_value(sg);
5665
5666                 sg = radix_tree_lookup(&iter->radix, base);
5667                 GEM_BUG_ON(!sg);
5668
5669                 *offset = n - base;
5670         }
5671
5672         rcu_read_unlock();
5673
5674         return sg;
5675 }
5676
5677 struct page *
5678 i915_gem_object_get_page(struct drm_i915_gem_object *obj, unsigned int n)
5679 {
5680         struct scatterlist *sg;
5681         unsigned int offset;
5682
5683         GEM_BUG_ON(!i915_gem_object_has_struct_page(obj));
5684
5685         sg = i915_gem_object_get_sg(obj, n, &offset);
5686         return nth_page(sg_page(sg), offset);
5687 }
5688
5689 /* Like i915_gem_object_get_page(), but mark the returned page dirty */
5690 struct page *
5691 i915_gem_object_get_dirty_page(struct drm_i915_gem_object *obj,
5692                                unsigned int n)
5693 {
5694         struct page *page;
5695
5696         page = i915_gem_object_get_page(obj, n);
5697         if (!obj->mm.dirty)
5698                 set_page_dirty(page);
5699
5700         return page;
5701 }
5702
5703 dma_addr_t
5704 i915_gem_object_get_dma_address(struct drm_i915_gem_object *obj,
5705                                 unsigned long n)
5706 {
5707         struct scatterlist *sg;
5708         unsigned int offset;
5709
5710         sg = i915_gem_object_get_sg(obj, n, &offset);
5711         return sg_dma_address(sg) + (offset << PAGE_SHIFT);
5712 }
5713
5714 int i915_gem_object_attach_phys(struct drm_i915_gem_object *obj, int align)
5715 {
5716         struct sg_table *pages;
5717         int err;
5718
5719         if (align > obj->base.size)
5720                 return -EINVAL;
5721
5722         if (obj->ops == &i915_gem_phys_ops)
5723                 return 0;
5724
5725         if (obj->ops != &i915_gem_object_ops)
5726                 return -EINVAL;
5727
5728         err = i915_gem_object_unbind(obj);
5729         if (err)
5730                 return err;
5731
5732         mutex_lock(&obj->mm.lock);
5733
5734         if (obj->mm.madv != I915_MADV_WILLNEED) {
5735                 err = -EFAULT;
5736                 goto err_unlock;
5737         }
5738
5739         if (obj->mm.quirked) {
5740                 err = -EFAULT;
5741                 goto err_unlock;
5742         }
5743
5744         if (obj->mm.mapping) {
5745                 err = -EBUSY;
5746                 goto err_unlock;
5747         }
5748
5749         pages = __i915_gem_object_unset_pages(obj);
5750
5751         obj->ops = &i915_gem_phys_ops;
5752
5753         err = ____i915_gem_object_get_pages(obj);
5754         if (err)
5755                 goto err_xfer;
5756
5757         /* Perma-pin (until release) the physical set of pages */
5758         __i915_gem_object_pin_pages(obj);
5759
5760         if (!IS_ERR_OR_NULL(pages))
5761                 i915_gem_object_ops.put_pages(obj, pages);
5762         mutex_unlock(&obj->mm.lock);
5763         return 0;
5764
5765 err_xfer:
5766         obj->ops = &i915_gem_object_ops;
5767         if (!IS_ERR_OR_NULL(pages)) {
5768                 unsigned int sg_page_sizes = i915_sg_page_sizes(pages->sgl);
5769
5770                 __i915_gem_object_set_pages(obj, pages, sg_page_sizes);
5771         }
5772 err_unlock:
5773         mutex_unlock(&obj->mm.lock);
5774         return err;
5775 }
5776
5777 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
5778 #include "selftests/scatterlist.c"
5779 #include "selftests/mock_gem_device.c"
5780 #include "selftests/huge_gem_object.c"
5781 #include "selftests/huge_pages.c"
5782 #include "selftests/i915_gem_object.c"
5783 #include "selftests/i915_gem_coherency.c"
5784 #include "selftests/i915_gem.c"
5785 #endif