linux.git: drivers/gpu/drm/i915/i915_gem.c
1 /*
2  * Copyright © 2008-2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Eric Anholt <eric@anholt.net>
25  *
26  */
27
28 #include <drm/drm_vma_manager.h>
29 #include <drm/i915_drm.h>
30 #include <linux/dma-fence-array.h>
31 #include <linux/kthread.h>
32 #include <linux/reservation.h>
33 #include <linux/shmem_fs.h>
34 #include <linux/slab.h>
35 #include <linux/stop_machine.h>
36 #include <linux/swap.h>
37 #include <linux/pci.h>
38 #include <linux/dma-buf.h>
39
40 #include "i915_drv.h"
41 #include "i915_gem_clflush.h"
42 #include "i915_gemfs.h"
43 #include "i915_reset.h"
44 #include "i915_trace.h"
45 #include "i915_vgpu.h"
46
47 #include "intel_drv.h"
48 #include "intel_frontbuffer.h"
49 #include "intel_mocs.h"
50 #include "intel_workarounds.h"
51
52 static void i915_gem_flush_free_objects(struct drm_i915_private *i915);
53
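/*
 * Do CPU writes to this object need to be flushed from the CPU cache?
 * Not if a flush is already pending (cache_dirty); yes if the object is
 * not coherent for CPU writes, and otherwise only while it is pinned
 * for global use by the HW and so must be kept flushed.
 */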
54 static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
55 {
56         if (obj->cache_dirty)
57                 return false;
58
59         if (!(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE))
60                 return true;
61
62         return obj->pin_global; /* currently in use by HW, keep flushed */
63 }
64
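/*
 * Reserve a node in the CPU-mappable portion of the GGTT. The pread/pwrite
 * slow paths below use this to map individual object pages through the
 * aperture when the whole object cannot be pinned there.
 */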
65 static int
66 insert_mappable_node(struct i915_ggtt *ggtt,
67                      struct drm_mm_node *node, u32 size)
68 {
69         memset(node, 0, sizeof(*node));
70         return drm_mm_insert_node_in_range(&ggtt->vm.mm, node,
71                                            size, 0, I915_COLOR_UNEVICTABLE,
72                                            0, ggtt->mappable_end,
73                                            DRM_MM_INSERT_LOW);
74 }
75
76 static void
77 remove_mappable_node(struct drm_mm_node *node)
78 {
79         drm_mm_remove_node(node);
80 }
81
82 /* some bookkeeping */
83 static void i915_gem_info_add_obj(struct drm_i915_private *dev_priv,
84                                   u64 size)
85 {
86         spin_lock(&dev_priv->mm.object_stat_lock);
87         dev_priv->mm.object_count++;
88         dev_priv->mm.object_memory += size;
89         spin_unlock(&dev_priv->mm.object_stat_lock);
90 }
91
92 static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
93                                      u64 size)
94 {
95         spin_lock(&dev_priv->mm.object_stat_lock);
96         dev_priv->mm.object_count--;
97         dev_priv->mm.object_memory -= size;
98         spin_unlock(&dev_priv->mm.object_stat_lock);
99 }
100
101 static int
102 i915_gem_wait_for_error(struct i915_gpu_error *error)
103 {
104         int ret;
105
106         might_sleep();
107
108         /*
109          * Only wait 10 seconds for the gpu reset to complete to avoid hanging
110          * userspace. If it takes that long something really bad is going on and
111          * we should simply try to bail out and fail as gracefully as possible.
112          */
113         ret = wait_event_interruptible_timeout(error->reset_queue,
114                                                !i915_reset_backoff(error),
115                                                I915_RESET_TIMEOUT);
116         if (ret == 0) {
117                 DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
118                 return -EIO;
119         } else if (ret < 0) {
120                 return ret;
121         } else {
122                 return 0;
123         }
124 }
125
126 int i915_mutex_lock_interruptible(struct drm_device *dev)
127 {
128         struct drm_i915_private *dev_priv = to_i915(dev);
129         int ret;
130
131         ret = i915_gem_wait_for_error(&dev_priv->gpu_error);
132         if (ret)
133                 return ret;
134
135         ret = mutex_lock_interruptible(&dev->struct_mutex);
136         if (ret)
137                 return ret;
138
139         return 0;
140 }
141
142 static u32 __i915_gem_park(struct drm_i915_private *i915)
143 {
144         intel_wakeref_t wakeref;
145
146         GEM_TRACE("\n");
147
148         lockdep_assert_held(&i915->drm.struct_mutex);
149         GEM_BUG_ON(i915->gt.active_requests);
150         GEM_BUG_ON(!list_empty(&i915->gt.active_rings));
151
152         if (!i915->gt.awake)
153                 return I915_EPOCH_INVALID;
154
155         GEM_BUG_ON(i915->gt.epoch == I915_EPOCH_INVALID);
156
157         /*
158          * Be paranoid and flush a concurrent interrupt to make sure
159          * we don't reactivate any irq tasklets after parking.
160          *
161          * FIXME: Note that even though we have waited for execlists to be idle,
162          * there may still be an in-flight interrupt even though the CSB
163          * is now empty. synchronize_irq() makes sure that a residual interrupt
164          * is completed before we continue, but it doesn't prevent the HW from
165          * raising a spurious interrupt later. To complete the shield we should
166          * coordinate disabling the CS irq with flushing the interrupts.
167          */
168         synchronize_irq(i915->drm.irq);
169
170         intel_engines_park(i915);
171         i915_timelines_park(i915);
172
173         i915_pmu_gt_parked(i915);
174         i915_vma_parked(i915);
175
176         wakeref = fetch_and_zero(&i915->gt.awake);
177         GEM_BUG_ON(!wakeref);
178
179         if (INTEL_GEN(i915) >= 6)
180                 gen6_rps_idle(i915);
181
182         intel_display_power_put(i915, POWER_DOMAIN_GT_IRQ, wakeref);
183
184         return i915->gt.epoch;
185 }
186
187 void i915_gem_park(struct drm_i915_private *i915)
188 {
189         GEM_TRACE("\n");
190
191         lockdep_assert_held(&i915->drm.struct_mutex);
192         GEM_BUG_ON(i915->gt.active_requests);
193
194         if (!i915->gt.awake)
195                 return;
196
197         /* Defer the actual call to __i915_gem_park() to prevent ping-pongs */
198         mod_delayed_work(i915->wq, &i915->gt.idle_work, msecs_to_jiffies(100));
199 }
200
201 void i915_gem_unpark(struct drm_i915_private *i915)
202 {
203         GEM_TRACE("\n");
204
205         lockdep_assert_held(&i915->drm.struct_mutex);
206         GEM_BUG_ON(!i915->gt.active_requests);
207         assert_rpm_wakelock_held(i915);
208
209         if (i915->gt.awake)
210                 return;
211
212         /*
213          * It seems that the DMC likes to transition between the DC states a lot
214          * when there are no connected displays (no active power domains) during
215          * command submission.
216          *
217          * This activity has a negative impact on the performance of the chip, with
218          * huge latencies observed in the interrupt handler and elsewhere.
219          *
220          * Work around it by grabbing a GT IRQ power domain whilst there is any
221          * GT activity, preventing any DC state transitions.
222          */
223         i915->gt.awake = intel_display_power_get(i915, POWER_DOMAIN_GT_IRQ);
224         GEM_BUG_ON(!i915->gt.awake);
225
226         if (unlikely(++i915->gt.epoch == 0)) /* keep 0 as invalid */
227                 i915->gt.epoch = 1;
228
229         intel_enable_gt_powersave(i915);
230         i915_update_gfx_val(i915);
231         if (INTEL_GEN(i915) >= 6)
232                 gen6_rps_busy(i915);
233         i915_pmu_gt_unparked(i915);
234
235         intel_engines_unpark(i915);
236
237         i915_queue_hangcheck(i915);
238
239         queue_delayed_work(i915->wq,
240                            &i915->gt.retire_work,
241                            round_jiffies_up_relative(HZ));
242 }
243
244 int
245 i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data,
246                             struct drm_file *file)
247 {
248         struct i915_ggtt *ggtt = &to_i915(dev)->ggtt;
249         struct drm_i915_gem_get_aperture *args = data;
250         struct i915_vma *vma;
251         u64 pinned;
252
253         mutex_lock(&ggtt->vm.mutex);
254
255         pinned = ggtt->vm.reserved;
256         list_for_each_entry(vma, &ggtt->vm.bound_list, vm_link)
257                 if (i915_vma_is_pinned(vma))
258                         pinned += vma->node.size;
259
260         mutex_unlock(&ggtt->vm.mutex);
261
262         args->aper_size = ggtt->vm.total;
263         args->aper_available_size = args->aper_size - pinned;
264
265         return 0;
266 }
267
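/*
 * Swap the shmem backing store for a single physically contiguous DMA
 * allocation: every page is copied (and clflushed) into a drm_pci_alloc()
 * block, which is then published as a one-entry sg_table and remembered
 * in obj->phys_handle.
 */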
268 static int i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
269 {
270         struct address_space *mapping = obj->base.filp->f_mapping;
271         drm_dma_handle_t *phys;
272         struct sg_table *st;
273         struct scatterlist *sg;
274         char *vaddr;
275         int i;
276         int err;
277
278         if (WARN_ON(i915_gem_object_needs_bit17_swizzle(obj)))
279                 return -EINVAL;
280
281         /* Always aligning to the object size allows a single allocation
282          * to handle all possible callers, and given typical object sizes,
283          * the alignment of the buddy allocation will naturally match.
284          */
285         phys = drm_pci_alloc(obj->base.dev,
286                              roundup_pow_of_two(obj->base.size),
287                              roundup_pow_of_two(obj->base.size));
288         if (!phys)
289                 return -ENOMEM;
290
291         vaddr = phys->vaddr;
292         for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
293                 struct page *page;
294                 char *src;
295
296                 page = shmem_read_mapping_page(mapping, i);
297                 if (IS_ERR(page)) {
298                         err = PTR_ERR(page);
299                         goto err_phys;
300                 }
301
302                 src = kmap_atomic(page);
303                 memcpy(vaddr, src, PAGE_SIZE);
304                 drm_clflush_virt_range(vaddr, PAGE_SIZE);
305                 kunmap_atomic(src);
306
307                 put_page(page);
308                 vaddr += PAGE_SIZE;
309         }
310
311         i915_gem_chipset_flush(to_i915(obj->base.dev));
312
313         st = kmalloc(sizeof(*st), GFP_KERNEL);
314         if (!st) {
315                 err = -ENOMEM;
316                 goto err_phys;
317         }
318
319         if (sg_alloc_table(st, 1, GFP_KERNEL)) {
320                 kfree(st);
321                 err = -ENOMEM;
322                 goto err_phys;
323         }
324
325         sg = st->sgl;
326         sg->offset = 0;
327         sg->length = obj->base.size;
328
329         sg_dma_address(sg) = phys->busaddr;
330         sg_dma_len(sg) = obj->base.size;
331
332         obj->phys_handle = phys;
333
334         __i915_gem_object_set_pages(obj, st, sg->length);
335
336         return 0;
337
338 err_phys:
339         drm_pci_free(obj->base.dev, phys);
340
341         return err;
342 }
343
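/*
 * Put the object into the CPU domain for both reads and writes, noting
 * that the cache is dirty if those CPU writes will later need a clflush.
 */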
344 static void __start_cpu_write(struct drm_i915_gem_object *obj)
345 {
346         obj->read_domains = I915_GEM_DOMAIN_CPU;
347         obj->write_domain = I915_GEM_DOMAIN_CPU;
348         if (cpu_write_needs_clflush(obj))
349                 obj->cache_dirty = true;
350 }
351
352 static void
353 __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
354                                 struct sg_table *pages,
355                                 bool needs_clflush)
356 {
357         GEM_BUG_ON(obj->mm.madv == __I915_MADV_PURGED);
358
359         if (obj->mm.madv == I915_MADV_DONTNEED)
360                 obj->mm.dirty = false;
361
362         if (needs_clflush &&
363             (obj->read_domains & I915_GEM_DOMAIN_CPU) == 0 &&
364             !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ))
365                 drm_clflush_sg(pages);
366
367         __start_cpu_write(obj);
368 }
369
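/*
 * Tear down the contiguous phys backing store: if the object is dirty,
 * copy the data back into the shmem pages first, then release the
 * sg_table and the DMA allocation.
 */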
370 static void
371 i915_gem_object_put_pages_phys(struct drm_i915_gem_object *obj,
372                                struct sg_table *pages)
373 {
374         __i915_gem_object_release_shmem(obj, pages, false);
375
376         if (obj->mm.dirty) {
377                 struct address_space *mapping = obj->base.filp->f_mapping;
378                 char *vaddr = obj->phys_handle->vaddr;
379                 int i;
380
381                 for (i = 0; i < obj->base.size / PAGE_SIZE; i++) {
382                         struct page *page;
383                         char *dst;
384
385                         page = shmem_read_mapping_page(mapping, i);
386                         if (IS_ERR(page))
387                                 continue;
388
389                         dst = kmap_atomic(page);
390                         drm_clflush_virt_range(vaddr, PAGE_SIZE);
391                         memcpy(dst, vaddr, PAGE_SIZE);
392                         kunmap_atomic(dst);
393
394                         set_page_dirty(page);
395                         if (obj->mm.madv == I915_MADV_WILLNEED)
396                                 mark_page_accessed(page);
397                         put_page(page);
398                         vaddr += PAGE_SIZE;
399                 }
400                 obj->mm.dirty = false;
401         }
402
403         sg_free_table(pages);
404         kfree(pages);
405
406         drm_pci_free(obj->base.dev, obj->phys_handle);
407 }
408
409 static void
410 i915_gem_object_release_phys(struct drm_i915_gem_object *obj)
411 {
412         i915_gem_object_unpin_pages(obj);
413 }
414
415 static const struct drm_i915_gem_object_ops i915_gem_phys_ops = {
416         .get_pages = i915_gem_object_get_pages_phys,
417         .put_pages = i915_gem_object_put_pages_phys,
418         .release = i915_gem_object_release_phys,
419 };
420
421 static const struct drm_i915_gem_object_ops i915_gem_object_ops;
422
423 int i915_gem_object_unbind(struct drm_i915_gem_object *obj)
424 {
425         struct i915_vma *vma;
426         LIST_HEAD(still_in_list);
427         int ret;
428
429         lockdep_assert_held(&obj->base.dev->struct_mutex);
430
431         /* Closed vma are removed from the obj->vma.list - but they may
432          * still have an active binding on the object. To remove those we
433          * must wait for all rendering to complete to the object (as unbinding
434          * must anyway), and retire the requests.
435          */
436         ret = i915_gem_object_set_to_cpu_domain(obj, false);
437         if (ret)
438                 return ret;
439
440         spin_lock(&obj->vma.lock);
441         while (!ret && (vma = list_first_entry_or_null(&obj->vma.list,
442                                                        struct i915_vma,
443                                                        obj_link))) {
444                 list_move_tail(&vma->obj_link, &still_in_list);
445                 spin_unlock(&obj->vma.lock);
446
447                 ret = i915_vma_unbind(vma);
448
449                 spin_lock(&obj->vma.lock);
450         }
451         list_splice(&still_in_list, &obj->vma.list);
452         spin_unlock(&obj->vma.lock);
453
454         return ret;
455 }
456
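/*
 * Wait on a single dma-fence. Signaled fences return immediately, foreign
 * fences go through dma_fence_wait_timeout(), and i915 requests that have
 * not yet started may be waitboosted (see below) before we block on them.
 */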
457 static long
458 i915_gem_object_wait_fence(struct dma_fence *fence,
459                            unsigned int flags,
460                            long timeout,
461                            struct intel_rps_client *rps_client)
462 {
463         struct i915_request *rq;
464
465         BUILD_BUG_ON(I915_WAIT_INTERRUPTIBLE != 0x1);
466
467         if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
468                 return timeout;
469
470         if (!dma_fence_is_i915(fence))
471                 return dma_fence_wait_timeout(fence,
472                                               flags & I915_WAIT_INTERRUPTIBLE,
473                                               timeout);
474
475         rq = to_request(fence);
476         if (i915_request_completed(rq))
477                 goto out;
478
479         /*
480          * This client is about to stall waiting for the GPU. In many cases
481          * this is undesirable and limits the throughput of the system, as
482          * many clients cannot continue processing user input/output whilst
483          * blocked. RPS autotuning may take tens of milliseconds to respond
484          * to the GPU load and thus incurs additional latency for the client.
485          * We can circumvent that by promoting the GPU frequency to maximum
486          * before we wait. This makes the GPU throttle up much more quickly
487          * (good for benchmarks and user experience, e.g. window animations),
488          * but at a cost of spending more power processing the workload
489          * (bad for battery). Not all clients even want their results
490          * immediately and for them we should just let the GPU select its own
491          * frequency to maximise efficiency. To prevent a single client from
492          * forcing the clocks too high for the whole system, we only allow
493          * each client to waitboost once in a busy period.
494          */
495         if (rps_client && !i915_request_started(rq)) {
496                 if (INTEL_GEN(rq->i915) >= 6)
497                         gen6_rps_boost(rq, rps_client);
498         }
499
500         timeout = i915_request_wait(rq, flags, timeout);
501
502 out:
503         if (flags & I915_WAIT_LOCKED && i915_request_completed(rq))
504                 i915_request_retire_upto(rq);
505
506         return timeout;
507 }
508
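/*
 * Wait on the fences tracked by the reservation object: the exclusive
 * fence by default, or every shared fence as well if I915_WAIT_ALL is set.
 * If everything signaled and the reservation was not changed meanwhile,
 * the fence array is pruned to drop the now-idle references.
 */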
509 static long
510 i915_gem_object_wait_reservation(struct reservation_object *resv,
511                                  unsigned int flags,
512                                  long timeout,
513                                  struct intel_rps_client *rps_client)
514 {
515         unsigned int seq = __read_seqcount_begin(&resv->seq);
516         struct dma_fence *excl;
517         bool prune_fences = false;
518
519         if (flags & I915_WAIT_ALL) {
520                 struct dma_fence **shared;
521                 unsigned int count, i;
522                 int ret;
523
524                 ret = reservation_object_get_fences_rcu(resv,
525                                                         &excl, &count, &shared);
526                 if (ret)
527                         return ret;
528
529                 for (i = 0; i < count; i++) {
530                         timeout = i915_gem_object_wait_fence(shared[i],
531                                                              flags, timeout,
532                                                              rps_client);
533                         if (timeout < 0)
534                                 break;
535
536                         dma_fence_put(shared[i]);
537                 }
538
539                 for (; i < count; i++)
540                         dma_fence_put(shared[i]);
541                 kfree(shared);
542
543                 /*
544                  * If both shared fences and an exclusive fence exist,
545                  * then by construction the shared fences must be later
546                  * than the exclusive fence. If we successfully wait for
547                  * all the shared fences, we know that the exclusive fence
548                  * must also be signaled. If all the shared fences are
549                  * signaled, we can prune the array and recover the
550                  * floating references on the fences/requests.
551                  */
552                 prune_fences = count && timeout >= 0;
553         } else {
554                 excl = reservation_object_get_excl_rcu(resv);
555         }
556
557         if (excl && timeout >= 0)
558                 timeout = i915_gem_object_wait_fence(excl, flags, timeout,
559                                                      rps_client);
560
561         dma_fence_put(excl);
562
563         /*
564          * Opportunistically prune the fences iff we know they have *all* been
565          * signaled and that the reservation object has not been changed (i.e.
566          * no new fences have been added).
567          */
568         if (prune_fences && !__read_seqcount_retry(&resv->seq, seq)) {
569                 if (reservation_object_trylock(resv)) {
570                         if (!__read_seqcount_retry(&resv->seq, seq))
571                                 reservation_object_add_excl_fence(resv, NULL);
572                         reservation_object_unlock(resv);
573                 }
574         }
575
576         return timeout;
577 }
578
579 static void __fence_set_priority(struct dma_fence *fence,
580                                  const struct i915_sched_attr *attr)
581 {
582         struct i915_request *rq;
583         struct intel_engine_cs *engine;
584
585         if (dma_fence_is_signaled(fence) || !dma_fence_is_i915(fence))
586                 return;
587
588         rq = to_request(fence);
589         engine = rq->engine;
590
591         local_bh_disable();
592         rcu_read_lock(); /* RCU serialisation for set-wedged protection */
593         if (engine->schedule)
594                 engine->schedule(rq, attr);
595         rcu_read_unlock();
596         local_bh_enable(); /* kick the tasklets if queues were reprioritised */
597 }
598
599 static void fence_set_priority(struct dma_fence *fence,
600                                const struct i915_sched_attr *attr)
601 {
602         /* Recurse once into a fence-array */
603         if (dma_fence_is_array(fence)) {
604                 struct dma_fence_array *array = to_dma_fence_array(fence);
605                 int i;
606
607                 for (i = 0; i < array->num_fences; i++)
608                         __fence_set_priority(array->fences[i], attr);
609         } else {
610                 __fence_set_priority(fence, attr);
611         }
612 }
613
614 int
615 i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
616                               unsigned int flags,
617                               const struct i915_sched_attr *attr)
618 {
619         struct dma_fence *excl;
620
621         if (flags & I915_WAIT_ALL) {
622                 struct dma_fence **shared;
623                 unsigned int count, i;
624                 int ret;
625
626                 ret = reservation_object_get_fences_rcu(obj->resv,
627                                                         &excl, &count, &shared);
628                 if (ret)
629                         return ret;
630
631                 for (i = 0; i < count; i++) {
632                         fence_set_priority(shared[i], attr);
633                         dma_fence_put(shared[i]);
634                 }
635
636                 kfree(shared);
637         } else {
638                 excl = reservation_object_get_excl_rcu(obj->resv);
639         }
640
641         if (excl) {
642                 fence_set_priority(excl, attr);
643                 dma_fence_put(excl);
644         }
645         return 0;
646 }
647
648 /**
649  * Waits for rendering to the object to be completed
650  * @obj: i915 gem object
651  * @flags: how to wait (under a lock, for all rendering or just for writes etc)
652  * @timeout: how long to wait
653  * @rps_client: client (user process) to charge for any waitboosting
654  */
655 int
656 i915_gem_object_wait(struct drm_i915_gem_object *obj,
657                      unsigned int flags,
658                      long timeout,
659                      struct intel_rps_client *rps_client)
660 {
661         might_sleep();
662         GEM_BUG_ON(timeout < 0);
663
664         timeout = i915_gem_object_wait_reservation(obj->resv,
665                                                    flags, timeout,
666                                                    rps_client);
667         return timeout < 0 ? timeout : 0;
668 }
669
670 static struct intel_rps_client *to_rps_client(struct drm_file *file)
671 {
672         struct drm_i915_file_private *fpriv = file->driver_priv;
673
674         return &fpriv->rps_client;
675 }
676
677 static int
678 i915_gem_phys_pwrite(struct drm_i915_gem_object *obj,
679                      struct drm_i915_gem_pwrite *args,
680                      struct drm_file *file)
681 {
682         void *vaddr = obj->phys_handle->vaddr + args->offset;
683         char __user *user_data = u64_to_user_ptr(args->data_ptr);
684
685         /* We manually control the domain here and pretend that it
686          * remains coherent i.e. in the GTT domain, like shmem_pwrite.
687          */
688         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
689         if (copy_from_user(vaddr, user_data, args->size))
690                 return -EFAULT;
691
692         drm_clflush_virt_range(vaddr, args->size);
693         i915_gem_chipset_flush(to_i915(obj->base.dev));
694
695         intel_fb_obj_flush(obj, ORIGIN_CPU);
696         return 0;
697 }
698
699 void *i915_gem_object_alloc(struct drm_i915_private *dev_priv)
700 {
701         return kmem_cache_zalloc(dev_priv->objects, GFP_KERNEL);
702 }
703
704 void i915_gem_object_free(struct drm_i915_gem_object *obj)
705 {
706         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
707         kmem_cache_free(dev_priv->objects, obj);
708 }
709
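/*
 * Common backend for the create and dumb-create ioctls: round the size up
 * to whole pages, allocate the object and return a new userspace handle
 * that now owns the creation reference.
 */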
710 static int
711 i915_gem_create(struct drm_file *file,
712                 struct drm_i915_private *dev_priv,
713                 u64 size,
714                 u32 *handle_p)
715 {
716         struct drm_i915_gem_object *obj;
717         int ret;
718         u32 handle;
719
720         size = roundup(size, PAGE_SIZE);
721         if (size == 0)
722                 return -EINVAL;
723
724         /* Allocate the new object */
725         obj = i915_gem_object_create(dev_priv, size);
726         if (IS_ERR(obj))
727                 return PTR_ERR(obj);
728
729         ret = drm_gem_handle_create(file, &obj->base, &handle);
730         /* drop reference from allocate - handle holds it now */
731         i915_gem_object_put(obj);
732         if (ret)
733                 return ret;
734
735         *handle_p = handle;
736         return 0;
737 }
738
739 int
740 i915_gem_dumb_create(struct drm_file *file,
741                      struct drm_device *dev,
742                      struct drm_mode_create_dumb *args)
743 {
744         /* have to work out size/pitch and return them */
745         args->pitch = ALIGN(args->width * DIV_ROUND_UP(args->bpp, 8), 64);
746         args->size = args->pitch * args->height;
747         return i915_gem_create(file, to_i915(dev),
748                                args->size, &args->handle);
749 }
750
751 static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
752 {
753         return !(obj->cache_level == I915_CACHE_NONE ||
754                  obj->cache_level == I915_CACHE_WT);
755 }
756
757 /**
758  * Creates a new mm object and returns a handle to it.
759  * @dev: drm device pointer
760  * @data: ioctl data blob
761  * @file: drm file pointer
762  */
763 int
764 i915_gem_create_ioctl(struct drm_device *dev, void *data,
765                       struct drm_file *file)
766 {
767         struct drm_i915_private *dev_priv = to_i915(dev);
768         struct drm_i915_gem_create *args = data;
769
770         i915_gem_flush_free_objects(dev_priv);
771
772         return i915_gem_create(file, dev_priv,
773                                args->size, &args->handle);
774 }
775
776 static inline enum fb_op_origin
777 fb_write_origin(struct drm_i915_gem_object *obj, unsigned int domain)
778 {
779         return (domain == I915_GEM_DOMAIN_GTT ?
780                 obj->frontbuffer_ggtt_origin : ORIGIN_CPU);
781 }
782
783 void i915_gem_flush_ggtt_writes(struct drm_i915_private *dev_priv)
784 {
785         intel_wakeref_t wakeref;
786
787         /*
788          * No actual flushing is required for the GTT write domain for reads
789          * from the GTT domain. Writes to it "immediately" go to main memory
790          * as far as we know, so there's no chipset flush. It also doesn't
791          * land in the GPU render cache.
792          *
793          * However, we do have to enforce the order so that all writes through
794          * the GTT land before any writes to the device, such as updates to
795          * the GATT itself.
796          *
797          * We also have to wait a bit for the writes to land from the GTT.
798          * An uncached read (i.e. mmio) seems to be ideal for the round-trip
799          * timing. This issue has only been observed when switching quickly
800          * between GTT writes and CPU reads from inside the kernel on recent hw,
801          * and it appears to only affect discrete GTT blocks (i.e. on LLC
802          * system agents we cannot reproduce this behaviour, until Cannonlake
803          * that was!).
804          */
805
806         wmb();
807
808         if (INTEL_INFO(dev_priv)->has_coherent_ggtt)
809                 return;
810
811         i915_gem_chipset_flush(dev_priv);
812
813         with_intel_runtime_pm(dev_priv, wakeref) {
814                 spin_lock_irq(&dev_priv->uncore.lock);
815
816                 POSTING_READ_FW(RING_HEAD(RENDER_RING_BASE));
817
818                 spin_unlock_irq(&dev_priv->uncore.lock);
819         }
820 }
821
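/*
 * Flush writes pending in the object's current write domain: GTT, WC and
 * CPU writes need a barrier or clflush before other agents can observe
 * them, while GPU render writes to cacheable objects are only noted by
 * marking the cache dirty. The write domain is then cleared.
 */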
822 static void
823 flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
824 {
825         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
826         struct i915_vma *vma;
827
828         if (!(obj->write_domain & flush_domains))
829                 return;
830
831         switch (obj->write_domain) {
832         case I915_GEM_DOMAIN_GTT:
833                 i915_gem_flush_ggtt_writes(dev_priv);
834
835                 intel_fb_obj_flush(obj,
836                                    fb_write_origin(obj, I915_GEM_DOMAIN_GTT));
837
838                 for_each_ggtt_vma(vma, obj) {
839                         if (vma->iomap)
840                                 continue;
841
842                         i915_vma_unset_ggtt_write(vma);
843                 }
844                 break;
845
846         case I915_GEM_DOMAIN_WC:
847                 wmb();
848                 break;
849
850         case I915_GEM_DOMAIN_CPU:
851                 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
852                 break;
853
854         case I915_GEM_DOMAIN_RENDER:
855                 if (gpu_write_needs_clflush(obj))
856                         obj->cache_dirty = true;
857                 break;
858         }
859
860         obj->write_domain = 0;
861 }
862
863 /*
864  * Pins the specified object's pages and synchronizes the object with
865  * GPU accesses. Sets needs_clflush to non-zero if the caller should
866  * flush the object from the CPU cache.
867  */
868 int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
869                                     unsigned int *needs_clflush)
870 {
871         int ret;
872
873         lockdep_assert_held(&obj->base.dev->struct_mutex);
874
875         *needs_clflush = 0;
876         if (!i915_gem_object_has_struct_page(obj))
877                 return -ENODEV;
878
879         ret = i915_gem_object_wait(obj,
880                                    I915_WAIT_INTERRUPTIBLE |
881                                    I915_WAIT_LOCKED,
882                                    MAX_SCHEDULE_TIMEOUT,
883                                    NULL);
884         if (ret)
885                 return ret;
886
887         ret = i915_gem_object_pin_pages(obj);
888         if (ret)
889                 return ret;
890
891         if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ ||
892             !static_cpu_has(X86_FEATURE_CLFLUSH)) {
893                 ret = i915_gem_object_set_to_cpu_domain(obj, false);
894                 if (ret)
895                         goto err_unpin;
896                 else
897                         goto out;
898         }
899
900         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
901
902         /* If we're not in the cpu read domain, set ourself into the gtt
903          * read domain and manually flush cachelines (if required). This
904          * optimizes for the case when the gpu will dirty the data
905          * anyway again before the next pread happens.
906          */
907         if (!obj->cache_dirty &&
908             !(obj->read_domains & I915_GEM_DOMAIN_CPU))
909                 *needs_clflush = CLFLUSH_BEFORE;
910
911 out:
912         /* return with the pages pinned */
913         return 0;
914
915 err_unpin:
916         i915_gem_object_unpin_pages(obj);
917         return ret;
918 }
919
920 int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
921                                      unsigned int *needs_clflush)
922 {
923         int ret;
924
925         lockdep_assert_held(&obj->base.dev->struct_mutex);
926
927         *needs_clflush = 0;
928         if (!i915_gem_object_has_struct_page(obj))
929                 return -ENODEV;
930
931         ret = i915_gem_object_wait(obj,
932                                    I915_WAIT_INTERRUPTIBLE |
933                                    I915_WAIT_LOCKED |
934                                    I915_WAIT_ALL,
935                                    MAX_SCHEDULE_TIMEOUT,
936                                    NULL);
937         if (ret)
938                 return ret;
939
940         ret = i915_gem_object_pin_pages(obj);
941         if (ret)
942                 return ret;
943
944         if (obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE ||
945             !static_cpu_has(X86_FEATURE_CLFLUSH)) {
946                 ret = i915_gem_object_set_to_cpu_domain(obj, true);
947                 if (ret)
948                         goto err_unpin;
949                 else
950                         goto out;
951         }
952
953         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
954
955         /* If we're not in the cpu write domain, set ourself into the
956          * gtt write domain and manually flush cachelines (as required).
957          * This optimizes for the case when the gpu will use the data
958          * right away and we therefore have to clflush anyway.
959          */
960         if (!obj->cache_dirty) {
961                 *needs_clflush |= CLFLUSH_AFTER;
962
963                 /*
964                  * Same trick applies to invalidate partially written
965                  * cachelines read before writing.
966                  */
967                 if (!(obj->read_domains & I915_GEM_DOMAIN_CPU))
968                         *needs_clflush |= CLFLUSH_BEFORE;
969         }
970
971 out:
972         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
973         obj->mm.dirty = true;
974         /* return with the pages pinned */
975         return 0;
976
977 err_unpin:
978         i915_gem_object_unpin_pages(obj);
979         return ret;
980 }
981
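/* Per-page copy function for the shmem pread fastpath.
 * Flushes invalid cachelines before reading from the page if
 * needs_clflush is set.
 */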
982 static int
983 shmem_pread(struct page *page, int offset, int len, char __user *user_data,
984             bool needs_clflush)
985 {
986         char *vaddr;
987         int ret;
988
989         vaddr = kmap(page);
990
991         if (needs_clflush)
992                 drm_clflush_virt_range(vaddr + offset, len);
993
994         ret = __copy_to_user(user_data, vaddr + offset, len);
995
996         kunmap(page);
997
998         return ret ? -EFAULT : 0;
999 }
1000
1001 static int
1002 i915_gem_shmem_pread(struct drm_i915_gem_object *obj,
1003                      struct drm_i915_gem_pread *args)
1004 {
1005         char __user *user_data;
1006         u64 remain;
1007         unsigned int needs_clflush;
1008         unsigned int idx, offset;
1009         int ret;
1010
1011         ret = mutex_lock_interruptible(&obj->base.dev->struct_mutex);
1012         if (ret)
1013                 return ret;
1014
1015         ret = i915_gem_obj_prepare_shmem_read(obj, &needs_clflush);
1016         mutex_unlock(&obj->base.dev->struct_mutex);
1017         if (ret)
1018                 return ret;
1019
1020         remain = args->size;
1021         user_data = u64_to_user_ptr(args->data_ptr);
1022         offset = offset_in_page(args->offset);
1023         for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
1024                 struct page *page = i915_gem_object_get_page(obj, idx);
1025                 unsigned int length = min_t(u64, remain, PAGE_SIZE - offset);
1026
1027                 ret = shmem_pread(page, offset, length, user_data,
1028                                   needs_clflush);
1029                 if (ret)
1030                         break;
1031
1032                 remain -= length;
1033                 user_data += length;
1034                 offset = 0;
1035         }
1036
1037         i915_gem_obj_finish_shmem_access(obj);
1038         return ret;
1039 }
1040
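/* Copy from the GGTT aperture to userspace: first via an atomic WC
 * mapping that cannot take faults, then falling back to a regular
 * mapping of the page if any bytes were left uncopied. Returns non-zero
 * if the copy could not be completed.
 */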
1041 static inline bool
1042 gtt_user_read(struct io_mapping *mapping,
1043               loff_t base, int offset,
1044               char __user *user_data, int length)
1045 {
1046         void __iomem *vaddr;
1047         unsigned long unwritten;
1048
1049         /* We can use the cpu mem copy function because this is X86. */
1050         vaddr = io_mapping_map_atomic_wc(mapping, base);
1051         unwritten = __copy_to_user_inatomic(user_data,
1052                                             (void __force *)vaddr + offset,
1053                                             length);
1054         io_mapping_unmap_atomic(vaddr);
1055         if (unwritten) {
1056                 vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
1057                 unwritten = copy_to_user(user_data,
1058                                          (void __force *)vaddr + offset,
1059                                          length);
1060                 io_mapping_unmap(vaddr);
1061         }
1062         return unwritten;
1063 }
1064
1065 static int
1066 i915_gem_gtt_pread(struct drm_i915_gem_object *obj,
1067                    const struct drm_i915_gem_pread *args)
1068 {
1069         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1070         struct i915_ggtt *ggtt = &i915->ggtt;
1071         intel_wakeref_t wakeref;
1072         struct drm_mm_node node;
1073         struct i915_vma *vma;
1074         void __user *user_data;
1075         u64 remain, offset;
1076         int ret;
1077
1078         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1079         if (ret)
1080                 return ret;
1081
1082         wakeref = intel_runtime_pm_get(i915);
1083         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1084                                        PIN_MAPPABLE |
1085                                        PIN_NONFAULT |
1086                                        PIN_NONBLOCK);
1087         if (!IS_ERR(vma)) {
1088                 node.start = i915_ggtt_offset(vma);
1089                 node.allocated = false;
1090                 ret = i915_vma_put_fence(vma);
1091                 if (ret) {
1092                         i915_vma_unpin(vma);
1093                         vma = ERR_PTR(ret);
1094                 }
1095         }
1096         if (IS_ERR(vma)) {
1097                 ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1098                 if (ret)
1099                         goto out_unlock;
1100                 GEM_BUG_ON(!node.allocated);
1101         }
1102
1103         ret = i915_gem_object_set_to_gtt_domain(obj, false);
1104         if (ret)
1105                 goto out_unpin;
1106
1107         mutex_unlock(&i915->drm.struct_mutex);
1108
1109         user_data = u64_to_user_ptr(args->data_ptr);
1110         remain = args->size;
1111         offset = args->offset;
1112
1113         while (remain > 0) {
1114                 /* Operation in this page
1115                  *
1116                  * page_base = page offset within aperture
1117                  * page_offset = offset within page
1118                  * page_length = bytes to copy for this page
1119                  */
1120                 u32 page_base = node.start;
1121                 unsigned page_offset = offset_in_page(offset);
1122                 unsigned page_length = PAGE_SIZE - page_offset;
1123                 page_length = remain < page_length ? remain : page_length;
1124                 if (node.allocated) {
1125                         wmb();
1126                         ggtt->vm.insert_page(&ggtt->vm,
1127                                              i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1128                                              node.start, I915_CACHE_NONE, 0);
1129                         wmb();
1130                 } else {
1131                         page_base += offset & PAGE_MASK;
1132                 }
1133
1134                 if (gtt_user_read(&ggtt->iomap, page_base, page_offset,
1135                                   user_data, page_length)) {
1136                         ret = -EFAULT;
1137                         break;
1138                 }
1139
1140                 remain -= page_length;
1141                 user_data += page_length;
1142                 offset += page_length;
1143         }
1144
1145         mutex_lock(&i915->drm.struct_mutex);
1146 out_unpin:
1147         if (node.allocated) {
1148                 wmb();
1149                 ggtt->vm.clear_range(&ggtt->vm, node.start, node.size);
1150                 remove_mappable_node(&node);
1151         } else {
1152                 i915_vma_unpin(vma);
1153         }
1154 out_unlock:
1155         intel_runtime_pm_put(i915, wakeref);
1156         mutex_unlock(&i915->drm.struct_mutex);
1157
1158         return ret;
1159 }
1160
1161 /**
1162  * Reads data from the object referenced by handle.
1163  * @dev: drm device pointer
1164  * @data: ioctl data blob
1165  * @file: drm file pointer
1166  *
1167  * On error, the contents of *data are undefined.
1168  */
1169 int
1170 i915_gem_pread_ioctl(struct drm_device *dev, void *data,
1171                      struct drm_file *file)
1172 {
1173         struct drm_i915_gem_pread *args = data;
1174         struct drm_i915_gem_object *obj;
1175         int ret;
1176
1177         if (args->size == 0)
1178                 return 0;
1179
1180         if (!access_ok(u64_to_user_ptr(args->data_ptr),
1181                        args->size))
1182                 return -EFAULT;
1183
1184         obj = i915_gem_object_lookup(file, args->handle);
1185         if (!obj)
1186                 return -ENOENT;
1187
1188         /* Bounds check source.  */
1189         if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1190                 ret = -EINVAL;
1191                 goto out;
1192         }
1193
1194         trace_i915_gem_object_pread(obj, args->offset, args->size);
1195
1196         ret = i915_gem_object_wait(obj,
1197                                    I915_WAIT_INTERRUPTIBLE,
1198                                    MAX_SCHEDULE_TIMEOUT,
1199                                    to_rps_client(file));
1200         if (ret)
1201                 goto out;
1202
1203         ret = i915_gem_object_pin_pages(obj);
1204         if (ret)
1205                 goto out;
1206
1207         ret = i915_gem_shmem_pread(obj, args);
1208         if (ret == -EFAULT || ret == -ENODEV)
1209                 ret = i915_gem_gtt_pread(obj, args);
1210
1211         i915_gem_object_unpin_pages(obj);
1212 out:
1213         i915_gem_object_put(obj);
1214         return ret;
1215 }
1216
1217 /* This is the fast write path which cannot handle
1218  * page faults in the source data
1219  */
1220
1221 static inline bool
1222 ggtt_write(struct io_mapping *mapping,
1223            loff_t base, int offset,
1224            char __user *user_data, int length)
1225 {
1226         void __iomem *vaddr;
1227         unsigned long unwritten;
1228
1229         /* We can use the cpu mem copy function because this is X86. */
1230         vaddr = io_mapping_map_atomic_wc(mapping, base);
1231         unwritten = __copy_from_user_inatomic_nocache((void __force *)vaddr + offset,
1232                                                       user_data, length);
1233         io_mapping_unmap_atomic(vaddr);
1234         if (unwritten) {
1235                 vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
1236                 unwritten = copy_from_user((void __force *)vaddr + offset,
1237                                            user_data, length);
1238                 io_mapping_unmap(vaddr);
1239         }
1240
1241         return unwritten;
1242 }
1243
1244 /**
1245  * This is the fast pwrite path, where we copy the data directly from the
1246  * user into the GTT, uncached.
1247  * @obj: i915 GEM object
1248  * @args: pwrite arguments structure
1249  */
1250 static int
1251 i915_gem_gtt_pwrite_fast(struct drm_i915_gem_object *obj,
1252                          const struct drm_i915_gem_pwrite *args)
1253 {
1254         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1255         struct i915_ggtt *ggtt = &i915->ggtt;
1256         intel_wakeref_t wakeref;
1257         struct drm_mm_node node;
1258         struct i915_vma *vma;
1259         u64 remain, offset;
1260         void __user *user_data;
1261         int ret;
1262
1263         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1264         if (ret)
1265                 return ret;
1266
1267         if (i915_gem_object_has_struct_page(obj)) {
1268                 /*
1269                  * Avoid waking the device up if we can fall back, as
1270                  * waking/resuming is very slow (worst-case 10-100 ms
1271                  * depending on PCI sleeps and our own resume time).
1272                  * This easily dwarfs any performance advantage from
1273                  * using the cache bypass of indirect GGTT access.
1274                  */
1275                 wakeref = intel_runtime_pm_get_if_in_use(i915);
1276                 if (!wakeref) {
1277                         ret = -EFAULT;
1278                         goto out_unlock;
1279                 }
1280         } else {
1281                 /* No backing pages, no fallback, we must force GGTT access */
1282                 wakeref = intel_runtime_pm_get(i915);
1283         }
1284
1285         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1286                                        PIN_MAPPABLE |
1287                                        PIN_NONFAULT |
1288                                        PIN_NONBLOCK);
1289         if (!IS_ERR(vma)) {
1290                 node.start = i915_ggtt_offset(vma);
1291                 node.allocated = false;
1292                 ret = i915_vma_put_fence(vma);
1293                 if (ret) {
1294                         i915_vma_unpin(vma);
1295                         vma = ERR_PTR(ret);
1296                 }
1297         }
1298         if (IS_ERR(vma)) {
1299                 ret = insert_mappable_node(ggtt, &node, PAGE_SIZE);
1300                 if (ret)
1301                         goto out_rpm;
1302                 GEM_BUG_ON(!node.allocated);
1303         }
1304
1305         ret = i915_gem_object_set_to_gtt_domain(obj, true);
1306         if (ret)
1307                 goto out_unpin;
1308
1309         mutex_unlock(&i915->drm.struct_mutex);
1310
1311         intel_fb_obj_invalidate(obj, ORIGIN_CPU);
1312
1313         user_data = u64_to_user_ptr(args->data_ptr);
1314         offset = args->offset;
1315         remain = args->size;
1316         while (remain) {
1317                 /* Operation in this page
1318                  *
1319                  * page_base = page offset within aperture
1320                  * page_offset = offset within page
1321                  * page_length = bytes to copy for this page
1322                  */
1323                 u32 page_base = node.start;
1324                 unsigned int page_offset = offset_in_page(offset);
1325                 unsigned int page_length = PAGE_SIZE - page_offset;
1326                 page_length = remain < page_length ? remain : page_length;
1327                 if (node.allocated) {
1328                         wmb(); /* flush the write before we modify the GGTT */
1329                         ggtt->vm.insert_page(&ggtt->vm,
1330                                              i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT),
1331                                              node.start, I915_CACHE_NONE, 0);
1332                         wmb(); /* flush modifications to the GGTT (insert_page) */
1333                 } else {
1334                         page_base += offset & PAGE_MASK;
1335                 }
1336                 /* If we get a fault while copying data, then (presumably) our
1337                  * source page isn't available.  Return the error and we'll
1338                  * retry in the slow path.
1339                  * If the object is non-shmem backed, we retry again with the
1340          * path that handles page faults.
1341                  */
1342                 if (ggtt_write(&ggtt->iomap, page_base, page_offset,
1343                                user_data, page_length)) {
1344                         ret = -EFAULT;
1345                         break;
1346                 }
1347
1348                 remain -= page_length;
1349                 user_data += page_length;
1350                 offset += page_length;
1351         }
1352         intel_fb_obj_flush(obj, ORIGIN_CPU);
1353
1354         mutex_lock(&i915->drm.struct_mutex);
1355 out_unpin:
1356         if (node.allocated) {
1357                 wmb();
1358                 ggtt->vm.clear_range(&ggtt->vm, node.start, node.size);
1359                 remove_mappable_node(&node);
1360         } else {
1361                 i915_vma_unpin(vma);
1362         }
1363 out_rpm:
1364         intel_runtime_pm_put(i915, wakeref);
1365 out_unlock:
1366         mutex_unlock(&i915->drm.struct_mutex);
1367         return ret;
1368 }
1369
1370 /* Per-page copy function for the shmem pwrite fastpath.
1371  * Flushes invalid cachelines before writing to the target if
1372  * needs_clflush_before is set and flushes out any written cachelines after
1373  * writing if needs_clflush is set.
1374  */
1375 static int
1376 shmem_pwrite(struct page *page, int offset, int len, char __user *user_data,
1377              bool needs_clflush_before,
1378              bool needs_clflush_after)
1379 {
1380         char *vaddr;
1381         int ret;
1382
1383         vaddr = kmap(page);
1384
1385         if (needs_clflush_before)
1386                 drm_clflush_virt_range(vaddr + offset, len);
1387
1388         ret = __copy_from_user(vaddr + offset, user_data, len);
1389         if (!ret && needs_clflush_after)
1390                 drm_clflush_virt_range(vaddr + offset, len);
1391
1392         kunmap(page);
1393
1394         return ret ? -EFAULT : 0;
1395 }
1396
1397 static int
1398 i915_gem_shmem_pwrite(struct drm_i915_gem_object *obj,
1399                       const struct drm_i915_gem_pwrite *args)
1400 {
1401         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1402         void __user *user_data;
1403         u64 remain;
1404         unsigned int partial_cacheline_write;
1405         unsigned int needs_clflush;
1406         unsigned int offset, idx;
1407         int ret;
1408
1409         ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
1410         if (ret)
1411                 return ret;
1412
1413         ret = i915_gem_obj_prepare_shmem_write(obj, &needs_clflush);
1414         mutex_unlock(&i915->drm.struct_mutex);
1415         if (ret)
1416                 return ret;
1417
1418         /* If we don't overwrite a cacheline completely we need to be
1419          * careful to have up-to-date data by first clflushing. Don't
1420          * overcomplicate things and flush the entire write.
1421          */
1422         partial_cacheline_write = 0;
1423         if (needs_clflush & CLFLUSH_BEFORE)
1424                 partial_cacheline_write = boot_cpu_data.x86_clflush_size - 1;
1425
1426         user_data = u64_to_user_ptr(args->data_ptr);
1427         remain = args->size;
1428         offset = offset_in_page(args->offset);
1429         for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
1430                 struct page *page = i915_gem_object_get_page(obj, idx);
1431                 unsigned int length = min_t(u64, remain, PAGE_SIZE - offset);
1432
1433                 ret = shmem_pwrite(page, offset, length, user_data,
1434                                    (offset | length) & partial_cacheline_write,
1435                                    needs_clflush & CLFLUSH_AFTER);
1436                 if (ret)
1437                         break;
1438
1439                 remain -= length;
1440                 user_data += length;
1441                 offset = 0;
1442         }
1443
1444         intel_fb_obj_flush(obj, ORIGIN_CPU);
1445         i915_gem_obj_finish_shmem_access(obj);
1446         return ret;
1447 }
1448
1449 /**
1450  * Writes data to the object referenced by handle.
1451  * @dev: drm device
1452  * @data: ioctl data blob
1453  * @file: drm file
1454  *
1455  * On error, the contents of the buffer that were to be modified are undefined.
1456  */
1457 int
1458 i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
1459                       struct drm_file *file)
1460 {
1461         struct drm_i915_gem_pwrite *args = data;
1462         struct drm_i915_gem_object *obj;
1463         int ret;
1464
1465         if (args->size == 0)
1466                 return 0;
1467
1468         if (!access_ok(u64_to_user_ptr(args->data_ptr), args->size))
1469                 return -EFAULT;
1470
1471         obj = i915_gem_object_lookup(file, args->handle);
1472         if (!obj)
1473                 return -ENOENT;
1474
1475         /* Bounds check destination. */
1476         if (range_overflows_t(u64, args->offset, args->size, obj->base.size)) {
1477                 ret = -EINVAL;
1478                 goto err;
1479         }
1480
1481         /* Writes not allowed into this read-only object */
1482         if (i915_gem_object_is_readonly(obj)) {
1483                 ret = -EINVAL;
1484                 goto err;
1485         }
1486
1487         trace_i915_gem_object_pwrite(obj, args->offset, args->size);
1488
1489         ret = -ENODEV;
1490         if (obj->ops->pwrite)
1491                 ret = obj->ops->pwrite(obj, args);
1492         if (ret != -ENODEV)
1493                 goto err;
1494
1495         ret = i915_gem_object_wait(obj,
1496                                    I915_WAIT_INTERRUPTIBLE |
1497                                    I915_WAIT_ALL,
1498                                    MAX_SCHEDULE_TIMEOUT,
1499                                    to_rps_client(file));
1500         if (ret)
1501                 goto err;
1502
1503         ret = i915_gem_object_pin_pages(obj);
1504         if (ret)
1505                 goto err;
1506
1507         ret = -EFAULT;
1508         /* We can only do the GTT pwrite on untiled buffers, as otherwise
1509          * it would end up going through the fenced access, and we'll get
1510          * different detiling behavior between reading and writing.
1511          * pread/pwrite currently are reading and writing from the CPU
1512          * perspective, requiring manual detiling by the client.
1513          */
1514         if (!i915_gem_object_has_struct_page(obj) ||
1515             cpu_write_needs_clflush(obj))
1516                 /* Note that the gtt paths might fail with non-page-backed user
1517                  * pointers (e.g. gtt mappings when moving data between
1518                  * textures). Fallback to the shmem path in that case.
1519                  */
1520                 ret = i915_gem_gtt_pwrite_fast(obj, args);
1521
1522         if (ret == -EFAULT || ret == -ENOSPC) {
1523                 if (obj->phys_handle)
1524                         ret = i915_gem_phys_pwrite(obj, args, file);
1525                 else
1526                         ret = i915_gem_shmem_pwrite(obj, args);
1527         }
1528
1529         i915_gem_object_unpin_pages(obj);
1530 err:
1531         i915_gem_object_put(obj);
1532         return ret;
1533 }
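
/*
 * Illustrative userspace sketch (not part of this file): how a client might
 * drive this ioctl using libdrm's drmIoctl() and the uapi structures from
 * i915_drm.h. The fd and handle are assumed to exist already and error
 * handling is elided.
 *
 *	struct drm_i915_gem_pwrite arg = {
 *		.handle = handle,
 *		.offset = dst_offset,
 *		.size = len,
 *		.data_ptr = (uintptr_t)src,
 *	};
 *
 *	drmIoctl(fd, DRM_IOCTL_I915_GEM_PWRITE, &arg);
 */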
1534
1535 static void i915_gem_object_bump_inactive_ggtt(struct drm_i915_gem_object *obj)
1536 {
1537         struct drm_i915_private *i915 = to_i915(obj->base.dev);
1538         struct list_head *list;
1539         struct i915_vma *vma;
1540
1541         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
1542
1543         mutex_lock(&i915->ggtt.vm.mutex);
1544         for_each_ggtt_vma(vma, obj) {
1545                 if (!drm_mm_node_allocated(&vma->node))
1546                         continue;
1547
1548                 list_move_tail(&vma->vm_link, &vma->vm->bound_list);
1549         }
1550         mutex_unlock(&i915->ggtt.vm.mutex);
1551
1552         spin_lock(&i915->mm.obj_lock);
1553         list = obj->bind_count ? &i915->mm.bound_list : &i915->mm.unbound_list;
1554         list_move_tail(&obj->mm.link, list);
1555         spin_unlock(&i915->mm.obj_lock);
1556 }
1557
1558 /**
1559  * Called when user space prepares to use an object with the CPU, either
1560  * through the mmap ioctl's mapping or a GTT mapping.
1561  * @dev: drm device
1562  * @data: ioctl data blob
1563  * @file: drm file
1564  */
1565 int
1566 i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
1567                           struct drm_file *file)
1568 {
1569         struct drm_i915_gem_set_domain *args = data;
1570         struct drm_i915_gem_object *obj;
1571         u32 read_domains = args->read_domains;
1572         u32 write_domain = args->write_domain;
1573         int err;
1574
1575         /* Only handle setting domains to types used by the CPU. */
1576         if ((write_domain | read_domains) & I915_GEM_GPU_DOMAINS)
1577                 return -EINVAL;
1578
1579         /* Having something in the write domain implies it's in the read
1580          * domain, and only that read domain.  Enforce that in the request.
1581          */
1582         if (write_domain != 0 && read_domains != write_domain)
1583                 return -EINVAL;
1584
1585         obj = i915_gem_object_lookup(file, args->handle);
1586         if (!obj)
1587                 return -ENOENT;
1588
1589         /* Try to flush the object off the GPU without holding the lock.
1590          * We will repeat the flush holding the lock in the normal manner
1591          * to catch cases where we are gazumped.
1592          */
1593         err = i915_gem_object_wait(obj,
1594                                    I915_WAIT_INTERRUPTIBLE |
1595                                    I915_WAIT_PRIORITY |
1596                                    (write_domain ? I915_WAIT_ALL : 0),
1597                                    MAX_SCHEDULE_TIMEOUT,
1598                                    to_rps_client(file));
1599         if (err)
1600                 goto out;
1601
1602         /*
1603          * Proxy objects do not control access to the backing storage, ergo
1604          * they cannot be used as a means to manipulate the cache domain
1605          * tracking for that backing storage. The proxy object is always
1606          * considered to be outside of any cache domain.
1607          */
1608         if (i915_gem_object_is_proxy(obj)) {
1609                 err = -ENXIO;
1610                 goto out;
1611         }
1612
1613         /*
1614          * Flush and acquire obj->pages so that we are coherent through
1615          * direct access in memory with previous cached writes through
1616          * shmemfs and that our cache domain tracking remains valid.
1617          * For example, if the obj->filp was moved to swap without us
1618          * being notified and releasing the pages, we would mistakenly
1619          * continue to assume that the obj remained out of the CPU cached
1620          * domain.
1621          */
1622         err = i915_gem_object_pin_pages(obj);
1623         if (err)
1624                 goto out;
1625
1626         err = i915_mutex_lock_interruptible(dev);
1627         if (err)
1628                 goto out_unpin;
1629
1630         if (read_domains & I915_GEM_DOMAIN_WC)
1631                 err = i915_gem_object_set_to_wc_domain(obj, write_domain);
1632         else if (read_domains & I915_GEM_DOMAIN_GTT)
1633                 err = i915_gem_object_set_to_gtt_domain(obj, write_domain);
1634         else
1635                 err = i915_gem_object_set_to_cpu_domain(obj, write_domain);
1636
1637         /* And bump the LRU for this access */
1638         i915_gem_object_bump_inactive_ggtt(obj);
1639
1640         mutex_unlock(&dev->struct_mutex);
1641
1642         if (write_domain != 0)
1643                 intel_fb_obj_invalidate(obj,
1644                                         fb_write_origin(obj, write_domain));
1645
1646 out_unpin:
1647         i915_gem_object_unpin_pages(obj);
1648 out:
1649         i915_gem_object_put(obj);
1650         return err;
1651 }
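
/*
 * Illustrative userspace sketch (not part of this file): a client moving an
 * object into the CPU read/write domain before dirtying it through a CPU
 * mmap, using libdrm's drmIoctl(). The fd and handle are assumed to exist
 * and error handling is elided.
 *
 *	struct drm_i915_gem_set_domain arg = {
 *		.handle = handle,
 *		.read_domains = I915_GEM_DOMAIN_CPU,
 *		.write_domain = I915_GEM_DOMAIN_CPU,
 *	};
 *
 *	drmIoctl(fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &arg);
 */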
1652
1653 /**
1654  * Called when user space has done writes to this buffer
1655  * @dev: drm device
1656  * @data: ioctl data blob
1657  * @file: drm file
1658  */
1659 int
1660 i915_gem_sw_finish_ioctl(struct drm_device *dev, void *data,
1661                          struct drm_file *file)
1662 {
1663         struct drm_i915_gem_sw_finish *args = data;
1664         struct drm_i915_gem_object *obj;
1665
1666         obj = i915_gem_object_lookup(file, args->handle);
1667         if (!obj)
1668                 return -ENOENT;
1669
1670         /*
1671          * Proxy objects are barred from CPU access, so there is no
1672          * need to ban sw_finish as it is a nop.
1673          */
1674
1675         /* Pinned buffers may be scanout, so flush the cache */
1676         i915_gem_object_flush_if_display(obj);
1677         i915_gem_object_put(obj);
1678
1679         return 0;
1680 }
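
/*
 * Illustrative userspace sketch (not part of this file): signalling the end
 * of CPU writes so a potential scanout buffer gets flushed. Only the handle
 * is required; fd and handle are assumed to exist.
 *
 *	struct drm_i915_gem_sw_finish arg = { .handle = handle };
 *
 *	drmIoctl(fd, DRM_IOCTL_I915_GEM_SW_FINISH, &arg);
 */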
1681
1682 static inline bool
1683 __vma_matches(struct vm_area_struct *vma, struct file *filp,
1684               unsigned long addr, unsigned long size)
1685 {
1686         if (vma->vm_file != filp)
1687                 return false;
1688
1689         return vma->vm_start == addr && (vma->vm_end - vma->vm_start) == size;
1690 }
1691
1692 /**
1693  * i915_gem_mmap_ioctl - Maps the contents of an object, returning the address
1694  *                       it is mapped to.
1695  * @dev: drm device
1696  * @data: ioctl data blob
1697  * @file: drm file
1698  *
1699  * While the mapping holds a reference on the contents of the object, it doesn't
1700  * imply a ref on the object itself.
1701  *
1702  * IMPORTANT:
1703  *
1704  * DRM driver writers who look at this function as an example for how to do GEM
1705  * mmap support, please don't implement mmap support like this. The modern way
1706  * to implement DRM mmap support is with an mmap offset ioctl (like
1707  * i915_gem_mmap_gtt) and then using the mmap syscall on the DRM fd directly.
1708  * That way debug tooling like valgrind will understand what's going on; hiding
1709  * the mmap call in a driver-private ioctl breaks that. The i915 driver only
1710  * does cpu mmaps this way because we didn't know better.
1711  */
1712 int
1713 i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
1714                     struct drm_file *file)
1715 {
1716         struct drm_i915_gem_mmap *args = data;
1717         struct drm_i915_gem_object *obj;
1718         unsigned long addr;
1719
1720         if (args->flags & ~(I915_MMAP_WC))
1721                 return -EINVAL;
1722
1723         if (args->flags & I915_MMAP_WC && !boot_cpu_has(X86_FEATURE_PAT))
1724                 return -ENODEV;
1725
1726         obj = i915_gem_object_lookup(file, args->handle);
1727         if (!obj)
1728                 return -ENOENT;
1729
1730         /* prime objects have no backing filp to GEM mmap
1731          * pages from.
1732          */
1733         if (!obj->base.filp) {
1734                 i915_gem_object_put(obj);
1735                 return -ENXIO;
1736         }
1737
1738         addr = vm_mmap(obj->base.filp, 0, args->size,
1739                        PROT_READ | PROT_WRITE, MAP_SHARED,
1740                        args->offset);
1741         if (args->flags & I915_MMAP_WC) {
1742                 struct mm_struct *mm = current->mm;
1743                 struct vm_area_struct *vma;
1744
1745                 if (down_write_killable(&mm->mmap_sem)) {
1746                         i915_gem_object_put(obj);
1747                         return -EINTR;
1748                 }
1749                 vma = find_vma(mm, addr);
1750                 if (vma && __vma_matches(vma, obj->base.filp, addr, args->size))
1751                         vma->vm_page_prot =
1752                                 pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
1753                 else
1754                         addr = -ENOMEM;
1755                 up_write(&mm->mmap_sem);
1756
1757                 /* This may race, but that's ok, it only gets set */
1758                 WRITE_ONCE(obj->frontbuffer_ggtt_origin, ORIGIN_CPU);
1759         }
1760         i915_gem_object_put(obj);
1761         if (IS_ERR((void *)addr))
1762                 return addr;
1763
1764         args->addr_ptr = (u64)addr;
1765
1766         return 0;
1767 }
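
/*
 * Illustrative userspace sketch (not part of this file): obtaining a CPU
 * mapping through this legacy ioctl (see the warning in the comment above --
 * new drivers should expose an mmap offset instead). I915_MMAP_WC requests a
 * write-combining mapping; fd, handle and obj_size are assumed to exist and
 * error handling is elided.
 *
 *	struct drm_i915_gem_mmap arg = {
 *		.handle = handle,
 *		.size = obj_size,
 *		.flags = I915_MMAP_WC,
 *	};
 *	void *ptr = NULL;
 *
 *	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_MMAP, &arg) == 0)
 *		ptr = (void *)(uintptr_t)arg.addr_ptr;
 */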
1768
1769 static unsigned int tile_row_pages(const struct drm_i915_gem_object *obj)
1770 {
1771         return i915_gem_object_get_tile_row_size(obj) >> PAGE_SHIFT;
1772 }
1773
1774 /**
1775  * i915_gem_mmap_gtt_version - report the current feature set for GTT mmaps
1776  *
1777  * A history of the GTT mmap interface:
1778  *
1779  * 0 - Everything had to fit into the GTT. Both parties of a memcpy had to
1780  *     be aligned and suitable for fencing, and still fit into the available
1781  *     mappable space left by the pinned display objects. A classic problem
1782  *     we called the page-fault-of-doom where we would ping-pong between
1783  *     two objects that could not fit inside the GTT and so the memcpy
1784  *     would page one object in at the expense of the other between every
1785  *     single byte.
1786  *
1787  * 1 - Objects can be any size, and have any compatible fencing (X, Y, or none
1788  *     as set via i915_gem_set_tiling() [DRM_I915_GEM_SET_TILING]). If the
1789  *     object is too large for the available space (or simply too large
1790  *     for the mappable aperture!), a view is created instead and faulted
1791  *     into userspace. (This view is aligned and sized appropriately for
1792  *     fenced access.)
1793  *
1794  * 2 - Recognise WC as a separate cache domain so that we can flush the
1795  *     delayed writes via GTT before performing direct access via WC.
1796  *
1797  * Restrictions:
1798  *
1799  *  * snoopable objects cannot be accessed via the GTT. Doing so can cause machine
1800  *    hangs on some architectures, corruption on others. An attempt to service
1801  *    a GTT page fault from a snoopable object will generate a SIGBUS.
1802  *
1803  *  * the object must be able to fit into RAM (physical memory, though not
1804  *    limited to the mappable aperture).
1805  *
1806  *
1807  * Caveats:
1808  *
1809  *  * a new GTT page fault will synchronize rendering from the GPU and flush
1810  *    all data to system memory. Subsequent access will not be synchronized.
1811  *
1812  *  * all mappings are revoked on runtime device suspend.
1813  *
1814  *  * there are only 8, 16 or 32 fence registers to share between all users
1815  *    (older machines require fence register for display and blitter access
1816  *    (older machines require a fence register for display and blitter access
1817  *    to be unmapped and any new access will generate new page faults.
1818  *
1819  *  * running out of memory while servicing a fault may generate a SIGBUS,
1820  *    rather than the expected SIGSEGV.
1821  */
1822 int i915_gem_mmap_gtt_version(void)
1823 {
1824         return 2;
1825 }
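
/*
 * Illustrative userspace sketch (not part of this file): querying the
 * feature level reported above through GETPARAM so a client can decide, for
 * example, whether partial views (version >= 1) are available. The fd is
 * assumed to exist and error handling is minimal.
 *
 *	int version = 0;
 *	struct drm_i915_getparam gp = {
 *		.param = I915_PARAM_MMAP_GTT_VERSION,
 *		.value = &version,
 *	};
 *
 *	if (drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp))
 *		version = 0;
 */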
1826
1827 static inline struct i915_ggtt_view
1828 compute_partial_view(const struct drm_i915_gem_object *obj,
1829                      pgoff_t page_offset,
1830                      unsigned int chunk)
1831 {
1832         struct i915_ggtt_view view;
1833
1834         if (i915_gem_object_is_tiled(obj))
1835                 chunk = roundup(chunk, tile_row_pages(obj));
1836
1837         view.type = I915_GGTT_VIEW_PARTIAL;
1838         view.partial.offset = rounddown(page_offset, chunk);
1839         view.partial.size =
1840                 min_t(unsigned int, chunk,
1841                       (obj->base.size >> PAGE_SHIFT) - view.partial.offset);
1842
1843         /* If the partial covers the entire object, just create a normal VMA. */
1844         if (chunk >= obj->base.size >> PAGE_SHIFT)
1845                 view.type = I915_GGTT_VIEW_NORMAL;
1846
1847         return view;
1848 }
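
/*
 * Worked example for the helper above, assuming 4K pages and an untiled
 * 16 MiB object (4096 pages): a fault at page_offset 1000 with the default
 * 1 MiB chunk (256 pages) yields a partial view starting at page 768
 * (rounddown(1000, 256)) and spanning 256 pages. Only if the chunk covered
 * the whole object would a normal, full-object view be used instead.
 */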
1849
1850 /**
1851  * i915_gem_fault - fault a page into the GTT
1852  * @vmf: fault info
1853  *
1854  * The fault handler is set up by drm_gem_mmap() when an object is GTT mapped
1855  * from userspace.  The fault handler takes care of binding the object to
1856  * the GTT (if needed), allocating and programming a fence register (again,
1857  * only if needed based on whether the old reg is still valid or the object
1858  * is tiled) and inserting a new PTE into the faulting process.
1859  *
1860  * Note that the faulting process may involve evicting existing objects
1861  * from the GTT and/or fence registers to make room.  So performance may
1862  * suffer if the GTT working set is large or there are few fence registers
1863  * left.
1864  *
1865  * The current feature set supported by i915_gem_fault() and thus GTT mmaps
1866  * is exposed via I915_PARAM_MMAP_GTT_VERSION (see i915_gem_mmap_gtt_version).
1867  */
1868 vm_fault_t i915_gem_fault(struct vm_fault *vmf)
1869 {
1870 #define MIN_CHUNK_PAGES (SZ_1M >> PAGE_SHIFT)
1871         struct vm_area_struct *area = vmf->vma;
1872         struct drm_i915_gem_object *obj = to_intel_bo(area->vm_private_data);
1873         struct drm_device *dev = obj->base.dev;
1874         struct drm_i915_private *dev_priv = to_i915(dev);
1875         struct i915_ggtt *ggtt = &dev_priv->ggtt;
1876         bool write = area->vm_flags & VM_WRITE;
1877         intel_wakeref_t wakeref;
1878         struct i915_vma *vma;
1879         pgoff_t page_offset;
1880         int ret;
1881
1882         /* Sanity check that we allow writing into this object */
1883         if (i915_gem_object_is_readonly(obj) && write)
1884                 return VM_FAULT_SIGBUS;
1885
1886         /* We don't use vmf->pgoff since that has the fake offset */
1887         page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT;
1888
1889         trace_i915_gem_object_fault(obj, page_offset, true, write);
1890
1891         /* Try to flush the object off the GPU first without holding the lock.
1892          * Upon acquiring the lock, we will perform our sanity checks and then
1893          * repeat the flush holding the lock in the normal manner to catch cases
1894          * where we are gazumped.
1895          */
1896         ret = i915_gem_object_wait(obj,
1897                                    I915_WAIT_INTERRUPTIBLE,
1898                                    MAX_SCHEDULE_TIMEOUT,
1899                                    NULL);
1900         if (ret)
1901                 goto err;
1902
1903         ret = i915_gem_object_pin_pages(obj);
1904         if (ret)
1905                 goto err;
1906
1907         wakeref = intel_runtime_pm_get(dev_priv);
1908
1909         ret = i915_mutex_lock_interruptible(dev);
1910         if (ret)
1911                 goto err_rpm;
1912
1913         /* Access to snoopable pages through the GTT is incoherent. */
1914         if (obj->cache_level != I915_CACHE_NONE && !HAS_LLC(dev_priv)) {
1915                 ret = -EFAULT;
1916                 goto err_unlock;
1917         }
1918
1919
1920         /* Now pin it into the GTT as needed */
1921         vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
1922                                        PIN_MAPPABLE |
1923                                        PIN_NONBLOCK |
1924                                        PIN_NONFAULT);
1925         if (IS_ERR(vma)) {
1926                 /* Use a partial view if it is bigger than available space */
1927                 struct i915_ggtt_view view =
1928                         compute_partial_view(obj, page_offset, MIN_CHUNK_PAGES);
1929                 unsigned int flags;
1930
1931                 flags = PIN_MAPPABLE;
1932                 if (view.type == I915_GGTT_VIEW_NORMAL)
1933                         flags |= PIN_NONBLOCK; /* avoid warnings for pinned */
1934
1935                 /*
1936                  * Userspace is now writing through an untracked VMA, abandon
1937                  * all hope that the hardware is able to track future writes.
1938                  */
1939                 obj->frontbuffer_ggtt_origin = ORIGIN_CPU;
1940
1941                 vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, flags);
1942                 if (IS_ERR(vma) && !view.type) {
1943                         flags = PIN_MAPPABLE;
1944                         view.type = I915_GGTT_VIEW_PARTIAL;
1945                         vma = i915_gem_object_ggtt_pin(obj, &view, 0, 0, flags);
1946                 }
1947         }
1948         if (IS_ERR(vma)) {
1949                 ret = PTR_ERR(vma);
1950                 goto err_unlock;
1951         }
1952
1953         ret = i915_gem_object_set_to_gtt_domain(obj, write);
1954         if (ret)
1955                 goto err_unpin;
1956
1957         ret = i915_vma_pin_fence(vma);
1958         if (ret)
1959                 goto err_unpin;
1960
1961         /* Finally, remap it using the new GTT offset */
1962         ret = remap_io_mapping(area,
1963                                area->vm_start + (vma->ggtt_view.partial.offset << PAGE_SHIFT),
1964                                (ggtt->gmadr.start + vma->node.start) >> PAGE_SHIFT,
1965                                min_t(u64, vma->size, area->vm_end - area->vm_start),
1966                                &ggtt->iomap);
1967         if (ret)
1968                 goto err_fence;
1969
1970         /* Mark as being mmapped into userspace for later revocation */
1971         assert_rpm_wakelock_held(dev_priv);
1972         if (!i915_vma_set_userfault(vma) && !obj->userfault_count++)
1973                 list_add(&obj->userfault_link, &dev_priv->mm.userfault_list);
1974         GEM_BUG_ON(!obj->userfault_count);
1975
1976         i915_vma_set_ggtt_write(vma);
1977
1978 err_fence:
1979         i915_vma_unpin_fence(vma);
1980 err_unpin:
1981         __i915_vma_unpin(vma);
1982 err_unlock:
1983         mutex_unlock(&dev->struct_mutex);
1984 err_rpm:
1985         intel_runtime_pm_put(dev_priv, wakeref);
1986         i915_gem_object_unpin_pages(obj);
1987 err:
1988         switch (ret) {
1989         case -EIO:
1990                 /*
1991                  * We eat errors when the gpu is terminally wedged to avoid
1992                  * userspace unduly crashing (gl has no provisions for mmaps to
1993                  * fail). But any other -EIO isn't ours (e.g. swap in failure)
1994                  * and so needs to be reported.
1995                  */
1996                 if (!i915_terminally_wedged(&dev_priv->gpu_error))
1997                         return VM_FAULT_SIGBUS;
1998                 /* else: fall through */
1999         case -EAGAIN:
2000                 /*
2001                  * EAGAIN means the gpu is hung and we'll wait for the error
2002                  * handler to reset everything when re-faulting in
2003                  * i915_mutex_lock_interruptible.
2004                  */
2005         case 0:
2006         case -ERESTARTSYS:
2007         case -EINTR:
2008         case -EBUSY:
2009                 /*
2010                  * EBUSY is ok: this just means that another thread
2011                  * already did the job.
2012                  */
2013                 return VM_FAULT_NOPAGE;
2014         case -ENOMEM:
2015                 return VM_FAULT_OOM;
2016         case -ENOSPC:
2017         case -EFAULT:
2018                 return VM_FAULT_SIGBUS;
2019         default:
2020                 WARN_ONCE(ret, "unhandled error in i915_gem_fault: %i\n", ret);
2021                 return VM_FAULT_SIGBUS;
2022         }
2023 }
2024
2025 static void __i915_gem_object_release_mmap(struct drm_i915_gem_object *obj)
2026 {
2027         struct i915_vma *vma;
2028
2029         GEM_BUG_ON(!obj->userfault_count);
2030
2031         obj->userfault_count = 0;
2032         list_del(&obj->userfault_link);
2033         drm_vma_node_unmap(&obj->base.vma_node,
2034                            obj->base.dev->anon_inode->i_mapping);
2035
2036         for_each_ggtt_vma(vma, obj)
2037                 i915_vma_unset_userfault(vma);
2038 }
2039
2040 /**
2041  * i915_gem_release_mmap - remove physical page mappings
2042  * @obj: obj in question
2043  *
2044  * Preserve the reservation of the mmapping with the DRM core code, but
2045  * relinquish ownership of the pages back to the system.
2046  *
2047  * It is vital that we remove the page mapping if we have mapped a tiled
2048  * object through the GTT and then lose the fence register due to
2049  * resource pressure. Similarly if the object has been moved out of the
2050  * aperture, then pages mapped into userspace must be revoked. Removing the
2051  * mapping will then trigger a page fault on the next user access, allowing
2052  * fixup by i915_gem_fault().
2053  */
2054 void
2055 i915_gem_release_mmap(struct drm_i915_gem_object *obj)
2056 {
2057         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2058         intel_wakeref_t wakeref;
2059
2060         /* Serialisation between user GTT access and our code depends upon
2061          * revoking the CPU's PTE whilst the mutex is held. The next user
2062          * pagefault then has to wait until we release the mutex.
2063          *
2064          * Note that RPM complicates this somewhat by adding an additional
2065          * requirement that operations to the GGTT be made holding the RPM
2066          * wakeref.
2067          */
2068         lockdep_assert_held(&i915->drm.struct_mutex);
2069         wakeref = intel_runtime_pm_get(i915);
2070
2071         if (!obj->userfault_count)
2072                 goto out;
2073
2074         __i915_gem_object_release_mmap(obj);
2075
2076         /* Ensure that the CPU's PTEs are revoked and there are no outstanding
2077          * memory transactions from userspace before we return. The TLB
2078          * flushing implied by changing the PTEs above *should* be
2079          * sufficient; an extra barrier here just provides us with a bit
2080          * of paranoid documentation about our requirement to serialise
2081          * memory writes before touching registers / GSM.
2082          */
2083         wmb();
2084
2085 out:
2086         intel_runtime_pm_put(i915, wakeref);
2087 }
2088
2089 void i915_gem_runtime_suspend(struct drm_i915_private *dev_priv)
2090 {
2091         struct drm_i915_gem_object *obj, *on;
2092         int i;
2093
2094         /*
2095          * Only called during RPM suspend. All users of the userfault_list
2096          * must be holding an RPM wakeref to ensure that this can not
2097          * run concurrently with themselves (and use the struct_mutex for
2098          * protection between themselves).
2099          */
2100
2101         list_for_each_entry_safe(obj, on,
2102                                  &dev_priv->mm.userfault_list, userfault_link)
2103                 __i915_gem_object_release_mmap(obj);
2104
2105         /* The fences will be lost when the device powers down. If any were
2106          * in use by hardware (i.e. they are pinned), we should not be powering
2107          * down! All other fences will be reacquired by the user upon waking.
2108          */
2109         for (i = 0; i < dev_priv->num_fence_regs; i++) {
2110                 struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i];
2111
2112                 /* Ideally we want to assert that the fence register is not
2113                  * live at this point (i.e. that no piece of code will be
2114                  * trying to write through fence + GTT, as that not only violates
2115                  * our tracking of activity and associated locking/barriers,
2116                  * but is also illegal given that the hw is powered down).
2117                  *
2118                  * Previously we used reg->pin_count as a "liveness" indicator.
2119                  * That is not sufficient, and we need a more fine-grained
2120                  * tool if we want to have a sanity check here.
2121                  */
2122
2123                 if (!reg->vma)
2124                         continue;
2125
2126                 GEM_BUG_ON(i915_vma_has_userfault(reg->vma));
2127                 reg->dirty = true;
2128         }
2129 }
2130
2131 static int i915_gem_object_create_mmap_offset(struct drm_i915_gem_object *obj)
2132 {
2133         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2134         int err;
2135
2136         err = drm_gem_create_mmap_offset(&obj->base);
2137         if (likely(!err))
2138                 return 0;
2139
2140         /* Attempt to reap some mmap space from dead objects */
2141         do {
2142                 err = i915_gem_wait_for_idle(dev_priv,
2143                                              I915_WAIT_INTERRUPTIBLE,
2144                                              MAX_SCHEDULE_TIMEOUT);
2145                 if (err)
2146                         break;
2147
2148                 i915_gem_drain_freed_objects(dev_priv);
2149                 err = drm_gem_create_mmap_offset(&obj->base);
2150                 if (!err)
2151                         break;
2152
2153         } while (flush_delayed_work(&dev_priv->gt.retire_work));
2154
2155         return err;
2156 }
2157
2158 static void i915_gem_object_free_mmap_offset(struct drm_i915_gem_object *obj)
2159 {
2160         drm_gem_free_mmap_offset(&obj->base);
2161 }
2162
2163 int
2164 i915_gem_mmap_gtt(struct drm_file *file,
2165                   struct drm_device *dev,
2166                   u32 handle,
2167                   u64 *offset)
2168 {
2169         struct drm_i915_gem_object *obj;
2170         int ret;
2171
2172         obj = i915_gem_object_lookup(file, handle);
2173         if (!obj)
2174                 return -ENOENT;
2175
2176         ret = i915_gem_object_create_mmap_offset(obj);
2177         if (ret == 0)
2178                 *offset = drm_vma_node_offset_addr(&obj->base.vma_node);
2179
2180         i915_gem_object_put(obj);
2181         return ret;
2182 }
2183
2184 /**
2185  * i915_gem_mmap_gtt_ioctl - prepare an object for GTT mmap'ing
2186  * @dev: DRM device
2187  * @data: GTT mapping ioctl data
2188  * @file: drm file
2189  *
2190  * Simply returns the fake offset to userspace so it can mmap it.
2191  * The mmap call will end up in drm_gem_mmap(), which will set things
2192  * up so we can get faults in the handler above.
2193  *
2194  * The fault handler will take care of binding the object into the GTT
2195  * (since it may have been evicted to make room for something), allocating
2196  * a fence register, and mapping the appropriate aperture address into
2197  * userspace.
2198  */
2199 int
2200 i915_gem_mmap_gtt_ioctl(struct drm_device *dev, void *data,
2201                         struct drm_file *file)
2202 {
2203         struct drm_i915_gem_mmap_gtt *args = data;
2204
2205         return i915_gem_mmap_gtt(file, dev, args->handle, &args->offset);
2206 }
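
/*
 * Illustrative userspace sketch (not part of this file): the two-step GTT
 * mmap dance -- fetch the fake offset here, then mmap() the DRM fd at that
 * offset so subsequent faults are serviced by i915_gem_fault() above. The
 * fd, handle and obj_size are assumed to exist; error handling is elided.
 *
 *	struct drm_i915_gem_mmap_gtt arg = { .handle = handle };
 *	void *ptr = MAP_FAILED;
 *
 *	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_MMAP_GTT, &arg) == 0)
 *		ptr = mmap(NULL, obj_size, PROT_READ | PROT_WRITE,
 *			   MAP_SHARED, fd, arg.offset);
 */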
2207
2208 /* Immediately discard the backing storage */
2209 static void
2210 i915_gem_object_truncate(struct drm_i915_gem_object *obj)
2211 {
2212         i915_gem_object_free_mmap_offset(obj);
2213
2214         if (obj->base.filp == NULL)
2215                 return;
2216
2217         /* Our goal here is to return as much of the memory as
2218          * possible back to the system, as we are called from OOM.
2219          * To do this we must instruct the shmfs to drop all of its
2220          * backing pages, *now*.
2221          */
2222         shmem_truncate_range(file_inode(obj->base.filp), 0, (loff_t)-1);
2223         obj->mm.madv = __I915_MADV_PURGED;
2224         obj->mm.pages = ERR_PTR(-EFAULT);
2225 }
2226
2227 /* Try to discard unwanted pages */
2228 void __i915_gem_object_invalidate(struct drm_i915_gem_object *obj)
2229 {
2230         struct address_space *mapping;
2231
2232         lockdep_assert_held(&obj->mm.lock);
2233         GEM_BUG_ON(i915_gem_object_has_pages(obj));
2234
2235         switch (obj->mm.madv) {
2236         case I915_MADV_DONTNEED:
2237                 i915_gem_object_truncate(obj);
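                /* fall through */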
2238         case __I915_MADV_PURGED:
2239                 return;
2240         }
2241
2242         if (obj->base.filp == NULL)
2243                 return;
2244
2245         mapping = obj->base.filp->f_mapping;
2246         invalidate_mapping_pages(mapping, 0, (loff_t)-1);
2247 }
2248
2249 /*
2250  * Move pages to appropriate lru and release the pagevec, decrementing the
2251  * ref count of those pages.
2252  */
2253 static void check_release_pagevec(struct pagevec *pvec)
2254 {
2255         check_move_unevictable_pages(pvec);
2256         __pagevec_release(pvec);
2257         cond_resched();
2258 }
2259
2260 static void
2261 i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj,
2262                               struct sg_table *pages)
2263 {
2264         struct sgt_iter sgt_iter;
2265         struct pagevec pvec;
2266         struct page *page;
2267
2268         __i915_gem_object_release_shmem(obj, pages, true);
2269
2270         i915_gem_gtt_finish_pages(obj, pages);
2271
2272         if (i915_gem_object_needs_bit17_swizzle(obj))
2273                 i915_gem_object_save_bit_17_swizzle(obj, pages);
2274
2275         mapping_clear_unevictable(file_inode(obj->base.filp)->i_mapping);
2276
2277         pagevec_init(&pvec);
2278         for_each_sgt_page(page, sgt_iter, pages) {
2279                 if (obj->mm.dirty)
2280                         set_page_dirty(page);
2281
2282                 if (obj->mm.madv == I915_MADV_WILLNEED)
2283                         mark_page_accessed(page);
2284
2285                 if (!pagevec_add(&pvec, page))
2286                         check_release_pagevec(&pvec);
2287         }
2288         if (pagevec_count(&pvec))
2289                 check_release_pagevec(&pvec);
2290         obj->mm.dirty = false;
2291
2292         sg_free_table(pages);
2293         kfree(pages);
2294 }
2295
2296 static void __i915_gem_object_reset_page_iter(struct drm_i915_gem_object *obj)
2297 {
2298         struct radix_tree_iter iter;
2299         void __rcu **slot;
2300
2301         rcu_read_lock();
2302         radix_tree_for_each_slot(slot, &obj->mm.get_page.radix, &iter, 0)
2303                 radix_tree_delete(&obj->mm.get_page.radix, iter.index);
2304         rcu_read_unlock();
2305 }
2306
2307 static struct sg_table *
2308 __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
2309 {
2310         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2311         struct sg_table *pages;
2312
2313         pages = fetch_and_zero(&obj->mm.pages);
2314         if (IS_ERR_OR_NULL(pages))
2315                 return pages;
2316
2317         spin_lock(&i915->mm.obj_lock);
2318         list_del(&obj->mm.link);
2319         spin_unlock(&i915->mm.obj_lock);
2320
2321         if (obj->mm.mapping) {
2322                 void *ptr;
2323
2324                 ptr = page_mask_bits(obj->mm.mapping);
2325                 if (is_vmalloc_addr(ptr))
2326                         vunmap(ptr);
2327                 else
2328                         kunmap(kmap_to_page(ptr));
2329
2330                 obj->mm.mapping = NULL;
2331         }
2332
2333         __i915_gem_object_reset_page_iter(obj);
2334         obj->mm.page_sizes.phys = obj->mm.page_sizes.sg = 0;
2335
2336         return pages;
2337 }
2338
2339 int __i915_gem_object_put_pages(struct drm_i915_gem_object *obj,
2340                                 enum i915_mm_subclass subclass)
2341 {
2342         struct sg_table *pages;
2343         int ret;
2344
2345         if (i915_gem_object_has_pinned_pages(obj))
2346                 return -EBUSY;
2347
2348         GEM_BUG_ON(obj->bind_count);
2349
2350         /* May be called by shrinker from within get_pages() (on another bo) */
2351         mutex_lock_nested(&obj->mm.lock, subclass);
2352         if (unlikely(atomic_read(&obj->mm.pages_pin_count))) {
2353                 ret = -EBUSY;
2354                 goto unlock;
2355         }
2356
2357         /*
2358          * ->put_pages might need to allocate memory for the bit17 swizzle
2359          * array, hence protect them from being reaped by removing them from gtt
2360          * lists early.
2361          */
2362         pages = __i915_gem_object_unset_pages(obj);
2363
2364         /*
2365          * XXX Temporary hijinx to avoid updating all backends to handle
2366          * NULL pages. In the future, when we have more asynchronous
2367          * get_pages backends we should be better able to handle the
2368          * cancellation of the async task in a more uniform manner.
2369          */
2370         if (!pages && !i915_gem_object_needs_async_cancel(obj))
2371                 pages = ERR_PTR(-EINVAL);
2372
2373         if (!IS_ERR(pages))
2374                 obj->ops->put_pages(obj, pages);
2375
2376         ret = 0;
2377 unlock:
2378         mutex_unlock(&obj->mm.lock);
2379
2380         return ret;
2381 }
2382
2383 bool i915_sg_trim(struct sg_table *orig_st)
2384 {
2385         struct sg_table new_st;
2386         struct scatterlist *sg, *new_sg;
2387         unsigned int i;
2388
2389         if (orig_st->nents == orig_st->orig_nents)
2390                 return false;
2391
2392         if (sg_alloc_table(&new_st, orig_st->nents, GFP_KERNEL | __GFP_NOWARN))
2393                 return false;
2394
2395         new_sg = new_st.sgl;
2396         for_each_sg(orig_st->sgl, sg, orig_st->nents, i) {
2397                 sg_set_page(new_sg, sg_page(sg), sg->length, 0);
2398                 sg_dma_address(new_sg) = sg_dma_address(sg);
2399                 sg_dma_len(new_sg) = sg_dma_len(sg);
2400
2401                 new_sg = sg_next(new_sg);
2402         }
2403         GEM_BUG_ON(new_sg); /* Should walk exactly nents and hit the end */
2404
2405         sg_free_table(orig_st);
2406
2407         *orig_st = new_st;
2408         return true;
2409 }
2410
2411 static int i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj)
2412 {
2413         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
2414         const unsigned long page_count = obj->base.size / PAGE_SIZE;
2415         unsigned long i;
2416         struct address_space *mapping;
2417         struct sg_table *st;
2418         struct scatterlist *sg;
2419         struct sgt_iter sgt_iter;
2420         struct page *page;
2421         unsigned long last_pfn = 0;     /* suppress gcc warning */
2422         unsigned int max_segment = i915_sg_segment_size();
2423         unsigned int sg_page_sizes;
2424         struct pagevec pvec;
2425         gfp_t noreclaim;
2426         int ret;
2427
2428         /*
2429          * Assert that the object is not currently in any GPU domain. As it
2430          * wasn't in the GTT, there shouldn't be any way it could have been in
2431          * a GPU cache
2432          */
2433         GEM_BUG_ON(obj->read_domains & I915_GEM_GPU_DOMAINS);
2434         GEM_BUG_ON(obj->write_domain & I915_GEM_GPU_DOMAINS);
2435
2436         /*
2437          * If there's no chance of allocating enough pages for the whole
2438          * object, bail early.
2439          */
2440         if (page_count > totalram_pages())
2441                 return -ENOMEM;
2442
2443         st = kmalloc(sizeof(*st), GFP_KERNEL);
2444         if (st == NULL)
2445                 return -ENOMEM;
2446
2447 rebuild_st:
2448         if (sg_alloc_table(st, page_count, GFP_KERNEL)) {
2449                 kfree(st);
2450                 return -ENOMEM;
2451         }
2452
2453         /*
2454          * Get the list of pages out of our struct file.  They'll be pinned
2455          * at this point until we release them.
2456          *
2457          * Fail silently without starting the shrinker
2458          */
2459         mapping = obj->base.filp->f_mapping;
2460         mapping_set_unevictable(mapping);
2461         noreclaim = mapping_gfp_constraint(mapping, ~__GFP_RECLAIM);
2462         noreclaim |= __GFP_NORETRY | __GFP_NOWARN;
2463
2464         sg = st->sgl;
2465         st->nents = 0;
2466         sg_page_sizes = 0;
2467         for (i = 0; i < page_count; i++) {
2468                 const unsigned int shrink[] = {
2469                         I915_SHRINK_BOUND | I915_SHRINK_UNBOUND | I915_SHRINK_PURGEABLE,
2470                         0,
2471                 }, *s = shrink;
2472                 gfp_t gfp = noreclaim;
2473
2474                 do {
2475                         cond_resched();
2476                         page = shmem_read_mapping_page_gfp(mapping, i, gfp);
2477                         if (likely(!IS_ERR(page)))
2478                                 break;
2479
2480                         if (!*s) {
2481                                 ret = PTR_ERR(page);
2482                                 goto err_sg;
2483                         }
2484
2485                         i915_gem_shrink(dev_priv, 2 * page_count, NULL, *s++);
2486
2487                         /*
2488                          * We've tried hard to allocate the memory by reaping
2489                          * our own buffer, now let the real VM do its job and
2490                          * go down in flames if truly OOM.
2491                          *
2492                          * However, since graphics tend to be disposable,
2493                          * defer the oom here by reporting the ENOMEM back
2494                          * to userspace.
2495                          */
2496                         if (!*s) {
2497                                 /* reclaim and warn, but no oom */
2498                                 gfp = mapping_gfp_mask(mapping);
2499
2500                                 /*
2501                                  * Our bo are always dirty and so we require
2502                                  * kswapd to reclaim our pages (direct reclaim
2503                                  * does not effectively begin pageout of our
2504                                  * buffers on its own). However, direct reclaim
2505                                  * only waits for kswapd when under allocation
2506                                  * congestion. So as a result __GFP_RECLAIM is
2507                                  * unreliable and fails to actually reclaim our
2508                                  * dirty pages -- unless you try over and over
2509                                  * again with !__GFP_NORETRY. However, we still
2510                                  * want to fail this allocation rather than
2511                                  * trigger the out-of-memory killer and for
2512                                  * this we want __GFP_RETRY_MAYFAIL.
2513                                  */
2514                                 gfp |= __GFP_RETRY_MAYFAIL;
2515                         }
2516                 } while (1);
2517
2518                 if (!i ||
2519                     sg->length >= max_segment ||
2520                     page_to_pfn(page) != last_pfn + 1) {
2521                         if (i) {
2522                                 sg_page_sizes |= sg->length;
2523                                 sg = sg_next(sg);
2524                         }
2525                         st->nents++;
2526                         sg_set_page(sg, page, PAGE_SIZE, 0);
2527                 } else {
2528                         sg->length += PAGE_SIZE;
2529                 }
2530                 last_pfn = page_to_pfn(page);
2531
2532                 /* Check that the i965g/gm workaround works. */
2533                 WARN_ON((gfp & __GFP_DMA32) && (last_pfn >= 0x00100000UL));
2534         }
2535         if (sg) { /* loop terminated early; short sg table */
2536                 sg_page_sizes |= sg->length;
2537                 sg_mark_end(sg);
2538         }
2539
2540         /* Trim unused sg entries to avoid wasting memory. */
2541         i915_sg_trim(st);
2542
2543         ret = i915_gem_gtt_prepare_pages(obj, st);
2544         if (ret) {
2545                 /*
2546                  * DMA remapping failed? One possible cause is that
2547                  * it could not reserve enough large entries, asking
2548                  * for PAGE_SIZE chunks instead may be helpful.
2549                  */
2550                 if (max_segment > PAGE_SIZE) {
2551                         for_each_sgt_page(page, sgt_iter, st)
2552                                 put_page(page);
2553                         sg_free_table(st);
2554
2555                         max_segment = PAGE_SIZE;
2556                         goto rebuild_st;
2557                 } else {
2558                         dev_warn(&dev_priv->drm.pdev->dev,
2559                                  "Failed to DMA remap %lu pages\n",
2560                                  page_count);
2561                         goto err_pages;
2562                 }
2563         }
2564
2565         if (i915_gem_object_needs_bit17_swizzle(obj))
2566                 i915_gem_object_do_bit_17_swizzle(obj, st);
2567
2568         __i915_gem_object_set_pages(obj, st, sg_page_sizes);
2569
2570         return 0;
2571
2572 err_sg:
2573         sg_mark_end(sg);
2574 err_pages:
2575         mapping_clear_unevictable(mapping);
2576         pagevec_init(&pvec);
2577         for_each_sgt_page(page, sgt_iter, st) {
2578                 if (!pagevec_add(&pvec, page))
2579                         check_release_pagevec(&pvec);
2580         }
2581         if (pagevec_count(&pvec))
2582                 check_release_pagevec(&pvec);
2583         sg_free_table(st);
2584         kfree(st);
2585
2586         /*
2587          * shmemfs first checks if there is enough memory to allocate the page
2588          * and reports ENOSPC should there be insufficient, along with the usual
2589          * ENOMEM for a genuine allocation failure.
2590          *
2591          * We use ENOSPC in our driver to mean that we have run out of aperture
2592          * space and so want to translate the error from shmemfs back to our
2593          * usual understanding of ENOMEM.
2594          */
2595         if (ret == -ENOSPC)
2596                 ret = -ENOMEM;
2597
2598         return ret;
2599 }
2600
2601 void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
2602                                  struct sg_table *pages,
2603                                  unsigned int sg_page_sizes)
2604 {
2605         struct drm_i915_private *i915 = to_i915(obj->base.dev);
2606         unsigned long supported = INTEL_INFO(i915)->page_sizes;
2607         int i;
2608
2609         lockdep_assert_held(&obj->mm.lock);
2610
2611         obj->mm.get_page.sg_pos = pages->sgl;
2612         obj->mm.get_page.sg_idx = 0;
2613
2614         obj->mm.pages = pages;
2615
2616         if (i915_gem_object_is_tiled(obj) &&
2617             i915->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
2618                 GEM_BUG_ON(obj->mm.quirked);
2619                 __i915_gem_object_pin_pages(obj);
2620                 obj->mm.quirked = true;
2621         }
2622
2623         GEM_BUG_ON(!sg_page_sizes);
2624         obj->mm.page_sizes.phys = sg_page_sizes;
2625
2626         /*
2627          * Calculate the supported page-sizes which fit into the given
2628          * sg_page_sizes. This will give us the page-sizes which we may be able
2629          * to use opportunistically when later inserting into the GTT. For
2630          * example if phys=2G, then in theory we should be able to use 1G, 2M,
2631          * 64K or 4K pages, although in practice this will depend on a number of
2632          * other factors.
2633          */
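        /*
         * Concretely, sg ends up containing every supported page size that
         * is no larger than the biggest contiguous chunk recorded in phys,
         * e.g. phys = 2M | 4K with 4K/64K/2M supported gives sg = 2M | 64K | 4K.
         */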
2634         obj->mm.page_sizes.sg = 0;
2635         for_each_set_bit(i, &supported, ilog2(I915_GTT_MAX_PAGE_SIZE) + 1) {
2636                 if (obj->mm.page_sizes.phys & ~0u << i)
2637                         obj->mm.page_sizes.sg |= BIT(i);
2638         }
2639         GEM_BUG_ON(!HAS_PAGE_SIZES(i915, obj->mm.page_sizes.sg));
2640
2641         spin_lock(&i915->mm.obj_lock);
2642         list_add(&obj->mm.link, &i915->mm.unbound_list);
2643         spin_unlock(&i915->mm.obj_lock);
2644 }
2645
2646 static int ____i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2647 {
2648         int err;
2649
2650         if (unlikely(obj->mm.madv != I915_MADV_WILLNEED)) {
2651                 DRM_DEBUG("Attempting to obtain a purgeable object\n");
2652                 return -EFAULT;
2653         }
2654
2655         err = obj->ops->get_pages(obj);
2656         GEM_BUG_ON(!err && !i915_gem_object_has_pages(obj));
2657
2658         return err;
2659 }
2660
2661 /* Ensure that the associated pages are gathered from the backing storage
2662  * and pinned into our object. i915_gem_object_pin_pages() may be called
2663  * multiple times before they are released by a single call to
2664  * i915_gem_object_unpin_pages() - once the pages are no longer referenced
2665  * either as a result of memory pressure (reaping pages under the shrinker)
2666  * or as the object is itself released.
2667  */
2668 int __i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
2669 {
2670         int err;
2671
2672         err = mutex_lock_interruptible(&obj->mm.lock);
2673         if (err)
2674                 return err;
2675
2676         if (unlikely(!i915_gem_object_has_pages(obj))) {
2677                 GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2678
2679                 err = ____i915_gem_object_get_pages(obj);
2680                 if (err)
2681                         goto unlock;
2682
2683                 smp_mb__before_atomic();
2684         }
2685         atomic_inc(&obj->mm.pages_pin_count);
2686
2687 unlock:
2688         mutex_unlock(&obj->mm.lock);
2689         return err;
2690 }
2691
2692 /* The 'mapping' part of i915_gem_object_pin_map() below */
2693 static void *i915_gem_object_map(const struct drm_i915_gem_object *obj,
2694                                  enum i915_map_type type)
2695 {
2696         unsigned long n_pages = obj->base.size >> PAGE_SHIFT;
2697         struct sg_table *sgt = obj->mm.pages;
2698         struct sgt_iter sgt_iter;
2699         struct page *page;
2700         struct page *stack_pages[32];
2701         struct page **pages = stack_pages;
2702         unsigned long i = 0;
2703         pgprot_t pgprot;
2704         void *addr;
2705
2706         /* A single page can always be kmapped */
2707         if (n_pages == 1 && type == I915_MAP_WB)
2708                 return kmap(sg_page(sgt->sgl));
2709
2710         if (n_pages > ARRAY_SIZE(stack_pages)) {
2711                 /* Too big for stack -- allocate temporary array instead */
2712                 pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_KERNEL);
2713                 if (!pages)
2714                         return NULL;
2715         }
2716
2717         for_each_sgt_page(page, sgt_iter, sgt)
2718                 pages[i++] = page;
2719
2720         /* Check that we have the expected number of pages */
2721         GEM_BUG_ON(i != n_pages);
2722
2723         switch (type) {
2724         default:
2725                 MISSING_CASE(type);
2726                 /* fallthrough to use PAGE_KERNEL anyway */
2727         case I915_MAP_WB:
2728                 pgprot = PAGE_KERNEL;
2729                 break;
2730         case I915_MAP_WC:
2731                 pgprot = pgprot_writecombine(PAGE_KERNEL_IO);
2732                 break;
2733         }
2734         addr = vmap(pages, n_pages, 0, pgprot);
2735
2736         if (pages != stack_pages)
2737                 kvfree(pages);
2738
2739         return addr;
2740 }
2741
2742 /* get, pin, and map the pages of the object into kernel space */
2743 void *i915_gem_object_pin_map(struct drm_i915_gem_object *obj,
2744                               enum i915_map_type type)
2745 {
2746         enum i915_map_type has_type;
2747         bool pinned;
2748         void *ptr;
2749         int ret;
2750
2751         if (unlikely(!i915_gem_object_has_struct_page(obj)))
2752                 return ERR_PTR(-ENXIO);
2753
2754         ret = mutex_lock_interruptible(&obj->mm.lock);
2755         if (ret)
2756                 return ERR_PTR(ret);
2757
2758         pinned = !(type & I915_MAP_OVERRIDE);
2759         type &= ~I915_MAP_OVERRIDE;
2760
2761         if (!atomic_inc_not_zero(&obj->mm.pages_pin_count)) {
2762                 if (unlikely(!i915_gem_object_has_pages(obj))) {
2763                         GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
2764
2765                         ret = ____i915_gem_object_get_pages(obj);
2766                         if (ret)
2767                                 goto err_unlock;
2768
2769                         smp_mb__before_atomic();
2770                 }
2771                 atomic_inc(&obj->mm.pages_pin_count);
2772                 pinned = false;
2773         }
2774         GEM_BUG_ON(!i915_gem_object_has_pages(obj));
2775
2776         ptr = page_unpack_bits(obj->mm.mapping, &has_type);
2777         if (ptr && has_type != type) {
2778                 if (pinned) {
2779                         ret = -EBUSY;
2780                         goto err_unpin;
2781                 }
2782
2783                 if (is_vmalloc_addr(ptr))
2784                         vunmap(ptr);
2785                 else
2786                         kunmap(kmap_to_page(ptr));
2787
2788                 ptr = obj->mm.mapping = NULL;
2789         }
2790
2791         if (!ptr) {
2792                 ptr = i915_gem_object_map(obj, type);
2793                 if (!ptr) {
2794                         ret = -ENOMEM;
2795                         goto err_unpin;
2796                 }
2797
2798                 obj->mm.mapping = page_pack_bits(ptr, type);
2799         }
2800
2801 out_unlock:
2802         mutex_unlock(&obj->mm.lock);
2803         return ptr;
2804
2805 err_unpin:
2806         atomic_dec(&obj->mm.pages_pin_count);
2807 err_unlock:
2808         ptr = ERR_PTR(ret);
2809         goto out_unlock;
2810 }
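
/*
 * Minimal usage sketch for the helper above (assumptions: the caller holds a
 * reference on the object, handles errors and performs any domain flushing
 * it needs; data and len are placeholders):
 *
 *	void *vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB);
 *
 *	if (IS_ERR(vaddr))
 *		return PTR_ERR(vaddr);
 *	memcpy(vaddr, data, len);
 *	i915_gem_object_unpin_map(obj);
 */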
2811
2812 static int
2813 i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj,
2814                            const struct drm_i915_gem_pwrite *arg)
2815 {
2816         struct address_space *mapping = obj->base.filp->f_mapping;
2817         char __user *user_data = u64_to_user_ptr(arg->data_ptr);
2818         u64 remain, offset;
2819         unsigned int pg;
2820
2821         /* Before we instantiate/pin the backing store for our use, we
2822          * can prepopulate the shmemfs filp efficiently using a write into
2823          * the pagecache. We avoid the penalty of instantiating all the
2824          * pages, important if the user is just writing to a few and never
2825          * uses the object on the GPU, and using a direct write into shmemfs
2826          * allows it to avoid the cost of retrieving a page (either swapin
2827          * or clearing-before-use) before it is overwritten.
2828          */
2829         if (i915_gem_object_has_pages(obj))
2830                 return -ENODEV;
2831
2832         if (obj->mm.madv != I915_MADV_WILLNEED)
2833                 return -EFAULT;
2834
2835         /* Before the pages are instantiated the object is treated as being
2836          * in the CPU domain. The pages will be clflushed as required before
2837          * use, and we can freely write into the pages directly. If userspace
2838          * races pwrite with any other operation, corruption will ensue -
2839          * that is userspace's prerogative!
2840          */
2841
2842         remain = arg->size;
2843         offset = arg->offset;
2844         pg = offset_in_page(offset);
2845
2846         do {
2847                 unsigned int len, unwritten;
2848                 struct page *page;
2849                 void *data, *vaddr;
2850                 int err;
2851
2852                 len = PAGE_SIZE - pg;
2853                 if (len > remain)
2854                         len = remain;
2855
2856                 err = pagecache_write_begin(obj->base.filp, mapping,
2857                                             offset, len, 0,
2858                                             &page, &data);
2859                 if (err < 0)
2860                         return err;
2861
2862                 vaddr = kmap(page);
2863                 unwritten = copy_from_user(vaddr + pg, user_data, len);
2864                 kunmap(page);
2865
2866                 err = pagecache_write_end(obj->base.filp, mapping,
2867                                           offset, len, len - unwritten,
2868                                           page, data);
2869                 if (err < 0)
2870                         return err;
2871
2872                 if (unwritten)
2873                         return -EFAULT;
2874
2875                 remain -= len;
2876                 user_data += len;
2877                 offset += len;
2878                 pg = 0;
2879         } while (remain);
2880
2881         return 0;
2882 }
2883
2884 static bool match_ring(struct i915_request *rq)
2885 {
2886         struct drm_i915_private *dev_priv = rq->i915;
2887         u32 ring = I915_READ(RING_START(rq->engine->mmio_base));
2888
2889         return ring == i915_ggtt_offset(rq->ring->vma);
2890 }
2891
2892 struct i915_request *
2893 i915_gem_find_active_request(struct intel_engine_cs *engine)
2894 {
2895         struct i915_request *request, *active = NULL;
2896         unsigned long flags;
2897
2898         /*
2899          * We are called by the error capture, by reset and to dump engine
2900          * state at random points in time. In particular, note that none of these is
2901          * crucially ordered with an interrupt. After a hang, the GPU is dead
2902          * and we assume that no more writes can happen (we waited long enough
2903          * for all writes that were in transaction to be flushed) - adding an
2904          * extra delay for a recent interrupt is pointless. Hence, we do
2905          * not need an engine->irq_seqno_barrier() before the seqno reads.
2906          * At all other times, we must assume the GPU is still running, but
2907          * we only care about the snapshot of this moment.
2908          */
2909         spin_lock_irqsave(&engine->timeline.lock, flags);
2910         list_for_each_entry(request, &engine->timeline.requests, link) {
2911                 if (i915_request_completed(request))
2912                         continue;
2913
2914                 if (!i915_request_started(request))
2915                         break;
2916
2917                 /* More than one preemptible request may match! */
2918                 if (!match_ring(request))
2919                         break;
2920
2921                 active = request;
2922                 break;
2923         }
2924         spin_unlock_irqrestore(&engine->timeline.lock, flags);
2925
2926         return active;
2927 }
2928
2929 static void
2930 i915_gem_retire_work_handler(struct work_struct *work)
2931 {
2932         struct drm_i915_private *dev_priv =
2933                 container_of(work, typeof(*dev_priv), gt.retire_work.work);
2934         struct drm_device *dev = &dev_priv->drm;
2935
2936         /* Come back later if the device is busy... */
2937         if (mutex_trylock(&dev->struct_mutex)) {
2938                 i915_retire_requests(dev_priv);
2939                 mutex_unlock(&dev->struct_mutex);
2940         }
2941
2942         /*
2943          * Keep the retire handler running until we are finally idle.
2944          * We do not need to do this test under locking as in the worst-case
2945          * we queue the retire worker once too often.
2946          */
2947         if (READ_ONCE(dev_priv->gt.awake))
2948                 queue_delayed_work(dev_priv->wq,
2949                                    &dev_priv->gt.retire_work,
2950                                    round_jiffies_up_relative(HZ));
2951 }
2952
2953 static void shrink_caches(struct drm_i915_private *i915)
2954 {
2955         /*
2956          * kmem_cache_shrink() discards empty slabs and reorders partially
2957          * filled slabs to prioritise allocating from the mostly full slabs,
2958          * with the aim of reducing fragmentation.
2959          */
2960         kmem_cache_shrink(i915->priorities);
2961         kmem_cache_shrink(i915->dependencies);
2962         kmem_cache_shrink(i915->requests);
2963         kmem_cache_shrink(i915->luts);
2964         kmem_cache_shrink(i915->vmas);
2965         kmem_cache_shrink(i915->objects);
2966 }
2967
2968 struct sleep_rcu_work {
2969         union {
2970                 struct rcu_head rcu;
2971                 struct work_struct work;
2972         };
2973         struct drm_i915_private *i915;
2974         unsigned int epoch;
2975 };
2976
2977 static inline bool
2978 same_epoch(struct drm_i915_private *i915, unsigned int epoch)
2979 {
2980         /*
2981          * There is a small chance that the epoch wrapped since we started
2982          * sleeping. If we assume that epoch is at least a u32, then it will
2983          * take at least 2^32 * 100ms for it to wrap, or roughly 13 years.
2984          */
2985         return epoch == READ_ONCE(i915->gt.epoch);
2986 }
2987
2988 static void __sleep_work(struct work_struct *work)
2989 {
2990         struct sleep_rcu_work *s = container_of(work, typeof(*s), work);
2991         struct drm_i915_private *i915 = s->i915;
2992         unsigned int epoch = s->epoch;
2993
2994         kfree(s);
2995         if (same_epoch(i915, epoch))
2996                 shrink_caches(i915);
2997 }
2998
2999 static void __sleep_rcu(struct rcu_head *rcu)
3000 {
3001         struct sleep_rcu_work *s = container_of(rcu, typeof(*s), rcu);
3002         struct drm_i915_private *i915 = s->i915;
3003
3004         destroy_rcu_head(&s->rcu);
3005
3006         if (same_epoch(i915, s->epoch)) {
3007                 INIT_WORK(&s->work, __sleep_work);
3008                 queue_work(i915->wq, &s->work);
3009         } else {
3010                 kfree(s);
3011         }
3012 }
3013
3014 static inline bool
3015 new_requests_since_last_retire(const struct drm_i915_private *i915)
3016 {
3017         return (READ_ONCE(i915->gt.active_requests) ||
3018                 work_pending(&i915->gt.idle_work.work));
3019 }
3020
3021 static void assert_kernel_context_is_current(struct drm_i915_private *i915)
3022 {
3023         struct intel_engine_cs *engine;
3024         enum intel_engine_id id;
3025
3026         if (i915_terminally_wedged(&i915->gpu_error))
3027                 return;
3028
3029         GEM_BUG_ON(i915->gt.active_requests);
3030         for_each_engine(engine, i915, id) {
3031                 GEM_BUG_ON(__i915_active_request_peek(&engine->timeline.last_request));
3032                 GEM_BUG_ON(engine->last_retired_context !=
3033                            to_intel_context(i915->kernel_context, engine));
3034         }
3035 }
3036
3037 static void
3038 i915_gem_idle_work_handler(struct work_struct *work)
3039 {
3040         struct drm_i915_private *dev_priv =
3041                 container_of(work, typeof(*dev_priv), gt.idle_work.work);
3042         unsigned int epoch = I915_EPOCH_INVALID;
3043         bool rearm_hangcheck;
3044
3045         if (!READ_ONCE(dev_priv->gt.awake))
3046                 return;
3047
3048         if (READ_ONCE(dev_priv->gt.active_requests))
3049                 return;
3050
3051         /*
3052          * Flush out the last user context, leaving only the pinned
3053          * kernel context resident. When we are idling on the kernel_context,
3054          * no more new requests (with a context switch) are emitted and we
3055          * can finally rest. A consequence is that the idle work handler is
3056          * always called at least twice before idling (and if the system is
3057          * idle that implies a round trip through the retire worker).
3058          */
3059         mutex_lock(&dev_priv->drm.struct_mutex);
3060         i915_gem_switch_to_kernel_context(dev_priv);
3061         mutex_unlock(&dev_priv->drm.struct_mutex);
3062
3063         GEM_TRACE("active_requests=%d (after switch-to-kernel-context)\n",
3064                   READ_ONCE(dev_priv->gt.active_requests));
3065
3066         /*
3067          * Wait for the last execlists context to complete, but bail out in case a
3068          * new request is submitted. As we don't trust the hardware, we
3069          * continue on if the wait times out. This is necessary to allow
3070          * the machine to suspend even if the hardware dies, and we will
3071          * try to recover in resume (after depriving the hardware of power,
3072          * it may be in a better mood).
3073          */
3074         __wait_for(if (new_requests_since_last_retire(dev_priv)) return,
3075                    intel_engines_are_idle(dev_priv),
3076                    I915_IDLE_ENGINES_TIMEOUT * 1000,
3077                    10, 500);
3078
3079         rearm_hangcheck =
3080                 cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
3081
3082         if (!mutex_trylock(&dev_priv->drm.struct_mutex)) {
3083                 /* Currently busy, come back later */
3084                 mod_delayed_work(dev_priv->wq,
3085                                  &dev_priv->gt.idle_work,
3086                                  msecs_to_jiffies(50));
3087                 goto out_rearm;
3088         }
3089
3090         /*
3091          * New request retired after this work handler started, extend active
3092          * period until next instance of the work.
3093          */
3094         if (new_requests_since_last_retire(dev_priv))
3095                 goto out_unlock;
3096
3097         epoch = __i915_gem_park(dev_priv);
3098
3099         assert_kernel_context_is_current(dev_priv);
3100
3101         rearm_hangcheck = false;
3102 out_unlock:
3103         mutex_unlock(&dev_priv->drm.struct_mutex);
3104
3105 out_rearm:
3106         if (rearm_hangcheck) {
3107                 GEM_BUG_ON(!dev_priv->gt.awake);
3108                 i915_queue_hangcheck(dev_priv);
3109         }
3110
3111         /*
3112          * When we are idle, it is an opportune time to reap our caches.
3113          * However, we have many objects that utilise RCU and the ordered
3114          * i915->wq that this work is executing on. To try and flush any
3115          * pending frees now we are idle, we first wait for an RCU grace
3116          * period, and then queue a task (that will run last on the wq) to
3117          * shrink and re-optimize the caches.
3118          */
3119         if (same_epoch(dev_priv, epoch)) {
3120                 struct sleep_rcu_work *s = kmalloc(sizeof(*s), GFP_KERNEL);
3121                 if (s) {
3122                         init_rcu_head(&s->rcu);
3123                         s->i915 = dev_priv;
3124                         s->epoch = epoch;
3125                         call_rcu(&s->rcu, __sleep_rcu);
3126                 }
3127         }
3128 }
3129
3130 void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file)
3131 {
3132         struct drm_i915_private *i915 = to_i915(gem->dev);
3133         struct drm_i915_gem_object *obj = to_intel_bo(gem);
3134         struct drm_i915_file_private *fpriv = file->driver_priv;
3135         struct i915_lut_handle *lut, *ln;
3136
3137         mutex_lock(&i915->drm.struct_mutex);
3138
3139         list_for_each_entry_safe(lut, ln, &obj->lut_list, obj_link) {
3140                 struct i915_gem_context *ctx = lut->ctx;
3141                 struct i915_vma *vma;
3142
3143                 GEM_BUG_ON(ctx->file_priv == ERR_PTR(-EBADF));
3144                 if (ctx->file_priv != fpriv)
3145                         continue;
3146
3147                 vma = radix_tree_delete(&ctx->handles_vma, lut->handle);
3148                 GEM_BUG_ON(vma->obj != obj);
3149
3150                 /* We allow the process to have multiple handles to the same
3151                  * vma, in the same fd namespace, by virtue of flink/open.
3152                  */
3153                 GEM_BUG_ON(!vma->open_count);
3154                 if (!--vma->open_count && !i915_vma_is_ggtt(vma))
3155                         i915_vma_close(vma);
3156
3157                 list_del(&lut->obj_link);
3158                 list_del(&lut->ctx_link);
3159
3160                 kmem_cache_free(i915->luts, lut);
3161                 __i915_gem_object_release_unless_active(obj);
3162         }
3163
3164         mutex_unlock(&i915->drm.struct_mutex);
3165 }
3166
3167 static unsigned long to_wait_timeout(s64 timeout_ns)
3168 {
3169         if (timeout_ns < 0)
3170                 return MAX_SCHEDULE_TIMEOUT;
3171
3172         if (timeout_ns == 0)
3173                 return 0;
3174
3175         return nsecs_to_jiffies_timeout(timeout_ns);
3176 }
3177
3178 /**
3179  * i915_gem_wait_ioctl - implements DRM_IOCTL_I915_GEM_WAIT
3180  * @dev: drm device pointer
3181  * @data: ioctl data blob
3182  * @file: drm file pointer
3183  *
3184  * Returns 0 if successful, else an error is returned with the remaining time in
3185  * the timeout parameter.
3186  *  -ETIME: object is still busy after timeout
3187  *  -ERESTARTSYS: signal interrupted the wait
3188  *  -ENOENT: object doesn't exist
3189  * Also possible, but rare:
3190  *  -EAGAIN: incomplete, restart syscall
3191  *  -ENOMEM: damn
3192  *  -ENODEV: Internal IRQ fail
3193  *  -E?: The add request failed
3194  *
3195  * The wait ioctl with a timeout of 0 reimplements the busy ioctl. With any
3196  * non-zero timeout parameter the wait ioctl will wait for the given number of
3197  * nanoseconds on an object becoming unbusy. Since the wait itself does so
3198  * without holding struct_mutex the object may become re-busied before this
3199  * function completes. A similar but shorter race condition exists in the busy
3200  * ioctl.
3201  */
3202 int
3203 i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
3204 {
3205         struct drm_i915_gem_wait *args = data;
3206         struct drm_i915_gem_object *obj;
3207         ktime_t start;
3208         long ret;
3209
3210         if (args->flags != 0)
3211                 return -EINVAL;
3212
3213         obj = i915_gem_object_lookup(file, args->bo_handle);
3214         if (!obj)
3215                 return -ENOENT;
3216
3217         start = ktime_get();
3218
3219         ret = i915_gem_object_wait(obj,
3220                                    I915_WAIT_INTERRUPTIBLE |
3221                                    I915_WAIT_PRIORITY |
3222                                    I915_WAIT_ALL,
3223                                    to_wait_timeout(args->timeout_ns),
3224                                    to_rps_client(file));
3225
3226         if (args->timeout_ns > 0) {
3227                 args->timeout_ns -= ktime_to_ns(ktime_sub(ktime_get(), start));
3228                 if (args->timeout_ns < 0)
3229                         args->timeout_ns = 0;
3230
3231                 /*
3232                  * Apparently ktime isn't accurate enough and occasionally has a
3233                  * bit of mismatch in the jiffies<->nsecs<->ktime loop. So patch
3234                  * things up to make the test happy. We allow up to 1 jiffy.
3235                  *
3236                  * This is a regression from the timespec->ktime conversion.
3237                  */
3238                 if (ret == -ETIME && !nsecs_to_jiffies(args->timeout_ns))
3239                         args->timeout_ns = 0;
3240
3241                 /* Asked to wait beyond the jiffie/scheduler precision? */
3242                 if (ret == -ETIME && args->timeout_ns)
3243                         ret = -EAGAIN;
3244         }
3245
3246         i915_gem_object_put(obj);
3247         return ret;
3248 }
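
/*
 * Illustrative sketch (not part of the driver) of how userspace might drive
 * the wait ioctl above. "fd" and "handle" are assumed to be a valid DRM file
 * descriptor and GEM handle, and drmIoctl() comes from libdrm:
 *
 *	struct drm_i915_gem_wait wait = {
 *		.bo_handle = handle,
 *		.timeout_ns = 100 * 1000 * 1000,	// 100ms budget
 *	};
 *
 *	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_WAIT, &wait) == 0)
 *		puts("idle");
 *	else if (errno == ETIME)
 *		printf("busy, %lld ns left\n", (long long)wait.timeout_ns);
 *
 * A timeout_ns of 0 merely polls, while a negative timeout_ns waits
 * indefinitely, matching to_wait_timeout() above.
 */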
3249
3250 static int wait_for_engines(struct drm_i915_private *i915)
3251 {
3252         if (wait_for(intel_engines_are_idle(i915), I915_IDLE_ENGINES_TIMEOUT)) {
3253                 dev_err(i915->drm.dev,
3254                         "Failed to idle engines, declaring wedged!\n");
3255                 GEM_TRACE_DUMP();
3256                 i915_gem_set_wedged(i915);
3257                 return -EIO;
3258         }
3259
3260         return 0;
3261 }
3262
3263 static long
3264 wait_for_timelines(struct drm_i915_private *i915,
3265                    unsigned int flags, long timeout)
3266 {
3267         struct i915_gt_timelines *gt = &i915->gt.timelines;
3268         struct i915_timeline *tl;
3269
3270         if (!READ_ONCE(i915->gt.active_requests))
3271                 return timeout;
3272
3273         mutex_lock(&gt->mutex);
3274         list_for_each_entry(tl, &gt->active_list, link) {
3275                 struct i915_request *rq;
3276
3277                 rq = i915_active_request_get_unlocked(&tl->last_request);
3278                 if (!rq)
3279                         continue;
3280
3281                 mutex_unlock(&gt->mutex);
3282
3283                 /*
3284                  * "Race-to-idle".
3285                  *
3286          * Switching to the kernel context is often used as a synchronous
3287                  * step prior to idling, e.g. in suspend for flushing all
3288                  * current operations to memory before sleeping. These we
3289                  * want to complete as quickly as possible to avoid prolonged
3290                  * stalls, so allow the gpu to boost to maximum clocks.
3291                  */
3292                 if (flags & I915_WAIT_FOR_IDLE_BOOST)
3293                         gen6_rps_boost(rq, NULL);
3294
3295                 timeout = i915_request_wait(rq, flags, timeout);
3296                 i915_request_put(rq);
3297                 if (timeout < 0)
3298                         return timeout;
3299
3300                 /* restart after reacquiring the lock */
3301                 mutex_lock(&gt->mutex);
3302                 tl = list_entry(&gt->active_list, typeof(*tl), link);
3303         }
3304         mutex_unlock(&gt->mutex);
3305
3306         return timeout;
3307 }
3308
3309 int i915_gem_wait_for_idle(struct drm_i915_private *i915,
3310                            unsigned int flags, long timeout)
3311 {
3312         GEM_TRACE("flags=%x (%s), timeout=%ld%s\n",
3313                   flags, flags & I915_WAIT_LOCKED ? "locked" : "unlocked",
3314                   timeout, timeout == MAX_SCHEDULE_TIMEOUT ? " (forever)" : "");
3315
3316         /* If the device is asleep, we have no requests outstanding */
3317         if (!READ_ONCE(i915->gt.awake))
3318                 return 0;
3319
3320         timeout = wait_for_timelines(i915, flags, timeout);
3321         if (timeout < 0)
3322                 return timeout;
3323
3324         if (flags & I915_WAIT_LOCKED) {
3325                 int err;
3326
3327                 lockdep_assert_held(&i915->drm.struct_mutex);
3328
3329                 if (GEM_SHOW_DEBUG() && !timeout) {
3330                         /* Presume that timeout was non-zero to begin with! */
3331                         dev_warn(&i915->drm.pdev->dev,
3332                                  "Missed idle-completion interrupt!\n");
3333                         GEM_TRACE_DUMP();
3334                 }
3335
3336                 err = wait_for_engines(i915);
3337                 if (err)
3338                         return err;
3339
3340                 i915_retire_requests(i915);
3341                 GEM_BUG_ON(i915->gt.active_requests);
3342         }
3343
3344         return 0;
3345 }
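
/*
 * Sketch of a typical caller, roughly what the suspend path does: flush all
 * outstanding work with boosted clocks while holding struct_mutex. The exact
 * flag combination is the caller's choice and this is illustrative only:
 *
 *	mutex_lock(&i915->drm.struct_mutex);
 *	err = i915_gem_wait_for_idle(i915,
 *				     I915_WAIT_INTERRUPTIBLE |
 *				     I915_WAIT_LOCKED |
 *				     I915_WAIT_FOR_IDLE_BOOST,
 *				     MAX_SCHEDULE_TIMEOUT);
 *	mutex_unlock(&i915->drm.struct_mutex);
 */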
3346
3347 static void __i915_gem_object_flush_for_display(struct drm_i915_gem_object *obj)
3348 {
3349         /*
3350          * We manually flush the CPU domain so that we can override and
3351          * force the flush for the display, and perform it asynchronously.
3352          */
3353         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
3354         if (obj->cache_dirty)
3355                 i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
3356         obj->write_domain = 0;
3357 }
3358
3359 void i915_gem_object_flush_if_display(struct drm_i915_gem_object *obj)
3360 {
3361         if (!READ_ONCE(obj->pin_global))
3362                 return;
3363
3364         mutex_lock(&obj->base.dev->struct_mutex);
3365         __i915_gem_object_flush_for_display(obj);
3366         mutex_unlock(&obj->base.dev->struct_mutex);
3367 }
3368
3369 /**
3370  * Moves a single object to the WC read, and possibly write domain.
3371  * @obj: object to act on
3372  * @write: ask for write access or read only
3373  *
3374  * This function returns when the move is complete, including waiting on
3375  * flushes to occur.
3376  */
3377 int
3378 i915_gem_object_set_to_wc_domain(struct drm_i915_gem_object *obj, bool write)
3379 {
3380         int ret;
3381
3382         lockdep_assert_held(&obj->base.dev->struct_mutex);
3383
3384         ret = i915_gem_object_wait(obj,
3385                                    I915_WAIT_INTERRUPTIBLE |
3386                                    I915_WAIT_LOCKED |
3387                                    (write ? I915_WAIT_ALL : 0),
3388                                    MAX_SCHEDULE_TIMEOUT,
3389                                    NULL);
3390         if (ret)
3391                 return ret;
3392
3393         if (obj->write_domain == I915_GEM_DOMAIN_WC)
3394                 return 0;
3395
3396         /* Flush and acquire obj->pages so that we are coherent through
3397          * direct access in memory with previous cached writes through
3398          * shmemfs and that our cache domain tracking remains valid.
3399          * For example, if the obj->filp was moved to swap without us
3400          * being notified and releasing the pages, we would mistakenly
3401          * continue to assume that the obj remained out of the CPU cached
3402          * domain.
3403          */
3404         ret = i915_gem_object_pin_pages(obj);
3405         if (ret)
3406                 return ret;
3407
3408         flush_write_domain(obj, ~I915_GEM_DOMAIN_WC);
3409
3410         /* Serialise direct access to this object with the barriers for
3411          * coherent writes from the GPU, by effectively invalidating the
3412          * WC domain upon first access.
3413          */
3414         if ((obj->read_domains & I915_GEM_DOMAIN_WC) == 0)
3415                 mb();
3416
3417         /* It should now be out of any other write domains, and we can update
3418          * the domain values for our changes.
3419          */
3420         GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_WC) != 0);
3421         obj->read_domains |= I915_GEM_DOMAIN_WC;
3422         if (write) {
3423                 obj->read_domains = I915_GEM_DOMAIN_WC;
3424                 obj->write_domain = I915_GEM_DOMAIN_WC;
3425                 obj->mm.dirty = true;
3426         }
3427
3428         i915_gem_object_unpin_pages(obj);
3429         return 0;
3430 }
3431
3432 /**
3433  * Moves a single object to the GTT read, and possibly write domain.
3434  * @obj: object to act on
3435  * @write: ask for write access or read only
3436  *
3437  * This function returns when the move is complete, including waiting on
3438  * flushes to occur.
3439  */
3440 int
3441 i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
3442 {
3443         int ret;
3444
3445         lockdep_assert_held(&obj->base.dev->struct_mutex);
3446
3447         ret = i915_gem_object_wait(obj,
3448                                    I915_WAIT_INTERRUPTIBLE |
3449                                    I915_WAIT_LOCKED |
3450                                    (write ? I915_WAIT_ALL : 0),
3451                                    MAX_SCHEDULE_TIMEOUT,
3452                                    NULL);
3453         if (ret)
3454                 return ret;
3455
3456         if (obj->write_domain == I915_GEM_DOMAIN_GTT)
3457                 return 0;
3458
3459         /* Flush and acquire obj->pages so that we are coherent through
3460          * direct access in memory with previous cached writes through
3461          * shmemfs and that our cache domain tracking remains valid.
3462          * For example, if the obj->filp was moved to swap without us
3463          * being notified and releasing the pages, we would mistakenly
3464          * continue to assume that the obj remained out of the CPU cached
3465          * domain.
3466          */
3467         ret = i915_gem_object_pin_pages(obj);
3468         if (ret)
3469                 return ret;
3470
3471         flush_write_domain(obj, ~I915_GEM_DOMAIN_GTT);
3472
3473         /* Serialise direct access to this object with the barriers for
3474          * coherent writes from the GPU, by effectively invalidating the
3475          * GTT domain upon first access.
3476          */
3477         if ((obj->read_domains & I915_GEM_DOMAIN_GTT) == 0)
3478                 mb();
3479
3480         /* It should now be out of any other write domains, and we can update
3481          * the domain values for our changes.
3482          */
3483         GEM_BUG_ON((obj->write_domain & ~I915_GEM_DOMAIN_GTT) != 0);
3484         obj->read_domains |= I915_GEM_DOMAIN_GTT;
3485         if (write) {
3486                 obj->read_domains = I915_GEM_DOMAIN_GTT;
3487                 obj->write_domain = I915_GEM_DOMAIN_GTT;
3488                 obj->mm.dirty = true;
3489         }
3490
3491         i915_gem_object_unpin_pages(obj);
3492         return 0;
3493 }
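
/*
 * Minimal usage sketch (assuming the caller holds struct_mutex and a
 * reference on "obj"): move an object into the GTT write domain before
 * scribbling on it through a GTT mapping. On success the object has been
 * flushed out of any other write domain and obj->mm.dirty is set, so the
 * pages will be written back on eviction.
 *
 *	static int prepare_for_gtt_write(struct drm_i915_gem_object *obj)
 *	{
 *		lockdep_assert_held(&obj->base.dev->struct_mutex);
 *
 *		return i915_gem_object_set_to_gtt_domain(obj, true);
 *	}
 */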
3494
3495 /**
3496  * Changes the cache-level of an object across all VMA.
3497  * @obj: object to act on
3498  * @cache_level: new cache level to set for the object
3499  *
3500  * After this function returns, the object will be in the new cache-level
3501  * across all GTT and the contents of the backing storage will be coherent,
3502  * with respect to the new cache-level. In order to keep the backing storage
3503  * coherent for all users, we only allow a single cache level to be set
3504  * globally on the object and prevent it from being changed whilst the
3505          * hardware is reading from the object. That is, if the object is currently
3506  * on the scanout it will be set to uncached (or equivalent display
3507  * cache coherency) and all non-MOCS GPU access will also be uncached so
3508  * that all direct access to the scanout remains coherent.
3509  */
3510 int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
3511                                     enum i915_cache_level cache_level)
3512 {
3513         struct i915_vma *vma;
3514         int ret;
3515
3516         lockdep_assert_held(&obj->base.dev->struct_mutex);
3517
3518         if (obj->cache_level == cache_level)
3519                 return 0;
3520
3521         /* Inspect the list of currently bound VMA and unbind any that would
3522          * be invalid given the new cache-level. This is principally to
3523          * catch the issue of the CS prefetch crossing page boundaries and
3524          * reading an invalid PTE on older architectures.
3525          */
3526 restart:
3527         list_for_each_entry(vma, &obj->vma.list, obj_link) {
3528                 if (!drm_mm_node_allocated(&vma->node))
3529                         continue;
3530
3531                 if (i915_vma_is_pinned(vma)) {
3532                         DRM_DEBUG("can not change the cache level of pinned objects\n");
3533                         return -EBUSY;
3534                 }
3535
3536                 if (!i915_vma_is_closed(vma) &&
3537                     i915_gem_valid_gtt_space(vma, cache_level))
3538                         continue;
3539
3540                 ret = i915_vma_unbind(vma);
3541                 if (ret)
3542                         return ret;
3543
3544                 /* As unbinding may affect other elements in the
3545                  * obj->vma_list (due to side-effects from retiring
3546                  * an active vma), play safe and restart the iterator.
3547                  */
3548                 goto restart;
3549         }
3550
3551         /* We can reuse the existing drm_mm nodes but need to change the
3552          * cache-level on the PTE. We could simply unbind them all and
3553          * rebind with the correct cache-level on next use. However since
3554          * we already have a valid slot, dma mapping, pages etc, we may as well
3555          * rewrite the PTE in the belief that doing so tramples upon less
3556          * state and so involves less work.
3557          */
3558         if (obj->bind_count) {
3559                 /* Before we change the PTE, the GPU must not be accessing it.
3560                  * If we wait upon the object, we know that all the bound
3561                  * VMA are no longer active.
3562                  */
3563                 ret = i915_gem_object_wait(obj,
3564                                            I915_WAIT_INTERRUPTIBLE |
3565                                            I915_WAIT_LOCKED |
3566                                            I915_WAIT_ALL,
3567                                            MAX_SCHEDULE_TIMEOUT,
3568                                            NULL);
3569                 if (ret)
3570                         return ret;
3571
3572                 if (!HAS_LLC(to_i915(obj->base.dev)) &&
3573                     cache_level != I915_CACHE_NONE) {
3574                         /* Access to snoopable pages through the GTT is
3575                          * incoherent and on some machines causes a hard
3576                          * lockup. Relinquish the CPU mmapping to force
3577                          * userspace to refault in the pages and we can
3578                          * then double check if the GTT mapping is still
3579                          * valid for that pointer access.
3580                          */
3581                         i915_gem_release_mmap(obj);
3582
3583                         /* As we no longer need a fence for GTT access,
3584                          * we can relinquish it now (and so prevent having
3585                          * to steal a fence from someone else on the next
3586                          * fence request). Note GPU activity would have
3587                          * dropped the fence as all snoopable access is
3588                          * supposed to be linear.
3589                          */
3590                         for_each_ggtt_vma(vma, obj) {
3591                                 ret = i915_vma_put_fence(vma);
3592                                 if (ret)
3593                                         return ret;
3594                         }
3595                 } else {
3596                         /* We either have incoherent backing store and
3597                          * so no GTT access or the architecture is fully
3598                          * coherent. In such cases, existing GTT mmaps
3599                          * ignore the cache bit in the PTE and we can
3600                          * rewrite it without confusing the GPU or having
3601                          * to force userspace to fault back in its mmaps.
3602                          */
3603                 }
3604
3605                 list_for_each_entry(vma, &obj->vma.list, obj_link) {
3606                         if (!drm_mm_node_allocated(&vma->node))
3607                                 continue;
3608
3609                         ret = i915_vma_bind(vma, cache_level, PIN_UPDATE);
3610                         if (ret)
3611                                 return ret;
3612                 }
3613         }
3614
3615         list_for_each_entry(vma, &obj->vma.list, obj_link)
3616                 vma->node.color = cache_level;
3617         i915_gem_object_set_cache_coherency(obj, cache_level);
3618         obj->cache_dirty = true; /* Always invalidate stale cachelines */
3619
3620         return 0;
3621 }
3622
3623 int i915_gem_get_caching_ioctl(struct drm_device *dev, void *data,
3624                                struct drm_file *file)
3625 {
3626         struct drm_i915_gem_caching *args = data;
3627         struct drm_i915_gem_object *obj;
3628         int err = 0;
3629
3630         rcu_read_lock();
3631         obj = i915_gem_object_lookup_rcu(file, args->handle);
3632         if (!obj) {
3633                 err = -ENOENT;
3634                 goto out;
3635         }
3636
3637         switch (obj->cache_level) {
3638         case I915_CACHE_LLC:
3639         case I915_CACHE_L3_LLC:
3640                 args->caching = I915_CACHING_CACHED;
3641                 break;
3642
3643         case I915_CACHE_WT:
3644                 args->caching = I915_CACHING_DISPLAY;
3645                 break;
3646
3647         default:
3648                 args->caching = I915_CACHING_NONE;
3649                 break;
3650         }
3651 out:
3652         rcu_read_unlock();
3653         return err;
3654 }
3655
3656 int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
3657                                struct drm_file *file)
3658 {
3659         struct drm_i915_private *i915 = to_i915(dev);
3660         struct drm_i915_gem_caching *args = data;
3661         struct drm_i915_gem_object *obj;
3662         enum i915_cache_level level;
3663         int ret = 0;
3664
3665         switch (args->caching) {
3666         case I915_CACHING_NONE:
3667                 level = I915_CACHE_NONE;
3668                 break;
3669         case I915_CACHING_CACHED:
3670                 /*
3671                  * Due to a HW issue on BXT A stepping, GPU stores via a
3672                  * snooped mapping may leave stale data in a corresponding CPU
3673                  * cacheline, whereas normally such cachelines would get
3674                  * invalidated.
3675                  */
3676                 if (!HAS_LLC(i915) && !HAS_SNOOP(i915))
3677                         return -ENODEV;
3678
3679                 level = I915_CACHE_LLC;
3680                 break;
3681         case I915_CACHING_DISPLAY:
3682                 level = HAS_WT(i915) ? I915_CACHE_WT : I915_CACHE_NONE;
3683                 break;
3684         default:
3685                 return -EINVAL;
3686         }
3687
3688         obj = i915_gem_object_lookup(file, args->handle);
3689         if (!obj)
3690                 return -ENOENT;
3691
3692         /*
3693          * The caching mode of proxy object is handled by its generator, and
3694          * not allowed to be changed by userspace.
3695          */
3696         if (i915_gem_object_is_proxy(obj)) {
3697                 ret = -ENXIO;
3698                 goto out;
3699         }
3700
3701         if (obj->cache_level == level)
3702                 goto out;
3703
3704         ret = i915_gem_object_wait(obj,
3705                                    I915_WAIT_INTERRUPTIBLE,
3706                                    MAX_SCHEDULE_TIMEOUT,
3707                                    to_rps_client(file));
3708         if (ret)
3709                 goto out;
3710
3711         ret = i915_mutex_lock_interruptible(dev);
3712         if (ret)
3713                 goto out;
3714
3715         ret = i915_gem_object_set_cache_level(obj, level);
3716         mutex_unlock(&dev->struct_mutex);
3717
3718 out:
3719         i915_gem_object_put(obj);
3720         return ret;
3721 }
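
/*
 * Hypothetical userspace usage of the two caching ioctls above ("fd" and
 * "handle" assumed, drmIoctl() from libdrm): request LLC caching for a
 * buffer, then read the level back. On parts without an LLC and without
 * snooping the SET_CACHING call fails with -ENODEV, as handled above.
 *
 *	struct drm_i915_gem_caching arg = {
 *		.handle = handle,
 *		.caching = I915_CACHING_CACHED,
 *	};
 *
 *	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_SET_CACHING, &arg))
 *		return -errno;
 *
 *	arg.caching = 0;
 *	drmIoctl(fd, DRM_IOCTL_I915_GEM_GET_CACHING, &arg);
 *	// arg.caching is now I915_CACHING_CACHED, _DISPLAY or _NONE
 */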
3722
3723 /*
3724  * Prepare buffer for display plane (scanout, cursors, etc). Can be called from
3725  * an uninterruptible phase (modesetting) and allows any flushes to be pipelined
3726  * (for pageflips). We only flush the caches while preparing the buffer for
3727  * display, the callers are responsible for frontbuffer flush.
3728  */
3729 struct i915_vma *
3730 i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
3731                                      u32 alignment,
3732                                      const struct i915_ggtt_view *view,
3733                                      unsigned int flags)
3734 {
3735         struct i915_vma *vma;
3736         int ret;
3737
3738         lockdep_assert_held(&obj->base.dev->struct_mutex);
3739
3740         /* Mark the global pin early so that we account for the
3741          * display coherency whilst setting up the cache domains.
3742          */
3743         obj->pin_global++;
3744
3745         /* The display engine is not coherent with the LLC cache on gen6.  As
3746          * a result, we make sure that the pinning that is about to occur is
3747          * done with uncached PTEs. This is lowest common denominator for all
3748          * chipsets.
3749          *
3750          * However for gen6+, we could do better by using the GFDT bit instead
3751          * of uncaching, which would allow us to flush all the LLC-cached data
3752          * with that bit in the PTE to main memory with just one PIPE_CONTROL.
3753          */
3754         ret = i915_gem_object_set_cache_level(obj,
3755                                               HAS_WT(to_i915(obj->base.dev)) ?
3756                                               I915_CACHE_WT : I915_CACHE_NONE);
3757         if (ret) {
3758                 vma = ERR_PTR(ret);
3759                 goto err_unpin_global;
3760         }
3761
3762         /* As the user may map the buffer once pinned in the display plane
3763          * (e.g. libkms for the bootup splash), we have to ensure that we
3764          * always use map_and_fenceable for all scanout buffers. However,
3765          * it may simply be too big to fit into mappable, in which case
3766          * put it anyway and hope that userspace can cope (but always first
3767          * try to preserve the existing ABI).
3768          */
3769         vma = ERR_PTR(-ENOSPC);
3770         if ((flags & PIN_MAPPABLE) == 0 &&
3771             (!view || view->type == I915_GGTT_VIEW_NORMAL))
3772                 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment,
3773                                                flags |
3774                                                PIN_MAPPABLE |
3775                                                PIN_NONBLOCK);
3776         if (IS_ERR(vma))
3777                 vma = i915_gem_object_ggtt_pin(obj, view, 0, alignment, flags);
3778         if (IS_ERR(vma))
3779                 goto err_unpin_global;
3780
3781         vma->display_alignment = max_t(u64, vma->display_alignment, alignment);
3782
3783         __i915_gem_object_flush_for_display(obj);
3784
3785         /* It should now be out of any other write domains, and we can update
3786          * the domain values for our changes.
3787          */
3788         obj->read_domains |= I915_GEM_DOMAIN_GTT;
3789
3790         return vma;
3791
3792 err_unpin_global:
3793         obj->pin_global--;
3794         return vma;
3795 }
3796
3797 void
3798 i915_gem_object_unpin_from_display_plane(struct i915_vma *vma)
3799 {
3800         lockdep_assert_held(&vma->vm->i915->drm.struct_mutex);
3801
3802         if (WARN_ON(vma->obj->pin_global == 0))
3803                 return;
3804
3805         if (--vma->obj->pin_global == 0)
3806                 vma->display_alignment = I915_GTT_MIN_ALIGNMENT;
3807
3808         /* Bump the LRU to try and avoid premature eviction whilst flipping  */
3809         i915_gem_object_bump_inactive_ggtt(vma->obj);
3810
3811         i915_vma_unpin(vma);
3812 }
3813
3814 /**
3815  * Moves a single object to the CPU read, and possibly write domain.
3816  * @obj: object to act on
3817  * @write: requesting write or read-only access
3818  *
3819  * This function returns when the move is complete, including waiting on
3820  * flushes to occur.
3821  */
3822 int
3823 i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
3824 {
3825         int ret;
3826
3827         lockdep_assert_held(&obj->base.dev->struct_mutex);
3828
3829         ret = i915_gem_object_wait(obj,
3830                                    I915_WAIT_INTERRUPTIBLE |
3831                                    I915_WAIT_LOCKED |
3832                                    (write ? I915_WAIT_ALL : 0),
3833                                    MAX_SCHEDULE_TIMEOUT,
3834                                    NULL);
3835         if (ret)
3836                 return ret;
3837
3838         flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
3839
3840         /* Flush the CPU cache if it's still invalid. */
3841         if ((obj->read_domains & I915_GEM_DOMAIN_CPU) == 0) {
3842                 i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
3843                 obj->read_domains |= I915_GEM_DOMAIN_CPU;
3844         }
3845
3846         /* It should now be out of any other write domains, and we can update
3847          * the domain values for our changes.
3848          */
3849         GEM_BUG_ON(obj->write_domain & ~I915_GEM_DOMAIN_CPU);
3850
3851         /* If we're writing through the CPU, then the GPU read domains will
3852          * need to be invalidated at next use.
3853          */
3854         if (write)
3855                 __start_cpu_write(obj);
3856
3857         return 0;
3858 }
3859
3860 /* Throttle our rendering by waiting until the ring has completed our requests
3861  * emitted over 20 msec ago.
3862  *
3863  * Note that if we were to use the current jiffies each time around the loop,
3864  * we wouldn't escape the function with any frames outstanding if the time to
3865  * render a frame was over 20ms.
3866  *
3867  * This should get us reasonable parallelism between CPU and GPU but also
3868  * relatively low latency when blocking on a particular request to finish.
3869  */
3870 static int
3871 i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
3872 {
3873         struct drm_i915_private *dev_priv = to_i915(dev);
3874         struct drm_i915_file_private *file_priv = file->driver_priv;
3875         unsigned long recent_enough = jiffies - DRM_I915_THROTTLE_JIFFIES;
3876         struct i915_request *request, *target = NULL;
3877         long ret;
3878
3879         /* ABI: return -EIO if already wedged */
3880         if (i915_terminally_wedged(&dev_priv->gpu_error))
3881                 return -EIO;
3882
3883         spin_lock(&file_priv->mm.lock);
3884         list_for_each_entry(request, &file_priv->mm.request_list, client_link) {
3885                 if (time_after_eq(request->emitted_jiffies, recent_enough))
3886                         break;
3887
3888                 if (target) {
3889                         list_del(&target->client_link);
3890                         target->file_priv = NULL;
3891                 }
3892
3893                 target = request;
3894         }
3895         if (target)
3896                 i915_request_get(target);
3897         spin_unlock(&file_priv->mm.lock);
3898
3899         if (target == NULL)
3900                 return 0;
3901
3902         ret = i915_request_wait(target,
3903                                 I915_WAIT_INTERRUPTIBLE,
3904                                 MAX_SCHEDULE_TIMEOUT);
3905         i915_request_put(target);
3906
3907         return ret < 0 ? ret : 0;
3908 }
3909
3910 struct i915_vma *
3911 i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj,
3912                          const struct i915_ggtt_view *view,
3913                          u64 size,
3914                          u64 alignment,
3915                          u64 flags)
3916 {
3917         struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
3918         struct i915_address_space *vm = &dev_priv->ggtt.vm;
3919         struct i915_vma *vma;
3920         int ret;
3921
3922         lockdep_assert_held(&obj->base.dev->struct_mutex);
3923
3924         if (flags & PIN_MAPPABLE &&
3925             (!view || view->type == I915_GGTT_VIEW_NORMAL)) {
3926                 /* If the required space is larger than the available
3927                  * aperture, we will not be able to find a slot for the
3928                  * object and unbinding the object now will be in
3929                  * vain. Worse, doing so may cause us to ping-pong
3930                  * the object in and out of the Global GTT and
3931                  * waste a lot of cycles under the mutex.
3932                  */
3933                 if (obj->base.size > dev_priv->ggtt.mappable_end)
3934                         return ERR_PTR(-E2BIG);
3935
3936                 /* If NONBLOCK is set the caller is optimistically
3937                  * trying to cache the full object within the mappable
3938                  * aperture, and *must* have a fallback in place for
3939                  * situations where we cannot bind the object. We
3940                  * can be a little more lax here and use the fallback
3941                  * more often to avoid costly migrations of ourselves
3942                  * and other objects within the aperture.
3943                  *
3944                  * Half-the-aperture is used as a simple heuristic.
3945                  * More interesting would be to search for a free
3946                  * block prior to making the commitment to unbind.
3947                  * That caters for the self-harm case, and with a
3948                  * little more heuristics (e.g. NOFAULT, NOEVICT)
3949                  * we could try to minimise harm to others.
3950                  */
3951                 if (flags & PIN_NONBLOCK &&
3952                     obj->base.size > dev_priv->ggtt.mappable_end / 2)
3953                         return ERR_PTR(-ENOSPC);
3954         }
3955
3956         vma = i915_vma_instance(obj, vm, view);
3957         if (unlikely(IS_ERR(vma)))
3958                 return vma;
3959
3960         if (i915_vma_misplaced(vma, size, alignment, flags)) {
3961                 if (flags & PIN_NONBLOCK) {
3962                         if (i915_vma_is_pinned(vma) || i915_vma_is_active(vma))
3963                                 return ERR_PTR(-ENOSPC);
3964
3965                         if (flags & PIN_MAPPABLE &&
3966                             vma->fence_size > dev_priv->ggtt.mappable_end / 2)
3967                                 return ERR_PTR(-ENOSPC);
3968                 }
3969
3970                 WARN(i915_vma_is_pinned(vma),
3971                      "bo is already pinned in ggtt with incorrect alignment:"
3972                      " offset=%08x, req.alignment=%llx,"
3973                      " req.map_and_fenceable=%d, vma->map_and_fenceable=%d\n",
3974                      i915_ggtt_offset(vma), alignment,
3975                      !!(flags & PIN_MAPPABLE),
3976                      i915_vma_is_map_and_fenceable(vma));
3977                 ret = i915_vma_unbind(vma);
3978                 if (ret)
3979                         return ERR_PTR(ret);
3980         }
3981
3982         ret = i915_vma_pin(vma, size, alignment, flags | PIN_GLOBAL);
3983         if (ret)
3984                 return ERR_PTR(ret);
3985
3986         return vma;
3987 }
3988
3989 static __always_inline unsigned int __busy_read_flag(unsigned int id)
3990 {
3991         /* Note that we could alias engines in the execbuf API, but
3992          * that would be very unwise as it prevents userspace from exercising
3993          * fine control over engine selection. Ahem.
3994          *
3995          * This should be something like EXEC_MAX_ENGINE instead of
3996          * I915_NUM_ENGINES.
3997          */
3998         BUILD_BUG_ON(I915_NUM_ENGINES > 16);
3999         return 0x10000 << id;
4000 }
4001
4002 static __always_inline unsigned int __busy_write_id(unsigned int id)
4003 {
4004         /* The uABI guarantees an active writer is also amongst the read
4005          * engines. This would be true if we accessed the activity tracking
4006          * under the lock, but as we perform the lookup of the object and
4007          * its activity locklessly we can not guarantee that the last_write
4008          * being active implies that we have set the same engine flag from
4009          * last_read - hence we always set both read and write busy for
4010          * last_write.
4011          */
4012         return id | __busy_read_flag(id);
4013 }
4014
4015 static __always_inline unsigned int
4016 __busy_set_if_active(const struct dma_fence *fence,
4017                      unsigned int (*flag)(unsigned int id))
4018 {
4019         struct i915_request *rq;
4020
4021         /* We have to check the current hw status of the fence as the uABI
4022          * guarantees forward progress. We could rely on the idle worker
4023          * to eventually flush us, but to minimise latency just ask the
4024          * hardware.
4025          *
4026          * Note we only report on the status of native fences.
4027          */
4028         if (!dma_fence_is_i915(fence))
4029                 return 0;
4030
4031         /* opencode to_request() in order to avoid const warnings */
4032         rq = container_of(fence, struct i915_request, fence);
4033         if (i915_request_completed(rq))
4034                 return 0;
4035
4036         return flag(rq->engine->uabi_id);
4037 }
4038
4039 static __always_inline unsigned int
4040 busy_check_reader(const struct dma_fence *fence)
4041 {
4042         return __busy_set_if_active(fence, __busy_read_flag);
4043 }
4044
4045 static __always_inline unsigned int
4046 busy_check_writer(const struct dma_fence *fence)
4047 {
4048         if (!fence)
4049                 return 0;
4050
4051         return __busy_set_if_active(fence, __busy_write_id);
4052 }
4053
4054 int
4055 i915_gem_busy_ioctl(struct drm_device *dev, void *data,
4056                     struct drm_file *file)
4057 {
4058         struct drm_i915_gem_busy *args = data;
4059         struct drm_i915_gem_object *obj;
4060         struct reservation_object_list *list;
4061         unsigned int seq;
4062         int err;
4063
4064         err = -ENOENT;
4065         rcu_read_lock();
4066         obj = i915_gem_object_lookup_rcu(file, args->handle);
4067         if (!obj)
4068                 goto out;
4069
4070         /* A discrepancy here is that we do not report the status of
4071          * non-i915 fences, i.e. even though we may report the object as idle,
4072          * a call to set-domain may still stall waiting for foreign rendering.
4073          * This also means that wait-ioctl may report an object as busy,
4074          * where busy-ioctl considers it idle.
4075          *
4076          * We trade the ability to warn of foreign fences to report on which
4077          * i915 engines are active for the object.
4078          *
4079          * Alternatively, we can trade that extra information on read/write
4080          * activity with
4081          *      args->busy =
4082          *              !reservation_object_test_signaled_rcu(obj->resv, true);
4083          * to report the overall busyness. This is what the wait-ioctl does.
4084          *
4085          */
4086 retry:
4087         seq = raw_read_seqcount(&obj->resv->seq);
4088
4089         /* Translate the exclusive fence to the READ *and* WRITE engine */
4090         args->busy = busy_check_writer(rcu_dereference(obj->resv->fence_excl));
4091
4092         /* Translate shared fences to READ set of engines */
4093         list = rcu_dereference(obj->resv->fence);
4094         if (list) {
4095                 unsigned int shared_count = list->shared_count, i;
4096
4097                 for (i = 0; i < shared_count; ++i) {
4098                         struct dma_fence *fence =
4099                                 rcu_dereference(list->shared[i]);
4100
4101                         args->busy |= busy_check_reader(fence);
4102                 }
4103         }
4104
4105         if (args->busy && read_seqcount_retry(&obj->resv->seq, seq))
4106                 goto retry;
4107
4108         err = 0;
4109 out:
4110         rcu_read_unlock();
4111         return err;
4112 }
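
/*
 * Sketch of how userspace might interpret the result assembled above ("fd"
 * and "handle" assumed, drmIoctl() from libdrm). Following the encoding of
 * __busy_read_flag() and __busy_write_id(), the low 16 bits carry the uabi
 * id of the engine holding the exclusive (write) fence, and the high 16
 * bits form a bitmask of engines holding shared (read) fences:
 *
 *	struct drm_i915_gem_busy busy = { .handle = handle };
 *
 *	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_BUSY, &busy))
 *		return -errno;
 *
 *	if (!busy.busy)
 *		puts("idle");
 *
 *	unsigned int writer = busy.busy & 0xffff;	// uabi id, 0 if no i915 writer
 *	unsigned int readers = busy.busy >> 16;		// bit per reading engine
 */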
4113
4114 int
4115 i915_gem_throttle_ioctl(struct drm_device *dev, void *data,
4116                         struct drm_file *file_priv)
4117 {
4118         return i915_gem_ring_throttle(dev, file_priv);
4119 }
4120
4121 int
4122 i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
4123                        struct drm_file *file_priv)
4124 {
4125         struct drm_i915_private *dev_priv = to_i915(dev);
4126         struct drm_i915_gem_madvise *args = data;
4127         struct drm_i915_gem_object *obj;
4128         int err;
4129
4130         switch (args->madv) {
4131         case I915_MADV_DONTNEED:
4132         case I915_MADV_WILLNEED:
4133             break;
4134         default:
4135             return -EINVAL;
4136         }
4137
4138         obj = i915_gem_object_lookup(file_priv, args->handle);
4139         if (!obj)
4140                 return -ENOENT;
4141
4142         err = mutex_lock_interruptible(&obj->mm.lock);
4143         if (err)
4144                 goto out;
4145
4146         if (i915_gem_object_has_pages(obj) &&
4147             i915_gem_object_is_tiled(obj) &&
4148             dev_priv->quirks & QUIRK_PIN_SWIZZLED_PAGES) {
4149                 if (obj->mm.madv == I915_MADV_WILLNEED) {
4150                         GEM_BUG_ON(!obj->mm.quirked);
4151                         __i915_gem_object_unpin_pages(obj);
4152                         obj->mm.quirked = false;
4153                 }
4154                 if (args->madv == I915_MADV_WILLNEED) {
4155                         GEM_BUG_ON(obj->mm.quirked);
4156                         __i915_gem_object_pin_pages(obj);
4157                         obj->mm.quirked = true;
4158                 }
4159         }
4160
4161         if (obj->mm.madv != __I915_MADV_PURGED)
4162                 obj->mm.madv = args->madv;
4163
4164         /* if the object is no longer attached, discard its backing storage */
4165         if (obj->mm.madv == I915_MADV_DONTNEED &&
4166             !i915_gem_object_has_pages(obj))
4167                 i915_gem_object_truncate(obj);
4168
4169         args->retained = obj->mm.madv != __I915_MADV_PURGED;
4170         mutex_unlock(&obj->mm.lock);
4171
4172 out:
4173         i915_gem_object_put(obj);
4174         return err;
4175 }
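
/*
 * Hypothetical userspace usage of the madvise ioctl above ("fd" and "handle"
 * assumed, drmIoctl() from libdrm): mark an idle buffer as DONTNEED so the
 * shrinker may purge it under memory pressure, then check "retained" before
 * reusing its contents.
 *
 *	struct drm_i915_gem_madvise madv = {
 *		.handle = handle,
 *		.madv = I915_MADV_DONTNEED,
 *	};
 *	drmIoctl(fd, DRM_IOCTL_I915_GEM_MADVISE, &madv);
 *
 *	// ... later, before the buffer is reused ...
 *	madv.madv = I915_MADV_WILLNEED;
 *	drmIoctl(fd, DRM_IOCTL_I915_GEM_MADVISE, &madv);
 *	if (!madv.retained)
 *		reinitialise(handle);	// backing storage was purged
 *
 * reinitialise() is a stand-in for whatever the application does to
 * repopulate a buffer whose contents were discarded.
 */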
4176
4177 static void
4178 frontbuffer_retire(struct i915_active_request *active,
4179                    struct i915_request *request)
4180 {
4181         struct drm_i915_gem_object *obj =
4182                 container_of(active, typeof(*obj), frontbuffer_write);
4183
4184         intel_fb_obj_flush(obj, ORIGIN_CS);
4185 }
4186
4187 void i915_gem_object_init(struct drm_i915_gem_object *obj,
4188                           const struct drm_i915_gem_object_ops *ops)
4189 {
4190         mutex_init(&obj->mm.lock);
4191
4192         spin_lock_init(&obj->vma.lock);
4193         INIT_LIST_HEAD(&obj->vma.list);
4194
4195         INIT_LIST_HEAD(&obj->lut_list);
4196         INIT_LIST_HEAD(&obj->batch_pool_link);
4197
4198         init_rcu_head(&obj->rcu);
4199
4200         obj->ops = ops;
4201
4202         reservation_object_init(&obj->__builtin_resv);
4203         obj->resv = &obj->__builtin_resv;
4204
4205         obj->frontbuffer_ggtt_origin = ORIGIN_GTT;
4206         i915_active_request_init(&obj->frontbuffer_write,
4207                                  NULL, frontbuffer_retire);
4208
4209         obj->mm.madv = I915_MADV_WILLNEED;
4210         INIT_RADIX_TREE(&obj->mm.get_page.radix, GFP_KERNEL | __GFP_NOWARN);
4211         mutex_init(&obj->mm.get_page.lock);
4212
4213         i915_gem_info_add_obj(to_i915(obj->base.dev), obj->base.size);
4214 }
4215
4216 static const struct drm_i915_gem_object_ops i915_gem_object_ops = {
4217         .flags = I915_GEM_OBJECT_HAS_STRUCT_PAGE |
4218                  I915_GEM_OBJECT_IS_SHRINKABLE,
4219
4220         .get_pages = i915_gem_object_get_pages_gtt,
4221         .put_pages = i915_gem_object_put_pages_gtt,
4222
4223         .pwrite = i915_gem_object_pwrite_gtt,
4224 };
4225
4226 static int i915_gem_object_create_shmem(struct drm_device *dev,
4227                                         struct drm_gem_object *obj,
4228                                         size_t size)
4229 {
4230         struct drm_i915_private *i915 = to_i915(dev);
4231         unsigned long flags = VM_NORESERVE;
4232         struct file *filp;
4233
4234         drm_gem_private_object_init(dev, obj, size);
4235
4236         if (i915->mm.gemfs)
4237                 filp = shmem_file_setup_with_mnt(i915->mm.gemfs, "i915", size,
4238                                                  flags);
4239         else
4240                 filp = shmem_file_setup("i915", size, flags);
4241
4242         if (IS_ERR(filp))
4243                 return PTR_ERR(filp);
4244
4245         obj->filp = filp;
4246
4247         return 0;
4248 }
4249
4250 struct drm_i915_gem_object *
4251 i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
4252 {
4253         struct drm_i915_gem_object *obj;
4254         struct address_space *mapping;
4255         unsigned int cache_level;
4256         gfp_t mask;
4257         int ret;
4258
4259         /* There is a prevalence of the assumption that we fit the object's
4260          * page count inside a 32bit _signed_ variable. Let's document this and
4261          * catch if we ever need to fix it. In the meantime, if you do spot
4262          * such a local variable, please consider fixing!
4263          */
4264         if (size >> PAGE_SHIFT > INT_MAX)
4265                 return ERR_PTR(-E2BIG);
4266
4267         if (overflows_type(size, obj->base.size))
4268                 return ERR_PTR(-E2BIG);
4269
4270         obj = i915_gem_object_alloc(dev_priv);
4271         if (obj == NULL)
4272                 return ERR_PTR(-ENOMEM);
4273
4274         ret = i915_gem_object_create_shmem(&dev_priv->drm, &obj->base, size);
4275         if (ret)
4276                 goto fail;
4277
4278         mask = GFP_HIGHUSER | __GFP_RECLAIMABLE;
4279         if (IS_I965GM(dev_priv) || IS_I965G(dev_priv)) {
4280                 /* 965gm cannot relocate objects above 4GiB. */
4281                 mask &= ~__GFP_HIGHMEM;
4282                 mask |= __GFP_DMA32;
4283         }
4284
4285         mapping = obj->base.filp->f_mapping;
4286         mapping_set_gfp_mask(mapping, mask);
4287         GEM_BUG_ON(!(mapping_gfp_mask(mapping) & __GFP_RECLAIM));
4288
4289         i915_gem_object_init(obj, &i915_gem_object_ops);
4290
4291         obj->write_domain = I915_GEM_DOMAIN_CPU;
4292         obj->read_domains = I915_GEM_DOMAIN_CPU;
4293
4294         if (HAS_LLC(dev_priv))
4295                 /* On some devices, we can have the GPU use the LLC (the CPU
4296                  * cache) for about a 10% performance improvement
4297                  * compared to uncached.  Graphics requests other than
4298                  * display scanout are coherent with the CPU in
4299                  * accessing this cache.  This means in this mode we
4300                  * don't need to clflush on the CPU side, and on the
4301                  * GPU side we only need to flush internal caches to
4302                  * get data visible to the CPU.
4303                  *
4304                  * However, we maintain the display planes as UC, and so
4305                  * need to rebind when first used as such.
4306                  */
4307                 cache_level = I915_CACHE_LLC;
4308         else
4309                 cache_level = I915_CACHE_NONE;
4310
4311         i915_gem_object_set_cache_coherency(obj, cache_level);
4312
4313         trace_i915_gem_object_create(obj);
4314
4315         return obj;
4316
4317 fail:
4318         i915_gem_object_free(obj);
4319         return ERR_PTR(ret);
4320 }
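
/*
 * Minimal internal usage sketch, assuming struct_mutex is held for the pin
 * and "i915" is the device: allocate a shmem-backed object and pin it into
 * the mappable GGTT (error handling abbreviated):
 *
 *	struct drm_i915_gem_object *obj;
 *	struct i915_vma *vma;
 *
 *	obj = i915_gem_object_create(i915, SZ_64K);
 *	if (IS_ERR(obj))
 *		return PTR_ERR(obj);
 *
 *	vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, PIN_MAPPABLE);
 *	if (IS_ERR(vma)) {
 *		i915_gem_object_put(obj);
 *		return PTR_ERR(vma);
 *	}
 */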
4321
4322 static bool discard_backing_storage(struct drm_i915_gem_object *obj)
4323 {
4324         /* If we are the last user of the backing storage (be it shmemfs
4325          * pages or stolen etc), we know that the pages are going to be
4326          * immediately released. In this case, we can then skip copying
4327          * back the contents from the GPU.
4328          */
4329
4330         if (obj->mm.madv != I915_MADV_WILLNEED)
4331                 return false;
4332
4333         if (obj->base.filp == NULL)
4334                 return true;
4335
4336         /* At first glance, this looks racy, but then again so would be
4337          * userspace racing mmap against close. However, the first external
4338          * reference to the filp can only be obtained through the
4339          * i915_gem_mmap_ioctl() which safeguards us against the user
4340          * acquiring such a reference whilst we are in the middle of
4341          * freeing the object.
4342          */
4343         return atomic_long_read(&obj->base.filp->f_count) == 1;
4344 }
4345
4346 static void __i915_gem_free_objects(struct drm_i915_private *i915,
4347                                     struct llist_node *freed)
4348 {
4349         struct drm_i915_gem_object *obj, *on;
4350         intel_wakeref_t wakeref;
4351
4352         wakeref = intel_runtime_pm_get(i915);
4353         llist_for_each_entry_safe(obj, on, freed, freed) {
4354                 struct i915_vma *vma, *vn;
4355
4356                 trace_i915_gem_object_destroy(obj);
4357
4358                 mutex_lock(&i915->drm.struct_mutex);
4359
4360                 GEM_BUG_ON(i915_gem_object_is_active(obj));
4361                 list_for_each_entry_safe(vma, vn, &obj->vma.list, obj_link) {
4362                         GEM_BUG_ON(i915_vma_is_active(vma));
4363                         vma->flags &= ~I915_VMA_PIN_MASK;
4364                         i915_vma_destroy(vma);
4365                 }
4366                 GEM_BUG_ON(!list_empty(&obj->vma.list));
4367                 GEM_BUG_ON(!RB_EMPTY_ROOT(&obj->vma.tree));
4368
4369                 /* This serializes freeing with the shrinker. Since the free
4370                  * is delayed, first by RCU then by the workqueue, we want the
4371                  * shrinker to be able to free pages of unreferenced objects,
4372                  * or else we may oom whilst there are plenty of deferred
4373                  * freed objects.
4374                  */
4375                 if (i915_gem_object_has_pages(obj)) {
4376                         spin_lock(&i915->mm.obj_lock);
4377                         list_del_init(&obj->mm.link);
4378                         spin_unlock(&i915->mm.obj_lock);
4379                 }
4380
4381                 mutex_unlock(&i915->drm.struct_mutex);
4382
4383                 GEM_BUG_ON(obj->bind_count);
4384                 GEM_BUG_ON(obj->userfault_count);
4385                 GEM_BUG_ON(atomic_read(&obj->frontbuffer_bits));
4386                 GEM_BUG_ON(!list_empty(&obj->lut_list));
4387
4388                 if (obj->ops->release)
4389                         obj->ops->release(obj);
4390
4391                 if (WARN_ON(i915_gem_object_has_pinned_pages(obj)))
4392                         atomic_set(&obj->mm.pages_pin_count, 0);
4393                 __i915_gem_object_put_pages(obj, I915_MM_NORMAL);
4394                 GEM_BUG_ON(i915_gem_object_has_pages(obj));
4395
4396                 if (obj->base.import_attach)
4397                         drm_prime_gem_destroy(&obj->base, NULL);
4398
4399                 reservation_object_fini(&obj->__builtin_resv);
4400                 drm_gem_object_release(&obj->base);
4401                 i915_gem_info_remove_obj(i915, obj->base.size);
4402
4403                 kfree(obj->bit_17);
4404                 i915_gem_object_free(obj);
4405
4406                 GEM_BUG_ON(!atomic_read(&i915->mm.free_count));
4407                 atomic_dec(&i915->mm.free_count);
4408
4409                 if (on)
4410                         cond_resched();
4411         }
4412         intel_runtime_pm_put(i915, wakeref);
4413 }
4414
4415 static void i915_gem_flush_free_objects(struct drm_i915_private *i915)
4416 {
4417         struct llist_node *freed;
4418
4419         /* Free the oldest, most stale object to keep the free_list short */
4420         freed = NULL;
4421         if (!llist_empty(&i915->mm.free_list)) { /* quick test for hotpath */
4422                 /* Only one consumer of llist_del_first() allowed */
4423                 spin_lock(&i915->mm.free_lock);
4424                 freed = llist_del_first(&i915->mm.free_list);
4425                 spin_unlock(&i915->mm.free_lock);
4426         }
4427         if (unlikely(freed)) {
4428                 freed->next = NULL;
4429                 __i915_gem_free_objects(i915, freed);
4430         }
4431 }
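
/*
 * A minimal, generic sketch of the llist pattern used above (editor's
 * illustration; the example_* names are hypothetical and not part of the
 * driver). Producers may call llist_add() concurrently without locking,
 * but llist_del_first() tolerates only a single concurrent consumer,
 * hence the free_lock taken around it in i915_gem_flush_free_objects().
 */
struct example_deferred_item {
        struct llist_node node;
};

static void __maybe_unused
example_consume_one(struct llist_head *list, spinlock_t *lock)
{
        struct llist_node *first = NULL;

        if (!llist_empty(list)) {
                spin_lock(lock); /* serialise the single llist_del_first() consumer */
                first = llist_del_first(list);
                spin_unlock(lock);
        }

        if (first) {
                struct example_deferred_item *item =
                        llist_entry(first, struct example_deferred_item, node);

                /* now exclusively owned; assume the producer kmalloc'ed it */
                kfree(item);
        }
}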
4432
4433 static void __i915_gem_free_work(struct work_struct *work)
4434 {
4435         struct drm_i915_private *i915 =
4436                 container_of(work, struct drm_i915_private, mm.free_work);
4437         struct llist_node *freed;
4438
4439         /*
4440          * All file-owned VMA should have been released by this point through
4441          * i915_gem_close_object(), or earlier by i915_gem_context_close().
4442          * However, the object may also be bound into the global GTT (e.g.
4443          * older GPUs without per-process support, or for direct access through
4444          * the GTT either for the user or for scanout). Those VMA still need to
4445          * be unbound now.
4446          */
4447
4448         spin_lock(&i915->mm.free_lock);
4449         while ((freed = llist_del_all(&i915->mm.free_list))) {
4450                 spin_unlock(&i915->mm.free_lock);
4451
4452                 __i915_gem_free_objects(i915, freed);
4453                 if (need_resched())
4454                         return;
4455
4456                 spin_lock(&i915->mm.free_lock);
4457         }
4458         spin_unlock(&i915->mm.free_lock);
4459 }
4460
4461 static void __i915_gem_free_object_rcu(struct rcu_head *head)
4462 {
4463         struct drm_i915_gem_object *obj =
4464                 container_of(head, typeof(*obj), rcu);
4465         struct drm_i915_private *i915 = to_i915(obj->base.dev);
4466
4467         /*
4468          * We reuse obj->rcu for the freed list, so we had better not treat
4469          * it like a rcu_head from this point forwards. And we expect all
4470          * objects to be freed via this path.
4471          */
4472         destroy_rcu_head(&obj->rcu);
4473
4474         /*
4475          * Since we require blocking on struct_mutex to unbind the freed
4476          * object from the GPU before releasing resources back to the
4477          * system, we cannot do that directly from the RCU callback (which may
4478          * be a softirq context), but must instead defer that work onto a
4479          * kthread. We use the RCU callback rather than move the freed object
4480          * directly onto the work queue so that we can mix between using the
4481          * worker and performing frees directly from subsequent allocations for
4482          * crude but effective memory throttling.
4483          */
4484         if (llist_add(&obj->freed, &i915->mm.free_list))
4485                 queue_work(i915->wq, &i915->mm.free_work);
4486 }
4487
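/**
 * i915_gem_free_object - final unreference callback for a GEM object
 * @gem_obj: GEM object whose last reference has just been dropped
 *
 * Queues the object for deferred destruction: the actual teardown runs after
 * an RCU grace period (see __i915_gem_free_object_rcu()) and then from the
 * mm.free_work worker or from i915_gem_flush_free_objects().
 */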
4488 void i915_gem_free_object(struct drm_gem_object *gem_obj)
4489 {
4490         struct drm_i915_gem_object *obj = to_intel_bo(gem_obj);
4491
4492         if (obj->mm.quirked)
4493                 __i915_gem_object_unpin_pages(obj);
4494
4495         if (discard_backing_storage(obj))
4496                 obj->mm.madv = I915_MADV_DONTNEED;
4497
4498         /*
4499          * Before we free the object, make sure any pure RCU-only
4500          * read-side critical sections are complete, e.g.
4501          * i915_gem_busy_ioctl(). For the corresponding synchronized
4502          * lookup see i915_gem_object_lookup_rcu().
4503          */
4504         atomic_inc(&to_i915(obj->base.dev)->mm.free_count);
4505         call_rcu(&obj->rcu, __i915_gem_free_object_rcu);
4506 }
4507
4508 void __i915_gem_object_release_unless_active(struct drm_i915_gem_object *obj)
4509 {
4510         lockdep_assert_held(&obj->base.dev->struct_mutex);
4511
4512         if (!i915_gem_object_has_active_reference(obj) &&
4513             i915_gem_object_is_active(obj))
4514                 i915_gem_object_set_active_reference(obj);
4515         else
4516                 i915_gem_object_put(obj);
4517 }
4518
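/**
 * i915_gem_sanitize - put the GPU back into a known state
 * @i915: i915 device
 *
 * Used from i915_gem_init_mmio() and i915_gem_suspend_late() to clear out
 * whatever state the BIOS, a previous occupant or a wedged GPU left behind,
 * by sanitizing the engines and marking the inherited contexts as lost.
 */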
4519 void i915_gem_sanitize(struct drm_i915_private *i915)
4520 {
4521         intel_wakeref_t wakeref;
4522
4523         GEM_TRACE("\n");
4524
4525         wakeref = intel_runtime_pm_get(i915);
4526         intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
4527
4528         /*
4529          * As we have just resumed the machine and woken the device up from
4530          * deep PCI sleep (presumably D3_cold), assume the HW has been reset
4531          * back to defaults, recovering from whatever wedged state we left it
4532          * in, and so it is worth trying to use the device once more.
4533          */
4534         if (i915_terminally_wedged(&i915->gpu_error))
4535                 i915_gem_unset_wedged(i915);
4536
4537         /*
4538          * If we inherit context state from the BIOS or earlier occupants
4539          * of the GPU, the GPU may be in an inconsistent state when we
4540          * try to take over. The only way to remove the earlier state
4541          * is by resetting. However, resetting on earlier gen is tricky as
4542          * it may impact the display and we are uncertain about the stability
4543          * of the reset, so this could be applied to even earlier gen.
4544          */
4545         intel_engines_sanitize(i915, false);
4546
4547         intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
4548         intel_runtime_pm_put(i915, wakeref);
4549
4550         mutex_lock(&i915->drm.struct_mutex);
4551         i915_gem_contexts_lost(i915);
4552         mutex_unlock(&i915->drm.struct_mutex);
4553 }
4554
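/**
 * i915_gem_suspend - quiesce GEM before suspend or hibernation
 * @i915: i915 device
 *
 * Switches to the kernel context, waits for the GPU to idle and flushes the
 * retire/idle workers so that no requests remain in flight before the backing
 * storage is dismantled later in the suspend sequence.
 *
 * Returns 0 on success or a negative error code on failure.
 */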
4555 int i915_gem_suspend(struct drm_i915_private *i915)
4556 {
4557         intel_wakeref_t wakeref;
4558         int ret;
4559
4560         GEM_TRACE("\n");
4561
4562         wakeref = intel_runtime_pm_get(i915);
4563         intel_suspend_gt_powersave(i915);
4564
4565         flush_workqueue(i915->wq);
4566
4567         mutex_lock(&i915->drm.struct_mutex);
4568
4569         /*
4570          * We have to flush all the executing contexts to main memory so
4571          * that they can be saved in the hibernation image. To ensure the last
4572          * context image is coherent, we have to switch away from it. That
4573          * leaves the i915->kernel_context still active when
4574          * we actually suspend, and its image in memory may not match the GPU
4575          * state. Fortunately, the kernel_context is disposable and we do
4576          * not rely on its state.
4577          */
4578         if (!i915_terminally_wedged(&i915->gpu_error)) {
4579                 ret = i915_gem_switch_to_kernel_context(i915);
4580                 if (ret)
4581                         goto err_unlock;
4582
4583                 ret = i915_gem_wait_for_idle(i915,
4584                                              I915_WAIT_INTERRUPTIBLE |
4585                                              I915_WAIT_LOCKED |
4586                                              I915_WAIT_FOR_IDLE_BOOST,
4587                                              MAX_SCHEDULE_TIMEOUT);
4588                 if (ret && ret != -EIO)
4589                         goto err_unlock;
4590
4591                 assert_kernel_context_is_current(i915);
4592         }
4593         i915_retire_requests(i915); /* ensure we flush after wedging */
4594
4595         mutex_unlock(&i915->drm.struct_mutex);
4596         i915_reset_flush(i915);
4597
4598         drain_delayed_work(&i915->gt.retire_work);
4599
4600         /*
4601          * As the idle_work re-arms itself if it detects a race, play safe and
4602          * repeat the flush until it is definitely idle.
4603          */
4604         drain_delayed_work(&i915->gt.idle_work);
4605
4606         intel_uc_suspend(i915);
4607
4608         /*
4609          * Assert that we successfully flushed all the work and
4610          * reset the GPU back to its idle, low power state.
4611          */
4612         WARN_ON(i915->gt.awake);
4613         if (WARN_ON(!intel_engines_are_idle(i915)))
4614                 i915_gem_set_wedged(i915); /* no hope, discard everything */
4615
4616         intel_runtime_pm_put(i915, wakeref);
4617         return 0;
4618
4619 err_unlock:
4620         mutex_unlock(&i915->drm.struct_mutex);
4621         intel_runtime_pm_put(i915, wakeref);
4622         return ret;
4623 }
4624
4625 void i915_gem_suspend_late(struct drm_i915_private *i915)
4626 {
4627         struct drm_i915_gem_object *obj;
4628         struct list_head *phases[] = {
4629                 &i915->mm.unbound_list,
4630                 &i915->mm.bound_list,
4631                 NULL
4632         }, **phase;
4633
4634         /*
4635          * Neither the BIOS, ourselves nor any other kernel
4636          * expects the system to be in execlists mode on startup,
4637          * so we need to reset the GPU back to legacy mode. And the only
4638          * known way to disable logical contexts is through a GPU reset.
4639          *
4640          * So in order to leave the system in a known default configuration,
4641          * always reset the GPU upon unload and suspend. Afterwards we then
4642          * clean up the GEM state tracking, flushing off the requests and
4643          * leaving the system in a known idle state.
4644          *
4645          * Note that it is of the utmost importance that the GPU is idle and
4646          * all stray writes are flushed *before* we dismantle the backing
4647          * storage for the pinned objects.
4648          *
4649          * However, since we are uncertain that resetting the GPU on older
4650          * machines is a good idea, we don't - just in case it leaves the
4651          * machine in an unusable condition.
4652          */
4653
4654         mutex_lock(&i915->drm.struct_mutex);
4655         for (phase = phases; *phase; phase++) {
4656                 list_for_each_entry(obj, *phase, mm.link)
4657                         WARN_ON(i915_gem_object_set_to_gtt_domain(obj, false));
4658         }
4659         mutex_unlock(&i915->drm.struct_mutex);
4660
4661         intel_uc_sanitize(i915);
4662         i915_gem_sanitize(i915);
4663 }
4664
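/**
 * i915_gem_resume - restore GEM state after suspend
 * @i915: i915 device
 *
 * Re-establishes the GGTT mappings and fence registers, re-initialises the
 * hardware and switches back to the kernel context; if any step fails, the
 * GPU is declared wedged rather than aborting the resume.
 */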
4665 void i915_gem_resume(struct drm_i915_private *i915)
4666 {
4667         GEM_TRACE("\n");
4668
4669         WARN_ON(i915->gt.awake);
4670
4671         mutex_lock(&i915->drm.struct_mutex);
4672         intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
4673
4674         i915_gem_restore_gtt_mappings(i915);
4675         i915_gem_restore_fences(i915);
4676
4677         /*
4678          * As we didn't flush the kernel context before suspend, we cannot
4679          * guarantee that the context image is complete. So let's just reset
4680          * it and start again.
4681          */
4682         i915->gt.resume(i915);
4683
4684         if (i915_gem_init_hw(i915))
4685                 goto err_wedged;
4686
4687         intel_uc_resume(i915);
4688
4689         /* Always reload a context for powersaving. */
4690         if (i915_gem_switch_to_kernel_context(i915))
4691                 goto err_wedged;
4692
4693 out_unlock:
4694         intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
4695         mutex_unlock(&i915->drm.struct_mutex);
4696         return;
4697
4698 err_wedged:
4699         if (!i915_terminally_wedged(&i915->gpu_error)) {
4700                 DRM_ERROR("failed to re-initialize GPU, declaring wedged!\n");
4701                 i915_gem_set_wedged(i915);
4702         }
4703         goto out_unlock;
4704 }
4705
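/*
 * Program the display and GT arbiters to honour the bit-6 swizzling detected
 * for this DRAM configuration, presumably so that tiled accesses see the same
 * swizzle the memory controller applies; a no-op before gen5 or when no
 * swizzling is in use.
 */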
4706 void i915_gem_init_swizzling(struct drm_i915_private *dev_priv)
4707 {
4708         if (INTEL_GEN(dev_priv) < 5 ||
4709             dev_priv->mm.bit_6_swizzle_x == I915_BIT_6_SWIZZLE_NONE)
4710                 return;
4711
4712         I915_WRITE(DISP_ARB_CTL, I915_READ(DISP_ARB_CTL) |
4713                                  DISP_TILE_SURFACE_SWIZZLING);
4714
4715         if (IS_GEN(dev_priv, 5))
4716                 return;
4717
4718         I915_WRITE(TILECTL, I915_READ(TILECTL) | TILECTL_SWZCTL);
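
        /*
         * ARB_MODE and GAMTARBMODE are "masked" registers: the upper 16 bits
         * of a write select which of the lower 16 bits are updated, so
         * _MASKED_BIT_ENABLE(bit) writes both the bit and its write-enable.
         */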
4719         if (IS_GEN(dev_priv, 6))
4720                 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_SNB));
4721         else if (IS_GEN(dev_priv, 7))
4722                 I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_IVB));
4723         else if (IS_GEN(dev_priv, 8))
4724                 I915_WRITE(GAMTARBMODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_BDW));
4725         else
4726                 BUG();
4727 }
4728
4729 static void init_unused_ring(struct drm_i915_private *dev_priv, u32 base)
4730 {
4731         I915_WRITE(RING_CTL(base), 0);
4732         I915_WRITE(RING_HEAD(base), 0);
4733         I915_WRITE(RING_TAIL(base), 0);
4734         I915_WRITE(RING_START(base), 0);
4735 }
4736
4737 static void init_unused_rings(struct drm_i915_private *dev_priv)
4738 {
4739         if (IS_I830(dev_priv)) {
4740                 init_unused_ring(dev_priv, PRB1_BASE);
4741                 init_unused_ring(dev_priv, SRB0_BASE);
4742                 init_unused_ring(dev_priv, SRB1_BASE);
4743                 init_unused_ring(dev_priv, SRB2_BASE);
4744                 init_unused_ring(dev_priv, SRB3_BASE);
4745         } else if (IS_GEN(dev_priv, 2)) {
4746                 init_unused_ring(dev_priv, SRB0_BASE);
4747                 init_unused_ring(dev_priv, SRB1_BASE);
4748         } else if (IS_GEN(dev_priv, 3)) {
4749                 init_unused_ring(dev_priv, PRB1_BASE);
4750                 init_unused_ring(dev_priv, PRB2_BASE);
4751         }
4752 }
4753
4754 static int __i915_gem_restart_engines(void *data)
4755 {
4756         struct drm_i915_private *i915 = data;
4757         struct intel_engine_cs *engine;
4758         enum intel_engine_id id;
4759         int err;
4760
4761         for_each_engine(engine, i915, id) {
4762                 err = engine->init_hw(engine);
4763                 if (err) {
4764                         DRM_ERROR("Failed to restart %s (%d)\n",
4765                                   engine->name, err);
4766                         return err;
4767                 }
4768         }
4769
4770         return 0;
4771 }
4772
4773 int i915_gem_init_hw(struct drm_i915_private *dev_priv)
4774 {
4775         int ret;
4776
4777         dev_priv->gt.last_init_time = ktime_get();
4778
4779         /* Double layer security blanket, see i915_gem_init() */
4780         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
4781
4782         if (HAS_EDRAM(dev_priv) && INTEL_GEN(dev_priv) < 9)
4783                 I915_WRITE(HSW_IDICR, I915_READ(HSW_IDICR) | IDIHASHMSK(0xf));
4784
4785         if (IS_HASWELL(dev_priv))
4786                 I915_WRITE(MI_PREDICATE_RESULT_2, IS_HSW_GT3(dev_priv) ?
4787                            LOWER_SLICE_ENABLED : LOWER_SLICE_DISABLED);
4788
4789         /* Apply the GT workarounds... */
4790         intel_gt_apply_workarounds(dev_priv);
4791         /* ...and determine whether they are sticking. */
4792         intel_gt_verify_workarounds(dev_priv, "init");
4793
4794         i915_gem_init_swizzling(dev_priv);
4795
4796         /*
4797          * At least 830 can leave some of the unused rings
4798          * "active" (i.e. head != tail) after resume which
4799          * will prevent c3 entry. Make sure all unused rings
4800          * are totally idle.
4801          */
4802         init_unused_rings(dev_priv);
4803
4804         BUG_ON(!dev_priv->kernel_context);
4805         if (i915_terminally_wedged(&dev_priv->gpu_error)) {
4806                 ret = -EIO;
4807                 goto out;
4808         }
4809
4810         ret = i915_ppgtt_init_hw(dev_priv);
4811         if (ret) {
4812                 DRM_ERROR("Enabling PPGTT failed (%d)\n", ret);
4813                 goto out;
4814         }
4815
4816         ret = intel_wopcm_init_hw(&dev_priv->wopcm);
4817         if (ret) {
4818                 DRM_ERROR("Enabling WOPCM failed (%d)\n", ret);
4819                 goto out;
4820         }
4821
4822         /* We can't enable contexts until all firmware is loaded */
4823         ret = intel_uc_init_hw(dev_priv);
4824         if (ret) {
4825                 DRM_ERROR("Enabling uc failed (%d)\n", ret);
4826                 goto out;
4827         }
4828
4829         intel_mocs_init_l3cc_table(dev_priv);
4830
4831         /* Only when the HW is re-initialised, can we replay the requests */
4832         ret = __i915_gem_restart_engines(dev_priv);
4833         if (ret)
4834                 goto cleanup_uc;
4835
4836         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
4837
4838         return 0;
4839
4840 cleanup_uc:
4841         intel_uc_fini_hw(dev_priv);
4842 out:
4843         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
4844
4845         return ret;
4846 }
4847
4848 static int __intel_engines_record_defaults(struct drm_i915_private *i915)
4849 {
4850         struct i915_gem_context *ctx;
4851         struct intel_engine_cs *engine;
4852         enum intel_engine_id id;
4853         int err;
4854
4855         /*
4856          * As we reset the GPU during very early sanitisation, the current
4857          * register state on the GPU should reflect its default values.
4858          * We load a context onto the hw (with restore-inhibit), then switch
4859          * over to a second context to save that default register state. We
4860          * can then prime every new context with that state so they all start
4861          * from the same default HW values.
4862          */
4863
4864         ctx = i915_gem_context_create_kernel(i915, 0);
4865         if (IS_ERR(ctx))
4866                 return PTR_ERR(ctx);
4867
4868         for_each_engine(engine, i915, id) {
4869                 struct i915_request *rq;
4870
4871                 rq = i915_request_alloc(engine, ctx);
4872                 if (IS_ERR(rq)) {
4873                         err = PTR_ERR(rq);
4874                         goto out_ctx;
4875                 }
4876
4877                 err = 0;
4878                 if (engine->init_context)
4879                         err = engine->init_context(rq);
4880
4881                 i915_request_add(rq);
4882                 if (err)
4883                         goto err_active;
4884         }
4885
4886         err = i915_gem_switch_to_kernel_context(i915);
4887         if (err)
4888                 goto err_active;
4889
4890         if (i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED, HZ / 5)) {
4891                 i915_gem_set_wedged(i915);
4892                 err = -EIO; /* Caller will declare us wedged */
4893                 goto err_active;
4894         }
4895
4896         assert_kernel_context_is_current(i915);
4897
4898         /*
4899          * Immediately park the GPU so that we enable powersaving and
4900          * treat it as idle. The next time we issue a request, we will
4901          * unpark and start using the engine->pinned_default_state, otherwise
4902          * it is in limbo and an early reset may fail.
4903          */
4904         __i915_gem_park(i915);
4905
4906         for_each_engine(engine, i915, id) {
4907                 struct i915_vma *state;
4908                 void *vaddr;
4909
4910                 GEM_BUG_ON(to_intel_context(ctx, engine)->pin_count);
4911
4912                 state = to_intel_context(ctx, engine)->state;
4913                 if (!state)
4914                         continue;
4915
4916                 /*
4917                  * As we will hold a reference to the logical state, it will
4918                  * not be torn down with the context, and importantly the
4919                  * object will hold onto its vma (making it possible for a
4920                  * stray GTT write to corrupt our defaults). Unmap the vma
4921                  * from the GTT to prevent such accidents and reclaim the
4922                  * space.
4923                  */
4924                 err = i915_vma_unbind(state);
4925                 if (err)
4926                         goto err_active;
4927
4928                 err = i915_gem_object_set_to_cpu_domain(state->obj, false);
4929                 if (err)
4930                         goto err_active;
4931
4932                 engine->default_state = i915_gem_object_get(state->obj);
4933
4934                 /* Check we can acquire the image of the context state */
4935                 vaddr = i915_gem_object_pin_map(engine->default_state,
4936                                                 I915_MAP_FORCE_WB);
4937                 if (IS_ERR(vaddr)) {
4938                         err = PTR_ERR(vaddr);
4939                         goto err_active;
4940                 }
4941
4942                 i915_gem_object_unpin_map(engine->default_state);
4943         }
4944
4945         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
4946                 unsigned int found = intel_engines_has_context_isolation(i915);
4947
4948                 /*
4949                  * Make sure that classes with multiple engine instances all
4950                  * share the same basic configuration.
4951                  */
4952                 for_each_engine(engine, i915, id) {
4953                         unsigned int bit = BIT(engine->uabi_class);
4954                         unsigned int expected = engine->default_state ? bit : 0;
4955
4956                         if ((found & bit) != expected) {
4957                                 DRM_ERROR("mismatching default context state for class %d on engine %s\n",
4958                                           engine->uabi_class, engine->name);
4959                         }
4960                 }
4961         }
4962
4963 out_ctx:
4964         i915_gem_context_set_closed(ctx);
4965         i915_gem_context_put(ctx);
4966         return err;
4967
4968 err_active:
4969         /*
4970          * If we have to abandon now, we expect the engines to be idle
4971          * and ready to be torn-down. First try to flush any remaining
4972          * request, ensure we are pointing at the kernel context and
4973          * then remove it.
4974          */
4975         if (WARN_ON(i915_gem_switch_to_kernel_context(i915)))
4976                 goto out_ctx;
4977
4978         if (WARN_ON(i915_gem_wait_for_idle(i915,
4979                                            I915_WAIT_LOCKED,
4980                                            MAX_SCHEDULE_TIMEOUT)))
4981                 goto out_ctx;
4982
4983         i915_gem_contexts_lost(i915);
4984         goto out_ctx;
4985 }
4986
4987 static int
4988 i915_gem_init_scratch(struct drm_i915_private *i915, unsigned int size)
4989 {
4990         struct drm_i915_gem_object *obj;
4991         struct i915_vma *vma;
4992         int ret;
4993
4994         obj = i915_gem_object_create_stolen(i915, size);
4995         if (!obj)
4996                 obj = i915_gem_object_create_internal(i915, size);
4997         if (IS_ERR(obj)) {
4998                 DRM_ERROR("Failed to allocate scratch page\n");
4999                 return PTR_ERR(obj);
5000         }
5001
5002         vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
5003         if (IS_ERR(vma)) {
5004                 ret = PTR_ERR(vma);
5005                 goto err_unref;
5006         }
5007
5008         ret = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
5009         if (ret)
5010                 goto err_unref;
5011
5012         i915->gt.scratch = vma;
5013         return 0;
5014
5015 err_unref:
5016         i915_gem_object_put(obj);
5017         return ret;
5018 }
5019
5020 static void i915_gem_fini_scratch(struct drm_i915_private *i915)
5021 {
5022         i915_vma_unpin_and_release(&i915->gt.scratch, 0);
5023 }
5024
5025 int i915_gem_init(struct drm_i915_private *dev_priv)
5026 {
5027         int ret;
5028
5029         /* We need to fall back to 4K pages if the host doesn't support huge gtt. */
5030         if (intel_vgpu_active(dev_priv) && !intel_vgpu_has_huge_gtt(dev_priv))
5031                 mkwrite_device_info(dev_priv)->page_sizes =
5032                         I915_GTT_PAGE_SIZE_4K;
5033
5034         dev_priv->mm.unordered_timeline = dma_fence_context_alloc(1);
5035
5036         if (HAS_LOGICAL_RING_CONTEXTS(dev_priv)) {
5037                 dev_priv->gt.resume = intel_lr_context_resume;
5038                 dev_priv->gt.cleanup_engine = intel_logical_ring_cleanup;
5039         } else {
5040                 dev_priv->gt.resume = intel_legacy_submission_resume;
5041                 dev_priv->gt.cleanup_engine = intel_engine_cleanup;
5042         }
5043
5044         i915_timelines_init(dev_priv);
5045
5046         ret = i915_gem_init_userptr(dev_priv);
5047         if (ret)
5048                 return ret;
5049
5050         ret = intel_uc_init_misc(dev_priv);
5051         if (ret)
5052                 return ret;
5053
5054         ret = intel_wopcm_init(&dev_priv->wopcm);
5055         if (ret)
5056                 goto err_uc_misc;
5057
5058         /* This is just a security blanket to placate dragons.
5059          * On some systems, we very sporadically observe that the first TLBs
5060          * used by the CS may be stale, despite us poking the TLB reset. If
5061          * we hold the forcewake during initialisation these problems
5062          * just magically go away.
5063          */
5064         mutex_lock(&dev_priv->drm.struct_mutex);
5065         intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
5066
5067         ret = i915_gem_init_ggtt(dev_priv);
5068         if (ret) {
5069                 GEM_BUG_ON(ret == -EIO);
5070                 goto err_unlock;
5071         }
5072
5073         ret = i915_gem_init_scratch(dev_priv,
5074                                     IS_GEN(dev_priv, 2) ? SZ_256K : PAGE_SIZE);
5075         if (ret) {
5076                 GEM_BUG_ON(ret == -EIO);
5077                 goto err_ggtt;
5078         }
5079
5080         ret = i915_gem_contexts_init(dev_priv);
5081         if (ret) {
5082                 GEM_BUG_ON(ret == -EIO);
5083                 goto err_scratch;
5084         }
5085
5086         ret = intel_engines_init(dev_priv);
5087         if (ret) {
5088                 GEM_BUG_ON(ret == -EIO);
5089                 goto err_context;
5090         }
5091
5092         intel_init_gt_powersave(dev_priv);
5093
5094         ret = intel_uc_init(dev_priv);
5095         if (ret)
5096                 goto err_pm;
5097
5098         ret = i915_gem_init_hw(dev_priv);
5099         if (ret)
5100                 goto err_uc_init;
5101
5102         /*
5103          * Despite its name, intel_init_clock_gating applies both display
5104          * clock gating workarounds and GT mmio workarounds, plus the occasional
5105          * GT power context workaround. Worse, sometimes it includes a context
5106          * register workaround which we need to apply before we record the
5107          * default HW state for all contexts.
5108          *
5109          * FIXME: break up the workarounds and apply them at the right time!
5110          */
5111         intel_init_clock_gating(dev_priv);
5112
5113         ret = __intel_engines_record_defaults(dev_priv);
5114         if (ret)
5115                 goto err_init_hw;
5116
5117         if (i915_inject_load_failure()) {
5118                 ret = -ENODEV;
5119                 goto err_init_hw;
5120         }
5121
5122         if (i915_inject_load_failure()) {
5123                 ret = -EIO;
5124                 goto err_init_hw;
5125         }
5126
5127         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5128         mutex_unlock(&dev_priv->drm.struct_mutex);
5129
5130         return 0;
5131
5132         /*
5133          * Unwinding is complicated by the fact that we want -EIO to mean
5134          * "disable GPU submission but keep KMS alive". We want to mark the
5135          * HW as irreversibly wedged, but keep enough state around that the
5136          * driver doesn't explode during runtime.
5137          */
5138 err_init_hw:
5139         mutex_unlock(&dev_priv->drm.struct_mutex);
5140
5141         WARN_ON(i915_gem_suspend(dev_priv));
5142         i915_gem_suspend_late(dev_priv);
5143
5144         i915_gem_drain_workqueue(dev_priv);
5145
5146         mutex_lock(&dev_priv->drm.struct_mutex);
5147         intel_uc_fini_hw(dev_priv);
5148 err_uc_init:
5149         intel_uc_fini(dev_priv);
5150 err_pm:
5151         if (ret != -EIO) {
5152                 intel_cleanup_gt_powersave(dev_priv);
5153                 i915_gem_cleanup_engines(dev_priv);
5154         }
5155 err_context:
5156         if (ret != -EIO)
5157                 i915_gem_contexts_fini(dev_priv);
5158 err_scratch:
5159         i915_gem_fini_scratch(dev_priv);
5160 err_ggtt:
5161 err_unlock:
5162         intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
5163         mutex_unlock(&dev_priv->drm.struct_mutex);
5164
5165 err_uc_misc:
5166         intel_uc_fini_misc(dev_priv);
5167
5168         if (ret != -EIO) {
5169                 i915_gem_cleanup_userptr(dev_priv);
5170                 i915_timelines_fini(dev_priv);
5171         }
5172
5173         if (ret == -EIO) {
5174                 mutex_lock(&dev_priv->drm.struct_mutex);
5175
5176                 /*
5177                  * Allow engine initialisation to fail by marking the GPU as
5178                  * wedged. But we only want to do this where the GPU is angry,
5179          * for all other failures, such as an allocation failure, bail.
5180                  */
5181                 if (!i915_terminally_wedged(&dev_priv->gpu_error)) {
5182                         i915_load_error(dev_priv,
5183                                         "Failed to initialize GPU, declaring it wedged!\n");
5184                         i915_gem_set_wedged(dev_priv);
5185                 }
5186
5187                 /* Minimal basic recovery for KMS */
5188                 ret = i915_ggtt_enable_hw(dev_priv);
5189                 i915_gem_restore_gtt_mappings(dev_priv);
5190                 i915_gem_restore_fences(dev_priv);
5191                 intel_init_clock_gating(dev_priv);
5192
5193                 mutex_unlock(&dev_priv->drm.struct_mutex);
5194         }
5195
5196         i915_gem_drain_freed_objects(dev_priv);
5197         return ret;
5198 }
5199
5200 void i915_gem_fini(struct drm_i915_private *dev_priv)
5201 {
5202         i915_gem_suspend_late(dev_priv);
5203         intel_disable_gt_powersave(dev_priv);
5204
5205         /* Flush any outstanding unpin_work. */
5206         i915_gem_drain_workqueue(dev_priv);
5207
5208         mutex_lock(&dev_priv->drm.struct_mutex);
5209         intel_uc_fini_hw(dev_priv);
5210         intel_uc_fini(dev_priv);
5211         i915_gem_cleanup_engines(dev_priv);
5212         i915_gem_contexts_fini(dev_priv);
5213         i915_gem_fini_scratch(dev_priv);
5214         mutex_unlock(&dev_priv->drm.struct_mutex);
5215
5216         intel_wa_list_free(&dev_priv->gt_wa_list);
5217
5218         intel_cleanup_gt_powersave(dev_priv);
5219
5220         intel_uc_fini_misc(dev_priv);
5221         i915_gem_cleanup_userptr(dev_priv);
5222         i915_timelines_fini(dev_priv);
5223
5224         i915_gem_drain_freed_objects(dev_priv);
5225
5226         WARN_ON(!list_empty(&dev_priv->contexts.list));
5227 }
5228
5229 void i915_gem_init_mmio(struct drm_i915_private *i915)
5230 {
5231         i915_gem_sanitize(i915);
5232 }
5233
5234 void
5235 i915_gem_cleanup_engines(struct drm_i915_private *dev_priv)
5236 {
5237         struct intel_engine_cs *engine;
5238         enum intel_engine_id id;
5239
5240         for_each_engine(engine, dev_priv, id)
5241                 dev_priv->gt.cleanup_engine(engine);
5242 }
5243
5244 void
5245 i915_gem_load_init_fences(struct drm_i915_private *dev_priv)
5246 {
5247         int i;
5248
5249         if (INTEL_GEN(dev_priv) >= 7 && !IS_VALLEYVIEW(dev_priv) &&
5250             !IS_CHERRYVIEW(dev_priv))
5251                 dev_priv->num_fence_regs = 32;
5252         else if (INTEL_GEN(dev_priv) >= 4 ||
5253                  IS_I945G(dev_priv) || IS_I945GM(dev_priv) ||
5254                  IS_G33(dev_priv) || IS_PINEVIEW(dev_priv))
5255                 dev_priv->num_fence_regs = 16;
5256         else
5257                 dev_priv->num_fence_regs = 8;
5258
5259         if (intel_vgpu_active(dev_priv))
5260                 dev_priv->num_fence_regs =
5261                                 I915_READ(vgtif_reg(avail_rs.fence_num));
5262
5263         /* Initialize fence registers to zero */
5264         for (i = 0; i < dev_priv->num_fence_regs; i++) {
5265                 struct drm_i915_fence_reg *fence = &dev_priv->fence_regs[i];
5266
5267                 fence->i915 = dev_priv;
5268                 fence->id = i;
5269                 list_add_tail(&fence->link, &dev_priv->mm.fence_list);
5270         }
5271         i915_gem_restore_fences(dev_priv);
5272
5273         i915_gem_detect_bit_6_swizzle(dev_priv);
5274 }
5275
5276 static void i915_gem_init__mm(struct drm_i915_private *i915)
5277 {
5278         spin_lock_init(&i915->mm.object_stat_lock);
5279         spin_lock_init(&i915->mm.obj_lock);
5280         spin_lock_init(&i915->mm.free_lock);
5281
5282         init_llist_head(&i915->mm.free_list);
5283
5284         INIT_LIST_HEAD(&i915->mm.unbound_list);
5285         INIT_LIST_HEAD(&i915->mm.bound_list);
5286         INIT_LIST_HEAD(&i915->mm.fence_list);
5287         INIT_LIST_HEAD(&i915->mm.userfault_list);
5288
5289         INIT_WORK(&i915->mm.free_work, __i915_gem_free_work);
5290 }
5291
5292 int i915_gem_init_early(struct drm_i915_private *dev_priv)
5293 {
5294         int err = -ENOMEM;
5295
5296         dev_priv->objects = KMEM_CACHE(drm_i915_gem_object, SLAB_HWCACHE_ALIGN);
5297         if (!dev_priv->objects)
5298                 goto err_out;
5299
5300         dev_priv->vmas = KMEM_CACHE(i915_vma, SLAB_HWCACHE_ALIGN);
5301         if (!dev_priv->vmas)
5302                 goto err_objects;
5303
5304         dev_priv->luts = KMEM_CACHE(i915_lut_handle, 0);
5305         if (!dev_priv->luts)
5306                 goto err_vmas;
5307
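        /*
         * Request lookup can be performed under RCU alone, so the request
         * slab is SLAB_TYPESAFE_BY_RCU: a freed request may be recycled for
         * a new request before the grace period ends, and RCU readers must
         * therefore re-validate the request after acquiring their reference.
         */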
5308         dev_priv->requests = KMEM_CACHE(i915_request,
5309                                         SLAB_HWCACHE_ALIGN |
5310                                         SLAB_RECLAIM_ACCOUNT |
5311                                         SLAB_TYPESAFE_BY_RCU);
5312         if (!dev_priv->requests)
5313                 goto err_luts;
5314
5315         dev_priv->dependencies = KMEM_CACHE(i915_dependency,
5316                                             SLAB_HWCACHE_ALIGN |
5317                                             SLAB_RECLAIM_ACCOUNT);
5318         if (!dev_priv->dependencies)
5319                 goto err_requests;
5320
5321         dev_priv->priorities = KMEM_CACHE(i915_priolist, SLAB_HWCACHE_ALIGN);
5322         if (!dev_priv->priorities)
5323                 goto err_dependencies;
5324
5325         INIT_LIST_HEAD(&dev_priv->gt.active_rings);
5326         INIT_LIST_HEAD(&dev_priv->gt.closed_vma);
5327
5328         i915_gem_init__mm(dev_priv);
5329
5330         INIT_DELAYED_WORK(&dev_priv->gt.retire_work,
5331                           i915_gem_retire_work_handler);
5332         INIT_DELAYED_WORK(&dev_priv->gt.idle_work,
5333                           i915_gem_idle_work_handler);
5334         init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
5335         init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
5336         mutex_init(&dev_priv->gpu_error.wedge_mutex);
5337
5338         atomic_set(&dev_priv->mm.bsd_engine_dispatch_index, 0);
5339
5340         spin_lock_init(&dev_priv->fb_tracking.lock);
5341
5342         err = i915_gemfs_init(dev_priv);
5343         if (err)
5344                 DRM_NOTE("Unable to create a private tmpfs mount, hugepage support will be disabled(%d).\n", err);
5345
5346         return 0;
5347
5348 err_dependencies:
5349         kmem_cache_destroy(dev_priv->dependencies);
5350 err_requests:
5351         kmem_cache_destroy(dev_priv->requests);
5352 err_luts:
5353         kmem_cache_destroy(dev_priv->luts);
5354 err_vmas:
5355         kmem_cache_destroy(dev_priv->vmas);
5356 err_objects:
5357         kmem_cache_destroy(dev_priv->objects);
5358 err_out:
5359         return err;
5360 }
5361
5362 void i915_gem_cleanup_early(struct drm_i915_private *dev_priv)
5363 {
5364         i915_gem_drain_freed_objects(dev_priv);
5365         GEM_BUG_ON(!llist_empty(&dev_priv->mm.free_list));
5366         GEM_BUG_ON(atomic_read(&dev_priv->mm.free_count));
5367         WARN_ON(dev_priv->mm.object_count);
5368
5369         kmem_cache_destroy(dev_priv->priorities);
5370         kmem_cache_destroy(dev_priv->dependencies);
5371         kmem_cache_destroy(dev_priv->requests);
5372         kmem_cache_destroy(dev_priv->luts);
5373         kmem_cache_destroy(dev_priv->vmas);
5374         kmem_cache_destroy(dev_priv->objects);
5375
5376         /* And ensure that our TYPESAFE_BY_RCU slabs are truly destroyed */
5377         rcu_barrier();
5378
5379         i915_gemfs_fini(dev_priv);
5380 }
5381
5382 int i915_gem_freeze(struct drm_i915_private *dev_priv)
5383 {
5384         /* Discard all purgeable objects, let userspace recover those as
5385          * required after resuming.
5386          */
5387         i915_gem_shrink_all(dev_priv);
5388
5389         return 0;
5390 }
5391
5392 int i915_gem_freeze_late(struct drm_i915_private *i915)
5393 {
5394         struct drm_i915_gem_object *obj;
5395         struct list_head *phases[] = {
5396                 &i915->mm.unbound_list,
5397                 &i915->mm.bound_list,
5398                 NULL
5399         }, **phase;
5400
5401         /*
5402          * Called just before we write the hibernation image.
5403          *
5404          * We need to update the domain tracking to reflect that the CPU
5405          * will be accessing all the pages to create and restore from the
5406          * hibernation, and so upon restoration those pages will be in the
5407          * CPU domain.
5408          *
5409          * To make sure the hibernation image contains the latest state,
5410          * we update that state just before writing out the image.
5411          *
5412          * To try and reduce the hibernation image, we manually shrink
5413          * the objects as well, see i915_gem_freeze()
5414          */
5415
5416         i915_gem_shrink(i915, -1UL, NULL, I915_SHRINK_UNBOUND);
5417         i915_gem_drain_freed_objects(i915);
5418
5419         mutex_lock(&i915->drm.struct_mutex);
5420         for (phase = phases; *phase; phase++) {
5421                 list_for_each_entry(obj, *phase, mm.link)
5422                         WARN_ON(i915_gem_object_set_to_cpu_domain(obj, true));
5423         }
5424         mutex_unlock(&i915->drm.struct_mutex);
5425
5426         return 0;
5427 }
5428
5429 void i915_gem_release(struct drm_device *dev, struct drm_file *file)
5430 {
5431         struct drm_i915_file_private *file_priv = file->driver_priv;
5432         struct i915_request *request;
5433
5434         /* Clean up our request list when the client is going away, so that
5435          * later retire_requests won't dereference our soon-to-be-gone
5436          * file_priv.
5437          */
5438         spin_lock(&file_priv->mm.lock);
5439         list_for_each_entry(request, &file_priv->mm.request_list, client_link)
5440                 request->file_priv = NULL;
5441         spin_unlock(&file_priv->mm.lock);
5442 }
5443
5444 int i915_gem_open(struct drm_i915_private *i915, struct drm_file *file)
5445 {
5446         struct drm_i915_file_private *file_priv;
5447         int ret;
5448
5449         DRM_DEBUG("\n");
5450
5451         file_priv = kzalloc(sizeof(*file_priv), GFP_KERNEL);
5452         if (!file_priv)
5453                 return -ENOMEM;
5454
5455         file->driver_priv = file_priv;
5456         file_priv->dev_priv = i915;
5457         file_priv->file = file;
5458
5459         spin_lock_init(&file_priv->mm.lock);
5460         INIT_LIST_HEAD(&file_priv->mm.request_list);
5461
5462         file_priv->bsd_engine = -1;
5463         file_priv->hang_timestamp = jiffies;
5464
5465         ret = i915_gem_context_open(i915, file);
5466         if (ret)
5467                 kfree(file_priv);
5468
5469         return ret;
5470 }
5471
5472 /**
5473  * i915_gem_track_fb - update frontbuffer tracking
5474  * @old: current GEM buffer for the frontbuffer slots
5475  * @new: new GEM buffer for the frontbuffer slots
5476  * @frontbuffer_bits: bitmask of frontbuffer slots
5477  *
5478  * This updates the frontbuffer tracking bits @frontbuffer_bits by clearing them
5479  * from @old and setting them in @new. Both @old and @new can be NULL.
5480  */
5481 void i915_gem_track_fb(struct drm_i915_gem_object *old,
5482                        struct drm_i915_gem_object *new,
5483                        unsigned frontbuffer_bits)
5484 {
5485         /* Control of individual bits within the mask is guarded by
5486          * the owning plane->mutex, i.e. we can never see concurrent
5487          * manipulation of individual bits. But since the bitfield as a whole
5488          * is updated using RMW, we need to use atomics in order to update
5489          * the bits.
5490          */
5491         BUILD_BUG_ON(INTEL_FRONTBUFFER_BITS_PER_PIPE * I915_MAX_PIPES >
5492                      BITS_PER_TYPE(atomic_t));
5493
5494         if (old) {
5495                 WARN_ON(!(atomic_read(&old->frontbuffer_bits) & frontbuffer_bits));
5496                 atomic_andnot(frontbuffer_bits, &old->frontbuffer_bits);
5497         }
5498
5499         if (new) {
5500                 WARN_ON(atomic_read(&new->frontbuffer_bits) & frontbuffer_bits);
5501                 atomic_or(frontbuffer_bits, &new->frontbuffer_bits);
5502         }
5503 }
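
/*
 * A minimal usage sketch (editor's illustration; the helper below is
 * hypothetical and not part of the driver): when a plane is switched from one
 * framebuffer to another, its frontbuffer bits migrate from the old backing
 * object to the new one.
 */
static void __maybe_unused
example_switch_plane_fb(struct drm_i915_gem_object *old_obj,
                        struct drm_i915_gem_object *new_obj,
                        unsigned int plane_frontbuffer_bits)
{
        i915_gem_track_fb(old_obj, new_obj, plane_frontbuffer_bits);
}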
5504
5505 /* Allocate a new GEM object and fill it with the supplied data */
5506 struct drm_i915_gem_object *
5507 i915_gem_object_create_from_data(struct drm_i915_private *dev_priv,
5508                                  const void *data, size_t size)
5509 {
5510         struct drm_i915_gem_object *obj;
5511         struct file *file;
5512         size_t offset;
5513         int err;
5514
5515         obj = i915_gem_object_create(dev_priv, round_up(size, PAGE_SIZE));
5516         if (IS_ERR(obj))
5517                 return obj;
5518
5519         GEM_BUG_ON(obj->write_domain != I915_GEM_DOMAIN_CPU);
5520
5521         file = obj->base.filp;
5522         offset = 0;
5523         do {
5524                 unsigned int len = min_t(typeof(size), size, PAGE_SIZE);
5525                 struct page *page;
5526                 void *pgdata, *vaddr;
5527
5528                 err = pagecache_write_begin(file, file->f_mapping,
5529                                             offset, len, 0,
5530                                             &page, &pgdata);
5531                 if (err < 0)
5532                         goto fail;
5533
5534                 vaddr = kmap(page);
5535                 memcpy(vaddr, data, len);
5536                 kunmap(page);
5537
5538                 err = pagecache_write_end(file, file->f_mapping,
5539                                           offset, len, len,
5540                                           page, pgdata);
5541                 if (err < 0)
5542                         goto fail;
5543
5544                 size -= len;
5545                 data += len;
5546                 offset += len;
5547         } while (size);
5548
5549         return obj;
5550
5551 fail:
5552         i915_gem_object_put(obj);
5553         return ERR_PTR(err);
5554 }
5555
5556 struct scatterlist *
5557 i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
5558                        unsigned int n,
5559                        unsigned int *offset)
5560 {
5561         struct i915_gem_object_page_iter *iter = &obj->mm.get_page;
5562         struct scatterlist *sg;
5563         unsigned int idx, count;
5564
5565         might_sleep();
5566         GEM_BUG_ON(n >= obj->base.size >> PAGE_SHIFT);
5567         GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
5568
5569         /* As we iterate forward through the sg, we record each entry in a
5570          * radixtree for quick repeated (backwards) lookups. If we have seen
5571          * this index previously, we will have an entry for it.
5572          *
5573          * Initial lookup is O(N), but this is amortized to O(1) for
5574          * sequential page access (where each new request is consecutive
5575          * to the previous one). Repeated lookups are O(lg(obj->base.size)),
5576          * i.e. O(1) with a large constant!
5577          */
5578         if (n < READ_ONCE(iter->sg_idx))
5579                 goto lookup;
5580
5581         mutex_lock(&iter->lock);
5582
5583         /* We prefer to reuse the last sg so that repeated lookup of this
5584          * (or the subsequent) sg are fast - comparing against the last
5585          * sg is faster than going through the radixtree.
5586          */
5587
5588         sg = iter->sg_pos;
5589         idx = iter->sg_idx;
5590         count = __sg_page_count(sg);
5591
5592         while (idx + count <= n) {
5593                 void *entry;
5594                 unsigned long i;
5595                 int ret;
5596
5597                 /* If we cannot allocate and insert this entry, or the
5598                  * individual pages from this range, cancel updating the
5599                  * sg_idx so that on this lookup we are forced to linearly
5600                  * scan onwards, but on future lookups we will try the
5601                  * insertion again (in which case we need to be careful of
5602                  * the error return reporting that we have already inserted
5603                  * this index).
5604                  */
5605                 ret = radix_tree_insert(&iter->radix, idx, sg);
5606                 if (ret && ret != -EEXIST)
5607                         goto scan;
5608
5609                 entry = xa_mk_value(idx);
5610                 for (i = 1; i < count; i++) {
5611                         ret = radix_tree_insert(&iter->radix, idx + i, entry);
5612                         if (ret && ret != -EEXIST)
5613                                 goto scan;
5614                 }
5615
5616                 idx += count;
5617                 sg = ____sg_next(sg);
5618                 count = __sg_page_count(sg);
5619         }
5620
5621 scan:
5622         iter->sg_pos = sg;
5623         iter->sg_idx = idx;
5624
5625         mutex_unlock(&iter->lock);
5626
5627         if (unlikely(n < idx)) /* insertion completed by another thread */
5628                 goto lookup;
5629
5630         /* In case we failed to insert the entry into the radixtree, we need
5631          * to look beyond the current sg.
5632          */
5633         while (idx + count <= n) {
5634                 idx += count;
5635                 sg = ____sg_next(sg);
5636                 count = __sg_page_count(sg);
5637         }
5638
5639         *offset = n - idx;
5640         return sg;
5641
5642 lookup:
5643         rcu_read_lock();
5644
5645         sg = radix_tree_lookup(&iter->radix, n);
5646         GEM_BUG_ON(!sg);
5647
5648         /* If this index is in the middle of a multi-page sg entry,
5649          * the radix tree will contain a value entry that points
5650          * to the start of that range. We will return the pointer to
5651          * the base page and the offset of this page within the
5652          * sg entry's range.
5653          */
5654         *offset = 0;
5655         if (unlikely(xa_is_value(sg))) {
5656                 unsigned long base = xa_to_value(sg);
5657
5658                 sg = radix_tree_lookup(&iter->radix, base);
5659                 GEM_BUG_ON(!sg);
5660
5661                 *offset = n - base;
5662         }
5663
5664         rcu_read_unlock();
5665
5666         return sg;
5667 }
5668
5669 struct page *
5670 i915_gem_object_get_page(struct drm_i915_gem_object *obj, unsigned int n)
5671 {
5672         struct scatterlist *sg;
5673         unsigned int offset;
5674
5675         GEM_BUG_ON(!i915_gem_object_has_struct_page(obj));
5676
5677         sg = i915_gem_object_get_sg(obj, n, &offset);
5678         return nth_page(sg_page(sg), offset);
5679 }
5680
5681 /* Like i915_gem_object_get_page(), but mark the returned page dirty */
5682 struct page *
5683 i915_gem_object_get_dirty_page(struct drm_i915_gem_object *obj,
5684                                unsigned int n)
5685 {
5686         struct page *page;
5687
5688         page = i915_gem_object_get_page(obj, n);
5689         if (!obj->mm.dirty)
5690                 set_page_dirty(page);
5691
5692         return page;
5693 }
5694
5695 dma_addr_t
5696 i915_gem_object_get_dma_address(struct drm_i915_gem_object *obj,
5697                                 unsigned long n)
5698 {
5699         struct scatterlist *sg;
5700         unsigned int offset;
5701
5702         sg = i915_gem_object_get_sg(obj, n, &offset);
5703         return sg_dma_address(sg) + (offset << PAGE_SHIFT);
5704 }
5705
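/*
 * A minimal usage sketch (editor's illustration; the helper is hypothetical
 * and not part of the driver): walk every backing page of an object that is
 * backed by struct pages. The pages must already be pinned, as asserted by
 * i915_gem_object_get_sg().
 */
static void __maybe_unused
example_for_each_page(struct drm_i915_gem_object *obj)
{
        unsigned int n;

        for (n = 0; n < obj->base.size >> PAGE_SHIFT; n++) {
                struct page *page = i915_gem_object_get_page(obj, n);

                /* e.g. prefetch or checksum the page here */
                (void)page;
        }
}
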
5706 int i915_gem_object_attach_phys(struct drm_i915_gem_object *obj, int align)
5707 {
5708         struct sg_table *pages;
5709         int err;
5710
5711         if (align > obj->base.size)
5712                 return -EINVAL;
5713
5714         if (obj->ops == &i915_gem_phys_ops)
5715                 return 0;
5716
5717         if (obj->ops != &i915_gem_object_ops)
5718                 return -EINVAL;
5719
5720         err = i915_gem_object_unbind(obj);
5721         if (err)
5722                 return err;
5723
5724         mutex_lock(&obj->mm.lock);
5725
5726         if (obj->mm.madv != I915_MADV_WILLNEED) {
5727                 err = -EFAULT;
5728                 goto err_unlock;
5729         }
5730
5731         if (obj->mm.quirked) {
5732                 err = -EFAULT;
5733                 goto err_unlock;
5734         }
5735
5736         if (obj->mm.mapping) {
5737                 err = -EBUSY;
5738                 goto err_unlock;
5739         }
5740
5741         pages = __i915_gem_object_unset_pages(obj);
5742
5743         obj->ops = &i915_gem_phys_ops;
5744
5745         err = ____i915_gem_object_get_pages(obj);
5746         if (err)
5747                 goto err_xfer;
5748
5749         /* Perma-pin (until release) the physical set of pages */
5750         __i915_gem_object_pin_pages(obj);
5751
5752         if (!IS_ERR_OR_NULL(pages))
5753                 i915_gem_object_ops.put_pages(obj, pages);
5754         mutex_unlock(&obj->mm.lock);
5755         return 0;
5756
5757 err_xfer:
5758         obj->ops = &i915_gem_object_ops;
5759         if (!IS_ERR_OR_NULL(pages)) {
5760                 unsigned int sg_page_sizes = i915_sg_page_sizes(pages->sgl);
5761
5762                 __i915_gem_object_set_pages(obj, pages, sg_page_sizes);
5763         }
5764 err_unlock:
5765         mutex_unlock(&obj->mm.lock);
5766         return err;
5767 }
5768
5769 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
5770 #include "selftests/scatterlist.c"
5771 #include "selftests/mock_gem_device.c"
5772 #include "selftests/huge_gem_object.c"
5773 #include "selftests/huge_pages.c"
5774 #include "selftests/i915_gem_object.c"
5775 #include "selftests/i915_gem_coherency.c"
5776 #include "selftests/i915_gem.c"
5777 #endif