/*
 * Copyright 2005, Paul Mackerras, IBM Corporation.
 * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation.
 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <linux/sched.h>
#include <linux/mm_types.h>

#include <asm/pgalloc.h>
#include <asm/tlb.h>

#include "mmu_decl.h"

#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>
#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * vmemmap is the starting address of the virtual address space where
 * struct pages are allocated for all possible PFNs present on the system
 * including holes and bad memory (hence sparse). These virtual struct
 * pages are stored in sequence in this virtual address space irrespective
 * of whether the corresponding PFN is valid or not. This achieves a
 * constant relationship between the address of a struct page and its PFN.
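 *
 * Concretely, with SPARSEMEM_VMEMMAP the generic memory model turns the
 * pfn <-> struct page conversions into plain pointer arithmetic; from
 * include/asm-generic/memory_model.h:
 *
 *	#define __pfn_to_page(pfn)	(vmemmap + (pfn))
 *	#define __page_to_pfn(page)	(unsigned long)((page) - vmemmap)
 *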
 * During boot or memory hotplug operation, when a new memory section is
 * added, physical memory allocation (including hash table bolting) will
 * be performed for the set of struct pages which are part of the memory
 * section. This saves memory by not allocating struct pages for PFNs
 * which are not valid.
 *
 * ----------------------------------------------
 * | PHYSICAL ALLOCATION OF VIRTUAL STRUCT PAGES|
 * ----------------------------------------------
 *
 *  f000000000000000                          c000000000000000
 * vmemmap +--------------+                  +--------------+
 *  +      |  page struct | +--------------> |  page struct |
 *  |      +--------------+                  +--------------+
 *  |      |  page struct | +--------------> |  page struct |
 *  |      +--------------+ |                +--------------+
 *  |      |  page struct | +       +------> |  page struct |
 *  |      +--------------+ |                +--------------+
 *  |      |  page struct | |       +--> |  page struct |
 *  |      +--------------+ |       |    +--------------+
 *  |      |  page struct | |       |
 *  |      +--------------+ |       |
 *  |      |  page struct | |       |
 *  |      +--------------+ |       |
 *  |      |  page struct | |       |
 *  |      +--------------+ |       |
 *  |      |  page struct | |       |
 *  |      +--------------+ |       |
 *  |      |  page struct | +-------+   |
 *  |      +--------------+             |
 *  |      |  page struct | +-----------+
 *  |      +--------------+
 *  |      |  page struct |          No mapping
 *  |      +--------------+
 *  |      |  page struct |          No mapping
 *  v      +--------------+
 *
 * -----------------------------------------
 * | RELATION BETWEEN STRUCT PAGES AND PFNS|
 * -----------------------------------------
 *
 * vmemmap +--------------+                 +---------------+
 *  +      |  page struct | +-------------> |      PFN      |
 *  |      +--------------+                 +---------------+
 *  |      |  page struct | +-------------> |      PFN      |
 *  |      +--------------+                 +---------------+
 *  |      |  page struct | +-------------> |      PFN      |
 *  |      +--------------+                 +---------------+
 *  |      |  page struct | +-------------> |      PFN      |
 *  |      +--------------+                 +---------------+
 *  |      |              |
 *  |      +--------------+
 *  |      |              |
 *  |      +--------------+
 *  |      |              |
 *  |      +--------------+                 +---------------+
 *  |      |  page struct | +-------------> |      PFN      |
 *  |      +--------------+                 +---------------+
 *  |      |              |
 *  |      +--------------+
 *  |      |              |
 *  |      +--------------+                 +---------------+
 *  |      |  page struct | +-------------> |      PFN      |
 *  |      +--------------+                 +---------------+
 *  |      |  page struct | +-------------> |      PFN      |
 *  v      +--------------+                 +---------------+
 */

/*
 * On hash-based CPUs, the vmemmap is bolted in the hash table.
 */
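/*
 * For context: vmemmap_populate() (arch/powerpc/mm/init_64.c) drives this
 * one vmemmap chunk at a time, roughly (a simplified sketch; the real code
 * also skips ranges that are already populated):
 *
 *	for (; start < end; start += page_size) {
 *		void *p = vmemmap_alloc_block(page_size, node);
 *		if (!p)
 *			return -ENOMEM;
 *		vmemmap_create_mapping(start, page_size, __pa(p));
 *	}
 */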
int __meminit hash__vmemmap_create_mapping(unsigned long start,
					   unsigned long page_size,
					   unsigned long phys)
{
	int rc = htab_bolt_mapping(start, start + page_size, phys,
				   pgprot_val(PAGE_KERNEL),
				   mmu_vmemmap_psize, mmu_kernel_ssize);
	if (rc < 0) {
		int rc2 = htab_remove_mapping(start, start + page_size,
					      mmu_vmemmap_psize,
					      mmu_kernel_ssize);
		BUG_ON(rc2 && (rc2 != -ENOENT));
	}
	return rc;
}
#ifdef CONFIG_MEMORY_HOTPLUG
void hash__vmemmap_remove_mapping(unsigned long start,
				  unsigned long page_size)
{
	int rc = htab_remove_mapping(start, start + page_size,
				     mmu_vmemmap_psize,
				     mmu_kernel_ssize);
	BUG_ON((rc < 0) && (rc != -ENOENT));
	WARN_ON(rc == -ENOENT);
}
#endif
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
/*
 * map_kernel_page is currently only called by __ioremap; it adds an entry
 * to the ioremap page table and adds an entry to the HPT, possibly bolting
 * it.
 */
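/*
 * An illustrative caller (a sketch only, not the exact __ioremap code):
 * the range is walked one page at a time, e.g.
 *
 *	for (i = 0; i < size; i += PAGE_SIZE)
 *		if (map_kernel_page(ea + i, pa + i, flags))
 *			return NULL;
 */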
int hash__map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags)
{
	pgd_t *pgdp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE);
	if (slab_is_available()) {
		pgdp = pgd_offset_k(ea);
		pudp = pud_alloc(&init_mm, pgdp, ea);
		if (!pudp)
			return -ENOMEM;
		pmdp = pmd_alloc(&init_mm, pudp, ea);
		if (!pmdp)
			return -ENOMEM;
		ptep = pte_alloc_kernel(pmdp, ea);
		if (!ptep)
			return -ENOMEM;
		set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
						       __pgprot(flags)));
	} else {
		/*
		 * If the mm subsystem is not fully up, we cannot create a
		 * linux page table entry for this mapping. Simply bolt an
		 * entry in the hardware page table.
		 */
		if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags,
				      mmu_io_psize, mmu_kernel_ssize)) {
			printk(KERN_ERR "Failed to do bolted mapping IO "
			       "memory at %016lx !\n", pa);
			return -ENOMEM;
		}
	}

	smp_wmb();
	return 0;
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE

unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
					pmd_t *pmdp, unsigned long clr,
					unsigned long set)
{
	__be64 old_be, tmp;
	unsigned long old;

#ifdef CONFIG_DEBUG_VM
	WARN_ON(!pmd_trans_huge(*pmdp));
	assert_spin_locked(&mm->page_table_lock);
#endif

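	/*
	 * The inline assembly below atomically does, in effect (a C sketch,
	 * with the pmd stored big-endian in memory):
	 *
	 *	do {
	 *		old = *pmdp;			// ldarx (reserve)
	 *	} while (old & H_PAGE_BUSY);		// retry while busy
	 *	*pmdp = (old & ~clr) | set;		// stdcx.
	 *
	 * restarting whenever the store-conditional loses the reservation.
	 */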
	__asm__ __volatile__(
	"1:	ldarx	%0,0,%3\n\
		and.	%1,%0,%6\n\
		bne-	1b \n\
		andc	%1,%0,%4 \n\
		or	%1,%1,%7\n\
		stdcx.	%1,0,%3 \n\
		bne-	1b"
	: "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp)
	: "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp),
	  "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set))
	: "cc" );

	old = be64_to_cpu(old_be);
	trace_hugepage_update(addr, old, clr, set);
	if (old & H_PAGE_HASHPTE)
		hpte_do_hugepage_flush(mm, addr, pmdp, old);
	return old;
}
pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
				pmd_t *pmdp)
{
	pmd_t pmd;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(pmd_trans_huge(*pmdp));

	pmd = *pmdp;
	pmd_clear(pmdp);
	/*
	 * Wait for all pending hash_page operations to finish. This is needed
	 * in case of subpage collapse. When we collapse normal pages
	 * to a hugepage, we first clear the pmd, then invalidate all
	 * the PTE entries. The assumption here is that any low level
	 * page fault will see a none pmd and take the slow path that
	 * will wait on mmap_sem. But we could very well be in a
	 * hash_page with a local ptep pointer value. Such a hash_page
	 * can result in adding new HPTE entries for normal subpages.
	 * That means we could be modifying the page content as we
	 * copy them to a huge page. So wait for parallel hash_page
	 * to finish before invalidating HPTE entries. We can do this
	 * by sending an IPI to all the cpus and executing a dummy
	 * function there.
	 */
	kick_all_cpus_sync();
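	/*
	 * kick_all_cpus_sync() is the "dummy function" IPI mentioned above:
	 * it executes a no-op on every other online cpu and waits for
	 * completion, so no cpu can still be inside an interrupts-off
	 * hash_page walk that started before the pmd was cleared.
	 */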
	/*
	 * Now invalidate the hpte entries in the range
	 * covered by pmd. This makes sure we take a
	 * fault and will find the pmd as none, which will
	 * result in a major fault which takes mmap_sem and
	 * hence waits for collapse to complete. Without this
	 * the __collapse_huge_page_copy can result in copying
	 * the old content.
	 */
	flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
	return pmd;
}
/*
 * We want to put the pgtable in pmd and use pgtable for tracking
 * the base page size hptes.
 */
void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				      pgtable_t pgtable)
{
	pgtable_t *pgtable_slot;
	assert_spin_locked(&mm->page_table_lock);
	/*
	 * we store the pgtable in the second half of PMD
	 */
	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
	*pgtable_slot = pgtable;
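	/*
	 * The layout this arithmetic relies on (a sketch; the hash MMU
	 * allocates pmd pages with room for this second half):
	 *
	 *	pmdp --> +------------------------------+
	 *		 | PTRS_PER_PMD pmd_t entries   |
	 *		 +------------------------------+
	 *		 | PTRS_PER_PMD pgtable_t slots |  <- deposited pgtables
	 *		 +------------------------------+
	 */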
	/*
	 * Expose the deposited pgtable to other cpus before we set
	 * the hugepage PTE at the pmd level; the hash fault code looks
	 * at the deposited pgtable to store hash index values.
	 */
	smp_wmb();
}
pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	pgtable_t pgtable;
	pgtable_t *pgtable_slot;

	assert_spin_locked(&mm->page_table_lock);
	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
	pgtable = *pgtable_slot;
	/*
	 * Once we withdraw, mark the entry NULL.
	 */
	*pgtable_slot = NULL;
	/*
	 * We store HPTE information in the deposited PTE fragment;
	 * zero out the content on withdraw.
	 */
	memset(pgtable, 0, PTE_FRAG_SIZE);
	return pgtable;
}
void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,
				   unsigned long address, pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(REGION_ID(address) != USER_REGION_ID);

	/*
	 * We can't mark the pmd none here, because that will cause a race
	 * against exit_mmap. We need to keep the pmd marked TRANS HUGE while
	 * we split, but at the same time we want the rest of the ppc64 code
	 * not to insert a hash pte on this, because we will be modifying
	 * the deposited pgtable in the caller of this function. Hence
	 * set _PAGE_PRIVILEGED so that the fault handling moves to a
	 * higher level function which serializes against the ptl.
	 * We need to flush the existing hash pte entries here even though
	 * the translation is still valid, because we will withdraw the
	 * pgtable_t after this.
	 */
	pmd_hugepage_update(vma->vm_mm, address, pmdp, 0, _PAGE_PRIVILEGED);
}
/*
 * A linux hugepage PMD was changed and the corresponding hash table entries
 * need to be flushed.
 */
void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
			    pmd_t *pmdp, unsigned long old_pmd)
{
	int ssize;
	unsigned int psize;
	unsigned long vsid;
	unsigned long flags = 0;
	const struct cpumask *tmp;

	/* get the base page size, vsid and segment size */
#ifdef CONFIG_DEBUG_VM
	psize = get_slice_psize(mm, addr);
	BUG_ON(psize == MMU_PAGE_16M);
#endif
	if (old_pmd & H_PAGE_COMBO)
		psize = MMU_PAGE_4K;
	else
		psize = MMU_PAGE_64K;

	if (!is_kernel_addr(addr)) {
		ssize = user_segment_size(addr);
		vsid = get_vsid(mm->context.id, addr, ssize);
		WARN_ON(vsid == 0);
	} else {
		vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
		ssize = mmu_kernel_ssize;
	}

	tmp = cpumask_of(smp_processor_id());
	if (cpumask_equal(mm_cpumask(mm), tmp))
		flags |= HPTE_LOCAL_UPDATE;

	return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags);
}
pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
				    unsigned long addr, pmd_t *pmdp)
{
	pmd_t old_pmd;
	pgtable_t pgtable;
	unsigned long old;
	pgtable_t *pgtable_slot;

	old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
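	/*
	 * clr = ~0UL clears every bit, leaving the pmd none; the update
	 * helper itself flushes any existing HPTE (the H_PAGE_HASHPTE
	 * case above).
	 */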
	old_pmd = __pmd(old);
	/*
	 * We have pmd == none and we are holding page_table_lock.
	 * So we can safely go and clear the pgtable hash
	 * index info.
	 */
	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
	pgtable = *pgtable_slot;
	/*
	 * Zero out the old valid bit and hash index details;
	 * the hash fault code looks at them.
	 */
	memset(pgtable, 0, PTE_FRAG_SIZE);
	/*
	 * Serialize against find_linux_pte_or_hugepte which does a lock-less
	 * lookup in page tables with local interrupts disabled. For huge pages
	 * it casts pmd_t to pte_t. Since the format of pte_t is different from
	 * pmd_t we want to prevent transit from a pmd pointing to a page table
	 * to a pmd pointing to a huge page (and back) while interrupts are
	 * disabled. We clear the pmd to possibly replace it with a page table
	 * pointer in different code paths. So make sure we wait for the
	 * parallel find_linux_pte_or_hugepte to finish.
	 */
	kick_all_cpus_sync();
	return old_pmd;
}
int hash__has_transparent_hugepage(void)
{
	if (!mmu_has_feature(MMU_FTR_16M_PAGE))
		return 0;
	/*
	 * We support THP only if PMD_SIZE is 16MB.
	 */
	if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
		return 0;
	/*
	 * We need to make sure that we support a 16MB hugepage in a segment
	 * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
	 * of 64K.
	 */
	/*
	 * If we have 64K HPTE, we will be using that by default
	 */
	if (mmu_psize_defs[MMU_PAGE_64K].shift &&
	    (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
		return 0;
	/*
	 * Ok, we only have 4K HPTE.
	 */
	if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
		return 0;

	return 1;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */