arch/powerpc/mm/hugetlbpage.c
/*
 * PPC Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/mm.h>
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/export.h>
#include <linux/of_fdt.h>
#include <linux/memblock.h>
#include <linux/moduleparam.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/kmemleak.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/setup.h>
#include <asm/hugetlb.h>
#include <asm/pte-walk.h>

bool hugetlb_disabled = false;

#define hugepd_none(hpd)        (hpd_val(hpd) == 0)

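/*
 * PTE_T_ORDER is log2(sizeof(pte_t) / sizeof(void *)); it indexes the
 * page table cache used below when a huge page spans one or more whole
 * directory slots (the pshift >= pdshift case).
 */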
#define PTE_T_ORDER     (__builtin_ffs(sizeof(pte_t)) - __builtin_ffs(sizeof(void *)))

pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
        /*
         * Only called for hugetlbfs pages, hence can ignore THP and the
         * irq disabled walk.
         */
        return __find_linux_pte(mm->pgd, addr, NULL, NULL);
}

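/*
 * Allocate a hugepte table from the matching page table cache and make
 * the hugepd entry (or, when the huge page spans several directory
 * slots, all of the affected entries) point at it.  @ptl is the page
 * table lock protecting @hpdp; it is taken around the population step.
 */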
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
                           unsigned long address, unsigned int pdshift,
                           unsigned int pshift, spinlock_t *ptl)
{
        struct kmem_cache *cachep;
        pte_t *new;
        int i;
        int num_hugepd;

        if (pshift >= pdshift) {
                cachep = PGT_CACHE(PTE_T_ORDER);
                num_hugepd = 1 << (pshift - pdshift);
        } else if (IS_ENABLED(CONFIG_PPC_8xx)) {
                cachep = PGT_CACHE(PTE_INDEX_SIZE);
                num_hugepd = 1;
        } else {
                cachep = PGT_CACHE(pdshift - pshift);
                num_hugepd = 1;
        }

        if (!cachep) {
                WARN_ONCE(1, "No page table cache created for hugetlb tables");
                return -ENOMEM;
        }

        new = kmem_cache_alloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));

        BUG_ON(pshift > HUGEPD_SHIFT_MASK);
        BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);

        if (!new)
                return -ENOMEM;

        /*
         * Make sure other cpus find the hugepd set only after a
         * properly initialized page table is visible to them.
         * For more details look for comment in __pte_alloc().
         */
        smp_wmb();

        spin_lock(ptl);
        /*
         * We have multiple higher-level entries that point to the same
         * actual pte location.  Fill in each as we go and backtrack on error.
         * We need all of these so the DTLB pgtable walk code can find the
         * right higher-level entry without knowing if it's a hugepage or not.
         */
        for (i = 0; i < num_hugepd; i++, hpdp++) {
                if (unlikely(!hugepd_none(*hpdp)))
                        break;
                hugepd_populate(hpdp, new, pshift);
        }
        /* If we bailed out of the for loop early, an error occurred; clean up. */
        if (i < num_hugepd) {
                for (i = i - 1; i >= 0; i--, hpdp--)
                        *hpdp = __hugepd(0);
                kmem_cache_free(cachep, new);
        } else {
                kmemleak_ignore(new);
        }
        spin_unlock(ptl);
        return 0;
}

/*
 * At this point we do the placement change only for Book3S 64.
 * It could possibly work on other subarchs as well.
 */
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
        pgd_t *pg;
        pud_t *pu;
        pmd_t *pm;
        hugepd_t *hpdp = NULL;
        unsigned pshift = __ffs(sz);
        unsigned pdshift = PGDIR_SHIFT;
        spinlock_t *ptl;

        addr &= ~(sz - 1);
        pg = pgd_offset(mm, addr);

#ifdef CONFIG_PPC_BOOK3S_64
        if (pshift == PGDIR_SHIFT)
                /* 16GB huge page */
                return (pte_t *) pg;
        else if (pshift > PUD_SHIFT) {
                /*
                 * We need to use a hugepd table
                 */
                ptl = &mm->page_table_lock;
                hpdp = (hugepd_t *)pg;
        } else {
                pdshift = PUD_SHIFT;
                pu = pud_alloc(mm, pg, addr);
                if (!pu)
                        return NULL;
                if (pshift == PUD_SHIFT)
                        return (pte_t *)pu;
                else if (pshift > PMD_SHIFT) {
                        ptl = pud_lockptr(mm, pu);
                        hpdp = (hugepd_t *)pu;
                } else {
                        pdshift = PMD_SHIFT;
                        pm = pmd_alloc(mm, pu, addr);
                        if (!pm)
                                return NULL;
                        if (pshift == PMD_SHIFT)
                                /* 16MB hugepage */
                                return (pte_t *)pm;
                        else {
                                ptl = pmd_lockptr(mm, pm);
                                hpdp = (hugepd_t *)pm;
                        }
                }
        }
#else
        if (pshift >= PGDIR_SHIFT) {
                ptl = &mm->page_table_lock;
                hpdp = (hugepd_t *)pg;
        } else {
                pdshift = PUD_SHIFT;
                pu = pud_alloc(mm, pg, addr);
                if (!pu)
                        return NULL;
                if (pshift >= PUD_SHIFT) {
                        ptl = pud_lockptr(mm, pu);
                        hpdp = (hugepd_t *)pu;
                } else {
                        pdshift = PMD_SHIFT;
                        pm = pmd_alloc(mm, pu, addr);
                        if (!pm)
                                return NULL;
                        ptl = pmd_lockptr(mm, pm);
                        hpdp = (hugepd_t *)pm;
                }
        }
#endif
        if (!hpdp)
                return NULL;

        BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));

        if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr,
                                                  pdshift, pshift, ptl))
                return NULL;

        return hugepte_offset(*hpdp, addr, pdshift);
}

#ifdef CONFIG_PPC_BOOK3S_64
/*
 * Tracks gpages after the device tree is scanned and before the
 * huge_boot_pages list is ready on pseries.
 */
#define MAX_NUMBER_GPAGES       1024
__initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES];
__initdata static unsigned nr_gpages;

/*
 * Build list of addresses of gigantic pages.  This function is used in early
 * boot before the buddy allocator is set up.
 */
void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
{
        if (!addr)
                return;
        while (number_of_pages > 0) {
                gpage_freearray[nr_gpages] = addr;
                nr_gpages++;
                number_of_pages--;
                addr += page_size;
        }
}

int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
{
        struct huge_bootmem_page *m;

        if (nr_gpages == 0)
                return 0;
        m = phys_to_virt(gpage_freearray[--nr_gpages]);
        gpage_freearray[nr_gpages] = 0;
        list_add(&m->list, &huge_boot_pages);
        m->hstate = hstate;
        return 1;
}
#endif

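/*
 * Allocate a gigantic page at boot time.  On hash (non-radix) pseries
 * LPARs the pages were recorded from the device tree into
 * gpage_freearray, so hand one of those out; otherwise use the generic
 * bootmem allocation path.
 */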
int __init alloc_bootmem_huge_page(struct hstate *h)
{
#ifdef CONFIG_PPC_BOOK3S_64
        if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled())
                return pseries_alloc_bootmem_huge_page(h);
#endif
        return __alloc_bootmem_huge_page(h);
}

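/*
 * Outside Book3S-64, hugepte tables freed here may still be visible to
 * concurrent page table walkers, so unless the mm is only used by the
 * current thread they are collected into per-CPU batches and released
 * through an RCU callback.
 */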
#ifndef CONFIG_PPC_BOOK3S_64
#define HUGEPD_FREELIST_SIZE \
        ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))

struct hugepd_freelist {
        struct rcu_head rcu;
        unsigned int index;
        void *ptes[];
};

static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);

static void hugepd_free_rcu_callback(struct rcu_head *head)
{
        struct hugepd_freelist *batch =
                container_of(head, struct hugepd_freelist, rcu);
        unsigned int i;

        for (i = 0; i < batch->index; i++)
                kmem_cache_free(PGT_CACHE(PTE_T_ORDER), batch->ptes[i]);

        free_page((unsigned long)batch);
}

static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
{
        struct hugepd_freelist **batchp;

        batchp = &get_cpu_var(hugepd_freelist_cur);

        if (atomic_read(&tlb->mm->mm_users) < 2 ||
            mm_is_thread_local(tlb->mm)) {
                kmem_cache_free(PGT_CACHE(PTE_T_ORDER), hugepte);
                put_cpu_var(hugepd_freelist_cur);
                return;
        }

        if (*batchp == NULL) {
                *batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
                (*batchp)->index = 0;
        }

        (*batchp)->ptes[(*batchp)->index++] = hugepte;
        if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
                call_rcu(&(*batchp)->rcu, hugepd_free_rcu_callback);
                *batchp = NULL;
        }
        put_cpu_var(hugepd_freelist_cur);
}
#else
static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte) {}
#endif

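/*
 * Free the hugepte table referenced by a hugepd entry, clearing the
 * entry (or, where several directory slots point at the same table,
 * all of them) first.  floor/ceiling bound the region we are allowed
 * to tear down, as in free_pgd_range().
 */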
static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
                              unsigned long start, unsigned long end,
                              unsigned long floor, unsigned long ceiling)
{
        pte_t *hugepte = hugepd_page(*hpdp);
        int i;

        unsigned long pdmask = ~((1UL << pdshift) - 1);
        unsigned int num_hugepd = 1;
        unsigned int shift = hugepd_shift(*hpdp);

        /* Note: On fsl the hpdp may be the first of several */
        if (shift > pdshift)
                num_hugepd = 1 << (shift - pdshift);

        start &= pdmask;
        if (start < floor)
                return;
        if (ceiling) {
                ceiling &= pdmask;
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                return;

        for (i = 0; i < num_hugepd; i++, hpdp++)
                *hpdp = __hugepd(0);

        if (shift >= pdshift)
                hugepd_free(tlb, hugepte);
        else if (IS_ENABLED(CONFIG_PPC_8xx))
                pgtable_free_tlb(tlb, hugepte,
                                 get_hugepd_cache_index(PTE_INDEX_SIZE));
        else
                pgtable_free_tlb(tlb, hugepte,
                                 get_hugepd_cache_index(pdshift - shift));
}

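/*
 * Walk the PMD level of a hugepage range, freeing the hugepd-backed
 * hugepte tables found there, then free the PMD page itself once the
 * covered range (bounded by floor/ceiling) allows it.
 */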
static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
                                   unsigned long addr, unsigned long end,
                                   unsigned long floor, unsigned long ceiling)
{
        pmd_t *pmd;
        unsigned long next;
        unsigned long start;

        start = addr;
        do {
                unsigned long more;

                pmd = pmd_offset(pud, addr);
                next = pmd_addr_end(addr, end);
                if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
                        /*
                         * If it is not a hugepd pointer, we should already
                         * find it cleared.
                         */
                        WARN_ON(!pmd_none_or_clear_bad(pmd));
                        continue;
                }
                /*
                 * Increment next by the size of the huge mapping since
                 * there may be more than one entry at this level for a
                 * single hugepage, but all of them point to the same
                 * kmem-cache-allocated hugepte table.
                 */
                more = addr + (1 << hugepd_shift(*(hugepd_t *)pmd));
                if (more > next)
                        next = more;

                free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
                                  addr, next, floor, ceiling);
        } while (addr = next, addr != end);

        start &= PUD_MASK;
        if (start < floor)
                return;
        if (ceiling) {
                ceiling &= PUD_MASK;
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                return;

        pmd = pmd_offset(pud, start);
        pud_clear(pud);
        pmd_free_tlb(tlb, pmd, start);
        mm_dec_nr_pmds(tlb->mm);
}

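/*
 * Same as above, one level up: walk the PUD level, recursing into the
 * PMD level where needed, and free the PUD page once the covered range
 * allows it.
 */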
static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
                                   unsigned long addr, unsigned long end,
                                   unsigned long floor, unsigned long ceiling)
{
        pud_t *pud;
        unsigned long next;
        unsigned long start;

        start = addr;
        do {
                pud = pud_offset(pgd, addr);
                next = pud_addr_end(addr, end);
                if (!is_hugepd(__hugepd(pud_val(*pud)))) {
                        if (pud_none_or_clear_bad(pud))
                                continue;
                        hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
                                               ceiling);
                } else {
                        unsigned long more;
                        /*
                         * Increment next by the size of the huge mapping since
                         * there may be more than one entry at this level for a
                         * single hugepage, but all of them point to the same
                         * kmem-cache-allocated hugepte table.
                         */
                        more = addr + (1 << hugepd_shift(*(hugepd_t *)pud));
                        if (more > next)
                                next = more;

                        free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
                                          addr, next, floor, ceiling);
                }
        } while (addr = next, addr != end);

        start &= PGDIR_MASK;
        if (start < floor)
                return;
        if (ceiling) {
                ceiling &= PGDIR_MASK;
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                return;

        pud = pud_offset(pgd, start);
        pgd_clear(pgd);
        pud_free_tlb(tlb, pud, start);
        mm_dec_nr_puds(tlb->mm);
}

/*
 * This function frees user-level page tables of a process.
 */
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
                            unsigned long addr, unsigned long end,
                            unsigned long floor, unsigned long ceiling)
{
        pgd_t *pgd;
        unsigned long next;

        /*
         * Because there are a number of different possible pagetable
         * layouts for hugepage ranges, we limit knowledge of how
         * things should be laid out to the allocation path
         * (huge_pte_alloc(), above).  Everything else works out the
         * structure as it goes from information in the hugepd
         * pointers.  That means that we can't use the optimization
         * from the normal free_pgd_range() here, of checking whether
         * we're actually covering a large enough range to have to do
         * anything at the top level of the walk instead of at the
         * bottom.
         *
         * To make sense of this, you should probably go read the big
         * block comment at the top of the normal free_pgd_range(),
         * too.
         */

        do {
                next = pgd_addr_end(addr, end);
                pgd = pgd_offset(tlb->mm, addr);
                if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
                        if (pgd_none_or_clear_bad(pgd))
                                continue;
                        hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
                } else {
                        unsigned long more;
                        /*
                         * Increment next by the size of the huge mapping since
                         * there may be more than one entry at the pgd level
                         * for a single hugepage, but all of them point to the
                         * same kmem-cache-allocated hugepte table.
                         */
                        more = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
                        if (more > next)
                                next = more;

                        free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
                                          addr, next, floor, ceiling);
                }
        } while (addr = next, addr != end);
}

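/*
 * follow_page() helper for addresses mapped through a hugepd: look up
 * the huge PTE under mm->page_table_lock, take a reference on the page
 * when FOLL_GET is set, and wait and retry if the entry is a hugetlb
 * migration entry.
 */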
struct page *follow_huge_pd(struct vm_area_struct *vma,
                            unsigned long address, hugepd_t hpd,
                            int flags, int pdshift)
{
        pte_t *ptep;
        spinlock_t *ptl;
        struct page *page = NULL;
        unsigned long mask;
        int shift = hugepd_shift(hpd);
        struct mm_struct *mm = vma->vm_mm;

retry:
        /*
         * Hugepage directory entries are protected by mm->page_table_lock.
         * Use it instead of huge_pte_lockptr().
         */
        ptl = &mm->page_table_lock;
        spin_lock(ptl);

        ptep = hugepte_offset(hpd, address, pdshift);
        if (pte_present(*ptep)) {
                mask = (1UL << shift) - 1;
                page = pte_page(*ptep);
                page += ((address & mask) >> PAGE_SHIFT);
                if (flags & FOLL_GET)
                        get_page(page);
        } else {
                if (is_hugetlb_entry_migration(*ptep)) {
                        spin_unlock(ptl);
                        __migration_entry_wait(mm, ptep, ptl);
                        goto retry;
                }
        }
        spin_unlock(ptl);
        return page;
}

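/*
 * Choose an unmapped area for a hugetlb mapping: radix defers to
 * radix__hugetlb_get_unmapped_area(), while hash uses the slice code
 * to find a range of the right MMU page size.
 */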
#ifdef CONFIG_PPC_MM_SLICES
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                                        unsigned long len, unsigned long pgoff,
                                        unsigned long flags)
{
        struct hstate *hstate = hstate_file(file);
        int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));

#ifdef CONFIG_PPC_RADIX_MMU
        if (radix_enabled())
                return radix__hugetlb_get_unmapped_area(file, addr, len,
                                                        pgoff, flags);
#endif
        return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1);
}
#endif

unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
        /* With radix we don't use slices, so derive it from the vma. */
        if (IS_ENABLED(CONFIG_PPC_MM_SLICES) && !radix_enabled()) {
                unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);

                return 1UL << mmu_psize_to_shift(psize);
        }
        return vma_kernel_pagesize(vma);
}

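/*
 * Validate a huge page size against the MMU's supported page sizes and
 * register it with the generic hugetlb code if it isn't already known.
 * This backs both the init-time registration below and the
 * "hugepagesz=" command line option (e.g. hugepagesz=16M or
 * hugepagesz=16G).
 */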
static int __init add_huge_page_size(unsigned long long size)
{
        int shift = __ffs(size);
        int mmu_psize;

        /*
         * Check that it is a page size supported by the hardware and
         * that it fits within pagetable and slice limits.
         */
        if (size <= PAGE_SIZE || !is_power_of_2(size))
                return -EINVAL;

        mmu_psize = check_and_get_huge_psize(shift);
        if (mmu_psize < 0)
                return -EINVAL;

        BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);

        /* Return if huge page size has already been set up. */
        if (size_to_hstate(size))
                return 0;

        hugetlb_add_hstate(shift - PAGE_SHIFT);

        return 0;
}

static int __init hugepage_setup_sz(char *str)
{
        unsigned long long size;

        size = memparse(str, &str);

        if (add_huge_page_size(size) != 0) {
                hugetlb_bad_size();
                pr_err("Invalid huge page size specified (%llu)\n", size);
        }

        return 1;
}
__setup("hugepagesz=", hugepage_setup_sz);

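/*
 * Register each huge page size reported in mmu_psize_defs and create
 * the page table caches that hugepd tables for those sizes are
 * allocated from.
 */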
static int __init hugetlbpage_init(void)
{
        bool configured = false;
        int psize;

        if (hugetlb_disabled) {
                pr_info("HugeTLB support is disabled!\n");
                return 0;
        }

        if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !radix_enabled() &&
            !mmu_has_feature(MMU_FTR_16M_PAGE))
                return -ENODEV;

        for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
                unsigned shift;
                unsigned pdshift;

                if (!mmu_psize_defs[psize].shift)
                        continue;

                shift = mmu_psize_to_shift(psize);

#ifdef CONFIG_PPC_BOOK3S_64
                if (shift > PGDIR_SHIFT)
                        continue;
                else if (shift > PUD_SHIFT)
                        pdshift = PGDIR_SHIFT;
                else if (shift > PMD_SHIFT)
                        pdshift = PUD_SHIFT;
                else
                        pdshift = PMD_SHIFT;
#else
                if (shift < PUD_SHIFT)
                        pdshift = PMD_SHIFT;
                else if (shift < PGDIR_SHIFT)
                        pdshift = PUD_SHIFT;
                else
                        pdshift = PGDIR_SHIFT;
#endif

                if (add_huge_page_size(1ULL << shift) < 0)
                        continue;
                /*
                 * If pdshift and shift are the same, we don't use a
                 * pgtable cache for the hugepd.
                 */
                if (pdshift > shift && IS_ENABLED(CONFIG_PPC_8xx))
                        pgtable_cache_add(PTE_INDEX_SIZE);
                else if (pdshift > shift)
                        pgtable_cache_add(pdshift - shift);
                else if (IS_ENABLED(CONFIG_PPC_FSL_BOOK3E) || IS_ENABLED(CONFIG_PPC_8xx))
                        pgtable_cache_add(PTE_T_ORDER);

                configured = true;
        }

        if (configured) {
                if (IS_ENABLED(CONFIG_HUGETLB_PAGE_SIZE_VARIABLE))
                        hugetlbpage_init_default();
        } else {
                pr_info("Failed to initialize. Disabling HugeTLB\n");
        }

        return 0;
}

arch_initcall(hugetlbpage_init);

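/*
 * Flush the data and instruction caches for every subpage of a huge
 * (compound) page, temporarily mapping highmem subpages with
 * kmap_atomic() as needed.
 */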
void flush_dcache_icache_hugepage(struct page *page)
{
        int i;
        void *start;

        BUG_ON(!PageCompound(page));

        for (i = 0; i < compound_nr(page); i++) {
                if (!PageHighMem(page)) {
                        __flush_dcache_icache(page_address(page + i));
                } else {
                        start = kmap_atomic(page + i);
                        __flush_dcache_icache(start);
                        kunmap_atomic(start);
                }
        }
}