// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>

static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pte_t *pte;
	int err = 0;
	const struct mm_walk_ops *ops = walk->ops;
	spinlock_t *ptl;

	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	for (;;) {
		err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
		if (err)
			break;
		addr += PAGE_SIZE;
		if (addr == end)
			break;
		pte++;
	}

	pte_unmap_unlock(pte, ptl);
	return err;
}

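/*
 * Example (illustrative sketch, not part of the pagewalk core): a minimal
 * ->pte_entry() callback that counts present ptes. The names prot_count and
 * count_pte are hypothetical; real users hang their state off walk->private
 * in exactly this way.
 */
struct prot_count {
	unsigned long present;
};

static int __maybe_unused count_pte(pte_t *pte, unsigned long addr,
				    unsigned long next, struct mm_walk *walk)
{
	struct prot_count *pc = walk->private;

	if (pte_present(*pte))
		pc->present++;
	return 0;	/* 0 continues the walk, <0 aborts it */
}
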
static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pmd_t *pmd;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	pmd = pmd_offset(pud, addr);
	do {
again:
		next = pmd_addr_end(addr, end);
		if (pmd_none(*pmd) || (!walk->vma && !walk->no_vma)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, walk);
			if (err)
				break;
			continue;
		}

		walk->action = ACTION_SUBTREE;

		/*
		 * This implies that each ->pmd_entry() handler
		 * needs to know about pmd_trans_huge() pmds
		 */
		if (ops->pmd_entry)
			err = ops->pmd_entry(pmd, addr, next, walk);
		if (err)
			break;

		if (walk->action == ACTION_AGAIN)
			goto again;

		/*
		 * Check this here so we only break down trans_huge
		 * pages when we _need_ to
		 */
		if ((!walk->vma && (pmd_leaf(*pmd) || !pmd_present(*pmd))) ||
		    walk->action == ACTION_CONTINUE ||
		    !(ops->pte_entry))
			continue;

		if (walk->vma) {
			split_huge_pmd(walk->vma, pmd, addr);
			if (pmd_trans_unstable(pmd))
				goto again;
		}

		err = walk_pte_range(pmd, addr, next, walk);
		if (err)
			break;
	} while (pmd++, addr = next, addr != end);

	return err;
}

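/*
 * Example (illustrative sketch): a ->pmd_entry() callback that handles
 * transparent huge pmds itself and uses the walk->action protocol above to
 * tell the core not to split and descend. The function name is hypothetical.
 */
static int __maybe_unused example_pmd_entry(pmd_t *pmd, unsigned long addr,
					    unsigned long next,
					    struct mm_walk *walk)
{
	if (pmd_trans_huge(*pmd)) {
		/* ... process the whole huge pmd here ... */
		walk->action = ACTION_CONTINUE;	/* skip the pte level */
	}
	/* leaving the default ACTION_SUBTREE set lets walk_pte_range() run */
	return 0;
}
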
static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pud_t *pud;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	pud = pud_offset(p4d, addr);
	do {
again:
		next = pud_addr_end(addr, end);
		if (pud_none(*pud) || (!walk->vma && !walk->no_vma)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, walk);
			if (err)
				break;
			continue;
		}

		walk->action = ACTION_SUBTREE;

		if (ops->pud_entry)
			err = ops->pud_entry(pud, addr, next, walk);
		if (err)
			break;

		if (walk->action == ACTION_AGAIN)
			goto again;

		if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) ||
		    walk->action == ACTION_CONTINUE ||
		    !(ops->pmd_entry || ops->pte_entry))
			continue;

		if (walk->vma)
			split_huge_pud(walk->vma, pud, addr);
		if (pud_none(*pud))
			goto again;

		err = walk_pmd_range(pud, addr, next, walk);
		if (err)
			break;
	} while (pud++, addr = next, addr != end);

	return err;
}

static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	p4d_t *p4d;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, walk);
			if (err)
				break;
			continue;
		}
		if (ops->p4d_entry) {
			err = ops->p4d_entry(p4d, addr, next, walk);
			if (err)
				break;
		}
		if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
			err = walk_pud_range(p4d, addr, next, walk);
		if (err)
			break;
	} while (p4d++, addr = next, addr != end);

	return err;
}

static int walk_pgd_range(unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pgd_t *pgd;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	pgd = pgd_offset(walk->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, walk);
			if (err)
				break;
			continue;
		}
		if (ops->pgd_entry) {
			err = ops->pgd_entry(pgd, addr, next, walk);
			if (err)
				break;
		}
		if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry ||
		    ops->pte_entry)
			err = walk_p4d_range(pgd, addr, next, walk);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	return err;
}

#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
				       unsigned long end)
{
	unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
	return boundary < end ? boundary : end;
}

static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct hstate *h = hstate_vma(vma);
	unsigned long next;
	unsigned long hmask = huge_page_mask(h);
	unsigned long sz = huge_page_size(h);
	pte_t *pte;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	do {
		next = hugetlb_entry_end(h, addr, end);
		pte = huge_pte_offset(walk->mm, addr & hmask, sz);

		if (pte)
			err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
		else if (ops->pte_hole)
			err = ops->pte_hole(addr, next, walk);

		if (err)
			break;
	} while (addr = next, addr != end);

	return err;
}

#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	return 0;
}

#endif /* CONFIG_HUGETLB_PAGE */

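/*
 * Example (illustrative sketch): a minimal ->hugetlb_entry() callback. It is
 * called once per huge page, with @hmask giving the huge page mask. The
 * function name is hypothetical.
 */
#ifdef CONFIG_HUGETLB_PAGE
static int __maybe_unused example_hugetlb_entry(pte_t *pte, unsigned long hmask,
						unsigned long addr,
						unsigned long next,
						struct mm_walk *walk)
{
	pte_t entry = huge_ptep_get(pte);

	if (pte_present(entry)) {
		/* ... one present huge page covering addr & hmask ... */
	}
	return 0;
}
#endif
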
/*
 * Decide whether we really walk over the current vma on [@start, @end)
 * or skip it via the returned value. Return 0 if we do walk over the
 * current vma, and return 1 if we skip the vma. Negative values mean
 * error, where we abort the current walk.
 */
static int walk_page_test(unsigned long start, unsigned long end,
			struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	const struct mm_walk_ops *ops = walk->ops;

	if (ops->test_walk)
		return ops->test_walk(start, end, walk);

	/*
	 * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
	 * range, so we don't walk over it as we do for normal vmas. However,
	 * some callers are interested in handling hole ranges and they don't
	 * want to just ignore any single address range. Such users certainly
	 * define their ->pte_hole() callbacks, so let's delegate them to handle
	 * vma(VM_PFNMAP).
	 */
	if (vma->vm_flags & VM_PFNMAP) {
		int err = 1;

		if (ops->pte_hole)
			err = ops->pte_hole(start, end, walk);
		return err ? err : 1;
	}
	return 0;
}

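/*
 * Example (illustrative sketch): a ->test_walk() callback that skips both
 * VM_PFNMAP and VM_MIXEDMAP vmas instead of relying on the default handling
 * above. Per the contract: 1 skips the vma, 0 walks it, <0 aborts the walk.
 * The function name is hypothetical.
 */
static int __maybe_unused example_test_walk(unsigned long start,
					    unsigned long end,
					    struct mm_walk *walk)
{
	if (walk->vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
		return 1;
	return 0;
}
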
static int __walk_page_range(unsigned long start, unsigned long end,
			struct mm_walk *walk)
{
	int err = 0;
	struct vm_area_struct *vma = walk->vma;
	const struct mm_walk_ops *ops = walk->ops;

	if (vma && ops->pre_vma) {
		err = ops->pre_vma(start, end, walk);
		if (err)
			return err;
	}

	if (vma && is_vm_hugetlb_page(vma)) {
		if (ops->hugetlb_entry)
			err = walk_hugetlb_range(start, end, walk);
	} else
		err = walk_pgd_range(start, end, walk);

	if (vma && ops->post_vma)
		ops->post_vma(walk);

	return err;
}

/**
 * walk_page_range - walk page table with caller specific callbacks
 * @mm: mm_struct representing the target process of page table walk
 * @start: start address of the virtual address range
 * @end: end address of the virtual address range
 * @ops: operation to call during the walk
 * @private: private data for callbacks' usage
 *
 * Recursively walk the page table tree of the process represented by @mm
 * within the virtual address range [@start, @end). During walking, we can do
 * some caller-specific work for each entry, by setting up pmd_entry(),
 * pte_entry(), and/or hugetlb_entry(). If you don't set up some of these
 * callbacks, the associated entries/pages are just ignored.
 * The return values of these callbacks are commonly defined like below:
 *
 *  - 0  : succeeded to handle the current entry, and if you don't reach the
 *         end address yet, continue to walk.
 *  - >0 : succeeded to handle the current entry, and return to the caller
 *         with caller specific value.
 *  - <0 : failed to handle the current entry, and return to the caller
 *         with error code.
 *
 * Before starting to walk the page table, some callers want to check whether
 * they really want to walk over the current vma, typically by checking
 * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
 * purpose.
 *
 * If operations need to be staged before and committed after a vma is walked,
 * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
 * since it is intended to handle commit-type operations, can't return any
 * errors.
 *
 * struct mm_walk keeps current values of some common data like vma and pmd,
 * which are useful for the access from callbacks. If you want to pass some
 * caller-specific data to callbacks, @private should be helpful.
 *
 * Locking:
 *   Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_sem,
 *   because these functions traverse the vma list and/or access vma data.
 */
int walk_page_range(struct mm_struct *mm, unsigned long start,
		unsigned long end, const struct mm_walk_ops *ops,
		void *private)
{
	int err = 0;
	unsigned long next;
	struct vm_area_struct *vma;
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= mm,
		.private	= private,
	};

	if (start >= end)
		return -EINVAL;

	if (!walk.mm)
		return -EINVAL;

	lockdep_assert_held(&walk.mm->mmap_sem);

	vma = find_vma(walk.mm, start);
	do {
		if (!vma) { /* after the last vma */
			walk.vma = NULL;
			next = end;
		} else if (start < vma->vm_start) { /* outside vma */
			walk.vma = NULL;
			next = min(end, vma->vm_start);
		} else { /* inside vma */
			walk.vma = vma;
			next = min(end, vma->vm_end);
			vma = vma->vm_next;

			err = walk_page_test(start, next, &walk);
			if (err > 0) {
				/*
				 * positive return values are purely for
				 * controlling the pagewalk, so should never
				 * be passed to the callers.
				 */
				err = 0;
				continue;
			}
			if (err < 0)
				break;
		}
		if (walk.vma || walk.ops->pte_hole)
			err = __walk_page_range(start, next, &walk);
		if (err)
			break;
	} while (start = next, start < end);
	return err;
}

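/*
 * Example (illustrative sketch): driving walk_page_range() with the
 * hypothetical count_pte() callback defined above. Note that the caller,
 * not the walker, takes mmap_sem, as the lockdep assertion demands.
 */
static long __maybe_unused example_count_present(struct mm_struct *mm,
						 unsigned long start,
						 unsigned long end)
{
	static const struct mm_walk_ops count_ops = {
		.pte_entry	= count_pte,
	};
	struct prot_count pc = { 0 };
	int err;

	down_read(&mm->mmap_sem);
	err = walk_page_range(mm, start, end, &count_ops, &pc);
	up_read(&mm->mmap_sem);
	return err ? err : pc.present;
}
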
int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
			  unsigned long end, const struct mm_walk_ops *ops,
			  void *private)
{
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= mm,
		.private	= private,
		.no_vma		= true
	};

	if (start >= end || !walk.mm)
		return -EINVAL;

	lockdep_assert_held(&walk.mm->mmap_sem);

	return __walk_page_range(start, end, &walk);
}

int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
		void *private)
{
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= vma->vm_mm,
		.vma		= vma,
		.private	= private,
	};
	int err;

	if (!walk.mm)
		return -EINVAL;

	lockdep_assert_held(&walk.mm->mmap_sem);

	err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
	if (err > 0)
		return 0;
	if (err < 0)
		return err;
	return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
}

/**
 * walk_page_mapping - walk all memory areas mapped into a struct address_space.
 * @mapping: Pointer to the struct address_space
 * @first_index: First page offset in the address_space
 * @nr: Number of incremental page offsets to cover
 * @ops: operation to call during the walk
 * @private: private data for callbacks' usage
 *
 * This function walks all memory areas mapped into a struct address_space.
 * The walk is limited to only the given page-size index range, but if
 * the index boundaries cross a huge page-table entry, that entry will be
 * handled in its entirety.
 *
 * Also see walk_page_range() for additional information.
 *
 * Locking:
 *   This function can't require that the struct mm_struct::mmap_sem is held,
 *   since @mapping may be mapped by multiple processes. Instead
 *   @mapping->i_mmap_rwsem must be held. This might have implications in the
 *   callbacks, and it's up to the caller to ensure that the
 *   struct mm_struct::mmap_sem is not needed.
 *
 *   Also this means that a caller can't rely on the struct
 *   vm_area_struct::vm_flags to be constant across a call,
 *   except for immutable flags. Callers requiring this shouldn't use
 *   this function.
 *
 * Return: 0 on success, negative error code on failure, positive number on
 * caller defined premature termination.
 */
int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
		      pgoff_t nr, const struct mm_walk_ops *ops,
		      void *private)
{
	struct mm_walk walk = {
		.ops		= ops,
		.private	= private,
	};
	struct vm_area_struct *vma;
	pgoff_t vba, vea, cba, cea;
	unsigned long start_addr, end_addr;
	int err = 0;

	lockdep_assert_held(&mapping->i_mmap_rwsem);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
				  first_index + nr - 1) {
		/* Clip to the vma */
		vba = vma->vm_pgoff;
		vea = vba + vma_pages(vma);
		cba = first_index;
		cba = max(cba, vba);
		cea = first_index + nr;
		cea = min(cea, vea);

		start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
		end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
		if (start_addr >= end_addr)
			continue;

		walk.vma = vma;
		walk.mm = vma->vm_mm;

		err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
		if (err > 0) {
			err = 0;
			break;
		} else if (err < 0)
			break;

		err = __walk_page_range(start_addr, end_addr, &walk);
		if (err)
			break;
	}

	return err;
}

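/*
 * Example (illustrative sketch): invoking walk_page_mapping() with the
 * required i_mmap_rwsem held, here in read mode for a read-only walk.
 * The wrapper name is hypothetical.
 */
static int __maybe_unused example_walk_mapping(struct address_space *mapping,
					       pgoff_t first_index, pgoff_t nr,
					       const struct mm_walk_ops *ops,
					       void *private)
{
	int err;

	i_mmap_lock_read(mapping);
	err = walk_page_mapping(mapping, first_index, nr, ops, private);
	i_mmap_unlock_read(mapping);
	return err;
}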