X-Git-Url: https://asedeno.scripts.mit.edu/gitweb/?a=blobdiff_plain;f=mm%2Fpage_alloc.c;h=a7a6aac95a6d158690e1ca0981b1b652820248d4;hb=89d790ab31d033d67635f6362d57ea64e47708fa;hp=c21b336681334f33f13cc6a203f30e0a7ca389bf;hpb=f14cc3b13d8f3ceee862f8365d37ba214630126a;p=linux.git diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c21b33668133..a7a6aac95a6d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -59,7 +59,6 @@ #include #include #include -#include #include #include #include @@ -92,6 +91,10 @@ EXPORT_PER_CPU_SYMBOL(_numa_mem_); int _node_numa_mem_[MAX_NUMNODES]; #endif +/* work_structs for global per-cpu drains */ +DEFINE_MUTEX(pcpu_drain_mutex); +DEFINE_PER_CPU(struct work_struct, pcpu_drain); + #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY volatile unsigned long latent_entropy __latent_entropy; EXPORT_SYMBOL(latent_entropy); @@ -1085,10 +1088,10 @@ static void free_pcppages_bulk(struct zone *zone, int count, { int migratetype = 0; int batch_free = 0; - unsigned long nr_scanned; + unsigned long nr_scanned, flags; bool isolated_pageblocks; - spin_lock(&zone->lock); + spin_lock_irqsave(&zone->lock, flags); isolated_pageblocks = has_isolate_pageblock(zone); nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); if (nr_scanned) @@ -1137,7 +1140,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, trace_mm_page_pcpu_drain(page, 0, mt); } while (--count && --batch_free && !list_empty(list)); } - spin_unlock(&zone->lock); + spin_unlock_irqrestore(&zone->lock, flags); } static void free_one_page(struct zone *zone, @@ -1145,8 +1148,9 @@ static void free_one_page(struct zone *zone, unsigned int order, int migratetype) { - unsigned long nr_scanned; - spin_lock(&zone->lock); + unsigned long nr_scanned, flags; + spin_lock_irqsave(&zone->lock, flags); + __count_vm_events(PGFREE, 1 << order); nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); if (nr_scanned) __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); @@ -1156,7 +1160,7 @@ static void free_one_page(struct zone *zone, migratetype = get_pfnblock_migratetype(page, pfn); } __free_one_page(page, pfn, zone, order, migratetype); - spin_unlock(&zone->lock); + spin_unlock_irqrestore(&zone->lock, flags); } static void __meminit __init_single_page(struct page *page, unsigned long pfn, @@ -1234,7 +1238,6 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) static void __free_pages_ok(struct page *page, unsigned int order) { - unsigned long flags; int migratetype; unsigned long pfn = page_to_pfn(page); @@ -1242,10 +1245,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) return; migratetype = get_pfnblock_migratetype(page, pfn); - local_irq_save(flags); - __count_vm_events(PGFREE, 1 << order); free_one_page(page_zone(page), page, pfn, order, migratetype); - local_irq_restore(flags); } static void __init __free_pages_boot_core(struct page *page, unsigned int order) @@ -2217,8 +2217,9 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, int migratetype, bool cold) { int i, alloced = 0; + unsigned long flags; - spin_lock(&zone->lock); + spin_lock_irqsave(&zone->lock, flags); for (i = 0; i < count; ++i) { struct page *page = __rmqueue(zone, order, migratetype); if (unlikely(page == NULL)) @@ -2254,7 +2255,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * pages added to the pcp list. 
*/ __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); - spin_unlock(&zone->lock); + spin_unlock_irqrestore(&zone->lock, flags); return alloced; } @@ -2339,16 +2340,26 @@ void drain_local_pages(struct zone *zone) drain_pages(cpu); } +static void drain_local_pages_wq(struct work_struct *work) +{ + /* + * drain_all_pages doesn't use proper cpu hotplug protection so + * we can race with cpu offline when the WQ can move this from + * a cpu pinned worker to an unbound one. We can operate on a different + * cpu which is allright but we also have to make sure to not move to + * a different one. + */ + preempt_disable(); + drain_local_pages(NULL); + preempt_enable(); +} + /* * Spill all the per-cpu pages from all CPUs back into the buddy allocator. * * When zone parameter is non-NULL, spill just the single zone's pages. * - * Note that this code is protected against sending an IPI to an offline - * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: - * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but - * nothing keeps CPUs from showing up after we populated the cpumask and - * before the call to on_each_cpu_mask(). + * Note that this can be extremely slow as the draining happens in a workqueue. */ void drain_all_pages(struct zone *zone) { @@ -2360,6 +2371,21 @@ void drain_all_pages(struct zone *zone) */ static cpumask_t cpus_with_pcps; + /* Workqueues cannot recurse */ + if (current->flags & PF_WQ_WORKER) + return; + + /* + * Do not drain if one is already in progress unless it's specific to + * a zone. Such callers are primarily CMA and memory hotplug and need + * the drain to be complete when the call returns. + */ + if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) { + if (!zone) + return; + mutex_lock(&pcpu_drain_mutex); + } + /* * We don't care about racing with CPU hotplug event * as offline notification will cause the notified @@ -2390,8 +2416,16 @@ void drain_all_pages(struct zone *zone) else cpumask_clear_cpu(cpu, &cpus_with_pcps); } - on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages, - zone, 1); + + for_each_cpu(cpu, &cpus_with_pcps) { + struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu); + INIT_WORK(work, drain_local_pages_wq); + schedule_work_on(cpu, work); + } + for_each_cpu(cpu, &cpus_with_pcps) + flush_work(per_cpu_ptr(&pcpu_drain, cpu)); + + mutex_unlock(&pcpu_drain_mutex); } #ifdef CONFIG_HIBERNATION @@ -2442,17 +2476,20 @@ void free_hot_cold_page(struct page *page, bool cold) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; - unsigned long flags; unsigned long pfn = page_to_pfn(page); int migratetype; + if (in_interrupt()) { + __free_pages_ok(page, 0); + return; + } + if (!free_pcp_prepare(page)) return; migratetype = get_pfnblock_migratetype(page, pfn); set_pcppage_migratetype(page, migratetype); - local_irq_save(flags); - __count_vm_event(PGFREE); + preempt_disable(); /* * We only track unmovable, reclaimable and movable on pcp lists. 
@@ -2469,6 +2506,7 @@ void free_hot_cold_page(struct page *page, bool cold) migratetype = MIGRATE_MOVABLE; } + __count_vm_event(PGFREE); pcp = &this_cpu_ptr(zone->pageset)->pcp; if (!cold) list_add(&page->lru, &pcp->lists[migratetype]); @@ -2482,7 +2520,7 @@ void free_hot_cold_page(struct page *page, bool cold) } out: - local_irq_restore(flags); + preempt_enable(); } /* @@ -2600,74 +2638,105 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) #endif } +/* Remove page from the per-cpu list, caller must protect the list */ +static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, + bool cold, struct per_cpu_pages *pcp, + struct list_head *list) +{ + struct page *page; + + VM_BUG_ON(in_interrupt()); + + do { + if (list_empty(list)) { + pcp->count += rmqueue_bulk(zone, 0, + pcp->batch, list, + migratetype, cold); + if (unlikely(list_empty(list))) + return NULL; + } + + if (cold) + page = list_last_entry(list, struct page, lru); + else + page = list_first_entry(list, struct page, lru); + + list_del(&page->lru); + pcp->count--; + } while (check_new_pcp(page)); + + return page; +} + +/* Lock and remove page from the per-cpu list */ +static struct page *rmqueue_pcplist(struct zone *preferred_zone, + struct zone *zone, unsigned int order, + gfp_t gfp_flags, int migratetype) +{ + struct per_cpu_pages *pcp; + struct list_head *list; + bool cold = ((gfp_flags & __GFP_COLD) != 0); + struct page *page; + + preempt_disable(); + pcp = &this_cpu_ptr(zone->pageset)->pcp; + list = &pcp->lists[migratetype]; + page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list); + if (page) { + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); + zone_statistics(preferred_zone, zone); + } + preempt_enable(); + return page; +} + /* * Allocate a page from the given zone. Use pcplists for order-0 allocations. */ static inline -struct page *buffered_rmqueue(struct zone *preferred_zone, +struct page *rmqueue(struct zone *preferred_zone, struct zone *zone, unsigned int order, gfp_t gfp_flags, unsigned int alloc_flags, int migratetype) { unsigned long flags; struct page *page; - bool cold = ((gfp_flags & __GFP_COLD) != 0); - if (likely(order == 0)) { - struct per_cpu_pages *pcp; - struct list_head *list; - - local_irq_save(flags); - do { - pcp = &this_cpu_ptr(zone->pageset)->pcp; - list = &pcp->lists[migratetype]; - if (list_empty(list)) { - pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, list, - migratetype, cold); - if (unlikely(list_empty(list))) - goto failed; - } - - if (cold) - page = list_last_entry(list, struct page, lru); - else - page = list_first_entry(list, struct page, lru); - - list_del(&page->lru); - pcp->count--; + if (likely(order == 0) && !in_interrupt()) { + page = rmqueue_pcplist(preferred_zone, zone, order, + gfp_flags, migratetype); + goto out; + } - } while (check_new_pcp(page)); - } else { - /* - * We most definitely don't want callers attempting to - * allocate greater than order-1 page units with __GFP_NOFAIL. - */ - WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); - spin_lock_irqsave(&zone->lock, flags); + /* + * We most definitely don't want callers attempting to + * allocate greater than order-1 page units with __GFP_NOFAIL. 
+ */ + WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); + spin_lock_irqsave(&zone->lock, flags); - do { - page = NULL; - if (alloc_flags & ALLOC_HARDER) { - page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); - if (page) - trace_mm_page_alloc_zone_locked(page, order, migratetype); - } - if (!page) - page = __rmqueue(zone, order, migratetype); - } while (page && check_new_pages(page, order)); - spin_unlock(&zone->lock); + do { + page = NULL; + if (alloc_flags & ALLOC_HARDER) { + page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); + if (page) + trace_mm_page_alloc_zone_locked(page, order, migratetype); + } if (!page) - goto failed; - __mod_zone_freepage_state(zone, -(1 << order), - get_pcppage_migratetype(page)); - } + page = __rmqueue(zone, order, migratetype); + } while (page && check_new_pages(page, order)); + spin_unlock(&zone->lock); + if (!page) + goto failed; + __mod_zone_freepage_state(zone, -(1 << order), + get_pcppage_migratetype(page)); __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone); local_irq_restore(flags); - VM_BUG_ON_PAGE(bad_range(zone, page), page); +out: + VM_BUG_ON_PAGE(page && bad_range(zone, page), page); return page; failed: @@ -2875,7 +2944,7 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order, #ifdef CONFIG_NUMA static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { - return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < + return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <= RECLAIM_DISTANCE; } #else /* CONFIG_NUMA */ @@ -2972,7 +3041,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, } try_this_zone: - page = buffered_rmqueue(ac->preferred_zoneref->zone, zone, order, + page = rmqueue(ac->preferred_zoneref->zone, zone, order, gfp_mask, alloc_flags, ac->migratetype); if (page) { prep_new_page(page, order, gfp_mask, alloc_flags); @@ -3825,76 +3894,76 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, return page; } -/* - * This is the 'heart' of the zoned buddy allocator. - */ -struct page * -__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, nodemask_t *nodemask) +static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, nodemask_t *nodemask, + struct alloc_context *ac, gfp_t *alloc_mask, + unsigned int *alloc_flags) { - struct page *page; - unsigned int alloc_flags = ALLOC_WMARK_LOW; - gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */ - struct alloc_context ac = { - .high_zoneidx = gfp_zone(gfp_mask), - .zonelist = zonelist, - .nodemask = nodemask, - .migratetype = gfpflags_to_migratetype(gfp_mask), - }; + ac->high_zoneidx = gfp_zone(gfp_mask); + ac->zonelist = zonelist; + ac->nodemask = nodemask; + ac->migratetype = gfpflags_to_migratetype(gfp_mask); if (cpusets_enabled()) { - alloc_mask |= __GFP_HARDWALL; - alloc_flags |= ALLOC_CPUSET; - if (!ac.nodemask) - ac.nodemask = &cpuset_current_mems_allowed; + *alloc_mask |= __GFP_HARDWALL; + if (!ac->nodemask) + ac->nodemask = &cpuset_current_mems_allowed; + else + *alloc_flags |= ALLOC_CPUSET; } - gfp_mask &= gfp_allowed_mask; - lockdep_trace_alloc(gfp_mask); might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); if (should_fail_alloc_page(gfp_mask, order)) - return NULL; + return false; - /* - * Check the zones suitable for the gfp_mask contain at least one - * valid zone. 
It's possible to have an empty zonelist as a result - * of __GFP_THISNODE and a memoryless node - */ - if (unlikely(!zonelist->_zonerefs->zone)) - return NULL; + if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE) + *alloc_flags |= ALLOC_CMA; - if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE) - alloc_flags |= ALLOC_CMA; + return true; +} +/* Determine whether to spread dirty pages and what the first usable zone */ +static inline void finalise_ac(gfp_t gfp_mask, + unsigned int order, struct alloc_context *ac) +{ /* Dirty zone balancing only done in the fast path */ - ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE); + ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); /* * The preferred zone is used for statistics but crucially it is * also used as the starting point for the zonelist iterator. It * may get reset for allocations that ignore memory policies. */ - ac.preferred_zoneref = first_zones_zonelist(ac.zonelist, - ac.high_zoneidx, ac.nodemask); - if (!ac.preferred_zoneref->zone) { - page = NULL; - /* - * This might be due to race with cpuset_current_mems_allowed - * update, so make sure we retry with original nodemask in the - * slow path. - */ - goto no_zone; - } + ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, + ac->high_zoneidx, ac->nodemask); +} + +/* + * This is the 'heart' of the zoned buddy allocator. + */ +struct page * +__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, nodemask_t *nodemask) +{ + struct page *page; + unsigned int alloc_flags = ALLOC_WMARK_LOW; + gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */ + struct alloc_context ac = { }; + + gfp_mask &= gfp_allowed_mask; + if (!prepare_alloc_pages(gfp_mask, order, zonelist, nodemask, &ac, &alloc_mask, &alloc_flags)) + return NULL; + + finalise_ac(gfp_mask, order, &ac); /* First allocation attempt */ page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); if (likely(page)) goto out; -no_zone: /* * Runtime PM, block IO and its error handling path can deadlock * because I/O on the device might not complete. @@ -5856,7 +5925,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, * the zone and SPARSEMEM is in use. If there are holes within the * zone, each populated memory region may cost us one or two extra * memmap pages due to alignment because memmap pages for each - * populated regions may not naturally algined on page boundary. + * populated regions may not be naturally aligned on page boundary. * So the (present_pages >> 4) heuristic is a tradeoff for that. */ if (spanned_pages > present_pages + (present_pages >> 4) && @@ -6420,8 +6489,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) start_pfn = end_pfn; } - arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; - arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; /* Find the PFNs that ZONE_MOVABLE begins at in each node */ memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); @@ -7157,8 +7224,9 @@ void *__init alloc_large_system_hash(const char *tablename, * If @count is not zero, it is okay to include less @count unmovable pages * * PageLRU check without isolation or lru_lock could race so that - * MIGRATE_MOVABLE block might include unmovable pages. It means you can't - * expect this function should be exact. + * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable + * check without lock_page also may miss some movable non-lru pages at + * race condition. 
So you can't expect this function should be exact. */ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, bool skip_hwpoisoned_pages) @@ -7214,6 +7282,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, if (skip_hwpoisoned_pages && PageHWPoison(page)) continue; + if (__PageMovable(page)) + continue; + if (!PageLRU(page)) found++; /* @@ -7325,6 +7396,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks * in range must have the same migratetype and it must * be either of the two. + * @gfp_mask: GFP mask to use during compaction * * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES * aligned, however it's the caller's responsibility to guarantee that @@ -7338,7 +7410,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, * need to be freed with free_contig_range(). */ int alloc_contig_range(unsigned long start, unsigned long end, - unsigned migratetype) + unsigned migratetype, gfp_t gfp_mask) { unsigned long outer_start, outer_end; unsigned int order; @@ -7350,7 +7422,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, .zone = page_zone(pfn_to_page(start)), .mode = MIGRATE_SYNC, .ignore_skip_hint = true, - .gfp_mask = GFP_KERNEL, + .gfp_mask = memalloc_noio_flags(gfp_mask), }; INIT_LIST_HEAD(&cc.migratepages);
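
A note on the drain_all_pages() rework above: the new pcpu_drain_mutex is taken with mutex_trylock() so that opportunistic callers (zone == NULL) simply skip the drain when another one is already in flight, while callers that need the drain to be complete on return (the in-code comment names CMA and memory hotplug) fall back to a blocking mutex_lock(). The fragment below is a minimal userspace analogue of that gating logic written with pthreads; drain_all(), do_drain() and must_complete are invented names for the illustration and this is not kernel code.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t drain_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for queueing the per-cpu work items and flushing them. */
static void do_drain(void)
{
	printf("draining per-cpu lists\n");
}

/*
 * must_complete mirrors the zone != NULL case in the patch: such a
 * caller needs its own drain to have finished by the time this returns.
 */
static void drain_all(bool must_complete)
{
	if (pthread_mutex_trylock(&drain_mutex) != 0) {
		if (!must_complete)
			return;		/* a drain is already running; good enough */
		pthread_mutex_lock(&drain_mutex);	/* wait for our turn */
	}
	do_drain();
	pthread_mutex_unlock(&drain_mutex);
}

int main(void)
{
	drain_all(false);	/* opportunistic caller */
	drain_all(true);	/* caller that must see a full drain */
	return 0;
}

The sketch leaves out two things the patch also handles: the PF_WQ_WORKER early return (the drain itself runs from the workqueue, so it must not be issued from a workqueue worker) and the preempt_disable() in drain_local_pages_wq(), which keeps the worker from migrating to another CPU mid-drain when the work item has been moved to an unbound workqueue during CPU offline.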
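
The per-cpu fast path introduced by rmqueue_pcplist()/__rmqueue_pcplist() above follows a refill-then-take pattern: when the per-cpu list is empty it pulls a batch from the buddy lists via rmqueue_bulk() (which still takes zone->lock with IRQs disabled), then hands out pages from the hot or cold end of the list under nothing more than preempt_disable(), with in_interrupt() callers diverted to the locked path. The small self-contained C sketch below shows only that refill-then-take shape; struct pcp_cache, slow_refill() and cache_take() are invented names, the cache is an array rather than a list_head, and the hot/cold policy is simplified.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define PCP_BATCH 4

struct pcp_cache {
	int items[64];		/* newest entries sit at the highest index */
	int count;
};

/* Stand-in for rmqueue_bulk(): refill the empty cache in one batch. */
static int slow_refill(struct pcp_cache *pcp)
{
	static int next = 1;
	int i;

	for (i = 0; i < PCP_BATCH; i++)
		pcp->items[pcp->count++] = next++;
	return PCP_BATCH;
}

/* Refill when empty, then take from the hot (newest) or cold (oldest) end. */
static int cache_take(struct pcp_cache *pcp, bool cold)
{
	int item;

	if (pcp->count == 0 && slow_refill(pcp) == 0)
		return -1;	/* the slow path had nothing either */

	if (cold) {
		item = pcp->items[0];
		pcp->count--;
		memmove(&pcp->items[0], &pcp->items[1],
			pcp->count * sizeof(pcp->items[0]));
	} else {
		item = pcp->items[--pcp->count];
	}
	return item;
}

int main(void)
{
	struct pcp_cache pcp = { .count = 0 };

	printf("hot allocation:  %d\n", cache_take(&pcp, false));
	printf("cold allocation: %d\n", cache_take(&pcp, true));
	return 0;
}

In the kernel the equivalent structure is per CPU, which is what makes preempt_disable() sufficient protection once IRQ context has been excluded; that is also why free_hot_cold_page() now routes in_interrupt() frees through __free_pages_ok() instead of touching the per-cpu list.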