mm/z3fold: don't try to use buddy slots after free
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * z3fold.c
4  *
5  * Author: Vitaly Wool <vitaly.wool@konsulko.com>
6  * Copyright (C) 2016, Sony Mobile Communications Inc.
7  *
8  * This implementation is based on zbud written by Seth Jennings.
9  *
10  * z3fold is a special purpose allocator for storing compressed pages. It
11  * can store up to three compressed pages per page, which improves the
12  * compression ratio of zbud while retaining its main concepts (e.g. always
13  * storing an integral number of objects per page) and simplicity.
14  * It still has simple and deterministic reclaim properties that make it
15  * preferable to a higher-density approach (with no requirement on an
16  * integral number of objects per page) when reclaim is used.
17  *
18  * As in zbud, pages are divided into "chunks".  The size of the chunks is
19  * fixed at compile time and is determined by NCHUNKS_ORDER below.
20  *
21  * z3fold doesn't export any API and is meant to be used via zpool API.
22  */
23
24 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
25
26 #include <linux/atomic.h>
27 #include <linux/sched.h>
28 #include <linux/cpumask.h>
29 #include <linux/dcache.h>
30 #include <linux/list.h>
31 #include <linux/mm.h>
32 #include <linux/module.h>
33 #include <linux/page-flags.h>
34 #include <linux/migrate.h>
35 #include <linux/node.h>
36 #include <linux/compaction.h>
37 #include <linux/percpu.h>
38 #include <linux/mount.h>
39 #include <linux/fs.h>
40 #include <linux/preempt.h>
41 #include <linux/workqueue.h>
42 #include <linux/slab.h>
43 #include <linux/spinlock.h>
44 #include <linux/zpool.h>
45
46 /*
47  * NCHUNKS_ORDER determines the internal allocation granularity, effectively
48  * adjusting internal fragmentation.  It also determines the number of
49  * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
50  * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks
51  * at the beginning of an allocated page are occupied by the z3fold header,
52  * so NCHUNKS will be calculated as 63 (or 62 if CONFIG_DEBUG_SPINLOCK=y),
53  * which is the maximum number of free chunks in a z3fold page; there will
54  * also be 63 (or 62, respectively) freelists per pool.
55  */
56 #define NCHUNKS_ORDER   6
57
58 #define CHUNK_SHIFT     (PAGE_SHIFT - NCHUNKS_ORDER)
59 #define CHUNK_SIZE      (1 << CHUNK_SHIFT)
60 #define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE)
61 #define ZHDR_CHUNKS     (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT)
62 #define TOTAL_CHUNKS    (PAGE_SIZE >> CHUNK_SHIFT)
63 #define NCHUNKS         ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
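/*
 * Worked example (editor's illustration, assuming PAGE_SIZE == 4096 and the
 * default NCHUNKS_ORDER == 6): CHUNK_SHIFT = 12 - 6 = 6, so CHUNK_SIZE is
 * 64 bytes and TOTAL_CHUNKS = 4096 >> 6 = 64.  If the z3fold header rounds
 * up to a single chunk, NCHUNKS = (4096 - 64) >> 6 = 63; if it needs two
 * chunks, NCHUNKS = (4096 - 128) >> 6 = 62.  This is where the 63/62
 * freelists per pool mentioned above come from.
 */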
64
65 #define BUDDY_MASK      (0x3)
66 #define BUDDY_SHIFT     2
67 #define SLOTS_ALIGN     (0x40)
68
69 /*****************
70  * Structures
71 *****************/
72 struct z3fold_pool;
73 struct z3fold_ops {
74         int (*evict)(struct z3fold_pool *pool, unsigned long handle);
75 };
76
77 enum buddy {
78         HEADLESS = 0,
79         FIRST,
80         MIDDLE,
81         LAST,
82         BUDDIES_MAX = LAST
83 };
84
85 struct z3fold_buddy_slots {
86         /*
87          * we are using BUDDY_MASK in handle_to_buddy etc. so there should
88          * be enough slots to hold all possible variants
89          */
90         unsigned long slot[BUDDY_MASK + 1];
91         unsigned long pool; /* back link + flags */
92 };
93 #define HANDLE_FLAG_MASK        (0x03)
94
95 /*
96  * struct z3fold_header - z3fold page metadata occupying first chunks of each
97  *                      z3fold page, except for HEADLESS pages
98  * @buddy:              links the z3fold page into the relevant list in the
99  *                      pool
100  * @page_lock:          per-page lock
101  * @refcount:           reference count for the z3fold page
102  * @work:               work_struct for page layout optimization
103  * @slots:              pointer to the structure holding buddy slots
104  * @pool:               pointer to the containing pool
105  * @cpu:                CPU which this page "belongs" to
106  * @first_chunks:       the size of the first buddy in chunks, 0 if free
107  * @middle_chunks:      the size of the middle buddy in chunks, 0 if free
108  * @last_chunks:        the size of the last buddy in chunks, 0 if free
109  * @first_num:          the starting number (for the first handle)
110  * @mapped_count:       the number of objects currently mapped
111  */
112 struct z3fold_header {
113         struct list_head buddy;
114         spinlock_t page_lock;
115         struct kref refcount;
116         struct work_struct work;
117         struct z3fold_buddy_slots *slots;
118         struct z3fold_pool *pool;
119         short cpu;
120         unsigned short first_chunks;
121         unsigned short middle_chunks;
122         unsigned short last_chunks;
123         unsigned short start_middle;
124         unsigned short first_num:2;
125         unsigned short mapped_count:2;
126 };
127
128 /**
129  * struct z3fold_pool - stores metadata for each z3fold pool
130  * @name:       pool name
131  * @lock:       protects pool unbuddied/lru lists
132  * @stale_lock: protects pool stale page list
133  * @unbuddied:  per-cpu array of lists tracking z3fold pages that contain 2-
134  *              buddies; the list each z3fold page is added to depends on
135  *              the size of its free region.
136  * @lru:        list tracking the z3fold pages in LRU order by most recently
137  *              added buddy.
138  * @stale:      list of pages marked for freeing
139  * @pages_nr:   number of z3fold pages in the pool.
140  * @c_handle:   cache for z3fold_buddy_slots allocation
141  * @ops:        pointer to a structure of user defined operations specified at
142  *              pool creation time.
143  * @compact_wq: workqueue for page layout background optimization
144  * @release_wq: workqueue for safe page release
145  * @work:       work_struct for safe page release
146  * @inode:      inode for z3fold pseudo filesystem
147  *
148  * This structure is allocated at pool creation time and maintains metadata
149  * pertaining to a particular z3fold pool.
150  */
151 struct z3fold_pool {
152         const char *name;
153         spinlock_t lock;
154         spinlock_t stale_lock;
155         struct list_head *unbuddied;
156         struct list_head lru;
157         struct list_head stale;
158         atomic64_t pages_nr;
159         struct kmem_cache *c_handle;
160         const struct z3fold_ops *ops;
161         struct zpool *zpool;
162         const struct zpool_ops *zpool_ops;
163         struct workqueue_struct *compact_wq;
164         struct workqueue_struct *release_wq;
165         struct work_struct work;
166         struct inode *inode;
167 };
168
169 /*
170  * Internal z3fold page flags
171  */
172 enum z3fold_page_flags {
173         PAGE_HEADLESS = 0,
174         MIDDLE_CHUNK_MAPPED,
175         NEEDS_COMPACTING,
176         PAGE_STALE,
177         PAGE_CLAIMED, /* by either reclaim or free */
178 };
179
180 /*****************
181  * Helpers
182 *****************/
183
184 /* Converts an allocation size in bytes to size in z3fold chunks */
185 static int size_to_chunks(size_t size)
186 {
187         return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
188 }
189
190 #define for_each_unbuddied_list(_iter, _begin) \
191         for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
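/*
 * Editor's note: unbuddied[n] links the z3fold pages that currently have
 * exactly n free chunks, so a request for `chunks` chunks starts its search
 * at unbuddied[chunks] and, via this macro, walks towards pages with larger
 * free regions (see add_to_unbuddied() and __z3fold_alloc() below).
 */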
192
193 static void compact_page_work(struct work_struct *w);
194
195 static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
196                                                         gfp_t gfp)
197 {
198         struct z3fold_buddy_slots *slots = kmem_cache_alloc(pool->c_handle,
199                                                             gfp);
200
201         if (slots) {
202                 memset(slots->slot, 0, sizeof(slots->slot));
203                 slots->pool = (unsigned long)pool;
204         }
205
206         return slots;
207 }
208
209 static inline struct z3fold_pool *slots_to_pool(struct z3fold_buddy_slots *s)
210 {
211         return (struct z3fold_pool *)(s->pool & ~HANDLE_FLAG_MASK);
212 }
213
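/*
 * Editor's note: the rounding in handle_to_slots() below works because
 * z3fold_buddy_slots objects are allocated from a kmem_cache created with
 * SLOTS_ALIGN (0x40) alignment in z3fold_create_pool(), and the structure
 * itself fits within that alignment, so a non-headless handle, which points
 * at one of the slot[] entries, can simply be masked down to the start of
 * its containing structure.
 */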
214 static inline struct z3fold_buddy_slots *handle_to_slots(unsigned long handle)
215 {
216         return (struct z3fold_buddy_slots *)(handle & ~(SLOTS_ALIGN - 1));
217 }
218
219 static inline void free_handle(unsigned long handle)
220 {
221         struct z3fold_buddy_slots *slots;
222         int i;
223         bool is_free;
224
225         if (handle & (1 << PAGE_HEADLESS))
226                 return;
227
228         WARN_ON(*(unsigned long *)handle == 0);
229         *(unsigned long *)handle = 0;
230         slots = handle_to_slots(handle);
231         is_free = true;
232         for (i = 0; i <= BUDDY_MASK; i++) {
233                 if (slots->slot[i]) {
234                         is_free = false;
235                         break;
236                 }
237         }
238
239         if (is_free) {
240                 struct z3fold_pool *pool = slots_to_pool(slots);
241
242                 kmem_cache_free(pool->c_handle, slots);
243         }
244 }
245
246 static struct dentry *z3fold_do_mount(struct file_system_type *fs_type,
247                                 int flags, const char *dev_name, void *data)
248 {
249         static const struct dentry_operations ops = {
250                 .d_dname = simple_dname,
251         };
252
253         return mount_pseudo(fs_type, "z3fold:", NULL, &ops, 0x33);
254 }
255
256 static struct file_system_type z3fold_fs = {
257         .name           = "z3fold",
258         .mount          = z3fold_do_mount,
259         .kill_sb        = kill_anon_super,
260 };
261
262 static struct vfsmount *z3fold_mnt;
263 static int z3fold_mount(void)
264 {
265         int ret = 0;
266
267         z3fold_mnt = kern_mount(&z3fold_fs);
268         if (IS_ERR(z3fold_mnt))
269                 ret = PTR_ERR(z3fold_mnt);
270
271         return ret;
272 }
273
274 static void z3fold_unmount(void)
275 {
276         kern_unmount(z3fold_mnt);
277 }
278
279 static const struct address_space_operations z3fold_aops;
280 static int z3fold_register_migration(struct z3fold_pool *pool)
281 {
282         pool->inode = alloc_anon_inode(z3fold_mnt->mnt_sb);
283         if (IS_ERR(pool->inode)) {
284                 pool->inode = NULL;
285                 return 1;
286         }
287
288         pool->inode->i_mapping->private_data = pool;
289         pool->inode->i_mapping->a_ops = &z3fold_aops;
290         return 0;
291 }
292
293 static void z3fold_unregister_migration(struct z3fold_pool *pool)
294 {
295         if (pool->inode)
296                 iput(pool->inode);
297 }
298
299 /* Initializes the z3fold header of a newly allocated z3fold page */
300 static struct z3fold_header *init_z3fold_page(struct page *page,
301                                         struct z3fold_pool *pool, gfp_t gfp)
302 {
303         struct z3fold_header *zhdr = page_address(page);
304         struct z3fold_buddy_slots *slots = alloc_slots(pool, gfp);
305
306         if (!slots)
307                 return NULL;
308
309         INIT_LIST_HEAD(&page->lru);
310         clear_bit(PAGE_HEADLESS, &page->private);
311         clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
312         clear_bit(NEEDS_COMPACTING, &page->private);
313         clear_bit(PAGE_STALE, &page->private);
314         clear_bit(PAGE_CLAIMED, &page->private);
315
316         spin_lock_init(&zhdr->page_lock);
317         kref_init(&zhdr->refcount);
318         zhdr->first_chunks = 0;
319         zhdr->middle_chunks = 0;
320         zhdr->last_chunks = 0;
321         zhdr->first_num = 0;
322         zhdr->start_middle = 0;
323         zhdr->cpu = -1;
324         zhdr->slots = slots;
325         zhdr->pool = pool;
326         INIT_LIST_HEAD(&zhdr->buddy);
327         INIT_WORK(&zhdr->work, compact_page_work);
328         return zhdr;
329 }
330
331 /* Resets the struct page fields and frees the page */
332 static void free_z3fold_page(struct page *page, bool headless)
333 {
334         if (!headless) {
335                 lock_page(page);
336                 __ClearPageMovable(page);
337                 unlock_page(page);
338         }
339         ClearPagePrivate(page);
340         __free_page(page);
341 }
342
343 /* Lock a z3fold page */
344 static inline void z3fold_page_lock(struct z3fold_header *zhdr)
345 {
346         spin_lock(&zhdr->page_lock);
347 }
348
349 /* Try to lock a z3fold page */
350 static inline int z3fold_page_trylock(struct z3fold_header *zhdr)
351 {
352         return spin_trylock(&zhdr->page_lock);
353 }
354
355 /* Unlock a z3fold page */
356 static inline void z3fold_page_unlock(struct z3fold_header *zhdr)
357 {
358         spin_unlock(&zhdr->page_lock);
359 }
360
361 /* Helper function to build the index */
362 static inline int __idx(struct z3fold_header *zhdr, enum buddy bud)
363 {
364         return (bud + zhdr->first_num) & BUDDY_MASK;
365 }
366
367 /*
368  * Encodes the handle of a particular buddy within a z3fold page
369  * Pool lock should be held as this function accesses first_num
370  */
371 static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
372 {
373         struct z3fold_buddy_slots *slots;
374         unsigned long h = (unsigned long)zhdr;
375         int idx = 0;
376
377         /*
378          * For a headless page, its handle is its pointer with the extra
379          * PAGE_HEADLESS bit set
380          */
381         if (bud == HEADLESS)
382                 return h | (1 << PAGE_HEADLESS);
383
384         /* otherwise, return pointer to encoded handle */
385         idx = __idx(zhdr, bud);
386         h += idx;
387         if (bud == LAST)
388                 h |= (zhdr->last_chunks << BUDDY_SHIFT);
389
390         slots = zhdr->slots;
391         slots->slot[idx] = h;
392         return (unsigned long)&slots->slot[idx];
393 }
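/*
 * Editor's note on the encoding above: the value stored in slots->slot[idx]
 * packs the page-aligned zhdr address in the high bits, the buddy index in
 * bits 0-1 and, for the LAST buddy only, the object size in chunks starting
 * at bit BUDDY_SHIFT.  The handle returned to the caller is the *address* of
 * that slot entry, which is why handle_to_z3fold_header() and
 * handle_to_chunks() below dereference the handle before masking.
 */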
394
395 /* Returns the z3fold page where a given handle is stored */
396 static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h)
397 {
398         unsigned long addr = h;
399
400         if (!(addr & (1 << PAGE_HEADLESS)))
401                 addr = *(unsigned long *)h;
402
403         return (struct z3fold_header *)(addr & PAGE_MASK);
404 }
405
406 /* only for LAST bud, returns zero otherwise */
407 static unsigned short handle_to_chunks(unsigned long handle)
408 {
409         unsigned long addr = *(unsigned long *)handle;
410
411         return (addr & ~PAGE_MASK) >> BUDDY_SHIFT;
412 }
413
414 /*
415  * (handle & BUDDY_MASK) < zhdr->first_num is possible in encode_handle
416  * but that doesn't matter, because the masking will result in the
417  * correct buddy number.
418  */
419 static enum buddy handle_to_buddy(unsigned long handle)
420 {
421         struct z3fold_header *zhdr;
422         unsigned long addr;
423
424         WARN_ON(handle & (1 << PAGE_HEADLESS));
425         addr = *(unsigned long *)handle;
426         zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
427         return (addr - zhdr->first_num) & BUDDY_MASK;
428 }
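/*
 * Worked example (editor's illustration): with first_num == 1, encoding the
 * FIRST buddy (1) gives idx = (1 + 1) & 3 = 2 and decoding yields
 * (2 - 1) & 3 = 1 == FIRST; encoding LAST (3) gives idx = (3 + 1) & 3 = 0,
 * which is smaller than first_num, yet decoding still yields
 * (0 - 1) & 3 = 3 == LAST thanks to the masking mentioned above.
 */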
429
430 static inline struct z3fold_pool *zhdr_to_pool(struct z3fold_header *zhdr)
431 {
432         return zhdr->pool;
433 }
434
435 static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
436 {
437         struct page *page = virt_to_page(zhdr);
438         struct z3fold_pool *pool = zhdr_to_pool(zhdr);
439
440         WARN_ON(!list_empty(&zhdr->buddy));
441         set_bit(PAGE_STALE, &page->private);
442         clear_bit(NEEDS_COMPACTING, &page->private);
443         spin_lock(&pool->lock);
444         if (!list_empty(&page->lru))
445                 list_del_init(&page->lru);
446         spin_unlock(&pool->lock);
447         if (locked)
448                 z3fold_page_unlock(zhdr);
449         spin_lock(&pool->stale_lock);
450         list_add(&zhdr->buddy, &pool->stale);
451         queue_work(pool->release_wq, &pool->work);
452         spin_unlock(&pool->stale_lock);
453 }
454
455 static void __attribute__((__unused__))
456                         release_z3fold_page(struct kref *ref)
457 {
458         struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
459                                                 refcount);
460         __release_z3fold_page(zhdr, false);
461 }
462
463 static void release_z3fold_page_locked(struct kref *ref)
464 {
465         struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
466                                                 refcount);
467         WARN_ON(z3fold_page_trylock(zhdr));
468         __release_z3fold_page(zhdr, true);
469 }
470
471 static void release_z3fold_page_locked_list(struct kref *ref)
472 {
473         struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
474                                                refcount);
475         struct z3fold_pool *pool = zhdr_to_pool(zhdr);
476         spin_lock(&pool->lock);
477         list_del_init(&zhdr->buddy);
478         spin_unlock(&pool->lock);
479
480         WARN_ON(z3fold_page_trylock(zhdr));
481         __release_z3fold_page(zhdr, true);
482 }
483
484 static void free_pages_work(struct work_struct *w)
485 {
486         struct z3fold_pool *pool = container_of(w, struct z3fold_pool, work);
487
488         spin_lock(&pool->stale_lock);
489         while (!list_empty(&pool->stale)) {
490                 struct z3fold_header *zhdr = list_first_entry(&pool->stale,
491                                                 struct z3fold_header, buddy);
492                 struct page *page = virt_to_page(zhdr);
493
494                 list_del(&zhdr->buddy);
495                 if (WARN_ON(!test_bit(PAGE_STALE, &page->private)))
496                         continue;
497                 spin_unlock(&pool->stale_lock);
498                 cancel_work_sync(&zhdr->work);
499                 free_z3fold_page(page, false);
500                 cond_resched();
501                 spin_lock(&pool->stale_lock);
502         }
503         spin_unlock(&pool->stale_lock);
504 }
505
506 /*
507  * Returns the number of free chunks in a z3fold page.
508  * NB: can't be used with HEADLESS pages.
509  */
510 static int num_free_chunks(struct z3fold_header *zhdr)
511 {
512         int nfree;
513         /*
514          * If there is a middle object, pick up the bigger free space
515          * either before or after it. Otherwise just subtract the number
516          * of chunks occupied by the first and the last objects.
517          */
518         if (zhdr->middle_chunks != 0) {
519                 int nfree_before = zhdr->first_chunks ?
520                         0 : zhdr->start_middle - ZHDR_CHUNKS;
521                 int nfree_after = zhdr->last_chunks ?
522                         0 : TOTAL_CHUNKS -
523                                 (zhdr->start_middle + zhdr->middle_chunks);
524                 nfree = max(nfree_before, nfree_after);
525         } else
526                 nfree = NCHUNKS - zhdr->first_chunks - zhdr->last_chunks;
527         return nfree;
528 }
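/*
 * Worked example (editor's illustration, assuming TOTAL_CHUNKS == 64 and
 * ZHDR_CHUNKS == 1): a page holding only a middle object of 10 chunks at
 * start_middle == 20 has nfree_before = 20 - 1 = 19 and
 * nfree_after = 64 - (20 + 10) = 34, so num_free_chunks() reports 34, the
 * larger of the two contiguous free regions.
 */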
529
530 /* Add to the appropriate unbuddied list */
531 static inline void add_to_unbuddied(struct z3fold_pool *pool,
532                                 struct z3fold_header *zhdr)
533 {
534         if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
535                         zhdr->middle_chunks == 0) {
536                 struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);
537
538                 int freechunks = num_free_chunks(zhdr);
539                 spin_lock(&pool->lock);
540                 list_add(&zhdr->buddy, &unbuddied[freechunks]);
541                 spin_unlock(&pool->lock);
542                 zhdr->cpu = smp_processor_id();
543                 put_cpu_ptr(pool->unbuddied);
544         }
545 }
546
547 static inline void *mchunk_memmove(struct z3fold_header *zhdr,
548                                 unsigned short dst_chunk)
549 {
550         void *beg = zhdr;
551         return memmove(beg + (dst_chunk << CHUNK_SHIFT),
552                        beg + (zhdr->start_middle << CHUNK_SHIFT),
553                        zhdr->middle_chunks << CHUNK_SHIFT);
554 }
555
556 #define BIG_CHUNK_GAP   3
557 /* Has to be called with lock held */
558 static int z3fold_compact_page(struct z3fold_header *zhdr)
559 {
560         struct page *page = virt_to_page(zhdr);
561
562         if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private))
563                 return 0; /* can't move middle chunk, it's used */
564
565         if (unlikely(PageIsolated(page)))
566                 return 0;
567
568         if (zhdr->middle_chunks == 0)
569                 return 0; /* nothing to compact */
570
571         if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
572                 /* move to the beginning */
573                 mchunk_memmove(zhdr, ZHDR_CHUNKS);
574                 zhdr->first_chunks = zhdr->middle_chunks;
575                 zhdr->middle_chunks = 0;
576                 zhdr->start_middle = 0;
577                 zhdr->first_num++;
578                 return 1;
579         }
580
581         /*
582          * moving data is expensive, so let's only do that if
583          * there's substantial gain (at least BIG_CHUNK_GAP chunks)
584          */
585         if (zhdr->first_chunks != 0 && zhdr->last_chunks == 0 &&
586             zhdr->start_middle - (zhdr->first_chunks + ZHDR_CHUNKS) >=
587                         BIG_CHUNK_GAP) {
588                 mchunk_memmove(zhdr, zhdr->first_chunks + ZHDR_CHUNKS);
589                 zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
590                 return 1;
591         } else if (zhdr->last_chunks != 0 && zhdr->first_chunks == 0 &&
592                    TOTAL_CHUNKS - (zhdr->last_chunks + zhdr->start_middle
593                                         + zhdr->middle_chunks) >=
594                         BIG_CHUNK_GAP) {
595                 unsigned short new_start = TOTAL_CHUNKS - zhdr->last_chunks -
596                         zhdr->middle_chunks;
597                 mchunk_memmove(zhdr, new_start);
598                 zhdr->start_middle = new_start;
599                 return 1;
600         }
601
602         return 0;
603 }
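/*
 * Editor's note: when the lone middle object is moved to the front above,
 * incrementing first_num keeps existing handles valid: a handle encoded as
 * MIDDLE with the old first_num f carries idx = (2 + f) & 3, and after the
 * increment handle_to_buddy() computes (2 + f - (f + 1)) & 3 = 1 == FIRST,
 * matching the object's new position in the page.
 */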
604
605 static void do_compact_page(struct z3fold_header *zhdr, bool locked)
606 {
607         struct z3fold_pool *pool = zhdr_to_pool(zhdr);
608         struct page *page;
609
610         page = virt_to_page(zhdr);
611         if (locked)
612                 WARN_ON(z3fold_page_trylock(zhdr));
613         else
614                 z3fold_page_lock(zhdr);
615         if (WARN_ON(!test_and_clear_bit(NEEDS_COMPACTING, &page->private))) {
616                 z3fold_page_unlock(zhdr);
617                 return;
618         }
619         spin_lock(&pool->lock);
620         list_del_init(&zhdr->buddy);
621         spin_unlock(&pool->lock);
622
623         if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
624                 atomic64_dec(&pool->pages_nr);
625                 return;
626         }
627
628         if (unlikely(PageIsolated(page) ||
629                      test_bit(PAGE_STALE, &page->private))) {
630                 z3fold_page_unlock(zhdr);
631                 return;
632         }
633
634         z3fold_compact_page(zhdr);
635         add_to_unbuddied(pool, zhdr);
636         z3fold_page_unlock(zhdr);
637 }
638
639 static void compact_page_work(struct work_struct *w)
640 {
641         struct z3fold_header *zhdr = container_of(w, struct z3fold_header,
642                                                 work);
643
644         do_compact_page(zhdr, false);
645 }
646
647 /* returns _locked_ z3fold page header or NULL */
648 static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
649                                                 size_t size, bool can_sleep)
650 {
651         struct z3fold_header *zhdr = NULL;
652         struct page *page;
653         struct list_head *unbuddied;
654         int chunks = size_to_chunks(size), i;
655
656 lookup:
657         /* First, try to find an unbuddied z3fold page. */
658         unbuddied = get_cpu_ptr(pool->unbuddied);
659         for_each_unbuddied_list(i, chunks) {
660                 struct list_head *l = &unbuddied[i];
661
662                 zhdr = list_first_entry_or_null(READ_ONCE(l),
663                                         struct z3fold_header, buddy);
664
665                 if (!zhdr)
666                         continue;
667
668                 /* Re-check under lock. */
669                 spin_lock(&pool->lock);
670                 l = &unbuddied[i];
671                 if (unlikely(zhdr != list_first_entry(READ_ONCE(l),
672                                                 struct z3fold_header, buddy)) ||
673                     !z3fold_page_trylock(zhdr)) {
674                         spin_unlock(&pool->lock);
675                         zhdr = NULL;
676                         put_cpu_ptr(pool->unbuddied);
677                         if (can_sleep)
678                                 cond_resched();
679                         goto lookup;
680                 }
681                 list_del_init(&zhdr->buddy);
682                 zhdr->cpu = -1;
683                 spin_unlock(&pool->lock);
684
685                 page = virt_to_page(zhdr);
686                 if (test_bit(NEEDS_COMPACTING, &page->private)) {
687                         z3fold_page_unlock(zhdr);
688                         zhdr = NULL;
689                         put_cpu_ptr(pool->unbuddied);
690                         if (can_sleep)
691                                 cond_resched();
692                         goto lookup;
693                 }
694
695                 /*
696                  * This page could not be removed from its unbuddied
697                  * list while the pool lock was held, and we have since
698                  * taken the page lock, so kref_put() could not have run
699                  * before we got here; it's safe to just call kref_get().
700                  */
701                 kref_get(&zhdr->refcount);
702                 break;
703         }
704         put_cpu_ptr(pool->unbuddied);
705
706         if (!zhdr) {
707                 int cpu;
708
709                 /* look for _exact_ match on other cpus' lists */
710                 for_each_online_cpu(cpu) {
711                         struct list_head *l;
712
713                         unbuddied = per_cpu_ptr(pool->unbuddied, cpu);
714                         spin_lock(&pool->lock);
715                         l = &unbuddied[chunks];
716
717                         zhdr = list_first_entry_or_null(READ_ONCE(l),
718                                                 struct z3fold_header, buddy);
719
720                         if (!zhdr || !z3fold_page_trylock(zhdr)) {
721                                 spin_unlock(&pool->lock);
722                                 zhdr = NULL;
723                                 continue;
724                         }
725                         list_del_init(&zhdr->buddy);
726                         zhdr->cpu = -1;
727                         spin_unlock(&pool->lock);
728
729                         page = virt_to_page(zhdr);
730                         if (test_bit(NEEDS_COMPACTING, &page->private)) {
731                                 z3fold_page_unlock(zhdr);
732                                 zhdr = NULL;
733                                 if (can_sleep)
734                                         cond_resched();
735                                 continue;
736                         }
737                         kref_get(&zhdr->refcount);
738                         break;
739                 }
740         }
741
742         return zhdr;
743 }
744
745 /*
746  * API Functions
747  */
748
749 /**
750  * z3fold_create_pool() - create a new z3fold pool
751  * @name:       pool name
752  * @gfp:        gfp flags when allocating the z3fold pool structure
753  * @ops:        user-defined operations for the z3fold pool
754  *
755  * Return: pointer to the new z3fold pool or NULL if the metadata allocation
756  * failed.
757  */
758 static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
759                 const struct z3fold_ops *ops)
760 {
761         struct z3fold_pool *pool = NULL;
762         int i, cpu;
763
764         pool = kzalloc(sizeof(struct z3fold_pool), gfp);
765         if (!pool)
766                 goto out;
767         pool->c_handle = kmem_cache_create("z3fold_handle",
768                                 sizeof(struct z3fold_buddy_slots),
769                                 SLOTS_ALIGN, 0, NULL);
770         if (!pool->c_handle)
771                 goto out_c;
772         spin_lock_init(&pool->lock);
773         spin_lock_init(&pool->stale_lock);
774         pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2);
775         if (!pool->unbuddied)
776                 goto out_pool;
777         for_each_possible_cpu(cpu) {
778                 struct list_head *unbuddied =
779                                 per_cpu_ptr(pool->unbuddied, cpu);
780                 for_each_unbuddied_list(i, 0)
781                         INIT_LIST_HEAD(&unbuddied[i]);
782         }
783         INIT_LIST_HEAD(&pool->lru);
784         INIT_LIST_HEAD(&pool->stale);
785         atomic64_set(&pool->pages_nr, 0);
786         pool->name = name;
787         pool->compact_wq = create_singlethread_workqueue(pool->name);
788         if (!pool->compact_wq)
789                 goto out_unbuddied;
790         pool->release_wq = create_singlethread_workqueue(pool->name);
791         if (!pool->release_wq)
792                 goto out_wq;
793         if (z3fold_register_migration(pool))
794                 goto out_rwq;
795         INIT_WORK(&pool->work, free_pages_work);
796         pool->ops = ops;
797         return pool;
798
799 out_rwq:
800         destroy_workqueue(pool->release_wq);
801 out_wq:
802         destroy_workqueue(pool->compact_wq);
803 out_unbuddied:
804         free_percpu(pool->unbuddied);
805 out_pool:
806         kmem_cache_destroy(pool->c_handle);
807 out_c:
808         kfree(pool);
809 out:
810         return NULL;
811 }
812
813 /**
814  * z3fold_destroy_pool() - destroys an existing z3fold pool
815  * @pool:       the z3fold pool to be destroyed
816  *
817  * The pool should be emptied before this function is called.
818  */
819 static void z3fold_destroy_pool(struct z3fold_pool *pool)
820 {
821         kmem_cache_destroy(pool->c_handle);
822         z3fold_unregister_migration(pool);
823         destroy_workqueue(pool->release_wq);
824         destroy_workqueue(pool->compact_wq);
825         kfree(pool);
826 }
827
828 /**
829  * z3fold_alloc() - allocates a region of a given size
830  * @pool:       z3fold pool from which to allocate
831  * @size:       size in bytes of the desired allocation
832  * @gfp:        gfp flags used if the pool needs to grow
833  * @handle:     handle of the new allocation
834  *
835  * This function will attempt to find a free region in the pool large enough to
836  * satisfy the allocation request.  A search of the unbuddied lists is
837  * performed first. If no suitable free region is found, then a new page is
838  * allocated and added to the pool to satisfy the request.
839  *
840  * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
841  * as z3fold pool pages.
842  *
843  * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
844  * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
845  * a new page.
846  */
847 static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
848                         unsigned long *handle)
849 {
850         int chunks = size_to_chunks(size);
851         struct z3fold_header *zhdr = NULL;
852         struct page *page = NULL;
853         enum buddy bud;
854         bool can_sleep = gfpflags_allow_blocking(gfp);
855
856         if (!size || (gfp & __GFP_HIGHMEM))
857                 return -EINVAL;
858
859         if (size > PAGE_SIZE)
860                 return -ENOSPC;
861
862         if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
863                 bud = HEADLESS;
864         else {
865 retry:
866                 zhdr = __z3fold_alloc(pool, size, can_sleep);
867                 if (zhdr) {
868                         if (zhdr->first_chunks == 0) {
869                                 if (zhdr->middle_chunks != 0 &&
870                                     chunks >= zhdr->start_middle)
871                                         bud = LAST;
872                                 else
873                                         bud = FIRST;
874                         } else if (zhdr->last_chunks == 0)
875                                 bud = LAST;
876                         else if (zhdr->middle_chunks == 0)
877                                 bud = MIDDLE;
878                         else {
879                                 if (kref_put(&zhdr->refcount,
880                                              release_z3fold_page_locked))
881                                         atomic64_dec(&pool->pages_nr);
882                                 else
883                                         z3fold_page_unlock(zhdr);
884                                 pr_err("No free chunks in unbuddied\n");
885                                 WARN_ON(1);
886                                 goto retry;
887                         }
888                         page = virt_to_page(zhdr);
889                         goto found;
890                 }
891                 bud = FIRST;
892         }
893
894         page = NULL;
895         if (can_sleep) {
896                 spin_lock(&pool->stale_lock);
897                 zhdr = list_first_entry_or_null(&pool->stale,
898                                                 struct z3fold_header, buddy);
899                 /*
900                  * Before allocating a page, let's see if we can take one from
901                  * the stale pages list. cancel_work_sync() can sleep so we
902                  * limit this case to the contexts where we can sleep
903                  */
904                 if (zhdr) {
905                         list_del(&zhdr->buddy);
906                         spin_unlock(&pool->stale_lock);
907                         cancel_work_sync(&zhdr->work);
908                         page = virt_to_page(zhdr);
909                 } else {
910                         spin_unlock(&pool->stale_lock);
911                 }
912         }
913         if (!page)
914                 page = alloc_page(gfp);
915
916         if (!page)
917                 return -ENOMEM;
918
919         zhdr = init_z3fold_page(page, pool, gfp);
920         if (!zhdr) {
921                 __free_page(page);
922                 return -ENOMEM;
923         }
924         atomic64_inc(&pool->pages_nr);
925
926         if (bud == HEADLESS) {
927                 set_bit(PAGE_HEADLESS, &page->private);
928                 goto headless;
929         }
930         if (can_sleep) {
931                 lock_page(page);
932                 __SetPageMovable(page, pool->inode->i_mapping);
933                 unlock_page(page);
934         } else {
935                 if (trylock_page(page)) {
936                         __SetPageMovable(page, pool->inode->i_mapping);
937                         unlock_page(page);
938                 }
939         }
940         z3fold_page_lock(zhdr);
941
942 found:
943         if (bud == FIRST)
944                 zhdr->first_chunks = chunks;
945         else if (bud == LAST)
946                 zhdr->last_chunks = chunks;
947         else {
948                 zhdr->middle_chunks = chunks;
949                 zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
950         }
951         add_to_unbuddied(pool, zhdr);
952
953 headless:
954         spin_lock(&pool->lock);
955         /* Add/move z3fold page to beginning of LRU */
956         if (!list_empty(&page->lru))
957                 list_del(&page->lru);
958
959         list_add(&page->lru, &pool->lru);
960
961         *handle = encode_handle(zhdr, bud);
962         spin_unlock(&pool->lock);
963         if (bud != HEADLESS)
964                 z3fold_page_unlock(zhdr);
965
966         return 0;
967 }
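/*
 * Size illustration (editor's note, assuming PAGE_SIZE == 4096,
 * CHUNK_SIZE == 64 and a two-chunk header, i.e. ZHDR_SIZE_ALIGNED == 128):
 * any request larger than 4096 - 128 - 64 = 3904 bytes takes the HEADLESS
 * path above and occupies a whole page on its own, with no z3fold header
 * and no possibility of sharing the page with other buddies.
 */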
968
969 /**
970  * z3fold_free() - frees the allocation associated with the given handle
971  * @pool:       pool in which the allocation resided
972  * @handle:     handle associated with the allocation returned by z3fold_alloc()
973  *
974  * In the case that the z3fold page in which the allocation resides is under
975  * reclaim, as indicated by the PG_reclaim flag being set, this function
976  * only sets the first|last_chunks to 0.  The page is actually freed
977  * once both buddies are evicted (see z3fold_reclaim_page() below).
978  */
979 static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
980 {
981         struct z3fold_header *zhdr;
982         struct page *page;
983         enum buddy bud;
984
985         zhdr = handle_to_z3fold_header(handle);
986         page = virt_to_page(zhdr);
987
988         if (test_bit(PAGE_HEADLESS, &page->private)) {
989                 /* if a headless page is under reclaim, just leave.
990                  * NB: we use test_and_set_bit for a reason: if the bit
991                  * has not been set before, we release this page
992                  * immediately so we don't care about its value any more.
993                  */
994                 if (!test_and_set_bit(PAGE_CLAIMED, &page->private)) {
995                         spin_lock(&pool->lock);
996                         list_del(&page->lru);
997                         spin_unlock(&pool->lock);
998                         free_z3fold_page(page, true);
999                         atomic64_dec(&pool->pages_nr);
1000                 }
1001                 return;
1002         }
1003
1004         /* Non-headless case */
1005         z3fold_page_lock(zhdr);
1006         bud = handle_to_buddy(handle);
1007
1008         switch (bud) {
1009         case FIRST:
1010                 zhdr->first_chunks = 0;
1011                 break;
1012         case MIDDLE:
1013                 zhdr->middle_chunks = 0;
1014                 break;
1015         case LAST:
1016                 zhdr->last_chunks = 0;
1017                 break;
1018         default:
1019                 pr_err("%s: unknown bud %d\n", __func__, bud);
1020                 WARN_ON(1);
1021                 z3fold_page_unlock(zhdr);
1022                 return;
1023         }
1024
1025         free_handle(handle);
1026         if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) {
1027                 atomic64_dec(&pool->pages_nr);
1028                 return;
1029         }
1030         if (test_bit(PAGE_CLAIMED, &page->private)) {
1031                 z3fold_page_unlock(zhdr);
1032                 return;
1033         }
1034         if (unlikely(PageIsolated(page)) ||
1035             test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
1036                 z3fold_page_unlock(zhdr);
1037                 return;
1038         }
1039         if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {
1040                 spin_lock(&pool->lock);
1041                 list_del_init(&zhdr->buddy);
1042                 spin_unlock(&pool->lock);
1043                 zhdr->cpu = -1;
1044                 kref_get(&zhdr->refcount);
1045                 do_compact_page(zhdr, true);
1046                 return;
1047         }
1048         kref_get(&zhdr->refcount);
1049         queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
1050         z3fold_page_unlock(zhdr);
1051 }
1052
1053 /**
1054  * z3fold_reclaim_page() - evicts allocations from a pool page and frees it
1055  * @pool:       pool from which a page will attempt to be evicted
1056  * @retries:    number of pages on the LRU list for which eviction will
1057  *              be attempted before failing
1058  *
1059  * z3fold reclaim is different from normal system reclaim in that it is done
1060  * from the bottom, up. This is because only the bottom layer, z3fold, has
1061  * information on how the allocations are organized within each z3fold page.
1062  * This has the potential to create interesting locking situations between
1063  * z3fold and the user, however.
1064  *
1065  * To avoid these, this is how z3fold_reclaim_page() should be called:
1066  *
1067  * The user detects a page should be reclaimed and calls z3fold_reclaim_page().
1068  * z3fold_reclaim_page() will remove a z3fold page from the pool LRU list and
1069  * call the user-defined eviction handler with the pool and handle as
1070  * arguments.
1071  *
1072  * If the handle can not be evicted, the eviction handler should return
1073  * non-zero. z3fold_reclaim_page() will add the z3fold page back to the
1074  * appropriate list and try the next z3fold page on the LRU up to
1075  * a user defined number of retries.
1076  *
1077  * If the handle is successfully evicted, the eviction handler should
1078  * return 0 _and_ should have called z3fold_free() on the handle. z3fold_free()
1079  * contains logic to delay freeing the page if the page is under reclaim,
1080  * as indicated by the setting of the PG_reclaim flag on the underlying page.
1081  *
1082  * If all buddies in the z3fold page are successfully evicted, then the
1083  * z3fold page can be freed.
1084  *
1085  * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
1086  * no pages to evict or an eviction handler is not registered, -EAGAIN if
1087  * the retry limit was hit.
1088  */
1089 static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
1090 {
1091         int i, ret = 0;
1092         struct z3fold_header *zhdr = NULL;
1093         struct page *page = NULL;
1094         struct list_head *pos;
1095         unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;
1096
1097         spin_lock(&pool->lock);
1098         if (!pool->ops || !pool->ops->evict || retries == 0) {
1099                 spin_unlock(&pool->lock);
1100                 return -EINVAL;
1101         }
1102         for (i = 0; i < retries; i++) {
1103                 if (list_empty(&pool->lru)) {
1104                         spin_unlock(&pool->lock);
1105                         return -EINVAL;
1106                 }
1107                 list_for_each_prev(pos, &pool->lru) {
1108                         page = list_entry(pos, struct page, lru);
1109
1110                         /* this bit could have been set by free, in which case
1111                          * we pass over to the next page in the pool.
1112                          */
1113                         if (test_and_set_bit(PAGE_CLAIMED, &page->private))
1114                                 continue;
1115
1116                         if (unlikely(PageIsolated(page)))
1117                                 continue;
1118                         if (test_bit(PAGE_HEADLESS, &page->private))
1119                                 break;
1120
1121                         zhdr = page_address(page);
1122                         if (!z3fold_page_trylock(zhdr)) {
1123                                 zhdr = NULL;
1124                                 continue; /* can't evict at this point */
1125                         }
1126                         kref_get(&zhdr->refcount);
1127                         list_del_init(&zhdr->buddy);
1128                         zhdr->cpu = -1;
1129                         break;
1130                 }
1131
1132                 if (!zhdr)
1133                         break;
1134
1135                 list_del_init(&page->lru);
1136                 spin_unlock(&pool->lock);
1137
1138                 if (!test_bit(PAGE_HEADLESS, &page->private)) {
1139                         /*
1140                          * We need to encode the handles before unlocking, since
1141                          * we can race with free that will set
1142                          * (first|last)_chunks to 0
1143                          */
1144                         first_handle = 0;
1145                         last_handle = 0;
1146                         middle_handle = 0;
1147                         if (zhdr->first_chunks)
1148                                 first_handle = encode_handle(zhdr, FIRST);
1149                         if (zhdr->middle_chunks)
1150                                 middle_handle = encode_handle(zhdr, MIDDLE);
1151                         if (zhdr->last_chunks)
1152                                 last_handle = encode_handle(zhdr, LAST);
1153                         /*
1154                          * it's safe to unlock here because we hold a
1155                          * reference to this page
1156                          */
1157                         z3fold_page_unlock(zhdr);
1158                 } else {
1159                         first_handle = encode_handle(zhdr, HEADLESS);
1160                         last_handle = middle_handle = 0;
1161                 }
1162
1163                 /* Issue the eviction callback(s) */
1164                 if (middle_handle) {
1165                         ret = pool->ops->evict(pool, middle_handle);
1166                         if (ret)
1167                                 goto next;
1168                 }
1169                 if (first_handle) {
1170                         ret = pool->ops->evict(pool, first_handle);
1171                         if (ret)
1172                                 goto next;
1173                 }
1174                 if (last_handle) {
1175                         ret = pool->ops->evict(pool, last_handle);
1176                         if (ret)
1177                                 goto next;
1178                 }
1179 next:
1180                 if (test_bit(PAGE_HEADLESS, &page->private)) {
1181                         if (ret == 0) {
1182                                 free_z3fold_page(page, true);
1183                                 atomic64_dec(&pool->pages_nr);
1184                                 return 0;
1185                         }
1186                         spin_lock(&pool->lock);
1187                         list_add(&page->lru, &pool->lru);
1188                         spin_unlock(&pool->lock);
1189                 } else {
1190                         z3fold_page_lock(zhdr);
1191                         clear_bit(PAGE_CLAIMED, &page->private);
1192                         if (kref_put(&zhdr->refcount,
1193                                         release_z3fold_page_locked)) {
1194                                 atomic64_dec(&pool->pages_nr);
1195                                 return 0;
1196                         }
1197                         /*
1198                          * if we are here, the page is still not completely
1199                          * free. Take the global pool lock so that we can
1200                          * add it back to the lru list.
1201                          */
1202                         spin_lock(&pool->lock);
1203                         list_add(&page->lru, &pool->lru);
1204                         spin_unlock(&pool->lock);
1205                         z3fold_page_unlock(zhdr);
1206                 }
1207
1208                 /* We started off locked, so we need to lock the pool back */
1209                 spin_lock(&pool->lock);
1210         }
1211         spin_unlock(&pool->lock);
1212         return -EAGAIN;
1213 }
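/*
 * Sketch of the user-defined eviction handler described in the comment
 * above z3fold_reclaim_page() (editor's illustration, not part of this
 * file).  In practice the user sits on the other side of the zpool API, so
 * the handler below is a zpool_ops->evict callback; z3fold_zpool_evict()
 * further down forwards z3fold's per-handle callbacks to it.
 * write_back_entry() is a hypothetical stand-in for the user's writeback
 * logic.
 *
 *	static int my_zpool_evict(struct zpool *zp, unsigned long handle)
 *	{
 *		void *data = zpool_map_handle(zp, handle, ZPOOL_MM_RO);
 *		int err = write_back_entry(data);	// hypothetical helper
 *
 *		zpool_unmap_handle(zp, handle);
 *		if (err)
 *			return err;	// non-zero: page goes back on the LRU
 *		zpool_free(zp, handle);	// success: handler must free the handle
 *		return 0;
 *	}
 */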
1214
1215 /**
1216  * z3fold_map() - maps the allocation associated with the given handle
1217  * @pool:       pool in which the allocation resides
1218  * @handle:     handle associated with the allocation to be mapped
1219  *
1220  * Extracts the buddy number from handle and constructs the pointer to the
1221  * correct starting chunk within the page.
1222  *
1223  * Returns: a pointer to the mapped allocation
1224  */
1225 static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
1226 {
1227         struct z3fold_header *zhdr;
1228         struct page *page;
1229         void *addr;
1230         enum buddy buddy;
1231
1232         zhdr = handle_to_z3fold_header(handle);
1233         addr = zhdr;
1234         page = virt_to_page(zhdr);
1235
1236         if (test_bit(PAGE_HEADLESS, &page->private))
1237                 goto out;
1238
1239         z3fold_page_lock(zhdr);
1240         buddy = handle_to_buddy(handle);
1241         switch (buddy) {
1242         case FIRST:
1243                 addr += ZHDR_SIZE_ALIGNED;
1244                 break;
1245         case MIDDLE:
1246                 addr += zhdr->start_middle << CHUNK_SHIFT;
1247                 set_bit(MIDDLE_CHUNK_MAPPED, &page->private);
1248                 break;
1249         case LAST:
1250                 addr += PAGE_SIZE - (handle_to_chunks(handle) << CHUNK_SHIFT);
1251                 break;
1252         default:
1253                 pr_err("unknown buddy id %d\n", buddy);
1254                 WARN_ON(1);
1255                 addr = NULL;
1256                 break;
1257         }
1258
1259         if (addr)
1260                 zhdr->mapped_count++;
1261         z3fold_page_unlock(zhdr);
1262 out:
1263         return addr;
1264 }
1265
1266 /**
1267  * z3fold_unmap() - unmaps the allocation associated with the given handle
1268  * @pool:       pool in which the allocation resides
1269  * @handle:     handle associated with the allocation to be unmapped
1270  */
1271 static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle)
1272 {
1273         struct z3fold_header *zhdr;
1274         struct page *page;
1275         enum buddy buddy;
1276
1277         zhdr = handle_to_z3fold_header(handle);
1278         page = virt_to_page(zhdr);
1279
1280         if (test_bit(PAGE_HEADLESS, &page->private))
1281                 return;
1282
1283         z3fold_page_lock(zhdr);
1284         buddy = handle_to_buddy(handle);
1285         if (buddy == MIDDLE)
1286                 clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
1287         zhdr->mapped_count--;
1288         z3fold_page_unlock(zhdr);
1289 }
1290
1291 /**
1292  * z3fold_get_pool_size() - gets the z3fold pool size in pages
1293  * @pool:       pool whose size is being queried
1294  *
1295  * Returns: size in pages of the given pool.
1296  */
1297 static u64 z3fold_get_pool_size(struct z3fold_pool *pool)
1298 {
1299         return atomic64_read(&pool->pages_nr);
1300 }
1301
1302 static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
1303 {
1304         struct z3fold_header *zhdr;
1305         struct z3fold_pool *pool;
1306
1307         VM_BUG_ON_PAGE(!PageMovable(page), page);
1308         VM_BUG_ON_PAGE(PageIsolated(page), page);
1309
1310         if (test_bit(PAGE_HEADLESS, &page->private))
1311                 return false;
1312
1313         zhdr = page_address(page);
1314         z3fold_page_lock(zhdr);
1315         if (test_bit(NEEDS_COMPACTING, &page->private) ||
1316             test_bit(PAGE_STALE, &page->private))
1317                 goto out;
1318
1319         pool = zhdr_to_pool(zhdr);
1320
1321         if (zhdr->mapped_count == 0) {
1322                 kref_get(&zhdr->refcount);
1323                 if (!list_empty(&zhdr->buddy))
1324                         list_del_init(&zhdr->buddy);
1325                 spin_lock(&pool->lock);
1326                 if (!list_empty(&page->lru))
1327                         list_del(&page->lru);
1328                 spin_unlock(&pool->lock);
1329                 z3fold_page_unlock(zhdr);
1330                 return true;
1331         }
1332 out:
1333         z3fold_page_unlock(zhdr);
1334         return false;
1335 }
1336
1337 static int z3fold_page_migrate(struct address_space *mapping, struct page *newpage,
1338                                struct page *page, enum migrate_mode mode)
1339 {
1340         struct z3fold_header *zhdr, *new_zhdr;
1341         struct z3fold_pool *pool;
1342         struct address_space *new_mapping;
1343
1344         VM_BUG_ON_PAGE(!PageMovable(page), page);
1345         VM_BUG_ON_PAGE(!PageIsolated(page), page);
1346         VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
1347
1348         zhdr = page_address(page);
1349         pool = zhdr_to_pool(zhdr);
1350
1351         if (!trylock_page(page))
1352                 return -EAGAIN;
1353
1354         if (!z3fold_page_trylock(zhdr)) {
1355                 unlock_page(page);
1356                 return -EAGAIN;
1357         }
1358         if (zhdr->mapped_count != 0) {
1359                 z3fold_page_unlock(zhdr);
1360                 unlock_page(page);
1361                 return -EBUSY;
1362         }
1363         new_zhdr = page_address(newpage);
1364         memcpy(new_zhdr, zhdr, PAGE_SIZE);
1365         newpage->private = page->private;
1366         page->private = 0;
1367         z3fold_page_unlock(zhdr);
1368         spin_lock_init(&new_zhdr->page_lock);
1369         new_mapping = page_mapping(page);
1370         __ClearPageMovable(page);
1371         ClearPagePrivate(page);
1372
1373         get_page(newpage);
1374         z3fold_page_lock(new_zhdr);
1375         if (new_zhdr->first_chunks)
1376                 encode_handle(new_zhdr, FIRST);
1377         if (new_zhdr->last_chunks)
1378                 encode_handle(new_zhdr, LAST);
1379         if (new_zhdr->middle_chunks)
1380                 encode_handle(new_zhdr, MIDDLE);
1381         set_bit(NEEDS_COMPACTING, &newpage->private);
1382         new_zhdr->cpu = smp_processor_id();
1383         spin_lock(&pool->lock);
1384         list_add(&newpage->lru, &pool->lru);
1385         spin_unlock(&pool->lock);
1386         __SetPageMovable(newpage, new_mapping);
1387         z3fold_page_unlock(new_zhdr);
1388
1389         queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
1390
1391         page_mapcount_reset(page);
1392         unlock_page(page);
1393         put_page(page);
1394         return 0;
1395 }
1396
1397 static void z3fold_page_putback(struct page *page)
1398 {
1399         struct z3fold_header *zhdr;
1400         struct z3fold_pool *pool;
1401
1402         zhdr = page_address(page);
1403         pool = zhdr_to_pool(zhdr);
1404
1405         z3fold_page_lock(zhdr);
1406         if (!list_empty(&zhdr->buddy))
1407                 list_del_init(&zhdr->buddy);
1408         INIT_LIST_HEAD(&page->lru);
1409         if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
1410                 atomic64_dec(&pool->pages_nr);
1411                 return;
1412         }
1413         spin_lock(&pool->lock);
1414         list_add(&page->lru, &pool->lru);
1415         spin_unlock(&pool->lock);
1416         z3fold_page_unlock(zhdr);
1417 }
1418
1419 static const struct address_space_operations z3fold_aops = {
1420         .isolate_page = z3fold_page_isolate,
1421         .migratepage = z3fold_page_migrate,
1422         .putback_page = z3fold_page_putback,
1423 };
1424
1425 /*****************
1426  * zpool
1427  ****************/
1428
1429 static int z3fold_zpool_evict(struct z3fold_pool *pool, unsigned long handle)
1430 {
1431         if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict)
1432                 return pool->zpool_ops->evict(pool->zpool, handle);
1433         else
1434                 return -ENOENT;
1435 }
1436
1437 static const struct z3fold_ops z3fold_zpool_ops = {
1438         .evict =        z3fold_zpool_evict
1439 };
1440
1441 static void *z3fold_zpool_create(const char *name, gfp_t gfp,
1442                                const struct zpool_ops *zpool_ops,
1443                                struct zpool *zpool)
1444 {
1445         struct z3fold_pool *pool;
1446
1447         pool = z3fold_create_pool(name, gfp,
1448                                 zpool_ops ? &z3fold_zpool_ops : NULL);
1449         if (pool) {
1450                 pool->zpool = zpool;
1451                 pool->zpool_ops = zpool_ops;
1452         }
1453         return pool;
1454 }
1455
1456 static void z3fold_zpool_destroy(void *pool)
1457 {
1458         z3fold_destroy_pool(pool);
1459 }
1460
1461 static int z3fold_zpool_malloc(void *pool, size_t size, gfp_t gfp,
1462                         unsigned long *handle)
1463 {
1464         return z3fold_alloc(pool, size, gfp, handle);
1465 }
1466 static void z3fold_zpool_free(void *pool, unsigned long handle)
1467 {
1468         z3fold_free(pool, handle);
1469 }
1470
1471 static int z3fold_zpool_shrink(void *pool, unsigned int pages,
1472                         unsigned int *reclaimed)
1473 {
1474         unsigned int total = 0;
1475         int ret = -EINVAL;
1476
1477         while (total < pages) {
1478                 ret = z3fold_reclaim_page(pool, 8);
1479                 if (ret < 0)
1480                         break;
1481                 total++;
1482         }
1483
1484         if (reclaimed)
1485                 *reclaimed = total;
1486
1487         return ret;
1488 }
1489
1490 static void *z3fold_zpool_map(void *pool, unsigned long handle,
1491                         enum zpool_mapmode mm)
1492 {
1493         return z3fold_map(pool, handle);
1494 }
1495 static void z3fold_zpool_unmap(void *pool, unsigned long handle)
1496 {
1497         z3fold_unmap(pool, handle);
1498 }
1499
1500 static u64 z3fold_zpool_total_size(void *pool)
1501 {
1502         return z3fold_get_pool_size(pool) * PAGE_SIZE;
1503 }
1504
1505 static struct zpool_driver z3fold_zpool_driver = {
1506         .type =         "z3fold",
1507         .owner =        THIS_MODULE,
1508         .create =       z3fold_zpool_create,
1509         .destroy =      z3fold_zpool_destroy,
1510         .malloc =       z3fold_zpool_malloc,
1511         .free =         z3fold_zpool_free,
1512         .shrink =       z3fold_zpool_shrink,
1513         .map =          z3fold_zpool_map,
1514         .unmap =        z3fold_zpool_unmap,
1515         .total_size =   z3fold_zpool_total_size,
1516 };
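/*
 * Usage sketch (editor's illustration): since z3fold exports no API of its
 * own, a consumer reaches the driver above through the generic zpool layer,
 * roughly as follows.  Variable names are hypothetical; the zpool_* calls
 * are the standard zpool API.
 *
 *	struct zpool *zp = zpool_create_pool("z3fold", "mypool", GFP_KERNEL,
 *					     &my_zpool_ops);
 *	unsigned long handle;
 *
 *	if (zp && zpool_malloc(zp, len, GFP_KERNEL, &handle) == 0) {
 *		void *dst = zpool_map_handle(zp, handle, ZPOOL_MM_WO);
 *
 *		memcpy(dst, src, len);
 *		zpool_unmap_handle(zp, handle);
 *		...
 *		zpool_free(zp, handle);
 *	}
 *	if (zp)
 *		zpool_destroy_pool(zp);
 */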
1517
1518 MODULE_ALIAS("zpool-z3fold");
1519
1520 static int __init init_z3fold(void)
1521 {
1522         int ret;
1523
1524         /* Make sure the z3fold header is not larger than the page size */
1525         BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE);
1526         ret = z3fold_mount();
1527         if (ret)
1528                 return ret;
1529
1530         zpool_register_driver(&z3fold_zpool_driver);
1531
1532         return 0;
1533 }
1534
1535 static void __exit exit_z3fold(void)
1536 {
1537         z3fold_unmount();
1538         zpool_unregister_driver(&z3fold_zpool_driver);
1539 }
1540
1541 module_init(init_z3fold);
1542 module_exit(exit_z3fold);
1543
1544 MODULE_LICENSE("GPL");
1545 MODULE_AUTHOR("Vitaly Wool <vitalywool@gmail.com>");
1546 MODULE_DESCRIPTION("3-Fold Allocator for Compressed Pages");