mm/z3fold.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * z3fold.c
   4  *
   5  * Author: Vitaly Wool <vitaly.wool@konsulko.com>
   6  * Copyright (C) 2016, Sony Mobile Communications Inc.
   7  *
   8  * This implementation is based on zbud written by Seth Jennings.
   9  *
  10  * z3fold is an special purpose allocator for storing compressed pages. It
  11  * can store up to three compressed pages per page which improves the
  12  * compression ratio of zbud while retaining its main concepts (e. g. always
  13  * storing an integral number of objects per page) and simplicity.
  14  * It still has simple and deterministic reclaim properties that make it
  15  * preferable to a higher density approach (with no requirement on integral
  16  * number of object per page) when reclaim is used.
  17  *
  18  * As in zbud, pages are divided into "chunks".  The size of the chunks is
  19  * fixed at compile time and is determined by NCHUNKS_ORDER below.
  20  *
  21  * z3fold doesn't export any API and is meant to be used via zpool API.
  22  */
  23
  24 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  25
  26 #include <linux/atomic.h>
  27 #include <linux/sched.h>
  28 #include <linux/cpumask.h>
  29 #include <linux/dcache.h>
  30 #include <linux/list.h>
  31 #include <linux/mm.h>
  32 #include <linux/module.h>
  33 #include <linux/page-flags.h>
  34 #include <linux/migrate.h>
  35 #include <linux/node.h>
  36 #include <linux/compaction.h>
  37 #include <linux/percpu.h>
  38 #include <linux/mount.h>
  39 #include <linux/fs.h>
  40 #include <linux/preempt.h>
  41 #include <linux/workqueue.h>
  42 #include <linux/slab.h>
  43 #include <linux/spinlock.h>
  44 #include <linux/zpool.h>
  45
  46 /*
  47  * NCHUNKS_ORDER determines the internal allocation granularity, effectively
  48  * adjusting internal fragmentation.  It also determines the number of
  49  * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
  50  * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks
  51  * in the beginning of an allocated page are occupied by z3fold header, so
  52  * NCHUNKS will be calculated to 63 (or 62 in case CONFIG_DEBUG_SPINLOCK=y),
  53  * which shows the max number of free chunks in z3fold page, also there will
  54  * be 63, or 62, respectively, freelists per pool.
  55  */
  56 #define NCHUNKS_ORDER   6
  57
  58 #define CHUNK_SHIFT     (PAGE_SHIFT - NCHUNKS_ORDER)
  59 #define CHUNK_SIZE      (1 << CHUNK_SHIFT)
  60 #define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE)
  61 #define ZHDR_CHUNKS     (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT)
  62 #define TOTAL_CHUNKS    (PAGE_SIZE >> CHUNK_SHIFT)
  63 #define NCHUNKS         ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
  64
  65 #define BUDDY_MASK      (0x3)
  66 #define BUDDY_SHIFT     2
  67 #define SLOTS_ALIGN     (0x40)
  68
  69 /*****************
  70  * Structures
  71 *****************/
  72 struct z3fold_pool;
  73 struct z3fold_ops {
  74         int (*evict)(struct z3fold_pool *pool, unsigned long handle);
  75 };
  76
  77 enum buddy {
  78         HEADLESS = 0,
  79         FIRST,
  80         MIDDLE,
  81         LAST,
  82         BUDDIES_MAX = LAST
  83 };
  84
  85 struct z3fold_buddy_slots {
  86         /*
  87          * we are using BUDDY_MASK in handle_to_buddy etc. so there should
  88          * be enough slots to hold all possible variants
  89          */
  90         unsigned long slot[BUDDY_MASK + 1];
  91         unsigned long pool; /* back link + flags */
  92 };
  93 #define HANDLE_FLAG_MASK        (0x03)
  94
  95 /*
  96  * struct z3fold_header - z3fold page metadata occupying first chunks of each
  97  *                      z3fold page, except for HEADLESS pages
  98  * @buddy:              links the z3fold page into the relevant list in the
  99  *                      pool
 100  * @page_lock:          per-page lock
 101  * @refcount:           reference count for the z3fold page
 102  * @work:               work_struct for page layout optimization
 103  * @slots:              pointer to the structure holding buddy slots
 104  * @pool:               pointer to the containing pool
 105  * @cpu:                CPU which this page "belongs" to
 106  * @first_chunks:       the size of the first buddy in chunks, 0 if free
 107  * @middle_chunks:      the size of the middle buddy in chunks, 0 if free
 108  * @last_chunks:        the size of the last buddy in chunks, 0 if free
 109  * @first_num:          the starting number (for the first handle)
 110  * @mapped_count:       the number of objects currently mapped
 111  */
 112 struct z3fold_header {
 113         struct list_head buddy;
 114         spinlock_t page_lock;
 115         struct kref refcount;
 116         struct work_struct work;
 117         struct z3fold_buddy_slots *slots;
 118         struct z3fold_pool *pool;
 119         short cpu;
 120         unsigned short first_chunks;
 121         unsigned short middle_chunks;
 122         unsigned short last_chunks;
 123         unsigned short start_middle;
 124         unsigned short first_num:2;
 125         unsigned short mapped_count:2;
 126 };
 127
 128 /**
 129  * struct z3fold_pool - stores metadata for each z3fold pool
 130  * @name:       pool name
 131  * @lock:       protects pool unbuddied/lru lists
 132  * @stale_lock: protects pool stale page list
 133  * @unbuddied:  per-cpu array of lists tracking z3fold pages that contain 2-
 134  *              buddies; the list each z3fold page is added to depends on
 135  *              the size of its free region.
 136  * @lru:        list tracking the z3fold pages in LRU order by most recently
 137  *              added buddy.
 138  * @stale:      list of pages marked for freeing
 139  * @pages_nr:   number of z3fold pages in the pool.
 140  * @c_handle:   cache for z3fold_buddy_slots allocation
 141  * @ops:        pointer to a structure of user defined operations specified at
 142  *              pool creation time.
 143  * @compact_wq: workqueue for page layout background optimization
 144  * @release_wq: workqueue for safe page release
 145  * @work:       work_struct for safe page release
 146  * @inode:      inode for z3fold pseudo filesystem
 147  *
 148  * This structure is allocated at pool creation time and maintains metadata
 149  * pertaining to a particular z3fold pool.
 150  */
 151 struct z3fold_pool {
 152         const char *name;
 153         spinlock_t lock;
 154         spinlock_t stale_lock;
 155         struct list_head *unbuddied;
 156         struct list_head lru;
 157         struct list_head stale;
 158         atomic64_t pages_nr;
 159         struct kmem_cache *c_handle;
 160         const struct z3fold_ops *ops;
 161         struct zpool *zpool;
 162         const struct zpool_ops *zpool_ops;
 163         struct workqueue_struct *compact_wq;
 164         struct workqueue_struct *release_wq;
 165         struct work_struct work;
 166         struct inode *inode;
 167 };
 168
 169 /*
 170  * Internal z3fold page flags
 171  */
 172 enum z3fold_page_flags {
 173         PAGE_HEADLESS = 0,
 174         MIDDLE_CHUNK_MAPPED,
 175         NEEDS_COMPACTING,
 176         PAGE_STALE,
 177         PAGE_CLAIMED, /* by either reclaim or free */
 178 };
 179
 180 /*****************
 181  * Helpers
 182 *****************/
 183
 184 /* Converts an allocation size in bytes to size in z3fold chunks */
 185 static int size_to_chunks(size_t size)
 186 {
 187         return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
 188 }
 189
 190 #define for_each_unbuddied_list(_iter, _begin) \
 191         for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
 192
 193 static void compact_page_work(struct work_struct *w);
 194
 195 static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
 196                                                         gfp_t gfp)
 197 {
 198         struct z3fold_buddy_slots *slots = kmem_cache_alloc(pool->c_handle,
 199                                                             gfp);
 200
 201         if (slots) {
 202                 memset(slots->slot, 0, sizeof(slots->slot));
 203                 slots->pool = (unsigned long)pool;
 204         }
 205
 206         return slots;
 207 }
 208
 209 static inline struct z3fold_pool *slots_to_pool(struct z3fold_buddy_slots *s)
 210 {
 211         return (struct z3fold_pool *)(s->pool & ~HANDLE_FLAG_MASK);
 212 }
 213
 214 static inline struct z3fold_buddy_slots *handle_to_slots(unsigned long handle)
 215 {
 216         return (struct z3fold_buddy_slots *)(handle & ~(SLOTS_ALIGN - 1));
 217 }
 218
 219 static inline void free_handle(unsigned long handle)
 220 {
 221         struct z3fold_buddy_slots *slots;
 222         int i;
 223         bool is_free;
 224
 225         if (handle & (1 << PAGE_HEADLESS))
 226                 return;
 227
 228         WARN_ON(*(unsigned long *)handle == 0);
 229         *(unsigned long *)handle = 0;
 230         slots = handle_to_slots(handle);
 231         is_free = true;
 232         for (i = 0; i <= BUDDY_MASK; i++) {
 233                 if (slots->slot[i]) {
 234                         is_free = false;
 235                         break;
 236                 }
 237         }
 238
 239         if (is_free) {
 240                 struct z3fold_pool *pool = slots_to_pool(slots);
 241
 242                 kmem_cache_free(pool->c_handle, slots);
 243         }
 244 }
 245
 246 static struct dentry *z3fold_do_mount(struct file_system_type *fs_type,
 247                                 int flags, const char *dev_name, void *data)
 248 {
 249         static const struct dentry_operations ops = {
 250                 .d_dname = simple_dname,
 251         };
 252
 253         return mount_pseudo(fs_type, "z3fold:", NULL, &ops, 0x33);
 254 }
 255
 256 static struct file_system_type z3fold_fs = {
 257         .name           = "z3fold",
 258         .mount          = z3fold_do_mount,
 259         .kill_sb        = kill_anon_super,
 260 };
 261
 262 static struct vfsmount *z3fold_mnt;
 263 static int z3fold_mount(void)
 264 {
 265         int ret = 0;
 266
 267         z3fold_mnt = kern_mount(&z3fold_fs);
 268         if (IS_ERR(z3fold_mnt))
 269                 ret = PTR_ERR(z3fold_mnt);
 270
 271         return ret;
 272 }
 273
 274 static void z3fold_unmount(void)
 275 {
 276         kern_unmount(z3fold_mnt);
 277 }
 278
 279 static const struct address_space_operations z3fold_aops;
 280 static int z3fold_register_migration(struct z3fold_pool *pool)
 281 {
 282         pool->inode = alloc_anon_inode(z3fold_mnt->mnt_sb);
 283         if (IS_ERR(pool->inode)) {
 284                 pool->inode = NULL;
 285                 return 1;
 286         }
 287
 288         pool->inode->i_mapping->private_data = pool;
 289         pool->inode->i_mapping->a_ops = &z3fold_aops;
 290         return 0;
 291 }
 292
 293 static void z3fold_unregister_migration(struct z3fold_pool *pool)
 294 {
 295         if (pool->inode)
 296                 iput(pool->inode);
 297  }
 298
 299 /* Initializes the z3fold header of a newly allocated z3fold page */
 300 static struct z3fold_header *init_z3fold_page(struct page *page,
 301                                         struct z3fold_pool *pool, gfp_t gfp)
 302 {
 303         struct z3fold_header *zhdr = page_address(page);
 304         struct z3fold_buddy_slots *slots = alloc_slots(pool, gfp);
 305
 306         if (!slots)
 307                 return NULL;
 308
 309         INIT_LIST_HEAD(&page->lru);
 310         clear_bit(PAGE_HEADLESS, &page->private);
 311         clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
 312         clear_bit(NEEDS_COMPACTING, &page->private);
 313         clear_bit(PAGE_STALE, &page->private);
 314         clear_bit(PAGE_CLAIMED, &page->private);
 315
 316         spin_lock_init(&zhdr->page_lock);
 317         kref_init(&zhdr->refcount);
 318         zhdr->first_chunks = 0;
 319         zhdr->middle_chunks = 0;
 320         zhdr->last_chunks = 0;
 321         zhdr->first_num = 0;
 322         zhdr->start_middle = 0;
 323         zhdr->cpu = -1;
 324         zhdr->slots = slots;
 325         zhdr->pool = pool;
 326         INIT_LIST_HEAD(&zhdr->buddy);
 327         INIT_WORK(&zhdr->work, compact_page_work);
 328         return zhdr;
 329 }
 330
 331 /* Resets the struct page fields and frees the page */
 332 static void free_z3fold_page(struct page *page, bool headless)
 333 {
 334         if (!headless) {
 335                 lock_page(page);
 336                 __ClearPageMovable(page);
 337                 unlock_page(page);
 338         }
 339         ClearPagePrivate(page);
 340         __free_page(page);
 341 }
 342
 343 /* Lock a z3fold page */
 344 static inline void z3fold_page_lock(struct z3fold_header *zhdr)
 345 {
 346         spin_lock(&zhdr->page_lock);
 347 }
 348
 349 /* Try to lock a z3fold page */
 350 static inline int z3fold_page_trylock(struct z3fold_header *zhdr)
 351 {
 352         return spin_trylock(&zhdr->page_lock);
 353 }
 354
 355 /* Unlock a z3fold page */
 356 static inline void z3fold_page_unlock(struct z3fold_header *zhdr)
 357 {
 358         spin_unlock(&zhdr->page_lock);
 359 }
 360
 361 /* Helper function to build the index */
 362 static inline int __idx(struct z3fold_header *zhdr, enum buddy bud)
 363 {
 364         return (bud + zhdr->first_num) & BUDDY_MASK;
 365 }
 366
 367 /*
 368  * Encodes the handle of a particular buddy within a z3fold page
 369  * Pool lock should be held as this function accesses first_num
 370  */
 371 static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
 372 {
 373         struct z3fold_buddy_slots *slots;
 374         unsigned long h = (unsigned long)zhdr;
 375         int idx = 0;
 376
 377         /*
 378          * For a headless page, its handle is its pointer with the extra
 379          * PAGE_HEADLESS bit set
 380          */
 381         if (bud == HEADLESS)
 382                 return h | (1 << PAGE_HEADLESS);
 383
 384         /* otherwise, return pointer to encoded handle */
 385         idx = __idx(zhdr, bud);
 386         h += idx;
 387         if (bud == LAST)
 388                 h |= (zhdr->last_chunks << BUDDY_SHIFT);
 389
 390         slots = zhdr->slots;
 391         slots->slot[idx] = h;
 392         return (unsigned long)&slots->slot[idx];
 393 }
 394
 395 /* Returns the z3fold page where a given handle is stored */
 396 static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h)
 397 {
 398         unsigned long addr = h;
 399
 400         if (!(addr & (1 << PAGE_HEADLESS)))
 401                 addr = *(unsigned long *)h;
 402
 403         return (struct z3fold_header *)(addr & PAGE_MASK);
 404 }
 405
 406 /* only for LAST bud, returns zero otherwise */
 407 static unsigned short handle_to_chunks(unsigned long handle)
 408 {
 409         unsigned long addr = *(unsigned long *)handle;
 410
 411         return (addr & ~PAGE_MASK) >> BUDDY_SHIFT;
 412 }
 413
 414 /*
 415  * (handle & BUDDY_MASK) < zhdr->first_num is possible in encode_handle
 416  *  but that doesn't matter. because the masking will result in the
 417  *  correct buddy number.
 418  */
 419 static enum buddy handle_to_buddy(unsigned long handle)
 420 {
 421         struct z3fold_header *zhdr;
 422         unsigned long addr;
 423
 424         WARN_ON(handle & (1 << PAGE_HEADLESS));
 425         addr = *(unsigned long *)handle;
 426         zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
 427         return (addr - zhdr->first_num) & BUDDY_MASK;
 428 }
 429
 430 static inline struct z3fold_pool *zhdr_to_pool(struct z3fold_header *zhdr)
 431 {
 432         return zhdr->pool;
 433 }
 434
 435 static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
 436 {
 437         struct page *page = virt_to_page(zhdr);
 438         struct z3fold_pool *pool = zhdr_to_pool(zhdr);
 439
 440         WARN_ON(!list_empty(&zhdr->buddy));
 441         set_bit(PAGE_STALE, &page->private);
 442         clear_bit(NEEDS_COMPACTING, &page->private);
 443         spin_lock(&pool->lock);
 444         if (!list_empty(&page->lru))
 445                 list_del_init(&page->lru);
 446         spin_unlock(&pool->lock);
 447         if (locked)
 448                 z3fold_page_unlock(zhdr);
 449         spin_lock(&pool->stale_lock);
 450         list_add(&zhdr->buddy, &pool->stale);
 451         queue_work(pool->release_wq, &pool->work);
 452         spin_unlock(&pool->stale_lock);
 453 }
 454
 455 static void __attribute__((__unused__))
 456                         release_z3fold_page(struct kref *ref)
 457 {
 458         struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
 459                                                 refcount);
 460         __release_z3fold_page(zhdr, false);
 461 }
 462
 463 static void release_z3fold_page_locked(struct kref *ref)
 464 {
 465         struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
 466                                                 refcount);
 467         WARN_ON(z3fold_page_trylock(zhdr));
 468         __release_z3fold_page(zhdr, true);
 469 }
 470
 471 static void release_z3fold_page_locked_list(struct kref *ref)
 472 {
 473         struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
 474                                                refcount);
 475         struct z3fold_pool *pool = zhdr_to_pool(zhdr);
 476         spin_lock(&pool->lock);
 477         list_del_init(&zhdr->buddy);
 478         spin_unlock(&pool->lock);
 479
 480         WARN_ON(z3fold_page_trylock(zhdr));
 481         __release_z3fold_page(zhdr, true);
 482 }
 483
 484 static void free_pages_work(struct work_struct *w)
 485 {
 486         struct z3fold_pool *pool = container_of(w, struct z3fold_pool, work);
 487
 488         spin_lock(&pool->stale_lock);
 489         while (!list_empty(&pool->stale)) {
 490                 struct z3fold_header *zhdr = list_first_entry(&pool->stale,
 491                                                 struct z3fold_header, buddy);
 492                 struct page *page = virt_to_page(zhdr);
 493
 494                 list_del(&zhdr->buddy);
 495                 if (WARN_ON(!test_bit(PAGE_STALE, &page->private)))
 496                         continue;
 497                 spin_unlock(&pool->stale_lock);
 498                 cancel_work_sync(&zhdr->work);
 499                 free_z3fold_page(page, false);
 500                 cond_resched();
 501                 spin_lock(&pool->stale_lock);
 502         }
 503         spin_unlock(&pool->stale_lock);
 504 }
 505
 506 /*
 507  * Returns the number of free chunks in a z3fold page.
 508  * NB: can't be used with HEADLESS pages.
 509  */
 510 static int num_free_chunks(struct z3fold_header *zhdr)
 511 {
 512         int nfree;
 513         /*
 514          * If there is a middle object, pick up the bigger free space
 515          * either before or after it. Otherwise just subtract the number
 516          * of chunks occupied by the first and the last objects.
 517          */
 518         if (zhdr->middle_chunks != 0) {
 519                 int nfree_before = zhdr->first_chunks ?
 520                         0 : zhdr->start_middle - ZHDR_CHUNKS;
 521                 int nfree_after = zhdr->last_chunks ?
 522                         0 : TOTAL_CHUNKS -
 523                                 (zhdr->start_middle + zhdr->middle_chunks);
 524                 nfree = max(nfree_before, nfree_after);
 525         } else
 526                 nfree = NCHUNKS - zhdr->first_chunks - zhdr->last_chunks;
 527         return nfree;
 528 }
 529
 530 /* Add to the appropriate unbuddied list */
 531 static inline void add_to_unbuddied(struct z3fold_pool *pool,
 532                                 struct z3fold_header *zhdr)
 533 {
 534         if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
 535                         zhdr->middle_chunks == 0) {
 536                 struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);
 537
 538                 int freechunks = num_free_chunks(zhdr);
 539                 spin_lock(&pool->lock);
 540                 list_add(&zhdr->buddy, &unbuddied[freechunks]);
 541                 spin_unlock(&pool->lock);
 542                 zhdr->cpu = smp_processor_id();
 543                 put_cpu_ptr(pool->unbuddied);
 544         }
 545 }
 546
 547 static inline void *mchunk_memmove(struct z3fold_header *zhdr,
 548                                 unsigned short dst_chunk)
 549 {
 550         void *beg = zhdr;
 551         return memmove(beg + (dst_chunk << CHUNK_SHIFT),
 552                        beg + (zhdr->start_middle << CHUNK_SHIFT),
 553                        zhdr->middle_chunks << CHUNK_SHIFT);
 554 }
 555
 556 #define BIG_CHUNK_GAP   3
 557 /* Has to be called with lock held */
 558 static int z3fold_compact_page(struct z3fold_header *zhdr)
 559 {
 560         struct page *page = virt_to_page(zhdr);
 561
 562         if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private))
 563                 return 0; /* can't move middle chunk, it's used */
 564
 565         if (unlikely(PageIsolated(page)))
 566                 return 0;
 567
 568         if (zhdr->middle_chunks == 0)
 569                 return 0; /* nothing to compact */
 570
 571         if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
 572                 /* move to the beginning */
 573                 mchunk_memmove(zhdr, ZHDR_CHUNKS);
 574                 zhdr->first_chunks = zhdr->middle_chunks;
 575                 zhdr->middle_chunks = 0;
 576                 zhdr->start_middle = 0;
 577                 zhdr->first_num++;
 578                 return 1;
 579         }
 580
 581         /*
 582          * moving data is expensive, so let's only do that if
 583          * there's substantial gain (at least BIG_CHUNK_GAP chunks)
 584          */
 585         if (zhdr->first_chunks != 0 && zhdr->last_chunks == 0 &&
 586             zhdr->start_middle - (zhdr->first_chunks + ZHDR_CHUNKS) >=
 587                         BIG_CHUNK_GAP) {
 588                 mchunk_memmove(zhdr, zhdr->first_chunks + ZHDR_CHUNKS);
 589                 zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
 590                 return 1;
 591         } else if (zhdr->last_chunks != 0 && zhdr->first_chunks == 0 &&
 592                    TOTAL_CHUNKS - (zhdr->last_chunks + zhdr->start_middle
 593                                         + zhdr->middle_chunks) >=
 594                         BIG_CHUNK_GAP) {
 595                 unsigned short new_start = TOTAL_CHUNKS - zhdr->last_chunks -
 596                         zhdr->middle_chunks;
 597                 mchunk_memmove(zhdr, new_start);
 598                 zhdr->start_middle = new_start;
 599                 return 1;
 600         }
 601
 602         return 0;
 603 }
 604
 605 static void do_compact_page(struct z3fold_header *zhdr, bool locked)
 606 {
 607         struct z3fold_pool *pool = zhdr_to_pool(zhdr);
 608         struct page *page;
 609
 610         page = virt_to_page(zhdr);
 611         if (locked)
 612                 WARN_ON(z3fold_page_trylock(zhdr));
 613         else
 614                 z3fold_page_lock(zhdr);
 615         if (WARN_ON(!test_and_clear_bit(NEEDS_COMPACTING, &page->private))) {
 616                 z3fold_page_unlock(zhdr);
 617                 return;
 618         }
 619         spin_lock(&pool->lock);
 620         list_del_init(&zhdr->buddy);
 621         spin_unlock(&pool->lock);
 622
 623         if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
 624                 atomic64_dec(&pool->pages_nr);
 625                 return;
 626         }
 627
 628         if (unlikely(PageIsolated(page) ||
 629                      test_bit(PAGE_STALE, &page->private))) {
 630                 z3fold_page_unlock(zhdr);
 631                 return;
 632         }
 633
 634         z3fold_compact_page(zhdr);
 635         add_to_unbuddied(pool, zhdr);
 636         z3fold_page_unlock(zhdr);
 637 }
 638
 639 static void compact_page_work(struct work_struct *w)
 640 {
 641         struct z3fold_header *zhdr = container_of(w, struct z3fold_header,
 642                                                 work);
 643
 644         do_compact_page(zhdr, false);
 645 }
 646
 647 /* returns _locked_ z3fold page header or NULL */
 648 static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
 649                                                 size_t size, bool can_sleep)
 650 {
 651         struct z3fold_header *zhdr = NULL;
 652         struct page *page;
 653         struct list_head *unbuddied;
 654         int chunks = size_to_chunks(size), i;
 655
 656 lookup:
 657         /* First, try to find an unbuddied z3fold page. */
 658         unbuddied = get_cpu_ptr(pool->unbuddied);
 659         for_each_unbuddied_list(i, chunks) {
 660                 struct list_head *l = &unbuddied[i];
 661
 662                 zhdr = list_first_entry_or_null(READ_ONCE(l),
 663                                         struct z3fold_header, buddy);
 664
 665                 if (!zhdr)
 666                         continue;
 667
 668                 /* Re-check under lock. */
 669                 spin_lock(&pool->lock);
 670                 l = &unbuddied[i];
 671                 if (unlikely(zhdr != list_first_entry(READ_ONCE(l),
 672                                                 struct z3fold_header, buddy)) ||
 673                     !z3fold_page_trylock(zhdr)) {
 674                         spin_unlock(&pool->lock);
 675                         zhdr = NULL;
 676                         put_cpu_ptr(pool->unbuddied);
 677                         if (can_sleep)
 678                                 cond_resched();
 679                         goto lookup;
 680                 }
 681                 list_del_init(&zhdr->buddy);
 682                 zhdr->cpu = -1;
 683                 spin_unlock(&pool->lock);
 684
 685                 page = virt_to_page(zhdr);
 686                 if (test_bit(NEEDS_COMPACTING, &page->private)) {
 687                         z3fold_page_unlock(zhdr);
 688                         zhdr = NULL;
 689                         put_cpu_ptr(pool->unbuddied);
 690                         if (can_sleep)
 691                                 cond_resched();
 692                         goto lookup;
 693                 }
 694
 695                 /*
 696                  * this page could not be removed from its unbuddied
 697                  * list while pool lock was held, and then we've taken
 698                  * page lock so kref_put could not be called before
 699                  * we got here, so it's safe to just call kref_get()
 700                  */
 701                 kref_get(&zhdr->refcount);
 702                 break;
 703         }
 704         put_cpu_ptr(pool->unbuddied);
 705
 706         if (!zhdr) {
 707                 int cpu;
 708
 709                 /* look for _exact_ match on other cpus' lists */
 710                 for_each_online_cpu(cpu) {
 711                         struct list_head *l;
 712
 713                         unbuddied = per_cpu_ptr(pool->unbuddied, cpu);
 714                         spin_lock(&pool->lock);
 715                         l = &unbuddied[chunks];
 716
 717                         zhdr = list_first_entry_or_null(READ_ONCE(l),
 718                                                 struct z3fold_header, buddy);
 719
 720                         if (!zhdr || !z3fold_page_trylock(zhdr)) {
 721                                 spin_unlock(&pool->lock);
 722                                 zhdr = NULL;
 723                                 continue;
 724                         }
 725                         list_del_init(&zhdr->buddy);
 726                         zhdr->cpu = -1;
 727                         spin_unlock(&pool->lock);
 728
 729                         page = virt_to_page(zhdr);
 730                         if (test_bit(NEEDS_COMPACTING, &page->private)) {
 731                                 z3fold_page_unlock(zhdr);
 732                                 zhdr = NULL;
 733                                 if (can_sleep)
 734                                         cond_resched();
 735                                 continue;
 736                         }
 737                         kref_get(&zhdr->refcount);
 738                         break;
 739                 }
 740         }
 741
 742         return zhdr;
 743 }
 744
 745 /*
 746  * API Functions
 747  */
 748
 749 /**
 750  * z3fold_create_pool() - create a new z3fold pool
 751  * @name:       pool name
 752  * @gfp:        gfp flags when allocating the z3fold pool structure
 753  * @ops:        user-defined operations for the z3fold pool
 754  *
 755  * Return: pointer to the new z3fold pool or NULL if the metadata allocation
 756  * failed.
 757  */
 758 static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
 759                 const struct z3fold_ops *ops)
 760 {
 761         struct z3fold_pool *pool = NULL;
 762         int i, cpu;
 763
 764         pool = kzalloc(sizeof(struct z3fold_pool), gfp);
 765         if (!pool)
 766                 goto out;
 767         pool->c_handle = kmem_cache_create("z3fold_handle",
 768                                 sizeof(struct z3fold_buddy_slots),
 769                                 SLOTS_ALIGN, 0, NULL);
 770         if (!pool->c_handle)
 771                 goto out_c;
 772         spin_lock_init(&pool->lock);
 773         spin_lock_init(&pool->stale_lock);
 774         pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2);
 775         if (!pool->unbuddied)
 776                 goto out_pool;
 777         for_each_possible_cpu(cpu) {
 778                 struct list_head *unbuddied =
 779                                 per_cpu_ptr(pool->unbuddied, cpu);
 780                 for_each_unbuddied_list(i, 0)
 781                         INIT_LIST_HEAD(&unbuddied[i]);
 782         }
 783         INIT_LIST_HEAD(&pool->lru);
 784         INIT_LIST_HEAD(&pool->stale);
 785         atomic64_set(&pool->pages_nr, 0);
 786         pool->name = name;
 787         pool->compact_wq = create_singlethread_workqueue(pool->name);
 788         if (!pool->compact_wq)
 789                 goto out_unbuddied;
 790         pool->release_wq = create_singlethread_workqueue(pool->name);
 791         if (!pool->release_wq)
 792                 goto out_wq;
 793         if (z3fold_register_migration(pool))
 794                 goto out_rwq;
 795         INIT_WORK(&pool->work, free_pages_work);
 796         pool->ops = ops;
 797         return pool;
 798
 799 out_rwq:
 800         destroy_workqueue(pool->release_wq);
 801 out_wq:
 802         destroy_workqueue(pool->compact_wq);
 803 out_unbuddied:
 804         free_percpu(pool->unbuddied);
 805 out_pool:
 806         kmem_cache_destroy(pool->c_handle);
 807 out_c:
 808         kfree(pool);
 809 out:
 810         return NULL;
 811 }
 812
 813 /**
 814  * z3fold_destroy_pool() - destroys an existing z3fold pool
 815  * @pool:       the z3fold pool to be destroyed
 816  *
 817  * The pool should be emptied before this function is called.
 818  */
 819 static void z3fold_destroy_pool(struct z3fold_pool *pool)
 820 {
 821         kmem_cache_destroy(pool->c_handle);
 822         z3fold_unregister_migration(pool);
 823         destroy_workqueue(pool->release_wq);
 824         destroy_workqueue(pool->compact_wq);
 825         kfree(pool);
 826 }
 827
 828 /**
 829  * z3fold_alloc() - allocates a region of a given size
 830  * @pool:       z3fold pool from which to allocate
 831  * @size:       size in bytes of the desired allocation
 832  * @gfp:        gfp flags used if the pool needs to grow
 833  * @handle:     handle of the new allocation
 834  *
 835  * This function will attempt to find a free region in the pool large enough to
 836  * satisfy the allocation request.  A search of the unbuddied lists is
 837  * performed first. If no suitable free region is found, then a new page is
 838  * allocated and added to the pool to satisfy the request.
 839  *
 840  * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
 841  * as z3fold pool pages.
 842  *
 843  * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
 844  * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
 845  * a new page.
 846  */
 847 static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
 848                         unsigned long *handle)
 849 {
 850         int chunks = size_to_chunks(size);
 851         struct z3fold_header *zhdr = NULL;
 852         struct page *page = NULL;
 853         enum buddy bud;
 854         bool can_sleep = gfpflags_allow_blocking(gfp);
 855
 856         if (!size || (gfp & __GFP_HIGHMEM))
 857                 return -EINVAL;
 858
 859         if (size > PAGE_SIZE)
 860                 return -ENOSPC;
 861
 862         if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
 863                 bud = HEADLESS;
 864         else {
 865 retry:
 866                 zhdr = __z3fold_alloc(pool, size, can_sleep);
 867                 if (zhdr) {
 868                         if (zhdr->first_chunks == 0) {
 869                                 if (zhdr->middle_chunks != 0 &&
 870                                     chunks >= zhdr->start_middle)
 871                                         bud = LAST;
 872                                 else
 873                                         bud = FIRST;
 874                         } else if (zhdr->last_chunks == 0)
 875                                 bud = LAST;
 876                         else if (zhdr->middle_chunks == 0)
 877                                 bud = MIDDLE;
 878                         else {
 879                                 if (kref_put(&zhdr->refcount,
 880                                              release_z3fold_page_locked))
 881                                         atomic64_dec(&pool->pages_nr);
 882                                 else
 883                                         z3fold_page_unlock(zhdr);
 884                                 pr_err("No free chunks in unbuddied\n");
 885                                 WARN_ON(1);
 886                                 goto retry;
 887                         }
 888                         page = virt_to_page(zhdr);
 889                         goto found;
 890                 }
 891                 bud = FIRST;
 892         }
 893
 894         page = NULL;
 895         if (can_sleep) {
 896                 spin_lock(&pool->stale_lock);
 897                 zhdr = list_first_entry_or_null(&pool->stale,
 898                                                 struct z3fold_header, buddy);
 899                 /*
 900                  * Before allocating a page, let's see if we can take one from
 901                  * the stale pages list. cancel_work_sync() can sleep so we
 902                  * limit this case to the contexts where we can sleep
 903                  */
 904                 if (zhdr) {
 905                         list_del(&zhdr->buddy);
 906                         spin_unlock(&pool->stale_lock);
 907                         cancel_work_sync(&zhdr->work);
 908                         page = virt_to_page(zhdr);
 909                 } else {
 910                         spin_unlock(&pool->stale_lock);
 911                 }
 912         }
 913         if (!page)
 914                 page = alloc_page(gfp);
 915
 916         if (!page)
 917                 return -ENOMEM;
 918
 919         zhdr = init_z3fold_page(page, pool, gfp);
 920         if (!zhdr) {
 921                 __free_page(page);
 922                 return -ENOMEM;
 923         }
 924         atomic64_inc(&pool->pages_nr);
 925
 926         if (bud == HEADLESS) {
 927                 set_bit(PAGE_HEADLESS, &page->private);
 928                 goto headless;
 929         }
 930         if (can_sleep) {
 931                 lock_page(page);
 932                 __SetPageMovable(page, pool->inode->i_mapping);
 933                 unlock_page(page);
 934         } else {
 935                 if (trylock_page(page)) {
 936                         __SetPageMovable(page, pool->inode->i_mapping);
 937                         unlock_page(page);
 938                 }
 939         }
 940         z3fold_page_lock(zhdr);
 941
 942 found:
 943         if (bud == FIRST)
 944                 zhdr->first_chunks = chunks;
 945         else if (bud == LAST)
 946                 zhdr->last_chunks = chunks;
 947         else {
 948                 zhdr->middle_chunks = chunks;
 949                 zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
 950         }
 951         add_to_unbuddied(pool, zhdr);
 952
 953 headless:
 954         spin_lock(&pool->lock);
 955         /* Add/move z3fold page to beginning of LRU */
 956         if (!list_empty(&page->lru))
 957                 list_del(&page->lru);
 958
 959         list_add(&page->lru, &pool->lru);
 960
 961         *handle = encode_handle(zhdr, bud);
 962         spin_unlock(&pool->lock);
 963         if (bud != HEADLESS)
 964                 z3fold_page_unlock(zhdr);
 965
 966         return 0;
 967 }
 968
 969 /**
 970  * z3fold_free() - frees the allocation associated with the given handle
 971  * @pool:       pool in which the allocation resided
 972  * @handle:     handle associated with the allocation returned by z3fold_alloc()
 973  *
 974  * In the case that the z3fold page in which the allocation resides is under
 975  * reclaim, as indicated by the PG_reclaim flag being set, this function
 976  * only sets the first|last_chunks to 0.  The page is actually freed
 977  * once both buddies are evicted (see z3fold_reclaim_page() below).
 978  */
 979 static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
 980 {
 981         struct z3fold_header *zhdr;
 982         struct page *page;
 983         enum buddy bud;
 984
 985         zhdr = handle_to_z3fold_header(handle);
 986         page = virt_to_page(zhdr);
 987
 988         if (test_bit(PAGE_HEADLESS, &page->private)) {
 989                 /* if a headless page is under reclaim, just leave.
 990                  * NB: we use test_and_set_bit for a reason: if the bit
 991                  * has not been set before, we release this page
 992                  * immediately so we don't care about its value any more.
 993                  */
 994                 if (!test_and_set_bit(PAGE_CLAIMED, &page->private)) {
 995                         spin_lock(&pool->lock);
 996                         list_del(&page->lru);
 997                         spin_unlock(&pool->lock);
 998                         free_z3fold_page(page, true);
 999                         atomic64_dec(&pool->pages_nr);
1000                 }
1001                 return;
1002         }
1003
1004         /* Non-headless case */
1005         z3fold_page_lock(zhdr);
1006         bud = handle_to_buddy(handle);
1007
1008         switch (bud) {
1009         case FIRST:
1010                 zhdr->first_chunks = 0;
1011                 break;
1012         case MIDDLE:
1013                 zhdr->middle_chunks = 0;
1014                 break;
1015         case LAST:
1016                 zhdr->last_chunks = 0;
1017                 break;
1018         default:
1019                 pr_err("%s: unknown bud %d\n", __func__, bud);
1020                 WARN_ON(1);
1021                 z3fold_page_unlock(zhdr);
1022                 return;
1023         }
1024
1025         free_handle(handle);
1026         if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) {
1027                 atomic64_dec(&pool->pages_nr);
1028                 return;
1029         }
1030         if (test_bit(PAGE_CLAIMED, &page->private)) {
1031                 z3fold_page_unlock(zhdr);
1032                 return;
1033         }
1034         if (unlikely(PageIsolated(page)) ||
1035             test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
1036                 z3fold_page_unlock(zhdr);
1037                 return;
1038         }
1039         if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {
1040                 spin_lock(&pool->lock);
1041                 list_del_init(&zhdr->buddy);
1042                 spin_unlock(&pool->lock);
1043                 zhdr->cpu = -1;
1044                 kref_get(&zhdr->refcount);
1045                 do_compact_page(zhdr, true);
1046                 return;
1047         }
1048         kref_get(&zhdr->refcount);
1049         queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
1050         z3fold_page_unlock(zhdr);
1051 }
1052
1053 /**
1054  * z3fold_reclaim_page() - evicts allocations from a pool page and frees it
1055  * @pool:       pool from which a page will attempt to be evicted
1056  * @retries:    number of pages on the LRU list for which eviction will
1057  *              be attempted before failing
1058  *
1059  * z3fold reclaim is different from normal system reclaim in that it is done
1060  * from the bottom, up. This is because only the bottom layer, z3fold, has
1061  * information on how the allocations are organized within each z3fold page.
1062  * This has the potential to create interesting locking situations between
1063  * z3fold and the user, however.
1064  *
1065  * To avoid these, this is how z3fold_reclaim_page() should be called:
1066  *
1067  * The user detects a page should be reclaimed and calls z3fold_reclaim_page().
1068  * z3fold_reclaim_page() will remove a z3fold page from the pool LRU list and
1069  * call the user-defined eviction handler with the pool and handle as
1070  * arguments.
1071  *
1072  * If the handle can not be evicted, the eviction handler should return
1073  * non-zero. z3fold_reclaim_page() will add the z3fold page back to the
1074  * appropriate list and try the next z3fold page on the LRU up to
1075  * a user defined number of retries.
1076  *
1077  * If the handle is successfully evicted, the eviction handler should
1078  * return 0 _and_ should have called z3fold_free() on the handle. z3fold_free()
1079  * contains logic to delay freeing the page if the page is under reclaim,
1080  * as indicated by the setting of the PG_reclaim flag on the underlying page.
1081  *
1082  * If all buddies in the z3fold page are successfully evicted, then the
1083  * z3fold page can be freed.
1084  *
1085  * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
1086  * no pages to evict or an eviction handler is not registered, -EAGAIN if
1087  * the retry limit was hit.
1088  */
1089 static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
1090 {
1091         int i, ret = 0;
1092         struct z3fold_header *zhdr = NULL;
1093         struct page *page = NULL;
1094         struct list_head *pos;
1095         unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;
1096
1097         spin_lock(&pool->lock);
1098         if (!pool->ops || !pool->ops->evict || retries == 0) {
1099                 spin_unlock(&pool->lock);
1100                 return -EINVAL;
1101         }
1102         for (i = 0; i < retries; i++) {
1103                 if (list_empty(&pool->lru)) {
1104                         spin_unlock(&pool->lock);
1105                         return -EINVAL;
1106                 }
1107                 list_for_each_prev(pos, &pool->lru) {
1108                         page = list_entry(pos, struct page, lru);
1109
1110                         /* this bit could have been set by free, in which case
1111                          * we pass over to the next page in the pool.
1112                          */
1113                         if (test_and_set_bit(PAGE_CLAIMED, &page->private))
1114                                 continue;
1115
1116                         if (unlikely(PageIsolated(page)))
1117                                 continue;
1118                         if (test_bit(PAGE_HEADLESS, &page->private))
1119                                 break;
1120
1121                         zhdr = page_address(page);
1122                         if (!z3fold_page_trylock(zhdr)) {
1123                                 zhdr = NULL;
1124                                 continue; /* can't evict at this point */
1125                         }
1126                         kref_get(&zhdr->refcount);
1127                         list_del_init(&zhdr->buddy);
1128                         zhdr->cpu = -1;
1129                         break;
1130                 }
1131
1132                 if (!zhdr)
1133                         break;
1134
1135                 list_del_init(&page->lru);
1136                 spin_unlock(&pool->lock);
1137
1138                 if (!test_bit(PAGE_HEADLESS, &page->private)) {
1139                         /*
1140                          * We need encode the handles before unlocking, since
1141                          * we can race with free that will set
1142                          * (first|last)_chunks to 0
1143                          */
1144                         first_handle = 0;
1145                         last_handle = 0;
1146                         middle_handle = 0;
1147                         if (zhdr->first_chunks)
1148                                 first_handle = encode_handle(zhdr, FIRST);
1149                         if (zhdr->middle_chunks)
1150                                 middle_handle = encode_handle(zhdr, MIDDLE);
1151                         if (zhdr->last_chunks)
1152                                 last_handle = encode_handle(zhdr, LAST);
1153                         /*
1154                          * it's safe to unlock here because we hold a
1155                          * reference to this page
1156                          */
1157                         z3fold_page_unlock(zhdr);
1158                 } else {
1159                         first_handle = encode_handle(zhdr, HEADLESS);
1160                         last_handle = middle_handle = 0;
1161                 }
1162
1163                 /* Issue the eviction callback(s) */
1164                 if (middle_handle) {
1165                         ret = pool->ops->evict(pool, middle_handle);
1166                         if (ret)
1167                                 goto next;
1168                 }
1169                 if (first_handle) {
1170                         ret = pool->ops->evict(pool, first_handle);
1171                         if (ret)
1172                                 goto next;
1173                 }
1174                 if (last_handle) {
1175                         ret = pool->ops->evict(pool, last_handle);
1176                         if (ret)
1177                                 goto next;
1178                 }
1179 next:
1180                 if (test_bit(PAGE_HEADLESS, &page->private)) {
1181                         if (ret == 0) {
1182                                 free_z3fold_page(page, true);
1183                                 atomic64_dec(&pool->pages_nr);
1184                                 return 0;
1185                         }
1186                         spin_lock(&pool->lock);
1187                         list_add(&page->lru, &pool->lru);
1188                         spin_unlock(&pool->lock);
1189                 } else {
1190                         z3fold_page_lock(zhdr);
1191                         clear_bit(PAGE_CLAIMED, &page->private);
1192                         if (kref_put(&zhdr->refcount,
1193                                         release_z3fold_page_locked)) {
1194                                 atomic64_dec(&pool->pages_nr);
1195                                 return 0;
1196                         }
1197                         /*
1198                          * if we are here, the page is still not completely
1199                          * free. Take the global pool lock then to be able
1200                          * to add it back to the lru list
1201                          */
1202                         spin_lock(&pool->lock);
1203                         list_add(&page->lru, &pool->lru);
1204                         spin_unlock(&pool->lock);
1205                         z3fold_page_unlock(zhdr);
1206                 }
1207
1208                 /* We started off locked to we need to lock the pool back */
1209                 spin_lock(&pool->lock);
1210         }
1211         spin_unlock(&pool->lock);
1212         return -EAGAIN;
1213 }
1214
1215 /**
1216  * z3fold_map() - maps the allocation associated with the given handle
1217  * @pool:       pool in which the allocation resides
1218  * @handle:     handle associated with the allocation to be mapped
1219  *
1220  * Extracts the buddy number from handle and constructs the pointer to the
1221  * correct starting chunk within the page.
1222  *
1223  * Returns: a pointer to the mapped allocation
1224  */
1225 static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
1226 {
1227         struct z3fold_header *zhdr;
1228         struct page *page;
1229         void *addr;
1230         enum buddy buddy;
1231
1232         zhdr = handle_to_z3fold_header(handle);
1233         addr = zhdr;
1234         page = virt_to_page(zhdr);
1235
1236         if (test_bit(PAGE_HEADLESS, &page->private))
1237                 goto out;
1238
1239         z3fold_page_lock(zhdr);
1240         buddy = handle_to_buddy(handle);
1241         switch (buddy) {
1242         case FIRST:
1243                 addr += ZHDR_SIZE_ALIGNED;
1244                 break;
1245         case MIDDLE:
1246                 addr += zhdr->start_middle << CHUNK_SHIFT;
1247                 set_bit(MIDDLE_CHUNK_MAPPED, &page->private);
1248                 break;
1249         case LAST:
1250                 addr += PAGE_SIZE - (handle_to_chunks(handle) << CHUNK_SHIFT);
1251                 break;
1252         default:
1253                 pr_err("unknown buddy id %d\n", buddy);
1254                 WARN_ON(1);
1255                 addr = NULL;
1256                 break;
1257         }
1258
1259         if (addr)
1260                 zhdr->mapped_count++;
1261         z3fold_page_unlock(zhdr);
1262 out:
1263         return addr;
1264 }
1265
1266 /**
1267  * z3fold_unmap() - unmaps the allocation associated with the given handle
1268  * @pool:       pool in which the allocation resides
1269  * @handle:     handle associated with the allocation to be unmapped
1270  */
1271 static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle)
1272 {
1273         struct z3fold_header *zhdr;
1274         struct page *page;
1275         enum buddy buddy;
1276
1277         zhdr = handle_to_z3fold_header(handle);
1278         page = virt_to_page(zhdr);
1279
1280         if (test_bit(PAGE_HEADLESS, &page->private))
1281                 return;
1282
1283         z3fold_page_lock(zhdr);
1284         buddy = handle_to_buddy(handle);
1285         if (buddy == MIDDLE)
1286                 clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
1287         zhdr->mapped_count--;
1288         z3fold_page_unlock(zhdr);
1289 }
1290
1291 /**
1292  * z3fold_get_pool_size() - gets the z3fold pool size in pages
1293  * @pool:       pool whose size is being queried
1294  *
1295  * Returns: size in pages of the given pool.
1296  */
1297 static u64 z3fold_get_pool_size(struct z3fold_pool *pool)
1298 {
1299         return atomic64_read(&pool->pages_nr);
1300 }
1301
1302 static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
1303 {
1304         struct z3fold_header *zhdr;
1305         struct z3fold_pool *pool;
1306
1307         VM_BUG_ON_PAGE(!PageMovable(page), page);
1308         VM_BUG_ON_PAGE(PageIsolated(page), page);
1309
1310         if (test_bit(PAGE_HEADLESS, &page->private))
1311                 return false;
1312
1313         zhdr = page_address(page);
1314         z3fold_page_lock(zhdr);
1315         if (test_bit(NEEDS_COMPACTING, &page->private) ||
1316             test_bit(PAGE_STALE, &page->private))
1317                 goto out;
1318
1319         pool = zhdr_to_pool(zhdr);
1320
1321         if (zhdr->mapped_count == 0) {
1322                 kref_get(&zhdr->refcount);
1323                 if (!list_empty(&zhdr->buddy))
1324                         list_del_init(&zhdr->buddy);
1325                 spin_lock(&pool->lock);
1326                 if (!list_empty(&page->lru))
1327                         list_del(&page->lru);
1328                 spin_unlock(&pool->lock);
1329                 z3fold_page_unlock(zhdr);
1330                 return true;
1331         }
1332 out:
1333         z3fold_page_unlock(zhdr);
1334         return false;
1335 }
1336
1337 static int z3fold_page_migrate(struct address_space *mapping, struct page *newpage,
1338                                struct page *page, enum migrate_mode mode)
1339 {
1340         struct z3fold_header *zhdr, *new_zhdr;
1341         struct z3fold_pool *pool;
1342         struct address_space *new_mapping;
1343
1344         VM_BUG_ON_PAGE(!PageMovable(page), page);
1345         VM_BUG_ON_PAGE(!PageIsolated(page), page);
1346         VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
1347
1348         zhdr = page_address(page);
1349         pool = zhdr_to_pool(zhdr);
1350
1351         if (!trylock_page(page))
1352                 return -EAGAIN;
1353
1354         if (!z3fold_page_trylock(zhdr)) {
1355                 unlock_page(page);
1356                 return -EAGAIN;
1357         }
1358         if (zhdr->mapped_count != 0) {
1359                 z3fold_page_unlock(zhdr);
1360                 unlock_page(page);
1361                 return -EBUSY;
1362         }
1363         new_zhdr = page_address(newpage);
1364         memcpy(new_zhdr, zhdr, PAGE_SIZE);
1365         newpage->private = page->private;
1366         page->private = 0;
1367         z3fold_page_unlock(zhdr);
1368         spin_lock_init(&new_zhdr->page_lock);
1369         new_mapping = page_mapping(page);
1370         __ClearPageMovable(page);
1371         ClearPagePrivate(page);
1372
1373         get_page(newpage);
1374         z3fold_page_lock(new_zhdr);
1375         if (new_zhdr->first_chunks)
1376                 encode_handle(new_zhdr, FIRST);
1377         if (new_zhdr->last_chunks)
1378                 encode_handle(new_zhdr, LAST);
1379         if (new_zhdr->middle_chunks)
1380                 encode_handle(new_zhdr, MIDDLE);
1381         set_bit(NEEDS_COMPACTING, &newpage->private);
1382         new_zhdr->cpu = smp_processor_id();
1383         spin_lock(&pool->lock);
1384         list_add(&newpage->lru, &pool->lru);
1385         spin_unlock(&pool->lock);
1386         __SetPageMovable(newpage, new_mapping);
1387         z3fold_page_unlock(new_zhdr);
1388
1389         queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
1390
1391         page_mapcount_reset(page);
1392         unlock_page(page);
1393         put_page(page);
1394         return 0;
1395 }
1396
1397 static void z3fold_page_putback(struct page *page)
1398 {
1399         struct z3fold_header *zhdr;
1400         struct z3fold_pool *pool;
1401
1402         zhdr = page_address(page);
1403         pool = zhdr_to_pool(zhdr);
1404
1405         z3fold_page_lock(zhdr);
1406         if (!list_empty(&zhdr->buddy))
1407                 list_del_init(&zhdr->buddy);
1408         INIT_LIST_HEAD(&page->lru);
1409         if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
1410                 atomic64_dec(&pool->pages_nr);
1411                 return;
1412         }
1413         spin_lock(&pool->lock);
1414         list_add(&page->lru, &pool->lru);
1415         spin_unlock(&pool->lock);
1416         z3fold_page_unlock(zhdr);
1417 }
1418
1419 static const struct address_space_operations z3fold_aops = {
1420         .isolate_page = z3fold_page_isolate,
1421         .migratepage = z3fold_page_migrate,
1422         .putback_page = z3fold_page_putback,
1423 };
1424
1425 /*****************
1426  * zpool
1427  ****************/
1428
1429 static int z3fold_zpool_evict(struct z3fold_pool *pool, unsigned long handle)
1430 {
1431         if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict)
1432                 return pool->zpool_ops->evict(pool->zpool, handle);
1433         else
1434                 return -ENOENT;
1435 }
1436
1437 static const struct z3fold_ops z3fold_zpool_ops = {
1438         .evict =        z3fold_zpool_evict
1439 };
1440
1441 static void *z3fold_zpool_create(const char *name, gfp_t gfp,
1442                                const struct zpool_ops *zpool_ops,
1443                                struct zpool *zpool)
1444 {
1445         struct z3fold_pool *pool;
1446
1447         pool = z3fold_create_pool(name, gfp,
1448                                 zpool_ops ? &z3fold_zpool_ops : NULL);
1449         if (pool) {
1450                 pool->zpool = zpool;
1451                 pool->zpool_ops = zpool_ops;
1452         }
1453         return pool;
1454 }
1455
1456 static void z3fold_zpool_destroy(void *pool)
1457 {
1458         z3fold_destroy_pool(pool);
1459 }
1460
1461 static int z3fold_zpool_malloc(void *pool, size_t size, gfp_t gfp,
1462                         unsigned long *handle)
1463 {
1464         return z3fold_alloc(pool, size, gfp, handle);
1465 }
1466 static void z3fold_zpool_free(void *pool, unsigned long handle)
1467 {
1468         z3fold_free(pool, handle);
1469 }
1470
1471 static int z3fold_zpool_shrink(void *pool, unsigned int pages,
1472                         unsigned int *reclaimed)
1473 {
1474         unsigned int total = 0;
1475         int ret = -EINVAL;
1476
1477         while (total < pages) {
1478                 ret = z3fold_reclaim_page(pool, 8);
1479                 if (ret < 0)
1480                         break;
1481                 total++;
1482         }
1483
1484         if (reclaimed)
1485                 *reclaimed = total;
1486
1487         return ret;
1488 }
1489
1490 static void *z3fold_zpool_map(void *pool, unsigned long handle,
1491                         enum zpool_mapmode mm)
1492 {
1493         return z3fold_map(pool, handle);
1494 }
1495 static void z3fold_zpool_unmap(void *pool, unsigned long handle)
1496 {
1497         z3fold_unmap(pool, handle);
1498 }
1499
1500 static u64 z3fold_zpool_total_size(void *pool)
1501 {
1502         return z3fold_get_pool_size(pool) * PAGE_SIZE;
1503 }
1504
1505 static struct zpool_driver z3fold_zpool_driver = {
1506         .type =         "z3fold",
1507         .owner =        THIS_MODULE,
1508         .create =       z3fold_zpool_create,
1509         .destroy =      z3fold_zpool_destroy,
1510         .malloc =       z3fold_zpool_malloc,
1511         .free =         z3fold_zpool_free,
1512         .shrink =       z3fold_zpool_shrink,
1513         .map =          z3fold_zpool_map,
1514         .unmap =        z3fold_zpool_unmap,
1515         .total_size =   z3fold_zpool_total_size,
1516 };
1517
1518 MODULE_ALIAS("zpool-z3fold");
1519
1520 static int __init init_z3fold(void)
1521 {
1522         int ret;
1523
1524         /* Make sure the z3fold header is not larger than the page size */
1525         BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE);
1526         ret = z3fold_mount();
1527         if (ret)
1528                 return ret;
1529
1530         zpool_register_driver(&z3fold_zpool_driver);
1531
1532         return 0;
1533 }
1534
1535 static void __exit exit_z3fold(void)
1536 {
1537         z3fold_unmount();
1538         zpool_unregister_driver(&z3fold_zpool_driver);
1539 }
1540
1541 module_init(init_z3fold);
1542 module_exit(exit_z3fold);
1543
1544 MODULE_LICENSE("GPL");
1545 MODULE_AUTHOR("Vitaly Wool <vitalywool@gmail.com>");
1546 MODULE_DESCRIPTION("3-Fold Allocator for Compressed Pages");