drivers/block/zram/zram_drv.c
1 /*
2  * Compressed RAM block device
3  *
4  * Copyright (C) 2008, 2009, 2010  Nitin Gupta
5  *               2012, 2013 Minchan Kim
6  *
7  * This code is released using a dual license strategy: BSD/GPL
9  * You can choose the license that best fits your requirements.
9  *
10  * Released under the terms of 3-clause BSD License
11  * Released under the terms of GNU General Public License Version 2.0
12  *
13  */
14
15 #define KMSG_COMPONENT "zram"
16 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
17
18 #include <linux/module.h>
19 #include <linux/kernel.h>
20 #include <linux/bio.h>
21 #include <linux/bitops.h>
22 #include <linux/blkdev.h>
23 #include <linux/buffer_head.h>
24 #include <linux/device.h>
25 #include <linux/genhd.h>
26 #include <linux/highmem.h>
27 #include <linux/slab.h>
28 #include <linux/backing-dev.h>
29 #include <linux/string.h>
30 #include <linux/vmalloc.h>
31 #include <linux/err.h>
32 #include <linux/idr.h>
33 #include <linux/sysfs.h>
34 #include <linux/cpuhotplug.h>
35
36 #include "zram_drv.h"
37
38 static DEFINE_IDR(zram_index_idr);
39 /* zram_index_idr is protected by zram_index_mutex */
40 static DEFINE_MUTEX(zram_index_mutex);
41
42 static int zram_major;
43 static const char *default_compressor = "lzo";
44
45 /* Module params (documentation at end) */
46 static unsigned int num_devices = 1;
47
48 static void zram_free_page(struct zram *zram, size_t index);
49
50 static inline bool init_done(struct zram *zram)
51 {
52         return zram->disksize;
53 }
54
55 static inline struct zram *dev_to_zram(struct device *dev)
56 {
57         return (struct zram *)dev_to_disk(dev)->private_data;
58 }
59
60 /* flag operations require the table entry's bit_spin_lock() to be held */
61 static int zram_test_flag(struct zram_meta *meta, u32 index,
62                         enum zram_pageflags flag)
63 {
64         return meta->table[index].value & BIT(flag);
65 }
66
67 static void zram_set_flag(struct zram_meta *meta, u32 index,
68                         enum zram_pageflags flag)
69 {
70         meta->table[index].value |= BIT(flag);
71 }
72
73 static void zram_clear_flag(struct zram_meta *meta, u32 index,
74                         enum zram_pageflags flag)
75 {
76         meta->table[index].value &= ~BIT(flag);
77 }
78
79 static inline void zram_set_element(struct zram_meta *meta, u32 index,
80                         unsigned long element)
81 {
82         meta->table[index].element = element;
83 }
84
85 static inline void zram_clear_element(struct zram_meta *meta, u32 index)
86 {
87         meta->table[index].element = 0;
88 }
89
90 static size_t zram_get_obj_size(struct zram_meta *meta, u32 index)
91 {
92         return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1);
93 }
94
95 static void zram_set_obj_size(struct zram_meta *meta,
96                                         u32 index, size_t size)
97 {
98         unsigned long flags = meta->table[index].value >> ZRAM_FLAG_SHIFT;
99
100         meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size;
101 }
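/*
 * Layout note (derived from the helpers above): the low ZRAM_FLAG_SHIFT
 * bits of table[index].value hold the compressed object size, while the
 * bits at and above ZRAM_FLAG_SHIFT carry the zram_pageflags bits that
 * zram_{test,set,clear}_flag() operate on.
 */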
102
103 #if PAGE_SIZE != 4096
104 static inline bool is_partial_io(struct bio_vec *bvec)
105 {
106         return bvec->bv_len != PAGE_SIZE;
107 }
108 #else
109 static inline bool is_partial_io(struct bio_vec *bvec)
110 {
111         return false;
112 }
113 #endif
114
115 static void zram_revalidate_disk(struct zram *zram)
116 {
117         revalidate_disk(zram->disk);
118         /* revalidate_disk() resets BDI_CAP_STABLE_WRITES, so set it again */
119         zram->disk->queue->backing_dev_info->capabilities |=
120                 BDI_CAP_STABLE_WRITES;
121 }
122
123 /*
124  * Check if request is within bounds and aligned on zram logical blocks.
125  */
126 static inline bool valid_io_request(struct zram *zram,
127                 sector_t start, unsigned int size)
128 {
129         u64 end, bound;
130
131         /* unaligned request */
132         if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1)))
133                 return false;
134         if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1)))
135                 return false;
136
137         end = start + (size >> SECTOR_SHIFT);
138         bound = zram->disksize >> SECTOR_SHIFT;
139         /* out of range */
140         if (unlikely(start >= bound || end > bound || start > end))
141                 return false;
142
143         /* I/O request is valid */
144         return true;
145 }
146
147 static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
148 {
149         *index  += (*offset + bvec->bv_len) / PAGE_SIZE;
150         *offset = (*offset + bvec->bv_len) % PAGE_SIZE;
151 }
152
153 static inline void update_used_max(struct zram *zram,
154                                         const unsigned long pages)
155 {
156         unsigned long old_max, cur_max;
157
158         old_max = atomic_long_read(&zram->stats.max_used_pages);
159
160         do {
161                 cur_max = old_max;
162                 if (pages > cur_max)
163                         old_max = atomic_long_cmpxchg(
164                                 &zram->stats.max_used_pages, cur_max, pages);
165         } while (old_max != cur_max);
166 }
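/*
 * Note: update_used_max() above is a lock-free "raise the high-water mark"
 * loop: atomic_long_cmpxchg() replaces max_used_pages only if it still
 * equals the value sampled earlier, and the loop retries when another CPU
 * updated the counter in the meantime.
 */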
167
168 static inline void zram_fill_page(char *ptr, unsigned long len,
169                                         unsigned long value)
170 {
171         int i;
172         unsigned long *page = (unsigned long *)ptr;
173
174         WARN_ON_ONCE(!IS_ALIGNED(len, sizeof(unsigned long)));
175
176         if (likely(value == 0)) {
177                 memset(ptr, 0, len);
178         } else {
179                 for (i = 0; i < len / sizeof(*page); i++)
180                         page[i] = value;
181         }
182 }
183
184 static bool page_same_filled(void *ptr, unsigned long *element)
185 {
186         unsigned int pos;
187         unsigned long *page;
188
189         page = (unsigned long *)ptr;
190
191         for (pos = 0; pos < PAGE_SIZE / sizeof(*page) - 1; pos++) {
192                 if (page[pos] != page[pos + 1])
193                         return false;
194         }
195
196         *element = page[pos];
197
198         return true;
199 }
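/*
 * page_same_filled() detects a page whose words all hold the same value by
 * comparing each word with its neighbour; the value is returned through
 * *element so that the page can be recorded as ZRAM_SAME metadata without
 * any zsmalloc allocation.
 */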
200
201 static ssize_t initstate_show(struct device *dev,
202                 struct device_attribute *attr, char *buf)
203 {
204         u32 val;
205         struct zram *zram = dev_to_zram(dev);
206
207         down_read(&zram->init_lock);
208         val = init_done(zram);
209         up_read(&zram->init_lock);
210
211         return scnprintf(buf, PAGE_SIZE, "%u\n", val);
212 }
213
214 static ssize_t disksize_show(struct device *dev,
215                 struct device_attribute *attr, char *buf)
216 {
217         struct zram *zram = dev_to_zram(dev);
218
219         return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize);
220 }
221
222 static ssize_t mem_limit_store(struct device *dev,
223                 struct device_attribute *attr, const char *buf, size_t len)
224 {
225         u64 limit;
226         char *tmp;
227         struct zram *zram = dev_to_zram(dev);
228
229         limit = memparse(buf, &tmp);
230         if (buf == tmp) /* no chars parsed, invalid input */
231                 return -EINVAL;
232
233         down_write(&zram->init_lock);
234         zram->limit_pages = PAGE_ALIGN(limit) >> PAGE_SHIFT;
235         up_write(&zram->init_lock);
236
237         return len;
238 }
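/*
 * Usage sketch (illustrative device name): "echo 256M > /sys/block/zram0/mem_limit"
 * caps the compressed memory usage, and writing 0 removes the limit again.
 */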
239
240 static ssize_t mem_used_max_store(struct device *dev,
241                 struct device_attribute *attr, const char *buf, size_t len)
242 {
243         int err;
244         unsigned long val;
245         struct zram *zram = dev_to_zram(dev);
246
247         err = kstrtoul(buf, 10, &val);
248         if (err || val != 0)
249                 return -EINVAL;
250
251         down_read(&zram->init_lock);
252         if (init_done(zram)) {
253                 struct zram_meta *meta = zram->meta;
254                 atomic_long_set(&zram->stats.max_used_pages,
255                                 zs_get_total_pages(meta->mem_pool));
256         }
257         up_read(&zram->init_lock);
258
259         return len;
260 }
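/*
 * Usage sketch (illustrative device name): "echo 0 > /sys/block/zram0/mem_used_max"
 * resets the recorded peak to the pool's current size; any other value is
 * rejected with -EINVAL.
 */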
261
262 /*
263  * We switched to per-cpu streams and this attr is not needed anymore.
264  * However, we will keep it around for some time, because:
265  * a) we may revert per-cpu streams in the future
266  * b) it's visible to user space and we need to follow our 2 years
267  *    retirement rule; but we already have a number of 'soon to be
268  *    altered' attrs, so max_comp_streams needs to wait for the next
269  *    layoff cycle.
270  */
271 static ssize_t max_comp_streams_show(struct device *dev,
272                 struct device_attribute *attr, char *buf)
273 {
274         return scnprintf(buf, PAGE_SIZE, "%d\n", num_online_cpus());
275 }
276
277 static ssize_t max_comp_streams_store(struct device *dev,
278                 struct device_attribute *attr, const char *buf, size_t len)
279 {
280         return len;
281 }
282
283 static ssize_t comp_algorithm_show(struct device *dev,
284                 struct device_attribute *attr, char *buf)
285 {
286         size_t sz;
287         struct zram *zram = dev_to_zram(dev);
288
289         down_read(&zram->init_lock);
290         sz = zcomp_available_show(zram->compressor, buf);
291         up_read(&zram->init_lock);
292
293         return sz;
294 }
295
296 static ssize_t comp_algorithm_store(struct device *dev,
297                 struct device_attribute *attr, const char *buf, size_t len)
298 {
299         struct zram *zram = dev_to_zram(dev);
300         char compressor[CRYPTO_MAX_ALG_NAME];
301         size_t sz;
302
303         strlcpy(compressor, buf, sizeof(compressor));
304         /* ignore trailing newline */
305         sz = strlen(compressor);
306         if (sz > 0 && compressor[sz - 1] == '\n')
307                 compressor[sz - 1] = 0x00;
308
309         if (!zcomp_available_algorithm(compressor))
310                 return -EINVAL;
311
312         down_write(&zram->init_lock);
313         if (init_done(zram)) {
314                 up_write(&zram->init_lock);
315                 pr_info("Can't change algorithm for initialized device\n");
316                 return -EBUSY;
317         }
318
319         strlcpy(zram->compressor, compressor, sizeof(zram->compressor));
320         up_write(&zram->init_lock);
321         return len;
322 }
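/*
 * Usage sketch (illustrative device name, assuming the lz4 backend is
 * available): "echo lz4 > /sys/block/zram0/comp_algorithm" selects the
 * compressor, but only before the device is initialized by a disksize
 * write; afterwards the store returns -EBUSY.
 */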
323
324 static ssize_t compact_store(struct device *dev,
325                 struct device_attribute *attr, const char *buf, size_t len)
326 {
327         struct zram *zram = dev_to_zram(dev);
328         struct zram_meta *meta;
329
330         down_read(&zram->init_lock);
331         if (!init_done(zram)) {
332                 up_read(&zram->init_lock);
333                 return -EINVAL;
334         }
335
336         meta = zram->meta;
337         zs_compact(meta->mem_pool);
338         up_read(&zram->init_lock);
339
340         return len;
341 }
342
343 static ssize_t io_stat_show(struct device *dev,
344                 struct device_attribute *attr, char *buf)
345 {
346         struct zram *zram = dev_to_zram(dev);
347         ssize_t ret;
348
349         down_read(&zram->init_lock);
350         ret = scnprintf(buf, PAGE_SIZE,
351                         "%8llu %8llu %8llu %8llu\n",
352                         (u64)atomic64_read(&zram->stats.failed_reads),
353                         (u64)atomic64_read(&zram->stats.failed_writes),
354                         (u64)atomic64_read(&zram->stats.invalid_io),
355                         (u64)atomic64_read(&zram->stats.notify_free));
356         up_read(&zram->init_lock);
357
358         return ret;
359 }
360
361 static ssize_t mm_stat_show(struct device *dev,
362                 struct device_attribute *attr, char *buf)
363 {
364         struct zram *zram = dev_to_zram(dev);
365         struct zs_pool_stats pool_stats;
366         u64 orig_size, mem_used = 0;
367         long max_used;
368         ssize_t ret;
369
370         memset(&pool_stats, 0x00, sizeof(struct zs_pool_stats));
371
372         down_read(&zram->init_lock);
373         if (init_done(zram)) {
374                 mem_used = zs_get_total_pages(zram->meta->mem_pool);
375                 zs_pool_stats(zram->meta->mem_pool, &pool_stats);
376         }
377
378         orig_size = atomic64_read(&zram->stats.pages_stored);
379         max_used = atomic_long_read(&zram->stats.max_used_pages);
380
381         ret = scnprintf(buf, PAGE_SIZE,
382                         "%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n",
383                         orig_size << PAGE_SHIFT,
384                         (u64)atomic64_read(&zram->stats.compr_data_size),
385                         mem_used << PAGE_SHIFT,
386                         zram->limit_pages << PAGE_SHIFT,
387                         max_used << PAGE_SHIFT,
388                         (u64)atomic64_read(&zram->stats.same_pages),
389                         pool_stats.pages_compacted);
390         up_read(&zram->init_lock);
391
392         return ret;
393 }
394
395 static ssize_t debug_stat_show(struct device *dev,
396                 struct device_attribute *attr, char *buf)
397 {
398         int version = 1;
399         struct zram *zram = dev_to_zram(dev);
400         ssize_t ret;
401
402         down_read(&zram->init_lock);
403         ret = scnprintf(buf, PAGE_SIZE,
404                         "version: %d\n%8llu\n",
405                         version,
406                         (u64)atomic64_read(&zram->stats.writestall));
407         up_read(&zram->init_lock);
408
409         return ret;
410 }
411
412 static DEVICE_ATTR_RO(io_stat);
413 static DEVICE_ATTR_RO(mm_stat);
414 static DEVICE_ATTR_RO(debug_stat);
415
416 static bool zram_same_page_read(struct zram *zram, u32 index,
417                                 struct page *page,
418                                 unsigned int offset, unsigned int len)
419 {
420         struct zram_meta *meta = zram->meta;
421
422         bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
423         if (unlikely(!meta->table[index].handle) ||
424                         zram_test_flag(meta, index, ZRAM_SAME)) {
425                 void *mem;
426
427                 bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
428                 mem = kmap_atomic(page);
429                 zram_fill_page(mem + offset, len, meta->table[index].element);
430                 kunmap_atomic(mem);
431                 return true;
432         }
433         bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
434
435         return false;
436 }
437
438 static bool zram_same_page_write(struct zram *zram, u32 index,
439                                         struct page *page)
440 {
441         unsigned long element;
442         void *mem = kmap_atomic(page);
443
444         if (page_same_filled(mem, &element)) {
445                 struct zram_meta *meta = zram->meta;
446
447                 kunmap_atomic(mem);
448                 /* Free memory associated with this sector now. */
449                 bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
450                 zram_free_page(zram, index);
451                 zram_set_flag(meta, index, ZRAM_SAME);
452                 zram_set_element(meta, index, element);
453                 bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
454
455                 atomic64_inc(&zram->stats.same_pages);
456                 return true;
457         }
458         kunmap_atomic(mem);
459
460         return false;
461 }
462
463 static void zram_meta_free(struct zram_meta *meta, u64 disksize)
464 {
465         size_t num_pages = disksize >> PAGE_SHIFT;
466         size_t index;
467
468         /* Free all pages that are still in this zram device */
469         for (index = 0; index < num_pages; index++) {
470                 unsigned long handle = meta->table[index].handle;
471                 /*
472                  * No memory is allocated for same-element-filled pages,
473                  * so there is nothing to free for them here.
474                  */
475                 if (!handle || zram_test_flag(meta, index, ZRAM_SAME))
476                         continue;
477
478                 zs_free(meta->mem_pool, handle);
479         }
480
481         zs_destroy_pool(meta->mem_pool);
482         vfree(meta->table);
483         kfree(meta);
484 }
485
486 static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
487 {
488         size_t num_pages;
489         struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL);
490
491         if (!meta)
492                 return NULL;
493
494         num_pages = disksize >> PAGE_SHIFT;
495         meta->table = vzalloc(num_pages * sizeof(*meta->table));
496         if (!meta->table) {
497                 pr_err("Error allocating zram address table\n");
498                 goto out_error;
499         }
500
501         meta->mem_pool = zs_create_pool(pool_name);
502         if (!meta->mem_pool) {
503                 pr_err("Error creating memory pool\n");
504                 goto out_error;
505         }
506
507         return meta;
508
509 out_error:
510         vfree(meta->table);
511         kfree(meta);
512         return NULL;
513 }
514
515 /*
516  * To protect concurrent access to the same index entry, the
517  * caller must hold this table entry's bit_spinlock while the
518  * entry is being accessed.
519  */
520 static void zram_free_page(struct zram *zram, size_t index)
521 {
522         struct zram_meta *meta = zram->meta;
523         unsigned long handle = meta->table[index].handle;
524
525         /*
526          * No memory is allocated for same element filled pages.
527          * Simply clear same page flag.
528          */
529         if (zram_test_flag(meta, index, ZRAM_SAME)) {
530                 zram_clear_flag(meta, index, ZRAM_SAME);
531                 zram_clear_element(meta, index);
532                 atomic64_dec(&zram->stats.same_pages);
533                 return;
534         }
535
536         if (!handle)
537                 return;
538
539         zs_free(meta->mem_pool, handle);
540
541         atomic64_sub(zram_get_obj_size(meta, index),
542                         &zram->stats.compr_data_size);
543         atomic64_dec(&zram->stats.pages_stored);
544
545         meta->table[index].handle = 0;
546         zram_set_obj_size(meta, index, 0);
547 }
548
549 static int zram_decompress_page(struct zram *zram, struct page *page, u32 index)
550 {
551         int ret;
552         unsigned long handle;
553         unsigned int size;
554         void *src, *dst;
555         struct zram_meta *meta = zram->meta;
556
557         if (zram_same_page_read(zram, index, page, 0, PAGE_SIZE))
558                 return 0;
559
560         bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
561         handle = meta->table[index].handle;
562         size = zram_get_obj_size(meta, index);
563
564         src = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
565         if (size == PAGE_SIZE) {
566                 dst = kmap_atomic(page);
567                 memcpy(dst, src, PAGE_SIZE);
568                 kunmap_atomic(dst);
569                 ret = 0;
570         } else {
571                 struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp);
572
573                 dst = kmap_atomic(page);
574                 ret = zcomp_decompress(zstrm, src, size, dst);
575                 kunmap_atomic(dst);
576                 zcomp_stream_put(zram->comp);
577         }
578         zs_unmap_object(meta->mem_pool, handle);
579         bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
580
581         /* Should NEVER happen. Return bio error if it does. */
582         if (unlikely(ret))
583                 pr_err("Decompression failed! err=%d, page=%u\n", ret, index);
584
585         return ret;
586 }
587
588 static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
589                                 u32 index, int offset)
590 {
591         int ret;
592         struct page *page;
593
594         page = bvec->bv_page;
595         if (is_partial_io(bvec)) {
596                 /* Use a temporary buffer to decompress the page */
597                 page = alloc_page(GFP_NOIO|__GFP_HIGHMEM);
598                 if (!page)
599                         return -ENOMEM;
600         }
601
602         ret = zram_decompress_page(zram, page, index);
603         if (unlikely(ret))
604                 goto out;
605
606         if (is_partial_io(bvec)) {
607                 void *dst = kmap_atomic(bvec->bv_page);
608                 void *src = kmap_atomic(page);
609
610                 memcpy(dst + bvec->bv_offset, src + offset, bvec->bv_len);
611                 kunmap_atomic(src);
612                 kunmap_atomic(dst);
613         }
614 out:
615         if (is_partial_io(bvec))
616                 __free_page(page);
617
618         return ret;
619 }
620
621 static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm,
622                         struct page *page,
623                         unsigned long *out_handle, unsigned int *out_comp_len)
624 {
625         int ret;
626         unsigned int comp_len;
627         void *src;
628         unsigned long alloced_pages;
629         unsigned long handle = 0;
630         struct zram_meta *meta = zram->meta;
631
632 compress_again:
633         src = kmap_atomic(page);
634         ret = zcomp_compress(*zstrm, src, &comp_len);
635         kunmap_atomic(src);
636
637         if (unlikely(ret)) {
638                 pr_err("Compression failed! err=%d\n", ret);
639                 if (handle)
640                         zs_free(meta->mem_pool, handle);
641                 return ret;
642         }
643
644         if (unlikely(comp_len > max_zpage_size))
645                 comp_len = PAGE_SIZE;
646
647         /*
648          * handle allocation has 2 paths:
649          * a) fast path is executed with preemption disabled (for
650          *  per-cpu streams) and has __GFP_DIRECT_RECLAIM bit clear,
651          *  since we can't sleep;
652          * b) slow path enables preemption and attempts to allocate
653          *  the page with __GFP_DIRECT_RECLAIM bit set. we have to
654          *  put per-cpu compression stream and, thus, to re-do
655          *  the compression once handle is allocated.
656          *
657          * if we have a 'non-null' handle here then we are coming
658          * from the slow path and handle has already been allocated.
659          */
660         if (!handle)
661                 handle = zs_malloc(meta->mem_pool, comp_len,
662                                 __GFP_KSWAPD_RECLAIM |
663                                 __GFP_NOWARN |
664                                 __GFP_HIGHMEM |
665                                 __GFP_MOVABLE);
666         if (!handle) {
667                 zcomp_stream_put(zram->comp);
668                 atomic64_inc(&zram->stats.writestall);
669                 handle = zs_malloc(meta->mem_pool, comp_len,
670                                 GFP_NOIO | __GFP_HIGHMEM |
671                                 __GFP_MOVABLE);
672                 *zstrm = zcomp_stream_get(zram->comp);
673                 if (handle)
674                         goto compress_again;
675                 return -ENOMEM;
676         }
677
678         alloced_pages = zs_get_total_pages(meta->mem_pool);
679         update_used_max(zram, alloced_pages);
680
681         if (zram->limit_pages && alloced_pages > zram->limit_pages) {
682                 zs_free(meta->mem_pool, handle);
683                 return -ENOMEM;
684         }
685
686         *out_handle = handle;
687         *out_comp_len = comp_len;
688         return 0;
689 }
690
691 static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
692 {
693         int ret;
694         unsigned long handle;
695         unsigned int comp_len;
696         void *src, *dst;
697         struct zcomp_strm *zstrm;
698         struct zram_meta *meta = zram->meta;
699         struct page *page = bvec->bv_page;
700
701         if (zram_same_page_write(zram, index, page))
702                 return 0;
703
704         zstrm = zcomp_stream_get(zram->comp);
705         ret = zram_compress(zram, &zstrm, page, &handle, &comp_len);
706         if (ret) {
707                 zcomp_stream_put(zram->comp);
708                 return ret;
709         }
710
711
712         dst = zs_map_object(meta->mem_pool, handle, ZS_MM_WO);
713
714         src = zstrm->buffer;
715         if (comp_len == PAGE_SIZE)
716                 src = kmap_atomic(page);
717         memcpy(dst, src, comp_len);
718         if (comp_len == PAGE_SIZE)
719                 kunmap_atomic(src);
720
721         zcomp_stream_put(zram->comp);
722         zs_unmap_object(meta->mem_pool, handle);
723
724         /*
725          * Free memory associated with this sector
726          * before overwriting unused sectors.
727          */
728         bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
729         zram_free_page(zram, index);
730         meta->table[index].handle = handle;
731         zram_set_obj_size(meta, index, comp_len);
732         bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
733
734         /* Update stats */
735         atomic64_add(comp_len, &zram->stats.compr_data_size);
736         atomic64_inc(&zram->stats.pages_stored);
737         return 0;
738 }
739
740 static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
741                                 u32 index, int offset)
742 {
743         int ret;
744         struct page *page = NULL;
745         void *src;
746         struct bio_vec vec;
747
748         vec = *bvec;
749         if (is_partial_io(bvec)) {
750                 void *dst;
751                 /*
752                  * This is a partial IO. We need to read the full page
754                  * before writing the changes.
754                  */
755                 page = alloc_page(GFP_NOIO|__GFP_HIGHMEM);
756                 if (!page)
757                         return -ENOMEM;
758
759                 ret = zram_decompress_page(zram, page, index);
760                 if (ret)
761                         goto out;
762
763                 src = kmap_atomic(bvec->bv_page);
764                 dst = kmap_atomic(page);
765                 memcpy(dst + offset, src + bvec->bv_offset, bvec->bv_len);
766                 kunmap_atomic(dst);
767                 kunmap_atomic(src);
768
769                 vec.bv_page = page;
770                 vec.bv_len = PAGE_SIZE;
771                 vec.bv_offset = 0;
772         }
773
774         ret = __zram_bvec_write(zram, &vec, index);
775 out:
776         if (is_partial_io(bvec))
777                 __free_page(page);
778         return ret;
779 }
780
781 /*
782  * zram_bio_discard - handler on discard request
783  * @index: physical block index in PAGE_SIZE units
784  * @offset: byte offset within physical block
785  */
786 static void zram_bio_discard(struct zram *zram, u32 index,
787                              int offset, struct bio *bio)
788 {
789         size_t n = bio->bi_iter.bi_size;
790         struct zram_meta *meta = zram->meta;
791
792         /*
793          * zram manages data in physical block size units. Because logical block
794          * size isn't identical to the physical block size on some architectures, we
795          * could get a discard request pointing to a specific offset within a
796          * certain physical block.  Although we can handle this request by
797          * reading that physical block and decompressing and partially zeroing
798          * and re-compressing and then re-storing it, this isn't reasonable
799          * because our intent with a discard request is to save memory.  So
800          * skipping this logical block is appropriate here.
801          */
802         if (offset) {
803                 if (n <= (PAGE_SIZE - offset))
804                         return;
805
806                 n -= (PAGE_SIZE - offset);
807                 index++;
808         }
809
810         while (n >= PAGE_SIZE) {
811                 bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
812                 zram_free_page(zram, index);
813                 bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
814                 atomic64_inc(&zram->stats.notify_free);
815                 index++;
816                 n -= PAGE_SIZE;
817         }
818 }
819
820 static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
821                         int offset, bool is_write)
822 {
823         unsigned long start_time = jiffies;
824         int rw_acct = is_write ? REQ_OP_WRITE : REQ_OP_READ;
825         int ret;
826
827         generic_start_io_acct(rw_acct, bvec->bv_len >> SECTOR_SHIFT,
828                         &zram->disk->part0);
829
830         if (!is_write) {
831                 atomic64_inc(&zram->stats.num_reads);
832                 ret = zram_bvec_read(zram, bvec, index, offset);
833                 flush_dcache_page(bvec->bv_page);
834         } else {
835                 atomic64_inc(&zram->stats.num_writes);
836                 ret = zram_bvec_write(zram, bvec, index, offset);
837         }
838
839         generic_end_io_acct(rw_acct, &zram->disk->part0, start_time);
840
841         if (unlikely(ret)) {
842                 if (!is_write)
843                         atomic64_inc(&zram->stats.failed_reads);
844                 else
845                         atomic64_inc(&zram->stats.failed_writes);
846         }
847
848         return ret;
849 }
850
851 static void __zram_make_request(struct zram *zram, struct bio *bio)
852 {
853         int offset;
854         u32 index;
855         struct bio_vec bvec;
856         struct bvec_iter iter;
857
858         index = bio->bi_iter.bi_sector >> SECTORS_PER_PAGE_SHIFT;
859         offset = (bio->bi_iter.bi_sector &
860                   (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;
861
862         switch (bio_op(bio)) {
863         case REQ_OP_DISCARD:
864         case REQ_OP_WRITE_ZEROES:
865                 zram_bio_discard(zram, index, offset, bio);
866                 bio_endio(bio);
867                 return;
868         default:
869                 break;
870         }
871
872         bio_for_each_segment(bvec, bio, iter) {
873                 struct bio_vec bv = bvec;
874                 unsigned int unwritten = bvec.bv_len;
875
876                 do {
877                         bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset,
878                                                         unwritten);
879                         if (zram_bvec_rw(zram, &bv, index, offset,
880                                         op_is_write(bio_op(bio))) < 0)
881                                 goto out;
882
883                         bv.bv_offset += bv.bv_len;
884                         unwritten -= bv.bv_len;
885
886                         update_position(&index, &offset, &bv);
887                 } while (unwritten);
888         }
889
890         bio_endio(bio);
891         return;
892
893 out:
894         bio_io_error(bio);
895 }
896
897 /*
898  * Handler function for all zram I/O requests.
899  */
900 static blk_qc_t zram_make_request(struct request_queue *queue, struct bio *bio)
901 {
902         struct zram *zram = queue->queuedata;
903
904         if (!valid_io_request(zram, bio->bi_iter.bi_sector,
905                                         bio->bi_iter.bi_size)) {
906                 atomic64_inc(&zram->stats.invalid_io);
907                 goto error;
908         }
909
910         __zram_make_request(zram, bio);
911         return BLK_QC_T_NONE;
912
913 error:
914         bio_io_error(bio);
915         return BLK_QC_T_NONE;
916 }
917
918 static void zram_slot_free_notify(struct block_device *bdev,
919                                 unsigned long index)
920 {
921         struct zram *zram;
922         struct zram_meta *meta;
923
924         zram = bdev->bd_disk->private_data;
925         meta = zram->meta;
926
927         bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
928         zram_free_page(zram, index);
929         bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
930         atomic64_inc(&zram->stats.notify_free);
931 }
932
933 static int zram_rw_page(struct block_device *bdev, sector_t sector,
934                        struct page *page, bool is_write)
935 {
936         int offset, err = -EIO;
937         u32 index;
938         struct zram *zram;
939         struct bio_vec bv;
940
941         zram = bdev->bd_disk->private_data;
942
943         if (!valid_io_request(zram, sector, PAGE_SIZE)) {
944                 atomic64_inc(&zram->stats.invalid_io);
945                 err = -EINVAL;
946                 goto out;
947         }
948
949         index = sector >> SECTORS_PER_PAGE_SHIFT;
950         offset = (sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;
951
952         bv.bv_page = page;
953         bv.bv_len = PAGE_SIZE;
954         bv.bv_offset = 0;
955
956         err = zram_bvec_rw(zram, &bv, index, offset, is_write);
957 out:
958         /*
959          * If the I/O fails, just return an error (i.e. non-zero) without
960          * calling page_endio.
961          * This makes the callers of rw_page (e.g. swap_readpage,
962          * __swap_writepage) resubmit the I/O as a bio request, and
963          * bio->bi_end_io then handles the error
964          * (e.g. SetPageError, set_page_dirty and other cleanup).
965          */
966         if (err == 0)
967                 page_endio(page, is_write, 0);
968         return err;
969 }
970
971 static void zram_reset_device(struct zram *zram)
972 {
973         struct zram_meta *meta;
974         struct zcomp *comp;
975         u64 disksize;
976
977         down_write(&zram->init_lock);
978
979         zram->limit_pages = 0;
980
981         if (!init_done(zram)) {
982                 up_write(&zram->init_lock);
983                 return;
984         }
985
986         meta = zram->meta;
987         comp = zram->comp;
988         disksize = zram->disksize;
989
990         /* Reset stats */
991         memset(&zram->stats, 0, sizeof(zram->stats));
992         zram->disksize = 0;
993
994         set_capacity(zram->disk, 0);
995         part_stat_set_all(&zram->disk->part0, 0);
996
997         up_write(&zram->init_lock);
998         /* I/O operations on all CPUs are done, so it is safe to free */
999         zram_meta_free(meta, disksize);
1000         zcomp_destroy(comp);
1001 }
1002
1003 static ssize_t disksize_store(struct device *dev,
1004                 struct device_attribute *attr, const char *buf, size_t len)
1005 {
1006         u64 disksize;
1007         struct zcomp *comp;
1008         struct zram_meta *meta;
1009         struct zram *zram = dev_to_zram(dev);
1010         int err;
1011
1012         disksize = memparse(buf, NULL);
1013         if (!disksize)
1014                 return -EINVAL;
1015
1016         disksize = PAGE_ALIGN(disksize);
1017         meta = zram_meta_alloc(zram->disk->disk_name, disksize);
1018         if (!meta)
1019                 return -ENOMEM;
1020
1021         comp = zcomp_create(zram->compressor);
1022         if (IS_ERR(comp)) {
1023                 pr_err("Cannot initialise %s compressing backend\n",
1024                                 zram->compressor);
1025                 err = PTR_ERR(comp);
1026                 goto out_free_meta;
1027         }
1028
1029         down_write(&zram->init_lock);
1030         if (init_done(zram)) {
1031                 pr_info("Cannot change disksize for initialized device\n");
1032                 err = -EBUSY;
1033                 goto out_destroy_comp;
1034         }
1035
1036         zram->meta = meta;
1037         zram->comp = comp;
1038         zram->disksize = disksize;
1039         set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
1040         zram_revalidate_disk(zram);
1041         up_write(&zram->init_lock);
1042
1043         return len;
1044
1045 out_destroy_comp:
1046         up_write(&zram->init_lock);
1047         zcomp_destroy(comp);
1048 out_free_meta:
1049         zram_meta_free(meta, disksize);
1050         return err;
1051 }
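/*
 * Usage sketch (illustrative device name): "echo 512M > /sys/block/zram0/disksize"
 * allocates the metadata and compression backend; the device can then be
 * formatted with mkfs or prepared as swap with mkswap/swapon.
 */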
1052
1053 static ssize_t reset_store(struct device *dev,
1054                 struct device_attribute *attr, const char *buf, size_t len)
1055 {
1056         int ret;
1057         unsigned short do_reset;
1058         struct zram *zram;
1059         struct block_device *bdev;
1060
1061         ret = kstrtou16(buf, 10, &do_reset);
1062         if (ret)
1063                 return ret;
1064
1065         if (!do_reset)
1066                 return -EINVAL;
1067
1068         zram = dev_to_zram(dev);
1069         bdev = bdget_disk(zram->disk, 0);
1070         if (!bdev)
1071                 return -ENOMEM;
1072
1073         mutex_lock(&bdev->bd_mutex);
1074         /* Do not reset an active device or claimed device */
1075         if (bdev->bd_openers || zram->claim) {
1076                 mutex_unlock(&bdev->bd_mutex);
1077                 bdput(bdev);
1078                 return -EBUSY;
1079         }
1080
1081         /* From now on, no one can open /dev/zram[0-9] */
1082         zram->claim = true;
1083         mutex_unlock(&bdev->bd_mutex);
1084
1085         /* Make sure all pending I/O is finished */
1086         fsync_bdev(bdev);
1087         zram_reset_device(zram);
1088         zram_revalidate_disk(zram);
1089         bdput(bdev);
1090
1091         mutex_lock(&bdev->bd_mutex);
1092         zram->claim = false;
1093         mutex_unlock(&bdev->bd_mutex);
1094
1095         return len;
1096 }
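/*
 * Usage sketch (illustrative device name): "echo 1 > /sys/block/zram0/reset"
 * frees all stored data and returns the device to the uninitialized state;
 * it fails with -EBUSY while the device is still open or claimed.
 */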
1097
1098 static int zram_open(struct block_device *bdev, fmode_t mode)
1099 {
1100         int ret = 0;
1101         struct zram *zram;
1102
1103         WARN_ON(!mutex_is_locked(&bdev->bd_mutex));
1104
1105         zram = bdev->bd_disk->private_data;
1106         /* zram was claimed for reset, so the open request fails */
1107         if (zram->claim)
1108                 ret = -EBUSY;
1109
1110         return ret;
1111 }
1112
1113 static const struct block_device_operations zram_devops = {
1114         .open = zram_open,
1115         .swap_slot_free_notify = zram_slot_free_notify,
1116         .rw_page = zram_rw_page,
1117         .owner = THIS_MODULE
1118 };
1119
1120 static DEVICE_ATTR_WO(compact);
1121 static DEVICE_ATTR_RW(disksize);
1122 static DEVICE_ATTR_RO(initstate);
1123 static DEVICE_ATTR_WO(reset);
1124 static DEVICE_ATTR_WO(mem_limit);
1125 static DEVICE_ATTR_WO(mem_used_max);
1126 static DEVICE_ATTR_RW(max_comp_streams);
1127 static DEVICE_ATTR_RW(comp_algorithm);
1128
1129 static struct attribute *zram_disk_attrs[] = {
1130         &dev_attr_disksize.attr,
1131         &dev_attr_initstate.attr,
1132         &dev_attr_reset.attr,
1133         &dev_attr_compact.attr,
1134         &dev_attr_mem_limit.attr,
1135         &dev_attr_mem_used_max.attr,
1136         &dev_attr_max_comp_streams.attr,
1137         &dev_attr_comp_algorithm.attr,
1138         &dev_attr_io_stat.attr,
1139         &dev_attr_mm_stat.attr,
1140         &dev_attr_debug_stat.attr,
1141         NULL,
1142 };
1143
1144 static struct attribute_group zram_disk_attr_group = {
1145         .attrs = zram_disk_attrs,
1146 };
1147
1148 /*
1149  * Allocate and initialize a new zram device. The function returns
1150  * a '>= 0' device_id upon success, and a negative value otherwise.
1151  */
1152 static int zram_add(void)
1153 {
1154         struct zram *zram;
1155         struct request_queue *queue;
1156         int ret, device_id;
1157
1158         zram = kzalloc(sizeof(struct zram), GFP_KERNEL);
1159         if (!zram)
1160                 return -ENOMEM;
1161
1162         ret = idr_alloc(&zram_index_idr, zram, 0, 0, GFP_KERNEL);
1163         if (ret < 0)
1164                 goto out_free_dev;
1165         device_id = ret;
1166
1167         init_rwsem(&zram->init_lock);
1168
1169         queue = blk_alloc_queue(GFP_KERNEL);
1170         if (!queue) {
1171                 pr_err("Error allocating disk queue for device %d\n",
1172                         device_id);
1173                 ret = -ENOMEM;
1174                 goto out_free_idr;
1175         }
1176
1177         blk_queue_make_request(queue, zram_make_request);
1178
1179         /* gendisk structure */
1180         zram->disk = alloc_disk(1);
1181         if (!zram->disk) {
1182                 pr_err("Error allocating disk structure for device %d\n",
1183                         device_id);
1184                 ret = -ENOMEM;
1185                 goto out_free_queue;
1186         }
1187
1188         zram->disk->major = zram_major;
1189         zram->disk->first_minor = device_id;
1190         zram->disk->fops = &zram_devops;
1191         zram->disk->queue = queue;
1192         zram->disk->queue->queuedata = zram;
1193         zram->disk->private_data = zram;
1194         snprintf(zram->disk->disk_name, 16, "zram%d", device_id);
1195
1196         /* Actual capacity set using sysfs (/sys/block/zram<id>/disksize) */
1197         set_capacity(zram->disk, 0);
1198         /* zram devices somewhat resemble non-rotational disks */
1199         queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue);
1200         queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue);
1201         /*
1202          * To ensure that we always get PAGE_SIZE-aligned
1203          * and n*PAGE_SIZE-sized I/O requests.
1204          */
1205         blk_queue_physical_block_size(zram->disk->queue, PAGE_SIZE);
1206         blk_queue_logical_block_size(zram->disk->queue,
1207                                         ZRAM_LOGICAL_BLOCK_SIZE);
1208         blk_queue_io_min(zram->disk->queue, PAGE_SIZE);
1209         blk_queue_io_opt(zram->disk->queue, PAGE_SIZE);
1210         zram->disk->queue->limits.discard_granularity = PAGE_SIZE;
1211         blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX);
1212         queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zram->disk->queue);
1213
1214         /*
1215          * zram_bio_discard() will clear all logical blocks if logical block
1216          * size is identical to the physical block size (PAGE_SIZE). But if it is
1217          * different, we will skip discarding some parts of logical blocks in
1218          * the part of the request range which isn't aligned to physical block
1219          * size.  So we can't ensure that all discarded logical blocks are
1220          * zeroed.
1221          */
1222         if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE)
1223                 blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX);
1224
1225         add_disk(zram->disk);
1226
1227         ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj,
1228                                 &zram_disk_attr_group);
1229         if (ret < 0) {
1230                 pr_err("Error creating sysfs group for device %d\n",
1231                                 device_id);
1232                 goto out_free_disk;
1233         }
1234         strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor));
1235         zram->meta = NULL;
1236
1237         pr_info("Added device: %s\n", zram->disk->disk_name);
1238         return device_id;
1239
1240 out_free_disk:
1241         del_gendisk(zram->disk);
1242         put_disk(zram->disk);
1243 out_free_queue:
1244         blk_cleanup_queue(queue);
1245 out_free_idr:
1246         idr_remove(&zram_index_idr, device_id);
1247 out_free_dev:
1248         kfree(zram);
1249         return ret;
1250 }
1251
1252 static int zram_remove(struct zram *zram)
1253 {
1254         struct block_device *bdev;
1255
1256         bdev = bdget_disk(zram->disk, 0);
1257         if (!bdev)
1258                 return -ENOMEM;
1259
1260         mutex_lock(&bdev->bd_mutex);
1261         if (bdev->bd_openers || zram->claim) {
1262                 mutex_unlock(&bdev->bd_mutex);
1263                 bdput(bdev);
1264                 return -EBUSY;
1265         }
1266
1267         zram->claim = true;
1268         mutex_unlock(&bdev->bd_mutex);
1269
1270         /*
1271          * Remove sysfs first, so no one will perform a disksize
1272          * store while we destroy the devices. This also helps during
1273          * hot_remove -- zram_reset_device() is the last holder of
1274          * ->init_lock, no later/concurrent disksize_store() or any
1275          * other sysfs handlers are possible.
1276          */
1277         sysfs_remove_group(&disk_to_dev(zram->disk)->kobj,
1278                         &zram_disk_attr_group);
1279
1280         /* Make sure all pending I/O is finished */
1281         fsync_bdev(bdev);
1282         zram_reset_device(zram);
1283         bdput(bdev);
1284
1285         pr_info("Removed device: %s\n", zram->disk->disk_name);
1286
1287         blk_cleanup_queue(zram->disk->queue);
1288         del_gendisk(zram->disk);
1289         put_disk(zram->disk);
1290         kfree(zram);
1291         return 0;
1292 }
1293
1294 /* zram-control sysfs attributes */
1295 static ssize_t hot_add_show(struct class *class,
1296                         struct class_attribute *attr,
1297                         char *buf)
1298 {
1299         int ret;
1300
1301         mutex_lock(&zram_index_mutex);
1302         ret = zram_add();
1303         mutex_unlock(&zram_index_mutex);
1304
1305         if (ret < 0)
1306                 return ret;
1307         return scnprintf(buf, PAGE_SIZE, "%d\n", ret);
1308 }
1309
1310 static ssize_t hot_remove_store(struct class *class,
1311                         struct class_attribute *attr,
1312                         const char *buf,
1313                         size_t count)
1314 {
1315         struct zram *zram;
1316         int ret, dev_id;
1317
1318         /* dev_id is gendisk->first_minor, which is `int' */
1319         ret = kstrtoint(buf, 10, &dev_id);
1320         if (ret)
1321                 return ret;
1322         if (dev_id < 0)
1323                 return -EINVAL;
1324
1325         mutex_lock(&zram_index_mutex);
1326
1327         zram = idr_find(&zram_index_idr, dev_id);
1328         if (zram) {
1329                 ret = zram_remove(zram);
1330                 if (!ret)
1331                         idr_remove(&zram_index_idr, dev_id);
1332         } else {
1333                 ret = -ENODEV;
1334         }
1335
1336         mutex_unlock(&zram_index_mutex);
1337         return ret ? ret : count;
1338 }
1339
1340 /*
1341  * NOTE: hot_add is not the usual read-only sysfs attribute, in the
1342  * sense that reading from this file does alter the state of your system -- it
1343  * creates a new un-initialized zram device and returns back this device's
1344  * device_id (or an error code if it fails to create a new device).
1345  */
1346 static struct class_attribute zram_control_class_attrs[] = {
1347         __ATTR(hot_add, 0400, hot_add_show, NULL),
1348         __ATTR_WO(hot_remove),
1349         __ATTR_NULL,
1350 };
1351
1352 static struct class zram_control_class = {
1353         .name           = "zram-control",
1354         .owner          = THIS_MODULE,
1355         .class_attrs    = zram_control_class_attrs,
1356 };
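/*
 * Usage sketch: "cat /sys/class/zram-control/hot_add" creates a new device
 * and prints its id, while "echo <id> > /sys/class/zram-control/hot_remove"
 * tears down an idle device via zram_remove().
 */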
1357
1358 static int zram_remove_cb(int id, void *ptr, void *data)
1359 {
1360         zram_remove(ptr);
1361         return 0;
1362 }
1363
1364 static void destroy_devices(void)
1365 {
1366         class_unregister(&zram_control_class);
1367         idr_for_each(&zram_index_idr, &zram_remove_cb, NULL);
1368         idr_destroy(&zram_index_idr);
1369         unregister_blkdev(zram_major, "zram");
1370         cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE);
1371 }
1372
1373 static int __init zram_init(void)
1374 {
1375         int ret;
1376
1377         ret = cpuhp_setup_state_multi(CPUHP_ZCOMP_PREPARE, "block/zram:prepare",
1378                                       zcomp_cpu_up_prepare, zcomp_cpu_dead);
1379         if (ret < 0)
1380                 return ret;
1381
1382         ret = class_register(&zram_control_class);
1383         if (ret) {
1384                 pr_err("Unable to register zram-control class\n");
1385                 cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE);
1386                 return ret;
1387         }
1388
1389         zram_major = register_blkdev(0, "zram");
1390         if (zram_major <= 0) {
1391                 pr_err("Unable to get major number\n");
1392                 class_unregister(&zram_control_class);
1393                 cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE);
1394                 return -EBUSY;
1395         }
1396
1397         while (num_devices != 0) {
1398                 mutex_lock(&zram_index_mutex);
1399                 ret = zram_add();
1400                 mutex_unlock(&zram_index_mutex);
1401                 if (ret < 0)
1402                         goto out_error;
1403                 num_devices--;
1404         }
1405
1406         return 0;
1407
1408 out_error:
1409         destroy_devices();
1410         return ret;
1411 }
1412
1413 static void __exit zram_exit(void)
1414 {
1415         destroy_devices();
1416 }
1417
1418 module_init(zram_init);
1419 module_exit(zram_exit);
1420
1421 module_param(num_devices, uint, 0);
1422 MODULE_PARM_DESC(num_devices, "Number of pre-created zram devices");
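/*
 * Usage sketch: "modprobe zram num_devices=4" pre-creates /dev/zram0 ..
 * /dev/zram3 at load time; further devices can be added later through the
 * zram-control class above.
 */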
1423
1424 MODULE_LICENSE("Dual BSD/GPL");
1425 MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
1426 MODULE_DESCRIPTION("Compressed RAM Block Device");