1 /*
2    md.c : Multiple Devices driver for Linux
3      Copyright (C) 1998, 1999, 2000 Ingo Molnar
4
5      completely rewritten, based on the MD driver code from Marc Zyngier
6
7    Changes:
8
9    - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
10    - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
11    - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
12    - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
13    - kmod support by: Cyrus Durgin
14    - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
15    - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
16
17    - lots of fixes and improvements to the RAID1/RAID5 and generic
18      RAID code (such as request based resynchronization):
19
20      Neil Brown <neilb@cse.unsw.edu.au>.
21
22    - persistent bitmap code
23      Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
24
25    This program is free software; you can redistribute it and/or modify
26    it under the terms of the GNU General Public License as published by
27    the Free Software Foundation; either version 2, or (at your option)
28    any later version.
29
30    You should have received a copy of the GNU General Public License
31    (for example /usr/src/linux/COPYING); if not, write to the Free
32    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
33
34    Errors, Warnings, etc.
35    Please use:
36      pr_crit() for error conditions that risk data loss
37      pr_err() for error conditions that are unexpected, like an IO error
38          or internal inconsistency
39      pr_warn() for error conditions that could have been predicted, like
40          adding a device to an array when it has incompatible metadata
41      pr_info() for interesting, very rare events, like an array starting
42          or stopping, or resync starting or stopping
43      pr_debug() for everything else.
44
45 */
46
47 #include <linux/sched/signal.h>
48 #include <linux/kthread.h>
49 #include <linux/blkdev.h>
50 #include <linux/badblocks.h>
51 #include <linux/sysctl.h>
52 #include <linux/seq_file.h>
53 #include <linux/fs.h>
54 #include <linux/poll.h>
55 #include <linux/ctype.h>
56 #include <linux/string.h>
57 #include <linux/hdreg.h>
58 #include <linux/proc_fs.h>
59 #include <linux/random.h>
60 #include <linux/module.h>
61 #include <linux/reboot.h>
62 #include <linux/file.h>
63 #include <linux/compat.h>
64 #include <linux/delay.h>
65 #include <linux/raid/md_p.h>
66 #include <linux/raid/md_u.h>
67 #include <linux/slab.h>
68 #include <trace/events/block.h>
69 #include "md.h"
70 #include "bitmap.h"
71 #include "md-cluster.h"
72
73 #ifndef MODULE
74 static void autostart_arrays(int part);
75 #endif
76
77 /* pers_list is a list of registered personalities protected
78  * by pers_lock.
79  * pers_lock also serves to protect accesses to
80  * mddev->thread when the mutex cannot be held.
81  */
82 static LIST_HEAD(pers_list);
83 static DEFINE_SPINLOCK(pers_lock);
84
85 struct md_cluster_operations *md_cluster_ops;
86 EXPORT_SYMBOL(md_cluster_ops);
87 struct module *md_cluster_mod;
88 EXPORT_SYMBOL(md_cluster_mod);
89
90 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
91 static struct workqueue_struct *md_wq;
92 static struct workqueue_struct *md_misc_wq;
93
94 static int remove_and_add_spares(struct mddev *mddev,
95                                  struct md_rdev *this);
96 static void mddev_detach(struct mddev *mddev);
97
98 /*
99  * Default number of read corrections we'll attempt on an rdev
100  * before ejecting it from the array. We divide the read error
101  * count by 2 for every hour elapsed between read errors.
102  */
103 #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
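
/*
 * For example, under the decay rule described above a recorded count of
 * 16 read errors drops to 8 after one error-free hour and to 4 after
 * two, so only devices that keep producing read errors ever reach the
 * default limit of 20 and get ejected.
 */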
104 /*
105  * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
106  * is 1000 KB/sec, so the extra system load does not show up that much.
107  * Increase it if you want to have more _guaranteed_ speed. Note that
108  * the RAID driver will use the maximum available bandwidth if the IO
109  * subsystem is idle. There is also an 'absolute maximum' reconstruction
110  * speed limit - in case reconstruction slows down your system despite
111  * idle IO detection.
112  *
113  * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
114  * or /sys/block/mdX/md/sync_speed_{min,max}
115  */
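
/*
 * Illustrative tuning example (a sketch only, using the paths named
 * above; md0 stands in for any array):
 *
 *   echo  50000 > /proc/sys/dev/raid/speed_limit_min
 *   echo 500000 > /sys/block/md0/md/sync_speed_max
 *
 * The /proc values are the global defaults below, in KB/sec; the
 * per-array sysfs files, when set, override them for that array (see
 * speed_min() and speed_max()).
 */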
116
117 static int sysctl_speed_limit_min = 1000;
118 static int sysctl_speed_limit_max = 200000;
119 static inline int speed_min(struct mddev *mddev)
120 {
121         return mddev->sync_speed_min ?
122                 mddev->sync_speed_min : sysctl_speed_limit_min;
123 }
124
125 static inline int speed_max(struct mddev *mddev)
126 {
127         return mddev->sync_speed_max ?
128                 mddev->sync_speed_max : sysctl_speed_limit_max;
129 }
130
131 static struct ctl_table_header *raid_table_header;
132
133 static struct ctl_table raid_table[] = {
134         {
135                 .procname       = "speed_limit_min",
136                 .data           = &sysctl_speed_limit_min,
137                 .maxlen         = sizeof(int),
138                 .mode           = S_IRUGO|S_IWUSR,
139                 .proc_handler   = proc_dointvec,
140         },
141         {
142                 .procname       = "speed_limit_max",
143                 .data           = &sysctl_speed_limit_max,
144                 .maxlen         = sizeof(int),
145                 .mode           = S_IRUGO|S_IWUSR,
146                 .proc_handler   = proc_dointvec,
147         },
148         { }
149 };
150
151 static struct ctl_table raid_dir_table[] = {
152         {
153                 .procname       = "raid",
154                 .maxlen         = 0,
155                 .mode           = S_IRUGO|S_IXUGO,
156                 .child          = raid_table,
157         },
158         { }
159 };
160
161 static struct ctl_table raid_root_table[] = {
162         {
163                 .procname       = "dev",
164                 .maxlen         = 0,
165                 .mode           = 0555,
166                 .child          = raid_dir_table,
167         },
168         {  }
169 };
170
171 static const struct block_device_operations md_fops;
172
173 static int start_readonly;
174
175 /* bio_alloc_mddev
176  * like bio_alloc, but uses the mddev's private bio set when one exists
177  */
178
179 struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
180                             struct mddev *mddev)
181 {
182         struct bio *b;
183
184         if (!mddev || !mddev->bio_set)
185                 return bio_alloc(gfp_mask, nr_iovecs);
186
187         b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set);
188         if (!b)
189                 return NULL;
190         return b;
191 }
192 EXPORT_SYMBOL_GPL(bio_alloc_mddev);
193
194 /*
195  * We have a system wide 'event count' that is incremented
196  * on any 'interesting' event, and readers of /proc/mdstat
197  * can use 'poll' or 'select' to find out when the event
198  * count increases.
199  *
200  * Events are:
201  *  start array, stop array, error, add device, remove device,
202  *  start build, activate spare
203  */
204 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
205 static atomic_t md_event_count;
206 void md_new_event(struct mddev *mddev)
207 {
208         atomic_inc(&md_event_count);
209         wake_up(&md_event_waiters);
210 }
211 EXPORT_SYMBOL_GPL(md_new_event);
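
/*
 * Userspace sketch (illustrative only, not part of this driver): a
 * monitor can read /proc/mdstat and then poll() the open fd; the
 * /proc/mdstat poll handler reports POLLPRI/POLLERR once md_new_event()
 * has bumped the event count:
 *
 *   int fd = open("/proc/mdstat", O_RDONLY);
 *   struct pollfd pfd = { .fd = fd, .events = POLLPRI };
 *   ... read(fd, ...) ...
 *   while (poll(&pfd, 1, -1) > 0) {
 *           lseek(fd, 0, SEEK_SET);
 *           ... re-read /proc/mdstat and react to the change ...
 *   }
 */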
212
213 /*
214  * Enables iteration over all existing md arrays;
215  * all_mddevs_lock protects this list.
216  */
217 static LIST_HEAD(all_mddevs);
218 static DEFINE_SPINLOCK(all_mddevs_lock);
219
220 /*
221  * iterates through all used mddevs in the system.
222  * We take care to grab the all_mddevs_lock whenever navigating
223  * the list, and to always hold a refcount when unlocked.
224  * Any code which breaks out of this loop while still owning
225  * a reference to the current mddev must mddev_put it.
226  */
227 #define for_each_mddev(_mddev,_tmp)                                     \
228                                                                         \
229         for (({ spin_lock(&all_mddevs_lock);                            \
230                 _tmp = all_mddevs.next;                                 \
231                 _mddev = NULL;});                                       \
232              ({ if (_tmp != &all_mddevs)                                \
233                         mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
234                 spin_unlock(&all_mddevs_lock);                          \
235                 if (_mddev) mddev_put(_mddev);                          \
236                 _mddev = list_entry(_tmp, struct mddev, all_mddevs);    \
237                 _tmp != &all_mddevs;});                                 \
238              ({ spin_lock(&all_mddevs_lock);                            \
239                 _tmp = _tmp->next;})                                    \
240                 )
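
/*
 * Typical use of the iterator above (a sketch; _tmp must be a plain
 * struct list_head pointer, as the macro walks all_mddevs directly):
 *
 *   struct mddev *mddev;
 *   struct list_head *tmp;
 *
 *   for_each_mddev(mddev, tmp) {
 *           ... a reference to mddev is held for the whole iteration ...
 *   }
 */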
241
242 /* Rather than calling directly into the personality make_request function,
243  * IO requests come here first so that we can check if the device is
244  * being suspended pending a reconfiguration.
245  * We hold a refcount over the call to ->make_request.  By the time that
246  * call has finished, the bio has been linked into some internal structure
247  * and so is visible to ->quiesce(), so we don't need the refcount any more.
248  */
249 static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
250 {
251         const int rw = bio_data_dir(bio);
252         struct mddev *mddev = q->queuedata;
253         unsigned int sectors;
254         int cpu;
255
256         blk_queue_split(q, &bio, q->bio_split);
257
258         if (mddev == NULL || mddev->pers == NULL) {
259                 bio_io_error(bio);
260                 return BLK_QC_T_NONE;
261         }
262         if (mddev->ro == 1 && unlikely(rw == WRITE)) {
263                 if (bio_sectors(bio) != 0)
264                         bio->bi_error = -EROFS;
265                 bio_endio(bio);
266                 return BLK_QC_T_NONE;
267         }
268         smp_rmb(); /* Ensure implications of 'active' are visible */
269         rcu_read_lock();
270         if (mddev->suspended) {
271                 DEFINE_WAIT(__wait);
272                 for (;;) {
273                         prepare_to_wait(&mddev->sb_wait, &__wait,
274                                         TASK_UNINTERRUPTIBLE);
275                         if (!mddev->suspended)
276                                 break;
277                         rcu_read_unlock();
278                         schedule();
279                         rcu_read_lock();
280                 }
281                 finish_wait(&mddev->sb_wait, &__wait);
282         }
283         atomic_inc(&mddev->active_io);
284         rcu_read_unlock();
285
286         /*
287          * save the sectors now since our bio can
288          * go away inside make_request
289          */
290         sectors = bio_sectors(bio);
291         /* bio could be mergeable after passing to underlayer */
292         bio->bi_opf &= ~REQ_NOMERGE;
293         mddev->pers->make_request(mddev, bio);
294
295         cpu = part_stat_lock();
296         part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
297         part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
298         part_stat_unlock();
299
300         if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
301                 wake_up(&mddev->sb_wait);
302
303         return BLK_QC_T_NONE;
304 }
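
/*
 * Note on the code above: while mddev->suspended is set, md_make_request()
 * parks the caller on sb_wait instead of passing the bio down, and
 * active_io counts the bios currently inside ->make_request(), which is
 * what mddev_suspend() below waits to reach zero.
 */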
305
306 /* mddev_suspend makes sure no new requests are submitted
307  * to the device, and that any requests that have been submitted
308  * are completely handled.
309  * Once mddev_detach() is called and completes, the module will be
310  * completely unused.
311  */
312 void mddev_suspend(struct mddev *mddev)
313 {
314         WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk);
315         if (mddev->suspended++)
316                 return;
317         synchronize_rcu();
318         wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
319         mddev->pers->quiesce(mddev, 1);
320
321         del_timer_sync(&mddev->safemode_timer);
322 }
323 EXPORT_SYMBOL_GPL(mddev_suspend);
324
325 void mddev_resume(struct mddev *mddev)
326 {
327         if (--mddev->suspended)
328                 return;
329         wake_up(&mddev->sb_wait);
330         mddev->pers->quiesce(mddev, 0);
331
332         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
333         md_wakeup_thread(mddev->thread);
334         md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
335 }
336 EXPORT_SYMBOL_GPL(mddev_resume);
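
/*
 * mddev->suspended acts as a nesting counter for the two functions above:
 * only the first mddev_suspend() quiesces the personality and only the
 * matching final mddev_resume() restarts it, so the calls must always be
 * paired.
 */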
337
338 int mddev_congested(struct mddev *mddev, int bits)
339 {
340         struct md_personality *pers = mddev->pers;
341         int ret = 0;
342
343         rcu_read_lock();
344         if (mddev->suspended)
345                 ret = 1;
346         else if (pers && pers->congested)
347                 ret = pers->congested(mddev, bits);
348         rcu_read_unlock();
349         return ret;
350 }
351 EXPORT_SYMBOL_GPL(mddev_congested);
352 static int md_congested(void *data, int bits)
353 {
354         struct mddev *mddev = data;
355         return mddev_congested(mddev, bits);
356 }
357
358 /*
359  * Generic flush handling for md
360  */
361
362 static void md_end_flush(struct bio *bio)
363 {
364         struct md_rdev *rdev = bio->bi_private;
365         struct mddev *mddev = rdev->mddev;
366
367         rdev_dec_pending(rdev, mddev);
368
369         if (atomic_dec_and_test(&mddev->flush_pending)) {
370                 /* The pre-request flush has finished */
371                 queue_work(md_wq, &mddev->flush_work);
372         }
373         bio_put(bio);
374 }
375
376 static void md_submit_flush_data(struct work_struct *ws);
377
378 static void submit_flushes(struct work_struct *ws)
379 {
380         struct mddev *mddev = container_of(ws, struct mddev, flush_work);
381         struct md_rdev *rdev;
382
383         INIT_WORK(&mddev->flush_work, md_submit_flush_data);
384         atomic_set(&mddev->flush_pending, 1);
385         rcu_read_lock();
386         rdev_for_each_rcu(rdev, mddev)
387                 if (rdev->raid_disk >= 0 &&
388                     !test_bit(Faulty, &rdev->flags)) {
389                         /* Take two references, one is dropped
390                          * when the request finishes, one after
391                          * we re-acquire the rcu_read_lock
392                          */
393                         struct bio *bi;
394                         atomic_inc(&rdev->nr_pending);
395                         atomic_inc(&rdev->nr_pending);
396                         rcu_read_unlock();
397                         bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
398                         bi->bi_end_io = md_end_flush;
399                         bi->bi_private = rdev;
400                         bi->bi_bdev = rdev->bdev;
401                         bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
402                         atomic_inc(&mddev->flush_pending);
403                         submit_bio(bi);
404                         rcu_read_lock();
405                         rdev_dec_pending(rdev, mddev);
406                 }
407         rcu_read_unlock();
408         if (atomic_dec_and_test(&mddev->flush_pending))
409                 queue_work(md_wq, &mddev->flush_work);
410 }
411
412 static void md_submit_flush_data(struct work_struct *ws)
413 {
414         struct mddev *mddev = container_of(ws, struct mddev, flush_work);
415         struct bio *bio = mddev->flush_bio;
416
417         if (bio->bi_iter.bi_size == 0)
418                 /* an empty barrier - all done */
419                 bio_endio(bio);
420         else {
421                 bio->bi_opf &= ~REQ_PREFLUSH;
422                 mddev->pers->make_request(mddev, bio);
423         }
424
425         mddev->flush_bio = NULL;
426         wake_up(&mddev->sb_wait);
427 }
428
429 void md_flush_request(struct mddev *mddev, struct bio *bio)
430 {
431         spin_lock_irq(&mddev->lock);
432         wait_event_lock_irq(mddev->sb_wait,
433                             !mddev->flush_bio,
434                             mddev->lock);
435         mddev->flush_bio = bio;
436         spin_unlock_irq(&mddev->lock);
437
438         INIT_WORK(&mddev->flush_work, submit_flushes);
439         queue_work(md_wq, &mddev->flush_work);
440 }
441 EXPORT_SYMBOL(md_flush_request);
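
/*
 * Flush handling summary for the functions above: md_flush_request()
 * parks the incoming bio in mddev->flush_bio and queues submit_flushes(),
 * which sends an empty REQ_PREFLUSH bio to every active, non-Faulty rdev.
 * When the last of those completes (md_end_flush), md_submit_flush_data()
 * runs: an empty barrier bio is completed directly, otherwise the data
 * portion is resubmitted with REQ_PREFLUSH cleared.
 */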
442
443 static inline struct mddev *mddev_get(struct mddev *mddev)
444 {
445         atomic_inc(&mddev->active);
446         return mddev;
447 }
448
449 static void mddev_delayed_delete(struct work_struct *ws);
450
451 static void mddev_put(struct mddev *mddev)
452 {
453         struct bio_set *bs = NULL;
454
455         if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
456                 return;
457         if (!mddev->raid_disks && list_empty(&mddev->disks) &&
458             mddev->ctime == 0 && !mddev->hold_active) {
459                 /* Array is not configured at all, and not held active,
460                  * so destroy it */
461                 list_del_init(&mddev->all_mddevs);
462                 bs = mddev->bio_set;
463                 mddev->bio_set = NULL;
464                 if (mddev->gendisk) {
465                         /* We did a probe, so we need to clean up.  Call
466                          * queue_work inside the spinlock so that
467                          * flush_workqueue() after mddev_find will
468                          * succeed in waiting for the work to be done.
469                          */
470                         INIT_WORK(&mddev->del_work, mddev_delayed_delete);
471                         queue_work(md_misc_wq, &mddev->del_work);
472                 } else
473                         kfree(mddev);
474         }
475         spin_unlock(&all_mddevs_lock);
476         if (bs)
477                 bioset_free(bs);
478 }
479
480 static void md_safemode_timeout(unsigned long data);
481
482 void mddev_init(struct mddev *mddev)
483 {
484         mutex_init(&mddev->open_mutex);
485         mutex_init(&mddev->reconfig_mutex);
486         mutex_init(&mddev->bitmap_info.mutex);
487         INIT_LIST_HEAD(&mddev->disks);
488         INIT_LIST_HEAD(&mddev->all_mddevs);
489         setup_timer(&mddev->safemode_timer, md_safemode_timeout,
490                     (unsigned long) mddev);
491         atomic_set(&mddev->active, 1);
492         atomic_set(&mddev->openers, 0);
493         atomic_set(&mddev->active_io, 0);
494         spin_lock_init(&mddev->lock);
495         atomic_set(&mddev->flush_pending, 0);
496         init_waitqueue_head(&mddev->sb_wait);
497         init_waitqueue_head(&mddev->recovery_wait);
498         mddev->reshape_position = MaxSector;
499         mddev->reshape_backwards = 0;
500         mddev->last_sync_action = "none";
501         mddev->resync_min = 0;
502         mddev->resync_max = MaxSector;
503         mddev->level = LEVEL_NONE;
504 }
505 EXPORT_SYMBOL_GPL(mddev_init);
506
507 static struct mddev *mddev_find(dev_t unit)
508 {
509         struct mddev *mddev, *new = NULL;
510
511         if (unit && MAJOR(unit) != MD_MAJOR)
512                 unit &= ~((1<<MdpMinorShift)-1);
513
514  retry:
515         spin_lock(&all_mddevs_lock);
516
517         if (unit) {
518                 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
519                         if (mddev->unit == unit) {
520                                 mddev_get(mddev);
521                                 spin_unlock(&all_mddevs_lock);
522                                 kfree(new);
523                                 return mddev;
524                         }
525
526                 if (new) {
527                         list_add(&new->all_mddevs, &all_mddevs);
528                         spin_unlock(&all_mddevs_lock);
529                         new->hold_active = UNTIL_IOCTL;
530                         return new;
531                 }
532         } else if (new) {
533                 /* find an unused unit number */
534                 static int next_minor = 512;
535                 int start = next_minor;
536                 int is_free = 0;
537                 int dev = 0;
538                 while (!is_free) {
539                         dev = MKDEV(MD_MAJOR, next_minor);
540                         next_minor++;
541                         if (next_minor > MINORMASK)
542                                 next_minor = 0;
543                         if (next_minor == start) {
544                                 /* Oh dear, all in use. */
545                                 spin_unlock(&all_mddevs_lock);
546                                 kfree(new);
547                                 return NULL;
548                         }
549
550                         is_free = 1;
551                         list_for_each_entry(mddev, &all_mddevs, all_mddevs)
552                                 if (mddev->unit == dev) {
553                                         is_free = 0;
554                                         break;
555                                 }
556                 }
557                 new->unit = dev;
558                 new->md_minor = MINOR(dev);
559                 new->hold_active = UNTIL_STOP;
560                 list_add(&new->all_mddevs, &all_mddevs);
561                 spin_unlock(&all_mddevs_lock);
562                 return new;
563         }
564         spin_unlock(&all_mddevs_lock);
565
566         new = kzalloc(sizeof(*new), GFP_KERNEL);
567         if (!new)
568                 return NULL;
569
570         new->unit = unit;
571         if (MAJOR(unit) == MD_MAJOR)
572                 new->md_minor = MINOR(unit);
573         else
574                 new->md_minor = MINOR(unit) >> MdpMinorShift;
575
576         mddev_init(new);
577
578         goto retry;
579 }
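
/*
 * mddev_find() behaviour, for reference: with a non-zero unit it returns
 * the existing mddev for that device number (taking a reference) or
 * allocates a new one held active UNTIL_IOCTL; with unit == 0 it scans
 * for an unused md minor (starting from 512) and holds the new mddev
 * UNTIL_STOP.
 */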
580
581 static struct attribute_group md_redundancy_group;
582
583 void mddev_unlock(struct mddev *mddev)
584 {
585         if (mddev->to_remove) {
586                 /* These cannot be removed under reconfig_mutex as
587                  * an access to the files will try to take reconfig_mutex
588                  * while holding the file unremovable, which leads to
589                  * a deadlock.
590                  * So we set sysfs_active while the removal is happening,
591                  * and anything else which might set ->to_remove or may
592                  * otherwise change the sysfs namespace will fail with
593                  * -EBUSY if sysfs_active is still set.
594                  * We set sysfs_active under reconfig_mutex and elsewhere
595                  * test it under the same mutex to ensure its correct value
596                  * is seen.
597                  */
598                 struct attribute_group *to_remove = mddev->to_remove;
599                 mddev->to_remove = NULL;
600                 mddev->sysfs_active = 1;
601                 mutex_unlock(&mddev->reconfig_mutex);
602
603                 if (mddev->kobj.sd) {
604                         if (to_remove != &md_redundancy_group)
605                                 sysfs_remove_group(&mddev->kobj, to_remove);
606                         if (mddev->pers == NULL ||
607                             mddev->pers->sync_request == NULL) {
608                                 sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
609                                 if (mddev->sysfs_action)
610                                         sysfs_put(mddev->sysfs_action);
611                                 mddev->sysfs_action = NULL;
612                         }
613                 }
614                 mddev->sysfs_active = 0;
615         } else
616                 mutex_unlock(&mddev->reconfig_mutex);
617
618         /* As we've dropped the mutex we need a spinlock to
619          * make sure the thread doesn't disappear
620          */
621         spin_lock(&pers_lock);
622         md_wakeup_thread(mddev->thread);
623         spin_unlock(&pers_lock);
624 }
625 EXPORT_SYMBOL_GPL(mddev_unlock);
626
627 struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
628 {
629         struct md_rdev *rdev;
630
631         rdev_for_each_rcu(rdev, mddev)
632                 if (rdev->desc_nr == nr)
633                         return rdev;
634
635         return NULL;
636 }
637 EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);
638
639 static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
640 {
641         struct md_rdev *rdev;
642
643         rdev_for_each(rdev, mddev)
644                 if (rdev->bdev->bd_dev == dev)
645                         return rdev;
646
647         return NULL;
648 }
649
650 static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev)
651 {
652         struct md_rdev *rdev;
653
654         rdev_for_each_rcu(rdev, mddev)
655                 if (rdev->bdev->bd_dev == dev)
656                         return rdev;
657
658         return NULL;
659 }
660
661 static struct md_personality *find_pers(int level, char *clevel)
662 {
663         struct md_personality *pers;
664         list_for_each_entry(pers, &pers_list, list) {
665                 if (level != LEVEL_NONE && pers->level == level)
666                         return pers;
667                 if (strcmp(pers->name, clevel)==0)
668                         return pers;
669         }
670         return NULL;
671 }
672
673 /* return the offset of the superblock in 512-byte sectors */
674 static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
675 {
676         sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
677         return MD_NEW_SIZE_SECTORS(num_sectors);
678 }
679
680 static int alloc_disk_sb(struct md_rdev *rdev)
681 {
682         rdev->sb_page = alloc_page(GFP_KERNEL);
683         if (!rdev->sb_page)
684                 return -ENOMEM;
685         return 0;
686 }
687
688 void md_rdev_clear(struct md_rdev *rdev)
689 {
690         if (rdev->sb_page) {
691                 put_page(rdev->sb_page);
692                 rdev->sb_loaded = 0;
693                 rdev->sb_page = NULL;
694                 rdev->sb_start = 0;
695                 rdev->sectors = 0;
696         }
697         if (rdev->bb_page) {
698                 put_page(rdev->bb_page);
699                 rdev->bb_page = NULL;
700         }
701         badblocks_exit(&rdev->badblocks);
702 }
703 EXPORT_SYMBOL_GPL(md_rdev_clear);
704
705 static void super_written(struct bio *bio)
706 {
707         struct md_rdev *rdev = bio->bi_private;
708         struct mddev *mddev = rdev->mddev;
709
710         if (bio->bi_error) {
711                 pr_err("md: super_written gets error=%d\n", bio->bi_error);
712                 md_error(mddev, rdev);
713                 if (!test_bit(Faulty, &rdev->flags)
714                     && (bio->bi_opf & MD_FAILFAST)) {
715                         set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
716                         set_bit(LastDev, &rdev->flags);
717                 }
718         } else
719                 clear_bit(LastDev, &rdev->flags);
720
721         if (atomic_dec_and_test(&mddev->pending_writes))
722                 wake_up(&mddev->sb_wait);
723         rdev_dec_pending(rdev, mddev);
724         bio_put(bio);
725 }
726
727 void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
728                    sector_t sector, int size, struct page *page)
729 {
730         /* write first size bytes of page to sector of rdev
731          * Increment mddev->pending_writes before returning
732          * and decrement it on completion, waking up sb_wait
733          * if zero is reached.
734          * If an error occurred, call md_error
735          */
736         struct bio *bio;
737         int ff = 0;
738
739         if (test_bit(Faulty, &rdev->flags))
740                 return;
741
742         bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
743
744         atomic_inc(&rdev->nr_pending);
745
746         bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
747         bio->bi_iter.bi_sector = sector;
748         bio_add_page(bio, page, size, 0);
749         bio->bi_private = rdev;
750         bio->bi_end_io = super_written;
751
752         if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
753             test_bit(FailFast, &rdev->flags) &&
754             !test_bit(LastDev, &rdev->flags))
755                 ff = MD_FAILFAST;
756         bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA | ff;
757
758         atomic_inc(&mddev->pending_writes);
759         submit_bio(bio);
760 }
761
762 int md_super_wait(struct mddev *mddev)
763 {
764         /* wait for all superblock writes that were scheduled to complete */
765         wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
766         if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags))
767                 return -EAGAIN;
768         return 0;
769 }
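
/*
 * md_super_write() is asynchronous; callers queue writes to every device
 * and then call md_super_wait(), which sleeps until pending_writes drops
 * to zero and returns -EAGAIN if a failfast write failed and the
 * superblock needs to be rewritten (MD_SB_NEED_REWRITE set above).
 */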
770
771 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
772                  struct page *page, int op, int op_flags, bool metadata_op)
773 {
774         struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
775         int ret;
776
777         bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
778                 rdev->meta_bdev : rdev->bdev;
779         bio_set_op_attrs(bio, op, op_flags);
780         if (metadata_op)
781                 bio->bi_iter.bi_sector = sector + rdev->sb_start;
782         else if (rdev->mddev->reshape_position != MaxSector &&
783                  (rdev->mddev->reshape_backwards ==
784                   (sector >= rdev->mddev->reshape_position)))
785                 bio->bi_iter.bi_sector = sector + rdev->new_data_offset;
786         else
787                 bio->bi_iter.bi_sector = sector + rdev->data_offset;
788         bio_add_page(bio, page, size, 0);
789
790         submit_bio_wait(bio);
791
792         ret = !bio->bi_error;
793         bio_put(bio);
794         return ret;
795 }
796 EXPORT_SYMBOL_GPL(sync_page_io);
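
/*
 * sync_page_io() above is a synchronous single-page transfer: `sector' is
 * relative to the metadata area when metadata_op is true, otherwise to
 * the data area (data_offset, or new_data_offset on the reshaped side of
 * an active reshape).  It returns 1 on success and 0 on error.
 */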
797
798 static int read_disk_sb(struct md_rdev *rdev, int size)
799 {
800         char b[BDEVNAME_SIZE];
801
802         if (rdev->sb_loaded)
803                 return 0;
804
805         if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true))
806                 goto fail;
807         rdev->sb_loaded = 1;
808         return 0;
809
810 fail:
811         pr_err("md: disabled device %s, could not read superblock.\n",
812                bdevname(rdev->bdev,b));
813         return -EINVAL;
814 }
815
816 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
817 {
818         return  sb1->set_uuid0 == sb2->set_uuid0 &&
819                 sb1->set_uuid1 == sb2->set_uuid1 &&
820                 sb1->set_uuid2 == sb2->set_uuid2 &&
821                 sb1->set_uuid3 == sb2->set_uuid3;
822 }
823
824 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
825 {
826         int ret;
827         mdp_super_t *tmp1, *tmp2;
828
829         tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
830         tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
831
832         if (!tmp1 || !tmp2) {
833                 ret = 0;
834                 goto abort;
835         }
836
837         *tmp1 = *sb1;
838         *tmp2 = *sb2;
839
840         /*
841          * nr_disks is not constant
842          */
843         tmp1->nr_disks = 0;
844         tmp2->nr_disks = 0;
845
846         ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
847 abort:
848         kfree(tmp1);
849         kfree(tmp2);
850         return ret;
851 }
852
853 static u32 md_csum_fold(u32 csum)
854 {
855         csum = (csum & 0xffff) + (csum >> 16);
856         return (csum & 0xffff) + (csum >> 16);
857 }
858
859 static unsigned int calc_sb_csum(mdp_super_t *sb)
860 {
861         u64 newcsum = 0;
862         u32 *sb32 = (u32*)sb;
863         int i;
864         unsigned int disk_csum, csum;
865
866         disk_csum = sb->sb_csum;
867         sb->sb_csum = 0;
868
869         for (i = 0; i < MD_SB_BYTES/4 ; i++)
870                 newcsum += sb32[i];
871         csum = (newcsum & 0xffffffff) + (newcsum>>32);
872
873 #ifdef CONFIG_ALPHA
874         /* This used to use csum_partial, which was wrong for several
875          * reasons including that different results are returned on
876          * different architectures.  It isn't critical that we get exactly
877          * the same return value as before (we always csum_fold before
878          * testing, and that removes any differences).  However as we
879          * know that csum_partial always returned a 16bit value on
880          * alphas, do a fold to maximise conformity to previous behaviour.
881          */
882         sb->sb_csum = md_csum_fold(disk_csum);
883 #else
884         sb->sb_csum = disk_csum;
885 #endif
886         return csum;
887 }
888
889 /*
890  * Handle superblock details.
891  * We want to be able to handle multiple superblock formats
892  * so we have a common interface to them all, and an array of
893  * different handlers.
894  * We rely on user-space to write the initial superblock, and support
895  * reading and updating of superblocks.
896  * Interface methods are:
897  *   int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
898  *      loads and validates a superblock on dev.
899  *      if refdev != NULL, compare superblocks on both devices
900  *    Return:
901  *      0 - dev has a superblock that is compatible with refdev
902  *      1 - dev has a superblock that is compatible and newer than refdev
903  *          so dev should be used as the refdev in future
904  *     -EINVAL superblock incompatible or invalid
905  *     -othererror e.g. -EIO
906  *
907  *   int validate_super(struct mddev *mddev, struct md_rdev *dev)
908  *      Verify that dev is acceptable into mddev.
909  *       The first time, mddev->raid_disks will be 0, and data from
910  *       dev should be merged in.  Subsequent calls check that dev
911  *       is new enough.  Return 0 or -EINVAL
912  *
913  *   void sync_super(struct mddev *mddev, struct md_rdev *dev)
914  *     Update the superblock for rdev with data in mddev
915  *     This does not write to disc.
916  *
917  */
918
919 struct super_type  {
920         char                *name;
921         struct module       *owner;
922         int                 (*load_super)(struct md_rdev *rdev,
923                                           struct md_rdev *refdev,
924                                           int minor_version);
925         int                 (*validate_super)(struct mddev *mddev,
926                                               struct md_rdev *rdev);
927         void                (*sync_super)(struct mddev *mddev,
928                                           struct md_rdev *rdev);
929         unsigned long long  (*rdev_size_change)(struct md_rdev *rdev,
930                                                 sector_t num_sectors);
931         int                 (*allow_new_offset)(struct md_rdev *rdev,
932                                                 unsigned long long new_offset);
933 };
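
/*
 * Two implementations of this interface follow: super_90_* for the
 * original 0.90 format and super_1_* for the version-1 format; they are
 * collected in the super_types[] table further down in this file.
 */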
934
935 /*
936  * Check that the given mddev has no bitmap.
937  *
938  * This function is called from the run method of all personalities that do not
939  * support bitmaps. It prints an error message and returns non-zero if mddev
940  * has a bitmap. Otherwise, it returns 0.
941  *
942  */
943 int md_check_no_bitmap(struct mddev *mddev)
944 {
945         if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
946                 return 0;
947         pr_warn("%s: bitmaps are not supported for %s\n",
948                 mdname(mddev), mddev->pers->name);
949         return 1;
950 }
951 EXPORT_SYMBOL(md_check_no_bitmap);
952
953 /*
954  * load_super for 0.90.0
955  */
956 static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
957 {
958         char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
959         mdp_super_t *sb;
960         int ret;
961
962         /*
963          * Calculate the position of the superblock (in 512-byte sectors);
964          * it's at the end of the disk.
965          *
966          * It also happens to be a multiple of 4Kb.
967          */
968         rdev->sb_start = calc_dev_sboffset(rdev);
969
970         ret = read_disk_sb(rdev, MD_SB_BYTES);
971         if (ret)
972                 return ret;
973
974         ret = -EINVAL;
975
976         bdevname(rdev->bdev, b);
977         sb = page_address(rdev->sb_page);
978
979         if (sb->md_magic != MD_SB_MAGIC) {
980                 pr_warn("md: invalid raid superblock magic on %s\n", b);
981                 goto abort;
982         }
983
984         if (sb->major_version != 0 ||
985             sb->minor_version < 90 ||
986             sb->minor_version > 91) {
987                 pr_warn("Bad version number %d.%d on %s\n",
988                         sb->major_version, sb->minor_version, b);
989                 goto abort;
990         }
991
992         if (sb->raid_disks <= 0)
993                 goto abort;
994
995         if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
996                 pr_warn("md: invalid superblock checksum on %s\n", b);
997                 goto abort;
998         }
999
1000         rdev->preferred_minor = sb->md_minor;
1001         rdev->data_offset = 0;
1002         rdev->new_data_offset = 0;
1003         rdev->sb_size = MD_SB_BYTES;
1004         rdev->badblocks.shift = -1;
1005
1006         if (sb->level == LEVEL_MULTIPATH)
1007                 rdev->desc_nr = -1;
1008         else
1009                 rdev->desc_nr = sb->this_disk.number;
1010
1011         if (!refdev) {
1012                 ret = 1;
1013         } else {
1014                 __u64 ev1, ev2;
1015                 mdp_super_t *refsb = page_address(refdev->sb_page);
1016                 if (!uuid_equal(refsb, sb)) {
1017                         pr_warn("md: %s has different UUID to %s\n",
1018                                 b, bdevname(refdev->bdev,b2));
1019                         goto abort;
1020                 }
1021                 if (!sb_equal(refsb, sb)) {
1022                         pr_warn("md: %s has same UUID but different superblock to %s\n",
1023                                 b, bdevname(refdev->bdev, b2));
1024                         goto abort;
1025                 }
1026                 ev1 = md_event(sb);
1027                 ev2 = md_event(refsb);
1028                 if (ev1 > ev2)
1029                         ret = 1;
1030                 else
1031                         ret = 0;
1032         }
1033         rdev->sectors = rdev->sb_start;
1034         /* Limit to 4TB as metadata cannot record more than that.
1035          * (not needed for Linear and RAID0 as metadata doesn't
1036          * record this size)
1037          */
1038         if (IS_ENABLED(CONFIG_LBDAF) && (u64)rdev->sectors >= (2ULL << 32) &&
1039             sb->level >= 1)
1040                 rdev->sectors = (sector_t)(2ULL << 32) - 2;
1041
1042         if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
1043                 /* "this cannot possibly happen" ... */
1044                 ret = -EINVAL;
1045
1046  abort:
1047         return ret;
1048 }
1049
1050 /*
1051  * validate_super for 0.90.0
1052  */
1053 static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1054 {
1055         mdp_disk_t *desc;
1056         mdp_super_t *sb = page_address(rdev->sb_page);
1057         __u64 ev1 = md_event(sb);
1058
1059         rdev->raid_disk = -1;
1060         clear_bit(Faulty, &rdev->flags);
1061         clear_bit(In_sync, &rdev->flags);
1062         clear_bit(Bitmap_sync, &rdev->flags);
1063         clear_bit(WriteMostly, &rdev->flags);
1064
1065         if (mddev->raid_disks == 0) {
1066                 mddev->major_version = 0;
1067                 mddev->minor_version = sb->minor_version;
1068                 mddev->patch_version = sb->patch_version;
1069                 mddev->external = 0;
1070                 mddev->chunk_sectors = sb->chunk_size >> 9;
1071                 mddev->ctime = sb->ctime;
1072                 mddev->utime = sb->utime;
1073                 mddev->level = sb->level;
1074                 mddev->clevel[0] = 0;
1075                 mddev->layout = sb->layout;
1076                 mddev->raid_disks = sb->raid_disks;
1077                 mddev->dev_sectors = ((sector_t)sb->size) * 2;
1078                 mddev->events = ev1;
1079                 mddev->bitmap_info.offset = 0;
1080                 mddev->bitmap_info.space = 0;
1081                 /* bitmap can use 60 K after the 4K superblocks */
1082                 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
1083                 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
1084                 mddev->reshape_backwards = 0;
1085
1086                 if (mddev->minor_version >= 91) {
1087                         mddev->reshape_position = sb->reshape_position;
1088                         mddev->delta_disks = sb->delta_disks;
1089                         mddev->new_level = sb->new_level;
1090                         mddev->new_layout = sb->new_layout;
1091                         mddev->new_chunk_sectors = sb->new_chunk >> 9;
1092                         if (mddev->delta_disks < 0)
1093                                 mddev->reshape_backwards = 1;
1094                 } else {
1095                         mddev->reshape_position = MaxSector;
1096                         mddev->delta_disks = 0;
1097                         mddev->new_level = mddev->level;
1098                         mddev->new_layout = mddev->layout;
1099                         mddev->new_chunk_sectors = mddev->chunk_sectors;
1100                 }
1101
1102                 if (sb->state & (1<<MD_SB_CLEAN))
1103                         mddev->recovery_cp = MaxSector;
1104                 else {
1105                         if (sb->events_hi == sb->cp_events_hi &&
1106                                 sb->events_lo == sb->cp_events_lo) {
1107                                 mddev->recovery_cp = sb->recovery_cp;
1108                         } else
1109                                 mddev->recovery_cp = 0;
1110                 }
1111
1112                 memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
1113                 memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
1114                 memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
1115                 memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
1116
1117                 mddev->max_disks = MD_SB_DISKS;
1118
1119                 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
1120                     mddev->bitmap_info.file == NULL) {
1121                         mddev->bitmap_info.offset =
1122                                 mddev->bitmap_info.default_offset;
1123                         mddev->bitmap_info.space =
1124                                 mddev->bitmap_info.default_space;
1125                 }
1126
1127         } else if (mddev->pers == NULL) {
1128                 /* Insist on good event counter while assembling, except
1129                  * for spares (which don't need an event count) */
1130                 ++ev1;
1131                 if (sb->disks[rdev->desc_nr].state & (
1132                             (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1133                         if (ev1 < mddev->events)
1134                                 return -EINVAL;
1135         } else if (mddev->bitmap) {
1136                 /* if adding to array with a bitmap, then we can accept an
1137                  * older device ... but not too old.
1138                  */
1139                 if (ev1 < mddev->bitmap->events_cleared)
1140                         return 0;
1141                 if (ev1 < mddev->events)
1142                         set_bit(Bitmap_sync, &rdev->flags);
1143         } else {
1144                 if (ev1 < mddev->events)
1145                         /* just a hot-add of a new device, leave raid_disk at -1 */
1146                         return 0;
1147         }
1148
1149         if (mddev->level != LEVEL_MULTIPATH) {
1150                 desc = sb->disks + rdev->desc_nr;
1151
1152                 if (desc->state & (1<<MD_DISK_FAULTY))
1153                         set_bit(Faulty, &rdev->flags);
1154                 else if (desc->state & (1<<MD_DISK_SYNC) /* &&
1155                             desc->raid_disk < mddev->raid_disks */) {
1156                         set_bit(In_sync, &rdev->flags);
1157                         rdev->raid_disk = desc->raid_disk;
1158                         rdev->saved_raid_disk = desc->raid_disk;
1159                 } else if (desc->state & (1<<MD_DISK_ACTIVE)) {
1160                         /* active but not in sync implies recovery up to
1161                          * reshape position.  We don't know exactly where
1162                          * that is, so set to zero for now */
1163                         if (mddev->minor_version >= 91) {
1164                                 rdev->recovery_offset = 0;
1165                                 rdev->raid_disk = desc->raid_disk;
1166                         }
1167                 }
1168                 if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
1169                         set_bit(WriteMostly, &rdev->flags);
1170                 if (desc->state & (1<<MD_DISK_FAILFAST))
1171                         set_bit(FailFast, &rdev->flags);
1172         } else /* MULTIPATH are always insync */
1173                 set_bit(In_sync, &rdev->flags);
1174         return 0;
1175 }
1176
1177 /*
1178  * sync_super for 0.90.0
1179  */
1180 static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
1181 {
1182         mdp_super_t *sb;
1183         struct md_rdev *rdev2;
1184         int next_spare = mddev->raid_disks;
1185
1186         /* make rdev->sb match mddev data..
1187          *
1188          * 1/ zero out disks
1189          * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
1190          * 3/ any empty disks < next_spare become removed
1191          *
1192          * disks[0] gets initialised to REMOVED because
1193          * we cannot be sure from other fields if it has
1194          * been initialised or not.
1195          */
1196         int i;
1197         int active=0, working=0,failed=0,spare=0,nr_disks=0;
1198
1199         rdev->sb_size = MD_SB_BYTES;
1200
1201         sb = page_address(rdev->sb_page);
1202
1203         memset(sb, 0, sizeof(*sb));
1204
1205         sb->md_magic = MD_SB_MAGIC;
1206         sb->major_version = mddev->major_version;
1207         sb->patch_version = mddev->patch_version;
1208         sb->gvalid_words  = 0; /* ignored */
1209         memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
1210         memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
1211         memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
1212         memcpy(&sb->set_uuid3, mddev->uuid+12,4);
1213
1214         sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
1215         sb->level = mddev->level;
1216         sb->size = mddev->dev_sectors / 2;
1217         sb->raid_disks = mddev->raid_disks;
1218         sb->md_minor = mddev->md_minor;
1219         sb->not_persistent = 0;
1220         sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
1221         sb->state = 0;
1222         sb->events_hi = (mddev->events>>32);
1223         sb->events_lo = (u32)mddev->events;
1224
1225         if (mddev->reshape_position == MaxSector)
1226                 sb->minor_version = 90;
1227         else {
1228                 sb->minor_version = 91;
1229                 sb->reshape_position = mddev->reshape_position;
1230                 sb->new_level = mddev->new_level;
1231                 sb->delta_disks = mddev->delta_disks;
1232                 sb->new_layout = mddev->new_layout;
1233                 sb->new_chunk = mddev->new_chunk_sectors << 9;
1234         }
1235         mddev->minor_version = sb->minor_version;
1236         if (mddev->in_sync)
1237         {
1238                 sb->recovery_cp = mddev->recovery_cp;
1239                 sb->cp_events_hi = (mddev->events>>32);
1240                 sb->cp_events_lo = (u32)mddev->events;
1241                 if (mddev->recovery_cp == MaxSector)
1242                         sb->state = (1<< MD_SB_CLEAN);
1243         } else
1244                 sb->recovery_cp = 0;
1245
1246         sb->layout = mddev->layout;
1247         sb->chunk_size = mddev->chunk_sectors << 9;
1248
1249         if (mddev->bitmap && mddev->bitmap_info.file == NULL)
1250                 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1251
1252         sb->disks[0].state = (1<<MD_DISK_REMOVED);
1253         rdev_for_each(rdev2, mddev) {
1254                 mdp_disk_t *d;
1255                 int desc_nr;
1256                 int is_active = test_bit(In_sync, &rdev2->flags);
1257
1258                 if (rdev2->raid_disk >= 0 &&
1259                     sb->minor_version >= 91)
1260                         /* we have nowhere to store the recovery_offset,
1261                          * but if it is not below the reshape_position,
1262                          * we can piggy-back on that.
1263                          */
1264                         is_active = 1;
1265                 if (rdev2->raid_disk < 0 ||
1266                     test_bit(Faulty, &rdev2->flags))
1267                         is_active = 0;
1268                 if (is_active)
1269                         desc_nr = rdev2->raid_disk;
1270                 else
1271                         desc_nr = next_spare++;
1272                 rdev2->desc_nr = desc_nr;
1273                 d = &sb->disks[rdev2->desc_nr];
1274                 nr_disks++;
1275                 d->number = rdev2->desc_nr;
1276                 d->major = MAJOR(rdev2->bdev->bd_dev);
1277                 d->minor = MINOR(rdev2->bdev->bd_dev);
1278                 if (is_active)
1279                         d->raid_disk = rdev2->raid_disk;
1280                 else
1281                         d->raid_disk = rdev2->desc_nr; /* compatibility */
1282                 if (test_bit(Faulty, &rdev2->flags))
1283                         d->state = (1<<MD_DISK_FAULTY);
1284                 else if (is_active) {
1285                         d->state = (1<<MD_DISK_ACTIVE);
1286                         if (test_bit(In_sync, &rdev2->flags))
1287                                 d->state |= (1<<MD_DISK_SYNC);
1288                         active++;
1289                         working++;
1290                 } else {
1291                         d->state = 0;
1292                         spare++;
1293                         working++;
1294                 }
1295                 if (test_bit(WriteMostly, &rdev2->flags))
1296                         d->state |= (1<<MD_DISK_WRITEMOSTLY);
1297                 if (test_bit(FailFast, &rdev2->flags))
1298                         d->state |= (1<<MD_DISK_FAILFAST);
1299         }
1300         /* now set the "removed" and "faulty" bits on any missing devices */
1301         for (i=0 ; i < mddev->raid_disks ; i++) {
1302                 mdp_disk_t *d = &sb->disks[i];
1303                 if (d->state == 0 && d->number == 0) {
1304                         d->number = i;
1305                         d->raid_disk = i;
1306                         d->state = (1<<MD_DISK_REMOVED);
1307                         d->state |= (1<<MD_DISK_FAULTY);
1308                         failed++;
1309                 }
1310         }
1311         sb->nr_disks = nr_disks;
1312         sb->active_disks = active;
1313         sb->working_disks = working;
1314         sb->failed_disks = failed;
1315         sb->spare_disks = spare;
1316
1317         sb->this_disk = sb->disks[rdev->desc_nr];
1318         sb->sb_csum = calc_sb_csum(sb);
1319 }
1320
1321 /*
1322  * rdev_size_change for 0.90.0
1323  */
1324 static unsigned long long
1325 super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1326 {
1327         if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1328                 return 0; /* component must fit device */
1329         if (rdev->mddev->bitmap_info.offset)
1330                 return 0; /* can't move bitmap */
1331         rdev->sb_start = calc_dev_sboffset(rdev);
1332         if (!num_sectors || num_sectors > rdev->sb_start)
1333                 num_sectors = rdev->sb_start;
1334         /* Limit to 4TB as metadata cannot record more than that.
1335          * 4TB == 2^32 KB, or 2*2^32 sectors.
1336          */
1337         if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) &&
1338             rdev->mddev->level >= 1)
1339                 num_sectors = (sector_t)(2ULL << 32) - 2;
1340         do {
1341                 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1342                        rdev->sb_page);
1343         } while (md_super_wait(rdev->mddev) < 0);
1344         return num_sectors;
1345 }
1346
1347 static int
1348 super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
1349 {
1350         /* non-zero offset changes not possible with v0.90 */
1351         return new_offset == 0;
1352 }
1353
1354 /*
1355  * version 1 superblock
1356  */
1357
1358 static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
1359 {
1360         __le32 disk_csum;
1361         u32 csum;
1362         unsigned long long newcsum;
1363         int size = 256 + le32_to_cpu(sb->max_dev)*2;
1364         __le32 *isuper = (__le32*)sb;
1365
1366         disk_csum = sb->sb_csum;
1367         sb->sb_csum = 0;
1368         newcsum = 0;
1369         for (; size >= 4; size -= 4)
1370                 newcsum += le32_to_cpu(*isuper++);
1371
1372         if (size == 2)
1373                 newcsum += le16_to_cpu(*(__le16*) isuper);
1374
1375         csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1376         sb->sb_csum = disk_csum;
1377         return cpu_to_le32(csum);
1378 }
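
/*
 * Unlike the 0.90 checksum above, the v1 checksum covers 256 bytes plus
 * two bytes per device slot (max_dev), is accumulated from little-endian
 * 32-bit words, and is stored as a __le32.
 */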
1379
1380 static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1381 {
1382         struct mdp_superblock_1 *sb;
1383         int ret;
1384         sector_t sb_start;
1385         sector_t sectors;
1386         char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1387         int bmask;
1388
1389         /*
1390          * Calculate the position of the superblock in 512-byte sectors.
1391          * It is always aligned to a 4K boundary and,
1392          * depending on minor_version, it can be:
1393          * 0: At least 8K, but less than 12K, from end of device
1394          * 1: At start of device
1395          * 2: 4K from start of device.
1396          */
1397         switch(minor_version) {
1398         case 0:
1399                 sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
1400                 sb_start -= 8*2;
1401                 sb_start &= ~(sector_t)(4*2-1);
1402                 break;
1403         case 1:
1404                 sb_start = 0;
1405                 break;
1406         case 2:
1407                 sb_start = 8;
1408                 break;
1409         default:
1410                 return -EINVAL;
1411         }
1412         rdev->sb_start = sb_start;
1413
1414         /* superblock is rarely larger than 1K, but it can be larger,
1415          * and it is safe to read 4k, so we do that
1416          */
1417         ret = read_disk_sb(rdev, 4096);
1418         if (ret) return ret;
1419
1420         sb = page_address(rdev->sb_page);
1421
1422         if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1423             sb->major_version != cpu_to_le32(1) ||
1424             le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1425             le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1426             (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1427                 return -EINVAL;
1428
1429         if (calc_sb_1_csum(sb) != sb->sb_csum) {
1430                 pr_warn("md: invalid superblock checksum on %s\n",
1431                         bdevname(rdev->bdev,b));
1432                 return -EINVAL;
1433         }
1434         if (le64_to_cpu(sb->data_size) < 10) {
1435                 pr_warn("md: data_size too small on %s\n",
1436                         bdevname(rdev->bdev,b));
1437                 return -EINVAL;
1438         }
1439         if (sb->pad0 ||
1440             sb->pad3[0] ||
1441             memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
1442                 /* Some padding is non-zero, might be a new feature */
1443                 return -EINVAL;
1444
1445         rdev->preferred_minor = 0xffff;
1446         rdev->data_offset = le64_to_cpu(sb->data_offset);
1447         rdev->new_data_offset = rdev->data_offset;
1448         if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
1449             (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
1450                 rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
1451         atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1452
1453         rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
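             /* Round sb_size up to a multiple of the device's logical block
              * size so superblock writes never cover a partial block.
              */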
1454         bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1455         if (rdev->sb_size & bmask)
1456                 rdev->sb_size = (rdev->sb_size | bmask) + 1;
1457
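             /* For 1.1/1.2 metadata the data area must start beyond the
              * superblock itself, both at the current offset and at any
              * pending new offset.
              */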
1458         if (minor_version
1459             && rdev->data_offset < sb_start + (rdev->sb_size/512))
1460                 return -EINVAL;
1461         if (minor_version
1462             && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
1463                 return -EINVAL;
1464
1465         if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1466                 rdev->desc_nr = -1;
1467         else
1468                 rdev->desc_nr = le32_to_cpu(sb->dev_number);
1469
1470         if (!rdev->bb_page) {
1471                 rdev->bb_page = alloc_page(GFP_KERNEL);
1472                 if (!rdev->bb_page)
1473                         return -ENOMEM;
1474         }
1475         if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
1476             rdev->badblocks.count == 0) {
1477                 /* need to load the bad block list.
1478                  * Currently we limit it to one page.
1479                  */
1480                 s32 offset;
1481                 sector_t bb_sector;
1482                 u64 *bbp;
1483                 int i;
1484                 int sectors = le16_to_cpu(sb->bblog_size);
1485                 if (sectors > (PAGE_SIZE / 512))
1486                         return -EINVAL;
1487                 offset = le32_to_cpu(sb->bblog_offset);
1488                 if (offset == 0)
1489                         return -EINVAL;
1490                 bb_sector = (long long)offset;
1491                 if (!sync_page_io(rdev, bb_sector, sectors << 9,
1492                                   rdev->bb_page, REQ_OP_READ, 0, true))
1493                         return -EIO;
1494                 bbp = (u64 *)page_address(rdev->bb_page);
1495                 rdev->badblocks.shift = sb->bblog_shift;
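                     /* Each on-disk entry is a little-endian u64: the top 54
                      * bits are the start sector and the low 10 bits the
                      * length, both scaled by bblog_shift; an all-ones entry
                      * ends the list.
                      */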
1496                 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
1497                         u64 bb = le64_to_cpu(*bbp);
1498                         int count = bb & (0x3ff);
1499                         u64 sector = bb >> 10;
1500                         sector <<= sb->bblog_shift;
1501                         count <<= sb->bblog_shift;
1502                         if (bb + 1 == 0)
1503                                 break;
1504                         if (badblocks_set(&rdev->badblocks, sector, count, 1))
1505                                 return -EINVAL;
1506                 }
1507         } else if (sb->bblog_offset != 0)
1508                 rdev->badblocks.shift = 0;
1509
1510         if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) {
1511                 rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
1512                 rdev->ppl.size = le16_to_cpu(sb->ppl.size);
1513                 rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
1514         }
1515
1516         if (!refdev) {
1517                 ret = 1;
1518         } else {
1519                 __u64 ev1, ev2;
1520                 struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
1521
1522                 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1523                     sb->level != refsb->level ||
1524                     sb->layout != refsb->layout ||
1525                     sb->chunksize != refsb->chunksize) {
1526                         pr_warn("md: %s has strangely different superblock to %s\n",
1527                                 bdevname(rdev->bdev,b),
1528                                 bdevname(refdev->bdev,b2));
1529                         return -EINVAL;
1530                 }
1531                 ev1 = le64_to_cpu(sb->events);
1532                 ev2 = le64_to_cpu(refsb->events);
1533
1534                 if (ev1 > ev2)
1535                         ret = 1;
1536                 else
1537                         ret = 0;
1538         }
1539         if (minor_version) {
1540                 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
1541                 sectors -= rdev->data_offset;
1542         } else
1543                 sectors = rdev->sb_start;
1544         if (sectors < le64_to_cpu(sb->data_size))
1545                 return -EINVAL;
1546         rdev->sectors = le64_to_cpu(sb->data_size);
1547         return ret;
1548 }
1549
1550 static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1551 {
1552         struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1553         __u64 ev1 = le64_to_cpu(sb->events);
1554
1555         rdev->raid_disk = -1;
1556         clear_bit(Faulty, &rdev->flags);
1557         clear_bit(In_sync, &rdev->flags);
1558         clear_bit(Bitmap_sync, &rdev->flags);
1559         clear_bit(WriteMostly, &rdev->flags);
1560
1561         if (mddev->raid_disks == 0) {
1562                 mddev->major_version = 1;
1563                 mddev->patch_version = 0;
1564                 mddev->external = 0;
1565                 mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1566                 mddev->ctime = le64_to_cpu(sb->ctime);
1567                 mddev->utime = le64_to_cpu(sb->utime);
1568                 mddev->level = le32_to_cpu(sb->level);
1569                 mddev->clevel[0] = 0;
1570                 mddev->layout = le32_to_cpu(sb->layout);
1571                 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1572                 mddev->dev_sectors = le64_to_cpu(sb->size);
1573                 mddev->events = ev1;
1574                 mddev->bitmap_info.offset = 0;
1575                 mddev->bitmap_info.space = 0;
1576                 /* Default location for the bitmap is 1K after the
1577                  * superblock, using 3K, for a total of 4K.
1578                  */
1579                 mddev->bitmap_info.default_offset = 1024 >> 9;
1580                 mddev->bitmap_info.default_space = (4096-1024) >> 9;
1581                 mddev->reshape_backwards = 0;
1582
1583                 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1584                 memcpy(mddev->uuid, sb->set_uuid, 16);
1585
1586                 mddev->max_disks =  (4096-256)/2;
1587
1588                 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1589                     mddev->bitmap_info.file == NULL) {
1590                         mddev->bitmap_info.offset =
1591                                 (__s32)le32_to_cpu(sb->bitmap_offset);
1592                         /* Metadata doesn't record how much space is available.
1593                          * For 1.0, we assume we can use up to the superblock
1594                          * if the bitmap is before it, else 4K beyond the superblock.
1595                          * For other versions, assume no change is possible.
1596                          */
1597                         if (mddev->minor_version > 0)
1598                                 mddev->bitmap_info.space = 0;
1599                         else if (mddev->bitmap_info.offset > 0)
1600                                 mddev->bitmap_info.space =
1601                                         8 - mddev->bitmap_info.offset;
1602                         else
1603                                 mddev->bitmap_info.space =
1604                                         -mddev->bitmap_info.offset;
1605                 }
1606
1607                 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1608                         mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1609                         mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1610                         mddev->new_level = le32_to_cpu(sb->new_level);
1611                         mddev->new_layout = le32_to_cpu(sb->new_layout);
1612                         mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1613                         if (mddev->delta_disks < 0 ||
1614                             (mddev->delta_disks == 0 &&
1615                              (le32_to_cpu(sb->feature_map)
1616                               & MD_FEATURE_RESHAPE_BACKWARDS)))
1617                                 mddev->reshape_backwards = 1;
1618                 } else {
1619                         mddev->reshape_position = MaxSector;
1620                         mddev->delta_disks = 0;
1621                         mddev->new_level = mddev->level;
1622                         mddev->new_layout = mddev->layout;
1623                         mddev->new_chunk_sectors = mddev->chunk_sectors;
1624                 }
1625
1626                 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
1627                         set_bit(MD_HAS_JOURNAL, &mddev->flags);
1628
1629                 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) {
1630                         if (le32_to_cpu(sb->feature_map) &
1631                             (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL))
1632                                 return -EINVAL;
1633                         set_bit(MD_HAS_PPL, &mddev->flags);
1634                 }
1635         } else if (mddev->pers == NULL) {
1636                 /* Insist on a good event counter while assembling, except for
1637                  * spares (which don't need an event count) */
1638                 ++ev1;
1639                 if (rdev->desc_nr >= 0 &&
1640                     rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1641                     (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
1642                      le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
1643                         if (ev1 < mddev->events)
1644                                 return -EINVAL;
1645         } else if (mddev->bitmap) {
1646                 /* If adding to array with a bitmap, then we can accept an
1647                  * older device, but not too old.
1648                  */
1649                 if (ev1 < mddev->bitmap->events_cleared)
1650                         return 0;
1651                 if (ev1 < mddev->events)
1652                         set_bit(Bitmap_sync, &rdev->flags);
1653         } else {
1654                 if (ev1 < mddev->events)
1655                         /* just a hot-add of a new device, leave raid_disk at -1 */
1656                         return 0;
1657         }
1658         if (mddev->level != LEVEL_MULTIPATH) {
1659                 int role;
1660                 if (rdev->desc_nr < 0 ||
1661                     rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1662                         role = MD_DISK_ROLE_SPARE;
1663                         rdev->desc_nr = -1;
1664                 } else
1665                         role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1666                 switch(role) {
1667                 case MD_DISK_ROLE_SPARE: /* spare */
1668                         break;
1669                 case MD_DISK_ROLE_FAULTY: /* faulty */
1670                         set_bit(Faulty, &rdev->flags);
1671                         break;
1672                 case MD_DISK_ROLE_JOURNAL: /* journal device */
1673                         if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
1674                                 /* journal device without journal feature */
1675                                 pr_warn("md: journal device provided without journal feature, ignoring the device\n");
1676                                 return -EINVAL;
1677                         }
1678                         set_bit(Journal, &rdev->flags);
1679                         rdev->journal_tail = le64_to_cpu(sb->journal_tail);
1680                         rdev->raid_disk = 0;
1681                         break;
1682                 default:
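                             /* An active data device: remember its slot, and
                              * only mark it In_sync when no partial recovery
                              * offset is recorded in the superblock.
                              */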
1683                         rdev->saved_raid_disk = role;
1684                         if ((le32_to_cpu(sb->feature_map) &
1685                              MD_FEATURE_RECOVERY_OFFSET)) {
1686                                 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1687                                 if (!(le32_to_cpu(sb->feature_map) &
1688                                       MD_FEATURE_RECOVERY_BITMAP))
1689                                         rdev->saved_raid_disk = -1;
1690                         } else
1691                                 set_bit(In_sync, &rdev->flags);
1692                         rdev->raid_disk = role;
1693                         break;
1694                 }
1695                 if (sb->devflags & WriteMostly1)
1696                         set_bit(WriteMostly, &rdev->flags);
1697                 if (sb->devflags & FailFast1)
1698                         set_bit(FailFast, &rdev->flags);
1699                 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
1700                         set_bit(Replacement, &rdev->flags);
1701         } else /* MULTIPATH are always insync */
1702                 set_bit(In_sync, &rdev->flags);
1703
1704         return 0;
1705 }
1706
1707 static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1708 {
1709         struct mdp_superblock_1 *sb;
1710         struct md_rdev *rdev2;
1711         int max_dev, i;
1712         /* make rdev->sb match mddev and rdev data. */
1713
1714         sb = page_address(rdev->sb_page);
1715
1716         sb->feature_map = 0;
1717         sb->pad0 = 0;
1718         sb->recovery_offset = cpu_to_le64(0);
1719         memset(sb->pad3, 0, sizeof(sb->pad3));
1720
1721         sb->utime = cpu_to_le64((__u64)mddev->utime);
1722         sb->events = cpu_to_le64(mddev->events);
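             /* Record resync progress for a clean array, claim MaxSector when
              * a clean journal makes the data trustworthy, and otherwise
              * record 0 so a later assembly forces a full resync.
              */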
1723         if (mddev->in_sync)
1724                 sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1725         else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
1726                 sb->resync_offset = cpu_to_le64(MaxSector);
1727         else
1728                 sb->resync_offset = cpu_to_le64(0);
1729
1730         sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
1731
1732         sb->raid_disks = cpu_to_le32(mddev->raid_disks);
1733         sb->size = cpu_to_le64(mddev->dev_sectors);
1734         sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
1735         sb->level = cpu_to_le32(mddev->level);
1736         sb->layout = cpu_to_le32(mddev->layout);
1737         if (test_bit(FailFast, &rdev->flags))
1738                 sb->devflags |= FailFast1;
1739         else
1740                 sb->devflags &= ~FailFast1;
1741
1742         if (test_bit(WriteMostly, &rdev->flags))
1743                 sb->devflags |= WriteMostly1;
1744         else
1745                 sb->devflags &= ~WriteMostly1;
1746         sb->data_offset = cpu_to_le64(rdev->data_offset);
1747         sb->data_size = cpu_to_le64(rdev->sectors);
1748
1749         if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
1750                 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
1751                 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1752         }
1753
1754         if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
1755             !test_bit(In_sync, &rdev->flags)) {
1756                 sb->feature_map |=
1757                         cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1758                 sb->recovery_offset =
1759                         cpu_to_le64(rdev->recovery_offset);
1760                 if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
1761                         sb->feature_map |=
1762                                 cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
1763         }
1764         /* Note: recovery_offset and journal_tail share space  */
1765         if (test_bit(Journal, &rdev->flags))
1766                 sb->journal_tail = cpu_to_le64(rdev->journal_tail);
1767         if (test_bit(Replacement, &rdev->flags))
1768                 sb->feature_map |=
1769                         cpu_to_le32(MD_FEATURE_REPLACEMENT);
1770
1771         if (mddev->reshape_position != MaxSector) {
1772                 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1773                 sb->reshape_position = cpu_to_le64(mddev->reshape_position);
1774                 sb->new_layout = cpu_to_le32(mddev->new_layout);
1775                 sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1776                 sb->new_level = cpu_to_le32(mddev->new_level);
1777                 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
1778                 if (mddev->delta_disks == 0 &&
1779                     mddev->reshape_backwards)
1780                         sb->feature_map
1781                                 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
1782                 if (rdev->new_data_offset != rdev->data_offset) {
1783                         sb->feature_map
1784                                 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
1785                         sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
1786                                                              - rdev->data_offset));
1787                 }
1788         }
1789
1790         if (mddev_is_clustered(mddev))
1791                 sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);
1792
1793         if (rdev->badblocks.count == 0)
1794                 /* Nothing to do for bad blocks */ ;
1795         else if (sb->bblog_offset == 0)
1796                 /* Cannot record bad blocks on this device */
1797                 md_error(mddev, rdev);
1798         else {
1799                 struct badblocks *bb = &rdev->badblocks;
1800                 u64 *bbp = (u64 *)page_address(rdev->bb_page);
1801                 u64 *p = bb->page;
1802                 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
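                     /* Copy the in-memory bad-block list into the on-disk
                      * layout under a seqlock read loop, retrying if the list
                      * changed so the page we write out is consistent.
                      */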
1803                 if (bb->changed) {
1804                         unsigned seq;
1805
1806 retry:
1807                         seq = read_seqbegin(&bb->lock);
1808
1809                         memset(bbp, 0xff, PAGE_SIZE);
1810
1811                         for (i = 0 ; i < bb->count ; i++) {
1812                                 u64 internal_bb = p[i];
1813                                 u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
1814                                                 | BB_LEN(internal_bb));
1815                                 bbp[i] = cpu_to_le64(store_bb);
1816                         }
1817                         bb->changed = 0;
1818                         if (read_seqretry(&bb->lock, seq))
1819                                 goto retry;
1820
1821                         bb->sector = (rdev->sb_start +
1822                                       (int)le32_to_cpu(sb->bblog_offset));
1823                         bb->size = le16_to_cpu(sb->bblog_size);
1824                 }
1825         }
1826
1827         max_dev = 0;
1828         rdev_for_each(rdev2, mddev)
1829                 if (rdev2->desc_nr+1 > max_dev)
1830                         max_dev = rdev2->desc_nr+1;
1831
1832         if (max_dev > le32_to_cpu(sb->max_dev)) {
1833                 int bmask;
1834                 sb->max_dev = cpu_to_le32(max_dev);
1835                 rdev->sb_size = max_dev * 2 + 256;
1836                 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1837                 if (rdev->sb_size & bmask)
1838                         rdev->sb_size = (rdev->sb_size | bmask) + 1;
1839         } else
1840                 max_dev = le32_to_cpu(sb->max_dev);
1841
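             /* Default every published slot to 'faulty'; slots that hold a
              * live device are overwritten with their real role further down.
              */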
1842         for (i=0; i<max_dev;i++)
1843                 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
1844
1845         if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
1846                 sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
1847
1848         if (test_bit(MD_HAS_PPL, &mddev->flags)) {
1849                 sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL);
1850                 sb->ppl.offset = cpu_to_le16(rdev->ppl.offset);
1851                 sb->ppl.size = cpu_to_le16(rdev->ppl.size);
1852         }
1853
1854         rdev_for_each(rdev2, mddev) {
1855                 i = rdev2->desc_nr;
1856                 if (test_bit(Faulty, &rdev2->flags))
1857                         sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
1858                 else if (test_bit(In_sync, &rdev2->flags))
1859                         sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1860                 else if (test_bit(Journal, &rdev2->flags))
1861                         sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
1862                 else if (rdev2->raid_disk >= 0)
1863                         sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1864                 else
1865                         sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
1866         }
1867
1868         sb->sb_csum = calc_sb_1_csum(sb);
1869 }
1870
1871 static unsigned long long
1872 super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1873 {
1874         struct mdp_superblock_1 *sb;
1875         sector_t max_sectors;
1876         if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1877                 return 0; /* component must fit device */
1878         if (rdev->data_offset != rdev->new_data_offset)
1879                 return 0; /* too confusing */
1880         if (rdev->sb_start < rdev->data_offset) {
1881                 /* minor versions 1 and 2; superblock before data */
1882                 max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
1883                 max_sectors -= rdev->data_offset;
1884                 if (!num_sectors || num_sectors > max_sectors)
1885                         num_sectors = max_sectors;
1886         } else if (rdev->mddev->bitmap_info.offset) {
1887                 /* minor version 0 with bitmap we can't move */
1888                 return 0;
1889         } else {
1890                 /* minor version 0; superblock after data */
1891                 sector_t sb_start;
1892                 sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
1893                 sb_start &= ~(sector_t)(4*2 - 1);
1894                 max_sectors = rdev->sectors + sb_start - rdev->sb_start;
1895                 if (!num_sectors || num_sectors > max_sectors)
1896                         num_sectors = max_sectors;
1897                 rdev->sb_start = sb_start;
1898         }
1899         sb = page_address(rdev->sb_page);
1900         sb->data_size = cpu_to_le64(num_sectors);
1901         sb->super_offset = cpu_to_le64(rdev->sb_start);
1902         sb->sb_csum = calc_sb_1_csum(sb);
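             /* Keep rewriting the superblock until the write completes
              * cleanly; md_super_wait() returning a negative value indicates
              * a failed write that needs to be retried.
              */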
1903         do {
1904                 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1905                                rdev->sb_page);
1906         } while (md_super_wait(rdev->mddev) < 0);
1907         return num_sectors;
1908
1909 }
1910
1911 static int
1912 super_1_allow_new_offset(struct md_rdev *rdev,
1913                          unsigned long long new_offset)
1914 {
1915         /* All necessary checks on new >= old have been done */
1916         struct bitmap *bitmap;
1917         if (new_offset >= rdev->data_offset)
1918                 return 1;
1919
1920         /* with 1.0 metadata, there is no metadata to tread on
1921          * so we can always move back */
1922         if (rdev->mddev->minor_version == 0)
1923                 return 1;
1924
1925         /* otherwise we must be sure not to step on
1926          * any metadata, so stay:
1927          * 36K beyond start of superblock
1928          * beyond end of badblocks
1929          * beyond write-intent bitmap
1930          */
1931         if (rdev->sb_start + (32+4)*2 > new_offset)
1932                 return 0;
1933         bitmap = rdev->mddev->bitmap;
1934         if (bitmap && !rdev->mddev->bitmap_info.file &&
1935             rdev->sb_start + rdev->mddev->bitmap_info.offset +
1936             bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
1937                 return 0;
1938         if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
1939                 return 0;
1940
1941         return 1;
1942 }
1943
1944 static struct super_type super_types[] = {
1945         [0] = {
1946                 .name   = "0.90.0",
1947                 .owner  = THIS_MODULE,
1948                 .load_super         = super_90_load,
1949                 .validate_super     = super_90_validate,
1950                 .sync_super         = super_90_sync,
1951                 .rdev_size_change   = super_90_rdev_size_change,
1952                 .allow_new_offset   = super_90_allow_new_offset,
1953         },
1954         [1] = {
1955                 .name   = "md-1",
1956                 .owner  = THIS_MODULE,
1957                 .load_super         = super_1_load,
1958                 .validate_super     = super_1_validate,
1959                 .sync_super         = super_1_sync,
1960                 .rdev_size_change   = super_1_rdev_size_change,
1961                 .allow_new_offset   = super_1_allow_new_offset,
1962         },
1963 };
1964
1965 static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
1966 {
1967         if (mddev->sync_super) {
1968                 mddev->sync_super(mddev, rdev);
1969                 return;
1970         }
1971
1972         BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));
1973
1974         super_types[mddev->major_version].sync_super(mddev, rdev);
1975 }
1976
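     /*
      * Report whether two arrays share an underlying whole disk: any pair of
      * active, non-journal members sitting on the same bd_contains counts.
      */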
1977 static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
1978 {
1979         struct md_rdev *rdev, *rdev2;
1980
1981         rcu_read_lock();
1982         rdev_for_each_rcu(rdev, mddev1) {
1983                 if (test_bit(Faulty, &rdev->flags) ||
1984                     test_bit(Journal, &rdev->flags) ||
1985                     rdev->raid_disk == -1)
1986                         continue;
1987                 rdev_for_each_rcu(rdev2, mddev2) {
1988                         if (test_bit(Faulty, &rdev2->flags) ||
1989                             test_bit(Journal, &rdev2->flags) ||
1990                             rdev2->raid_disk == -1)
1991                                 continue;
1992                         if (rdev->bdev->bd_contains ==
1993                             rdev2->bdev->bd_contains) {
1994                                 rcu_read_unlock();
1995                                 return 1;
1996                         }
1997                 }
1998         }
1999         rcu_read_unlock();
2000         return 0;
2001 }
2002
2003 static LIST_HEAD(pending_raid_disks);
2004
2005 /*
2006  * Try to register data integrity profile for an mddev
2007  *
2008  * This is called when an array is started and after a disk has been kicked
2009  * from the array. It only succeeds if all working and active component devices
2010  * are integrity capable with matching profiles.
2011  */
2012 int md_integrity_register(struct mddev *mddev)
2013 {
2014         struct md_rdev *rdev, *reference = NULL;
2015
2016         if (list_empty(&mddev->disks))
2017                 return 0; /* nothing to do */
2018         if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
2019                 return 0; /* shouldn't register, or already is */
2020         rdev_for_each(rdev, mddev) {
2021                 /* skip spares and non-functional disks */
2022                 if (test_bit(Faulty, &rdev->flags))
2023                         continue;
2024                 if (rdev->raid_disk < 0)
2025                         continue;
2026                 if (!reference) {
2027                         /* Use the first rdev as the reference */
2028                         reference = rdev;
2029                         continue;
2030                 }
2031                 /* does this rdev's profile match the reference profile? */
2032                 if (blk_integrity_compare(reference->bdev->bd_disk,
2033                                 rdev->bdev->bd_disk) < 0)
2034                         return -EINVAL;
2035         }
2036         if (!reference || !bdev_get_integrity(reference->bdev))
2037                 return 0;
2038         /*
2039          * All component devices are integrity capable and have matching
2040          * profiles, register the common profile for the md device.
2041          */
2042         blk_integrity_register(mddev->gendisk,
2043                                bdev_get_integrity(reference->bdev));
2044
2045         pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
2046         if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) {
2047                 pr_err("md: failed to create integrity pool for %s\n",
2048                        mdname(mddev));
2049                 return -EINVAL;
2050         }
2051         return 0;
2052 }
2053 EXPORT_SYMBOL(md_integrity_register);
2054
2055 /*
2056  * Attempt to add an rdev, but only if it is consistent with the current
2057  * integrity profile
2058  */
2059 int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
2060 {
2061         struct blk_integrity *bi_rdev;
2062         struct blk_integrity *bi_mddev;
2063         char name[BDEVNAME_SIZE];
2064
2065         if (!mddev->gendisk)
2066                 return 0;
2067
2068         bi_rdev = bdev_get_integrity(rdev->bdev);
2069         bi_mddev = blk_get_integrity(mddev->gendisk);
2070
2071         if (!bi_mddev) /* nothing to do */
2072                 return 0;
2073
2074         if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) {
2075                 pr_err("%s: incompatible integrity profile for %s\n",
2076                        mdname(mddev), bdevname(rdev->bdev, name));
2077                 return -ENXIO;
2078         }
2079
2080         return 0;
2081 }
2082 EXPORT_SYMBOL(md_integrity_add_rdev);
2083
2084 static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
2085 {
2086         char b[BDEVNAME_SIZE];
2087         struct kobject *ko;
2088         int err;
2089
2090         /* prevent duplicates */
2091         if (find_rdev(mddev, rdev->bdev->bd_dev))
2092                 return -EEXIST;
2093
2094         /* make sure rdev->sectors exceeds mddev->dev_sectors */
2095         if (!test_bit(Journal, &rdev->flags) &&
2096             rdev->sectors &&
2097             (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) {
2098                 if (mddev->pers) {
2099                         /* Cannot change size, so fail
2100                          * If mddev->level <= 0, then we don't care
2101                          * about aligning sizes (e.g. linear)
2102                          */
2103                         if (mddev->level > 0)
2104                                 return -ENOSPC;
2105                 } else
2106                         mddev->dev_sectors = rdev->sectors;
2107         }
2108
2109         /* Verify rdev->desc_nr is unique.
2110          * If it is -1, assign a free number; otherwise
2111          * check that the number is not already in use.
2112          */
2113         rcu_read_lock();
2114         if (rdev->desc_nr < 0) {
2115                 int choice = 0;
2116                 if (mddev->pers)
2117                         choice = mddev->raid_disks;
2118                 while (md_find_rdev_nr_rcu(mddev, choice))
2119                         choice++;
2120                 rdev->desc_nr = choice;
2121         } else {
2122                 if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
2123                         rcu_read_unlock();
2124                         return -EBUSY;
2125                 }
2126         }
2127         rcu_read_unlock();
2128         if (!test_bit(Journal, &rdev->flags) &&
2129             mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
2130                 pr_warn("md: %s: array is limited to %d devices\n",
2131                         mdname(mddev), mddev->max_disks);
2132                 return -EBUSY;
2133         }
2134         bdevname(rdev->bdev,b);
2135         strreplace(b, '/', '!');
2136
2137         rdev->mddev = mddev;
2138         pr_debug("md: bind<%s>\n", b);
2139
2140         if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
2141                 goto fail;
2142
2143         ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
2144         if (sysfs_create_link(&rdev->kobj, ko, "block"))
2145                 /* failure here is OK */;
2146         rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
2147
2148         list_add_rcu(&rdev->same_set, &mddev->disks);
2149         bd_link_disk_holder(rdev->bdev, mddev->gendisk);
2150
2151         /* May as well allow recovery to be retried once */
2152         mddev->recovery_disabled++;
2153
2154         return 0;
2155
2156  fail:
2157         pr_warn("md: failed to register dev-%s for %s\n",
2158                 b, mdname(mddev));
2159         return err;
2160 }
2161
2162 static void md_delayed_delete(struct work_struct *ws)
2163 {
2164         struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
2165         kobject_del(&rdev->kobj);
2166         kobject_put(&rdev->kobj);
2167 }
2168
2169 static void unbind_rdev_from_array(struct md_rdev *rdev)
2170 {
2171         char b[BDEVNAME_SIZE];
2172
2173         bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
2174         list_del_rcu(&rdev->same_set);
2175         pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b));
2176         rdev->mddev = NULL;
2177         sysfs_remove_link(&rdev->kobj, "block");
2178         sysfs_put(rdev->sysfs_state);
2179         rdev->sysfs_state = NULL;
2180         rdev->badblocks.count = 0;
2181         /* We need to delay this, otherwise we can deadlock when
2182          * writing 'remove' to "dev/state".  We also need
2183          * to delay it due to rcu usage.
2184          */
2185         synchronize_rcu();
2186         INIT_WORK(&rdev->del_work, md_delayed_delete);
2187         kobject_get(&rdev->kobj);
2188         queue_work(md_misc_wq, &rdev->del_work);
2189 }
2190
2191 /*
2192  * prevent the device from being mounted, repartitioned or
2193  * otherwise reused by a RAID array (or any other kernel
2194  * subsystem), by bd_claiming the device.
2195  */
2196 static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared)
2197 {
2198         int err = 0;
2199         struct block_device *bdev;
2200         char b[BDEVNAME_SIZE];
2201
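             /* Claim the device exclusively. For a shared claim, use the
              * address of lock_rdev() as a common holder cookie so every md
              * shared claim matches; otherwise the rdev itself is the holder.
              */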
2202         bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
2203                                  shared ? (struct md_rdev *)lock_rdev : rdev);
2204         if (IS_ERR(bdev)) {
2205                 pr_warn("md: could not open %s.\n", __bdevname(dev, b));
2206                 return PTR_ERR(bdev);
2207         }
2208         rdev->bdev = bdev;
2209         return err;
2210 }
2211
2212 static void unlock_rdev(struct md_rdev *rdev)
2213 {
2214         struct block_device *bdev = rdev->bdev;
2215         rdev->bdev = NULL;
2216         blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2217 }
2218
2219 void md_autodetect_dev(dev_t dev);
2220
2221 static void export_rdev(struct md_rdev *rdev)
2222 {
2223         char b[BDEVNAME_SIZE];
2224
2225         pr_debug("md: export_rdev(%s)\n", bdevname(rdev->bdev,b));
2226         md_rdev_clear(rdev);
2227 #ifndef MODULE
2228         if (test_bit(AutoDetected, &rdev->flags))
2229                 md_autodetect_dev(rdev->bdev->bd_dev);
2230 #endif
2231         unlock_rdev(rdev);
2232         kobject_put(&rdev->kobj);
2233 }
2234
2235 void md_kick_rdev_from_array(struct md_rdev *rdev)
2236 {
2237         unbind_rdev_from_array(rdev);
2238         export_rdev(rdev);
2239 }
2240 EXPORT_SYMBOL_GPL(md_kick_rdev_from_array);
2241
2242 static void export_array(struct mddev *mddev)
2243 {
2244         struct md_rdev *rdev;
2245
2246         while (!list_empty(&mddev->disks)) {
2247                 rdev = list_first_entry(&mddev->disks, struct md_rdev,
2248                                         same_set);
2249                 md_kick_rdev_from_array(rdev);
2250         }
2251         mddev->raid_disks = 0;
2252         mddev->major_version = 0;
2253 }
2254
2255 static void sync_sbs(struct mddev *mddev, int nospares)
2256 {
2257         /* Update each superblock (in-memory image), but
2258          * if we are allowed to, skip spares which already
2259          * have the right event counter, or have one earlier
2260          * (which would mean they aren't being marked as dirty
2261          * with the rest of the array)
2262          */
2263         struct md_rdev *rdev;
2264         rdev_for_each(rdev, mddev) {
2265                 if (rdev->sb_events == mddev->events ||
2266                     (nospares &&
2267                      rdev->raid_disk < 0 &&
2268                      rdev->sb_events+1 == mddev->events)) {
2269                         /* Don't update this superblock */
2270                         rdev->sb_loaded = 2;
2271                 } else {
2272                         sync_super(mddev, rdev);
2273                         rdev->sb_loaded = 1;
2274                 }
2275         }
2276 }
2277
2278 static bool does_sb_need_changing(struct mddev *mddev)
2279 {
2280         struct md_rdev *rdev;
2281         struct mdp_superblock_1 *sb;
2282         int role;
2283
2284         /* Find a good rdev */
2285         rdev_for_each(rdev, mddev)
2286                 if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags))
2287                         break;
2288
2289         /* No good device found. */
2290         if (!rdev)
2291                 return false;
2292
2293         sb = page_address(rdev->sb_page);
2294         /* Check if a device has become faulty or a spare has become active */
2295         rdev_for_each(rdev, mddev) {
2296                 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
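                     /* Roles 0xfffd..0xffff are the journal/faulty/spare
                      * markers; anything below 0xfffd is an active data slot.
                      */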
2297                 /* Device activated? */
2298                 if (role == 0xffff && rdev->raid_disk >=0 &&
2299                     !test_bit(Faulty, &rdev->flags))
2300                         return true;
2301                 /* Device turned faulty? */
2302                 if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd))
2303                         return true;
2304         }
2305
2306         /* Check if any mddev parameters have changed */
2307         if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
2308             (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
2309             (mddev->layout != le32_to_cpu(sb->layout)) ||
2310             (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
2311             (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
2312                 return true;
2313
2314         return false;
2315 }
2316
2317 void md_update_sb(struct mddev *mddev, int force_change)
2318 {
2319         struct md_rdev *rdev;
2320         int sync_req;
2321         int nospares = 0;
2322         int any_badblocks_changed = 0;
2323         int ret = -1;
2324
2325         if (mddev->ro) {
2326                 if (force_change)
2327                         set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2328                 return;
2329         }
2330
2331 repeat:
2332         if (mddev_is_clustered(mddev)) {
2333                 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2334                         force_change = 1;
2335                 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
2336                         nospares = 1;
2337                 ret = md_cluster_ops->metadata_update_start(mddev);
2338                 /* Has someone else updated the sb? */
2339                 if (!does_sb_need_changing(mddev)) {
2340                         if (ret == 0)
2341                                 md_cluster_ops->metadata_update_cancel(mddev);
2342                         bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2343                                                          BIT(MD_SB_CHANGE_DEVS) |
2344                                                          BIT(MD_SB_CHANGE_CLEAN));
2345                         return;
2346                 }
2347         }
2348
2349         /* First make sure individual recovery_offsets are correct */
2350         rdev_for_each(rdev, mddev) {
2351                 if (rdev->raid_disk >= 0 &&
2352                     mddev->delta_disks >= 0 &&
2353                     !test_bit(Journal, &rdev->flags) &&
2354                     !test_bit(In_sync, &rdev->flags) &&
2355                     mddev->curr_resync_completed > rdev->recovery_offset)
2356                                 rdev->recovery_offset = mddev->curr_resync_completed;
2357
2358         }
2359         if (!mddev->persistent) {
2360                 clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
2361                 clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2362                 if (!mddev->external) {
2363                         clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
2364                         rdev_for_each(rdev, mddev) {
2365                                 if (rdev->badblocks.changed) {
2366                                         rdev->badblocks.changed = 0;
2367                                         ack_all_badblocks(&rdev->badblocks);
2368                                         md_error(mddev, rdev);
2369                                 }
2370                                 clear_bit(Blocked, &rdev->flags);
2371                                 clear_bit(BlockedBadBlocks, &rdev->flags);
2372                                 wake_up(&rdev->blocked_wait);
2373                         }
2374                 }
2375                 wake_up(&mddev->sb_wait);
2376                 return;
2377         }
2378
2379         spin_lock(&mddev->lock);
2380
2381         mddev->utime = ktime_get_real_seconds();
2382
2383         if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2384                 force_change = 1;
2385         if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
2386                 /* just a clean <-> dirty transition; possibly leave spares alone,
2387                  * though if the event count isn't the right even/odd value we
2388                  * will have to update the spares after all
2389                  */
2390                 nospares = 1;
2391         if (force_change)
2392                 nospares = 0;
2393         if (mddev->degraded)
2394                 /* If the array is degraded, then skipping spares is both
2395                  * dangerous and fairly pointless.
2396                  * Dangerous because a device that was removed from the array
2397                  * might have an event_count that still looks up-to-date,
2398                  * so it can be re-added without a resync.
2399                  * Pointless because if there are any spares to skip,
2400                  * then a recovery will happen and soon that array won't
2401                  * be degraded any more and the spare can go back to sleep then.
2402                  */
2403                 nospares = 0;
2404
2405         sync_req = mddev->in_sync;
2406
2407         /* If this is just a dirty<->clean transition, and the array is clean
2408          * and 'events' is odd, we can roll back to the previous clean state */
2409         if (nospares
2410             && (mddev->in_sync && mddev->recovery_cp == MaxSector)
2411             && mddev->can_decrease_events
2412             && mddev->events != 1) {
2413                 mddev->events--;
2414                 mddev->can_decrease_events = 0;
2415         } else {
2416                 /* otherwise we have to go forward and ... */
2417                 mddev->events ++;
2418                 mddev->can_decrease_events = nospares;
2419         }
2420
2421         /*
2422          * This 64-bit counter should never wrap.
2423          * Either we are in around ~1 trillion A.C., assuming
2424          * 1 reboot per second, or we have a bug...
2425          */
2426         WARN_ON(mddev->events == 0);
2427
2428         rdev_for_each(rdev, mddev) {
2429                 if (rdev->badblocks.changed)
2430                         any_badblocks_changed++;
2431                 if (test_bit(Faulty, &rdev->flags))
2432                         set_bit(FaultRecorded, &rdev->flags);
2433         }
2434
2435         sync_sbs(mddev, nospares);
2436         spin_unlock(&mddev->lock);
2437
2438         pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
2439                  mdname(mddev), mddev->in_sync);
2440
2441         if (mddev->queue)
2442                 blk_add_trace_msg(mddev->queue, "md md_update_sb");
2443 rewrite:
2444         bitmap_update_sb(mddev->bitmap);
2445         rdev_for_each(rdev, mddev) {
2446                 char b[BDEVNAME_SIZE];
2447
2448                 if (rdev->sb_loaded != 1)
2449                         continue; /* no noise on spare devices */
2450
2451                 if (!test_bit(Faulty, &rdev->flags)) {
2452                         md_super_write(mddev,rdev,
2453                                        rdev->sb_start, rdev->sb_size,
2454                                        rdev->sb_page);
2455                         pr_debug("md: (write) %s's sb offset: %llu\n",
2456                                  bdevname(rdev->bdev, b),
2457                                  (unsigned long long)rdev->sb_start);
2458                         rdev->sb_events = mddev->events;
2459                         if (rdev->badblocks.size) {
2460                                 md_super_write(mddev, rdev,
2461                                                rdev->badblocks.sector,
2462                                                rdev->badblocks.size << 9,
2463                                                rdev->bb_page);
2464                                 rdev->badblocks.size = 0;
2465                         }
2466
2467                 } else
2468                         pr_debug("md: %s (skipping faulty)\n",
2469                                  bdevname(rdev->bdev, b));
2470
2471                 if (mddev->level == LEVEL_MULTIPATH)
2472                         /* only need to write one superblock... */
2473                         break;
2474         }
2475         if (md_super_wait(mddev) < 0)
2476                 goto rewrite;
2477         /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write the superblock */
2478
2479         if (mddev_is_clustered(mddev) && ret == 0)
2480                 md_cluster_ops->metadata_update_finish(mddev);
2481
2482         if (mddev->in_sync != sync_req ||
2483             !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2484                                BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)))
2485                 /* have to write it out again */
2486                 goto repeat;
2487         wake_up(&mddev->sb_wait);
2488         if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2489                 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2490
2491         rdev_for_each(rdev, mddev) {
2492                 if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2493                         clear_bit(Blocked, &rdev->flags);
2494
2495                 if (any_badblocks_changed)
2496                         ack_all_badblocks(&rdev->badblocks);
2497                 clear_bit(BlockedBadBlocks, &rdev->flags);
2498                 wake_up(&rdev->blocked_wait);
2499         }
2500 }
2501 EXPORT_SYMBOL(md_update_sb);
2502
2503 static int add_bound_rdev(struct md_rdev *rdev)
2504 {
2505         struct mddev *mddev = rdev->mddev;
2506         int err = 0;
2507         bool add_journal = test_bit(Journal, &rdev->flags);
2508
2509         if (!mddev->pers->hot_remove_disk || add_journal) {
2510                 /* If there is hot_add_disk but no hot_remove_disk,
2511                  * then any added disks are for geometry changes
2512                  * and should be added immediately.
2513                  */
2514                 super_types[mddev->major_version].
2515                         validate_super(mddev, rdev);
2516                 if (add_journal)
2517                         mddev_suspend(mddev);
2518                 err = mddev->pers->hot_add_disk(mddev, rdev);
2519                 if (add_journal)
2520                         mddev_resume(mddev);
2521                 if (err) {
2522                         md_kick_rdev_from_array(rdev);
2523                         return err;
2524                 }
2525         }
2526         sysfs_notify_dirent_safe(rdev->sysfs_state);
2527
2528         set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2529         if (mddev->degraded)
2530                 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
2531         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2532         md_new_event(mddev);
2533         md_wakeup_thread(mddev->thread);
2534         return 0;
2535 }
2536
2537 /* words written to sysfs files may, or may not, be \n terminated.
2538  * We want to accept either case. For this we use cmd_match.
2539  */
2540 static int cmd_match(const char *cmd, const char *str)
2541 {
2542         /* See if cmd, written into a sysfs file, matches
2543          * str.  They must either be the same, or cmd can
2544          * have a trailing newline
2545          */
2546         while (*cmd && *str && *cmd == *str) {
2547                 cmd++;
2548                 str++;
2549         }
2550         if (*cmd == '\n')
2551                 cmd++;
2552         if (*str || *cmd)
2553                 return 0;
2554         return 1;
2555 }
2556
2557 struct rdev_sysfs_entry {
2558         struct attribute attr;
2559         ssize_t (*show)(struct md_rdev *, char *);
2560         ssize_t (*store)(struct md_rdev *, const char *, size_t);
2561 };
2562
2563 static ssize_t
2564 state_show(struct md_rdev *rdev, char *page)
2565 {
2566         char *sep = ",";
2567         size_t len = 0;
2568         unsigned long flags = ACCESS_ONCE(rdev->flags);
2569
2570         if (test_bit(Faulty, &flags) ||
2571             (!test_bit(ExternalBbl, &flags) &&
2572             rdev->badblocks.unacked_exist))
2573                 len += sprintf(page+len, "faulty%s", sep);
2574         if (test_bit(In_sync, &flags))
2575                 len += sprintf(page+len, "in_sync%s", sep);
2576         if (test_bit(Journal, &flags))
2577                 len += sprintf(page+len, "journal%s", sep);
2578         if (test_bit(WriteMostly, &flags))
2579                 len += sprintf(page+len, "write_mostly%s", sep);
2580         if (test_bit(Blocked, &flags) ||
2581             (rdev->badblocks.unacked_exist
2582              && !test_bit(Faulty, &flags)))
2583                 len += sprintf(page+len, "blocked%s", sep);
2584         if (!test_bit(Faulty, &flags) &&
2585             !test_bit(Journal, &flags) &&
2586             !test_bit(In_sync, &flags))
2587                 len += sprintf(page+len, "spare%s", sep);
2588         if (test_bit(WriteErrorSeen, &flags))
2589                 len += sprintf(page+len, "write_error%s", sep);
2590         if (test_bit(WantReplacement, &flags))
2591                 len += sprintf(page+len, "want_replacement%s", sep);
2592         if (test_bit(Replacement, &flags))
2593                 len += sprintf(page+len, "replacement%s", sep);
2594         if (test_bit(ExternalBbl, &flags))
2595                 len += sprintf(page+len, "external_bbl%s", sep);
2596         if (test_bit(FailFast, &flags))
2597                 len += sprintf(page+len, "failfast%s", sep);
2598
2599         if (len)
2600                 len -= strlen(sep);
2601
2602         return len+sprintf(page+len, "\n");
2603 }
2604
2605 static ssize_t
2606 state_store(struct md_rdev *rdev, const char *buf, size_t len)
2607 {
2608         /* can write
2609          *  faulty  - simulates an error
2610          *  remove  - disconnects the device
2611          *  writemostly - sets write_mostly
2612          *  -writemostly - clears write_mostly
2613          *  blocked - sets the Blocked flags
2614          *  -blocked - clears the Blocked and possibly simulates an error
2615          *  insync - sets Insync provided the device isn't active
2616          *  -insync - clear Insync for a device with a slot assigned,
2617          *            so that it gets rebuilt based on bitmap
2618          *  write_error - sets WriteErrorSeen
2619          *  -write_error - clears WriteErrorSeen
2620          *  {,-}failfast - set/clear FailFast
2621          */
2622         int err = -EINVAL;
2623         if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
2624                 md_error(rdev->mddev, rdev);
2625                 if (test_bit(Faulty, &rdev->flags))
2626                         err = 0;
2627                 else
2628                         err = -EBUSY;
2629         } else if (cmd_match(buf, "remove")) {
2630                 if (rdev->mddev->pers) {
2631                         clear_bit(Blocked, &rdev->flags);
2632                         remove_and_add_spares(rdev->mddev, rdev);
2633                 }
2634                 if (rdev->raid_disk >= 0)
2635                         err = -EBUSY;
2636                 else {
2637                         struct mddev *mddev = rdev->mddev;
2638                         err = 0;
2639                         if (mddev_is_clustered(mddev))
2640                                 err = md_cluster_ops->remove_disk(mddev, rdev);
2641
2642                         if (err == 0) {
2643                                 md_kick_rdev_from_array(rdev);
2644                                 if (mddev->pers) {
2645                                         set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2646                                         md_wakeup_thread(mddev->thread);
2647                                 }
2648                                 md_new_event(mddev);
2649                         }
2650                 }
2651         } else if (cmd_match(buf, "writemostly")) {
2652                 set_bit(WriteMostly, &rdev->flags);
2653                 err = 0;
2654         } else if (cmd_match(buf, "-writemostly")) {
2655                 clear_bit(WriteMostly, &rdev->flags);
2656                 err = 0;
2657         } else if (cmd_match(buf, "blocked")) {
2658                 set_bit(Blocked, &rdev->flags);
2659                 err = 0;
2660         } else if (cmd_match(buf, "-blocked")) {
2661                 if (!test_bit(Faulty, &rdev->flags) &&
2662                     !test_bit(ExternalBbl, &rdev->flags) &&
2663                     rdev->badblocks.unacked_exist) {
2664                         /* metadata handler doesn't understand badblocks,
2665                          * so we need to fail the device
2666                          */
2667                         md_error(rdev->mddev, rdev);
2668                 }
2669                 clear_bit(Blocked, &rdev->flags);
2670                 clear_bit(BlockedBadBlocks, &rdev->flags);
2671                 wake_up(&rdev->blocked_wait);
2672                 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2673                 md_wakeup_thread(rdev->mddev->thread);
2674
2675                 err = 0;
2676         } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
2677                 set_bit(In_sync, &rdev->flags);
2678                 err = 0;
2679         } else if (cmd_match(buf, "failfast")) {
2680                 set_bit(FailFast, &rdev->flags);
2681                 err = 0;
2682         } else if (cmd_match(buf, "-failfast")) {
2683                 clear_bit(FailFast, &rdev->flags);
2684                 err = 0;
2685         } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
2686                    !test_bit(Journal, &rdev->flags)) {
2687                 if (rdev->mddev->pers == NULL) {
2688                         clear_bit(In_sync, &rdev->flags);
2689                         rdev->saved_raid_disk = rdev->raid_disk;
2690                         rdev->raid_disk = -1;
2691                         err = 0;
2692                 }
2693         } else if (cmd_match(buf, "write_error")) {
2694                 set_bit(WriteErrorSeen, &rdev->flags);
2695                 err = 0;
2696         } else if (cmd_match(buf, "-write_error")) {
2697                 clear_bit(WriteErrorSeen, &rdev->flags);
2698                 err = 0;
2699         } else if (cmd_match(buf, "want_replacement")) {
2700                 /* Any non-spare device that is not a replacement can
2701                  * become want_replacement at any time, but we then need to
2702                  * check if recovery is needed.
2703                  */
2704                 if (rdev->raid_disk >= 0 &&
2705                     !test_bit(Journal, &rdev->flags) &&
2706                     !test_bit(Replacement, &rdev->flags))
2707                         set_bit(WantReplacement, &rdev->flags);
2708                 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2709                 md_wakeup_thread(rdev->mddev->thread);
2710                 err = 0;
2711         } else if (cmd_match(buf, "-want_replacement")) {
2712                 /* Clearing 'want_replacement' is always allowed.
2713                  * Once a replacement has started it is too late, though.
2714                  */
2715                 err = 0;
2716                 clear_bit(WantReplacement, &rdev->flags);
2717         } else if (cmd_match(buf, "replacement")) {
2718                 /* Can only set a device as a replacement when array has not
2719                  * yet been started.  Once running, replacement is automatic
2720                  * from spares, or by assigning 'slot'.
2721                  */
2722                 if (rdev->mddev->pers)
2723                         err = -EBUSY;
2724                 else {
2725                         set_bit(Replacement, &rdev->flags);
2726                         err = 0;
2727                 }
2728         } else if (cmd_match(buf, "-replacement")) {
2729                 /* Similarly, can only clear Replacement before start */
2730                 if (rdev->mddev->pers)
2731                         err = -EBUSY;
2732                 else {
2733                         clear_bit(Replacement, &rdev->flags);
2734                         err = 0;
2735                 }
2736         } else if (cmd_match(buf, "re-add")) {
2737                 if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1)) {
2738                         /* clear_bit is performed _after_ all the devices
2739                          * have their local Faulty bit cleared. If any writes
2740                          * happen in the meantime in the local node, they
2741                          * will land in the local bitmap, which will be synced
2742                          * by this node eventually
2743                          */
2744                         if (!mddev_is_clustered(rdev->mddev) ||
2745                             (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) {
2746                                 clear_bit(Faulty, &rdev->flags);
2747                                 err = add_bound_rdev(rdev);
2748                         }
2749                 } else
2750                         err = -EBUSY;
2751         } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) {
2752                 set_bit(ExternalBbl, &rdev->flags);
2753                 rdev->badblocks.shift = 0;
2754                 err = 0;
2755         } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) {
2756                 clear_bit(ExternalBbl, &rdev->flags);
2757                 err = 0;
2758         }
2759         if (!err)
2760                 sysfs_notify_dirent_safe(rdev->sysfs_state);
2761         return err ? err : len;
2762 }
2763 static struct rdev_sysfs_entry rdev_state =
2764 __ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store);
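/*
 * Illustrative userspace sketch (not part of the driver): the keywords
 * parsed by state_store() above are written to the per-device 'state'
 * attribute.  The path below assumes an array md0 with member device
 * sda1.
 */
#if 0	/* userspace example only */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int rdev_set_state(const char *keyword)
{
	int fd = open("/sys/block/md0/md/dev-sda1/state", O_WRONLY);

	if (fd < 0)
		return -1;
	/* e.g. "writemostly", "-writemostly", "blocked", "want_replacement" */
	if (write(fd, keyword, strlen(keyword)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}
#endif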
2765
2766 static ssize_t
2767 errors_show(struct md_rdev *rdev, char *page)
2768 {
2769         return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
2770 }
2771
2772 static ssize_t
2773 errors_store(struct md_rdev *rdev, const char *buf, size_t len)
2774 {
2775         unsigned int n;
2776         int rv;
2777
2778         rv = kstrtouint(buf, 10, &n);
2779         if (rv < 0)
2780                 return rv;
2781         atomic_set(&rdev->corrected_errors, n);
2782         return len;
2783 }
2784 static struct rdev_sysfs_entry rdev_errors =
2785 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
2786
2787 static ssize_t
2788 slot_show(struct md_rdev *rdev, char *page)
2789 {
2790         if (test_bit(Journal, &rdev->flags))
2791                 return sprintf(page, "journal\n");
2792         else if (rdev->raid_disk < 0)
2793                 return sprintf(page, "none\n");
2794         else
2795                 return sprintf(page, "%d\n", rdev->raid_disk);
2796 }
2797
2798 static ssize_t
2799 slot_store(struct md_rdev *rdev, const char *buf, size_t len)
2800 {
2801         int slot;
2802         int err;
2803
2804         if (test_bit(Journal, &rdev->flags))
2805                 return -EBUSY;
2806         if (strncmp(buf, "none", 4)==0)
2807                 slot = -1;
2808         else {
2809                 err = kstrtouint(buf, 10, (unsigned int *)&slot);
2810                 if (err < 0)
2811                         return err;
2812         }
2813         if (rdev->mddev->pers && slot == -1) {
2814                 /* Setting 'slot' on an active array also requires
2815                  * updating the 'rd%d' link, and communicating
2816                  * with the personality via ->hot_*_disk.
2817                  * For now we only support removing
2818                  * failed/spare devices.  This normally happens automatically,
2819                  * but not when the metadata is externally managed.
2820                  */
2821                 if (rdev->raid_disk == -1)
2822                         return -EEXIST;
2823                 /* personality does all needed checks */
2824                 if (rdev->mddev->pers->hot_remove_disk == NULL)
2825                         return -EINVAL;
2826                 clear_bit(Blocked, &rdev->flags);
2827                 remove_and_add_spares(rdev->mddev, rdev);
2828                 if (rdev->raid_disk >= 0)
2829                         return -EBUSY;
2830                 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2831                 md_wakeup_thread(rdev->mddev->thread);
2832         } else if (rdev->mddev->pers) {
2833                 /* Activating a spare .. or possibly reactivating
2834                  * if we ever get bitmaps working here.
2835                  */
2836                 int err;
2837
2838                 if (rdev->raid_disk != -1)
2839                         return -EBUSY;
2840
2841                 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
2842                         return -EBUSY;
2843
2844                 if (rdev->mddev->pers->hot_add_disk == NULL)
2845                         return -EINVAL;
2846
2847                 if (slot >= rdev->mddev->raid_disks &&
2848                     slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
2849                         return -ENOSPC;
2850
2851                 rdev->raid_disk = slot;
2852                 if (test_bit(In_sync, &rdev->flags))
2853                         rdev->saved_raid_disk = slot;
2854                 else
2855                         rdev->saved_raid_disk = -1;
2856                 clear_bit(In_sync, &rdev->flags);
2857                 clear_bit(Bitmap_sync, &rdev->flags);
2858                 err = rdev->mddev->pers->
2859                         hot_add_disk(rdev->mddev, rdev);
2860                 if (err) {
2861                         rdev->raid_disk = -1;
2862                         return err;
2863                 } else
2864                         sysfs_notify_dirent_safe(rdev->sysfs_state);
2865                 if (sysfs_link_rdev(rdev->mddev, rdev))
2866                         /* failure here is OK */;
2867                 /* don't wakeup anyone, leave that to userspace. */
2868         } else {
2869                 if (slot >= rdev->mddev->raid_disks &&
2870                     slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
2871                         return -ENOSPC;
2872                 rdev->raid_disk = slot;
2873                 /* assume it is working */
2874                 clear_bit(Faulty, &rdev->flags);
2875                 clear_bit(WriteMostly, &rdev->flags);
2876                 set_bit(In_sync, &rdev->flags);
2877                 sysfs_notify_dirent_safe(rdev->sysfs_state);
2878         }
2879         return len;
2880 }
2881
2882 static struct rdev_sysfs_entry rdev_slot =
2883 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
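/*
 * Usage sketch (illustrative, paths assumed as for 'state' above):
 * writing "none" to 'slot' on a running array hot-removes a failed or
 * spare device - mainly useful when metadata is managed externally -
 * while writing a number on a running array activates the device as a
 * spare in that slot via ->hot_add_disk(), and on a stopped array it
 * simply records the role the device should take when assembled.
 */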
2884
2885 static ssize_t
2886 offset_show(struct md_rdev *rdev, char *page)
2887 {
2888         return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
2889 }
2890
2891 static ssize_t
2892 offset_store(struct md_rdev *rdev, const char *buf, size_t len)
2893 {
2894         unsigned long long offset;
2895         if (kstrtoull(buf, 10, &offset) < 0)
2896                 return -EINVAL;
2897         if (rdev->mddev->pers && rdev->raid_disk >= 0)
2898                 return -EBUSY;
2899         if (rdev->sectors && rdev->mddev->external)
2900                 /* Must set offset before size, so overlap checks
2901                  * can be sane */
2902                 return -EBUSY;
2903         rdev->data_offset = offset;
2904         rdev->new_data_offset = offset;
2905         return len;
2906 }
2907
2908 static struct rdev_sysfs_entry rdev_offset =
2909 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
2910
2911 static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
2912 {
2913         return sprintf(page, "%llu\n",
2914                        (unsigned long long)rdev->new_data_offset);
2915 }
2916
2917 static ssize_t new_offset_store(struct md_rdev *rdev,
2918                                 const char *buf, size_t len)
2919 {
2920         unsigned long long new_offset;
2921         struct mddev *mddev = rdev->mddev;
2922
2923         if (kstrtoull(buf, 10, &new_offset) < 0)
2924                 return -EINVAL;
2925
2926         if (mddev->sync_thread ||
2927             test_bit(MD_RECOVERY_RUNNING,&mddev->recovery))
2928                 return -EBUSY;
2929         if (new_offset == rdev->data_offset)
2930                 /* reset is always permitted */
2931                 ;
2932         else if (new_offset > rdev->data_offset) {
2933                 /* must not push array size beyond rdev_sectors */
2934                 if (new_offset - rdev->data_offset
2935                     + mddev->dev_sectors > rdev->sectors)
2936                         return -E2BIG;
2937         }
2938         /* Metadata worries about other space details. */
2939
2940         /* decreasing the offset is inconsistent with a backwards
2941          * reshape.
2942          */
2943         if (new_offset < rdev->data_offset &&
2944             mddev->reshape_backwards)
2945                 return -EINVAL;
2946         /* Increasing offset is inconsistent with forwards
2947          * reshape.  reshape_direction should be set to
2948          * 'backwards' first.
2949          */
2950         if (new_offset > rdev->data_offset &&
2951             !mddev->reshape_backwards)
2952                 return -EINVAL;
2953
2954         if (mddev->pers && mddev->persistent &&
2955             !super_types[mddev->major_version]
2956             .allow_new_offset(rdev, new_offset))
2957                 return -E2BIG;
2958         rdev->new_data_offset = new_offset;
2959         if (new_offset > rdev->data_offset)
2960                 mddev->reshape_backwards = 1;
2961         else if (new_offset < rdev->data_offset)
2962                 mddev->reshape_backwards = 0;
2963
2964         return len;
2965 }
2966 static struct rdev_sysfs_entry rdev_new_offset =
2967 __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
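/*
 * Worked example for new_offset_store() above: with data_offset ==
 * 2048, writing "4096" is rejected with -EINVAL unless
 * mddev->reshape_backwards is already set (e.g. via the array's
 * 'reshape_direction' attribute), writing "1024" is rejected whenever
 * it *is* set, and writing "2048" - a reset to the current offset - is
 * always permitted.
 */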
2968
2969 static ssize_t
2970 rdev_size_show(struct md_rdev *rdev, char *page)
2971 {
2972         return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
2973 }
2974
2975 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
2976 {
2977         /* check if two start/length pairs overlap */
2978         if (s1+l1 <= s2)
2979                 return 0;
2980         if (s2+l2 <= s1)
2981                 return 0;
2982         return 1;
2983 }
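/*
 * Worked example: overlaps(0, 100, 50, 100) returns 1 because the
 * ranges [0,100) and [50,150) share sectors, while
 * overlaps(0, 100, 100, 50) returns 0 since s1+l1 == s2, i.e. the
 * first range ends exactly where the second begins.
 */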
2984
2985 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
2986 {
2987         unsigned long long blocks;
2988         sector_t new;
2989
2990         if (kstrtoull(buf, 10, &blocks) < 0)
2991                 return -EINVAL;
2992
2993         if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
2994                 return -EINVAL; /* sector conversion overflow */
2995
2996         new = blocks * 2;
2997         if (new != blocks * 2)
2998                 return -EINVAL; /* unsigned long long to sector_t overflow */
2999
3000         *sectors = new;
3001         return 0;
3002 }
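/*
 * Worked example: the 'size' attribute below is in 1K blocks, so
 * strict_blocks_to_sectors("1024", &s) sets s to 2048 512-byte
 * sectors.  A value with its top bit set is rejected up front because
 * doubling it would overflow, and the "new != blocks * 2" test also
 * catches truncation when sector_t is narrower than unsigned long long.
 */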
3003
3004 static ssize_t
3005 rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3006 {
3007         struct mddev *my_mddev = rdev->mddev;
3008         sector_t oldsectors = rdev->sectors;
3009         sector_t sectors;
3010
3011         if (test_bit(Journal, &rdev->flags))
3012                 return -EBUSY;
3013         if (strict_blocks_to_sectors(buf, &sectors) < 0)
3014                 return -EINVAL;
3015         if (rdev->data_offset != rdev->new_data_offset)
3016                 return -EINVAL; /* too confusing */
3017         if (my_mddev->pers && rdev->raid_disk >= 0) {
3018                 if (my_mddev->persistent) {
3019                         sectors = super_types[my_mddev->major_version].
3020                                 rdev_size_change(rdev, sectors);
3021                         if (!sectors)
3022                                 return -EBUSY;
3023                 } else if (!sectors)
3024                         sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
3025                                 rdev->data_offset;
3026                 if (!my_mddev->pers->resize)
3027                         /* Cannot change size for RAID0 or Linear etc */
3028                         return -EINVAL;
3029         }
3030         if (sectors < my_mddev->dev_sectors)
3031                 return -EINVAL; /* component must fit device */
3032
3033         rdev->sectors = sectors;
3034         if (sectors > oldsectors && my_mddev->external) {
3035                 /* Need to check that all other rdevs with the same
3036                  * ->bdev do not overlap.  'rcu' is sufficient to walk
3037                  * the rdev lists safely.
3038                  * This check does not provide a hard guarantee, it
3039                  * just helps avoid dangerous mistakes.
3040                  */
3041                 struct mddev *mddev;
3042                 int overlap = 0;
3043                 struct list_head *tmp;
3044
3045                 rcu_read_lock();
3046                 for_each_mddev(mddev, tmp) {
3047                         struct md_rdev *rdev2;
3048
3049                         rdev_for_each(rdev2, mddev)
3050                                 if (rdev->bdev == rdev2->bdev &&
3051                                     rdev != rdev2 &&
3052                                     overlaps(rdev->data_offset, rdev->sectors,
3053                                              rdev2->data_offset,
3054                                              rdev2->sectors)) {
3055                                         overlap = 1;
3056                                         break;
3057                                 }
3058                         if (overlap) {
3059                                 mddev_put(mddev);
3060                                 break;
3061                         }
3062                 }
3063                 rcu_read_unlock();
3064                 if (overlap) {
3065                         /* Someone else could have slipped in a size
3066                          * change here, but doing so is just silly.
3067                          * We put oldsectors back because we *know* it is
3068                          * safe, and trust userspace not to race with
3069                          * itself
3070                          */
3071                         rdev->sectors = oldsectors;
3072                         return -EBUSY;
3073                 }
3074         }
3075         return len;
3076 }
3077
3078 static struct rdev_sysfs_entry rdev_size =
3079 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
3080
3081 static ssize_t recovery_start_show(struct md_rdev *rdev, char *page)
3082 {
3083         unsigned long long recovery_start = rdev->recovery_offset;
3084
3085         if (test_bit(In_sync, &rdev->flags) ||
3086             recovery_start == MaxSector)
3087                 return sprintf(page, "none\n");
3088
3089         return sprintf(page, "%llu\n", recovery_start);
3090 }
3091
3092 static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len)
3093 {
3094         unsigned long long recovery_start;
3095
3096         if (cmd_match(buf, "none"))
3097                 recovery_start = MaxSector;
3098         else if (kstrtoull(buf, 10, &recovery_start))
3099                 return -EINVAL;
3100
3101         if (rdev->mddev->pers &&
3102             rdev->raid_disk >= 0)
3103                 return -EBUSY;
3104
3105         rdev->recovery_offset = recovery_start;
3106         if (recovery_start == MaxSector)
3107                 set_bit(In_sync, &rdev->flags);
3108         else
3109                 clear_bit(In_sync, &rdev->flags);
3110         return len;
3111 }
3112
3113 static struct rdev_sysfs_entry rdev_recovery_start =
3114 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
3115
3116 /* sysfs access to the bad-blocks list.
3117  * We present two files.
3118  * 'bad_blocks' lists sector numbers and lengths of ranges that
3119  *    are recorded as bad.  The list is truncated to fit within
3120  *    the one-page limit of sysfs.
3121  *    Writing "sector length" to this file adds an acknowledged
3122  *    bad block to the list.
3123  * 'unacknowledged_bad_blocks' lists bad blocks that have not yet
3124  *    been acknowledged.  Writing to this file adds bad blocks
3125  *    without acknowledging them.  This is largely for testing.
3126  */
3127 static ssize_t bb_show(struct md_rdev *rdev, char *page)
3128 {
3129         return badblocks_show(&rdev->badblocks, page, 0);
3130 }
3131 static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len)
3132 {
3133         int rv = badblocks_store(&rdev->badblocks, page, len, 0);
3134         /* Maybe that ack was all we needed */
3135         if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
3136                 wake_up(&rdev->blocked_wait);
3137         return rv;
3138 }
3139 static struct rdev_sysfs_entry rdev_bad_blocks =
3140 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
3141
3142 static ssize_t ubb_show(struct md_rdev *rdev, char *page)
3143 {
3144         return badblocks_show(&rdev->badblocks, page, 1);
3145 }
3146 static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
3147 {
3148         return badblocks_store(&rdev->badblocks, page, len, 1);
3149 }
3150 static struct rdev_sysfs_entry rdev_unack_bad_blocks =
3151 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
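/*
 * Illustrative userspace sketch (not part of the driver): bad blocks
 * are recorded by writing "sector length" pairs to the 'bad_blocks'
 * attribute handled by bb_store() above.  The path assumes member
 * device sda1 of array md0; the example marks 16 sectors starting at
 * sector 2048 as bad.
 */
#if 0	/* userspace example only */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int mark_bad_range(void)
{
	const char *entry = "2048 16";	/* start sector and length */
	int fd = open("/sys/block/md0/md/dev-sda1/bad_blocks", O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, entry, strlen(entry)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}
#endif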
3152
3153 static ssize_t
3154 ppl_sector_show(struct md_rdev *rdev, char *page)
3155 {
3156         return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector);
3157 }
3158
3159 static ssize_t
3160 ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len)
3161 {
3162         unsigned long long sector;
3163
3164         if (kstrtoull(buf, 10, &sector) < 0)
3165                 return -EINVAL;
3166         if (sector != (sector_t)sector)
3167                 return -EINVAL;
3168
3169         if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3170             rdev->raid_disk >= 0)
3171                 return -EBUSY;
3172
3173         if (rdev->mddev->persistent) {
3174                 if (rdev->mddev->major_version == 0)
3175                         return -EINVAL;
3176                 if ((sector > rdev->sb_start &&
3177                      sector - rdev->sb_start > S16_MAX) ||
3178                     (sector < rdev->sb_start &&
3179                      rdev->sb_start - sector > -S16_MIN))
3180                         return -EINVAL;
3181                 rdev->ppl.offset = sector - rdev->sb_start;
3182         } else if (!rdev->mddev->external) {
3183                 return -EBUSY;
3184         }
3185         rdev->ppl.sector = sector;
3186         return len;
3187 }
3188
3189 static struct rdev_sysfs_entry rdev_ppl_sector =
3190 __ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store);
3191
3192 static ssize_t
3193 ppl_size_show(struct md_rdev *rdev, char *page)
3194 {
3195         return sprintf(page, "%u\n", rdev->ppl.size);
3196 }
3197
3198 static ssize_t
3199 ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3200 {
3201         unsigned int size;
3202
3203         if (kstrtouint(buf, 10, &size) < 0)
3204                 return -EINVAL;
3205
3206         if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3207             rdev->raid_disk >= 0)
3208                 return -EBUSY;
3209
3210         if (rdev->mddev->persistent) {
3211                 if (rdev->mddev->major_version == 0)
3212                         return -EINVAL;
3213                 if (size > U16_MAX)
3214                         return -EINVAL;
3215         } else if (!rdev->mddev->external) {
3216                 return -EBUSY;
3217         }
3218         rdev->ppl.size = size;
3219         return len;
3220 }
3221
3222 static struct rdev_sysfs_entry rdev_ppl_size =
3223 __ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store);
3224
3225 static struct attribute *rdev_default_attrs[] = {
3226         &rdev_state.attr,
3227         &rdev_errors.attr,
3228         &rdev_slot.attr,
3229         &rdev_offset.attr,
3230         &rdev_new_offset.attr,
3231         &rdev_size.attr,
3232         &rdev_recovery_start.attr,
3233         &rdev_bad_blocks.attr,
3234         &rdev_unack_bad_blocks.attr,
3235         &rdev_ppl_sector.attr,
3236         &rdev_ppl_size.attr,
3237         NULL,
3238 };
3239 static ssize_t
3240 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3241 {
3242         struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3243         struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3244
3245         if (!entry->show)
3246                 return -EIO;
3247         if (!rdev->mddev)
3248                 return -EBUSY;
3249         return entry->show(rdev, page);
3250 }
3251
3252 static ssize_t
3253 rdev_attr_store(struct kobject *kobj, struct attribute *attr,
3254               const char *page, size_t length)
3255 {
3256         struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3257         struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3258         ssize_t rv;
3259         struct mddev *mddev = rdev->mddev;
3260
3261         if (!entry->store)
3262                 return -EIO;
3263         if (!capable(CAP_SYS_ADMIN))
3264                 return -EACCES;
3265         rv = mddev ? mddev_lock(mddev): -EBUSY;
3266         if (!rv) {
3267                 if (rdev->mddev == NULL)
3268                         rv = -EBUSY;
3269                 else
3270                         rv = entry->store(rdev, page, length);
3271                 mddev_unlock(mddev);
3272         }
3273         return rv;
3274 }
3275
3276 static void rdev_free(struct kobject *ko)
3277 {
3278         struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj);
3279         kfree(rdev);
3280 }
3281 static const struct sysfs_ops rdev_sysfs_ops = {
3282         .show           = rdev_attr_show,
3283         .store          = rdev_attr_store,
3284 };
3285 static struct kobj_type rdev_ktype = {
3286         .release        = rdev_free,
3287         .sysfs_ops      = &rdev_sysfs_ops,
3288         .default_attrs  = rdev_default_attrs,
3289 };
3290
3291 int md_rdev_init(struct md_rdev *rdev)
3292 {
3293         rdev->desc_nr = -1;
3294         rdev->saved_raid_disk = -1;
3295         rdev->raid_disk = -1;
3296         rdev->flags = 0;
3297         rdev->data_offset = 0;
3298         rdev->new_data_offset = 0;
3299         rdev->sb_events = 0;
3300         rdev->last_read_error = 0;
3301         rdev->sb_loaded = 0;
3302         rdev->bb_page = NULL;
3303         atomic_set(&rdev->nr_pending, 0);
3304         atomic_set(&rdev->read_errors, 0);
3305         atomic_set(&rdev->corrected_errors, 0);
3306
3307         INIT_LIST_HEAD(&rdev->same_set);
3308         init_waitqueue_head(&rdev->blocked_wait);
3309
3310         /* Add space to store bad block list.
3311          * This reserves the space even on arrays where it cannot
3312          * be used - I wonder if that matters
3313          */
3314         return badblocks_init(&rdev->badblocks, 0);
3315 }
3316 EXPORT_SYMBOL_GPL(md_rdev_init);
3317 /*
3318  * Import a device. If 'super_format' >= 0, then sanity check the superblock
3319  *
3320  * mark the device faulty if:
3321  *
3322  *   - the device is nonexistent (zero size)
3323  *   - the device has no valid superblock
3324  *
3325  * a faulty rdev _never_ has rdev->sb set.
3326  */
3327 static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
3328 {
3329         char b[BDEVNAME_SIZE];
3330         int err;
3331         struct md_rdev *rdev;
3332         sector_t size;
3333
3334         rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
3335         if (!rdev)
3336                 return ERR_PTR(-ENOMEM);
3337
3338         err = md_rdev_init(rdev);
3339         if (err)
3340                 goto abort_free;
3341         err = alloc_disk_sb(rdev);
3342         if (err)
3343                 goto abort_free;
3344
3345         err = lock_rdev(rdev, newdev, super_format == -2);
3346         if (err)
3347                 goto abort_free;
3348
3349         kobject_init(&rdev->kobj, &rdev_ktype);
3350
3351         size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS;
3352         if (!size) {
3353                 pr_warn("md: %s has zero or unknown size, marking faulty!\n",
3354                         bdevname(rdev->bdev,b));
3355                 err = -EINVAL;
3356                 goto abort_free;
3357         }
3358
3359         if (super_format >= 0) {
3360                 err = super_types[super_format].
3361                         load_super(rdev, NULL, super_minor);
3362                 if (err == -EINVAL) {
3363                         pr_warn("md: %s does not have a valid v%d.%d superblock, not importing!\n",
3364                                 bdevname(rdev->bdev,b),
3365                                 super_format, super_minor);
3366                         goto abort_free;
3367                 }
3368                 if (err < 0) {
3369                         pr_warn("md: could not read %s's sb, not importing!\n",
3370                                 bdevname(rdev->bdev,b));
3371                         goto abort_free;
3372                 }
3373         }
3374
3375         return rdev;
3376
3377 abort_free:
3378         if (rdev->bdev)
3379                 unlock_rdev(rdev);
3380         md_rdev_clear(rdev);
3381         kfree(rdev);
3382         return ERR_PTR(err);
3383 }
3384
3385 /*
3386  * Check a full RAID array for plausibility
3387  */
3388
3389 static void analyze_sbs(struct mddev *mddev)
3390 {
3391         int i;
3392         struct md_rdev *rdev, *freshest, *tmp;
3393         char b[BDEVNAME_SIZE];
3394
3395         freshest = NULL;
3396         rdev_for_each_safe(rdev, tmp, mddev)
3397                 switch (super_types[mddev->major_version].
3398                         load_super(rdev, freshest, mddev->minor_version)) {
3399                 case 1:
3400                         freshest = rdev;
3401                         break;
3402                 case 0:
3403                         break;
3404                 default:
3405                         pr_warn("md: fatal superblock inconsistency in %s -- removing from array\n",
3406                                 bdevname(rdev->bdev,b));
3407                         md_kick_rdev_from_array(rdev);
3408                 }
3409
3410         super_types[mddev->major_version].
3411                 validate_super(mddev, freshest);
3412
3413         i = 0;
3414         rdev_for_each_safe(rdev, tmp, mddev) {
3415                 if (mddev->max_disks &&
3416                     (rdev->desc_nr >= mddev->max_disks ||
3417                      i > mddev->max_disks)) {
3418                         pr_warn("md: %s: %s: only %d devices permitted\n",
3419                                 mdname(mddev), bdevname(rdev->bdev, b),
3420                                 mddev->max_disks);
3421                         md_kick_rdev_from_array(rdev);
3422                         continue;
3423                 }
3424                 if (rdev != freshest) {
3425                         if (super_types[mddev->major_version].
3426                             validate_super(mddev, rdev)) {
3427                                 pr_warn("md: kicking non-fresh %s from array!\n",
3428                                         bdevname(rdev->bdev,b));
3429                                 md_kick_rdev_from_array(rdev);
3430                                 continue;
3431                         }
3432                 }
3433                 if (mddev->level == LEVEL_MULTIPATH) {
3434                         rdev->desc_nr = i++;
3435                         rdev->raid_disk = rdev->desc_nr;
3436                         set_bit(In_sync, &rdev->flags);
3437                 } else if (rdev->raid_disk >=
3438                             (mddev->raid_disks - min(0, mddev->delta_disks)) &&
3439                            !test_bit(Journal, &rdev->flags)) {
3440                         rdev->raid_disk = -1;
3441                         clear_bit(In_sync, &rdev->flags);
3442                 }
3443         }
3444 }
3445
3446 /* Read a fixed-point number.
3447  * Numbers in sysfs attributes should be in "standard" units where
3448  * possible, so time should be in seconds.
3449  * However we internally use a much smaller unit, such as
3450  * milliseconds or jiffies.
3451  * This function takes a decimal number with a possible fractional
3452  * component, and produces an integer which is the result of
3453  * multiplying that number by 10^'scale',
3454  * all without any floating-point arithmetic.
3455  */
3456 int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
3457 {
3458         unsigned long result = 0;
3459         long decimals = -1;
3460         while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
3461                 if (*cp == '.')
3462                         decimals = 0;
3463                 else if (decimals < scale) {
3464                         unsigned int value;
3465                         value = *cp - '0';
3466                         result = result * 10 + value;
3467                         if (decimals >= 0)
3468                                 decimals++;
3469                 }
3470                 cp++;
3471         }
3472         if (*cp == '\n')
3473                 cp++;
3474         if (*cp)
3475                 return -EINVAL;
3476         if (decimals < 0)
3477                 decimals = 0;
3478         while (decimals < scale) {
3479                 result *= 10;
3480                 decimals ++;
3481         }
3482         *res = result;
3483         return 0;
3484 }
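/*
 * Worked examples, with scale == 3 (i.e. converting to thousandths):
 *
 *	strict_strtoul_scaled("1.5", &res, 3)	sets res to 1500
 *	strict_strtoul_scaled("0.05", &res, 3)	sets res to 50
 *	strict_strtoul_scaled("20", &res, 3)	sets res to 20000
 *
 * Fractional digits beyond 'scale' are discarded rather than rounded,
 * and anything other than digits, a single '.', and an optional
 * trailing newline makes the function return -EINVAL.
 */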
3485
3486 static ssize_t
3487 safe_delay_show(struct mddev *mddev, char *page)
3488 {
3489         int msec = (mddev->safemode_delay*1000)/HZ;
3490         return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
3491 }
3492 static ssize_t
3493 safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
3494 {
3495         unsigned long msec;
3496
3497         if (mddev_is_clustered(mddev)) {
3498                 pr_warn("md: Safemode is disabled for clustered mode\n");
3499                 return -EINVAL;
3500         }
3501
3502         if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
3503                 return -EINVAL;
3504         if (msec == 0)
3505                 mddev->safemode_delay = 0;
3506         else {
3507                 unsigned long old_delay = mddev->safemode_delay;
3508                 unsigned long new_delay = (msec*HZ)/1000;
3509
3510                 if (new_delay == 0)
3511                         new_delay = 1;
3512                 mddev->safemode_delay = new_delay;
3513                 if (new_delay < old_delay || old_delay == 0)
3514                         mod_timer(&mddev->safemode_timer, jiffies+1);
3515         }
3516         return len;
3517 }
3518 static struct md_sysfs_entry md_safe_delay =
3519 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
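/*
 * Usage sketch: 'safe_mode_delay' is shown and parsed in seconds with
 * a millisecond fraction, so writing "0.200" gives a 200 msec delay
 * (stored as msec*HZ/1000 jiffies, rounded up to at least one jiffy),
 * while writing "0" clears the delay so the safemode timer is not
 * re-armed from here.
 */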
3520
3521 static ssize_t
3522 level_show(struct mddev *mddev, char *page)
3523 {
3524         struct md_personality *p;
3525         int ret;
3526         spin_lock(&mddev->lock);
3527         p = mddev->pers;
3528         if (p)
3529                 ret = sprintf(page, "%s\n", p->name);
3530         else if (mddev->clevel[0])
3531                 ret = sprintf(page, "%s\n", mddev->clevel);
3532         else if (mddev->level != LEVEL_NONE)
3533                 ret = sprintf(page, "%d\n", mddev->level);
3534         else
3535                 ret = 0;
3536         spin_unlock(&mddev->lock);
3537         return ret;
3538 }
3539
3540 static ssize_t
3541 level_store(struct mddev *mddev, const char *buf, size_t len)
3542 {
3543         char clevel[16];
3544         ssize_t rv;
3545         size_t slen = len;
3546         struct md_personality *pers, *oldpers;
3547         long level;
3548         void *priv, *oldpriv;
3549         struct md_rdev *rdev;
3550
3551         if (slen == 0 || slen >= sizeof(clevel))
3552                 return -EINVAL;
3553
3554         rv = mddev_lock(mddev);
3555         if (rv)
3556                 return rv;
3557
3558         if (mddev->pers == NULL) {
3559                 strncpy(mddev->clevel, buf, slen);
3560                 if (mddev->clevel[slen-1] == '\n')
3561                         slen--;
3562                 mddev->clevel[slen] = 0;
3563                 mddev->level = LEVEL_NONE;
3564                 rv = len;
3565                 goto out_unlock;
3566         }
3567         rv = -EROFS;
3568         if (mddev->ro)
3569                 goto out_unlock;
3570
3571         /* request to change the personality.  Need to ensure:
3572          *  - array is not engaged in resync/recovery/reshape
3573          *  - old personality can be suspended
3574          *  - new personality will accept the array.
3575          */
3576
3577         rv = -EBUSY;
3578         if (mddev->sync_thread ||
3579             test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3580             mddev->reshape_position != MaxSector ||
3581             mddev->sysfs_active)
3582                 goto out_unlock;
3583
3584         rv = -EINVAL;
3585         if (!mddev->pers->quiesce) {
3586                 pr_warn("md: %s: %s does not support online personality change\n",
3587                         mdname(mddev), mddev->pers->name);
3588                 goto out_unlock;
3589         }
3590
3591         /* Now find the new personality */
3592         strncpy(clevel, buf, slen);
3593         if (clevel[slen-1] == '\n')
3594                 slen--;
3595         clevel[slen] = 0;
3596         if (kstrtol(clevel, 10, &level))
3597                 level = LEVEL_NONE;
3598
3599         if (request_module("md-%s", clevel) != 0)
3600                 request_module("md-level-%s", clevel);
3601         spin_lock(&pers_lock);
3602         pers = find_pers(level, clevel);
3603         if (!pers || !try_module_get(pers->owner)) {
3604                 spin_unlock(&pers_lock);
3605                 pr_warn("md: personality %s not loaded\n", clevel);
3606                 rv = -EINVAL;
3607                 goto out_unlock;
3608         }
3609         spin_unlock(&pers_lock);
3610
3611         if (pers == mddev->pers) {
3612                 /* Nothing to do! */
3613                 module_put(pers->owner);
3614                 rv = len;
3615                 goto out_unlock;
3616         }
3617         if (!pers->takeover) {
3618                 module_put(pers->owner);
3619                 pr_warn("md: %s: %s does not support personality takeover\n",
3620                         mdname(mddev), clevel);
3621                 rv = -EINVAL;
3622                 goto out_unlock;
3623         }
3624
3625         rdev_for_each(rdev, mddev)
3626                 rdev->new_raid_disk = rdev->raid_disk;
3627
3628         /* ->takeover must set new_* and/or delta_disks
3629          * if it succeeds, and may set them when it fails.
3630          */
3631         priv = pers->takeover(mddev);
3632         if (IS_ERR(priv)) {
3633                 mddev->new_level = mddev->level;
3634                 mddev->new_layout = mddev->layout;
3635                 mddev->new_chunk_sectors = mddev->chunk_sectors;
3636                 mddev->raid_disks -= mddev->delta_disks;
3637                 mddev->delta_disks = 0;
3638                 mddev->reshape_backwards = 0;
3639                 module_put(pers->owner);
3640                 pr_warn("md: %s: %s would not accept array\n",
3641                         mdname(mddev), clevel);
3642                 rv = PTR_ERR(priv);
3643                 goto out_unlock;
3644         }
3645
3646         /* Looks like we have a winner */
3647         mddev_suspend(mddev);
3648         mddev_detach(mddev);
3649
3650         spin_lock(&mddev->lock);
3651         oldpers = mddev->pers;
3652         oldpriv = mddev->private;
3653         mddev->pers = pers;
3654         mddev->private = priv;
3655         strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
3656         mddev->level = mddev->new_level;
3657         mddev->layout = mddev->new_layout;
3658         mddev->chunk_sectors = mddev->new_chunk_sectors;
3659         mddev->delta_disks = 0;
3660         mddev->reshape_backwards = 0;
3661         mddev->degraded = 0;
3662         spin_unlock(&mddev->lock);
3663
3664         if (oldpers->sync_request == NULL &&
3665             mddev->external) {
3666                 /* We are converting from a no-redundancy array
3667                  * to an array with redundancy, and metadata is managed
3668                  * externally, so we need to be sure that writes
3669                  * won't block due to a need to transition
3670                  *      clean->dirty
3671                  * until external management is started.
3672                  */
3673                 mddev->in_sync = 0;
3674                 mddev->safemode_delay = 0;
3675                 mddev->safemode = 0;
3676         }
3677
3678         oldpers->free(mddev, oldpriv);
3679
3680         if (oldpers->sync_request == NULL &&
3681             pers->sync_request != NULL) {
3682                 /* need to add the md_redundancy_group */
3683                 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
3684                         pr_warn("md: cannot register extra attributes for %s\n",
3685                                 mdname(mddev));
3686                 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
3687         }
3688         if (oldpers->sync_request != NULL &&
3689             pers->sync_request == NULL) {
3690                 /* need to remove the md_redundancy_group */
3691                 if (mddev->to_remove == NULL)
3692                         mddev->to_remove = &md_redundancy_group;
3693         }
3694
3695         module_put(oldpers->owner);
3696
3697         rdev_for_each(rdev, mddev) {
3698                 if (rdev->raid_disk < 0)
3699                         continue;
3700                 if (rdev->new_raid_disk >= mddev->raid_disks)
3701                         rdev->new_raid_disk = -1;
3702                 if (rdev->new_raid_disk == rdev->raid_disk)
3703                         continue;
3704                 sysfs_unlink_rdev(mddev, rdev);
3705         }
3706         rdev_for_each(rdev, mddev) {
3707                 if (rdev->raid_disk < 0)
3708                         continue;
3709                 if (rdev->new_raid_disk == rdev->raid_disk)
3710                         continue;
3711                 rdev->raid_disk = rdev->new_raid_disk;
3712                 if (rdev->raid_disk < 0)
3713                         clear_bit(In_sync, &rdev->flags);
3714                 else {
3715                         if (sysfs_link_rdev(mddev, rdev))
3716                                 pr_warn("md: cannot register rd%d for %s after level change\n",
3717                                         rdev->raid_disk, mdname(mddev));
3718                 }
3719         }
3720
3721         if (pers->sync_request == NULL) {
3722                 /* this is now an array without redundancy, so
3723                  * it must always be in_sync
3724                  */
3725                 mddev->in_sync = 1;
3726                 del_timer_sync(&mddev->safemode_timer);
3727         }
3728         blk_set_stacking_limits(&mddev->queue->limits);
3729         pers->run(mddev);
3730         set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
3731         mddev_resume(mddev);
3732         if (!mddev->thread)
3733                 md_update_sb(mddev, 1);
3734         sysfs_notify(&mddev->kobj, NULL, "level");
3735         md_new_event(mddev);
3736         rv = len;
3737 out_unlock:
3738         mddev_unlock(mddev);
3739         return rv;
3740 }
3741
3742 static struct md_sysfs_entry md_level =
3743 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
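/*
 * Usage sketch (assuming the target personality supports taking over
 * the array's current layout): an online level change is requested by
 * writing the new personality name to the array-level 'level'
 * attribute, for example
 *
 *	write(fd, "raid5", 5);	- fd open on /sys/block/md0/md/level
 *
 * level_store() above then loads md-raid5 if necessary, asks the new
 * personality to adopt the array via ->takeover(), and only swaps
 * mddev->pers under mddev->lock once that succeeds.
 */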
3744
3745 static ssize_t
3746 layout_show(struct mddev *mddev, char *page)
3747 {
3748         /* just a number, not meaningful for all levels */
3749         if (mddev->reshape_position != MaxSector &&
3750             mddev->layout != mddev->new_layout)
3751                 return sprintf(page, "%d (%d)\n",
3752                                mddev->new_layout, mddev->layout);
3753         return sprintf(page, "%d\n", mddev->layout);
3754 }
3755
3756 static ssize_t
3757 layout_store(struct mddev *mddev, const char *buf, size_t len)
3758 {
3759         unsigned int n;
3760         int err;
3761
3762         err = kstrtouint(buf, 10, &n);
3763         if (err < 0)
3764                 return err;
3765         err = mddev_lock(mddev);
3766         if (err)
3767                 return err;
3768
3769         if (mddev->pers) {
3770                 if (mddev->pers->check_reshape == NULL)
3771                         err = -EBUSY;
3772                 else if (mddev->ro)
3773                         err = -EROFS;
3774                 else {
3775                         mddev->new_layout = n;
3776                         err = mddev->pers->check_reshape(mddev);
3777                         if (err)
3778                                 mddev->new_layout = mddev->layout;
3779                 }
3780         } else {
3781                 mddev->new_layout = n;
3782                 if (mddev->reshape_position == MaxSector)
3783                         mddev->layout = n;
3784         }
3785         mddev_unlock(mddev);
3786         return err ?: len;
3787 }
3788 static struct md_sysfs_entry md_layout =
3789 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
3790
3791 static ssize_t
3792 raid_disks_show(struct mddev *mddev, char *page)
3793 {
3794         if (mddev->raid_disks == 0)
3795                 return 0;
3796         if (mddev->reshape_position != MaxSector &&
3797             mddev->delta_disks != 0)
3798                 return sprintf(page, "%d (%d)\n", mddev->raid_disks,
3799                                mddev->raid_disks - mddev->delta_disks);
3800         return sprintf(page, "%d\n", mddev->raid_disks);
3801 }
3802
3803 static int update_raid_disks(struct mddev *mddev, int raid_disks);
3804
3805 static ssize_t
3806 raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
3807 {
3808         unsigned int n;
3809         int err;
3810
3811         err = kstrtouint(buf, 10, &n);
3812         if (err < 0)
3813                 return err;
3814
3815         err = mddev_lock(mddev);
3816         if (err)
3817                 return err;
3818         if (mddev->pers)
3819                 err = update_raid_disks(mddev, n);
3820         else if (mddev->reshape_position != MaxSector) {
3821                 struct md_rdev *rdev;
3822                 int olddisks = mddev->raid_disks - mddev->delta_disks;
3823
3824                 err = -EINVAL;
3825                 rdev_for_each(rdev, mddev) {
3826                         if (olddisks < n &&
3827                             rdev->data_offset < rdev->new_data_offset)
3828                                 goto out_unlock;
3829                         if (olddisks > n &&
3830                             rdev->data_offset > rdev->new_data_offset)
3831                                 goto out_unlock;
3832                 }
3833                 err = 0;
3834                 mddev->delta_disks = n - olddisks;
3835                 mddev->raid_disks = n;
3836                 mddev->reshape_backwards = (mddev->delta_disks < 0);
3837         } else
3838                 mddev->raid_disks = n;
3839 out_unlock:
3840         mddev_unlock(mddev);
3841         return err ? err : len;
3842 }
3843 static struct md_sysfs_entry md_raid_disks =
3844 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
3845
3846 static ssize_t
3847 chunk_size_show(struct mddev *mddev, char *page)
3848 {
3849         if (mddev->reshape_position != MaxSector &&
3850             mddev->chunk_sectors != mddev->new_chunk_sectors)
3851                 return sprintf(page, "%d (%d)\n",
3852                                mddev->new_chunk_sectors << 9,
3853                                mddev->chunk_sectors << 9);
3854         return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
3855 }
3856
3857 static ssize_t
3858 chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
3859 {
3860         unsigned long n;
3861         int err;
3862
3863         err = kstrtoul(buf, 10, &n);
3864         if (err < 0)
3865                 return err;
3866
3867         err = mddev_lock(mddev);
3868         if (err)
3869                 return err;
3870         if (mddev->pers) {
3871                 if (mddev->pers->check_reshape == NULL)
3872                         err = -EBUSY;
3873                 else if (mddev->ro)
3874                         err = -EROFS;
3875                 else {
3876                         mddev->new_chunk_sectors = n >> 9;
3877                         err = mddev->pers->check_reshape(mddev);
3878                         if (err)
3879                                 mddev->new_chunk_sectors = mddev->chunk_sectors;
3880                 }
3881         } else {
3882                 mddev->new_chunk_sectors = n >> 9;
3883                 if (mddev->reshape_position == MaxSector)
3884                         mddev->chunk_sectors = n >> 9;
3885         }
3886         mddev_unlock(mddev);
3887         return err ?: len;
3888 }
3889 static struct md_sysfs_entry md_chunk_size =
3890 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
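/*
 * Worked example: 'chunk_size' is exchanged in bytes but stored in
 * 512-byte sectors, so writing "524288" makes chunk_size_store() above
 * set new_chunk_sectors to 1024; on a running array the new value only
 * sticks if ->check_reshape() accepts it, and chunk_size_show() then
 * reports it back as 524288.
 */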
3891
3892 static ssize_t
3893 resync_start_show(struct mddev *mddev, char *page)
3894 {
3895         if (mddev->recovery_cp == MaxSector)
3896                 return sprintf(page, "none\n");
3897         return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
3898 }
3899
3900 static ssize_t
3901 resync_start_store(struct mddev *mddev, const char *buf, size_t len)
3902 {
3903         unsigned long long n;
3904         int err;
3905
3906         if (cmd_match(buf, "none"))
3907                 n = MaxSector;
3908         else {
3909                 err = kstrtoull(buf, 10, &n);
3910                 if (err < 0)
3911                         return err;
3912                 if (n != (sector_t)n)
3913                         return -EINVAL;
3914         }
3915
3916         err = mddev_lock(mddev);
3917         if (err)
3918                 return err;
3919         if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
3920                 err = -EBUSY;
3921
3922         if (!err) {
3923                 mddev->recovery_cp = n;
3924                 if (mddev->pers)
3925                         set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
3926         }
3927         mddev_unlock(mddev);
3928         return err ?: len;
3929 }
3930 static struct md_sysfs_entry md_resync_start =
3931 __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR,
3932                 resync_start_show, resync_start_store);
3933
3934 /*
3935  * The array state can be:
3936  *
3937  * clear
3938  *     No devices, no size, no level
3939  *     Equivalent to STOP_ARRAY ioctl
3940  * inactive
3941  *     May have some settings, but array is not active
3942  *        all IO results in error
3943  *     When written, doesn't tear down array, but just stops it
3944  * suspended (not supported yet)
3945  *     All IO requests will block. The array can be reconfigured.
3946  *     Writing this, if accepted, will block until array is quiescent
3947  * readonly
3948  *     no resync can happen.  no superblocks get written.
3949  *     write requests fail
3950  * read-auto
3951  *     like readonly, but behaves like 'clean' on a write request.
3952  *
3953  * clean - no pending writes, but otherwise active.
3954  *     When written to inactive array, starts without resync
3955  *     If a write request arrives then
3956  *       if metadata is known, mark 'dirty' and switch to 'active'.
3957  *       if not known, block and switch to write-pending
3958  *     If written to an active array that has pending writes, then fails.
3959  * active
3960  *     fully active: IO and resync can be happening.
3961  *     When written to inactive array, starts with resync
3962  *
3963  * write-pending
3964  *     clean, but writes are blocked waiting for 'active' to be written.
3965  *
3966  * active-idle
3967  *     like active, but no writes have been seen for a while (100msec).
3968  *
3969  */
3970 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
3971                    write_pending, active_idle, bad_word};
3972 static char *array_states[] = {
3973         "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
3974         "write-pending", "active-idle", NULL };
3975
3976 static int match_word(const char *word, char **list)
3977 {
3978         int n;
3979         for (n=0; list[n]; n++)
3980                 if (cmd_match(word, list[n]))
3981                         break;
3982         return n;
3983 }
3984
3985 static ssize_t
3986 array_state_show(struct mddev *mddev, char *page)
3987 {
3988         enum array_state st = inactive;
3989
3990         if (mddev->pers)
3991                 switch(mddev->ro) {
3992                 case 1:
3993                         st = readonly;
3994                         break;
3995                 case 2:
3996                         st = read_auto;
3997                         break;
3998                 case 0:
3999                         if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
4000                                 st = write_pending;
4001                         else if (mddev->in_sync)
4002                                 st = clean;
4003                         else if (mddev->safemode)
4004                                 st = active_idle;
4005                         else
4006                                 st = active;
4007                 }
4008         else {
4009                 if (list_empty(&mddev->disks) &&
4010                     mddev->raid_disks == 0 &&
4011                     mddev->dev_sectors == 0)
4012                         st = clear;
4013                 else
4014                         st = inactive;
4015         }
4016         return sprintf(page, "%s\n", array_states[st]);
4017 }
4018
4019 static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev);
4020 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev);
4021 static int do_md_run(struct mddev *mddev);
4022 static int restart_array(struct mddev *mddev);
4023
4024 static ssize_t
4025 array_state_store(struct mddev *mddev, const char *buf, size_t len)
4026 {
4027         int err;
4028         enum array_state st = match_word(buf, array_states);
4029
4030         if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) {
4031                 /* don't take reconfig_mutex when toggling between
4032                  * clean and active
4033                  */
4034                 spin_lock(&mddev->lock);
4035                 if (st == active) {
4036                         restart_array(mddev);
4037                         clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4038                         md_wakeup_thread(mddev->thread);
4039                         wake_up(&mddev->sb_wait);
4040                         err = 0;
4041                 } else /* st == clean */ {
4042                         restart_array(mddev);
4043                         if (atomic_read(&mddev->writes_pending) == 0) {
4044                                 if (mddev->in_sync == 0) {
4045                                         mddev->in_sync = 1;
4046                                         if (mddev->safemode == 1)
4047                                                 mddev->safemode = 0;
4048                                         set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
4049                                 }
4050                                 err = 0;
4051                         } else
4052                                 err = -EBUSY;
4053                 }
4054                 if (!err)
4055                         sysfs_notify_dirent_safe(mddev->sysfs_state);
4056                 spin_unlock(&mddev->lock);
4057                 return err ?: len;
4058         }
4059         err = mddev_lock(mddev);
4060         if (err)
4061                 return err;
4062         err = -EINVAL;
4063         switch(st) {
4064         case bad_word:
4065                 break;
4066         case clear:
4067                 /* stopping an active array */
4068                 err = do_md_stop(mddev, 0, NULL);
4069                 break;
4070         case inactive:
4071                 /* stopping an active array */
4072                 if (mddev->pers)
4073                         err = do_md_stop(mddev, 2, NULL);
4074                 else
4075                         err = 0; /* already inactive */
4076                 break;
4077         case suspended:
4078                 break; /* not supported yet */
4079         case readonly:
4080                 if (mddev->pers)
4081                         err = md_set_readonly(mddev, NULL);
4082                 else {
4083                         mddev->ro = 1;
4084                         set_disk_ro(mddev->gendisk, 1);
4085                         err = do_md_run(mddev);
4086                 }
4087                 break;
4088         case read_auto:
4089                 if (mddev->pers) {
4090                         if (mddev->ro == 0)
4091                                 err = md_set_readonly(mddev, NULL);
4092                         else if (mddev->ro == 1)
4093                                 err = restart_array(mddev);
4094                         if (err == 0) {
4095                                 mddev->ro = 2;
4096                                 set_disk_ro(mddev->gendisk, 0);
4097                         }
4098                 } else {
4099                         mddev->ro = 2;
4100                         err = do_md_run(mddev);
4101                 }
4102                 break;
4103         case clean:
4104                 if (mddev->pers) {
4105                         err = restart_array(mddev);
4106                         if (err)
4107                                 break;
4108                         spin_lock(&mddev->lock);
4109                         if (atomic_read(&mddev->writes_pending) == 0) {
4110                                 if (mddev->in_sync == 0) {
4111                                         mddev->in_sync = 1;
4112                                         if (mddev->safemode == 1)
4113                                                 mddev->safemode = 0;
4114                                         set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
4115                                 }
4116                                 err = 0;
4117                         } else
4118                                 err = -EBUSY;
4119                         spin_unlock(&mddev->lock);
4120                 } else
4121                         err = -EINVAL;
4122                 break;
4123         case active:
4124                 if (mddev->pers) {
4125                         err = restart_array(mddev);
4126                         if (err)
4127                                 break;
4128                         clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4129                         wake_up(&mddev->sb_wait);
4130                         err = 0;
4131                 } else {
4132                         mddev->ro = 0;
4133                         set_disk_ro(mddev->gendisk, 0);
4134                         err = do_md_run(mddev);
4135                 }
4136                 break;
4137         case write_pending:
4138         case active_idle:
4139                 /* these cannot be set */
4140                 break;
4141         }
4142
4143         if (!err) {
4144                 if (mddev->hold_active == UNTIL_IOCTL)
4145                         mddev->hold_active = 0;
4146                 sysfs_notify_dirent_safe(mddev->sysfs_state);
4147         }
4148         mddev_unlock(mddev);
4149         return err ?: len;
4150 }
4151 static struct md_sysfs_entry md_array_state =
4152 __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
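/*
 * Illustrative use of array_state (path assumes the usual sysfs location
 * /sys/block/md0/md for an array named md0; the exact user-visible strings,
 * e.g. "read-auto", come from the array_states table elsewhere in this file):
 *
 *   cat /sys/block/md0/md/array_state        # e.g. "clean" or "active"
 *   echo readonly > /sys/block/md0/md/array_state
 *   echo clean    > /sys/block/md0/md/array_state
 *
 * As the switch above shows, "clear" and "inactive" stop the array,
 * "readonly"/"read-auto" restrict writes, and the write-pending/active-idle
 * states cannot be set from userspace.
 */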
4153
4154 static ssize_t
4155 max_corrected_read_errors_show(struct mddev *mddev, char *page) {
4156         return sprintf(page, "%d\n",
4157                        atomic_read(&mddev->max_corr_read_errors));
4158 }
4159
4160 static ssize_t
4161 max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
4162 {
4163         unsigned int n;
4164         int rv;
4165
4166         rv = kstrtouint(buf, 10, &n);
4167         if (rv < 0)
4168                 return rv;
4169         atomic_set(&mddev->max_corr_read_errors, n);
4170         return len;
4171 }
4172
4173 static struct md_sysfs_entry max_corr_read_errors =
4174 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
4175         max_corrected_read_errors_store);
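/*
 * Example for max_read_errors (path assumed as above, value illustrative):
 *
 *   echo 30 > /sys/block/md0/md/max_read_errors
 *
 * The value is parsed with kstrtouint() and stored atomically; personalities
 * such as raid10 consult it when deciding whether a device has accumulated
 * too many corrected read errors.
 */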
4176
4177 static ssize_t
4178 null_show(struct mddev *mddev, char *page)
4179 {
4180         return -EINVAL;
4181 }
4182
4183 static ssize_t
4184 new_dev_store(struct mddev *mddev, const char *buf, size_t len)
4185 {
4186         /* buf must be "%d:%d" (an optional trailing newline is allowed), giving major and minor numbers */
4187         /* The new device is added to the array.
4188          * If the array has a persistent superblock, we read the
4189          * superblock to initialise info and check validity.
4190          * Otherwise, only checking done is that in bind_rdev_to_array,
4191          * which mainly checks size.
4192          */
4193         char *e;
4194         int major = simple_strtoul(buf, &e, 10);
4195         int minor;
4196         dev_t dev;
4197         struct md_rdev *rdev;
4198         int err;
4199
4200         if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
4201                 return -EINVAL;
4202         minor = simple_strtoul(e+1, &e, 10);
4203         if (*e && *e != '\n')
4204                 return -EINVAL;
4205         dev = MKDEV(major, minor);
4206         if (major != MAJOR(dev) ||
4207             minor != MINOR(dev))
4208                 return -EOVERFLOW;
4209
4210         flush_workqueue(md_misc_wq);
4211
4212         err = mddev_lock(mddev);
4213         if (err)
4214                 return err;
4215         if (mddev->persistent) {
4216                 rdev = md_import_device(dev, mddev->major_version,
4217                                         mddev->minor_version);
4218                 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
4219                         struct md_rdev *rdev0
4220                                 = list_entry(mddev->disks.next,
4221                                              struct md_rdev, same_set);
4222                         err = super_types[mddev->major_version]
4223                                 .load_super(rdev, rdev0, mddev->minor_version);
4224                         if (err < 0)
4225                                 goto out;
4226                 }
4227         } else if (mddev->external)
4228                 rdev = md_import_device(dev, -2, -1);
4229         else
4230                 rdev = md_import_device(dev, -1, -1);
4231
4232         if (IS_ERR(rdev)) {
4233                 mddev_unlock(mddev);
4234                 return PTR_ERR(rdev);
4235         }
4236         err = bind_rdev_to_array(rdev, mddev);
4237  out:
4238         if (err)
4239                 export_rdev(rdev);
4240         mddev_unlock(mddev);
4241         return err ? err : len;
4242 }
4243
4244 static struct md_sysfs_entry md_new_device =
4245 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
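/*
 * Example matching the "%d:%d" format parsed above (device numbers are
 * illustrative, path assumed to be /sys/block/md0/md):
 *
 *   echo 8:16 > /sys/block/md0/md/new_dev
 *
 * adds the block device with major 8, minor 16 (typically /dev/sdb) to the
 * array, loading its superblock first when the array uses persistent
 * metadata.
 */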
4246
4247 static ssize_t
4248 bitmap_store(struct mddev *mddev, const char *buf, size_t len)
4249 {
4250         char *end;
4251         unsigned long chunk, end_chunk;
4252         int err;
4253
4254         err = mddev_lock(mddev);
4255         if (err)
4256                 return err;
4257         if (!mddev->bitmap)
4258                 goto out;
4259         /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
4260         while (*buf) {
4261                 chunk = end_chunk = simple_strtoul(buf, &end, 0);
4262                 if (buf == end) break;
4263                 if (*end == '-') { /* range */
4264                         buf = end + 1;
4265                         end_chunk = simple_strtoul(buf, &end, 0);
4266                         if (buf == end) break;
4267                 }
4268                 if (*end && !isspace(*end)) break;
4269                 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
4270                 buf = skip_spaces(end);
4271         }
4272         bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
4273 out:
4274         mddev_unlock(mddev);
4275         return len;
4276 }
4277
4278 static struct md_sysfs_entry md_bitmap =
4279 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
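/*
 * Example for the write-only bitmap_set_bits attribute, using the
 * "<chunk> <chunk>-<chunk> ..." syntax parsed above (chunk numbers are
 * illustrative):
 *
 *   echo "100 200-220" > /sys/block/md0/md/bitmap_set_bits
 *
 * marks chunk 100 and chunks 200-220 dirty in the write-intent bitmap, so
 * those regions will be resynced on the next recovery.
 */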
4280
4281 static ssize_t
4282 size_show(struct mddev *mddev, char *page)
4283 {
4284         return sprintf(page, "%llu\n",
4285                 (unsigned long long)mddev->dev_sectors / 2);
4286 }
4287
4288 static int update_size(struct mddev *mddev, sector_t num_sectors);
4289
4290 static ssize_t
4291 size_store(struct mddev *mddev, const char *buf, size_t len)
4292 {
4293         /* If array is inactive, we can reduce the component size, but
4294          * not increase it (except from 0).
4295          * If array is active, we can try an on-line resize
4296          */
4297         sector_t sectors;
4298         int err = strict_blocks_to_sectors(buf, &sectors);
4299
4300         if (err < 0)
4301                 return err;
4302         err = mddev_lock(mddev);
4303         if (err)
4304                 return err;
4305         if (mddev->pers) {
4306                 err = update_size(mddev, sectors);
4307                 if (err == 0)
4308                         md_update_sb(mddev, 1);
4309         } else {
4310                 if (mddev->dev_sectors == 0 ||
4311                     mddev->dev_sectors > sectors)
4312                         mddev->dev_sectors = sectors;
4313                 else
4314                         err = -ENOSPC;
4315         }
4316         mddev_unlock(mddev);
4317         return err ? err : len;
4318 }
4319
4320 static struct md_sysfs_entry md_size =
4321 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
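/*
 * component_size is read and written in KiB: size_store() converts the
 * input with strict_blocks_to_sectors() and size_show() prints
 * dev_sectors / 2.  Example (sizes illustrative):
 *
 *   cat /sys/block/md0/md/component_size        # e.g. 1048576
 *   echo 2097152 > /sys/block/md0/md/component_size
 *
 * On an active array this attempts an on-line resize via update_size().
 */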
4322
4323 /* Metadata version.
4324  * This is one of
4325  *   'none' for arrays with no metadata (good luck...)
4326  *   'external' for arrays with externally managed metadata,
4327  * or N.M for internally known formats
4328  */
4329 static ssize_t
4330 metadata_show(struct mddev *mddev, char *page)
4331 {
4332         if (mddev->persistent)
4333                 return sprintf(page, "%d.%d\n",
4334                                mddev->major_version, mddev->minor_version);
4335         else if (mddev->external)
4336                 return sprintf(page, "external:%s\n", mddev->metadata_type);
4337         else
4338                 return sprintf(page, "none\n");
4339 }
4340
4341 static ssize_t
4342 metadata_store(struct mddev *mddev, const char *buf, size_t len)
4343 {
4344         int major, minor;
4345         char *e;
4346         int err;
4347         /* Changing the details of 'external' metadata is
4348          * always permitted.  Otherwise there must be
4349          * no devices attached to the array.
4350          */
4351
4352         err = mddev_lock(mddev);
4353         if (err)
4354                 return err;
4355         err = -EBUSY;
4356         if (mddev->external && strncmp(buf, "external:", 9) == 0)
4357                 ;
4358         else if (!list_empty(&mddev->disks))
4359                 goto out_unlock;
4360
4361         err = 0;
4362         if (cmd_match(buf, "none")) {
4363                 mddev->persistent = 0;
4364                 mddev->external = 0;
4365                 mddev->major_version = 0;
4366                 mddev->minor_version = 90;
4367                 goto out_unlock;
4368         }
4369         if (strncmp(buf, "external:", 9) == 0) {
4370                 size_t namelen = len-9;
4371                 if (namelen >= sizeof(mddev->metadata_type))
4372                         namelen = sizeof(mddev->metadata_type)-1;
4373                 strncpy(mddev->metadata_type, buf+9, namelen);
4374                 mddev->metadata_type[namelen] = 0;
4375                 if (namelen && mddev->metadata_type[namelen-1] == '\n')
4376                         mddev->metadata_type[--namelen] = 0;
4377                 mddev->persistent = 0;
4378                 mddev->external = 1;
4379                 mddev->major_version = 0;
4380                 mddev->minor_version = 90;
4381                 goto out_unlock;
4382         }
4383         major = simple_strtoul(buf, &e, 10);
4384         err = -EINVAL;
4385         if (e==buf || *e != '.')
4386                 goto out_unlock;
4387         buf = e+1;
4388         minor = simple_strtoul(buf, &e, 10);
4389         if (e==buf || (*e && *e != '\n') )
4390                 goto out_unlock;
4391         err = -ENOENT;
4392         if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
4393                 goto out_unlock;
4394         mddev->major_version = major;
4395         mddev->minor_version = minor;
4396         mddev->persistent = 1;
4397         mddev->external = 0;
4398         err = 0;
4399 out_unlock:
4400         mddev_unlock(mddev);
4401         return err ?: len;
4402 }
4403
4404 static struct md_sysfs_entry md_metadata =
4405 __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
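/*
 * Example values accepted by metadata_version, per the parsing above
 * ("imsm" is only an illustrative external-metadata name):
 *
 *   echo 1.2 > /sys/block/md0/md/metadata_version            # native N.M
 *   echo external:imsm > /sys/block/md0/md/metadata_version  # external
 *   echo none > /sys/block/md0/md/metadata_version           # no metadata
 *
 * Only the "external:..." form may be changed while devices are attached.
 */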
4406
4407 static ssize_t
4408 action_show(struct mddev *mddev, char *page)
4409 {
4410         char *type = "idle";
4411         unsigned long recovery = mddev->recovery;
4412         if (test_bit(MD_RECOVERY_FROZEN, &recovery))
4413                 type = "frozen";
4414         else if (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
4415             (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) {
4416                 if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
4417                         type = "reshape";
4418                 else if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
4419                         if (!test_bit(MD_RECOVERY_REQUESTED, &recovery))
4420                                 type = "resync";
4421                         else if (test_bit(MD_RECOVERY_CHECK, &recovery))
4422                                 type = "check";
4423                         else
4424                                 type = "repair";
4425                 } else if (test_bit(MD_RECOVERY_RECOVER, &recovery))
4426                         type = "recover";
4427                 else if (mddev->reshape_position != MaxSector)
4428                         type = "reshape";
4429         }
4430         return sprintf(page, "%s\n", type);
4431 }
4432
4433 static ssize_t
4434 action_store(struct mddev *mddev, const char *page, size_t len)
4435 {
4436         if (!mddev->pers || !mddev->pers->sync_request)
4437                 return -EINVAL;
4438
4439
4440         if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
4441                 if (cmd_match(page, "frozen"))
4442                         set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4443                 else
4444                         clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4445                 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
4446                     mddev_lock(mddev) == 0) {
4447                         flush_workqueue(md_misc_wq);
4448                         if (mddev->sync_thread) {
4449                                 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4450                                 md_reap_sync_thread(mddev);
4451                         }
4452                         mddev_unlock(mddev);
4453                 }
4454         } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4455                 return -EBUSY;
4456         else if (cmd_match(page, "resync"))
4457                 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4458         else if (cmd_match(page, "recover")) {
4459                 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4460                 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
4461         } else if (cmd_match(page, "reshape")) {
4462                 int err;
4463                 if (mddev->pers->start_reshape == NULL)
4464                         return -EINVAL;
4465                 err = mddev_lock(mddev);
4466                 if (!err) {
4467                         if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4468                                 err = -EBUSY;
4469                         else {
4470                                 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4471                                 err = mddev->pers->start_reshape(mddev);
4472                         }
4473                         mddev_unlock(mddev);
4474                 }
4475                 if (err)
4476                         return err;
4477                 sysfs_notify(&mddev->kobj, NULL, "degraded");
4478         } else {
4479                 if (cmd_match(page, "check"))
4480                         set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4481                 else if (!cmd_match(page, "repair"))
4482                         return -EINVAL;
4483                 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4484                 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
4485                 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4486         }
4487         if (mddev->ro == 2) {
4488                 /* A write to sync_action is enough to justify
4489                  * canceling read-auto mode
4490                  */
4491                 mddev->ro = 0;
4492                 md_wakeup_thread(mddev->sync_thread);
4493         }
4494         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4495         md_wakeup_thread(mddev->thread);
4496         sysfs_notify_dirent_safe(mddev->sysfs_action);
4497         return len;
4498 }
4499
4500 static struct md_sysfs_entry md_scan_mode =
4501 __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
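/*
 * Example for sync_action, using the commands handled in action_store()
 * above (path assumed as /sys/block/md0/md):
 *
 *   echo check  > /sys/block/md0/md/sync_action   # read-only scrub
 *   echo repair > /sys/block/md0/md/sync_action   # scrub and correct
 *   echo idle   > /sys/block/md0/md/sync_action   # stop the current action
 *   echo frozen > /sys/block/md0/md/sync_action   # block new actions
 *
 * "resync", "recover" and "reshape" are also accepted when applicable.
 */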
4502
4503 static ssize_t
4504 last_sync_action_show(struct mddev *mddev, char *page)
4505 {
4506         return sprintf(page, "%s\n", mddev->last_sync_action);
4507 }
4508
4509 static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action);
4510
4511 static ssize_t
4512 mismatch_cnt_show(struct mddev *mddev, char *page)
4513 {
4514         return sprintf(page, "%llu\n",
4515                        (unsigned long long)
4516                        atomic64_read(&mddev->resync_mismatches));
4517 }
4518
4519 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
4520
4521 static ssize_t
4522 sync_min_show(struct mddev *mddev, char *page)
4523 {
4524         return sprintf(page, "%d (%s)\n", speed_min(mddev),
4525                        mddev->sync_speed_min ? "local": "system");
4526 }
4527
4528 static ssize_t
4529 sync_min_store(struct mddev *mddev, const char *buf, size_t len)
4530 {
4531         unsigned int min;
4532         int rv;
4533
4534         if (strncmp(buf, "system", 6)==0) {
4535                 min = 0;
4536         } else {
4537                 rv = kstrtouint(buf, 10, &min);
4538                 if (rv < 0)
4539                         return rv;
4540                 if (min == 0)
4541                         return -EINVAL;
4542         }
4543         mddev->sync_speed_min = min;
4544         return len;
4545 }
4546
4547 static struct md_sysfs_entry md_sync_min =
4548 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
4549
4550 static ssize_t
4551 sync_max_show(struct mddev *mddev, char *page)
4552 {
4553         return sprintf(page, "%d (%s)\n", speed_max(mddev),
4554                        mddev->sync_speed_max ? "local": "system");
4555 }
4556
4557 static ssize_t
4558 sync_max_store(struct mddev *mddev, const char *buf, size_t len)
4559 {
4560         unsigned int max;
4561         int rv;
4562
4563         if (strncmp(buf, "system", 6)==0) {
4564                 max = 0;
4565         } else {
4566                 rv = kstrtouint(buf, 10, &max);
4567                 if (rv < 0)
4568                         return rv;
4569                 if (max == 0)
4570                         return -EINVAL;
4571         }
4572         mddev->sync_speed_max = max;
4573         return len;
4574 }
4575
4576 static struct md_sysfs_entry md_sync_max =
4577 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
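/*
 * sync_speed_min and sync_speed_max take a rate in KiB/s, or the word
 * "system" to fall back to the system-wide defaults, as parsed above
 * (values illustrative):
 *
 *   echo 50000  > /sys/block/md0/md/sync_speed_max
 *   echo system > /sys/block/md0/md/sync_speed_min
 */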
4578
4579 static ssize_t
4580 degraded_show(struct mddev *mddev, char *page)
4581 {
4582         return sprintf(page, "%d\n", mddev->degraded);
4583 }
4584 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
4585
4586 static ssize_t
4587 sync_force_parallel_show(struct mddev *mddev, char *page)
4588 {
4589         return sprintf(page, "%d\n", mddev->parallel_resync);
4590 }
4591
4592 static ssize_t
4593 sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len)
4594 {
4595         long n;
4596
4597         if (kstrtol(buf, 10, &n))
4598                 return -EINVAL;
4599
4600         if (n != 0 && n != 1)
4601                 return -EINVAL;
4602
4603         mddev->parallel_resync = n;
4604
4605         if (mddev->sync_thread)
4606                 wake_up(&resync_wait);
4607
4608         return len;
4609 }
4610
4611 /* force parallel resync, even with shared block devices */
4612 static struct md_sysfs_entry md_sync_force_parallel =
4613 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
4614        sync_force_parallel_show, sync_force_parallel_store);
4615
4616 static ssize_t
4617 sync_speed_show(struct mddev *mddev, char *page)
4618 {
4619         unsigned long resync, dt, db;
4620         if (mddev->curr_resync == 0)
4621                 return sprintf(page, "none\n");
4622         resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
4623         dt = (jiffies - mddev->resync_mark) / HZ;
4624         if (!dt) dt++;
4625         db = resync - mddev->resync_mark_cnt;
4626         return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
4627 }
4628
4629 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
4630
4631 static ssize_t
4632 sync_completed_show(struct mddev *mddev, char *page)
4633 {
4634         unsigned long long max_sectors, resync;
4635
4636         if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4637                 return sprintf(page, "none\n");
4638
4639         if (mddev->curr_resync == 1 ||
4640             mddev->curr_resync == 2)
4641                 return sprintf(page, "delayed\n");
4642
4643         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
4644             test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
4645                 max_sectors = mddev->resync_max_sectors;
4646         else
4647                 max_sectors = mddev->dev_sectors;
4648
4649         resync = mddev->curr_resync_completed;
4650         return sprintf(page, "%llu / %llu\n", resync, max_sectors);
4651 }
4652
4653 static struct md_sysfs_entry md_sync_completed =
4654         __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL);
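/*
 * The read-only attributes above (mismatch_cnt, sync_speed, sync_completed)
 * are the usual way to monitor a running resync, e.g.:
 *
 *   cat /sys/block/md0/md/sync_completed   # "12345 / 2097152" in sectors
 *   cat /sys/block/md0/md/sync_speed       # current rate in KiB/s
 */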
4655
4656 static ssize_t
4657 min_sync_show(struct mddev *mddev, char *page)
4658 {
4659         return sprintf(page, "%llu\n",
4660                        (unsigned long long)mddev->resync_min);
4661 }
4662 static ssize_t
4663 min_sync_store(struct mddev *mddev, const char *buf, size_t len)
4664 {
4665         unsigned long long min;
4666         int err;
4667
4668         if (kstrtoull(buf, 10, &min))
4669                 return -EINVAL;
4670
4671         spin_lock(&mddev->lock);
4672         err = -EINVAL;
4673         if (min > mddev->resync_max)
4674                 goto out_unlock;
4675
4676         err = -EBUSY;
4677         if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4678                 goto out_unlock;
4679
4680         /* Round down to multiple of 4K for safety */
4681         mddev->resync_min = round_down(min, 8);
4682         err = 0;
4683
4684 out_unlock:
4685         spin_unlock(&mddev->lock);
4686         return err ?: len;
4687 }
4688
4689 static struct md_sysfs_entry md_min_sync =
4690 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
4691
4692 static ssize_t
4693 max_sync_show(struct mddev *mddev, char *page)
4694 {
4695         if (mddev->resync_max == MaxSector)
4696                 return sprintf(page, "max\n");
4697         else
4698                 return sprintf(page, "%llu\n",
4699                                (unsigned long long)mddev->resync_max);
4700 }
4701 static ssize_t
4702 max_sync_store(struct mddev *mddev, const char *buf, size_t len)
4703 {
4704         int err;
4705         spin_lock(&mddev->lock);
4706         if (strncmp(buf, "max", 3) == 0)
4707                 mddev->resync_max = MaxSector;
4708         else {
4709                 unsigned long long max;
4710                 int chunk;
4711
4712                 err = -EINVAL;
4713                 if (kstrtoull(buf, 10, &max))
4714                         goto out_unlock;
4715                 if (max < mddev->resync_min)
4716                         goto out_unlock;
4717
4718                 err = -EBUSY;
4719                 if (max < mddev->resync_max &&
4720                     mddev->ro == 0 &&
4721                     test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4722                         goto out_unlock;
4723
4724                 /* Must be a multiple of chunk_size */
4725                 chunk = mddev->chunk_sectors;
4726                 if (chunk) {
4727                         sector_t temp = max;
4728
4729                         err = -EINVAL;
4730                         if (sector_div(temp, chunk))
4731                                 goto out_unlock;
4732                 }
4733                 mddev->resync_max = max;
4734         }
4735         wake_up(&mddev->recovery_wait);
4736         err = 0;
4737 out_unlock:
4738         spin_unlock(&mddev->lock);
4739         return err ?: len;
4740 }
4741
4742 static struct md_sysfs_entry md_max_sync =
4743 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
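/*
 * sync_min and sync_max bound the region (in sectors) that a resync may
 * cover; sync_min is rounded down to a 4K multiple and sync_max must be
 * chunk-aligned, as the stores above enforce.  Example (offsets
 * illustrative):
 *
 *   echo 0       > /sys/block/md0/md/sync_min
 *   echo 2097152 > /sys/block/md0/md/sync_max
 *   echo max     > /sys/block/md0/md/sync_max
 */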
4744
4745 static ssize_t
4746 suspend_lo_show(struct mddev *mddev, char *page)
4747 {
4748         return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
4749 }
4750
4751 static ssize_t
4752 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
4753 {
4754         unsigned long long old, new;
4755         int err;
4756
4757         err = kstrtoull(buf, 10, &new);
4758         if (err < 0)
4759                 return err;
4760         if (new != (sector_t)new)
4761                 return -EINVAL;
4762
4763         err = mddev_lock(mddev);
4764         if (err)
4765                 return err;
4766         err = -EINVAL;
4767         if (mddev->pers == NULL ||
4768             mddev->pers->quiesce == NULL)
4769                 goto unlock;
4770         old = mddev->suspend_lo;
4771         mddev->suspend_lo = new;
4772         if (new >= old)
4773                 /* Shrinking suspended region */
4774                 mddev->pers->quiesce(mddev, 2);
4775         else {
4776                 /* Expanding suspended region - need to wait */
4777                 mddev->pers->quiesce(mddev, 1);
4778                 mddev->pers->quiesce(mddev, 0);
4779         }
4780         err = 0;
4781 unlock:
4782         mddev_unlock(mddev);
4783         return err ?: len;
4784 }
4785 static struct md_sysfs_entry md_suspend_lo =
4786 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
4787
4788 static ssize_t
4789 suspend_hi_show(struct mddev *mddev, char *page)
4790 {
4791         return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
4792 }
4793
4794 static ssize_t
4795 suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
4796 {
4797         unsigned long long old, new;
4798         int err;
4799
4800         err = kstrtoull(buf, 10, &new);
4801         if (err < 0)
4802                 return err;
4803         if (new != (sector_t)new)
4804                 return -EINVAL;
4805
4806         err = mddev_lock(mddev);
4807         if (err)
4808                 return err;
4809         err = -EINVAL;
4810         if (mddev->pers == NULL ||
4811             mddev->pers->quiesce == NULL)
4812                 goto unlock;
4813         old = mddev->suspend_hi;
4814         mddev->suspend_hi = new;
4815         if (new <= old)
4816                 /* Shrinking suspended region */
4817                 mddev->pers->quiesce(mddev, 2);
4818         else {
4819                 /* Expanding suspended region - need to wait */
4820                 mddev->pers->quiesce(mddev, 1);
4821                 mddev->pers->quiesce(mddev, 0);
4822         }
4823         err = 0;
4824 unlock:
4825         mddev_unlock(mddev);
4826         return err ?: len;
4827 }
4828 static struct md_sysfs_entry md_suspend_hi =
4829 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
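/*
 * suspend_lo and suspend_hi describe a range of sectors in which I/O is
 * suspended, typically while an external metadata manager works on that
 * region.  Example (offsets illustrative):
 *
 *   echo 0       > /sys/block/md0/md/suspend_lo
 *   echo 1048576 > /sys/block/md0/md/suspend_hi
 *
 * Growing the region waits for in-flight I/O via ->quiesce(), as the two
 * stores above show.
 */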
4830
4831 static ssize_t
4832 reshape_position_show(struct mddev *mddev, char *page)
4833 {
4834         if (mddev->reshape_position != MaxSector)
4835                 return sprintf(page, "%llu\n",
4836                                (unsigned long long)mddev->reshape_position);
4837         strcpy(page, "none\n");
4838         return 5;
4839 }
4840
4841 static ssize_t
4842 reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
4843 {
4844         struct md_rdev *rdev;
4845         unsigned long long new;
4846         int err;
4847
4848         err = kstrtoull(buf, 10, &new);
4849         if (err < 0)
4850                 return err;
4851         if (new != (sector_t)new)
4852                 return -EINVAL;
4853         err = mddev_lock(mddev);
4854         if (err)
4855                 return err;
4856         err = -EBUSY;
4857         if (mddev->pers)
4858                 goto unlock;
4859         mddev->reshape_position = new;
4860         mddev->delta_disks = 0;
4861         mddev->reshape_backwards = 0;
4862         mddev->new_level = mddev->level;
4863         mddev->new_layout = mddev->layout;
4864         mddev->new_chunk_sectors = mddev->chunk_sectors;
4865         rdev_for_each(rdev, mddev)
4866                 rdev->new_data_offset = rdev->data_offset;
4867         err = 0;
4868 unlock:
4869         mddev_unlock(mddev);
4870         return err ?: len;
4871 }
4872
4873 static struct md_sysfs_entry md_reshape_position =
4874 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
4875        reshape_position_store);
4876
4877 static ssize_t
4878 reshape_direction_show(struct mddev *mddev, char *page)
4879 {
4880         return sprintf(page, "%s\n",
4881                        mddev->reshape_backwards ? "backwards" : "forwards");
4882 }
4883
4884 static ssize_t
4885 reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
4886 {
4887         int backwards = 0;
4888         int err;
4889
4890         if (cmd_match(buf, "forwards"))
4891                 backwards = 0;
4892         else if (cmd_match(buf, "backwards"))
4893                 backwards = 1;
4894         else
4895                 return -EINVAL;
4896         if (mddev->reshape_backwards == backwards)
4897                 return len;
4898
4899         err = mddev_lock(mddev);
4900         if (err)
4901                 return err;
4902         /* check if we are allowed to change */
4903         if (mddev->delta_disks)
4904                 err = -EBUSY;
4905         else if (mddev->persistent &&
4906             mddev->major_version == 0)
4907                 err = -EINVAL;
4908         else
4909                 mddev->reshape_backwards = backwards;
4910         mddev_unlock(mddev);
4911         return err ?: len;
4912 }
4913
4914 static struct md_sysfs_entry md_reshape_direction =
4915 __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
4916        reshape_direction_store);
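/*
 * reshape_position may only be written while the array is inactive
 * (mddev->pers == NULL), and reshape_direction only while no reshape is
 * pending (delta_disks == 0), e.g. before assembling an array whose
 * reshape was interrupted (values illustrative):
 *
 *   echo 1048576   > /sys/block/md0/md/reshape_position   # sector offset
 *   echo backwards > /sys/block/md0/md/reshape_direction
 */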
4917
4918 static ssize_t
4919 array_size_show(struct mddev *mddev, char *page)
4920 {
4921         if (mddev->external_size)
4922                 return sprintf(page, "%llu\n",
4923                                (unsigned long long)mddev->array_sectors/2);
4924         else
4925                 return sprintf(page, "default\n");
4926 }
4927
4928 static ssize_t
4929 array_size_store(struct mddev *mddev, const char *buf, size_t len)
4930 {
4931         sector_t sectors;
4932         int err;
4933
4934
4935         /* cluster raid doesn't support changing array_sectors */
4936         if (mddev_is_clustered(mddev))
4937                 return -EINVAL;
4938
4939         err = mddev_lock(mddev);
4940         if (err)
4941                 return err;
4942         if (strncmp(buf, "default", 7) == 0) {
4943                 if (mddev->pers)
4944                         sectors = mddev->pers->size(mddev, 0, 0);
4945                 else
4946                         sectors = mddev->array_sectors;
4947
4948                 mddev->external_size = 0;
4949         } else {
4950                 if (strict_blocks_to_sectors(buf, &sectors) < 0)
4951                         err = -EINVAL;
4952                 else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
4953                         err = -E2BIG;
4954                 else
4955                         mddev->external_size = 1;
4956         }
4957
4958         if (!err) {
4959                 mddev->array_sectors = sectors;
4960                 if (mddev->pers) {
4961                         set_capacity(mddev->gendisk, mddev->array_sectors);
4962                         revalidate_disk(mddev->gendisk);
4963                 }
4964         }
4965         mddev_unlock(mddev);
4966         return err ?: len;
4967 }
4968
4969 static struct md_sysfs_entry md_array_size =
4970 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
4971        array_size_store);
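/*
 * array_size accepts either "default" (let the personality compute the
 * size) or an explicit size in KiB, as handled above (size illustrative):
 *
 *   echo default  > /sys/block/md0/md/array_size
 *   echo 10485760 > /sys/block/md0/md/array_size
 */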
4972
4973 static ssize_t
4974 consistency_policy_show(struct mddev *mddev, char *page)
4975 {
4976         int ret;
4977
4978         if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
4979                 ret = sprintf(page, "journal\n");
4980         } else if (test_bit(MD_HAS_PPL, &mddev->flags)) {
4981                 ret = sprintf(page, "ppl\n");
4982         } else if (mddev->bitmap) {
4983                 ret = sprintf(page, "bitmap\n");
4984         } else if (mddev->pers) {
4985                 if (mddev->pers->sync_request)
4986                         ret = sprintf(page, "resync\n");
4987                 else
4988                         ret = sprintf(page, "none\n");
4989         } else {
4990                 ret = sprintf(page, "unknown\n");
4991         }
4992
4993         return ret;
4994 }
4995
4996 static ssize_t
4997 consistency_policy_store(struct mddev *mddev, const char *buf, size_t len)
4998 {
4999         int err = 0;
5000
5001         if (mddev->pers) {
5002                 if (mddev->pers->change_consistency_policy)
5003                         err = mddev->pers->change_consistency_policy(mddev, buf);
5004                 else
5005                         err = -EBUSY;
5006         } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) {
5007                 set_bit(MD_HAS_PPL, &mddev->flags);
5008         } else {
5009                 err = -EINVAL;
5010         }
5011
5012         return err ? err : len;
5013 }
5014
5015 static struct md_sysfs_entry md_consistency_policy =
5016 __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
5017        consistency_policy_store);
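/*
 * consistency_policy reports how the array recovers from an unclean
 * shutdown ("journal", "ppl", "bitmap", "resync" or "none").  Writes are
 * forwarded to the personality's change_consistency_policy() on an active
 * array; on an inactive externally-managed array only "ppl" can be set,
 * as the store above shows:
 *
 *   cat /sys/block/md0/md/consistency_policy
 *   echo ppl > /sys/block/md0/md/consistency_policy
 */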
5018
5019 static struct attribute *md_default_attrs[] = {
5020         &md_level.attr,
5021         &md_layout.attr,
5022         &md_raid_disks.attr,
5023         &md_chunk_size.attr,
5024         &md_size.attr,
5025         &md_resync_start.attr,
5026         &md_metadata.attr,
5027         &md_new_device.attr,
5028         &md_safe_delay.attr,
5029         &md_array_state.attr,
5030         &md_reshape_position.attr,
5031         &md_reshape_direction.attr,
5032         &md_array_size.attr,
5033         &max_corr_read_errors.attr,
5034         &md_consistency_policy.attr,
5035         NULL,
5036 };
5037
5038 static struct attribute *md_redundancy_attrs[] = {
5039         &md_scan_mode.attr,
5040         &md_last_scan_mode.attr,
5041         &md_mismatches.attr,
5042         &md_sync_min.attr,
5043         &md_sync_max.attr,
5044         &md_sync_speed.attr,
5045         &md_sync_force_parallel.attr,
5046         &md_sync_completed.attr,
5047         &md_min_sync.attr,
5048         &md_max_sync.attr,
5049         &md_suspend_lo.attr,
5050         &md_suspend_hi.attr,
5051         &md_bitmap.attr,
5052         &md_degraded.attr,
5053         NULL,
5054 };
5055 static struct attribute_group md_redundancy_group = {
5056         .name = NULL,
5057         .attrs = md_redundancy_attrs,
5058 };
5059
5060 static ssize_t
5061 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
5062 {
5063         struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
5064         struct mddev *mddev = container_of(kobj, struct mddev, kobj);
5065         ssize_t rv;
5066
5067         if (!entry->show)
5068                 return -EIO;
5069         spin_lock(&all_mddevs_lock);
5070         if (list_empty(&mddev->all_mddevs)) {
5071                 spin_unlock(&all_mddevs_lock);
5072                 return -EBUSY;
5073         }
5074         mddev_get(mddev);
5075         spin_unlock(&all_mddevs_lock);
5076
5077         rv = entry->show(mddev, page);
5078         mddev_put(mddev);
5079         return rv;
5080 }
5081
5082 static ssize_t
5083 md_attr_store(struct kobject *kobj, struct attribute *attr,
5084               const char *page, size_t length)
5085 {
5086         struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
5087         struct mddev *mddev = container_of(kobj, struct mddev, kobj);
5088         ssize_t rv;
5089
5090         if (!entry->store)
5091                 return -EIO;
5092         if (!capable(CAP_SYS_ADMIN))
5093                 return -EACCES;
5094         spin_lock(&all_mddevs_lock);
5095         if (list_empty(&mddev->all_mddevs)) {
5096                 spin_unlock(&all_mddevs_lock);
5097                 return -EBUSY;
5098         }
5099         mddev_get(mddev);
5100         spin_unlock(&all_mddevs_lock);
5101         rv = entry->store(mddev, page, length);
5102         mddev_put(mddev);
5103         return rv;
5104 }
5105
5106 static void md_free(struct kobject *ko)
5107 {
5108         struct mddev *mddev = container_of(ko, struct mddev, kobj);
5109
5110         if (mddev->sysfs_state)
5111                 sysfs_put(mddev->sysfs_state);
5112
5113         if (mddev->queue)
5114                 blk_cleanup_queue(mddev->queue);
5115         if (mddev->gendisk) {
5116                 del_gendisk(mddev->gendisk);
5117                 put_disk(mddev->gendisk);
5118         }
5119
5120         kfree(mddev);
5121 }
5122
5123 static const struct sysfs_ops md_sysfs_ops = {
5124         .show   = md_attr_show,
5125         .store  = md_attr_store,
5126 };
5127 static struct kobj_type md_ktype = {
5128         .release        = md_free,
5129         .sysfs_ops      = &md_sysfs_ops,
5130         .default_attrs  = md_default_attrs,
5131 };
5132
5133 int mdp_major = 0;
5134
5135 static void mddev_delayed_delete(struct work_struct *ws)
5136 {
5137         struct mddev *mddev = container_of(ws, struct mddev, del_work);
5138
5139         sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
5140         kobject_del(&mddev->kobj);
5141         kobject_put(&mddev->kobj);
5142 }
5143
5144 static int md_alloc(dev_t dev, char *name)
5145 {
5146         static DEFINE_MUTEX(disks_mutex);
5147         struct mddev *mddev = mddev_find(dev);
5148         struct gendisk *disk;
5149         int partitioned;
5150         int shift;
5151         int unit;
5152         int error;
5153
5154         if (!mddev)
5155                 return -ENODEV;
5156
5157         partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
5158         shift = partitioned ? MdpMinorShift : 0;
5159         unit = MINOR(mddev->unit) >> shift;
5160
5161         /* wait for any previous instance of this device to be
5162          * completely removed (mddev_delayed_delete).
5163          */
5164         flush_workqueue(md_misc_wq);
5165
5166         mutex_lock(&disks_mutex);
5167         error = -EEXIST;
5168         if (mddev->gendisk)
5169                 goto abort;
5170
5171         if (name) {
5172                 /* Need to ensure that 'name' is not a duplicate.
5173                  */
5174                 struct mddev *mddev2;
5175                 spin_lock(&all_mddevs_lock);
5176
5177                 list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
5178                         if (mddev2->gendisk &&
5179                             strcmp(mddev2->gendisk->disk_name, name) == 0) {
5180                                 spin_unlock(&all_mddevs_lock);
5181                                 goto abort;
5182                         }
5183                 spin_unlock(&all_mddevs_lock);
5184         }
5185
5186         error = -ENOMEM;
5187         mddev->queue = blk_alloc_queue(GFP_KERNEL);
5188         if (!mddev->queue)
5189                 goto abort;
5190         mddev->queue->queuedata = mddev;
5191
5192         blk_queue_make_request(mddev->queue, md_make_request);
5193         blk_set_stacking_limits(&mddev->queue->limits);
5194
5195         disk = alloc_disk(1 << shift);
5196         if (!disk) {
5197                 blk_cleanup_queue(mddev->queue);
5198                 mddev->queue = NULL;
5199                 goto abort;
5200         }
5201         disk->major = MAJOR(mddev->unit);
5202         disk->first_minor = unit << shift;
5203         if (name)
5204                 strcpy(disk->disk_name, name);
5205         else if (partitioned)
5206                 sprintf(disk->disk_name, "md_d%d", unit);
5207         else
5208                 sprintf(disk->disk_name, "md%d", unit);
5209         disk->fops = &md_fops;
5210         disk->private_data = mddev;
5211         disk->queue = mddev->queue;
5212         blk_queue_write_cache(mddev->queue, true, true);
5213         /* Allow extended partitions.  This makes the
5214          * 'mdp' device redundant, but we can't really
5215          * remove it now.
5216          */
5217         disk->flags |= GENHD_FL_EXT_DEVT;
5218         mddev->gendisk = disk;
5219         /* As soon as we call add_disk(), another thread could get
5220          * through to md_open, so make sure it doesn't get too far
5221          */
5222         mutex_lock(&mddev->open_mutex);
5223         add_disk(disk);
5224
5225         error = kobject_init_and_add(&mddev->kobj, &md_ktype,
5226                                      &disk_to_dev(disk)->kobj, "%s", "md");
5227         if (error) {
5228                 /* This isn't possible, but as kobject_init_and_add is marked
5229                  * __must_check, we must do something with the result
5230                  */
5231                 pr_debug("md: cannot register %s/md - name in use\n",
5232                          disk->disk_name);
5233                 error = 0;
5234         }
5235         if (mddev->kobj.sd &&
5236             sysfs_create_group(&mddev->kobj, &md_bitmap_group))
5237                 pr_debug("pointless warning\n");
5238         mutex_unlock(&mddev->open_mutex);
5239  abort:
5240         mutex_unlock(&disks_mutex);
5241         if (!error && mddev->kobj.sd) {
5242                 kobject_uevent(&mddev->kobj, KOBJ_ADD);
5243                 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
5244         }
5245         mddev_put(mddev);
5246         return error;
5247 }
5248
5249 static struct kobject *md_probe(dev_t dev, int *part, void *data)
5250 {
5251         md_alloc(dev, NULL);
5252         return NULL;
5253 }
5254
5255 static int add_named_array(const char *val, struct kernel_param *kp)
5256 {
5257         /* val must be "md_*" where * is not all digits.
5258          * We allocate an array with a large free minor number, and
5259          * set the name to val.  val must not already be an active name.
5260          */
5261         int len = strlen(val);
5262         char buf[DISK_NAME_LEN];
5263
5264         while (len && val[len-1] == '\n')
5265                 len--;
5266         if (len >= DISK_NAME_LEN)
5267                 return -E2BIG;
5268         strlcpy(buf, val, len+1);
5269         if (strncmp(buf, "md_", 3) != 0)
5270                 return -EINVAL;
5271         return md_alloc(0, buf);
5272 }
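/*
 * add_named_array() is the handler behind md_mod's "new_array" module
 * parameter (wired up later in this file); writing a name of the form
 * "md_*" creates an array with that name.  Illustrative usage, assuming
 * the usual module parameter path:
 *
 *   echo md_home > /sys/module/md_mod/parameters/new_array
 */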
5273
5274 static void md_safemode_timeout(unsigned long data)
5275 {
5276         struct mddev *mddev = (struct mddev *) data;
5277
5278         if (!atomic_read(&mddev->writes_pending)) {
5279                 mddev->safemode = 1;
5280                 if (mddev->external)
5281                         sysfs_notify_dirent_safe(mddev->sysfs_state);
5282         }
5283         md_wakeup_thread(mddev->thread);
5284 }
5285
5286 static int start_dirty_degraded;
5287
5288 int md_run(struct mddev *mddev)
5289 {
5290         int err;
5291         struct md_rdev *rdev;
5292         struct md_personality *pers;
5293
5294         if (list_empty(&mddev->disks))
5295                 /* cannot run an array with no devices.. */
5296                 return -EINVAL;
5297
5298         if (mddev->pers)
5299                 return -EBUSY;
5300         /* Cannot run until previous stop completes properly */
5301         if (mddev->sysfs_active)
5302                 return -EBUSY;
5303
5304         /*
5305          * Analyze all RAID superblock(s)
5306          */
5307         if (!mddev->raid_disks) {
5308                 if (!mddev->persistent)
5309                         return -EINVAL;
5310                 analyze_sbs(mddev);
5311         }
5312
5313         if (mddev->level != LEVEL_NONE)
5314                 request_module("md-level-%d", mddev->level);
5315         else if (mddev->clevel[0])
5316                 request_module("md-%s", mddev->clevel);
5317
5318         /*
5319          * Drop all container device buffers, from now on
5320          * the only valid external interface is through the md
5321          * device.
5322          */
5323         rdev_for_each(rdev, mddev) {
5324                 if (test_bit(Faulty, &rdev->flags))
5325                         continue;
5326                 sync_blockdev(rdev->bdev);
5327                 invalidate_bdev(rdev->bdev);
5328
5329                 /* perform some consistency tests on the device.
5330                  * We don't want the data to overlap the metadata;
5331                  * internal bitmap issues have been handled elsewhere.
5332                  */
5333                 if (rdev->meta_bdev) {
5334                         /* Nothing to check */;
5335                 } else if (rdev->data_offset < rdev->sb_start) {
5336                         if (mddev->dev_sectors &&
5337                             rdev->data_offset + mddev->dev_sectors
5338                             > rdev->sb_start) {
5339                                 pr_warn("md: %s: data overlaps metadata\n",
5340                                         mdname(mddev));
5341                                 return -EINVAL;
5342                         }
5343                 } else {
5344                         if (rdev->sb_start + rdev->sb_size/512
5345                             > rdev->data_offset) {
5346                                 pr_warn("md: %s: metadata overlaps data\n",
5347                                         mdname(mddev));
5348                                 return -EINVAL;
5349                         }
5350                 }
5351                 sysfs_notify_dirent_safe(rdev->sysfs_state);
5352         }
5353
5354         if (mddev->bio_set == NULL) {
5355                 mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0);
5356                 if (!mddev->bio_set)
5357                         return -ENOMEM;
5358         }
5359
5360         spin_lock(&pers_lock);
5361         pers = find_pers(mddev->level, mddev->clevel);
5362         if (!pers || !try_module_get(pers->owner)) {
5363                 spin_unlock(&pers_lock);
5364                 if (mddev->level != LEVEL_NONE)
5365                         pr_warn("md: personality for level %d is not loaded!\n",
5366                                 mddev->level);
5367                 else
5368                         pr_warn("md: personality for level %s is not loaded!\n",
5369                                 mddev->clevel);
5370                 return -EINVAL;
5371         }
5372         spin_unlock(&pers_lock);
5373         if (mddev->level != pers->level) {
5374                 mddev->level = pers->level;
5375                 mddev->new_level = pers->level;
5376         }
5377         strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
5378
5379         if (mddev->reshape_position != MaxSector &&
5380             pers->start_reshape == NULL) {
5381                 /* This personality cannot handle reshaping... */
5382                 module_put(pers->owner);
5383                 return -EINVAL;
5384         }
5385
5386         if (pers->sync_request) {
5387                 /* Warn if this is a potentially silly
5388                  * configuration.
5389                  */
5390                 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
5391                 struct md_rdev *rdev2;
5392                 int warned = 0;
5393
5394                 rdev_for_each(rdev, mddev)
5395                         rdev_for_each(rdev2, mddev) {
5396                                 if (rdev < rdev2 &&
5397                                     rdev->bdev->bd_contains ==
5398                                     rdev2->bdev->bd_contains) {
5399                                         pr_warn("%s: WARNING: %s appears to be on the same physical disk as %s.\n",
5400                                                 mdname(mddev),
5401                                                 bdevname(rdev->bdev,b),
5402                                                 bdevname(rdev2->bdev,b2));
5403                                         warned = 1;
5404                                 }
5405                         }
5406
5407                 if (warned)
5408                         pr_warn("True protection against single-disk failure might be compromised.\n");
5409         }
5410
5411         mddev->recovery = 0;
5412         /* may be overridden by the personality */
5413         mddev->resync_max_sectors = mddev->dev_sectors;
5414
5415         mddev->ok_start_degraded = start_dirty_degraded;
5416
5417         if (start_readonly && mddev->ro == 0)
5418                 mddev->ro = 2; /* read-only, but switch on first write */
5419
5420         /*
5421          * NOTE: some pers->run(), for example r5l_recovery_log(), wakes
5422          * up mddev->thread. It is important to initialize critical
5423          * resources for mddev->thread BEFORE calling pers->run().
5424          */
5425         err = pers->run(mddev);
5426         if (err)
5427                 pr_warn("md: pers->run() failed ...\n");
5428         else if (pers->size(mddev, 0, 0) < mddev->array_sectors) {
5429                 WARN_ONCE(!mddev->external_size,
5430                           "%s: default size too small, but 'external_size' not in effect?\n",
5431                           __func__);
5432                 pr_warn("md: invalid array_size %llu > default size %llu\n",
5433                         (unsigned long long)mddev->array_sectors / 2,
5434                         (unsigned long long)pers->size(mddev, 0, 0) / 2);
5435                 err = -EINVAL;
5436         }
5437         if (err == 0 && pers->sync_request &&
5438             (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
5439                 struct bitmap *bitmap;
5440
5441                 bitmap = bitmap_create(mddev, -1);
5442                 if (IS_ERR(bitmap)) {
5443                         err = PTR_ERR(bitmap);
5444                         pr_warn("%s: failed to create bitmap (%d)\n",
5445                                 mdname(mddev), err);
5446                 } else
5447                         mddev->bitmap = bitmap;
5448
5449         }
5450         if (err) {
5451                 mddev_detach(mddev);
5452                 if (mddev->private)
5453                         pers->free(mddev, mddev->private);
5454                 mddev->private = NULL;
5455                 module_put(pers->owner);
5456                 bitmap_destroy(mddev);
5457                 return err;
5458         }
5459         if (mddev->queue) {
5460                 bool nonrot = true;
5461
5462                 rdev_for_each(rdev, mddev) {
5463                         if (rdev->raid_disk >= 0 &&
5464                             !blk_queue_nonrot(bdev_get_queue(rdev->bdev))) {
5465                                 nonrot = false;
5466                                 break;
5467                         }
5468                 }
5469                 if (mddev->degraded)
5470                         nonrot = false;
5471                 if (nonrot)
5472                         queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mddev->queue);
5473                 else
5474                         queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, mddev->queue);
5475                 mddev->queue->backing_dev_info->congested_data = mddev;
5476                 mddev->queue->backing_dev_info->congested_fn = md_congested;
5477         }
5478         if (pers->sync_request) {
5479                 if (mddev->kobj.sd &&
5480                     sysfs_create_group(&mddev->kobj, &md_redundancy_group))
5481                         pr_warn("md: cannot register extra attributes for %s\n",
5482                                 mdname(mddev));
5483                 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
5484         } else if (mddev->ro == 2) /* auto-readonly not meaningful */
5485                 mddev->ro = 0;
5486
5487         atomic_set(&mddev->writes_pending,0);
5488         atomic_set(&mddev->max_corr_read_errors,
5489                    MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
5490         mddev->safemode = 0;
5491         if (mddev_is_clustered(mddev))
5492                 mddev->safemode_delay = 0;
5493         else
5494                 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
5495         mddev->in_sync = 1;
5496         smp_wmb();
5497         spin_lock(&mddev->lock);
5498         mddev->pers = pers;
5499         spin_unlock(&mddev->lock);
5500         rdev_for_each(rdev, mddev)
5501                 if (rdev->raid_disk >= 0)
5502                         if (sysfs_link_rdev(mddev, rdev))
5503                                 /* failure here is OK */;
5504
5505         if (mddev->degraded && !mddev->ro)
5506                 /* This ensures that recovering status is reported immediately
5507                  * via sysfs - until a lack of spares is confirmed.
5508                  */
5509                 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5510         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5511
5512         if (mddev->sb_flags)
5513                 md_update_sb(mddev, 0);
5514
5515         md_new_event(mddev);
5516         sysfs_notify_dirent_safe(mddev->sysfs_state);
5517         sysfs_notify_dirent_safe(mddev->sysfs_action);
5518         sysfs_notify(&mddev->kobj, NULL, "degraded");
5519         return 0;
5520 }
5521 EXPORT_SYMBOL_GPL(md_run);
5522
5523 static int do_md_run(struct mddev *mddev)
5524 {
5525         int err;
5526
5527         err = md_run(mddev);
5528         if (err)
5529                 goto out;
5530         err = bitmap_load(mddev);
5531         if (err) {
5532                 bitmap_destroy(mddev);
5533                 goto out;
5534         }
5535
5536         if (mddev_is_clustered(mddev))
5537                 md_allow_write(mddev);
5538
5539         md_wakeup_thread(mddev->thread);
5540         md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
5541
5542         set_capacity(mddev->gendisk, mddev->array_sectors);
5543         revalidate_disk(mddev->gendisk);
5544         mddev->changed = 1;
5545         kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
5546 out:
5547         return err;
5548 }
5549
5550 static int restart_array(struct mddev *mddev)
5551 {
5552         struct gendisk *disk = mddev->gendisk;
5553
5554         /* Complain if it has no devices */
5555         if (list_empty(&mddev->disks))
5556                 return -ENXIO;
5557         if (!mddev->pers)
5558                 return -EINVAL;
5559         if (!mddev->ro)
5560                 return -EBUSY;
5561         if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
5562                 struct md_rdev *rdev;
5563                 bool has_journal = false;
5564
5565                 rcu_read_lock();
5566                 rdev_for_each_rcu(rdev, mddev) {
5567                         if (test_bit(Journal, &rdev->flags) &&
5568                             !test_bit(Faulty, &rdev->flags)) {
5569                                 has_journal = true;
5570                                 break;
5571                         }
5572                 }
5573                 rcu_read_unlock();
5574
5575                 /* Don't restart rw with journal missing/faulty */
5576                 if (!has_journal)
5577                         return -EINVAL;
5578         }
5579
5580         mddev->safemode = 0;
5581         mddev->ro = 0;
5582         set_disk_ro(disk, 0);
5583         pr_debug("md: %s switched to read-write mode.\n", mdname(mddev));
5584         /* Kick recovery or resync if necessary */
5585         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5586         md_wakeup_thread(mddev->thread);
5587         md_wakeup_thread(mddev->sync_thread);
5588         sysfs_notify_dirent_safe(mddev->sysfs_state);
5589         return 0;
5590 }
5591
5592 static void md_clean(struct mddev *mddev)
5593 {
5594         mddev->array_sectors = 0;
5595         mddev->external_size = 0;
5596         mddev->dev_sectors = 0;
5597         mddev->raid_disks = 0;
5598         mddev->recovery_cp = 0;
5599         mddev->resync_min = 0;
5600         mddev->resync_max = MaxSector;
5601         mddev->reshape_position = MaxSector;
5602         mddev->external = 0;
5603         mddev->persistent = 0;
5604         mddev->level = LEVEL_NONE;
5605         mddev->clevel[0] = 0;
5606         mddev->flags = 0;
5607         mddev->sb_flags = 0;
5608         mddev->ro = 0;
5609         mddev->metadata_type[0] = 0;
5610         mddev->chunk_sectors = 0;
5611         mddev->ctime = mddev->utime = 0;
5612         mddev->layout = 0;
5613         mddev->max_disks = 0;
5614         mddev->events = 0;
5615         mddev->can_decrease_events = 0;
5616         mddev->delta_disks = 0;
5617         mddev->reshape_backwards = 0;
5618         mddev->new_level = LEVEL_NONE;
5619         mddev->new_layout = 0;
5620         mddev->new_chunk_sectors = 0;
5621         mddev->curr_resync = 0;
5622         atomic64_set(&mddev->resync_mismatches, 0);
5623         mddev->suspend_lo = mddev->suspend_hi = 0;
5624         mddev->sync_speed_min = mddev->sync_speed_max = 0;
5625         mddev->recovery = 0;
5626         mddev->in_sync = 0;
5627         mddev->changed = 0;
5628         mddev->degraded = 0;
5629         mddev->safemode = 0;
5630         mddev->private = NULL;
5631         mddev->cluster_info = NULL;
5632         mddev->bitmap_info.offset = 0;
5633         mddev->bitmap_info.default_offset = 0;
5634         mddev->bitmap_info.default_space = 0;
5635         mddev->bitmap_info.chunksize = 0;
5636         mddev->bitmap_info.daemon_sleep = 0;
5637         mddev->bitmap_info.max_write_behind = 0;
5638         mddev->bitmap_info.nodes = 0;
5639 }
5640
5641 static void __md_stop_writes(struct mddev *mddev)
5642 {
5643         set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5644         flush_workqueue(md_misc_wq);
5645         if (mddev->sync_thread) {
5646                 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5647                 md_reap_sync_thread(mddev);
5648         }
5649
5650         del_timer_sync(&mddev->safemode_timer);
5651
5652         if (mddev->pers && mddev->pers->quiesce) {
5653                 mddev->pers->quiesce(mddev, 1);
5654                 mddev->pers->quiesce(mddev, 0);
5655         }
5656         bitmap_flush(mddev);
5657
5658         if (mddev->ro == 0 &&
5659             ((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
5660              mddev->sb_flags)) {
5661                 /* mark array as shutdown cleanly */
5662                 if (!mddev_is_clustered(mddev))
5663                         mddev->in_sync = 1;
5664                 md_update_sb(mddev, 1);
5665         }
5666 }
5667
5668 void md_stop_writes(struct mddev *mddev)
5669 {
5670         mddev_lock_nointr(mddev);
5671         __md_stop_writes(mddev);
5672         mddev_unlock(mddev);
5673 }
5674 EXPORT_SYMBOL_GPL(md_stop_writes);
5675
5676 static void mddev_detach(struct mddev *mddev)
5677 {
5678         bitmap_wait_behind_writes(mddev);
5679         if (mddev->pers && mddev->pers->quiesce) {
5680                 mddev->pers->quiesce(mddev, 1);
5681                 mddev->pers->quiesce(mddev, 0);
5682         }
5683         md_unregister_thread(&mddev->thread);
5684         if (mddev->queue)
5685                 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
5686 }
5687
5688 static void __md_stop(struct mddev *mddev)
5689 {
5690         struct md_personality *pers = mddev->pers;
5691         bitmap_destroy(mddev);
5692         mddev_detach(mddev);
5693         /* Ensure ->event_work is done */
5694         flush_workqueue(md_misc_wq);
5695         spin_lock(&mddev->lock);
5696         mddev->pers = NULL;
5697         spin_unlock(&mddev->lock);
5698         pers->free(mddev, mddev->private);
5699         mddev->private = NULL;
5700         if (pers->sync_request && mddev->to_remove == NULL)
5701                 mddev->to_remove = &md_redundancy_group;
5702         module_put(pers->owner);
5703         clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5704 }
5705
5706 void md_stop(struct mddev *mddev)
5707 {
5708         /* stop the array and free any attached data structures.
5709          * This is called from dm-raid.
5710          */
5711         __md_stop(mddev);
5712         if (mddev->bio_set)
5713                 bioset_free(mddev->bio_set);
5714 }
5715
5716 EXPORT_SYMBOL_GPL(md_stop);
5717
5718 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
5719 {
5720         int err = 0;
5721         int did_freeze = 0;
5722
5723         if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
5724                 did_freeze = 1;
5725                 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5726                 md_wakeup_thread(mddev->thread);
5727         }
5728         if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5729                 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5730         if (mddev->sync_thread)
5731                 /* Thread might be blocked waiting for metadata update
5732                  * which will now never happen */
5733                 wake_up_process(mddev->sync_thread->tsk);
5734
5735         if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
5736                 return -EBUSY;
5737         mddev_unlock(mddev);
5738         wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING,
5739                                           &mddev->recovery));
5740         wait_event(mddev->sb_wait,
5741                    !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
5742         mddev_lock_nointr(mddev);
5743
5744         mutex_lock(&mddev->open_mutex);
5745         if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
5746             mddev->sync_thread ||
5747             test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
5748                 pr_warn("md: %s still in use.\n", mdname(mddev));
5749                 if (did_freeze) {
5750                         clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5751                         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5752                         md_wakeup_thread(mddev->thread);
5753                 }
5754                 err = -EBUSY;
5755                 goto out;
5756         }
5757         if (mddev->pers) {
5758                 __md_stop_writes(mddev);
5759
5760                 err  = -ENXIO;
5761                 if (mddev->ro == 1)
5762                         goto out;
5763                 mddev->ro = 1;
5764                 set_disk_ro(mddev->gendisk, 1);
5765                 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5766                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5767                 md_wakeup_thread(mddev->thread);
5768                 sysfs_notify_dirent_safe(mddev->sysfs_state);
5769                 err = 0;
5770         }
5771 out:
5772         mutex_unlock(&mddev->open_mutex);
5773         return err;
5774 }
5775
5776 /* mode:
5777  *   0 - completely stop and disassemble array
5778  *   2 - stop but do not disassemble array
5779  */
5780 static int do_md_stop(struct mddev *mddev, int mode,
5781                       struct block_device *bdev)
5782 {
5783         struct gendisk *disk = mddev->gendisk;
5784         struct md_rdev *rdev;
5785         int did_freeze = 0;
5786
5787         if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
5788                 did_freeze = 1;
5789                 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5790                 md_wakeup_thread(mddev->thread);
5791         }
5792         if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5793                 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5794         if (mddev->sync_thread)
5795                 /* Thread might be blocked waiting for metadata update
5796                  * which will now never happen */
5797                 wake_up_process(mddev->sync_thread->tsk);
5798
5799         mddev_unlock(mddev);
5800         wait_event(resync_wait, (mddev->sync_thread == NULL &&
5801                                  !test_bit(MD_RECOVERY_RUNNING,
5802                                            &mddev->recovery)));
5803         mddev_lock_nointr(mddev);
5804
5805         mutex_lock(&mddev->open_mutex);
5806         if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
5807             mddev->sysfs_active ||
5808             mddev->sync_thread ||
5809             test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
5810                 pr_warn("md: %s still in use.\n", mdname(mddev));
5811                 mutex_unlock(&mddev->open_mutex);
5812                 if (did_freeze) {
5813                         clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5814                         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5815                         md_wakeup_thread(mddev->thread);
5816                 }
5817                 return -EBUSY;
5818         }
5819         if (mddev->pers) {
5820                 if (mddev->ro)
5821                         set_disk_ro(disk, 0);
5822
5823                 __md_stop_writes(mddev);
5824                 __md_stop(mddev);
5825                 mddev->queue->backing_dev_info->congested_fn = NULL;
5826
5827                 /* tell userspace to handle 'inactive' */
5828                 sysfs_notify_dirent_safe(mddev->sysfs_state);
5829
5830                 rdev_for_each(rdev, mddev)
5831                         if (rdev->raid_disk >= 0)
5832                                 sysfs_unlink_rdev(mddev, rdev);
5833
5834                 set_capacity(disk, 0);
5835                 mutex_unlock(&mddev->open_mutex);
5836                 mddev->changed = 1;
5837                 revalidate_disk(disk);
5838
5839                 if (mddev->ro)
5840                         mddev->ro = 0;
5841         } else
5842                 mutex_unlock(&mddev->open_mutex);
5843         /*
5844          * Free resources if final stop
5845          */
5846         if (mode == 0) {
5847                 pr_info("md: %s stopped.\n", mdname(mddev));
5848
5849                 if (mddev->bitmap_info.file) {
5850                         struct file *f = mddev->bitmap_info.file;
5851                         spin_lock(&mddev->lock);
5852                         mddev->bitmap_info.file = NULL;
5853                         spin_unlock(&mddev->lock);
5854                         fput(f);
5855                 }
5856                 mddev->bitmap_info.offset = 0;
5857
5858                 export_array(mddev);
5859
5860                 md_clean(mddev);
5861                 if (mddev->hold_active == UNTIL_STOP)
5862                         mddev->hold_active = 0;
5863         }
5864         md_new_event(mddev);
5865         sysfs_notify_dirent_safe(mddev->sysfs_state);
5866         return 0;
5867 }
5868
5869 #ifndef MODULE
5870 static void autorun_array(struct mddev *mddev)
5871 {
5872         struct md_rdev *rdev;
5873         int err;
5874
5875         if (list_empty(&mddev->disks))
5876                 return;
5877
5878         pr_info("md: running: ");
5879
5880         rdev_for_each(rdev, mddev) {
5881                 char b[BDEVNAME_SIZE];
5882                 pr_cont("<%s>", bdevname(rdev->bdev,b));
5883         }
5884         pr_cont("\n");
5885
5886         err = do_md_run(mddev);
5887         if (err) {
5888                 pr_warn("md: do_md_run() returned %d\n", err);
5889                 do_md_stop(mddev, 0, NULL);
5890         }
5891 }
5892
5893 /*
5894  * let's try to run arrays based on all disks that have arrived
5895  * so far (those are in pending_raid_disks).
5896  *
5897  * the method: pick the first pending disk, collect all disks with
5898  * the same UUID, remove all from the pending list and put them into
5899  * the 'same_array' list. Then order this list based on superblock
5900  * update time (freshest comes first), kick out 'old' disks and
5901  * compare superblocks. If everything's fine then run it.
5902  *
5903  * If "unit" is allocated, then bump its reference count
5904  */
5905 static void autorun_devices(int part)
5906 {
5907         struct md_rdev *rdev0, *rdev, *tmp;
5908         struct mddev *mddev;
5909         char b[BDEVNAME_SIZE];
5910
5911         pr_info("md: autorun ...\n");
5912         while (!list_empty(&pending_raid_disks)) {
5913                 int unit;
5914                 dev_t dev;
5915                 LIST_HEAD(candidates);
5916                 rdev0 = list_entry(pending_raid_disks.next,
5917                                          struct md_rdev, same_set);
5918
5919                 pr_debug("md: considering %s ...\n", bdevname(rdev0->bdev,b));
5920                 INIT_LIST_HEAD(&candidates);
5921                 rdev_for_each_list(rdev, tmp, &pending_raid_disks)
5922                         if (super_90_load(rdev, rdev0, 0) >= 0) {
5923                                 pr_debug("md:  adding %s ...\n",
5924                                          bdevname(rdev->bdev,b));
5925                                 list_move(&rdev->same_set, &candidates);
5926                         }
5927                 /*
5928                  * now we have a set of devices, with all of them having
5929                  * mostly sane superblocks. It's time to allocate the
5930                  * mddev.
5931                  */
5932                 if (part) {
5933                         dev = MKDEV(mdp_major,
5934                                     rdev0->preferred_minor << MdpMinorShift);
5935                         unit = MINOR(dev) >> MdpMinorShift;
5936                 } else {
5937                         dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
5938                         unit = MINOR(dev);
5939                 }
5940                 if (rdev0->preferred_minor != unit) {
5941                         pr_warn("md: unit number in %s is bad: %d\n",
5942                                 bdevname(rdev0->bdev, b), rdev0->preferred_minor);
5943                         break;
5944                 }
5945
5946                 md_probe(dev, NULL, NULL);
5947                 mddev = mddev_find(dev);
5948                 if (!mddev || !mddev->gendisk) {
5949                         if (mddev)
5950                                 mddev_put(mddev);
5951                         break;
5952                 }
5953                 if (mddev_lock(mddev))
5954                         pr_warn("md: %s locked, cannot run\n", mdname(mddev));
5955                 else if (mddev->raid_disks || mddev->major_version
5956                          || !list_empty(&mddev->disks)) {
5957                         pr_warn("md: %s already running, cannot run %s\n",
5958                                 mdname(mddev), bdevname(rdev0->bdev,b));
5959                         mddev_unlock(mddev);
5960                 } else {
5961                         pr_debug("md: created %s\n", mdname(mddev));
5962                         mddev->persistent = 1;
5963                         rdev_for_each_list(rdev, tmp, &candidates) {
5964                                 list_del_init(&rdev->same_set);
5965                                 if (bind_rdev_to_array(rdev, mddev))
5966                                         export_rdev(rdev);
5967                         }
5968                         autorun_array(mddev);
5969                         mddev_unlock(mddev);
5970                 }
5971                 /* on success, the candidates list will be empty; on error
5972                  * it won't be...
5973                  */
5974                 rdev_for_each_list(rdev, tmp, &candidates) {
5975                         list_del_init(&rdev->same_set);
5976                         export_rdev(rdev);
5977                 }
5978                 mddev_put(mddev);
5979         }
5980         pr_info("md: ... autorun DONE.\n");
5981 }
5982 #endif /* !MODULE */
5983
5984 static int get_version(void __user *arg)
5985 {
5986         mdu_version_t ver;
5987
5988         ver.major = MD_MAJOR_VERSION;
5989         ver.minor = MD_MINOR_VERSION;
5990         ver.patchlevel = MD_PATCHLEVEL_VERSION;
5991
5992         if (copy_to_user(arg, &ver, sizeof(ver)))
5993                 return -EFAULT;
5994
5995         return 0;
5996 }
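
/*
 * Example (for illustration only; the device node is arbitrary): userspace
 * reads this version with the RAID_VERSION ioctl, e.g.
 *
 *     mdu_version_t ver;
 *     int fd = open("/dev/md0", O_RDONLY);
 *     if (fd >= 0 && ioctl(fd, RAID_VERSION, &ver) == 0)
 *             printf("md %d.%d.%d\n", ver.major, ver.minor, ver.patchlevel);
 */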
5997
5998 static int get_array_info(struct mddev *mddev, void __user *arg)
5999 {
6000         mdu_array_info_t info;
6001         int nr,working,insync,failed,spare;
6002         struct md_rdev *rdev;
6003
6004         nr = working = insync = failed = spare = 0;
6005         rcu_read_lock();
6006         rdev_for_each_rcu(rdev, mddev) {
6007                 nr++;
6008                 if (test_bit(Faulty, &rdev->flags))
6009                         failed++;
6010                 else {
6011                         working++;
6012                         if (test_bit(In_sync, &rdev->flags))
6013                                 insync++;
6014                         else if (test_bit(Journal, &rdev->flags))
6015                                 /* TODO: add journal count to md_u.h */
6016                                 ;
6017                         else
6018                                 spare++;
6019                 }
6020         }
6021         rcu_read_unlock();
6022
6023         info.major_version = mddev->major_version;
6024         info.minor_version = mddev->minor_version;
6025         info.patch_version = MD_PATCHLEVEL_VERSION;
6026         info.ctime         = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
6027         info.level         = mddev->level;
6028         info.size          = mddev->dev_sectors / 2;
6029         if (info.size != mddev->dev_sectors / 2) /* overflow */
6030                 info.size = -1;
6031         info.nr_disks      = nr;
6032         info.raid_disks    = mddev->raid_disks;
6033         info.md_minor      = mddev->md_minor;
6034         info.not_persistent= !mddev->persistent;
6035
6036         info.utime         = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
6037         info.state         = 0;
6038         if (mddev->in_sync)
6039                 info.state = (1<<MD_SB_CLEAN);
6040         if (mddev->bitmap && mddev->bitmap_info.offset)
6041                 info.state |= (1<<MD_SB_BITMAP_PRESENT);
6042         if (mddev_is_clustered(mddev))
6043                 info.state |= (1<<MD_SB_CLUSTERED);
6044         info.active_disks  = insync;
6045         info.working_disks = working;
6046         info.failed_disks  = failed;
6047         info.spare_disks   = spare;
6048
6049         info.layout        = mddev->layout;
6050         info.chunk_size    = mddev->chunk_sectors << 9;
6051
6052         if (copy_to_user(arg, &info, sizeof(info)))
6053                 return -EFAULT;
6054
6055         return 0;
6056 }
6057
6058 static int get_bitmap_file(struct mddev *mddev, void __user * arg)
6059 {
6060         mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
6061         char *ptr;
6062         int err;
6063
6064         file = kzalloc(sizeof(*file), GFP_NOIO);
6065         if (!file)
6066                 return -ENOMEM;
6067
6068         err = 0;
6069         spin_lock(&mddev->lock);
6070         /* bitmap enabled */
6071         if (mddev->bitmap_info.file) {
6072                 ptr = file_path(mddev->bitmap_info.file, file->pathname,
6073                                 sizeof(file->pathname));
6074                 if (IS_ERR(ptr))
6075                         err = PTR_ERR(ptr);
6076                 else
6077                         memmove(file->pathname, ptr,
6078                                 sizeof(file->pathname)-(ptr-file->pathname));
6079         }
6080         spin_unlock(&mddev->lock);
6081
6082         if (err == 0 &&
6083             copy_to_user(arg, file, sizeof(*file)))
6084                 err = -EFAULT;
6085
6086         kfree(file);
6087         return err;
6088 }
6089
6090 static int get_disk_info(struct mddev *mddev, void __user * arg)
6091 {
6092         mdu_disk_info_t info;
6093         struct md_rdev *rdev;
6094
6095         if (copy_from_user(&info, arg, sizeof(info)))
6096                 return -EFAULT;
6097
6098         rcu_read_lock();
6099         rdev = md_find_rdev_nr_rcu(mddev, info.number);
6100         if (rdev) {
6101                 info.major = MAJOR(rdev->bdev->bd_dev);
6102                 info.minor = MINOR(rdev->bdev->bd_dev);
6103                 info.raid_disk = rdev->raid_disk;
6104                 info.state = 0;
6105                 if (test_bit(Faulty, &rdev->flags))
6106                         info.state |= (1<<MD_DISK_FAULTY);
6107                 else if (test_bit(In_sync, &rdev->flags)) {
6108                         info.state |= (1<<MD_DISK_ACTIVE);
6109                         info.state |= (1<<MD_DISK_SYNC);
6110                 }
6111                 if (test_bit(Journal, &rdev->flags))
6112                         info.state |= (1<<MD_DISK_JOURNAL);
6113                 if (test_bit(WriteMostly, &rdev->flags))
6114                         info.state |= (1<<MD_DISK_WRITEMOSTLY);
6115                 if (test_bit(FailFast, &rdev->flags))
6116                         info.state |= (1<<MD_DISK_FAILFAST);
6117         } else {
6118                 info.major = info.minor = 0;
6119                 info.raid_disk = -1;
6120                 info.state = (1<<MD_DISK_REMOVED);
6121         }
6122         rcu_read_unlock();
6123
6124         if (copy_to_user(arg, &info, sizeof(info)))
6125                 return -EFAULT;
6126
6127         return 0;
6128 }
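
/*
 * Example (for illustration only; md_fd is an already-open md device node):
 * GET_DISK_INFO is a query by descriptor number.  The caller fills in
 * info.number and the kernel fills in the rest, or reports MD_DISK_REMOVED
 * if that slot has no device:
 *
 *     mdu_disk_info_t dinfo = { 0 };
 *     dinfo.number = 0;
 *     ioctl(md_fd, GET_DISK_INFO, &dinfo);
 */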
6129
6130 static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
6131 {
6132         char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
6133         struct md_rdev *rdev;
6134         dev_t dev = MKDEV(info->major,info->minor);
6135
6136         if (mddev_is_clustered(mddev) &&
6137                 !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) {
6138                 pr_warn("%s: Cannot add to clustered mddev.\n",
6139                         mdname(mddev));
6140                 return -EINVAL;
6141         }
6142
6143         if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
6144                 return -EOVERFLOW;
6145
6146         if (!mddev->raid_disks) {
6147                 int err;
6148                 /* expecting a device which has a superblock */
6149                 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
6150                 if (IS_ERR(rdev)) {
6151                         pr_warn("md: md_import_device returned %ld\n",
6152                                 PTR_ERR(rdev));
6153                         return PTR_ERR(rdev);
6154                 }
6155                 if (!list_empty(&mddev->disks)) {
6156                         struct md_rdev *rdev0
6157                                 = list_entry(mddev->disks.next,
6158                                              struct md_rdev, same_set);
6159                         err = super_types[mddev->major_version]
6160                                 .load_super(rdev, rdev0, mddev->minor_version);
6161                         if (err < 0) {
6162                                 pr_warn("md: %s has different UUID to %s\n",
6163                                         bdevname(rdev->bdev,b),
6164                                         bdevname(rdev0->bdev,b2));
6165                                 export_rdev(rdev);
6166                                 return -EINVAL;
6167                         }
6168                 }
6169                 err = bind_rdev_to_array(rdev, mddev);
6170                 if (err)
6171                         export_rdev(rdev);
6172                 return err;
6173         }
6174
6175         /*
6176          * add_new_disk can be used once the array is assembled
6177          * to add "hot spares".  They must already have a superblock
6178          * written
6179          */
6180         if (mddev->pers) {
6181                 int err;
6182                 if (!mddev->pers->hot_add_disk) {
6183                         pr_warn("%s: personality does not support diskops!\n",
6184                                 mdname(mddev));
6185                         return -EINVAL;
6186                 }
6187                 if (mddev->persistent)
6188                         rdev = md_import_device(dev, mddev->major_version,
6189                                                 mddev->minor_version);
6190                 else
6191                         rdev = md_import_device(dev, -1, -1);
6192                 if (IS_ERR(rdev)) {
6193                         pr_warn("md: md_import_device returned %ld\n",
6194                                 PTR_ERR(rdev));
6195                         return PTR_ERR(rdev);
6196                 }
6197                 /* set saved_raid_disk if appropriate */
6198                 if (!mddev->persistent) {
6199                         if (info->state & (1<<MD_DISK_SYNC)  &&
6200                             info->raid_disk < mddev->raid_disks) {
6201                                 rdev->raid_disk = info->raid_disk;
6202                                 set_bit(In_sync, &rdev->flags);
6203                                 clear_bit(Bitmap_sync, &rdev->flags);
6204                         } else
6205                                 rdev->raid_disk = -1;
6206                         rdev->saved_raid_disk = rdev->raid_disk;
6207                 } else
6208                         super_types[mddev->major_version].
6209                                 validate_super(mddev, rdev);
6210                 if ((info->state & (1<<MD_DISK_SYNC)) &&
6211                      rdev->raid_disk != info->raid_disk) {
6212                         /* This was a hot-add request, but the events don't
6213                          * match, so reject it.
6214                          */
6215                         export_rdev(rdev);
6216                         return -EINVAL;
6217                 }
6218
6219                 clear_bit(In_sync, &rdev->flags); /* just to be sure */
6220                 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
6221                         set_bit(WriteMostly, &rdev->flags);
6222                 else
6223                         clear_bit(WriteMostly, &rdev->flags);
6224                 if (info->state & (1<<MD_DISK_FAILFAST))
6225                         set_bit(FailFast, &rdev->flags);
6226                 else
6227                         clear_bit(FailFast, &rdev->flags);
6228
6229                 if (info->state & (1<<MD_DISK_JOURNAL)) {
6230                         struct md_rdev *rdev2;
6231                         bool has_journal = false;
6232
6233                         /* make sure there is no existing journal disk */
6234                         rdev_for_each(rdev2, mddev) {
6235                                 if (test_bit(Journal, &rdev2->flags)) {
6236                                         has_journal = true;
6237                                         break;
6238                                 }
6239                         }
6240                         if (has_journal) {
6241                                 export_rdev(rdev);
6242                                 return -EBUSY;
6243                         }
6244                         set_bit(Journal, &rdev->flags);
6245                 }
6246                 /*
6247                  * check whether the device shows up in other nodes
6248                  */
6249                 if (mddev_is_clustered(mddev)) {
6250                         if (info->state & (1 << MD_DISK_CANDIDATE))
6251                                 set_bit(Candidate, &rdev->flags);
6252                         else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
6253                                 /* --add initiated by this node */
6254                                 err = md_cluster_ops->add_new_disk(mddev, rdev);
6255                                 if (err) {
6256                                         export_rdev(rdev);
6257                                         return err;
6258                                 }
6259                         }
6260                 }
6261
6262                 rdev->raid_disk = -1;
6263                 err = bind_rdev_to_array(rdev, mddev);
6264
6265                 if (err)
6266                         export_rdev(rdev);
6267
6268                 if (mddev_is_clustered(mddev)) {
6269                         if (info->state & (1 << MD_DISK_CANDIDATE)) {
6270                                 if (!err) {
6271                                         err = md_cluster_ops->new_disk_ack(mddev,
6272                                                 err == 0);
6273                                         if (err)
6274                                                 md_kick_rdev_from_array(rdev);
6275                                 }
6276                         } else {
6277                                 if (err)
6278                                         md_cluster_ops->add_new_disk_cancel(mddev);
6279                                 else
6280                                         err = add_bound_rdev(rdev);
6281                         }
6282
6283                 } else if (!err)
6284                         err = add_bound_rdev(rdev);
6285
6286                 return err;
6287         }
6288
6289         /* otherwise, add_new_disk is only allowed
6290          * for major_version==0 superblocks
6291          */
6292         if (mddev->major_version != 0) {
6293                 pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev));
6294                 return -EINVAL;
6295         }
6296
6297         if (!(info->state & (1<<MD_DISK_FAULTY))) {
6298                 int err;
6299                 rdev = md_import_device(dev, -1, 0);
6300                 if (IS_ERR(rdev)) {
6301                         pr_warn("md: error, md_import_device() returned %ld\n",
6302                                 PTR_ERR(rdev));
6303                         return PTR_ERR(rdev);
6304                 }
6305                 rdev->desc_nr = info->number;
6306                 if (info->raid_disk < mddev->raid_disks)
6307                         rdev->raid_disk = info->raid_disk;
6308                 else
6309                         rdev->raid_disk = -1;
6310
6311                 if (rdev->raid_disk < mddev->raid_disks)
6312                         if (info->state & (1<<MD_DISK_SYNC))
6313                                 set_bit(In_sync, &rdev->flags);
6314
6315                 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
6316                         set_bit(WriteMostly, &rdev->flags);
6317                 if (info->state & (1<<MD_DISK_FAILFAST))
6318                         set_bit(FailFast, &rdev->flags);
6319
6320                 if (!mddev->persistent) {
6321                         pr_debug("md: nonpersistent superblock ...\n");
6322                         rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
6323                 } else
6324                         rdev->sb_start = calc_dev_sboffset(rdev);
6325                 rdev->sectors = rdev->sb_start;
6326
6327                 err = bind_rdev_to_array(rdev, mddev);
6328                 if (err) {
6329                         export_rdev(rdev);
6330                         return err;
6331                 }
6332         }
6333
6334         return 0;
6335 }
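
/*
 * Example (for illustration only; md_fd and the device numbers are
 * arbitrary): a caller re-adding an in-sync member through ADD_NEW_DISK
 * might pass
 *
 *     mdu_disk_info_t dinfo = { 0 };
 *     dinfo.major = 8;                        // e.g. /dev/sdc is 8:32
 *     dinfo.minor = 32;
 *     dinfo.state = (1 << MD_DISK_SYNC);
 *     ioctl(md_fd, ADD_NEW_DISK, &dinfo);
 */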
6336
6337 static int hot_remove_disk(struct mddev *mddev, dev_t dev)
6338 {
6339         char b[BDEVNAME_SIZE];
6340         struct md_rdev *rdev;
6341
6342         rdev = find_rdev(mddev, dev);
6343         if (!rdev)
6344                 return -ENXIO;
6345
6346         if (rdev->raid_disk < 0)
6347                 goto kick_rdev;
6348
6349         clear_bit(Blocked, &rdev->flags);
6350         remove_and_add_spares(mddev, rdev);
6351
6352         if (rdev->raid_disk >= 0)
6353                 goto busy;
6354
6355 kick_rdev:
6356         if (mddev_is_clustered(mddev))
6357                 md_cluster_ops->remove_disk(mddev, rdev);
6358
6359         md_kick_rdev_from_array(rdev);
6360         set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6361         if (mddev->thread)
6362                 md_wakeup_thread(mddev->thread);
6363         else
6364                 md_update_sb(mddev, 1);
6365         md_new_event(mddev);
6366
6367         return 0;
6368 busy:
6369         pr_debug("md: cannot remove active disk %s from %s ...\n",
6370                  bdevname(rdev->bdev,b), mdname(mddev));
6371         return -EBUSY;
6372 }
6373
6374 static int hot_add_disk(struct mddev *mddev, dev_t dev)
6375 {
6376         char b[BDEVNAME_SIZE];
6377         int err;
6378         struct md_rdev *rdev;
6379
6380         if (!mddev->pers)
6381                 return -ENODEV;
6382
6383         if (mddev->major_version != 0) {
6384                 pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n",
6385                         mdname(mddev));
6386                 return -EINVAL;
6387         }
6388         if (!mddev->pers->hot_add_disk) {
6389                 pr_warn("%s: personality does not support diskops!\n",
6390                         mdname(mddev));
6391                 return -EINVAL;
6392         }
6393
6394         rdev = md_import_device(dev, -1, 0);
6395         if (IS_ERR(rdev)) {
6396                 pr_warn("md: error, md_import_device() returned %ld\n",
6397                         PTR_ERR(rdev));
6398                 return -EINVAL;
6399         }
6400
6401         if (mddev->persistent)
6402                 rdev->sb_start = calc_dev_sboffset(rdev);
6403         else
6404                 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
6405
6406         rdev->sectors = rdev->sb_start;
6407
6408         if (test_bit(Faulty, &rdev->flags)) {
6409                 pr_warn("md: can not hot-add faulty %s disk to %s!\n",
6410                         bdevname(rdev->bdev,b), mdname(mddev));
6411                 err = -EINVAL;
6412                 goto abort_export;
6413         }
6414
6415         clear_bit(In_sync, &rdev->flags);
6416         rdev->desc_nr = -1;
6417         rdev->saved_raid_disk = -1;
6418         err = bind_rdev_to_array(rdev, mddev);
6419         if (err)
6420                 goto abort_export;
6421
6422         /*
6423          * The rest had better be atomic: disk failures can be
6424          * noticed in interrupt context ...
6425          */
6426
6427         rdev->raid_disk = -1;
6428
6429         set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6430         if (!mddev->thread)
6431                 md_update_sb(mddev, 1);
6432         /*
6433          * Kick recovery, maybe this spare has to be added to the
6434          * array immediately.
6435          */
6436         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6437         md_wakeup_thread(mddev->thread);
6438         md_new_event(mddev);
6439         return 0;
6440
6441 abort_export:
6442         export_rdev(rdev);
6443         return err;
6444 }
6445
6446 static int set_bitmap_file(struct mddev *mddev, int fd)
6447 {
6448         int err = 0;
6449
6450         if (mddev->pers) {
6451                 if (!mddev->pers->quiesce || !mddev->thread)
6452                         return -EBUSY;
6453                 if (mddev->recovery || mddev->sync_thread)
6454                         return -EBUSY;
6455                 /* we should be able to change the bitmap.. */
6456         }
6457
6458         if (fd >= 0) {
6459                 struct inode *inode;
6460                 struct file *f;
6461
6462                 if (mddev->bitmap || mddev->bitmap_info.file)
6463                         return -EEXIST; /* cannot add when bitmap is present */
6464                 f = fget(fd);
6465
6466                 if (f == NULL) {
6467                         pr_warn("%s: error: failed to get bitmap file\n",
6468                                 mdname(mddev));
6469                         return -EBADF;
6470                 }
6471
6472                 inode = f->f_mapping->host;
6473                 if (!S_ISREG(inode->i_mode)) {
6474                         pr_warn("%s: error: bitmap file must be a regular file\n",
6475                                 mdname(mddev));
6476                         err = -EBADF;
6477                 } else if (!(f->f_mode & FMODE_WRITE)) {
6478                         pr_warn("%s: error: bitmap file must be opened for write\n",
6479                                 mdname(mddev));
6480                         err = -EBADF;
6481                 } else if (atomic_read(&inode->i_writecount) != 1) {
6482                         pr_warn("%s: error: bitmap file is already in use\n",
6483                                 mdname(mddev));
6484                         err = -EBUSY;
6485                 }
6486                 if (err) {
6487                         fput(f);
6488                         return err;
6489                 }
6490                 mddev->bitmap_info.file = f;
6491                 mddev->bitmap_info.offset = 0; /* file overrides offset */
6492         } else if (mddev->bitmap == NULL)
6493                 return -ENOENT; /* cannot remove what isn't there */
6494         err = 0;
6495         if (mddev->pers) {
6496                 mddev->pers->quiesce(mddev, 1);
6497                 if (fd >= 0) {
6498                         struct bitmap *bitmap;
6499
6500                         bitmap = bitmap_create(mddev, -1);
6501                         if (!IS_ERR(bitmap)) {
6502                                 mddev->bitmap = bitmap;
6503                                 err = bitmap_load(mddev);
6504                         } else
6505                                 err = PTR_ERR(bitmap);
6506                 }
6507                 if (fd < 0 || err) {
6508                         bitmap_destroy(mddev);
6509                         fd = -1; /* make sure to put the file */
6510                 }
6511                 mddev->pers->quiesce(mddev, 0);
6512         }
6513         if (fd < 0) {
6514                 struct file *f = mddev->bitmap_info.file;
6515                 if (f) {
6516                         spin_lock(&mddev->lock);
6517                         mddev->bitmap_info.file = NULL;
6518                         spin_unlock(&mddev->lock);
6519                         fput(f);
6520                 }
6521         }
6522
6523         return err;
6524 }
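
/*
 * Example (for illustration only; the path and md_fd are arbitrary):
 * SET_BITMAP_FILE passes the descriptor of a regular file opened for
 * write, or -1 to drop an existing file-backed bitmap:
 *
 *     int bfd = open("/var/lib/md0-bitmap", O_RDWR);
 *     ioctl(md_fd, SET_BITMAP_FILE, bfd);     // attach
 *     ioctl(md_fd, SET_BITMAP_FILE, -1);      // remove
 */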
6525
6526 /*
6527  * set_array_info is used in two different ways.
6528  * The original usage is when creating a new array.
6529  * In this usage, raid_disks is > 0 and it, together with
6530  *  level, size, not_persistent, layout and chunksize, determines the
6531  *  shape of the array.
6532  *  This will always create an array with a type-0.90.0 superblock.
6533  * The newer usage is when assembling an array.
6534  *  In this case raid_disks will be 0, and the major_version field is
6535  *  used to determine which style of superblock is to be found on the devices.
6536  *  The minor and patch _version numbers are also kept in case the
6537  *  super_block handler wishes to interpret them.
6538  */
6539 static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
6540 {
6541
6542         if (info->raid_disks == 0) {
6543                 /* just setting version number for superblock loading */
6544                 if (info->major_version < 0 ||
6545                     info->major_version >= ARRAY_SIZE(super_types) ||
6546                     super_types[info->major_version].name == NULL) {
6547                         /* maybe try to auto-load a module? */
6548                         pr_warn("md: superblock version %d not known\n",
6549                                 info->major_version);
6550                         return -EINVAL;
6551                 }
6552                 mddev->major_version = info->major_version;
6553                 mddev->minor_version = info->minor_version;
6554                 mddev->patch_version = info->patch_version;
6555                 mddev->persistent = !info->not_persistent;
6556                 /* ensure mddev_put doesn't delete this now that there
6557                  * is some minimal configuration.
6558                  */
6559                 mddev->ctime         = ktime_get_real_seconds();
6560                 return 0;
6561         }
6562         mddev->major_version = MD_MAJOR_VERSION;
6563         mddev->minor_version = MD_MINOR_VERSION;
6564         mddev->patch_version = MD_PATCHLEVEL_VERSION;
6565         mddev->ctime         = ktime_get_real_seconds();
6566
6567         mddev->level         = info->level;
6568         mddev->clevel[0]     = 0;
6569         mddev->dev_sectors   = 2 * (sector_t)info->size;
6570         mddev->raid_disks    = info->raid_disks;
6571         /* don't set md_minor, it is determined by which /dev/md* was
6572          * opened
6573          */
6574         if (info->state & (1<<MD_SB_CLEAN))
6575                 mddev->recovery_cp = MaxSector;
6576         else
6577                 mddev->recovery_cp = 0;
6578         mddev->persistent    = !info->not_persistent;
6579         mddev->external      = 0;
6580
6581         mddev->layout        = info->layout;
6582         mddev->chunk_sectors = info->chunk_size >> 9;
6583
6584         if (mddev->persistent) {
6585                 mddev->max_disks = MD_SB_DISKS;
6586                 mddev->flags = 0;
6587                 mddev->sb_flags = 0;
6588         }
6589         set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6590
6591         mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
6592         mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
6593         mddev->bitmap_info.offset = 0;
6594
6595         mddev->reshape_position = MaxSector;
6596
6597         /*
6598          * Generate a 128 bit UUID
6599          */
6600         get_random_bytes(mddev->uuid, 16);
6601
6602         mddev->new_level = mddev->level;
6603         mddev->new_chunk_sectors = mddev->chunk_sectors;
6604         mddev->new_layout = mddev->layout;
6605         mddev->delta_disks = 0;
6606         mddev->reshape_backwards = 0;
6607
6608         return 0;
6609 }
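
/*
 * Example (for illustration only; all values are arbitrary) of the two
 * SET_ARRAY_INFO usages described above:
 *
 *     mdu_array_info_t info = { 0 };
 *
 *     // assemble: raid_disks == 0 just selects the superblock format
 *     info.major_version = 1;
 *     info.minor_version = 2;
 *     ioctl(md_fd, SET_ARRAY_INFO, &info);
 *
 *     // create: raid_disks > 0 implies a 0.90.0-superblock array
 *     info.raid_disks = 2;
 *     info.level = 1;                 // RAID1
 *     info.size = 1048576;            // per-device size in KiB (1 GiB)
 *     ioctl(md_fd, SET_ARRAY_INFO, &info);
 */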
6610
6611 void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
6612 {
6613         WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__);
6614
6615         if (mddev->external_size)
6616                 return;
6617
6618         mddev->array_sectors = array_sectors;
6619 }
6620 EXPORT_SYMBOL(md_set_array_sectors);
6621
6622 static int update_size(struct mddev *mddev, sector_t num_sectors)
6623 {
6624         struct md_rdev *rdev;
6625         int rv;
6626         int fit = (num_sectors == 0);
6627         sector_t old_dev_sectors = mddev->dev_sectors;
6628
6629         if (mddev->pers->resize == NULL)
6630                 return -EINVAL;
6631         /* The "num_sectors" is the number of sectors of each device that
6632          * is used.  This can only make sense for arrays with redundancy.
6633          * linear and raid0 always use whatever space is available. We can only
6634          * consider changing this number if no resync or reconstruction is
6635          * happening, and if the new size is acceptable. It must fit before the
6636          * sb_start or, if that is <data_offset, it must fit before the size
6637          * of each device.  If num_sectors is zero, we find the largest size
6638          * that fits.
6639          */
6640         if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
6641             mddev->sync_thread)
6642                 return -EBUSY;
6643         if (mddev->ro)
6644                 return -EROFS;
6645
6646         rdev_for_each(rdev, mddev) {
6647                 sector_t avail = rdev->sectors;
6648
6649                 if (fit && (num_sectors == 0 || num_sectors > avail))
6650                         num_sectors = avail;
6651                 if (avail < num_sectors)
6652                         return -ENOSPC;
6653         }
6654         rv = mddev->pers->resize(mddev, num_sectors);
6655         if (!rv) {
6656                 if (mddev_is_clustered(mddev))
6657                         md_cluster_ops->update_size(mddev, old_dev_sectors);
6658                 else if (mddev->queue) {
6659                         set_capacity(mddev->gendisk, mddev->array_sectors);
6660                         revalidate_disk(mddev->gendisk);
6661                 }
6662         }
6663         return rv;
6664 }
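
/*
 * Worked example (for illustration): num_sectors is the per-device size in
 * 512-byte sectors, so shrinking every member to 1 TiB would pass
 * num_sectors = (1 << 40) / 512 = 2147483648, while num_sectors == 0 asks
 * ->resize() to use the largest size that fits on all members.
 */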
6665
6666 static int update_raid_disks(struct mddev *mddev, int raid_disks)
6667 {
6668         int rv;
6669         struct md_rdev *rdev;
6670         /* change the number of raid disks */
6671         if (mddev->pers->check_reshape == NULL)
6672                 return -EINVAL;
6673         if (mddev->ro)
6674                 return -EROFS;
6675         if (raid_disks <= 0 ||
6676             (mddev->max_disks && raid_disks >= mddev->max_disks))
6677                 return -EINVAL;
6678         if (mddev->sync_thread ||
6679             test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
6680             mddev->reshape_position != MaxSector)
6681                 return -EBUSY;
6682
6683         rdev_for_each(rdev, mddev) {
6684                 if (mddev->raid_disks < raid_disks &&
6685                     rdev->data_offset < rdev->new_data_offset)
6686                         return -EINVAL;
6687                 if (mddev->raid_disks > raid_disks &&
6688                     rdev->data_offset > rdev->new_data_offset)
6689                         return -EINVAL;
6690         }
6691
6692         mddev->delta_disks = raid_disks - mddev->raid_disks;
6693         if (mddev->delta_disks < 0)
6694                 mddev->reshape_backwards = 1;
6695         else if (mddev->delta_disks > 0)
6696                 mddev->reshape_backwards = 0;
6697
6698         rv = mddev->pers->check_reshape(mddev);
6699         if (rv < 0) {
6700                 mddev->delta_disks = 0;
6701                 mddev->reshape_backwards = 0;
6702         }
6703         return rv;
6704 }
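
/*
 * Worked example (for illustration): growing a 3-device array to 4 devices
 * sets delta_disks = +1 and reshape_backwards = 0; shrinking from 4 to 3
 * sets delta_disks = -1 and reshape_backwards = 1.  In both cases
 * ->check_reshape() then decides whether the personality can actually do it.
 */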
6705
6706 /*
6707  * update_array_info is used to change the configuration of an
6708  * on-line array.
6709  * The version, ctime, level, size, raid_disks, not_persistent, layout and chunk_size
6710  * fields in the info are checked against the array.
6711  * Any differences that cannot be handled will cause an error.
6712  * Normally, only one change can be managed at a time.
6713  */
6714 static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
6715 {
6716         int rv = 0;
6717         int cnt = 0;
6718         int state = 0;
6719
6720         /* calculate expected state, ignoring low bits */
6721         if (mddev->bitmap && mddev->bitmap_info.offset)
6722                 state |= (1 << MD_SB_BITMAP_PRESENT);
6723
6724         if (mddev->major_version != info->major_version ||
6725             mddev->minor_version != info->minor_version ||
6726 /*          mddev->patch_version != info->patch_version || */
6727             mddev->ctime         != info->ctime         ||
6728             mddev->level         != info->level         ||
6729 /*          mddev->layout        != info->layout        || */
6730             mddev->persistent    != !info->not_persistent ||
6731             mddev->chunk_sectors != info->chunk_size >> 9 ||
6732             /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
6733             ((state^info->state) & 0xfffffe00)
6734                 )
6735                 return -EINVAL;
6736         /* Check there is only one change */
6737         if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
6738                 cnt++;
6739         if (mddev->raid_disks != info->raid_disks)
6740                 cnt++;
6741         if (mddev->layout != info->layout)
6742                 cnt++;
6743         if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
6744                 cnt++;
6745         if (cnt == 0)
6746                 return 0;
6747         if (cnt > 1)
6748                 return -EINVAL;
6749
6750         if (mddev->layout != info->layout) {
6751                 /* Change layout
6752                  * we don't need to do anything at the md level, the
6753                  * personality will take care of it all.
6754                  */
6755                 if (mddev->pers->check_reshape == NULL)
6756                         return -EINVAL;
6757                 else {
6758                         mddev->new_layout = info->layout;
6759                         rv = mddev->pers->check_reshape(mddev);
6760                         if (rv)
6761                                 mddev->new_layout = mddev->layout;
6762                         return rv;
6763                 }
6764         }
6765         if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
6766                 rv = update_size(mddev, (sector_t)info->size * 2);
6767
6768         if (mddev->raid_disks    != info->raid_disks)
6769                 rv = update_raid_disks(mddev, info->raid_disks);
6770
6771         if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
6772                 if (mddev->pers->quiesce == NULL || mddev->thread == NULL) {
6773                         rv = -EINVAL;
6774                         goto err;
6775                 }
6776                 if (mddev->recovery || mddev->sync_thread) {
6777                         rv = -EBUSY;
6778                         goto err;
6779                 }
6780                 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
6781                         struct bitmap *bitmap;
6782                         /* add the bitmap */
6783                         if (mddev->bitmap) {
6784                                 rv = -EEXIST;
6785                                 goto err;
6786                         }
6787                         if (mddev->bitmap_info.default_offset == 0) {
6788                                 rv = -EINVAL;
6789                                 goto err;
6790                         }
6791                         mddev->bitmap_info.offset =
6792                                 mddev->bitmap_info.default_offset;
6793                         mddev->bitmap_info.space =
6794                                 mddev->bitmap_info.default_space;
6795                         mddev->pers->quiesce(mddev, 1);
6796                         bitmap = bitmap_create(mddev, -1);
6797                         if (!IS_ERR(bitmap)) {
6798                                 mddev->bitmap = bitmap;
6799                                 rv = bitmap_load(mddev);
6800                         } else
6801                                 rv = PTR_ERR(bitmap);
6802                         if (rv)
6803                                 bitmap_destroy(mddev);
6804                         mddev->pers->quiesce(mddev, 0);
6805                 } else {
6806                         /* remove the bitmap */
6807                         if (!mddev->bitmap) {
6808                                 rv = -ENOENT;
6809                                 goto err;
6810                         }
6811                         if (mddev->bitmap->storage.file) {
6812                                 rv = -EINVAL;
6813                                 goto err;
6814                         }
6815                         if (mddev->bitmap_info.nodes) {
6816                                 /* hold PW on all the bitmap locks */
6817                                 if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) {
6818                                         pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n");
6819                                         rv = -EPERM;
6820                                         md_cluster_ops->unlock_all_bitmaps(mddev);
6821                                         goto err;
6822                                 }
6823
6824                                 mddev->bitmap_info.nodes = 0;
6825                                 md_cluster_ops->leave(mddev);
6826                         }
6827                         mddev->pers->quiesce(mddev, 1);
6828                         bitmap_destroy(mddev);
6829                         mddev->pers->quiesce(mddev, 0);
6830                         mddev->bitmap_info.offset = 0;
6831                 }
6832         }
6833         md_update_sb(mddev, 1);
6834         return rv;
6835 err:
6836         return rv;
6837 }
6838
6839 static int set_disk_faulty(struct mddev *mddev, dev_t dev)
6840 {
6841         struct md_rdev *rdev;
6842         int err = 0;
6843
6844         if (mddev->pers == NULL)
6845                 return -ENODEV;
6846
6847         rcu_read_lock();
6848         rdev = find_rdev_rcu(mddev, dev);
6849         if (!rdev)
6850                 err =  -ENODEV;
6851         else {
6852                 md_error(mddev, rdev);
6853                 if (!test_bit(Faulty, &rdev->flags))
6854                         err = -EBUSY;
6855         }
6856         rcu_read_unlock();
6857         return err;
6858 }
6859
6860 /*
6861  * We have a problem here: there is no easy way to give a CHS
6862  * virtual geometry. We currently pretend that we have 2 heads and
6863  * 4 sectors per track (with a BIG number of cylinders...). This drives
6864  * dosfs just mad... ;-)
6865  */
6866 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
6867 {
6868         struct mddev *mddev = bdev->bd_disk->private_data;
6869
6870         geo->heads = 2;
6871         geo->sectors = 4;
6872         geo->cylinders = mddev->array_sectors / 8;
6873         return 0;
6874 }
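
/*
 * Worked example (for illustration): with 2 heads and 4 sectors per track,
 * a 1 GiB array (array_sectors = 2097152) reports 2097152 / 8 = 262144
 * cylinders, and 262144 * 2 * 4 * 512 bytes gives back the full 1 GiB.
 */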
6875
6876 static inline bool md_ioctl_valid(unsigned int cmd)
6877 {
6878         switch (cmd) {
6879         case ADD_NEW_DISK:
6880         case BLKROSET:
6881         case GET_ARRAY_INFO:
6882         case GET_BITMAP_FILE:
6883         case GET_DISK_INFO:
6884         case HOT_ADD_DISK:
6885         case HOT_REMOVE_DISK:
6886         case RAID_AUTORUN:
6887         case RAID_VERSION:
6888         case RESTART_ARRAY_RW:
6889         case RUN_ARRAY:
6890         case SET_ARRAY_INFO:
6891         case SET_BITMAP_FILE:
6892         case SET_DISK_FAULTY:
6893         case STOP_ARRAY:
6894         case STOP_ARRAY_RO:
6895         case CLUSTERED_DISK_NACK:
6896                 return true;
6897         default:
6898                 return false;
6899         }
6900 }
6901
6902 static int md_ioctl(struct block_device *bdev, fmode_t mode,
6903                         unsigned int cmd, unsigned long arg)
6904 {
6905         int err = 0;
6906         void __user *argp = (void __user *)arg;
6907         struct mddev *mddev = NULL;
6908         int ro;
6909
6910         if (!md_ioctl_valid(cmd))
6911                 return -ENOTTY;
6912
6913         switch (cmd) {
6914         case RAID_VERSION:
6915         case GET_ARRAY_INFO:
6916         case GET_DISK_INFO:
6917                 break;
6918         default:
6919                 if (!capable(CAP_SYS_ADMIN))
6920                         return -EACCES;
6921         }
6922
6923         /*
6924          * Commands dealing with the RAID driver but not any
6925          * particular array:
6926          */
6927         switch (cmd) {
6928         case RAID_VERSION:
6929                 err = get_version(argp);
6930                 goto out;
6931
6932 #ifndef MODULE
6933         case RAID_AUTORUN:
6934                 err = 0;
6935                 autostart_arrays(arg);
6936                 goto out;
6937 #endif
6938         default:;
6939         }
6940
6941         /*
6942          * Commands creating/starting a new array:
6943          */
6944
6945         mddev = bdev->bd_disk->private_data;
6946
6947         if (!mddev) {
6948                 BUG();
6949                 goto out;
6950         }
6951
6952         /* Some actions do not require the mutex */
6953         switch (cmd) {
6954         case GET_ARRAY_INFO:
6955                 if (!mddev->raid_disks && !mddev->external)
6956                         err = -ENODEV;
6957                 else
6958                         err = get_array_info(mddev, argp);
6959                 goto out;
6960
6961         case GET_DISK_INFO:
6962                 if (!mddev->raid_disks && !mddev->external)
6963                         err = -ENODEV;
6964                 else
6965                         err = get_disk_info(mddev, argp);
6966                 goto out;
6967
6968         case SET_DISK_FAULTY:
6969                 err = set_disk_faulty(mddev, new_decode_dev(arg));
6970                 goto out;
6971
6972         case GET_BITMAP_FILE:
6973                 err = get_bitmap_file(mddev, argp);
6974                 goto out;
6975
6976         }
6977
6978         if (cmd == ADD_NEW_DISK)
6979                 /* need to ensure md_delayed_delete() has completed */
6980                 flush_workqueue(md_misc_wq);
6981
6982         if (cmd == HOT_REMOVE_DISK)
6983                 /* need to ensure recovery thread has run */
6984                 wait_event_interruptible_timeout(mddev->sb_wait,
6985                                                  !test_bit(MD_RECOVERY_NEEDED,
6986                                                            &mddev->recovery),
6987                                                  msecs_to_jiffies(5000));
6988         if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
6989                 /* Need to flush page cache, and ensure no-one else opens
6990                  * and writes
6991                  */
6992                 mutex_lock(&mddev->open_mutex);
6993                 if (mddev->pers && atomic_read(&mddev->openers) > 1) {
6994                         mutex_unlock(&mddev->open_mutex);
6995                         err = -EBUSY;
6996                         goto out;
6997                 }
6998                 set_bit(MD_CLOSING, &mddev->flags);
6999                 mutex_unlock(&mddev->open_mutex);
7000                 sync_blockdev(bdev);
7001         }
7002         err = mddev_lock(mddev);
7003         if (err) {
7004                 pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n",
7005                          err, cmd);
7006                 goto out;
7007         }
7008
7009         if (cmd == SET_ARRAY_INFO) {
7010                 mdu_array_info_t info;
7011                 if (!arg)
7012                         memset(&info, 0, sizeof(info));
7013                 else if (copy_from_user(&info, argp, sizeof(info))) {
7014                         err = -EFAULT;
7015                         goto unlock;
7016                 }
7017                 if (mddev->pers) {
7018                         err = update_array_info(mddev, &info);
7019                         if (err) {
7020                                 pr_warn("md: couldn't update array info. %d\n", err);
7021                                 goto unlock;
7022                         }
7023                         goto unlock;
7024                 }
7025                 if (!list_empty(&mddev->disks)) {
7026                         pr_warn("md: array %s already has disks!\n", mdname(mddev));
7027                         err = -EBUSY;
7028                         goto unlock;
7029                 }
7030                 if (mddev->raid_disks) {
7031                         pr_warn("md: array %s already initialised!\n", mdname(mddev));
7032                         err = -EBUSY;
7033                         goto unlock;
7034                 }
7035                 err = set_array_info(mddev, &info);
7036                 if (err) {
7037                         pr_warn("md: couldn't set array info. %d\n", err);
7038                         goto unlock;
7039                 }
7040                 goto unlock;
7041         }
7042
7043         /*
7044          * Commands querying/configuring an existing array:
7045          */
7046         /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
7047          * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
7048         if ((!mddev->raid_disks && !mddev->external)
7049             && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
7050             && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
7051             && cmd != GET_BITMAP_FILE) {
7052                 err = -ENODEV;
7053                 goto unlock;
7054         }
7055
7056         /*
7057          * Commands even a read-only array can execute:
7058          */
7059         switch (cmd) {
7060         case RESTART_ARRAY_RW:
7061                 err = restart_array(mddev);
7062                 goto unlock;
7063
7064         case STOP_ARRAY:
7065                 err = do_md_stop(mddev, 0, bdev);
7066                 goto unlock;
7067
7068         case STOP_ARRAY_RO:
7069                 err = md_set_readonly(mddev, bdev);
7070                 goto unlock;
7071
7072         case HOT_REMOVE_DISK:
7073                 err = hot_remove_disk(mddev, new_decode_dev(arg));
7074                 goto unlock;
7075
7076         case ADD_NEW_DISK:
7077                 /* We can support ADD_NEW_DISK on read-only arrays
7078                  * only if we are re-adding a preexisting device.
7079                  * So require mddev->pers and MD_DISK_SYNC.
7080                  */
7081                 if (mddev->pers) {
7082                         mdu_disk_info_t info;
7083                         if (copy_from_user(&info, argp, sizeof(info)))
7084                                 err = -EFAULT;
7085                         else if (!(info.state & (1<<MD_DISK_SYNC)))
7086                                 /* Need to clear read-only for this */
7087                                 break;
7088                         else
7089                                 err = add_new_disk(mddev, &info);
7090                         goto unlock;
7091                 }
7092                 break;
7093
7094         case BLKROSET:
7095                 if (get_user(ro, (int __user *)(arg))) {
7096                         err = -EFAULT;
7097                         goto unlock;
7098                 }
7099                 err = -EINVAL;
7100
7101                 /* if the bdev is going read-only, the value of mddev->ro
7102                  * does not matter: no writes are coming
7103                  */
7104                 if (ro)
7105                         goto unlock;
7106
7107                 /* are we already prepared for writes? */
7108                 if (mddev->ro != 1)
7109                         goto unlock;
7110
7111                 /* transitioning to auto-read-only need only happen for
7112                  * arrays that call md_write_start
7113                  */
7114                 if (mddev->pers) {
7115                         err = restart_array(mddev);
7116                         if (err == 0) {
7117                                 mddev->ro = 2;
7118                                 set_disk_ro(mddev->gendisk, 0);
7119                         }
7120                 }
7121                 goto unlock;
7122         }
7123
7124         /*
7125          * The remaining ioctls are changing the state of the
7126          * superblock, so we do not allow them on read-only arrays.
7127          */
7128         if (mddev->ro && mddev->pers) {
7129                 if (mddev->ro == 2) {
7130                         mddev->ro = 0;
7131                         sysfs_notify_dirent_safe(mddev->sysfs_state);
7132                         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7133                         /* mddev_unlock will wake thread */
7134                         /* If a device failed while we were read-only, we
7135                          * need to make sure the metadata is updated now.
7136                          */
7137                         if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) {
7138                                 mddev_unlock(mddev);
7139                                 wait_event(mddev->sb_wait,
7140                                            !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) &&
7141                                            !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
7142                                 mddev_lock_nointr(mddev);
7143                         }
7144                 } else {
7145                         err = -EROFS;
7146                         goto unlock;
7147                 }
7148         }
7149
7150         switch (cmd) {
7151         case ADD_NEW_DISK:
7152         {
7153                 mdu_disk_info_t info;
7154                 if (copy_from_user(&info, argp, sizeof(info)))
7155                         err = -EFAULT;
7156                 else
7157                         err = add_new_disk(mddev, &info);
7158                 goto unlock;
7159         }
7160
7161         case CLUSTERED_DISK_NACK:
7162                 if (mddev_is_clustered(mddev))
7163                         md_cluster_ops->new_disk_ack(mddev, false);
7164                 else
7165                         err = -EINVAL;
7166                 goto unlock;
7167
7168         case HOT_ADD_DISK:
7169                 err = hot_add_disk(mddev, new_decode_dev(arg));
7170                 goto unlock;
7171
7172         case RUN_ARRAY:
7173                 err = do_md_run(mddev);
7174                 goto unlock;
7175
7176         case SET_BITMAP_FILE:
7177                 err = set_bitmap_file(mddev, (int)arg);
7178                 goto unlock;
7179
7180         default:
7181                 err = -EINVAL;
7182                 goto unlock;
7183         }
7184
7185 unlock:
7186         if (mddev->hold_active == UNTIL_IOCTL &&
7187             err != -EINVAL)
7188                 mddev->hold_active = 0;
7189         mddev_unlock(mddev);
7190 out:
7191         return err;
7192 }
7193 #ifdef CONFIG_COMPAT
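/*
 * 32-bit compat entry point: commands whose argument is a plain integer
 * are passed through unchanged; everything else is assumed to be a user
 * pointer and converted with compat_ptr() before calling md_ioctl().
 */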
7194 static int md_compat_ioctl(struct block_device *bdev, fmode_t mode,
7195                     unsigned int cmd, unsigned long arg)
7196 {
7197         switch (cmd) {
7198         case HOT_REMOVE_DISK:
7199         case HOT_ADD_DISK:
7200         case SET_DISK_FAULTY:
7201         case SET_BITMAP_FILE:
7202                 /* These take in integer arg, do not convert */
7203                 break;
7204         default:
7205                 arg = (unsigned long)compat_ptr(arg);
7206                 break;
7207         }
7208
7209         return md_ioctl(bdev, mode, cmd, arg);
7210 }
7211 #endif /* CONFIG_COMPAT */
7212
7213 static int md_open(struct block_device *bdev, fmode_t mode)
7214 {
7215         /*
7216          * Succeed if we can lock the mddev, which confirms that
7217          * it isn't being stopped right now.
7218          */
7219         struct mddev *mddev = mddev_find(bdev->bd_dev);
7220         int err;
7221
7222         if (!mddev)
7223                 return -ENODEV;
7224
7225         if (mddev->gendisk != bdev->bd_disk) {
7226                 /* we are racing with mddev_put which is discarding this
7227                  * bd_disk.
7228                  */
7229                 mddev_put(mddev);
7230                 /* Wait until bdev->bd_disk is definitely gone */
7231                 flush_workqueue(md_misc_wq);
7232                 /* Then retry the open from the top */
7233                 return -ERESTARTSYS;
7234         }
7235         BUG_ON(mddev != bdev->bd_disk->private_data);
7236
7237         if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
7238                 goto out;
7239
7240         if (test_bit(MD_CLOSING, &mddev->flags)) {
7241                 mutex_unlock(&mddev->open_mutex);
7242                 err = -ENODEV;
7243                 goto out;
7244         }
7245
7246         err = 0;
7247         atomic_inc(&mddev->openers);
7248         mutex_unlock(&mddev->open_mutex);
7249
7250         check_disk_change(bdev);
7251  out:
7252         if (err)
7253                 mddev_put(mddev);
7254         return err;
7255 }
7256
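/*
 * Drop the opener count taken in md_open() and the mddev reference that
 * goes with it.
 */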
7257 static void md_release(struct gendisk *disk, fmode_t mode)
7258 {
7259         struct mddev *mddev = disk->private_data;
7260
7261         BUG_ON(!mddev);
7262         atomic_dec(&mddev->openers);
7263         mddev_put(mddev);
7264 }
7265
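/*
 * md_media_changed()/md_revalidate() let the block layer notice that an
 * array has been stopped or restarted: mddev->changed is set elsewhere
 * in this file and cleared once the disk has been revalidated.
 */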
7266 static int md_media_changed(struct gendisk *disk)
7267 {
7268         struct mddev *mddev = disk->private_data;
7269
7270         return mddev->changed;
7271 }
7272
7273 static int md_revalidate(struct gendisk *disk)
7274 {
7275         struct mddev *mddev = disk->private_data;
7276
7277         mddev->changed = 0;
7278         return 0;
7279 }
7280 static const struct block_device_operations md_fops =
7281 {
7282         .owner          = THIS_MODULE,
7283         .open           = md_open,
7284         .release        = md_release,
7285         .ioctl          = md_ioctl,
7286 #ifdef CONFIG_COMPAT
7287         .compat_ioctl   = md_compat_ioctl,
7288 #endif
7289         .getgeo         = md_getgeo,
7290         .media_changed  = md_media_changed,
7291         .revalidate_disk= md_revalidate,
7292 };
7293
7294 static int md_thread(void *arg)
7295 {
7296         struct md_thread *thread = arg;
7297
7298         /*
7299          * md_thread is a 'system-thread', its priority should be very
7300          * high. We avoid resource deadlocks individually in each
7301          * raid personality. (RAID5 does preallocation) We also use RR and
7302          * the very same RT priority as kswapd, thus we will never get
7303          * into a priority inversion deadlock.
7304          *
7305          * we definitely have to have equal or higher priority than
7306          * bdflush, otherwise bdflush will deadlock if there are too
7307          * many dirty RAID5 blocks.
7308          */
7309
7310         allow_signal(SIGKILL);
7311         while (!kthread_should_stop()) {
7312
7313                 /* We need to wait INTERRUPTIBLE so that
7314                  * we don't add to the load-average.
7315                  * That means we need to be sure no signals are
7316                  * pending
7317                  */
7318                 if (signal_pending(current))
7319                         flush_signals(current);
7320
7321                 wait_event_interruptible_timeout
7322                         (thread->wqueue,
7323                          test_bit(THREAD_WAKEUP, &thread->flags)
7324                          || kthread_should_stop() || kthread_should_park(),
7325                          thread->timeout);
7326
7327                 clear_bit(THREAD_WAKEUP, &thread->flags);
7328                 if (kthread_should_park())
7329                         kthread_parkme();
7330                 if (!kthread_should_stop())
7331                         thread->run(thread);
7332         }
7333
7334         return 0;
7335 }
7336
7337 void md_wakeup_thread(struct md_thread *thread)
7338 {
7339         if (thread) {
7340                 pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm);
7341                 set_bit(THREAD_WAKEUP, &thread->flags);
7342                 wake_up(&thread->wqueue);
7343         }
7344 }
7345 EXPORT_SYMBOL(md_wakeup_thread);
7346
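/*
 * Create the per-array kernel thread.  The thread is named
 * "<mdname>_<name>" (e.g. "md0_raid1") and calls run() each time it is
 * woken via md_wakeup_thread() or its timeout expires.  A personality
 * typically registers its main thread along these lines (illustrative
 * only):
 *
 *	mddev->thread = md_register_thread(raid1d, mddev, "raid1");
 */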
7347 struct md_thread *md_register_thread(void (*run) (struct md_thread *),
7348                 struct mddev *mddev, const char *name)
7349 {
7350         struct md_thread *thread;
7351
7352         thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL);
7353         if (!thread)
7354                 return NULL;
7355
7356         init_waitqueue_head(&thread->wqueue);
7357
7358         thread->run = run;
7359         thread->mddev = mddev;
7360         thread->timeout = MAX_SCHEDULE_TIMEOUT;
7361         thread->tsk = kthread_run(md_thread, thread,
7362                                   "%s_%s",
7363                                   mdname(thread->mddev),
7364                                   name);
7365         if (IS_ERR(thread->tsk)) {
7366                 kfree(thread);
7367                 return NULL;
7368         }
7369         return thread;
7370 }
7371 EXPORT_SYMBOL(md_register_thread);
7372
7373 void md_unregister_thread(struct md_thread **threadp)
7374 {
7375         struct md_thread *thread = *threadp;
7376         if (!thread)
7377                 return;
7378         pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
7379         /* Locking ensures that mddev_unlock does not wake_up a
7380          * non-existent thread
7381          */
7382         spin_lock(&pers_lock);
7383         *threadp = NULL;
7384         spin_unlock(&pers_lock);
7385
7386         kthread_stop(thread->tsk);
7387         kfree(thread);
7388 }
7389 EXPORT_SYMBOL(md_unregister_thread);
7390
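/*
 * md_error() is called by personalities (and by md itself) when an rdev
 * has failed.  It hands the device to the personality's error_handler
 * and then schedules recovery and a superblock/metadata update.
 */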
7391 void md_error(struct mddev *mddev, struct md_rdev *rdev)
7392 {
7393         if (!rdev || test_bit(Faulty, &rdev->flags))
7394                 return;
7395
7396         if (!mddev->pers || !mddev->pers->error_handler)
7397                 return;
7398         mddev->pers->error_handler(mddev,rdev);
7399         if (mddev->degraded)
7400                 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
7401         sysfs_notify_dirent_safe(rdev->sysfs_state);
7402         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7403         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7404         md_wakeup_thread(mddev->thread);
7405         if (mddev->event_work.func)
7406                 queue_work(md_misc_wq, &mddev->event_work);
7407         md_new_event(mddev);
7408 }
7409 EXPORT_SYMBOL(md_error);
7410
7411 /* seq_file implementation /proc/mdstat */
7412
7413 static void status_unused(struct seq_file *seq)
7414 {
7415         int i = 0;
7416         struct md_rdev *rdev;
7417
7418         seq_printf(seq, "unused devices: ");
7419
7420         list_for_each_entry(rdev, &pending_raid_disks, same_set) {
7421                 char b[BDEVNAME_SIZE];
7422                 i++;
7423                 seq_printf(seq, "%s ",
7424                               bdevname(rdev->bdev,b));
7425         }
7426         if (!i)
7427                 seq_printf(seq, "<none>");
7428
7429         seq_printf(seq, "\n");
7430 }
7431
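/*
 * Print the progress of a resync/recovery/reshape/check for
 * /proc/mdstat and return 1 if a status line was emitted.  The output
 * looks roughly like (illustrative values):
 *
 *	[=========>...........]  resync = 45.3% (123456/272164)
 *	finish=1.2min speed=10240K/sec
 */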
7432 static int status_resync(struct seq_file *seq, struct mddev *mddev)
7433 {
7434         sector_t max_sectors, resync, res;
7435         unsigned long dt, db;
7436         sector_t rt;
7437         int scale;
7438         unsigned int per_milli;
7439
7440         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
7441             test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
7442                 max_sectors = mddev->resync_max_sectors;
7443         else
7444                 max_sectors = mddev->dev_sectors;
7445
7446         resync = mddev->curr_resync;
7447         if (resync <= 3) {
7448                 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
7449                         /* Still cleaning up */
7450                         resync = max_sectors;
7451         } else
7452                 resync -= atomic_read(&mddev->recovery_active);
7453
7454         if (resync == 0) {
7455                 if (mddev->recovery_cp < MaxSector) {
7456                         seq_printf(seq, "\tresync=PENDING");
7457                         return 1;
7458                 }
7459                 return 0;
7460         }
7461         if (resync < 3) {
7462                 seq_printf(seq, "\tresync=DELAYED");
7463                 return 1;
7464         }
7465
7466         WARN_ON(max_sectors == 0);
7467         /* Pick 'scale' such that (resync>>scale)*1000 will fit
7468          * in a sector_t, and (max_sectors>>scale) will fit in a
7469          * u32, as those are the requirements for sector_div.
7470          * Thus 'scale' must be at least 10
7471          */
7472         scale = 10;
7473         if (sizeof(sector_t) > sizeof(unsigned long)) {
7474                 while ( max_sectors/2 > (1ULL<<(scale+32)))
7475                         scale++;
7476         }
7477         res = (resync>>scale)*1000;
7478         sector_div(res, (u32)((max_sectors>>scale)+1));
7479
7480         per_milli = res;
7481         {
7482                 int i, x = per_milli/50, y = 20-x;
7483                 seq_printf(seq, "[");
7484                 for (i = 0; i < x; i++)
7485                         seq_printf(seq, "=");
7486                 seq_printf(seq, ">");
7487                 for (i = 0; i < y; i++)
7488                         seq_printf(seq, ".");
7489                 seq_printf(seq, "] ");
7490         }
7491         seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
7492                    (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
7493                     "reshape" :
7494                     (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
7495                      "check" :
7496                      (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
7497                       "resync" : "recovery"))),
7498                    per_milli/10, per_milli % 10,
7499                    (unsigned long long) resync/2,
7500                    (unsigned long long) max_sectors/2);
7501
7502         /*
7503          * dt: time from mark until now
7504          * db: blocks written from mark until now
7505          * rt: remaining time
7506          *
7507          * rt is a sector_t, so could be 32bit or 64bit.
7508          * So we divide before multiply in case it is 32bit and close
7509          * to the limit.
7510          * We scale the divisor (db) by 32 to avoid losing precision
7511          * near the end of resync when the number of remaining sectors
7512          * is close to 'db'.
7513          * We then divide rt by 32 after multiplying by db to compensate.
7514          * The '+1' avoids division by zero if db is very small.
7515          */
7516         dt = ((jiffies - mddev->resync_mark) / HZ);
7517         if (!dt) dt++;
7518         db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
7519                 - mddev->resync_mark_cnt;
7520
7521         rt = max_sectors - resync;    /* number of remaining sectors */
7522         sector_div(rt, db/32+1);
7523         rt *= dt;
7524         rt >>= 5;
7525
7526         seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
7527                    ((unsigned long)rt % 60)/6);
7528
7529         seq_printf(seq, " speed=%ldK/sec", db/2/dt);
7530         return 1;
7531 }
7532
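/*
 * seq_file iterator for /proc/mdstat.  Position 0 is a header (the
 * "Personalities" line), positions 1..n walk all_mddevs, and a final
 * sentinel prints the "unused devices" trailer.  The magic cookies
 * (void*)1 and (void*)2 mark the header and the tail respectively.
 */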
7533 static void *md_seq_start(struct seq_file *seq, loff_t *pos)
7534 {
7535         struct list_head *tmp;
7536         loff_t l = *pos;
7537         struct mddev *mddev;
7538
7539         if (l >= 0x10000)
7540                 return NULL;
7541         if (!l--)
7542                 /* header */
7543                 return (void*)1;
7544
7545         spin_lock(&all_mddevs_lock);
7546         list_for_each(tmp,&all_mddevs)
7547                 if (!l--) {
7548                         mddev = list_entry(tmp, struct mddev, all_mddevs);
7549                         mddev_get(mddev);
7550                         spin_unlock(&all_mddevs_lock);
7551                         return mddev;
7552                 }
7553         spin_unlock(&all_mddevs_lock);
7554         if (!l--)
7555                 return (void*)2;/* tail */
7556         return NULL;
7557 }
7558
7559 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
7560 {
7561         struct list_head *tmp;
7562         struct mddev *next_mddev, *mddev = v;
7563
7564         ++*pos;
7565         if (v == (void*)2)
7566                 return NULL;
7567
7568         spin_lock(&all_mddevs_lock);
7569         if (v == (void*)1)
7570                 tmp = all_mddevs.next;
7571         else
7572                 tmp = mddev->all_mddevs.next;
7573         if (tmp != &all_mddevs)
7574                 next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs));
7575         else {
7576                 next_mddev = (void*)2;
7577                 *pos = 0x10000;
7578         }
7579         spin_unlock(&all_mddevs_lock);
7580
7581         if (v != (void*)1)
7582                 mddev_put(mddev);
7583         return next_mddev;
7584
7585 }
7586
7587 static void md_seq_stop(struct seq_file *seq, void *v)
7588 {
7589         struct mddev *mddev = v;
7590
7591         if (mddev && v != (void*)1 && v != (void*)2)
7592                 mddev_put(mddev);
7593 }
7594
7595 static int md_seq_show(struct seq_file *seq, void *v)
7596 {
7597         struct mddev *mddev = v;
7598         sector_t sectors;
7599         struct md_rdev *rdev;
7600
7601         if (v == (void*)1) {
7602                 struct md_personality *pers;
7603                 seq_printf(seq, "Personalities : ");
7604                 spin_lock(&pers_lock);
7605                 list_for_each_entry(pers, &pers_list, list)
7606                         seq_printf(seq, "[%s] ", pers->name);
7607
7608                 spin_unlock(&pers_lock);
7609                 seq_printf(seq, "\n");
7610                 seq->poll_event = atomic_read(&md_event_count);
7611                 return 0;
7612         }
7613         if (v == (void*)2) {
7614                 status_unused(seq);
7615                 return 0;
7616         }
7617
7618         spin_lock(&mddev->lock);
7619         if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
7620                 seq_printf(seq, "%s : %sactive", mdname(mddev),
7621                                                 mddev->pers ? "" : "in");
7622                 if (mddev->pers) {
7623                         if (mddev->ro==1)
7624                                 seq_printf(seq, " (read-only)");
7625                         if (mddev->ro==2)
7626                                 seq_printf(seq, " (auto-read-only)");
7627                         seq_printf(seq, " %s", mddev->pers->name);
7628                 }
7629
7630                 sectors = 0;
7631                 rcu_read_lock();
7632                 rdev_for_each_rcu(rdev, mddev) {
7633                         char b[BDEVNAME_SIZE];
7634                         seq_printf(seq, " %s[%d]",
7635                                 bdevname(rdev->bdev,b), rdev->desc_nr);
7636                         if (test_bit(WriteMostly, &rdev->flags))
7637                                 seq_printf(seq, "(W)");
7638                         if (test_bit(Journal, &rdev->flags))
7639                                 seq_printf(seq, "(J)");
7640                         if (test_bit(Faulty, &rdev->flags)) {
7641                                 seq_printf(seq, "(F)");
7642                                 continue;
7643                         }
7644                         if (rdev->raid_disk < 0)
7645                                 seq_printf(seq, "(S)"); /* spare */
7646                         if (test_bit(Replacement, &rdev->flags))
7647                                 seq_printf(seq, "(R)");
7648                         sectors += rdev->sectors;
7649                 }
7650                 rcu_read_unlock();
7651
7652                 if (!list_empty(&mddev->disks)) {
7653                         if (mddev->pers)
7654                                 seq_printf(seq, "\n      %llu blocks",
7655                                            (unsigned long long)
7656                                            mddev->array_sectors / 2);
7657                         else
7658                                 seq_printf(seq, "\n      %llu blocks",
7659                                            (unsigned long long)sectors / 2);
7660                 }
7661                 if (mddev->persistent) {
7662                         if (mddev->major_version != 0 ||
7663                             mddev->minor_version != 90) {
7664                                 seq_printf(seq," super %d.%d",
7665                                            mddev->major_version,
7666                                            mddev->minor_version);
7667                         }
7668                 } else if (mddev->external)
7669                         seq_printf(seq, " super external:%s",
7670                                    mddev->metadata_type);
7671                 else
7672                         seq_printf(seq, " super non-persistent");
7673
7674                 if (mddev->pers) {
7675                         mddev->pers->status(seq, mddev);
7676                         seq_printf(seq, "\n      ");
7677                         if (mddev->pers->sync_request) {
7678                                 if (status_resync(seq, mddev))
7679                                         seq_printf(seq, "\n      ");
7680                         }
7681                 } else
7682                         seq_printf(seq, "\n       ");
7683
7684                 bitmap_status(seq, mddev->bitmap);
7685
7686                 seq_printf(seq, "\n");
7687         }
7688         spin_unlock(&mddev->lock);
7689
7690         return 0;
7691 }
7692
7693 static const struct seq_operations md_seq_ops = {
7694         .start  = md_seq_start,
7695         .next   = md_seq_next,
7696         .stop   = md_seq_stop,
7697         .show   = md_seq_show,
7698 };
7699
7700 static int md_seq_open(struct inode *inode, struct file *file)
7701 {
7702         struct seq_file *seq;
7703         int error;
7704
7705         error = seq_open(file, &md_seq_ops);
7706         if (error)
7707                 return error;
7708
7709         seq = file->private_data;
7710         seq->poll_event = atomic_read(&md_event_count);
7711         return error;
7712 }
7713
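/*
 * md_unloading is set while the module is being removed so pollers are
 * not left waiting.  Polling /proc/mdstat reports POLLERR|POLLPRI once
 * md_event_count has moved on from the value sampled when the file was
 * opened, letting user space (e.g. mdadm --monitor) wait for changes.
 */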
7714 static int md_unloading;
7715 static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
7716 {
7717         struct seq_file *seq = filp->private_data;
7718         int mask;
7719
7720         if (md_unloading)
7721                 return POLLIN|POLLRDNORM|POLLERR|POLLPRI;
7722         poll_wait(filp, &md_event_waiters, wait);
7723
7724         /* always allow read */
7725         mask = POLLIN | POLLRDNORM;
7726
7727         if (seq->poll_event != atomic_read(&md_event_count))
7728                 mask |= POLLERR | POLLPRI;
7729         return mask;
7730 }
7731
7732 static const struct file_operations md_seq_fops = {
7733         .owner          = THIS_MODULE,
7734         .open           = md_seq_open,
7735         .read           = seq_read,
7736         .llseek         = seq_lseek,
7737         .release        = seq_release_private,
7738         .poll           = mdstat_poll,
7739 };
7740
7741 int register_md_personality(struct md_personality *p)
7742 {
7743         pr_debug("md: %s personality registered for level %d\n",
7744                  p->name, p->level);
7745         spin_lock(&pers_lock);
7746         list_add_tail(&p->list, &pers_list);
7747         spin_unlock(&pers_lock);
7748         return 0;
7749 }
7750 EXPORT_SYMBOL(register_md_personality);
7751
7752 int unregister_md_personality(struct md_personality *p)
7753 {
7754         pr_debug("md: %s personality unregistered\n", p->name);
7755         spin_lock(&pers_lock);
7756         list_del_init(&p->list);
7757         spin_unlock(&pers_lock);
7758         return 0;
7759 }
7760 EXPORT_SYMBOL(unregister_md_personality);
7761
7762 int register_md_cluster_operations(struct md_cluster_operations *ops,
7763                                    struct module *module)
7764 {
7765         int ret = 0;
7766         spin_lock(&pers_lock);
7767         if (md_cluster_ops != NULL)
7768                 ret = -EALREADY;
7769         else {
7770                 md_cluster_ops = ops;
7771                 md_cluster_mod = module;
7772         }
7773         spin_unlock(&pers_lock);
7774         return ret;
7775 }
7776 EXPORT_SYMBOL(register_md_cluster_operations);
7777
7778 int unregister_md_cluster_operations(void)
7779 {
7780         spin_lock(&pers_lock);
7781         md_cluster_ops = NULL;
7782         spin_unlock(&pers_lock);
7783         return 0;
7784 }
7785 EXPORT_SYMBOL(unregister_md_cluster_operations);
7786
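/*
 * Load and pin the md-cluster module (if available) and join the
 * cluster with the requested number of nodes.
 */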
7787 int md_setup_cluster(struct mddev *mddev, int nodes)
7788 {
7789         if (!md_cluster_ops)
7790                 request_module("md-cluster");
7791         spin_lock(&pers_lock);
7792         /* ensure module won't be unloaded */
7793         if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
7794                 pr_warn("can't find md-cluster module or get its reference.\n");
7795                 spin_unlock(&pers_lock);
7796                 return -ENOENT;
7797         }
7798         spin_unlock(&pers_lock);
7799
7800         return md_cluster_ops->join(mddev, nodes);
7801 }
7802
7803 void md_cluster_stop(struct mddev *mddev)
7804 {
7805         if (!md_cluster_ops)
7806                 return;
7807         md_cluster_ops->leave(mddev);
7808         module_put(md_cluster_mod);
7809 }
7810
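/*
 * Return 1 if no non-resync IO appears to be happening on the member
 * devices.  The heuristic compares each disk's sector counters against
 * the IO submitted by resync itself (sync_io); see the comment in the
 * loop below.
 */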
7811 static int is_mddev_idle(struct mddev *mddev, int init)
7812 {
7813         struct md_rdev *rdev;
7814         int idle;
7815         int curr_events;
7816
7817         idle = 1;
7818         rcu_read_lock();
7819         rdev_for_each_rcu(rdev, mddev) {
7820                 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
7821                 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
7822                               (int)part_stat_read(&disk->part0, sectors[1]) -
7823                               atomic_read(&disk->sync_io);
7824                 /* sync IO will cause sync_io to increase before the disk_stats
7825                  * as sync_io is counted when a request starts, and
7826                  * disk_stats is counted when it completes.
7827                  * So resync activity will cause curr_events to be smaller than
7828                  * when there was no such activity.
7829                  * non-sync IO will cause disk_stats to increase without
7830                  * increasing sync_io so curr_events will (eventually)
7831                  * be larger than it was before.  Once it becomes
7832                  * substantially larger, the test below will cause
7833                  * the array to appear non-idle, and resync will slow
7834                  * down.
7835                  * If there is a lot of outstanding resync activity when
7836                  * we set last_event to curr_events, then all that activity
7837                  * completing might cause the array to appear non-idle
7838                  * and resync will be slowed down even though there might
7839                  * not have been non-resync activity.  This will only
7840                  * happen once though.  'last_events' will soon reflect
7841                  * the state where there is little or no outstanding
7842                  * resync requests, and further resync activity will
7843                  * always make curr_events less than last_events.
7844                  *
7845                  */
7846                 if (init || curr_events - rdev->last_events > 64) {
7847                         rdev->last_events = curr_events;
7848                         idle = 0;
7849                 }
7850         }
7851         rcu_read_unlock();
7852         return idle;
7853 }
7854
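/*
 * Called by the personality when a chunk of resync IO completes:
 * account for the finished sectors and, on error, flag the resync as
 * interrupted.
 */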
7855 void md_done_sync(struct mddev *mddev, int blocks, int ok)
7856 {
7857         /* another "blocks" (512byte) blocks have been synced */
7858         atomic_sub(blocks, &mddev->recovery_active);
7859         wake_up(&mddev->recovery_wait);
7860         if (!ok) {
7861                 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7862                 set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
7863                 md_wakeup_thread(mddev->thread);
7864                 /* stop recovery, signal do_sync .... */
7865         }
7866 }
7867 EXPORT_SYMBOL(md_done_sync);
7868
7869 /* md_write_start(mddev, bi)
7870  * If we need to update some array metadata (e.g. 'active' flag
7871  * in superblock) before writing, schedule a superblock update
7872  * and wait for it to complete.
7873  */
7874 void md_write_start(struct mddev *mddev, struct bio *bi)
7875 {
7876         int did_change = 0;
7877         if (bio_data_dir(bi) != WRITE)
7878                 return;
7879
7880         BUG_ON(mddev->ro == 1);
7881         if (mddev->ro == 2) {
7882                 /* need to switch to read/write */
7883                 mddev->ro = 0;
7884                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7885                 md_wakeup_thread(mddev->thread);
7886                 md_wakeup_thread(mddev->sync_thread);
7887                 did_change = 1;
7888         }
7889         atomic_inc(&mddev->writes_pending);
7890         if (mddev->safemode == 1)
7891                 mddev->safemode = 0;
7892         if (mddev->in_sync) {
7893                 spin_lock(&mddev->lock);
7894                 if (mddev->in_sync) {
7895                         mddev->in_sync = 0;
7896                         set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
7897                         set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
7898                         md_wakeup_thread(mddev->thread);
7899                         did_change = 1;
7900                 }
7901                 spin_unlock(&mddev->lock);
7902         }
7903         if (did_change)
7904                 sysfs_notify_dirent_safe(mddev->sysfs_state);
7905         wait_event(mddev->sb_wait,
7906                    !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
7907 }
7908 EXPORT_SYMBOL(md_write_start);
7909
7910 /* md_write_inc can only be called when md_write_start() has
7911  * already been called at least once for the current request.
7912  * It increments the counter and is useful when a single request
7913  * is split into several parts.  Each part causes an increment and
7914  * so needs a matching md_write_end().
7915  * Unlike md_write_start(), it is safe to call md_write_inc() inside
7916  * a spinlocked region.
7917  */
7918 void md_write_inc(struct mddev *mddev, struct bio *bi)
7919 {
7920         if (bio_data_dir(bi) != WRITE)
7921                 return;
7922         WARN_ON_ONCE(mddev->in_sync || mddev->ro);
7923         atomic_inc(&mddev->writes_pending);
7924 }
7925 EXPORT_SYMBOL(md_write_inc);
7926
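/*
 * Pairs with md_write_start()/md_write_inc().  When the last pending
 * write drains, arrange for the array to be marked clean again, either
 * immediately (safemode == 2) or after safemode_delay via the timer.
 */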
7927 void md_write_end(struct mddev *mddev)
7928 {
7929         if (atomic_dec_and_test(&mddev->writes_pending)) {
7930                 if (mddev->safemode == 2)
7931                         md_wakeup_thread(mddev->thread);
7932                 else if (mddev->safemode_delay)
7933                         mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
7934         }
7935 }
7936 EXPORT_SYMBOL(md_write_end);
7937
7938 /* md_allow_write(mddev)
7939  * Calling this ensures that the array is marked 'active' so that writes
7940  * may proceed without blocking.  It is important to call this before
7941  * attempting a GFP_KERNEL allocation while holding the mddev lock.
7942  * Must be called with mddev_lock held.
7943  *
7944  * In the ->external case MD_SB_CHANGE_PENDING can not be cleared until mddev->lock
7945  * is dropped, so return -EAGAIN after notifying userspace.
7946  */
7947 int md_allow_write(struct mddev *mddev)
7948 {
7949         if (!mddev->pers)
7950                 return 0;
7951         if (mddev->ro)
7952                 return 0;
7953         if (!mddev->pers->sync_request)
7954                 return 0;
7955
7956         spin_lock(&mddev->lock);
7957         if (mddev->in_sync) {
7958                 mddev->in_sync = 0;
7959                 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
7960                 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
7961                 if (mddev->safemode_delay &&
7962                     mddev->safemode == 0)
7963                         mddev->safemode = 1;
7964                 spin_unlock(&mddev->lock);
7965                 md_update_sb(mddev, 0);
7966                 sysfs_notify_dirent_safe(mddev->sysfs_state);
7967         } else
7968                 spin_unlock(&mddev->lock);
7969
7970         if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
7971                 return -EAGAIN;
7972         else
7973                 return 0;
7974 }
7975 EXPORT_SYMBOL_GPL(md_allow_write);
7976
7977 #define SYNC_MARKS      10
7978 #define SYNC_MARK_STEP  (3*HZ)
7979 #define UPDATE_FREQUENCY (5*60*HZ)
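/*
 * Main resync/recovery loop, run from the per-array sync thread.  Speed
 * is throttled between speed_min() and speed_max() using a sliding
 * window of SYNC_MARKS marks taken every SYNC_MARK_STEP jiffies, and
 * progress is checkpointed roughly every UPDATE_FREQUENCY jiffies.
 */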
7980 void md_do_sync(struct md_thread *thread)
7981 {
7982         struct mddev *mddev = thread->mddev;
7983         struct mddev *mddev2;
7984         unsigned int currspeed = 0,
7985                  window;
7986         sector_t max_sectors,j, io_sectors, recovery_done;
7987         unsigned long mark[SYNC_MARKS];
7988         unsigned long update_time;
7989         sector_t mark_cnt[SYNC_MARKS];
7990         int last_mark,m;
7991         struct list_head *tmp;
7992         sector_t last_check;
7993         int skipped = 0;
7994         struct md_rdev *rdev;
7995         char *desc, *action = NULL;
7996         struct blk_plug plug;
7997         int ret;
7998
7999         /* just in case the thread restarts... */
8000         if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
8001                 return;
8002         if (mddev->ro) {/* never try to sync a read-only array */
8003                 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8004                 return;
8005         }
8006
8007         if (mddev_is_clustered(mddev)) {
8008                 ret = md_cluster_ops->resync_start(mddev);
8009                 if (ret)
8010                         goto skip;
8011
8012                 set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags);
8013                 if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
8014                         test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ||
8015                         test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
8016                      && ((unsigned long long)mddev->curr_resync_completed
8017                          < (unsigned long long)mddev->resync_max_sectors))
8018                         goto skip;
8019         }
8020
8021         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
8022                 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
8023                         desc = "data-check";
8024                         action = "check";
8025                 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
8026                         desc = "requested-resync";
8027                         action = "repair";
8028                 } else
8029                         desc = "resync";
8030         } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
8031                 desc = "reshape";
8032         else
8033                 desc = "recovery";
8034
8035         mddev->last_sync_action = action ?: desc;
8036
8037         /* we overload curr_resync somewhat here.
8038          * 0 == not engaged in resync at all
8039          * 2 == checking that there is no conflict with another sync
8040          * 1 == like 2, but have yielded to allow conflicting resync to
8041          *              commense
8042          *              commence
8043          *
8044          * Before starting a resync we must have set curr_resync to
8045          * 2, and then checked that every "conflicting" array has curr_resync
8046          * less than ours.  When we find one that is the same or higher
8047          * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
8048          * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
8049          * This will mean we have to start checking from the beginning again.
8050          *
8051          */
8052
8053         do {
8054                 int mddev2_minor = -1;
8055                 mddev->curr_resync = 2;
8056
8057         try_again:
8058                 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8059                         goto skip;
8060                 for_each_mddev(mddev2, tmp) {
8061                         if (mddev2 == mddev)
8062                                 continue;
8063                         if (!mddev->parallel_resync
8064                         &&  mddev2->curr_resync
8065                         &&  match_mddev_units(mddev, mddev2)) {
8066                                 DEFINE_WAIT(wq);
8067                                 if (mddev < mddev2 && mddev->curr_resync == 2) {
8068                                         /* arbitrarily yield */
8069                                         mddev->curr_resync = 1;
8070                                         wake_up(&resync_wait);
8071                                 }
8072                                 if (mddev > mddev2 && mddev->curr_resync == 1)
8073                                         /* no need to wait here, we can wait the next
8074                                          * time 'round when curr_resync == 2
8075                                          */
8076                                         continue;
8077                                 /* We need to wait 'interruptible' so as not to
8078                                  * contribute to the load average, and not to
8079                                  * be caught by 'softlockup'
8080                                  */
8081                                 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
8082                                 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
8083                                     mddev2->curr_resync >= mddev->curr_resync) {
8084                                         if (mddev2_minor != mddev2->md_minor) {
8085                                                 mddev2_minor = mddev2->md_minor;
8086                                                 pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n",
8087                                                         desc, mdname(mddev),
8088                                                         mdname(mddev2));
8089                                         }
8090                                         mddev_put(mddev2);
8091                                         if (signal_pending(current))
8092                                                 flush_signals(current);
8093                                         schedule();
8094                                         finish_wait(&resync_wait, &wq);
8095                                         goto try_again;
8096                                 }
8097                                 finish_wait(&resync_wait, &wq);
8098                         }
8099                 }
8100         } while (mddev->curr_resync < 2);
8101
8102         j = 0;
8103         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
8104                 /* resync follows the size requested by the personality,
8105                  * which defaults to physical size, but can be virtual size
8106                  */
8107                 max_sectors = mddev->resync_max_sectors;
8108                 atomic64_set(&mddev->resync_mismatches, 0);
8109                 /* we don't use the checkpoint if there's a bitmap */
8110                 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
8111                         j = mddev->resync_min;
8112                 else if (!mddev->bitmap)
8113                         j = mddev->recovery_cp;
8114
8115         } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
8116                 max_sectors = mddev->resync_max_sectors;
8117         else {
8118                 /* recovery follows the physical size of devices */
8119                 max_sectors = mddev->dev_sectors;
8120                 j = MaxSector;
8121                 rcu_read_lock();
8122                 rdev_for_each_rcu(rdev, mddev)
8123                         if (rdev->raid_disk >= 0 &&
8124                             !test_bit(Journal, &rdev->flags) &&
8125                             !test_bit(Faulty, &rdev->flags) &&
8126                             !test_bit(In_sync, &rdev->flags) &&
8127                             rdev->recovery_offset < j)
8128                                 j = rdev->recovery_offset;
8129                 rcu_read_unlock();
8130
8131                 /* If there is a bitmap, we need to make sure all
8132                  * writes that started before we added a spare
8133                  * complete before we start doing a recovery.
8134                  * Otherwise the write might complete and (via
8135                  * bitmap_endwrite) set a bit in the bitmap after the
8136                  * recovery has checked that bit and skipped that
8137                  * region.
8138                  */
8139                 if (mddev->bitmap) {
8140                         mddev->pers->quiesce(mddev, 1);
8141                         mddev->pers->quiesce(mddev, 0);
8142                 }
8143         }
8144
8145         pr_info("md: %s of RAID array %s\n", desc, mdname(mddev));
8146         pr_debug("md: minimum _guaranteed_  speed: %d KB/sec/disk.\n", speed_min(mddev));
8147         pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n",
8148                  speed_max(mddev), desc);
8149
8150         is_mddev_idle(mddev, 1); /* this initializes IO event counters */
8151
8152         io_sectors = 0;
8153         for (m = 0; m < SYNC_MARKS; m++) {
8154                 mark[m] = jiffies;
8155                 mark_cnt[m] = io_sectors;
8156         }
8157         last_mark = 0;
8158         mddev->resync_mark = mark[last_mark];
8159         mddev->resync_mark_cnt = mark_cnt[last_mark];
8160
8161         /*
8162          * Tune reconstruction:
8163          */
8164         window = 32*(PAGE_SIZE/512);
8165         pr_debug("md: using %dk window, over a total of %lluk.\n",
8166                  window/2, (unsigned long long)max_sectors/2);
8167
8168         atomic_set(&mddev->recovery_active, 0);
8169         last_check = 0;
8170
8171         if (j>2) {
8172                 pr_debug("md: resuming %s of %s from checkpoint.\n",
8173                          desc, mdname(mddev));
8174                 mddev->curr_resync = j;
8175         } else
8176                 mddev->curr_resync = 3; /* no longer delayed */
8177         mddev->curr_resync_completed = j;
8178         sysfs_notify(&mddev->kobj, NULL, "sync_completed");
8179         md_new_event(mddev);
8180         update_time = jiffies;
8181
8182         blk_start_plug(&plug);
8183         while (j < max_sectors) {
8184                 sector_t sectors;
8185
8186                 skipped = 0;
8187
8188                 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
8189                     ((mddev->curr_resync > mddev->curr_resync_completed &&
8190                       (mddev->curr_resync - mddev->curr_resync_completed)
8191                       > (max_sectors >> 4)) ||
8192                      time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
8193                      (j - mddev->curr_resync_completed)*2
8194                      >= mddev->resync_max - mddev->curr_resync_completed ||
8195                      mddev->curr_resync_completed > mddev->resync_max
8196                             )) {
8197                         /* time to update curr_resync_completed */
8198                         wait_event(mddev->recovery_wait,
8199                                    atomic_read(&mddev->recovery_active) == 0);
8200                         mddev->curr_resync_completed = j;
8201                         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
8202                             j > mddev->recovery_cp)
8203                                 mddev->recovery_cp = j;
8204                         update_time = jiffies;
8205                         set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8206                         sysfs_notify(&mddev->kobj, NULL, "sync_completed");
8207                 }
8208
8209                 while (j >= mddev->resync_max &&
8210                        !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8211                         /* As this condition is controlled by user-space,
8212                          * we can block indefinitely, so use '_interruptible'
8213                          * to avoid triggering warnings.
8214                          */
8215                         flush_signals(current); /* just in case */
8216                         wait_event_interruptible(mddev->recovery_wait,
8217                                                  mddev->resync_max > j
8218                                                  || test_bit(MD_RECOVERY_INTR,
8219                                                              &mddev->recovery));
8220                 }
8221
8222                 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8223                         break;
8224
8225                 sectors = mddev->pers->sync_request(mddev, j, &skipped);
8226                 if (sectors == 0) {
8227                         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8228                         break;
8229                 }
8230
8231                 if (!skipped) { /* actual IO requested */
8232                         io_sectors += sectors;
8233                         atomic_add(sectors, &mddev->recovery_active);
8234                 }
8235
8236                 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8237                         break;
8238
8239                 j += sectors;
8240                 if (j > max_sectors)
8241                         /* when skipping, extra large numbers can be returned. */
8242                         j = max_sectors;
8243                 if (j > 2)
8244                         mddev->curr_resync = j;
8245                 mddev->curr_mark_cnt = io_sectors;
8246                 if (last_check == 0)
8247                         /* this is the earliest that rebuild will be
8248                          * visible in /proc/mdstat
8249                          */
8250                         md_new_event(mddev);
8251
8252                 if (last_check + window > io_sectors || j == max_sectors)
8253                         continue;
8254
8255                 last_check = io_sectors;
8256         repeat:
8257                 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
8258                         /* step marks */
8259                         int next = (last_mark+1) % SYNC_MARKS;
8260
8261                         mddev->resync_mark = mark[next];
8262                         mddev->resync_mark_cnt = mark_cnt[next];
8263                         mark[next] = jiffies;
8264                         mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
8265                         last_mark = next;
8266                 }
8267
8268                 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8269                         break;
8270
8271                 /*
8272                  * this loop exits only when we are slower than the
8273                  * 'hard' speed limit, or the system was IO-idle for
8274                  * a jiffy.
8275                  * The system might be non-idle CPU-wise, but we only care
8276                  * about not overloading the IO subsystem. (things like an
8277                  * e2fsck being done on the RAID array should execute fast)
8278                  */
8279                 cond_resched();
8280
8281                 recovery_done = io_sectors - atomic_read(&mddev->recovery_active);
8282                 currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2
8283                         /((jiffies-mddev->resync_mark)/HZ +1) +1;
8284
8285                 if (currspeed > speed_min(mddev)) {
8286                         if (currspeed > speed_max(mddev)) {
8287                                 msleep(500);
8288                                 goto repeat;
8289                         }
8290                         if (!is_mddev_idle(mddev, 0)) {
8291                                 /*
8292                                  * Give other IO more of a chance.
8293                                  * The faster the devices, the less we wait.
8294                                  */
8295                                 wait_event(mddev->recovery_wait,
8296                                            !atomic_read(&mddev->recovery_active));
8297                         }
8298                 }
8299         }
8300         pr_info("md: %s: %s %s.\n",mdname(mddev), desc,
8301                 test_bit(MD_RECOVERY_INTR, &mddev->recovery)
8302                 ? "interrupted" : "done");
8303         /*
8304          * this also signals 'finished resyncing' to md_stop
8305          */
8306         blk_finish_plug(&plug);
8307         wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
8308
8309         if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
8310             !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
8311             mddev->curr_resync > 3) {
8312                 mddev->curr_resync_completed = mddev->curr_resync;
8313                 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
8314         }
8315         mddev->pers->sync_request(mddev, max_sectors, &skipped);
8316
8317         if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
8318             mddev->curr_resync > 3) {
8319                 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
8320                         if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8321                                 if (mddev->curr_resync >= mddev->recovery_cp) {
8322                                         pr_debug("md: checkpointing %s of %s.\n",
8323                                                  desc, mdname(mddev));
8324                                         if (test_bit(MD_RECOVERY_ERROR,
8325                                                 &mddev->recovery))
8326                                                 mddev->recovery_cp =
8327                                                         mddev->curr_resync_completed;
8328                                         else
8329                                                 mddev->recovery_cp =
8330                                                         mddev->curr_resync;
8331                                 }
8332                         } else
8333                                 mddev->recovery_cp = MaxSector;
8334                 } else {
8335                         if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8336                                 mddev->curr_resync = MaxSector;
8337                         rcu_read_lock();
8338                         rdev_for_each_rcu(rdev, mddev)
8339                                 if (rdev->raid_disk >= 0 &&
8340                                     mddev->delta_disks >= 0 &&
8341                                     !test_bit(Journal, &rdev->flags) &&
8342                                     !test_bit(Faulty, &rdev->flags) &&
8343                                     !test_bit(In_sync, &rdev->flags) &&
8344                                     rdev->recovery_offset < mddev->curr_resync)
8345                                         rdev->recovery_offset = mddev->curr_resync;
8346                         rcu_read_unlock();
8347                 }
8348         }
8349  skip:
8350         /* set CHANGE_PENDING here since another update may be needed
8351          * so that other nodes are informed. It should be harmless for
8352          * a normal (non-clustered) array */
8353         set_mask_bits(&mddev->sb_flags, 0,
8354                       BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));
8355
8356         spin_lock(&mddev->lock);
8357         if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8358                 /* We completed so min/max setting can be forgotten if used. */
8359                 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
8360                         mddev->resync_min = 0;
8361                 mddev->resync_max = MaxSector;
8362         } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
8363                 mddev->resync_min = mddev->curr_resync_completed;
8364         set_bit(MD_RECOVERY_DONE, &mddev->recovery);
8365         mddev->curr_resync = 0;
8366         spin_unlock(&mddev->lock);
8367
8368         wake_up(&resync_wait);
8369         md_wakeup_thread(mddev->thread);
8370         return;
8371 }
8372 EXPORT_SYMBOL_GPL(md_do_sync);
8373
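/*
 * Remove any failed devices that have no pending IO, then add any usable
 * spares.  If "this" is non-NULL only that one rdev is considered.  Returns
 * the number of spares that recovery can now make use of.
 */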
8374 static int remove_and_add_spares(struct mddev *mddev,
8375                                  struct md_rdev *this)
8376 {
8377         struct md_rdev *rdev;
8378         int spares = 0;
8379         int removed = 0;
8380         bool remove_some = false;
8381
8382         rdev_for_each(rdev, mddev) {
8383                 if ((this == NULL || rdev == this) &&
8384                     rdev->raid_disk >= 0 &&
8385                     !test_bit(Blocked, &rdev->flags) &&
8386                     test_bit(Faulty, &rdev->flags) &&
8387                     atomic_read(&rdev->nr_pending)==0) {
8388                         /* Faulty non-Blocked devices with nr_pending == 0
8389                          * never get nr_pending incremented,
8390                          * never get Faulty cleared, and never get Blocked set.
8391                          * So we can synchronize_rcu now rather than once per device
8392                          */
8393                         remove_some = true;
8394                         set_bit(RemoveSynchronized, &rdev->flags);
8395                 }
8396         }
8397
8398         if (remove_some)
8399                 synchronize_rcu();
8400         rdev_for_each(rdev, mddev) {
8401                 if ((this == NULL || rdev == this) &&
8402                     rdev->raid_disk >= 0 &&
8403                     !test_bit(Blocked, &rdev->flags) &&
8404                     ((test_bit(RemoveSynchronized, &rdev->flags) ||
8405                      (!test_bit(In_sync, &rdev->flags) &&
8406                       !test_bit(Journal, &rdev->flags))) &&
8407                     atomic_read(&rdev->nr_pending)==0)) {
8408                         if (mddev->pers->hot_remove_disk(
8409                                     mddev, rdev) == 0) {
8410                                 sysfs_unlink_rdev(mddev, rdev);
8411                                 rdev->raid_disk = -1;
8412                                 removed++;
8413                         }
8414                 }
8415                 if (remove_some && test_bit(RemoveSynchronized, &rdev->flags))
8416                         clear_bit(RemoveSynchronized, &rdev->flags);
8417         }
8418
8419         if (removed && mddev->kobj.sd)
8420                 sysfs_notify(&mddev->kobj, NULL, "degraded");
8421
8422         if (this && removed)
8423                 goto no_add;
8424
8425         rdev_for_each(rdev, mddev) {
8426                 if (this && this != rdev)
8427                         continue;
8428                 if (test_bit(Candidate, &rdev->flags))
8429                         continue;
8430                 if (rdev->raid_disk >= 0 &&
8431                     !test_bit(In_sync, &rdev->flags) &&
8432                     !test_bit(Journal, &rdev->flags) &&
8433                     !test_bit(Faulty, &rdev->flags))
8434                         spares++;
8435                 if (rdev->raid_disk >= 0)
8436                         continue;
8437                 if (test_bit(Faulty, &rdev->flags))
8438                         continue;
8439                 if (!test_bit(Journal, &rdev->flags)) {
8440                         if (mddev->ro &&
8441                             ! (rdev->saved_raid_disk >= 0 &&
8442                                !test_bit(Bitmap_sync, &rdev->flags)))
8443                                 continue;
8444
8445                         rdev->recovery_offset = 0;
8446                 }
8447                 if (mddev->pers->
8448                     hot_add_disk(mddev, rdev) == 0) {
8449                         if (sysfs_link_rdev(mddev, rdev))
8450                                 /* failure here is OK */;
8451                         if (!test_bit(Journal, &rdev->flags))
8452                                 spares++;
8453                         md_new_event(mddev);
8454                         set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8455                 }
8456         }
8457 no_add:
8458         if (removed)
8459                 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8460         return spares;
8461 }
8462
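/*
 * Worker run from md_misc_wq (queued by md_check_recovery() via
 * mddev->del_work) to actually start the resync/recovery thread.
 */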
8463 static void md_start_sync(struct work_struct *ws)
8464 {
8465         struct mddev *mddev = container_of(ws, struct mddev, del_work);
8466
8467         mddev->sync_thread = md_register_thread(md_do_sync,
8468                                                 mddev,
8469                                                 "resync");
8470         if (!mddev->sync_thread) {
8471                 pr_warn("%s: could not start resync thread...\n",
8472                         mdname(mddev));
8473                 /* leave the spares where they are, it shouldn't hurt */
8474                 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8475                 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
8476                 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
8477                 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
8478                 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8479                 wake_up(&resync_wait);
8480                 if (test_and_clear_bit(MD_RECOVERY_RECOVER,
8481                                        &mddev->recovery))
8482                         if (mddev->sysfs_action)
8483                                 sysfs_notify_dirent_safe(mddev->sysfs_action);
8484         } else
8485                 md_wakeup_thread(mddev->sync_thread);
8486         sysfs_notify_dirent_safe(mddev->sysfs_action);
8487         md_new_event(mddev);
8488 }
8489
8490 /*
8491  * This routine is regularly called by all per-raid-array threads to
8492  * deal with generic issues like resync and super-block update.
8493  * Raid personalities that don't have a thread (linear/raid0) do not
8494  * need this as they never do any recovery or update the superblock.
8495  *
8496  * It does not do any resync itself, but rather "forks" off other threads
8497  * to do that as needed.
8498  * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
8499  * "->recovery" and create a thread at ->sync_thread.
8500  * When the thread finishes it sets MD_RECOVERY_DONE
8501  * and wakes up this thread, which will reap the thread and finish up.
8502  * This thread also removes any faulty devices (with nr_pending == 0).
8503  *
8504  * The overall approach is:
8505  *  1/ if the superblock needs updating, update it.
8506  *  2/ If a recovery thread is running, don't do anything else.
8507  *  3/ If recovery has finished, clean up, possibly marking spares active.
8508  *  4/ If there are any faulty devices, remove them.
8509  *  5/ If the array is degraded, try to add spare devices.
8510  *  6/ If array has spares or is not in-sync, start a resync thread.
8511  */
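/*
 * A minimal sketch of how recovery is normally requested (this is the same
 * pattern check_sb_changes() uses further below when another cluster node
 * activates a spare):
 *
 *	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 *	md_wakeup_thread(mddev->thread);
 *
 * The per-array thread then ends up in md_check_recovery(), which queues
 * md_start_sync() and thereby forks the md_do_sync() thread when a resync
 * or recovery is actually needed.
 */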
8512 void md_check_recovery(struct mddev *mddev)
8513 {
8514         if (mddev->suspended)
8515                 return;
8516
8517         if (mddev->bitmap)
8518                 bitmap_daemon_work(mddev);
8519
8520         if (signal_pending(current)) {
8521                 if (mddev->pers->sync_request && !mddev->external) {
8522                         pr_debug("md: %s in immediate safe mode\n",
8523                                  mdname(mddev));
8524                         mddev->safemode = 2;
8525                 }
8526                 flush_signals(current);
8527         }
8528
8529         if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
8530                 return;
8531         if ( ! (
8532                 (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
8533                 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
8534                 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
8535                 (mddev->external == 0 && mddev->safemode == 1) ||
8536                 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
8537                  && !mddev->in_sync && mddev->recovery_cp == MaxSector)
8538                 ))
8539                 return;
8540
8541         if (mddev_trylock(mddev)) {
8542                 int spares = 0;
8543
8544                 if (mddev->ro) {
8545                         struct md_rdev *rdev;
8546                         if (!mddev->external && mddev->in_sync)
8547                                 /* 'Blocked' flag not needed as failed devices
8548                                  * will be recorded if array switched to read/write.
8549                                  * Leaving it set will prevent the device
8550                                  * from being removed.
8551                                  */
8552                                 rdev_for_each(rdev, mddev)
8553                                         clear_bit(Blocked, &rdev->flags);
8554                         /* On a read-only array we can:
8555                          * - remove failed devices
8556                          * - add already-in_sync devices if the array itself
8557                          *   is in-sync.
8558                          * As we only add devices that are already in-sync,
8559                          * we can activate the spares immediately.
8560                          */
8561                         remove_and_add_spares(mddev, NULL);
8562                         /* There is no thread, but we need to call
8563                          * ->spare_active and clear saved_raid_disk
8564                          */
8565                         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8566                         md_reap_sync_thread(mddev);
8567                         clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
8568                         clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8569                         clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8570                         goto unlock;
8571                 }
8572
8573                 if (mddev_is_clustered(mddev)) {
8574                         struct md_rdev *rdev;
8575                         /* kick the device if another node issued a
8576                          * remove-disk request.
8577                          */
8578                         rdev_for_each(rdev, mddev) {
8579                                 if (test_and_clear_bit(ClusterRemove, &rdev->flags) &&
8580                                                 rdev->raid_disk < 0)
8581                                         md_kick_rdev_from_array(rdev);
8582                         }
8583                 }
8584
8585                 if (!mddev->external) {
8586                         int did_change = 0;
8587                         spin_lock(&mddev->lock);
8588                         if (mddev->safemode &&
8589                             !atomic_read(&mddev->writes_pending) &&
8590                             !mddev->in_sync &&
8591                             mddev->recovery_cp == MaxSector) {
8592                                 mddev->in_sync = 1;
8593                                 did_change = 1;
8594                                 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8595                         }
8596                         if (mddev->safemode == 1)
8597                                 mddev->safemode = 0;
8598                         spin_unlock(&mddev->lock);
8599                         if (did_change)
8600                                 sysfs_notify_dirent_safe(mddev->sysfs_state);
8601                 }
8602
8603                 if (mddev->sb_flags)
8604                         md_update_sb(mddev, 0);
8605
8606                 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
8607                     !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
8608                         /* resync/recovery still happening */
8609                         clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8610                         goto unlock;
8611                 }
8612                 if (mddev->sync_thread) {
8613                         md_reap_sync_thread(mddev);
8614                         goto unlock;
8615                 }
8616                 /* Set RUNNING before clearing NEEDED to avoid
8617                  * any transients in the value of "sync_action".
8618                  */
8619                 mddev->curr_resync_completed = 0;
8620                 spin_lock(&mddev->lock);
8621                 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8622                 spin_unlock(&mddev->lock);
8623                 /* Clear some bits that don't mean anything, but
8624                  * might be left set
8625                  */
8626                 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
8627                 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
8628
8629                 if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
8630                     test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
8631                         goto not_running;
8632                 /* no recovery is running.
8633                  * remove any failed drives, then
8634                  * add spares if possible.
8635                  * Spares are also removed and re-added, to allow
8636                  * the personality to fail the re-add.
8637                  */
8638
8639                 if (mddev->reshape_position != MaxSector) {
8640                         if (mddev->pers->check_reshape == NULL ||
8641                             mddev->pers->check_reshape(mddev) != 0)
8642                                 /* Cannot proceed */
8643                                 goto not_running;
8644                         set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
8645                         clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
8646                 } else if ((spares = remove_and_add_spares(mddev, NULL))) {
8647                         clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8648                         clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
8649                         clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
8650                         set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
8651                 } else if (mddev->recovery_cp < MaxSector) {
8652                         set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8653                         clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
8654                 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
8655                         /* nothing to be done ... */
8656                         goto not_running;
8657
8658                 if (mddev->pers->sync_request) {
8659                         if (spares) {
8660                                 /* We are adding a device or devices to an array
8661                                  * which has the bitmap stored on all devices.
8662                                  * So make sure all bitmap pages get written
8663                                  */
8664                                 bitmap_write_all(mddev->bitmap);
8665                         }
8666                         INIT_WORK(&mddev->del_work, md_start_sync);
8667                         queue_work(md_misc_wq, &mddev->del_work);
8668                         goto unlock;
8669                 }
8670         not_running:
8671                 if (!mddev->sync_thread) {
8672                         clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8673                         wake_up(&resync_wait);
8674                         if (test_and_clear_bit(MD_RECOVERY_RECOVER,
8675                                                &mddev->recovery))
8676                                 if (mddev->sysfs_action)
8677                                         sysfs_notify_dirent_safe(mddev->sysfs_action);
8678                 }
8679         unlock:
8680                 wake_up(&mddev->sb_wait);
8681                 mddev_unlock(mddev);
8682         }
8683 }
8684 EXPORT_SYMBOL(md_check_recovery);
8685
8686 void md_reap_sync_thread(struct mddev *mddev)
8687 {
8688         struct md_rdev *rdev;
8689
8690         /* resync has finished, collect result */
8691         md_unregister_thread(&mddev->sync_thread);
8692         if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
8693             !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
8694                 /* success...*/
8695                 /* activate any spares */
8696                 if (mddev->pers->spare_active(mddev)) {
8697                         sysfs_notify(&mddev->kobj, NULL,
8698                                      "degraded");
8699                         set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8700                 }
8701         }
8702         if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
8703             mddev->pers->finish_reshape)
8704                 mddev->pers->finish_reshape(mddev);
8705
8706         /* If the array is no longer degraded, then any saved_raid_disk
8707          * information must be scrapped.
8708          */
8709         if (!mddev->degraded)
8710                 rdev_for_each(rdev, mddev)
8711                         rdev->saved_raid_disk = -1;
8712
8713         md_update_sb(mddev, 1);
8714         /* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can
8715          * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by
8716          * clustered raid */
8717         if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
8718                 md_cluster_ops->resync_finish(mddev);
8719         clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8720         clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
8721         clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8722         clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
8723         clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
8724         clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
8725         wake_up(&resync_wait);
8726         /* flag recovery needed just to double check */
8727         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8728         sysfs_notify_dirent_safe(mddev->sysfs_action);
8729         md_new_event(mddev);
8730         if (mddev->event_work.func)
8731                 queue_work(md_misc_wq, &mddev->event_work);
8732 }
8733 EXPORT_SYMBOL(md_reap_sync_thread);
8734
8735 void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
8736 {
8737         sysfs_notify_dirent_safe(rdev->sysfs_state);
8738         wait_event_timeout(rdev->blocked_wait,
8739                            !test_bit(Blocked, &rdev->flags) &&
8740                            !test_bit(BlockedBadBlocks, &rdev->flags),
8741                            msecs_to_jiffies(5000));
8742         rdev_dec_pending(rdev, mddev);
8743 }
8744 EXPORT_SYMBOL(md_wait_for_blocked_rdev);
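/*
 * A minimal usage sketch: a personality that finds a Blocked rdev takes a
 * pending reference and waits for the block to clear.  Note that
 * md_wait_for_blocked_rdev() drops that reference itself via
 * rdev_dec_pending() before returning.
 *
 *	if (test_bit(Blocked, &rdev->flags)) {
 *		atomic_inc(&rdev->nr_pending);
 *		md_wait_for_blocked_rdev(rdev, mddev);
 *	}
 */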
8745
8746 void md_finish_reshape(struct mddev *mddev)
8747 {
8748         /* called by the personality module when a reshape completes. */
8749         struct md_rdev *rdev;
8750
8751         rdev_for_each(rdev, mddev) {
8752                 if (rdev->data_offset > rdev->new_data_offset)
8753                         rdev->sectors += rdev->data_offset - rdev->new_data_offset;
8754                 else
8755                         rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
8756                 rdev->data_offset = rdev->new_data_offset;
8757         }
8758 }
8759 EXPORT_SYMBOL(md_finish_reshape);
8760
8761 /* Bad block management */
8762
8763 /* Returns 1 on success, 0 on failure */
8764 int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
8765                        int is_new)
8766 {
8767         struct mddev *mddev = rdev->mddev;
8768         int rv;
8769         if (is_new)
8770                 s += rdev->new_data_offset;
8771         else
8772                 s += rdev->data_offset;
8773         rv = badblocks_set(&rdev->badblocks, s, sectors, 0);
8774         if (rv == 0) {
8775                 /* Make sure they get written out promptly */
8776                 if (test_bit(ExternalBbl, &rdev->flags))
8777                         sysfs_notify(&rdev->kobj, NULL,
8778                                      "unacknowledged_bad_blocks");
8779                 sysfs_notify_dirent_safe(rdev->sysfs_state);
8780                 set_mask_bits(&mddev->sb_flags, 0,
8781                               BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
8782                 md_wakeup_thread(rdev->mddev->thread);
8783                 return 1;
8784         } else
8785                 return 0;
8786 }
8787 EXPORT_SYMBOL_GPL(rdev_set_badblocks);
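/*
 * Typical use from a personality after a failed write, as a sketch (the
 * real call sites live in the raid personalities, not in this file):
 *
 *	if (!rdev_set_badblocks(rdev, sector, nr_sectors, 0))
 *		md_error(rdev->mddev, rdev);
 *
 * i.e. if the bad range cannot be recorded, the whole device is failed.
 */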
8788
8789 int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
8790                          int is_new)
8791 {
8792         int rv;
8793         if (is_new)
8794                 s += rdev->new_data_offset;
8795         else
8796                 s += rdev->data_offset;
8797         rv = badblocks_clear(&rdev->badblocks, s, sectors);
8798         if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags))
8799                 sysfs_notify(&rdev->kobj, NULL, "bad_blocks");
8800         return rv;
8801 }
8802 EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
8803
8804 static int md_notify_reboot(struct notifier_block *this,
8805                             unsigned long code, void *x)
8806 {
8807         struct list_head *tmp;
8808         struct mddev *mddev;
8809         int need_delay = 0;
8810
8811         for_each_mddev(mddev, tmp) {
8812                 if (mddev_trylock(mddev)) {
8813                         if (mddev->pers)
8814                                 __md_stop_writes(mddev);
8815                         if (mddev->persistent)
8816                                 mddev->safemode = 2;
8817                         mddev_unlock(mddev);
8818                 }
8819                 need_delay = 1;
8820         }
8821         /*
8822          * certain more exotic SCSI devices are known to be
8823          * volatile with respect to overly early system reboots. While
8824          * the right place to handle this issue is the individual
8825          * driver, we do want to have a safe RAID driver ...
8826          */
8827         if (need_delay)
8828                 mdelay(1000*1);
8829
8830         return NOTIFY_DONE;
8831 }
8832
8833 static struct notifier_block md_notifier = {
8834         .notifier_call  = md_notify_reboot,
8835         .next           = NULL,
8836         .priority       = INT_MAX, /* before any real devices */
8837 };
8838
8839 static void md_geninit(void)
8840 {
8841         pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
8842
8843         proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
8844 }
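/*
 * The "mdstat" entry created above is what "cat /proc/mdstat" reads; an
 * illustrative report for a healthy two-disk RAID1 looks roughly like:
 *
 *	Personalities : [raid1]
 *	md0 : active raid1 sdb1[1] sda1[0]
 *	      1048512 blocks [2/2] [UU]
 *
 *	unused devices: <none>
 */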
8845
8846 static int __init md_init(void)
8847 {
8848         int ret = -ENOMEM;
8849
8850         md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
8851         if (!md_wq)
8852                 goto err_wq;
8853
8854         md_misc_wq = alloc_workqueue("md_misc", 0, 0);
8855         if (!md_misc_wq)
8856                 goto err_misc_wq;
8857
8858         if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
8859                 goto err_md;
8860
8861         if ((ret = register_blkdev(0, "mdp")) < 0)
8862                 goto err_mdp;
8863         mdp_major = ret;
8864
8865         blk_register_region(MKDEV(MD_MAJOR, 0), 512, THIS_MODULE,
8866                             md_probe, NULL, NULL);
8867         blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
8868                             md_probe, NULL, NULL);
8869
8870         register_reboot_notifier(&md_notifier);
8871         raid_table_header = register_sysctl_table(raid_root_table);
8872
8873         md_geninit();
8874         return 0;
8875
8876 err_mdp:
8877         unregister_blkdev(MD_MAJOR, "md");
8878 err_md:
8879         destroy_workqueue(md_misc_wq);
8880 err_misc_wq:
8881         destroy_workqueue(md_wq);
8882 err_wq:
8883         return ret;
8884 }
8885
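/*
 * Called (via md_reload_sb()) after another cluster node updated the
 * superblock: apply any role, size or raid_disks changes locally.
 */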
8886 static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
8887 {
8888         struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
8889         struct md_rdev *rdev2;
8890         int role, ret;
8891         char b[BDEVNAME_SIZE];
8892
8893         /*
8894          * If the size was changed on another node then we need to
8895          * resize as well.
8896          */
8897         if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
8898                 ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
8899                 if (ret)
8900                         pr_info("md-cluster: resize failed\n");
8901                 else
8902                         bitmap_update_sb(mddev->bitmap);
8903         }
8904
8905         /* Check for change of roles in the active devices */
8906         rdev_for_each(rdev2, mddev) {
8907                 if (test_bit(Faulty, &rdev2->flags))
8908                         continue;
8909
8910                 /* Check if the roles changed */
8911                 role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);
8912
8913                 if (test_bit(Candidate, &rdev2->flags)) {
8914                         if (role == 0xfffe) {
8915                                 pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b));
8916                                 md_kick_rdev_from_array(rdev2);
8917                                 continue;
8918                         }
8919                         else
8920                                 clear_bit(Candidate, &rdev2->flags);
8921                 }
8922
8923                 if (role != rdev2->raid_disk) {
8924                         /* got activated */
8925                         if (rdev2->raid_disk == -1 && role != 0xffff) {
8926                                 rdev2->saved_raid_disk = role;
8927                                 ret = remove_and_add_spares(mddev, rdev2);
8928                                 pr_info("Activated spare: %s\n",
8929                                         bdevname(rdev2->bdev,b));
8930                                 /* wake up mddev->thread here, so the array can
8931                                  * perform a resync with the newly activated disk */
8932                                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8933                                 md_wakeup_thread(mddev->thread);
8934
8935                         }
8936                         /* device faulty
8937                          * We just want to do the minimum to mark the disk
8938                          * as faulty. The recovery is performed by the
8939                          * node that initiated the error.
8940                          */
8941                         if ((role == 0xfffe) || (role == 0xfffd)) {
8942                                 md_error(mddev, rdev2);
8943                                 clear_bit(Blocked, &rdev2->flags);
8944                         }
8945                 }
8946         }
8947
8948         if (mddev->raid_disks != le32_to_cpu(sb->raid_disks))
8949                 update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
8950
8951         /* Finally set the event to be up to date */
8952         mddev->events = le64_to_cpu(sb->events);
8953 }
8954
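/*
 * Re-read the superblock of a single rdev from disk.  On failure the old
 * sb page is restored so the rdev stays usable.
 */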
8955 static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
8956 {
8957         int err;
8958         struct page *swapout = rdev->sb_page;
8959         struct mdp_superblock_1 *sb;
8960
8961         /* Store the rdev's sb page in the swapout temporary
8962          * variable so it can be restored if a later step fails
8963          */
8964         rdev->sb_page = NULL;
8965         err = alloc_disk_sb(rdev);
8966         if (err == 0) {
8967                 ClearPageUptodate(rdev->sb_page);
8968                 rdev->sb_loaded = 0;
8969                 err = super_types[mddev->major_version].
8970                         load_super(rdev, NULL, mddev->minor_version);
8971         }
8972         if (err < 0) {
8973                 pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
8974                                 __func__, __LINE__, rdev->desc_nr, err);
8975                 if (rdev->sb_page)
8976                         put_page(rdev->sb_page);
8977                 rdev->sb_page = swapout;
8978                 rdev->sb_loaded = 1;
8979                 return err;
8980         }
8981
8982         sb = page_address(rdev->sb_page);
8983         /* Only read the recovery offset if MD_FEATURE_RECOVERY_OFFSET
8984          * is set
8985          */
8986
8987         if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
8988                 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
8989
8990         /* The other node finished recovery, call spare_active to mark
8991          * the device In_sync and update mddev->degraded
8992          */
8993         if (rdev->recovery_offset == MaxSector &&
8994             !test_bit(In_sync, &rdev->flags) &&
8995             mddev->pers->spare_active(mddev))
8996                 sysfs_notify(&mddev->kobj, NULL, "degraded");
8997
8998         put_page(swapout);
8999         return 0;
9000 }
9001
9002 void md_reload_sb(struct mddev *mddev, int nr)
9003 {
9004         struct md_rdev *rdev;
9005         int err;
9006
9007         /* Find the rdev */
9008         rdev_for_each_rcu(rdev, mddev) {
9009                 if (rdev->desc_nr == nr)
9010                         break;
9011         }
9012
9013         if (!rdev || rdev->desc_nr != nr) {
9014                 pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
9015                 return;
9016         }
9017
9018         err = read_rdev(mddev, rdev);
9019         if (err < 0)
9020                 return;
9021
9022         check_sb_changes(mddev, rdev);
9023
9024         /* Read all rdevs to update recovery_offset */
9025         rdev_for_each_rcu(rdev, mddev)
9026                 read_rdev(mddev, rdev);
9027 }
9028 EXPORT_SYMBOL(md_reload_sb);
9029
9030 #ifndef MODULE
9031
9032 /*
9033  * Searches all registered partitions for autorun RAID arrays
9034  * at boot time.
9035  */
9036
9037 static DEFINE_MUTEX(detected_devices_mutex);
9038 static LIST_HEAD(all_detected_devices);
9039 struct detected_devices_node {
9040         struct list_head list;
9041         dev_t dev;
9042 };
9043
9044 void md_autodetect_dev(dev_t dev)
9045 {
9046         struct detected_devices_node *node_detected_dev;
9047
9048         node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
9049         if (node_detected_dev) {
9050                 node_detected_dev->dev = dev;
9051                 mutex_lock(&detected_devices_mutex);
9052                 list_add_tail(&node_detected_dev->list, &all_detected_devices);
9053                 mutex_unlock(&detected_devices_mutex);
9054         }
9055 }
9056
9057 static void autostart_arrays(int part)
9058 {
9059         struct md_rdev *rdev;
9060         struct detected_devices_node *node_detected_dev;
9061         dev_t dev;
9062         int i_scanned, i_passed;
9063
9064         i_scanned = 0;
9065         i_passed = 0;
9066
9067         pr_info("md: Autodetecting RAID arrays.\n");
9068
9069         mutex_lock(&detected_devices_mutex);
9070         while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
9071                 i_scanned++;
9072                 node_detected_dev = list_entry(all_detected_devices.next,
9073                                         struct detected_devices_node, list);
9074                 list_del(&node_detected_dev->list);
9075                 dev = node_detected_dev->dev;
9076                 kfree(node_detected_dev);
9077                 mutex_unlock(&detected_devices_mutex);
9078                 rdev = md_import_device(dev, 0, 90);
9079                 mutex_lock(&detected_devices_mutex);
9080                 if (IS_ERR(rdev))
9081                         continue;
9082
9083                 if (test_bit(Faulty, &rdev->flags))
9084                         continue;
9085
9086                 set_bit(AutoDetected, &rdev->flags);
9087                 list_add(&rdev->same_set, &pending_raid_disks);
9088                 i_passed++;
9089         }
9090         mutex_unlock(&detected_devices_mutex);
9091
9092         pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed);
9093
9094         autorun_devices(part);
9095 }
9096
9097 #endif /* !MODULE */
9098
9099 static __exit void md_exit(void)
9100 {
9101         struct mddev *mddev;
9102         struct list_head *tmp;
9103         int delay = 1;
9104
9105         blk_unregister_region(MKDEV(MD_MAJOR,0), 512);
9106         blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
9107
9108         unregister_blkdev(MD_MAJOR,"md");
9109         unregister_blkdev(mdp_major, "mdp");
9110         unregister_reboot_notifier(&md_notifier);
9111         unregister_sysctl_table(raid_table_header);
9112
9113         /* We cannot unload the modules while some process is
9114          * waiting for us in select() or poll() - wake them up
9115          */
9116         md_unloading = 1;
9117         while (waitqueue_active(&md_event_waiters)) {
9118                 /* not safe to leave yet */
9119                 wake_up(&md_event_waiters);
9120                 msleep(delay);
9121                 delay += delay;
9122         }
9123         remove_proc_entry("mdstat", NULL);
9124
9125         for_each_mddev(mddev, tmp) {
9126                 export_array(mddev);
9127                 mddev->ctime = 0;
9128                 mddev->hold_active = 0;
9129                 /*
9130                  * for_each_mddev() will call mddev_put() at the end of each
9131                  * iteration.  As the mddev is now fully clear, this will
9132                  * schedule the mddev for destruction by a workqueue, and the
9133                  * destroy_workqueue() below will wait for that to complete.
9134                  */
9135         }
9136         destroy_workqueue(md_misc_wq);
9137         destroy_workqueue(md_wq);
9138 }
9139
9140 subsys_initcall(md_init);
9141 module_exit(md_exit)
9142
9143 static int get_ro(char *buffer, struct kernel_param *kp)
9144 {
9145         return sprintf(buffer, "%d", start_readonly);
9146 }
9147 static int set_ro(const char *val, struct kernel_param *kp)
9148 {
9149         return kstrtouint(val, 10, (unsigned int *)&start_readonly);
9150 }
9151
9152 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
9153 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
9154 module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
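/*
 * With the usual md-mod module build these parameters are accessible under
 * /sys/module/md_mod/parameters/ (e.g. start_ro), and can also be given at
 * load time, e.g. "start_ro=1" on the modprobe command line.
 */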
9155
9156 MODULE_LICENSE("GPL");
9157 MODULE_DESCRIPTION("MD RAID framework");
9158 MODULE_ALIAS("md");
9159 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);