fs/ceph/caps.c

   1 #include "ceph_debug.h"
   2
   3 #include <linux/fs.h>
   4 #include <linux/kernel.h>
   5 #include <linux/sched.h>
   6 #include <linux/slab.h>
   7 #include <linux/vmalloc.h>
   8 #include <linux/wait.h>
   9 #include <linux/writeback.h>
  10
  11 #include "super.h"
  12 #include "decode.h"
  13 #include "messenger.h"
  14
  15 /*
  16  * Capability management
  17  *
  18  * The Ceph metadata servers control client access to inode metadata
  19  * and file data by issuing capabilities, granting clients permission
  20  * to read and/or write both inode field and file data to OSDs
  21  * (storage nodes).  Each capability consists of a set of bits
  22  * indicating which operations are allowed.
  23  *
  24  * If the client holds a *_SHARED cap, the client has a coherent value
  25  * that can be safely read from the cached inode.
  26  *
  27  * In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the
  28  * client is allowed to change inode attributes (e.g., file size,
  29  * mtime), note its dirty state in the ceph_cap, and asynchronously
  30  * flush that metadata change to the MDS.
  31  *
  32  * In the event of a conflicting operation (perhaps by another
  33  * client), the MDS will revoke the conflicting client capabilities.
  34  *
  35  * In order for a client to cache an inode, it must hold a capability
  36  * with at least one MDS server.  When inodes are released, release
  37  * notifications are batched and periodically sent en masse to the MDS
  38  * cluster to release server state.
  39  */
  40
  41
  42 /*
  43  * Generate readable cap strings for debugging output.
  44  */
  45 #define MAX_CAP_STR 20
  46 static char cap_str[MAX_CAP_STR][40];
  47 static DEFINE_SPINLOCK(cap_str_lock);
  48 static int last_cap_str;
  49
  50 static char *gcap_string(char *s, int c)
  51 {
  52         if (c & CEPH_CAP_GSHARED)
  53                 *s++ = 's';
  54         if (c & CEPH_CAP_GEXCL)
  55                 *s++ = 'x';
  56         if (c & CEPH_CAP_GCACHE)
  57                 *s++ = 'c';
  58         if (c & CEPH_CAP_GRD)
  59                 *s++ = 'r';
  60         if (c & CEPH_CAP_GWR)
  61                 *s++ = 'w';
  62         if (c & CEPH_CAP_GBUFFER)
  63                 *s++ = 'b';
  64         if (c & CEPH_CAP_GLAZYIO)
  65                 *s++ = 'l';
  66         return s;
  67 }
  68
  69 const char *ceph_cap_string(int caps)
  70 {
  71         int i;
  72         char *s;
  73         int c;
  74
  75         spin_lock(&cap_str_lock);
  76         i = last_cap_str++;
  77         if (last_cap_str == MAX_CAP_STR)
  78                 last_cap_str = 0;
  79         spin_unlock(&cap_str_lock);
  80
  81         s = cap_str[i];
  82
  83         if (caps & CEPH_CAP_PIN)
  84                 *s++ = 'p';
  85
  86         c = (caps >> CEPH_CAP_SAUTH) & 3;
  87         if (c) {
  88                 *s++ = 'A';
  89                 s = gcap_string(s, c);
  90         }
  91
  92         c = (caps >> CEPH_CAP_SLINK) & 3;
  93         if (c) {
  94                 *s++ = 'L';
  95                 s = gcap_string(s, c);
  96         }
  97
  98         c = (caps >> CEPH_CAP_SXATTR) & 3;
  99         if (c) {
 100                 *s++ = 'X';
 101                 s = gcap_string(s, c);
 102         }
 103
 104         c = caps >> CEPH_CAP_SFILE;
 105         if (c) {
 106                 *s++ = 'F';
 107                 s = gcap_string(s, c);
 108         }
 109
 110         if (s == cap_str[i])
 111                 *s++ = '-';
 112         *s = 0;
 113         return cap_str[i];
 114 }
 115
 116 /*
 117  * Cap reservations
 118  *
 119  * Maintain a global pool of preallocated struct ceph_caps, referenced
 120  * by struct ceph_caps_reservations.  This ensures that we preallocate
 121  * memory needed to successfully process an MDS response.  (If an MDS
 122  * sends us cap information and we fail to process it, we will have
 123  * problems due to the client and MDS being out of sync.)
 124  *
 125  * Reservations are 'owned' by a ceph_cap_reservation context.
 126  */
 127 static spinlock_t caps_list_lock;
 128 static struct list_head caps_list;  /* unused (reserved or unreserved) */
 129 static int caps_total_count;        /* total caps allocated */
 130 static int caps_use_count;          /* in use */
 131 static int caps_reserve_count;      /* unused, reserved */
 132 static int caps_avail_count;        /* unused, unreserved */
 133 static int caps_min_count;          /* keep at least this many (unreserved) */
 134
 135 void __init ceph_caps_init(void)
 136 {
 137         INIT_LIST_HEAD(&caps_list);
 138         spin_lock_init(&caps_list_lock);
 139 }
 140
 141 void ceph_caps_finalize(void)
 142 {
 143         struct ceph_cap *cap;
 144
 145         spin_lock(&caps_list_lock);
 146         while (!list_empty(&caps_list)) {
 147                 cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
 148                 list_del(&cap->caps_item);
 149                 kmem_cache_free(ceph_cap_cachep, cap);
 150         }
 151         caps_total_count = 0;
 152         caps_avail_count = 0;
 153         caps_use_count = 0;
 154         caps_reserve_count = 0;
 155         caps_min_count = 0;
 156         spin_unlock(&caps_list_lock);
 157 }
 158
 159 void ceph_adjust_min_caps(int delta)
 160 {
 161         spin_lock(&caps_list_lock);
 162         caps_min_count += delta;
 163         BUG_ON(caps_min_count < 0);
 164         spin_unlock(&caps_list_lock);
 165 }
 166
 167 int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
 168 {
 169         int i;
 170         struct ceph_cap *cap;
 171         int have;
 172         int alloc = 0;
 173         LIST_HEAD(newcaps);
 174         int ret = 0;
 175
 176         dout("reserve caps ctx=%p need=%d\n", ctx, need);
 177
 178         /* first reserve any caps that are already allocated */
 179         spin_lock(&caps_list_lock);
 180         if (caps_avail_count >= need)
 181                 have = need;
 182         else
 183                 have = caps_avail_count;
 184         caps_avail_count -= have;
 185         caps_reserve_count += have;
 186         BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
 187                caps_avail_count);
 188         spin_unlock(&caps_list_lock);
 189
 190         for (i = have; i < need; i++) {
 191                 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
 192                 if (!cap) {
 193                         ret = -ENOMEM;
 194                         goto out_alloc_count;
 195                 }
 196                 list_add(&cap->caps_item, &newcaps);
 197                 alloc++;
 198         }
 199         BUG_ON(have + alloc != need);
 200
 201         spin_lock(&caps_list_lock);
 202         caps_total_count += alloc;
 203         caps_reserve_count += alloc;
 204         list_splice(&newcaps, &caps_list);
 205
 206         BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
 207                caps_avail_count);
 208         spin_unlock(&caps_list_lock);
 209
 210         ctx->count = need;
 211         dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
 212              ctx, caps_total_count, caps_use_count, caps_reserve_count,
 213              caps_avail_count);
 214         return 0;
 215
 216 out_alloc_count:
 217         /* we didn't manage to reserve as much as we needed */
 218         pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
 219                    ctx, need, have);
 220         return ret;
 221 }
 222
 223 int ceph_unreserve_caps(struct ceph_cap_reservation *ctx)
 224 {
 225         dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
 226         if (ctx->count) {
 227                 spin_lock(&caps_list_lock);
 228                 BUG_ON(caps_reserve_count < ctx->count);
 229                 caps_reserve_count -= ctx->count;
 230                 caps_avail_count += ctx->count;
 231                 ctx->count = 0;
 232                 dout("unreserve caps %d = %d used + %d resv + %d avail\n",
 233                      caps_total_count, caps_use_count, caps_reserve_count,
 234                      caps_avail_count);
 235                 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
 236                        caps_avail_count);
 237                 spin_unlock(&caps_list_lock);
 238         }
 239         return 0;
 240 }
 241
 242 static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
 243 {
 244         struct ceph_cap *cap = NULL;
 245
 246         /* temporary, until we do something about cap import/export */
 247         if (!ctx) {
 248                 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
 249                 if (cap) {
 250                         caps_use_count++;
 251                         caps_total_count++;
 252                 }
 253                 return cap;
 254         }
 255
 256         spin_lock(&caps_list_lock);
 257         dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
 258              ctx, ctx->count, caps_total_count, caps_use_count,
 259              caps_reserve_count, caps_avail_count);
 260         BUG_ON(!ctx->count);
 261         BUG_ON(ctx->count > caps_reserve_count);
 262         BUG_ON(list_empty(&caps_list));
 263
 264         ctx->count--;
 265         caps_reserve_count--;
 266         caps_use_count++;
 267
 268         cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
 269         list_del(&cap->caps_item);
 270
 271         BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
 272                caps_avail_count);
 273         spin_unlock(&caps_list_lock);
 274         return cap;
 275 }
 276
 277 void ceph_put_cap(struct ceph_cap *cap)
 278 {
 279         spin_lock(&caps_list_lock);
 280         dout("put_cap %p %d = %d used + %d resv + %d avail\n",
 281              cap, caps_total_count, caps_use_count,
 282              caps_reserve_count, caps_avail_count);
 283         caps_use_count--;
 284         /*
 285          * Keep some preallocated caps around (ceph_min_count), to
 286          * avoid lots of free/alloc churn.
 287          */
 288         if (caps_avail_count >= caps_reserve_count + caps_min_count) {
 289                 caps_total_count--;
 290                 kmem_cache_free(ceph_cap_cachep, cap);
 291         } else {
 292                 caps_avail_count++;
 293                 list_add(&cap->caps_item, &caps_list);
 294         }
 295
 296         BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
 297                caps_avail_count);
 298         spin_unlock(&caps_list_lock);
 299 }
 300
 301 void ceph_reservation_status(struct ceph_client *client,
 302                              int *total, int *avail, int *used, int *reserved,
 303                              int *min)
 304 {
 305         if (total)
 306                 *total = caps_total_count;
 307         if (avail)
 308                 *avail = caps_avail_count;
 309         if (used)
 310                 *used = caps_use_count;
 311         if (reserved)
 312                 *reserved = caps_reserve_count;
 313         if (min)
 314                 *min = caps_min_count;
 315 }
 316
 317 /*
 318  * Find ceph_cap for given mds, if any.
 319  *
 320  * Called with i_lock held.
 321  */
 322 static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
 323 {
 324         struct ceph_cap *cap;
 325         struct rb_node *n = ci->i_caps.rb_node;
 326
 327         while (n) {
 328                 cap = rb_entry(n, struct ceph_cap, ci_node);
 329                 if (mds < cap->mds)
 330                         n = n->rb_left;
 331                 else if (mds > cap->mds)
 332                         n = n->rb_right;
 333                 else
 334                         return cap;
 335         }
 336         return NULL;
 337 }
 338
 339 /*
 340  * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1.
 341  */
 342 static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq)
 343 {
 344         struct ceph_cap *cap;
 345         int mds = -1;
 346         struct rb_node *p;
 347
 348         /* prefer mds with WR|BUFFER|EXCL caps */
 349         for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
 350                 cap = rb_entry(p, struct ceph_cap, ci_node);
 351                 mds = cap->mds;
 352                 if (mseq)
 353                         *mseq = cap->mseq;
 354                 if (cap->issued & (CEPH_CAP_FILE_WR |
 355                                    CEPH_CAP_FILE_BUFFER |
 356                                    CEPH_CAP_FILE_EXCL))
 357                         break;
 358         }
 359         return mds;
 360 }
 361
 362 int ceph_get_cap_mds(struct inode *inode)
 363 {
 364         int mds;
 365         spin_lock(&inode->i_lock);
 366         mds = __ceph_get_cap_mds(ceph_inode(inode), NULL);
 367         spin_unlock(&inode->i_lock);
 368         return mds;
 369 }
 370
 371 /*
 372  * Called under i_lock.
 373  */
 374 static void __insert_cap_node(struct ceph_inode_info *ci,
 375                               struct ceph_cap *new)
 376 {
 377         struct rb_node **p = &ci->i_caps.rb_node;
 378         struct rb_node *parent = NULL;
 379         struct ceph_cap *cap = NULL;
 380
 381         while (*p) {
 382                 parent = *p;
 383                 cap = rb_entry(parent, struct ceph_cap, ci_node);
 384                 if (new->mds < cap->mds)
 385                         p = &(*p)->rb_left;
 386                 else if (new->mds > cap->mds)
 387                         p = &(*p)->rb_right;
 388                 else
 389                         BUG();
 390         }
 391
 392         rb_link_node(&new->ci_node, parent, p);
 393         rb_insert_color(&new->ci_node, &ci->i_caps);
 394 }
 395
 396 /*
 397  * (re)set cap hold timeouts, which control the delayed release
 398  * of unused caps back to the MDS.  Should be called on cap use.
 399  */
 400 static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
 401                                struct ceph_inode_info *ci)
 402 {
 403         struct ceph_mount_args *ma = mdsc->client->mount_args;
 404
 405         ci->i_hold_caps_min = round_jiffies(jiffies +
 406                                             ma->caps_wanted_delay_min * HZ);
 407         ci->i_hold_caps_max = round_jiffies(jiffies +
 408                                             ma->caps_wanted_delay_max * HZ);
 409         dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
 410              ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
 411 }
 412
 413 /*
 414  * (Re)queue cap at the end of the delayed cap release list.
 415  *
 416  * If I_FLUSH is set, leave the inode at the front of the list.
 417  *
 418  * Caller holds i_lock
 419  *    -> we take mdsc->cap_delay_lock
 420  */
 421 static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
 422                                 struct ceph_inode_info *ci)
 423 {
 424         __cap_set_timeouts(mdsc, ci);
 425         dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
 426              ci->i_ceph_flags, ci->i_hold_caps_max);
 427         if (!mdsc->stopping) {
 428                 spin_lock(&mdsc->cap_delay_lock);
 429                 if (!list_empty(&ci->i_cap_delay_list)) {
 430                         if (ci->i_ceph_flags & CEPH_I_FLUSH)
 431                                 goto no_change;
 432                         list_del_init(&ci->i_cap_delay_list);
 433                 }
 434                 list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
 435 no_change:
 436                 spin_unlock(&mdsc->cap_delay_lock);
 437         }
 438 }
 439
 440 /*
 441  * Queue an inode for immediate writeback.  Mark inode with I_FLUSH,
 442  * indicating we should send a cap message to flush dirty metadata
 443  * asap, and move to the front of the delayed cap list.
 444  */
 445 static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
 446                                       struct ceph_inode_info *ci)
 447 {
 448         dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
 449         spin_lock(&mdsc->cap_delay_lock);
 450         ci->i_ceph_flags |= CEPH_I_FLUSH;
 451         if (!list_empty(&ci->i_cap_delay_list))
 452                 list_del_init(&ci->i_cap_delay_list);
 453         list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
 454         spin_unlock(&mdsc->cap_delay_lock);
 455 }
 456
 457 /*
 458  * Cancel delayed work on cap.
 459  *
 460  * Caller must hold i_lock.
 461  */
 462 static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
 463                                struct ceph_inode_info *ci)
 464 {
 465         dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
 466         if (list_empty(&ci->i_cap_delay_list))
 467                 return;
 468         spin_lock(&mdsc->cap_delay_lock);
 469         list_del_init(&ci->i_cap_delay_list);
 470         spin_unlock(&mdsc->cap_delay_lock);
 471 }
 472
 473 /*
 474  * Common issue checks for add_cap, handle_cap_grant.
 475  */
 476 static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
 477                               unsigned issued)
 478 {
 479         unsigned had = __ceph_caps_issued(ci, NULL);
 480
 481         /*
 482          * Each time we receive FILE_CACHE anew, we increment
 483          * i_rdcache_gen.
 484          */
 485         if ((issued & CEPH_CAP_FILE_CACHE) &&
 486             (had & CEPH_CAP_FILE_CACHE) == 0)
 487                 ci->i_rdcache_gen++;
 488
 489         /*
 490          * if we are newly issued FILE_SHARED, clear I_COMPLETE; we
 491          * don't know what happened to this directory while we didn't
 492          * have the cap.
 493          */
 494         if ((issued & CEPH_CAP_FILE_SHARED) &&
 495             (had & CEPH_CAP_FILE_SHARED) == 0) {
 496                 ci->i_shared_gen++;
 497                 if (S_ISDIR(ci->vfs_inode.i_mode)) {
 498                         dout(" marking %p NOT complete\n", &ci->vfs_inode);
 499                         ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
 500                 }
 501         }
 502 }
 503
 504 /*
 505  * Add a capability under the given MDS session.
 506  *
 507  * Caller should hold session snap_rwsem (read) and s_mutex.
 508  *
 509  * @fmode is the open file mode, if we are opening a file, otherwise
 510  * it is < 0.  (This is so we can atomically add the cap and add an
 511  * open file reference to it.)
 512  */
 513 int ceph_add_cap(struct inode *inode,
 514                  struct ceph_mds_session *session, u64 cap_id,
 515                  int fmode, unsigned issued, unsigned wanted,
 516                  unsigned seq, unsigned mseq, u64 realmino, int flags,
 517                  struct ceph_cap_reservation *caps_reservation)
 518 {
 519         struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
 520         struct ceph_inode_info *ci = ceph_inode(inode);
 521         struct ceph_cap *new_cap = NULL;
 522         struct ceph_cap *cap;
 523         int mds = session->s_mds;
 524         int actual_wanted;
 525
 526         dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
 527              session->s_mds, cap_id, ceph_cap_string(issued), seq);
 528
 529         /*
 530          * If we are opening the file, include file mode wanted bits
 531          * in wanted.
 532          */
 533         if (fmode >= 0)
 534                 wanted |= ceph_caps_for_mode(fmode);
 535
 536 retry:
 537         spin_lock(&inode->i_lock);
 538         cap = __get_cap_for_mds(ci, mds);
 539         if (!cap) {
 540                 if (new_cap) {
 541                         cap = new_cap;
 542                         new_cap = NULL;
 543                 } else {
 544                         spin_unlock(&inode->i_lock);
 545                         new_cap = get_cap(caps_reservation);
 546                         if (new_cap == NULL)
 547                                 return -ENOMEM;
 548                         goto retry;
 549                 }
 550
 551                 cap->issued = 0;
 552                 cap->implemented = 0;
 553                 cap->mds = mds;
 554                 cap->mds_wanted = 0;
 555
 556                 cap->ci = ci;
 557                 __insert_cap_node(ci, cap);
 558
 559                 /* clear out old exporting info?  (i.e. on cap import) */
 560                 if (ci->i_cap_exporting_mds == mds) {
 561                         ci->i_cap_exporting_issued = 0;
 562                         ci->i_cap_exporting_mseq = 0;
 563                         ci->i_cap_exporting_mds = -1;
 564                 }
 565
 566                 /* add to session cap list */
 567                 cap->session = session;
 568                 spin_lock(&session->s_cap_lock);
 569                 list_add_tail(&cap->session_caps, &session->s_caps);
 570                 session->s_nr_caps++;
 571                 spin_unlock(&session->s_cap_lock);
 572         }
 573
 574         if (!ci->i_snap_realm) {
 575                 /*
 576                  * add this inode to the appropriate snap realm
 577                  */
 578                 struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
 579                                                                realmino);
 580                 if (realm) {
 581                         ceph_get_snap_realm(mdsc, realm);
 582                         spin_lock(&realm->inodes_with_caps_lock);
 583                         ci->i_snap_realm = realm;
 584                         list_add(&ci->i_snap_realm_item,
 585                                  &realm->inodes_with_caps);
 586                         spin_unlock(&realm->inodes_with_caps_lock);
 587                 } else {
 588                         pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
 589                                realmino);
 590                 }
 591         }
 592
 593         __check_cap_issue(ci, cap, issued);
 594
 595         /*
 596          * If we are issued caps we don't want, or the mds' wanted
 597          * value appears to be off, queue a check so we'll release
 598          * later and/or update the mds wanted value.
 599          */
 600         actual_wanted = __ceph_caps_wanted(ci);
 601         if ((wanted & ~actual_wanted) ||
 602             (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
 603                 dout(" issued %s, mds wanted %s, actual %s, queueing\n",
 604                      ceph_cap_string(issued), ceph_cap_string(wanted),
 605                      ceph_cap_string(actual_wanted));
 606                 __cap_delay_requeue(mdsc, ci);
 607         }
 608
 609         if (flags & CEPH_CAP_FLAG_AUTH)
 610                 ci->i_auth_cap = cap;
 611         else if (ci->i_auth_cap == cap)
 612                 ci->i_auth_cap = NULL;
 613
 614         dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
 615              inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
 616              ceph_cap_string(issued|cap->issued), seq, mds);
 617         cap->cap_id = cap_id;
 618         cap->issued = issued;
 619         cap->implemented |= issued;
 620         cap->mds_wanted |= wanted;
 621         cap->seq = seq;
 622         cap->issue_seq = seq;
 623         cap->mseq = mseq;
 624         cap->cap_gen = session->s_cap_gen;
 625
 626         if (fmode >= 0)
 627                 __ceph_get_fmode(ci, fmode);
 628         spin_unlock(&inode->i_lock);
 629         wake_up_all(&ci->i_cap_wq);
 630         return 0;
 631 }
 632
 633 /*
 634  * Return true if cap has not timed out and belongs to the current
 635  * generation of the MDS session (i.e. has not gone 'stale' due to
 636  * us losing touch with the mds).
 637  */
 638 static int __cap_is_valid(struct ceph_cap *cap)
 639 {
 640         unsigned long ttl;
 641         u32 gen;
 642
 643         spin_lock(&cap->session->s_cap_lock);
 644         gen = cap->session->s_cap_gen;
 645         ttl = cap->session->s_cap_ttl;
 646         spin_unlock(&cap->session->s_cap_lock);
 647
 648         if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
 649                 dout("__cap_is_valid %p cap %p issued %s "
 650                      "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
 651                      cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
 652                 return 0;
 653         }
 654
 655         return 1;
 656 }
 657
 658 /*
 659  * Return set of valid cap bits issued to us.  Note that caps time
 660  * out, and may be invalidated in bulk if the client session times out
 661  * and session->s_cap_gen is bumped.
 662  */
 663 int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
 664 {
 665         int have = ci->i_snap_caps | ci->i_cap_exporting_issued;
 666         struct ceph_cap *cap;
 667         struct rb_node *p;
 668
 669         if (implemented)
 670                 *implemented = 0;
 671         for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
 672                 cap = rb_entry(p, struct ceph_cap, ci_node);
 673                 if (!__cap_is_valid(cap))
 674                         continue;
 675                 dout("__ceph_caps_issued %p cap %p issued %s\n",
 676                      &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
 677                 have |= cap->issued;
 678                 if (implemented)
 679                         *implemented |= cap->implemented;
 680         }
 681         return have;
 682 }
 683
 684 /*
 685  * Get cap bits issued by caps other than @ocap
 686  */
 687 int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
 688 {
 689         int have = ci->i_snap_caps;
 690         struct ceph_cap *cap;
 691         struct rb_node *p;
 692
 693         for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
 694                 cap = rb_entry(p, struct ceph_cap, ci_node);
 695                 if (cap == ocap)
 696                         continue;
 697                 if (!__cap_is_valid(cap))
 698                         continue;
 699                 have |= cap->issued;
 700         }
 701         return have;
 702 }
 703
 704 /*
 705  * Move a cap to the end of the LRU (oldest caps at list head, newest
 706  * at list tail).
 707  */
 708 static void __touch_cap(struct ceph_cap *cap)
 709 {
 710         struct ceph_mds_session *s = cap->session;
 711
 712         spin_lock(&s->s_cap_lock);
 713         if (s->s_cap_iterator == NULL) {
 714                 dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
 715                      s->s_mds);
 716                 list_move_tail(&cap->session_caps, &s->s_caps);
 717         } else {
 718                 dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
 719                      &cap->ci->vfs_inode, cap, s->s_mds);
 720         }
 721         spin_unlock(&s->s_cap_lock);
 722 }
 723
 724 /*
 725  * Check if we hold the given mask.  If so, move the cap(s) to the
 726  * front of their respective LRUs.  (This is the preferred way for
 727  * callers to check for caps they want.)
 728  */
 729 int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
 730 {
 731         struct ceph_cap *cap;
 732         struct rb_node *p;
 733         int have = ci->i_snap_caps;
 734
 735         if ((have & mask) == mask) {
 736                 dout("__ceph_caps_issued_mask %p snap issued %s"
 737                      " (mask %s)\n", &ci->vfs_inode,
 738                      ceph_cap_string(have),
 739                      ceph_cap_string(mask));
 740                 return 1;
 741         }
 742
 743         for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
 744                 cap = rb_entry(p, struct ceph_cap, ci_node);
 745                 if (!__cap_is_valid(cap))
 746                         continue;
 747                 if ((cap->issued & mask) == mask) {
 748                         dout("__ceph_caps_issued_mask %p cap %p issued %s"
 749                              " (mask %s)\n", &ci->vfs_inode, cap,
 750                              ceph_cap_string(cap->issued),
 751                              ceph_cap_string(mask));
 752                         if (touch)
 753                                 __touch_cap(cap);
 754                         return 1;
 755                 }
 756
 757                 /* does a combination of caps satisfy mask? */
 758                 have |= cap->issued;
 759                 if ((have & mask) == mask) {
 760                         dout("__ceph_caps_issued_mask %p combo issued %s"
 761                              " (mask %s)\n", &ci->vfs_inode,
 762                              ceph_cap_string(cap->issued),
 763                              ceph_cap_string(mask));
 764                         if (touch) {
 765                                 struct rb_node *q;
 766
 767                                 /* touch this + preceeding caps */
 768                                 __touch_cap(cap);
 769                                 for (q = rb_first(&ci->i_caps); q != p;
 770                                      q = rb_next(q)) {
 771                                         cap = rb_entry(q, struct ceph_cap,
 772                                                        ci_node);
 773                                         if (!__cap_is_valid(cap))
 774                                                 continue;
 775                                         __touch_cap(cap);
 776                                 }
 777                         }
 778                         return 1;
 779                 }
 780         }
 781
 782         return 0;
 783 }
 784
 785 /*
 786  * Return true if mask caps are currently being revoked by an MDS.
 787  */
 788 int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
 789 {
 790         struct inode *inode = &ci->vfs_inode;
 791         struct ceph_cap *cap;
 792         struct rb_node *p;
 793         int ret = 0;
 794
 795         spin_lock(&inode->i_lock);
 796         for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
 797                 cap = rb_entry(p, struct ceph_cap, ci_node);
 798                 if (__cap_is_valid(cap) &&
 799                     (cap->implemented & ~cap->issued & mask)) {
 800                         ret = 1;
 801                         break;
 802                 }
 803         }
 804         spin_unlock(&inode->i_lock);
 805         dout("ceph_caps_revoking %p %s = %d\n", inode,
 806              ceph_cap_string(mask), ret);
 807         return ret;
 808 }
 809
 810 int __ceph_caps_used(struct ceph_inode_info *ci)
 811 {
 812         int used = 0;
 813         if (ci->i_pin_ref)
 814                 used |= CEPH_CAP_PIN;
 815         if (ci->i_rd_ref)
 816                 used |= CEPH_CAP_FILE_RD;
 817         if (ci->i_rdcache_ref || ci->i_rdcache_gen)
 818                 used |= CEPH_CAP_FILE_CACHE;
 819         if (ci->i_wr_ref)
 820                 used |= CEPH_CAP_FILE_WR;
 821         if (ci->i_wrbuffer_ref)
 822                 used |= CEPH_CAP_FILE_BUFFER;
 823         return used;
 824 }
 825
 826 /*
 827  * wanted, by virtue of open file modes
 828  */
 829 int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
 830 {
 831         int want = 0;
 832         int mode;
 833         for (mode = 0; mode < CEPH_FILE_MODE_NUM; mode++)
 834                 if (ci->i_nr_by_mode[mode])
 835                         want |= ceph_caps_for_mode(mode);
 836         return want;
 837 }
 838
 839 /*
 840  * Return caps we have registered with the MDS(s) as 'wanted'.
 841  */
 842 int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
 843 {
 844         struct ceph_cap *cap;
 845         struct rb_node *p;
 846         int mds_wanted = 0;
 847
 848         for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
 849                 cap = rb_entry(p, struct ceph_cap, ci_node);
 850                 if (!__cap_is_valid(cap))
 851                         continue;
 852                 mds_wanted |= cap->mds_wanted;
 853         }
 854         return mds_wanted;
 855 }
 856
 857 /*
 858  * called under i_lock
 859  */
 860 static int __ceph_is_any_caps(struct ceph_inode_info *ci)
 861 {
 862         return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0;
 863 }
 864
 865 /*
 866  * Remove a cap.  Take steps to deal with a racing iterate_session_caps.
 867  *
 868  * caller should hold i_lock.
 869  * caller will not hold session s_mutex if called from destroy_inode.
 870  */
 871 void __ceph_remove_cap(struct ceph_cap *cap)
 872 {
 873         struct ceph_mds_session *session = cap->session;
 874         struct ceph_inode_info *ci = cap->ci;
 875         struct ceph_mds_client *mdsc =
 876                 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
 877         int removed = 0;
 878
 879         dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
 880
 881         /* remove from session list */
 882         spin_lock(&session->s_cap_lock);
 883         if (session->s_cap_iterator == cap) {
 884                 /* not yet, we are iterating over this very cap */
 885                 dout("__ceph_remove_cap  delaying %p removal from session %p\n",
 886                      cap, cap->session);
 887         } else {
 888                 list_del_init(&cap->session_caps);
 889                 session->s_nr_caps--;
 890                 cap->session = NULL;
 891                 removed = 1;
 892         }
 893         /* protect backpointer with s_cap_lock: see iterate_session_caps */
 894         cap->ci = NULL;
 895         spin_unlock(&session->s_cap_lock);
 896
 897         /* remove from inode list */
 898         rb_erase(&cap->ci_node, &ci->i_caps);
 899         if (ci->i_auth_cap == cap)
 900                 ci->i_auth_cap = NULL;
 901
 902         if (removed)
 903                 ceph_put_cap(cap);
 904
 905         if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
 906                 struct ceph_snap_realm *realm = ci->i_snap_realm;
 907                 spin_lock(&realm->inodes_with_caps_lock);
 908                 list_del_init(&ci->i_snap_realm_item);
 909                 ci->i_snap_realm_counter++;
 910                 ci->i_snap_realm = NULL;
 911                 spin_unlock(&realm->inodes_with_caps_lock);
 912                 ceph_put_snap_realm(mdsc, realm);
 913         }
 914         if (!__ceph_is_any_real_caps(ci))
 915                 __cap_delay_cancel(mdsc, ci);
 916 }
 917
 918 /*
 919  * Build and send a cap message to the given MDS.
 920  *
 921  * Caller should be holding s_mutex.
 922  */
 923 static int send_cap_msg(struct ceph_mds_session *session,
 924                         u64 ino, u64 cid, int op,
 925                         int caps, int wanted, int dirty,
 926                         u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
 927                         u64 size, u64 max_size,
 928                         struct timespec *mtime, struct timespec *atime,
 929                         u64 time_warp_seq,
 930                         uid_t uid, gid_t gid, mode_t mode,
 931                         u64 xattr_version,
 932                         struct ceph_buffer *xattrs_buf,
 933                         u64 follows)
 934 {
 935         struct ceph_mds_caps *fc;
 936         struct ceph_msg *msg;
 937
 938         dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
 939              " seq %u/%u mseq %u follows %lld size %llu/%llu"
 940              " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
 941              cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
 942              ceph_cap_string(dirty),
 943              seq, issue_seq, mseq, follows, size, max_size,
 944              xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
 945
 946         msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS);
 947         if (!msg)
 948                 return -ENOMEM;
 949
 950         msg->hdr.tid = cpu_to_le64(flush_tid);
 951
 952         fc = msg->front.iov_base;
 953         memset(fc, 0, sizeof(*fc));
 954
 955         fc->cap_id = cpu_to_le64(cid);
 956         fc->op = cpu_to_le32(op);
 957         fc->seq = cpu_to_le32(seq);
 958         fc->issue_seq = cpu_to_le32(issue_seq);
 959         fc->migrate_seq = cpu_to_le32(mseq);
 960         fc->caps = cpu_to_le32(caps);
 961         fc->wanted = cpu_to_le32(wanted);
 962         fc->dirty = cpu_to_le32(dirty);
 963         fc->ino = cpu_to_le64(ino);
 964         fc->snap_follows = cpu_to_le64(follows);
 965
 966         fc->size = cpu_to_le64(size);
 967         fc->max_size = cpu_to_le64(max_size);
 968         if (mtime)
 969                 ceph_encode_timespec(&fc->mtime, mtime);
 970         if (atime)
 971                 ceph_encode_timespec(&fc->atime, atime);
 972         fc->time_warp_seq = cpu_to_le32(time_warp_seq);
 973
 974         fc->uid = cpu_to_le32(uid);
 975         fc->gid = cpu_to_le32(gid);
 976         fc->mode = cpu_to_le32(mode);
 977
 978         fc->xattr_version = cpu_to_le64(xattr_version);
 979         if (xattrs_buf) {
 980                 msg->middle = ceph_buffer_get(xattrs_buf);
 981                 fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len);
 982                 msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len);
 983         }
 984
 985         ceph_con_send(&session->s_con, msg);
 986         return 0;
 987 }
 988
 989 static void __queue_cap_release(struct ceph_mds_session *session,
 990                                 u64 ino, u64 cap_id, u32 migrate_seq,
 991                                 u32 issue_seq)
 992 {
 993         struct ceph_msg *msg;
 994         struct ceph_mds_cap_release *head;
 995         struct ceph_mds_cap_item *item;
 996
 997         spin_lock(&session->s_cap_lock);
 998         BUG_ON(!session->s_num_cap_releases);
 999         msg = list_first_entry(&session->s_cap_releases,
1000                                struct ceph_msg, list_head);
1001
1002         dout(" adding %llx release to mds%d msg %p (%d left)\n",
1003              ino, session->s_mds, msg, session->s_num_cap_releases);
1004
1005         BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
1006         head = msg->front.iov_base;
1007         head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
1008         item = msg->front.iov_base + msg->front.iov_len;
1009         item->ino = cpu_to_le64(ino);
1010         item->cap_id = cpu_to_le64(cap_id);
1011         item->migrate_seq = cpu_to_le32(migrate_seq);
1012         item->seq = cpu_to_le32(issue_seq);
1013
1014         session->s_num_cap_releases--;
1015
1016         msg->front.iov_len += sizeof(*item);
1017         if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
1018                 dout(" release msg %p full\n", msg);
1019                 list_move_tail(&msg->list_head, &session->s_cap_releases_done);
1020         } else {
1021                 dout(" release msg %p at %d/%d (%d)\n", msg,
1022                      (int)le32_to_cpu(head->num),
1023                      (int)CEPH_CAPS_PER_RELEASE,
1024                      (int)msg->front.iov_len);
1025         }
1026         spin_unlock(&session->s_cap_lock);
1027 }
1028
1029 /*
1030  * Queue cap releases when an inode is dropped from our cache.  Since
1031  * inode is about to be destroyed, there is no need for i_lock.
1032  */
1033 void ceph_queue_caps_release(struct inode *inode)
1034 {
1035         struct ceph_inode_info *ci = ceph_inode(inode);
1036         struct rb_node *p;
1037
1038         p = rb_first(&ci->i_caps);
1039         while (p) {
1040                 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
1041                 struct ceph_mds_session *session = cap->session;
1042
1043                 __queue_cap_release(session, ceph_ino(inode), cap->cap_id,
1044                                     cap->mseq, cap->issue_seq);
1045                 p = rb_next(p);
1046                 __ceph_remove_cap(cap);
1047         }
1048 }
1049
1050 /*
1051  * Send a cap msg on the given inode.  Update our caps state, then
1052  * drop i_lock and send the message.
1053  *
1054  * Make note of max_size reported/requested from mds, revoked caps
1055  * that have now been implemented.
1056  *
1057  * Make half-hearted attempt ot to invalidate page cache if we are
1058  * dropping RDCACHE.  Note that this will leave behind locked pages
1059  * that we'll then need to deal with elsewhere.
1060  *
1061  * Return non-zero if delayed release, or we experienced an error
1062  * such that the caller should requeue + retry later.
1063  *
1064  * called with i_lock, then drops it.
1065  * caller should hold snap_rwsem (read), s_mutex.
1066  */
1067 static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1068                       int op, int used, int want, int retain, int flushing,
1069                       unsigned *pflush_tid)
1070         __releases(cap->ci->vfs_inode->i_lock)
1071 {
1072         struct ceph_inode_info *ci = cap->ci;
1073         struct inode *inode = &ci->vfs_inode;
1074         u64 cap_id = cap->cap_id;
1075         int held, revoking, dropping, keep;
1076         u64 seq, issue_seq, mseq, time_warp_seq, follows;
1077         u64 size, max_size;
1078         struct timespec mtime, atime;
1079         int wake = 0;
1080         mode_t mode;
1081         uid_t uid;
1082         gid_t gid;
1083         struct ceph_mds_session *session;
1084         u64 xattr_version = 0;
1085         int delayed = 0;
1086         u64 flush_tid = 0;
1087         int i;
1088         int ret;
1089
1090         held = cap->issued | cap->implemented;
1091         revoking = cap->implemented & ~cap->issued;
1092         retain &= ~revoking;
1093         dropping = cap->issued & ~retain;
1094
1095         dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
1096              inode, cap, cap->session,
1097              ceph_cap_string(held), ceph_cap_string(held & retain),
1098              ceph_cap_string(revoking));
1099         BUG_ON((retain & CEPH_CAP_PIN) == 0);
1100
1101         session = cap->session;
1102
1103         /* don't release wanted unless we've waited a bit. */
1104         if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
1105             time_before(jiffies, ci->i_hold_caps_min)) {
1106                 dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
1107                      ceph_cap_string(cap->issued),
1108                      ceph_cap_string(cap->issued & retain),
1109                      ceph_cap_string(cap->mds_wanted),
1110                      ceph_cap_string(want));
1111                 want |= cap->mds_wanted;
1112                 retain |= cap->issued;
1113                 delayed = 1;
1114         }
1115         ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
1116
1117         cap->issued &= retain;  /* drop bits we don't want */
1118         if (cap->implemented & ~cap->issued) {
1119                 /*
1120                  * Wake up any waiters on wanted -> needed transition.
1121                  * This is due to the weird transition from buffered
1122                  * to sync IO... we need to flush dirty pages _before_
1123                  * allowing sync writes to avoid reordering.
1124                  */
1125                 wake = 1;
1126         }
1127         cap->implemented &= cap->issued | used;
1128         cap->mds_wanted = want;
1129
1130         if (flushing) {
1131                 /*
1132                  * assign a tid for flush operations so we can avoid
1133                  * flush1 -> dirty1 -> flush2 -> flushack1 -> mark
1134                  * clean type races.  track latest tid for every bit
1135                  * so we can handle flush AxFw, flush Fw, and have the
1136                  * first ack clean Ax.
1137                  */
1138                 flush_tid = ++ci->i_cap_flush_last_tid;
1139                 if (pflush_tid)
1140                         *pflush_tid = flush_tid;
1141                 dout(" cap_flush_tid %d\n", (int)flush_tid);
1142                 for (i = 0; i < CEPH_CAP_BITS; i++)
1143                         if (flushing & (1 << i))
1144                                 ci->i_cap_flush_tid[i] = flush_tid;
1145         }
1146
1147         keep = cap->implemented;
1148         seq = cap->seq;
1149         issue_seq = cap->issue_seq;
1150         mseq = cap->mseq;
1151         size = inode->i_size;
1152         ci->i_reported_size = size;
1153         max_size = ci->i_wanted_max_size;
1154         ci->i_requested_max_size = max_size;
1155         mtime = inode->i_mtime;
1156         atime = inode->i_atime;
1157         time_warp_seq = ci->i_time_warp_seq;
1158         follows = ci->i_snap_realm->cached_context->seq;
1159         uid = inode->i_uid;
1160         gid = inode->i_gid;
1161         mode = inode->i_mode;
1162
1163         if (dropping & CEPH_CAP_XATTR_EXCL) {
1164                 __ceph_build_xattrs_blob(ci);
1165                 xattr_version = ci->i_xattrs.version + 1;
1166         }
1167
1168         spin_unlock(&inode->i_lock);
1169
1170         ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
1171                 op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
1172                 size, max_size, &mtime, &atime, time_warp_seq,
1173                 uid, gid, mode,
1174                 xattr_version,
1175                 (flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
1176                 follows);
1177         if (ret < 0) {
1178                 dout("error sending cap msg, must requeue %p\n", inode);
1179                 delayed = 1;
1180         }
1181
1182         if (wake)
1183                 wake_up_all(&ci->i_cap_wq);
1184
1185         return delayed;
1186 }
1187
1188 /*
1189  * When a snapshot is taken, clients accumulate dirty metadata on
1190  * inodes with capabilities in ceph_cap_snaps to describe the file
1191  * state at the time the snapshot was taken.  This must be flushed
1192  * asynchronously back to the MDS once sync writes complete and dirty
1193  * data is written out.
1194  *
1195  * Called under i_lock.  Takes s_mutex as needed.
1196  */
1197 void __ceph_flush_snaps(struct ceph_inode_info *ci,
1198                         struct ceph_mds_session **psession)
1199 {
1200         struct inode *inode = &ci->vfs_inode;
1201         int mds;
1202         struct ceph_cap_snap *capsnap;
1203         u32 mseq;
1204         struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
1205         struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
1206                                                     session->s_mutex */
1207         u64 next_follows = 0;  /* keep track of how far we've gotten through the
1208                              i_cap_snaps list, and skip these entries next time
1209                              around to avoid an infinite loop */
1210
1211         if (psession)
1212                 session = *psession;
1213
1214         dout("__flush_snaps %p\n", inode);
1215 retry:
1216         list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
1217                 /* avoid an infiniute loop after retry */
1218                 if (capsnap->follows < next_follows)
1219                         continue;
1220                 /*
1221                  * we need to wait for sync writes to complete and for dirty
1222                  * pages to be written out.
1223                  */
1224                 if (capsnap->dirty_pages || capsnap->writing)
1225                         continue;
1226
1227                 /*
1228                  * if cap writeback already occurred, we should have dropped
1229                  * the capsnap in ceph_put_wrbuffer_cap_refs.
1230                  */
1231                 BUG_ON(capsnap->dirty == 0);
1232
1233                 /* pick mds, take s_mutex */
1234                 mds = __ceph_get_cap_mds(ci, &mseq);
1235                 if (session && session->s_mds != mds) {
1236                         dout("oops, wrong session %p mutex\n", session);
1237                         mutex_unlock(&session->s_mutex);
1238                         ceph_put_mds_session(session);
1239                         session = NULL;
1240                 }
1241                 if (!session) {
1242                         spin_unlock(&inode->i_lock);
1243                         mutex_lock(&mdsc->mutex);
1244                         session = __ceph_lookup_mds_session(mdsc, mds);
1245                         mutex_unlock(&mdsc->mutex);
1246                         if (session) {
1247                                 dout("inverting session/ino locks on %p\n",
1248                                      session);
1249                                 mutex_lock(&session->s_mutex);
1250                         }
1251                         /*
1252                          * if session == NULL, we raced against a cap
1253                          * deletion.  retry, and we'll get a better
1254                          * @mds value next time.
1255                          */
1256                         spin_lock(&inode->i_lock);
1257                         goto retry;
1258                 }
1259
1260                 capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
1261                 atomic_inc(&capsnap->nref);
1262                 if (!list_empty(&capsnap->flushing_item))
1263                         list_del_init(&capsnap->flushing_item);
1264                 list_add_tail(&capsnap->flushing_item,
1265                               &session->s_cap_snaps_flushing);
1266                 spin_unlock(&inode->i_lock);
1267
1268                 dout("flush_snaps %p cap_snap %p follows %lld size %llu\n",
1269                      inode, capsnap, next_follows, capsnap->size);
1270                 send_cap_msg(session, ceph_vino(inode).ino, 0,
1271                              CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
1272                              capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
1273                              capsnap->size, 0,
1274                              &capsnap->mtime, &capsnap->atime,
1275                              capsnap->time_warp_seq,
1276                              capsnap->uid, capsnap->gid, capsnap->mode,
1277                              0, NULL,
1278                              capsnap->follows);
1279
1280                 next_follows = capsnap->follows + 1;
1281                 ceph_put_cap_snap(capsnap);
1282
1283                 spin_lock(&inode->i_lock);
1284                 goto retry;
1285         }
1286
1287         /* we flushed them all; remove this inode from the queue */
1288         spin_lock(&mdsc->snap_flush_lock);
1289         list_del_init(&ci->i_snap_flush_item);
1290         spin_unlock(&mdsc->snap_flush_lock);
1291
1292         if (psession)
1293                 *psession = session;
1294         else if (session) {
1295                 mutex_unlock(&session->s_mutex);
1296                 ceph_put_mds_session(session);
1297         }
1298 }
1299
1300 static void ceph_flush_snaps(struct ceph_inode_info *ci)
1301 {
1302         struct inode *inode = &ci->vfs_inode;
1303
1304         spin_lock(&inode->i_lock);
1305         __ceph_flush_snaps(ci, NULL);
1306         spin_unlock(&inode->i_lock);
1307 }
1308
1309 /*
1310  * Mark caps dirty.  If inode is newly dirty, add to the global dirty
1311  * list.
1312  */
1313 void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1314 {
1315         struct ceph_mds_client *mdsc =
1316                 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
1317         struct inode *inode = &ci->vfs_inode;
1318         int was = ci->i_dirty_caps;
1319         int dirty = 0;
1320
1321         dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
1322              ceph_cap_string(mask), ceph_cap_string(was),
1323              ceph_cap_string(was | mask));
1324         ci->i_dirty_caps |= mask;
1325         if (was == 0) {
1326                 dout(" inode %p now dirty\n", &ci->vfs_inode);
1327                 BUG_ON(!list_empty(&ci->i_dirty_item));
1328                 spin_lock(&mdsc->cap_dirty_lock);
1329                 list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
1330                 spin_unlock(&mdsc->cap_dirty_lock);
1331                 if (ci->i_flushing_caps == 0) {
1332                         igrab(inode);
1333                         dirty |= I_DIRTY_SYNC;
1334                 }
1335         }
1336         BUG_ON(list_empty(&ci->i_dirty_item));
1337         if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
1338             (mask & CEPH_CAP_FILE_BUFFER))
1339                 dirty |= I_DIRTY_DATASYNC;
1340         if (dirty)
1341                 __mark_inode_dirty(inode, dirty);
1342         __cap_delay_requeue(mdsc, ci);
1343 }
1344
1345 /*
1346  * Add dirty inode to the flushing list.  Assigned a seq number so we
1347  * can wait for caps to flush without starving.
1348  *
1349  * Called under i_lock.
1350  */
1351 static int __mark_caps_flushing(struct inode *inode,
1352                                  struct ceph_mds_session *session)
1353 {
1354         struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
1355         struct ceph_inode_info *ci = ceph_inode(inode);
1356         int flushing;
1357
1358         BUG_ON(ci->i_dirty_caps == 0);
1359         BUG_ON(list_empty(&ci->i_dirty_item));
1360
1361         flushing = ci->i_dirty_caps;
1362         dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
1363              ceph_cap_string(flushing),
1364              ceph_cap_string(ci->i_flushing_caps),
1365              ceph_cap_string(ci->i_flushing_caps | flushing));
1366         ci->i_flushing_caps |= flushing;
1367         ci->i_dirty_caps = 0;
1368         dout(" inode %p now !dirty\n", inode);
1369
1370         spin_lock(&mdsc->cap_dirty_lock);
1371         list_del_init(&ci->i_dirty_item);
1372
1373         ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
1374         if (list_empty(&ci->i_flushing_item)) {
1375                 list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1376                 mdsc->num_cap_flushing++;
1377                 dout(" inode %p now flushing seq %lld\n", inode,
1378                      ci->i_cap_flush_seq);
1379         } else {
1380                 list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1381                 dout(" inode %p now flushing (more) seq %lld\n", inode,
1382                      ci->i_cap_flush_seq);
1383         }
1384         spin_unlock(&mdsc->cap_dirty_lock);
1385
1386         return flushing;
1387 }
1388
1389 /*
1390  * try to invalidate mapping pages without blocking.
1391  */
1392 static int mapping_is_empty(struct address_space *mapping)
1393 {
1394         struct page *page = find_get_page(mapping, 0);
1395
1396         if (!page)
1397                 return 1;
1398
1399         put_page(page);
1400         return 0;
1401 }
1402
1403 static int try_nonblocking_invalidate(struct inode *inode)
1404 {
1405         struct ceph_inode_info *ci = ceph_inode(inode);
1406         u32 invalidating_gen = ci->i_rdcache_gen;
1407
1408         spin_unlock(&inode->i_lock);
1409         invalidate_mapping_pages(&inode->i_data, 0, -1);
1410         spin_lock(&inode->i_lock);
1411
1412         if (mapping_is_empty(&inode->i_data) &&
1413             invalidating_gen == ci->i_rdcache_gen) {
1414                 /* success. */
1415                 dout("try_nonblocking_invalidate %p success\n", inode);
1416                 ci->i_rdcache_gen = 0;
1417                 ci->i_rdcache_revoking = 0;
1418                 return 0;
1419         }
1420         dout("try_nonblocking_invalidate %p failed\n", inode);
1421         return -1;
1422 }
1423
1424 /*
1425  * Swiss army knife function to examine currently used and wanted
1426  * versus held caps.  Release, flush, ack revoked caps to mds as
1427  * appropriate.
1428  *
1429  *  CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
1430  *    cap release further.
1431  *  CHECK_CAPS_AUTHONLY - we should only check the auth cap
1432  *  CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
1433  *    further delay.
1434  */
1435 void ceph_check_caps(struct ceph_inode_info *ci, int flags,
1436                      struct ceph_mds_session *session)
1437         __releases(session->s_mutex)
1438 {
1439         struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
1440         struct ceph_mds_client *mdsc = &client->mdsc;
1441         struct inode *inode = &ci->vfs_inode;
1442         struct ceph_cap *cap;
1443         int file_wanted, used;
1444         int took_snap_rwsem = 0;             /* true if mdsc->snap_rwsem held */
1445         int issued, implemented, want, retain, revoking, flushing = 0;
1446         int mds = -1;   /* keep track of how far we've gone through i_caps list
1447                            to avoid an infinite loop on retry */
1448         struct rb_node *p;
1449         int tried_invalidate = 0;
1450         int delayed = 0, sent = 0, force_requeue = 0, num;
1451         int queue_invalidate = 0;
1452         int is_delayed = flags & CHECK_CAPS_NODELAY;
1453
1454         /* if we are unmounting, flush any unused caps immediately. */
1455         if (mdsc->stopping)
1456                 is_delayed = 1;
1457
1458         spin_lock(&inode->i_lock);
1459
1460         if (ci->i_ceph_flags & CEPH_I_FLUSH)
1461                 flags |= CHECK_CAPS_FLUSH;
1462
1463         /* flush snaps first time around only */
1464         if (!list_empty(&ci->i_cap_snaps))
1465                 __ceph_flush_snaps(ci, &session);
1466         goto retry_locked;
1467 retry:
1468         spin_lock(&inode->i_lock);
1469 retry_locked:
1470         file_wanted = __ceph_caps_file_wanted(ci);
1471         used = __ceph_caps_used(ci);
1472         want = file_wanted | used;
1473         issued = __ceph_caps_issued(ci, &implemented);
1474         revoking = implemented & ~issued;
1475
1476         retain = want | CEPH_CAP_PIN;
1477         if (!mdsc->stopping && inode->i_nlink > 0) {
1478                 if (want) {
1479                         retain |= CEPH_CAP_ANY;       /* be greedy */
1480                 } else {
1481                         retain |= CEPH_CAP_ANY_SHARED;
1482                         /*
1483                          * keep RD only if we didn't have the file open RW,
1484                          * because then the mds would revoke it anyway to
1485                          * journal max_size=0.
1486                          */
1487                         if (ci->i_max_size == 0)
1488                                 retain |= CEPH_CAP_ANY_RD;
1489                 }
1490         }
1491
1492         dout("check_caps %p file_want %s used %s dirty %s flushing %s"
1493              " issued %s revoking %s retain %s %s%s%s\n", inode,
1494              ceph_cap_string(file_wanted),
1495              ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
1496              ceph_cap_string(ci->i_flushing_caps),
1497              ceph_cap_string(issued), ceph_cap_string(revoking),
1498              ceph_cap_string(retain),
1499              (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
1500              (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
1501              (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
1502
1503         /*
1504          * If we no longer need to hold onto old our caps, and we may
1505          * have cached pages, but don't want them, then try to invalidate.
1506          * If we fail, it's because pages are locked.... try again later.
1507          */
1508         if ((!is_delayed || mdsc->stopping) &&
1509             ci->i_wrbuffer_ref == 0 &&               /* no dirty pages... */
1510             ci->i_rdcache_gen &&                     /* may have cached pages */
1511             (file_wanted == 0 ||                     /* no open files */
1512              (revoking & CEPH_CAP_FILE_CACHE)) &&     /*  or revoking cache */
1513             !tried_invalidate) {
1514                 dout("check_caps trying to invalidate on %p\n", inode);
1515                 if (try_nonblocking_invalidate(inode) < 0) {
1516                         if (revoking & CEPH_CAP_FILE_CACHE) {
1517                                 dout("check_caps queuing invalidate\n");
1518                                 queue_invalidate = 1;
1519                                 ci->i_rdcache_revoking = ci->i_rdcache_gen;
1520                         } else {
1521                                 dout("check_caps failed to invalidate pages\n");
1522                                 /* we failed to invalidate pages.  check these
1523                                    caps again later. */
1524                                 force_requeue = 1;
1525                                 __cap_set_timeouts(mdsc, ci);
1526                         }
1527                 }
1528                 tried_invalidate = 1;
1529                 goto retry_locked;
1530         }
1531
1532         num = 0;
1533         for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
1534                 cap = rb_entry(p, struct ceph_cap, ci_node);
1535                 num++;
1536
1537                 /* avoid looping forever */
1538                 if (mds >= cap->mds ||
1539                     ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
1540                         continue;
1541
1542                 /* NOTE: no side-effects allowed, until we take s_mutex */
1543
1544                 revoking = cap->implemented & ~cap->issued;
1545                 if (revoking)
1546                         dout(" mds%d revoking %s\n", cap->mds,
1547                              ceph_cap_string(revoking));
1548
1549                 if (cap == ci->i_auth_cap &&
1550                     (cap->issued & CEPH_CAP_FILE_WR)) {
1551                         /* request larger max_size from MDS? */
1552                         if (ci->i_wanted_max_size > ci->i_max_size &&
1553                             ci->i_wanted_max_size > ci->i_requested_max_size) {
1554                                 dout("requesting new max_size\n");
1555                                 goto ack;
1556                         }
1557
1558                         /* approaching file_max? */
1559                         if ((inode->i_size << 1) >= ci->i_max_size &&
1560                             (ci->i_reported_size << 1) < ci->i_max_size) {
1561                                 dout("i_size approaching max_size\n");
1562                                 goto ack;
1563                         }
1564                 }
1565                 /* flush anything dirty? */
1566                 if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
1567                     ci->i_dirty_caps) {
1568                         dout("flushing dirty caps\n");
1569                         goto ack;
1570                 }
1571
1572                 /* completed revocation? going down and there are no caps? */
1573                 if (revoking && (revoking & used) == 0) {
1574                         dout("completed revocation of %s\n",
1575                              ceph_cap_string(cap->implemented & ~cap->issued));
1576                         goto ack;
1577                 }
1578
1579                 /* want more caps from mds? */
1580                 if (want & ~(cap->mds_wanted | cap->issued))
1581                         goto ack;
1582
1583                 /* things we might delay */
1584                 if ((cap->issued & ~retain) == 0 &&
1585                     cap->mds_wanted == want)
1586                         continue;     /* nope, all good */
1587
1588                 if (is_delayed)
1589                         goto ack;
1590
1591                 /* delay? */
1592                 if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
1593                     time_before(jiffies, ci->i_hold_caps_max)) {
1594                         dout(" delaying issued %s -> %s, wanted %s -> %s\n",
1595                              ceph_cap_string(cap->issued),
1596                              ceph_cap_string(cap->issued & retain),
1597                              ceph_cap_string(cap->mds_wanted),
1598                              ceph_cap_string(want));
1599                         delayed++;
1600                         continue;
1601                 }
1602
1603 ack:
1604                 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
1605                         dout(" skipping %p I_NOFLUSH set\n", inode);
1606                         continue;
1607                 }
1608
1609                 if (session && session != cap->session) {
1610                         dout("oops, wrong session %p mutex\n", session);
1611                         mutex_unlock(&session->s_mutex);
1612                         session = NULL;
1613                 }
1614                 if (!session) {
1615                         session = cap->session;
1616                         if (mutex_trylock(&session->s_mutex) == 0) {
1617                                 dout("inverting session/ino locks on %p\n",
1618                                      session);
1619                                 spin_unlock(&inode->i_lock);
1620                                 if (took_snap_rwsem) {
1621                                         up_read(&mdsc->snap_rwsem);
1622                                         took_snap_rwsem = 0;
1623                                 }
1624                                 mutex_lock(&session->s_mutex);
1625                                 goto retry;
1626                         }
1627                 }
1628                 /* take snap_rwsem after session mutex */
1629                 if (!took_snap_rwsem) {
1630                         if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
1631                                 dout("inverting snap/in locks on %p\n",
1632                                      inode);
1633                                 spin_unlock(&inode->i_lock);
1634                                 down_read(&mdsc->snap_rwsem);
1635                                 took_snap_rwsem = 1;
1636                                 goto retry;
1637                         }
1638                         took_snap_rwsem = 1;
1639                 }
1640
1641                 if (cap == ci->i_auth_cap && ci->i_dirty_caps)
1642                         flushing = __mark_caps_flushing(inode, session);
1643
1644                 mds = cap->mds;  /* remember mds, so we don't repeat */
1645                 sent++;
1646
1647                 /* __send_cap drops i_lock */
1648                 delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
1649                                       retain, flushing, NULL);
1650                 goto retry; /* retake i_lock and restart our cap scan. */
1651         }
1652
1653         /*
1654          * Reschedule delayed caps release if we delayed anything,
1655          * otherwise cancel.
1656          */
1657         if (delayed && is_delayed)
1658                 force_requeue = 1;   /* __send_cap delayed release; requeue */
1659         if (!delayed && !is_delayed)
1660                 __cap_delay_cancel(mdsc, ci);
1661         else if (!is_delayed || force_requeue)
1662                 __cap_delay_requeue(mdsc, ci);
1663
1664         spin_unlock(&inode->i_lock);
1665
1666         if (queue_invalidate)
1667                 ceph_queue_invalidate(inode);
1668
1669         if (session)
1670                 mutex_unlock(&session->s_mutex);
1671         if (took_snap_rwsem)
1672                 up_read(&mdsc->snap_rwsem);
1673 }
1674
1675 /*
1676  * Try to flush dirty caps back to the auth mds.
1677  */
1678 static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
1679                           unsigned *flush_tid)
1680 {
1681         struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
1682         struct ceph_inode_info *ci = ceph_inode(inode);
1683         int unlock_session = session ? 0 : 1;
1684         int flushing = 0;
1685
1686 retry:
1687         spin_lock(&inode->i_lock);
1688         if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
1689                 dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
1690                 goto out;
1691         }
1692         if (ci->i_dirty_caps && ci->i_auth_cap) {
1693                 struct ceph_cap *cap = ci->i_auth_cap;
1694                 int used = __ceph_caps_used(ci);
1695                 int want = __ceph_caps_wanted(ci);
1696                 int delayed;
1697
1698                 if (!session) {
1699                         spin_unlock(&inode->i_lock);
1700                         session = cap->session;
1701                         mutex_lock(&session->s_mutex);
1702                         goto retry;
1703                 }
1704                 BUG_ON(session != cap->session);
1705                 if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
1706                         goto out;
1707
1708                 flushing = __mark_caps_flushing(inode, session);
1709
1710                 /* __send_cap drops i_lock */
1711                 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
1712                                      cap->issued | cap->implemented, flushing,
1713                                      flush_tid);
1714                 if (!delayed)
1715                         goto out_unlocked;
1716
1717                 spin_lock(&inode->i_lock);
1718                 __cap_delay_requeue(mdsc, ci);
1719         }
1720 out:
1721         spin_unlock(&inode->i_lock);
1722 out_unlocked:
1723         if (session && unlock_session)
1724                 mutex_unlock(&session->s_mutex);
1725         return flushing;
1726 }
1727
1728 /*
1729  * Return true if we've flushed caps through the given flush_tid.
1730  */
1731 static int caps_are_flushed(struct inode *inode, unsigned tid)
1732 {
1733         struct ceph_inode_info *ci = ceph_inode(inode);
1734         int i, ret = 1;
1735
1736         spin_lock(&inode->i_lock);
1737         for (i = 0; i < CEPH_CAP_BITS; i++)
1738                 if ((ci->i_flushing_caps & (1 << i)) &&
1739                     ci->i_cap_flush_tid[i] <= tid) {
1740                         /* still flushing this bit */
1741                         ret = 0;
1742                         break;
1743                 }
1744         spin_unlock(&inode->i_lock);
1745         return ret;
1746 }
1747
1748 /*
1749  * Wait on any unsafe replies for the given inode.  First wait on the
1750  * newest request, and make that the upper bound.  Then, if there are
1751  * more requests, keep waiting on the oldest as long as it is still older
1752  * than the original request.
1753  */
1754 static void sync_write_wait(struct inode *inode)
1755 {
1756         struct ceph_inode_info *ci = ceph_inode(inode);
1757         struct list_head *head = &ci->i_unsafe_writes;
1758         struct ceph_osd_request *req;
1759         u64 last_tid;
1760
1761         spin_lock(&ci->i_unsafe_lock);
1762         if (list_empty(head))
1763                 goto out;
1764
1765         /* set upper bound as _last_ entry in chain */
1766         req = list_entry(head->prev, struct ceph_osd_request,
1767                          r_unsafe_item);
1768         last_tid = req->r_tid;
1769
1770         do {
1771                 ceph_osdc_get_request(req);
1772                 spin_unlock(&ci->i_unsafe_lock);
1773                 dout("sync_write_wait on tid %llu (until %llu)\n",
1774                      req->r_tid, last_tid);
1775                 wait_for_completion(&req->r_safe_completion);
1776                 spin_lock(&ci->i_unsafe_lock);
1777                 ceph_osdc_put_request(req);
1778
1779                 /*
1780                  * from here on look at first entry in chain, since we
1781                  * only want to wait for anything older than last_tid
1782                  */
1783                 if (list_empty(head))
1784                         break;
1785                 req = list_entry(head->next, struct ceph_osd_request,
1786                                  r_unsafe_item);
1787         } while (req->r_tid < last_tid);
1788 out:
1789         spin_unlock(&ci->i_unsafe_lock);
1790 }
1791
1792 int ceph_fsync(struct file *file, int datasync)
1793 {
1794         struct inode *inode = file->f_mapping->host;
1795         struct ceph_inode_info *ci = ceph_inode(inode);
1796         unsigned flush_tid;
1797         int ret;
1798         int dirty;
1799
1800         dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
1801         sync_write_wait(inode);
1802
1803         ret = filemap_write_and_wait(inode->i_mapping);
1804         if (ret < 0)
1805                 return ret;
1806
1807         dirty = try_flush_caps(inode, NULL, &flush_tid);
1808         dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
1809
1810         /*
1811          * only wait on non-file metadata writeback (the mds
1812          * can recover size and mtime, so we don't need to
1813          * wait for that)
1814          */
1815         if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
1816                 dout("fsync waiting for flush_tid %u\n", flush_tid);
1817                 ret = wait_event_interruptible(ci->i_cap_wq,
1818                                        caps_are_flushed(inode, flush_tid));
1819         }
1820
1821         dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
1822         return ret;
1823 }
1824
1825 /*
1826  * Flush any dirty caps back to the mds.  If we aren't asked to wait,
1827  * queue inode for flush but don't do so immediately, because we can
1828  * get by with fewer MDS messages if we wait for data writeback to
1829  * complete first.
1830  */
1831 int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
1832 {
1833         struct ceph_inode_info *ci = ceph_inode(inode);
1834         unsigned flush_tid;
1835         int err = 0;
1836         int dirty;
1837         int wait = wbc->sync_mode == WB_SYNC_ALL;
1838
1839         dout("write_inode %p wait=%d\n", inode, wait);
1840         if (wait) {
1841                 dirty = try_flush_caps(inode, NULL, &flush_tid);
1842                 if (dirty)
1843                         err = wait_event_interruptible(ci->i_cap_wq,
1844                                        caps_are_flushed(inode, flush_tid));
1845         } else {
1846                 struct ceph_mds_client *mdsc =
1847                         &ceph_sb_to_client(inode->i_sb)->mdsc;
1848
1849                 spin_lock(&inode->i_lock);
1850                 if (__ceph_caps_dirty(ci))
1851                         __cap_delay_requeue_front(mdsc, ci);
1852                 spin_unlock(&inode->i_lock);
1853         }
1854         return err;
1855 }
1856
1857 /*
1858  * After a recovering MDS goes active, we need to resend any caps
1859  * we were flushing.
1860  *
1861  * Caller holds session->s_mutex.
1862  */
1863 static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
1864                                    struct ceph_mds_session *session)
1865 {
1866         struct ceph_cap_snap *capsnap;
1867
1868         dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
1869         list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
1870                             flushing_item) {
1871                 struct ceph_inode_info *ci = capsnap->ci;
1872                 struct inode *inode = &ci->vfs_inode;
1873                 struct ceph_cap *cap;
1874
1875                 spin_lock(&inode->i_lock);
1876                 cap = ci->i_auth_cap;
1877                 if (cap && cap->session == session) {
1878                         dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
1879                              cap, capsnap);
1880                         __ceph_flush_snaps(ci, &session);
1881                 } else {
1882                         pr_err("%p auth cap %p not mds%d ???\n", inode,
1883                                cap, session->s_mds);
1884                 }
1885                 spin_unlock(&inode->i_lock);
1886         }
1887 }
1888
1889 void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
1890                              struct ceph_mds_session *session)
1891 {
1892         struct ceph_inode_info *ci;
1893
1894         kick_flushing_capsnaps(mdsc, session);
1895
1896         dout("kick_flushing_caps mds%d\n", session->s_mds);
1897         list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
1898                 struct inode *inode = &ci->vfs_inode;
1899                 struct ceph_cap *cap;
1900                 int delayed = 0;
1901
1902                 spin_lock(&inode->i_lock);
1903                 cap = ci->i_auth_cap;
1904                 if (cap && cap->session == session) {
1905                         dout("kick_flushing_caps %p cap %p %s\n", inode,
1906                              cap, ceph_cap_string(ci->i_flushing_caps));
1907                         delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
1908                                              __ceph_caps_used(ci),
1909                                              __ceph_caps_wanted(ci),
1910                                              cap->issued | cap->implemented,
1911                                              ci->i_flushing_caps, NULL);
1912                         if (delayed) {
1913                                 spin_lock(&inode->i_lock);
1914                                 __cap_delay_requeue(mdsc, ci);
1915                                 spin_unlock(&inode->i_lock);
1916                         }
1917                 } else {
1918                         pr_err("%p auth cap %p not mds%d ???\n", inode,
1919                                cap, session->s_mds);
1920                         spin_unlock(&inode->i_lock);
1921                 }
1922         }
1923 }
1924
1925
1926 /*
1927  * Take references to capabilities we hold, so that we don't release
1928  * them to the MDS prematurely.
1929  *
1930  * Protected by i_lock.
1931  */
1932 static void __take_cap_refs(struct ceph_inode_info *ci, int got)
1933 {
1934         if (got & CEPH_CAP_PIN)
1935                 ci->i_pin_ref++;
1936         if (got & CEPH_CAP_FILE_RD)
1937                 ci->i_rd_ref++;
1938         if (got & CEPH_CAP_FILE_CACHE)
1939                 ci->i_rdcache_ref++;
1940         if (got & CEPH_CAP_FILE_WR)
1941                 ci->i_wr_ref++;
1942         if (got & CEPH_CAP_FILE_BUFFER) {
1943                 if (ci->i_wrbuffer_ref == 0)
1944                         igrab(&ci->vfs_inode);
1945                 ci->i_wrbuffer_ref++;
1946                 dout("__take_cap_refs %p wrbuffer %d -> %d (?)\n",
1947                      &ci->vfs_inode, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref);
1948         }
1949 }
1950
1951 /*
1952  * Try to grab cap references.  Specify those refs we @want, and the
1953  * minimal set we @need.  Also include the larger offset we are writing
1954  * to (when applicable), and check against max_size here as well.
1955  * Note that caller is responsible for ensuring max_size increases are
1956  * requested from the MDS.
1957  */
1958 static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
1959                             int *got, loff_t endoff, int *check_max, int *err)
1960 {
1961         struct inode *inode = &ci->vfs_inode;
1962         int ret = 0;
1963         int have, implemented;
1964         int file_wanted;
1965
1966         dout("get_cap_refs %p need %s want %s\n", inode,
1967              ceph_cap_string(need), ceph_cap_string(want));
1968         spin_lock(&inode->i_lock);
1969
1970         /* make sure file is actually open */
1971         file_wanted = __ceph_caps_file_wanted(ci);
1972         if ((file_wanted & need) == 0) {
1973                 dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
1974                      ceph_cap_string(need), ceph_cap_string(file_wanted));
1975                 *err = -EBADF;
1976                 ret = 1;
1977                 goto out;
1978         }
1979
1980         if (need & CEPH_CAP_FILE_WR) {
1981                 if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
1982                         dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
1983                              inode, endoff, ci->i_max_size);
1984                         if (endoff > ci->i_wanted_max_size) {
1985                                 *check_max = 1;
1986                                 ret = 1;
1987                         }
1988                         goto out;
1989                 }
1990                 /*
1991                  * If a sync write is in progress, we must wait, so that we
1992                  * can get a final snapshot value for size+mtime.
1993                  */
1994                 if (__ceph_have_pending_cap_snap(ci)) {
1995                         dout("get_cap_refs %p cap_snap_pending\n", inode);
1996                         goto out;
1997                 }
1998         }
1999         have = __ceph_caps_issued(ci, &implemented);
2000
2001         /*
2002          * disallow writes while a truncate is pending
2003          */
2004         if (ci->i_truncate_pending)
2005                 have &= ~CEPH_CAP_FILE_WR;
2006
2007         if ((have & need) == need) {
2008                 /*
2009                  * Look at (implemented & ~have & not) so that we keep waiting
2010                  * on transition from wanted -> needed caps.  This is needed
2011                  * for WRBUFFER|WR -> WR to avoid a new WR sync write from
2012                  * going before a prior buffered writeback happens.
2013                  */
2014                 int not = want & ~(have & need);
2015                 int revoking = implemented & ~have;
2016                 dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
2017                      inode, ceph_cap_string(have), ceph_cap_string(not),
2018                      ceph_cap_string(revoking));
2019                 if ((revoking & not) == 0) {
2020                         *got = need | (have & want);
2021                         __take_cap_refs(ci, *got);
2022                         ret = 1;
2023                 }
2024         } else {
2025                 dout("get_cap_refs %p have %s needed %s\n", inode,
2026                      ceph_cap_string(have), ceph_cap_string(need));
2027         }
2028 out:
2029         spin_unlock(&inode->i_lock);
2030         dout("get_cap_refs %p ret %d got %s\n", inode,
2031              ret, ceph_cap_string(*got));
2032         return ret;
2033 }
2034
2035 /*
2036  * Check the offset we are writing up to against our current
2037  * max_size.  If necessary, tell the MDS we want to write to
2038  * a larger offset.
2039  */
2040 static void check_max_size(struct inode *inode, loff_t endoff)
2041 {
2042         struct ceph_inode_info *ci = ceph_inode(inode);
2043         int check = 0;
2044
2045         /* do we need to explicitly request a larger max_size? */
2046         spin_lock(&inode->i_lock);
2047         if ((endoff >= ci->i_max_size ||
2048              endoff > (inode->i_size << 1)) &&
2049             endoff > ci->i_wanted_max_size) {
2050                 dout("write %p at large endoff %llu, req max_size\n",
2051                      inode, endoff);
2052                 ci->i_wanted_max_size = endoff;
2053                 check = 1;
2054         }
2055         spin_unlock(&inode->i_lock);
2056         if (check)
2057                 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2058 }
2059
2060 /*
2061  * Wait for caps, and take cap references.  If we can't get a WR cap
2062  * due to a small max_size, make sure we check_max_size (and possibly
2063  * ask the mds) so we don't get hung up indefinitely.
2064  */
2065 int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got,
2066                   loff_t endoff)
2067 {
2068         int check_max, ret, err;
2069
2070 retry:
2071         if (endoff > 0)
2072                 check_max_size(&ci->vfs_inode, endoff);
2073         check_max = 0;
2074         err = 0;
2075         ret = wait_event_interruptible(ci->i_cap_wq,
2076                                        try_get_cap_refs(ci, need, want,
2077                                                         got, endoff,
2078                                                         &check_max, &err));
2079         if (err)
2080                 ret = err;
2081         if (check_max)
2082                 goto retry;
2083         return ret;
2084 }
2085
2086 /*
2087  * Take cap refs.  Caller must already know we hold at least one ref
2088  * on the caps in question or we don't know this is safe.
2089  */
2090 void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
2091 {
2092         spin_lock(&ci->vfs_inode.i_lock);
2093         __take_cap_refs(ci, caps);
2094         spin_unlock(&ci->vfs_inode.i_lock);
2095 }
2096
2097 /*
2098  * Release cap refs.
2099  *
2100  * If we released the last ref on any given cap, call ceph_check_caps
2101  * to release (or schedule a release).
2102  *
2103  * If we are releasing a WR cap (from a sync write), finalize any affected
2104  * cap_snap, and wake up any waiters.
2105  */
2106 void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
2107 {
2108         struct inode *inode = &ci->vfs_inode;
2109         int last = 0, put = 0, flushsnaps = 0, wake = 0;
2110         struct ceph_cap_snap *capsnap;
2111
2112         spin_lock(&inode->i_lock);
2113         if (had & CEPH_CAP_PIN)
2114                 --ci->i_pin_ref;
2115         if (had & CEPH_CAP_FILE_RD)
2116                 if (--ci->i_rd_ref == 0)
2117                         last++;
2118         if (had & CEPH_CAP_FILE_CACHE)
2119                 if (--ci->i_rdcache_ref == 0)
2120                         last++;
2121         if (had & CEPH_CAP_FILE_BUFFER) {
2122                 if (--ci->i_wrbuffer_ref == 0) {
2123                         last++;
2124                         put++;
2125                 }
2126                 dout("put_cap_refs %p wrbuffer %d -> %d (?)\n",
2127                      inode, ci->i_wrbuffer_ref+1, ci->i_wrbuffer_ref);
2128         }
2129         if (had & CEPH_CAP_FILE_WR)
2130                 if (--ci->i_wr_ref == 0) {
2131                         last++;
2132                         if (!list_empty(&ci->i_cap_snaps)) {
2133                                 capsnap = list_first_entry(&ci->i_cap_snaps,
2134                                                      struct ceph_cap_snap,
2135                                                      ci_item);
2136                                 if (capsnap->writing) {
2137                                         capsnap->writing = 0;
2138                                         flushsnaps =
2139                                                 __ceph_finish_cap_snap(ci,
2140                                                                        capsnap);
2141                                         wake = 1;
2142                                 }
2143                         }
2144                 }
2145         spin_unlock(&inode->i_lock);
2146
2147         dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
2148              last ? " last" : "", put ? " put" : "");
2149
2150         if (last && !flushsnaps)
2151                 ceph_check_caps(ci, 0, NULL);
2152         else if (flushsnaps)
2153                 ceph_flush_snaps(ci);
2154         if (wake)
2155                 wake_up_all(&ci->i_cap_wq);
2156         if (put)
2157                 iput(inode);
2158 }
2159
2160 /*
2161  * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
2162  * context.  Adjust per-snap dirty page accounting as appropriate.
2163  * Once all dirty data for a cap_snap is flushed, flush snapped file
2164  * metadata back to the MDS.  If we dropped the last ref, call
2165  * ceph_check_caps.
2166  */
2167 void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
2168                                 struct ceph_snap_context *snapc)
2169 {
2170         struct inode *inode = &ci->vfs_inode;
2171         int last = 0;
2172         int complete_capsnap = 0;
2173         int drop_capsnap = 0;
2174         int found = 0;
2175         struct ceph_cap_snap *capsnap = NULL;
2176
2177         spin_lock(&inode->i_lock);
2178         ci->i_wrbuffer_ref -= nr;
2179         last = !ci->i_wrbuffer_ref;
2180
2181         if (ci->i_head_snapc == snapc) {
2182                 ci->i_wrbuffer_ref_head -= nr;
2183                 if (!ci->i_wrbuffer_ref_head) {
2184                         ceph_put_snap_context(ci->i_head_snapc);
2185                         ci->i_head_snapc = NULL;
2186                 }
2187                 dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
2188                      inode,
2189                      ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
2190                      ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
2191                      last ? " LAST" : "");
2192         } else {
2193                 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
2194                         if (capsnap->context == snapc) {
2195                                 found = 1;
2196                                 break;
2197                         }
2198                 }
2199                 BUG_ON(!found);
2200                 capsnap->dirty_pages -= nr;
2201                 if (capsnap->dirty_pages == 0) {
2202                         complete_capsnap = 1;
2203                         if (capsnap->dirty == 0)
2204                                 /* cap writeback completed before we created
2205                                  * the cap_snap; no FLUSHSNAP is needed */
2206                                 drop_capsnap = 1;
2207                 }
2208                 dout("put_wrbuffer_cap_refs on %p cap_snap %p "
2209                      " snap %lld %d/%d -> %d/%d %s%s%s\n",
2210                      inode, capsnap, capsnap->context->seq,
2211                      ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
2212                      ci->i_wrbuffer_ref, capsnap->dirty_pages,
2213                      last ? " (wrbuffer last)" : "",
2214                      complete_capsnap ? " (complete capsnap)" : "",
2215                      drop_capsnap ? " (drop capsnap)" : "");
2216                 if (drop_capsnap) {
2217                         ceph_put_snap_context(capsnap->context);
2218                         list_del(&capsnap->ci_item);
2219                         list_del(&capsnap->flushing_item);
2220                         ceph_put_cap_snap(capsnap);
2221                 }
2222         }
2223
2224         spin_unlock(&inode->i_lock);
2225
2226         if (last) {
2227                 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2228                 iput(inode);
2229         } else if (complete_capsnap) {
2230                 ceph_flush_snaps(ci);
2231                 wake_up_all(&ci->i_cap_wq);
2232         }
2233         if (drop_capsnap)
2234                 iput(inode);
2235 }
2236
2237 /*
2238  * Handle a cap GRANT message from the MDS.  (Note that a GRANT may
2239  * actually be a revocation if it specifies a smaller cap set.)
2240  *
2241  * caller holds s_mutex and i_lock, we drop both.
2242  *
2243  * return value:
2244  *  0 - ok
2245  *  1 - check_caps on auth cap only (writeback)
2246  *  2 - check_caps (ack revoke)
2247  */
2248 static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2249                              struct ceph_mds_session *session,
2250                              struct ceph_cap *cap,
2251                              struct ceph_buffer *xattr_buf)
2252         __releases(inode->i_lock)
2253         __releases(session->s_mutex)
2254 {
2255         struct ceph_inode_info *ci = ceph_inode(inode);
2256         int mds = session->s_mds;
2257         int seq = le32_to_cpu(grant->seq);
2258         int newcaps = le32_to_cpu(grant->caps);
2259         int issued, implemented, used, wanted, dirty;
2260         u64 size = le64_to_cpu(grant->size);
2261         u64 max_size = le64_to_cpu(grant->max_size);
2262         struct timespec mtime, atime, ctime;
2263         int check_caps = 0;
2264         int wake = 0;
2265         int writeback = 0;
2266         int revoked_rdcache = 0;
2267         int queue_invalidate = 0;
2268
2269         dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
2270              inode, cap, mds, seq, ceph_cap_string(newcaps));
2271         dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
2272                 inode->i_size);
2273
2274         /*
2275          * If CACHE is being revoked, and we have no dirty buffers,
2276          * try to invalidate (once).  (If there are dirty buffers, we
2277          * will invalidate _after_ writeback.)
2278          */
2279         if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
2280             !ci->i_wrbuffer_ref) {
2281                 if (try_nonblocking_invalidate(inode) == 0) {
2282                         revoked_rdcache = 1;
2283                 } else {
2284                         /* there were locked pages.. invalidate later
2285                            in a separate thread. */
2286                         if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
2287                                 queue_invalidate = 1;
2288                                 ci->i_rdcache_revoking = ci->i_rdcache_gen;
2289                         }
2290                 }
2291         }
2292
2293         /* side effects now are allowed */
2294
2295         issued = __ceph_caps_issued(ci, &implemented);
2296         issued |= implemented | __ceph_caps_dirty(ci);
2297
2298         cap->cap_gen = session->s_cap_gen;
2299
2300         __check_cap_issue(ci, cap, newcaps);
2301
2302         if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
2303                 inode->i_mode = le32_to_cpu(grant->mode);
2304                 inode->i_uid = le32_to_cpu(grant->uid);
2305                 inode->i_gid = le32_to_cpu(grant->gid);
2306                 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
2307                      inode->i_uid, inode->i_gid);
2308         }
2309
2310         if ((issued & CEPH_CAP_LINK_EXCL) == 0)
2311                 inode->i_nlink = le32_to_cpu(grant->nlink);
2312
2313         if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
2314                 int len = le32_to_cpu(grant->xattr_len);
2315                 u64 version = le64_to_cpu(grant->xattr_version);
2316
2317                 if (version > ci->i_xattrs.version) {
2318                         dout(" got new xattrs v%llu on %p len %d\n",
2319                              version, inode, len);
2320                         if (ci->i_xattrs.blob)
2321                                 ceph_buffer_put(ci->i_xattrs.blob);
2322                         ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
2323                         ci->i_xattrs.version = version;
2324                 }
2325         }
2326
2327         /* size/ctime/mtime/atime? */
2328         ceph_fill_file_size(inode, issued,
2329                             le32_to_cpu(grant->truncate_seq),
2330                             le64_to_cpu(grant->truncate_size), size);
2331         ceph_decode_timespec(&mtime, &grant->mtime);
2332         ceph_decode_timespec(&atime, &grant->atime);
2333         ceph_decode_timespec(&ctime, &grant->ctime);
2334         ceph_fill_file_time(inode, issued,
2335                             le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
2336                             &atime);
2337
2338         /* max size increase? */
2339         if (max_size != ci->i_max_size) {
2340                 dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
2341                 ci->i_max_size = max_size;
2342                 if (max_size >= ci->i_wanted_max_size) {
2343                         ci->i_wanted_max_size = 0;  /* reset */
2344                         ci->i_requested_max_size = 0;
2345                 }
2346                 wake = 1;
2347         }
2348
2349         /* check cap bits */
2350         wanted = __ceph_caps_wanted(ci);
2351         used = __ceph_caps_used(ci);
2352         dirty = __ceph_caps_dirty(ci);
2353         dout(" my wanted = %s, used = %s, dirty %s\n",
2354              ceph_cap_string(wanted),
2355              ceph_cap_string(used),
2356              ceph_cap_string(dirty));
2357         if (wanted != le32_to_cpu(grant->wanted)) {
2358                 dout("mds wanted %s -> %s\n",
2359                      ceph_cap_string(le32_to_cpu(grant->wanted)),
2360                      ceph_cap_string(wanted));
2361                 grant->wanted = cpu_to_le32(wanted);
2362         }
2363
2364         cap->seq = seq;
2365
2366         /* file layout may have changed */
2367         ci->i_layout = grant->layout;
2368
2369         /* revocation, grant, or no-op? */
2370         if (cap->issued & ~newcaps) {
2371                 dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued),
2372                      ceph_cap_string(newcaps));
2373                 if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
2374                         writeback = 1; /* will delay ack */
2375                 else if (dirty & ~newcaps)
2376                         check_caps = 1;  /* initiate writeback in check_caps */
2377                 else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 ||
2378                            revoked_rdcache)
2379                         check_caps = 2;     /* send revoke ack in check_caps */
2380                 cap->issued = newcaps;
2381                 cap->implemented |= newcaps;
2382         } else if (cap->issued == newcaps) {
2383                 dout("caps unchanged: %s -> %s\n",
2384                      ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
2385         } else {
2386                 dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
2387                      ceph_cap_string(newcaps));
2388                 cap->issued = newcaps;
2389                 cap->implemented |= newcaps; /* add bits only, to
2390                                               * avoid stepping on a
2391                                               * pending revocation */
2392                 wake = 1;
2393         }
2394         BUG_ON(cap->issued & ~cap->implemented);
2395
2396         spin_unlock(&inode->i_lock);
2397         if (writeback)
2398                 /*
2399                  * queue inode for writeback: we can't actually call
2400                  * filemap_write_and_wait, etc. from message handler
2401                  * context.
2402                  */
2403                 ceph_queue_writeback(inode);
2404         if (queue_invalidate)
2405                 ceph_queue_invalidate(inode);
2406         if (wake)
2407                 wake_up_all(&ci->i_cap_wq);
2408
2409         if (check_caps == 1)
2410                 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
2411                                 session);
2412         else if (check_caps == 2)
2413                 ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
2414         else
2415                 mutex_unlock(&session->s_mutex);
2416 }
2417
2418 /*
2419  * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
2420  * MDS has been safely committed.
2421  */
2422 static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2423                                  struct ceph_mds_caps *m,
2424                                  struct ceph_mds_session *session,
2425                                  struct ceph_cap *cap)
2426         __releases(inode->i_lock)
2427 {
2428         struct ceph_inode_info *ci = ceph_inode(inode);
2429         struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
2430         unsigned seq = le32_to_cpu(m->seq);
2431         int dirty = le32_to_cpu(m->dirty);
2432         int cleaned = 0;
2433         int drop = 0;
2434         int i;
2435
2436         for (i = 0; i < CEPH_CAP_BITS; i++)
2437                 if ((dirty & (1 << i)) &&
2438                     flush_tid == ci->i_cap_flush_tid[i])
2439                         cleaned |= 1 << i;
2440
2441         dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
2442              " flushing %s -> %s\n",
2443              inode, session->s_mds, seq, ceph_cap_string(dirty),
2444              ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
2445              ceph_cap_string(ci->i_flushing_caps & ~cleaned));
2446
2447         if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
2448                 goto out;
2449
2450         ci->i_flushing_caps &= ~cleaned;
2451
2452         spin_lock(&mdsc->cap_dirty_lock);
2453         if (ci->i_flushing_caps == 0) {
2454                 list_del_init(&ci->i_flushing_item);
2455                 if (!list_empty(&session->s_cap_flushing))
2456                         dout(" mds%d still flushing cap on %p\n",
2457                              session->s_mds,
2458                              &list_entry(session->s_cap_flushing.next,
2459                                          struct ceph_inode_info,
2460                                          i_flushing_item)->vfs_inode);
2461                 mdsc->num_cap_flushing--;
2462                 wake_up_all(&mdsc->cap_flushing_wq);
2463                 dout(" inode %p now !flushing\n", inode);
2464
2465                 if (ci->i_dirty_caps == 0) {
2466                         dout(" inode %p now clean\n", inode);
2467                         BUG_ON(!list_empty(&ci->i_dirty_item));
2468                         drop = 1;
2469                 } else {
2470                         BUG_ON(list_empty(&ci->i_dirty_item));
2471                 }
2472         }
2473         spin_unlock(&mdsc->cap_dirty_lock);
2474         wake_up_all(&ci->i_cap_wq);
2475
2476 out:
2477         spin_unlock(&inode->i_lock);
2478         if (drop)
2479                 iput(inode);
2480 }
2481
2482 /*
2483  * Handle FLUSHSNAP_ACK.  MDS has flushed snap data to disk and we can
2484  * throw away our cap_snap.
2485  *
2486  * Caller hold s_mutex.
2487  */
2488 static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
2489                                      struct ceph_mds_caps *m,
2490                                      struct ceph_mds_session *session)
2491 {
2492         struct ceph_inode_info *ci = ceph_inode(inode);
2493         u64 follows = le64_to_cpu(m->snap_follows);
2494         struct ceph_cap_snap *capsnap;
2495         int drop = 0;
2496
2497         dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
2498              inode, ci, session->s_mds, follows);
2499
2500         spin_lock(&inode->i_lock);
2501         list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
2502                 if (capsnap->follows == follows) {
2503                         if (capsnap->flush_tid != flush_tid) {
2504                                 dout(" cap_snap %p follows %lld tid %lld !="
2505                                      " %lld\n", capsnap, follows,
2506                                      flush_tid, capsnap->flush_tid);
2507                                 break;
2508                         }
2509                         WARN_ON(capsnap->dirty_pages || capsnap->writing);
2510                         dout(" removing %p cap_snap %p follows %lld\n",
2511                              inode, capsnap, follows);
2512                         ceph_put_snap_context(capsnap->context);
2513                         list_del(&capsnap->ci_item);
2514                         list_del(&capsnap->flushing_item);
2515                         ceph_put_cap_snap(capsnap);
2516                         drop = 1;
2517                         break;
2518                 } else {
2519                         dout(" skipping cap_snap %p follows %lld\n",
2520                              capsnap, capsnap->follows);
2521                 }
2522         }
2523         spin_unlock(&inode->i_lock);
2524         if (drop)
2525                 iput(inode);
2526 }
2527
2528 /*
2529  * Handle TRUNC from MDS, indicating file truncation.
2530  *
2531  * caller hold s_mutex.
2532  */
2533 static void handle_cap_trunc(struct inode *inode,
2534                              struct ceph_mds_caps *trunc,
2535                              struct ceph_mds_session *session)
2536         __releases(inode->i_lock)
2537 {
2538         struct ceph_inode_info *ci = ceph_inode(inode);
2539         int mds = session->s_mds;
2540         int seq = le32_to_cpu(trunc->seq);
2541         u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
2542         u64 truncate_size = le64_to_cpu(trunc->truncate_size);
2543         u64 size = le64_to_cpu(trunc->size);
2544         int implemented = 0;
2545         int dirty = __ceph_caps_dirty(ci);
2546         int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
2547         int queue_trunc = 0;
2548
2549         issued |= implemented | dirty;
2550
2551         dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
2552              inode, mds, seq, truncate_size, truncate_seq);
2553         queue_trunc = ceph_fill_file_size(inode, issued,
2554                                           truncate_seq, truncate_size, size);
2555         spin_unlock(&inode->i_lock);
2556
2557         if (queue_trunc)
2558                 ceph_queue_vmtruncate(inode);
2559 }
2560
2561 /*
2562  * Handle EXPORT from MDS.  Cap is being migrated _from_ this mds to a
2563  * different one.  If we are the most recent migration we've seen (as
2564  * indicated by mseq), make note of the migrating cap bits for the
2565  * duration (until we see the corresponding IMPORT).
2566  *
2567  * caller holds s_mutex
2568  */
2569 static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
2570                               struct ceph_mds_session *session)
2571 {
2572         struct ceph_inode_info *ci = ceph_inode(inode);
2573         int mds = session->s_mds;
2574         unsigned mseq = le32_to_cpu(ex->migrate_seq);
2575         struct ceph_cap *cap = NULL, *t;
2576         struct rb_node *p;
2577         int remember = 1;
2578
2579         dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
2580              inode, ci, mds, mseq);
2581
2582         spin_lock(&inode->i_lock);
2583
2584         /* make sure we haven't seen a higher mseq */
2585         for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
2586                 t = rb_entry(p, struct ceph_cap, ci_node);
2587                 if (ceph_seq_cmp(t->mseq, mseq) > 0) {
2588                         dout(" higher mseq on cap from mds%d\n",
2589                              t->session->s_mds);
2590                         remember = 0;
2591                 }
2592                 if (t->session->s_mds == mds)
2593                         cap = t;
2594         }
2595
2596         if (cap) {
2597                 if (remember) {
2598                         /* make note */
2599                         ci->i_cap_exporting_mds = mds;
2600                         ci->i_cap_exporting_mseq = mseq;
2601                         ci->i_cap_exporting_issued = cap->issued;
2602                 }
2603                 __ceph_remove_cap(cap);
2604         }
2605         /* else, we already released it */
2606
2607         spin_unlock(&inode->i_lock);
2608 }
2609
2610 /*
2611  * Handle cap IMPORT.  If there are temp bits from an older EXPORT,
2612  * clean them up.
2613  *
2614  * caller holds s_mutex.
2615  */
2616 static void handle_cap_import(struct ceph_mds_client *mdsc,
2617                               struct inode *inode, struct ceph_mds_caps *im,
2618                               struct ceph_mds_session *session,
2619                               void *snaptrace, int snaptrace_len)
2620 {
2621         struct ceph_inode_info *ci = ceph_inode(inode);
2622         int mds = session->s_mds;
2623         unsigned issued = le32_to_cpu(im->caps);
2624         unsigned wanted = le32_to_cpu(im->wanted);
2625         unsigned seq = le32_to_cpu(im->seq);
2626         unsigned mseq = le32_to_cpu(im->migrate_seq);
2627         u64 realmino = le64_to_cpu(im->realm);
2628         u64 cap_id = le64_to_cpu(im->cap_id);
2629
2630         if (ci->i_cap_exporting_mds >= 0 &&
2631             ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) {
2632                 dout("handle_cap_import inode %p ci %p mds%d mseq %d"
2633                      " - cleared exporting from mds%d\n",
2634                      inode, ci, mds, mseq,
2635                      ci->i_cap_exporting_mds);
2636                 ci->i_cap_exporting_issued = 0;
2637                 ci->i_cap_exporting_mseq = 0;
2638                 ci->i_cap_exporting_mds = -1;
2639         } else {
2640                 dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
2641                      inode, ci, mds, mseq);
2642         }
2643
2644         down_write(&mdsc->snap_rwsem);
2645         ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
2646                                false);
2647         downgrade_write(&mdsc->snap_rwsem);
2648         ceph_add_cap(inode, session, cap_id, -1,
2649                      issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
2650                      NULL /* no caps context */);
2651         try_flush_caps(inode, session, NULL);
2652         up_read(&mdsc->snap_rwsem);
2653 }
2654
2655 /*
2656  * Handle a caps message from the MDS.
2657  *
2658  * Identify the appropriate session, inode, and call the right handler
2659  * based on the cap op.
2660  */
2661 void ceph_handle_caps(struct ceph_mds_session *session,
2662                       struct ceph_msg *msg)
2663 {
2664         struct ceph_mds_client *mdsc = session->s_mdsc;
2665         struct super_block *sb = mdsc->client->sb;
2666         struct inode *inode;
2667         struct ceph_cap *cap;
2668         struct ceph_mds_caps *h;
2669         int mds = session->s_mds;
2670         int op;
2671         u32 seq, mseq;
2672         struct ceph_vino vino;
2673         u64 cap_id;
2674         u64 size, max_size;
2675         u64 tid;
2676         void *snaptrace;
2677
2678         dout("handle_caps from mds%d\n", mds);
2679
2680         /* decode */
2681         tid = le64_to_cpu(msg->hdr.tid);
2682         if (msg->front.iov_len < sizeof(*h))
2683                 goto bad;
2684         h = msg->front.iov_base;
2685         snaptrace = h + 1;
2686         op = le32_to_cpu(h->op);
2687         vino.ino = le64_to_cpu(h->ino);
2688         vino.snap = CEPH_NOSNAP;
2689         cap_id = le64_to_cpu(h->cap_id);
2690         seq = le32_to_cpu(h->seq);
2691         mseq = le32_to_cpu(h->migrate_seq);
2692         size = le64_to_cpu(h->size);
2693         max_size = le64_to_cpu(h->max_size);
2694
2695         mutex_lock(&session->s_mutex);
2696         session->s_seq++;
2697         dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
2698              (unsigned)seq);
2699
2700         /* lookup ino */
2701         inode = ceph_find_inode(sb, vino);
2702         dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
2703              vino.snap, inode);
2704         if (!inode) {
2705                 dout(" i don't have ino %llx\n", vino.ino);
2706
2707                 if (op == CEPH_CAP_OP_IMPORT)
2708                         __queue_cap_release(session, vino.ino, cap_id,
2709                                             mseq, seq);
2710
2711                 /*
2712                  * send any full release message to try to move things
2713                  * along for the mds (who clearly thinks we still have this
2714                  * cap).
2715                  */
2716                 ceph_add_cap_releases(mdsc, session, -1);
2717                 ceph_send_cap_releases(mdsc, session);
2718                 goto done;
2719         }
2720
2721         /* these will work even if we don't have a cap yet */
2722         switch (op) {
2723         case CEPH_CAP_OP_FLUSHSNAP_ACK:
2724                 handle_cap_flushsnap_ack(inode, tid, h, session);
2725                 goto done;
2726
2727         case CEPH_CAP_OP_EXPORT:
2728                 handle_cap_export(inode, h, session);
2729                 goto done;
2730
2731         case CEPH_CAP_OP_IMPORT:
2732                 handle_cap_import(mdsc, inode, h, session,
2733                                   snaptrace, le32_to_cpu(h->snap_trace_len));
2734                 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY,
2735                                 session);
2736                 goto done_unlocked;
2737         }
2738
2739         /* the rest require a cap */
2740         spin_lock(&inode->i_lock);
2741         cap = __get_cap_for_mds(ceph_inode(inode), mds);
2742         if (!cap) {
2743                 dout(" no cap on %p ino %llx.%llx from mds%d\n",
2744                      inode, ceph_ino(inode), ceph_snap(inode), mds);
2745                 spin_unlock(&inode->i_lock);
2746                 goto done;
2747         }
2748
2749         /* note that each of these drops i_lock for us */
2750         switch (op) {
2751         case CEPH_CAP_OP_REVOKE:
2752         case CEPH_CAP_OP_GRANT:
2753                 handle_cap_grant(inode, h, session, cap, msg->middle);
2754                 goto done_unlocked;
2755
2756         case CEPH_CAP_OP_FLUSH_ACK:
2757                 handle_cap_flush_ack(inode, tid, h, session, cap);
2758                 break;
2759
2760         case CEPH_CAP_OP_TRUNC:
2761                 handle_cap_trunc(inode, h, session);
2762                 break;
2763
2764         default:
2765                 spin_unlock(&inode->i_lock);
2766                 pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
2767                        ceph_cap_op_name(op));
2768         }
2769
2770 done:
2771         mutex_unlock(&session->s_mutex);
2772 done_unlocked:
2773         if (inode)
2774                 iput(inode);
2775         return;
2776
2777 bad:
2778         pr_err("ceph_handle_caps: corrupt message\n");
2779         ceph_msg_dump(msg);
2780         return;
2781 }
2782
2783 /*
2784  * Delayed work handler to process end of delayed cap release LRU list.
2785  */
2786 void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
2787 {
2788         struct ceph_inode_info *ci;
2789         int flags = CHECK_CAPS_NODELAY;
2790
2791         dout("check_delayed_caps\n");
2792         while (1) {
2793                 spin_lock(&mdsc->cap_delay_lock);
2794                 if (list_empty(&mdsc->cap_delay_list))
2795                         break;
2796                 ci = list_first_entry(&mdsc->cap_delay_list,
2797                                       struct ceph_inode_info,
2798                                       i_cap_delay_list);
2799                 if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
2800                     time_before(jiffies, ci->i_hold_caps_max))
2801                         break;
2802                 list_del_init(&ci->i_cap_delay_list);
2803                 spin_unlock(&mdsc->cap_delay_lock);
2804                 dout("check_delayed_caps on %p\n", &ci->vfs_inode);
2805                 ceph_check_caps(ci, flags, NULL);
2806         }
2807         spin_unlock(&mdsc->cap_delay_lock);
2808 }
2809
2810 /*
2811  * Flush all dirty caps to the mds
2812  */
2813 void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
2814 {
2815         struct ceph_inode_info *ci, *nci = NULL;
2816         struct inode *inode, *ninode = NULL;
2817         struct list_head *p, *n;
2818
2819         dout("flush_dirty_caps\n");
2820         spin_lock(&mdsc->cap_dirty_lock);
2821         list_for_each_safe(p, n, &mdsc->cap_dirty) {
2822                 if (nci) {
2823                         ci = nci;
2824                         inode = ninode;
2825                         ci->i_ceph_flags &= ~CEPH_I_NOFLUSH;
2826                         dout("flush_dirty_caps inode %p (was next inode)\n",
2827                              inode);
2828                 } else {
2829                         ci = list_entry(p, struct ceph_inode_info,
2830                                         i_dirty_item);
2831                         inode = igrab(&ci->vfs_inode);
2832                         BUG_ON(!inode);
2833                         dout("flush_dirty_caps inode %p\n", inode);
2834                 }
2835                 if (n != &mdsc->cap_dirty) {
2836                         nci = list_entry(n, struct ceph_inode_info,
2837                                          i_dirty_item);
2838                         ninode = igrab(&nci->vfs_inode);
2839                         BUG_ON(!ninode);
2840                         nci->i_ceph_flags |= CEPH_I_NOFLUSH;
2841                         dout("flush_dirty_caps next inode %p, noflush\n",
2842                              ninode);
2843                 } else {
2844                         nci = NULL;
2845                         ninode = NULL;
2846                 }
2847                 spin_unlock(&mdsc->cap_dirty_lock);
2848                 if (inode) {
2849                         ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH,
2850                                         NULL);
2851                         iput(inode);
2852                 }
2853                 spin_lock(&mdsc->cap_dirty_lock);
2854         }
2855         spin_unlock(&mdsc->cap_dirty_lock);
2856 }
2857
2858 /*
2859  * Drop open file reference.  If we were the last open file,
2860  * we may need to release capabilities to the MDS (or schedule
2861  * their delayed release).
2862  */
2863 void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
2864 {
2865         struct inode *inode = &ci->vfs_inode;
2866         int last = 0;
2867
2868         spin_lock(&inode->i_lock);
2869         dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
2870              ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
2871         BUG_ON(ci->i_nr_by_mode[fmode] == 0);
2872         if (--ci->i_nr_by_mode[fmode] == 0)
2873                 last++;
2874         spin_unlock(&inode->i_lock);
2875
2876         if (last && ci->i_vino.snap == CEPH_NOSNAP)
2877                 ceph_check_caps(ci, 0, NULL);
2878 }
2879
2880 /*
2881  * Helpers for embedding cap and dentry lease releases into mds
2882  * requests.
2883  *
2884  * @force is used by dentry_release (below) to force inclusion of a
2885  * record for the directory inode, even when there aren't any caps to
2886  * drop.
2887  */
2888 int ceph_encode_inode_release(void **p, struct inode *inode,
2889                               int mds, int drop, int unless, int force)
2890 {
2891         struct ceph_inode_info *ci = ceph_inode(inode);
2892         struct ceph_cap *cap;
2893         struct ceph_mds_request_release *rel = *p;
2894         int used, dirty;
2895         int ret = 0;
2896
2897         spin_lock(&inode->i_lock);
2898         used = __ceph_caps_used(ci);
2899         dirty = __ceph_caps_dirty(ci);
2900
2901         dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n",
2902              inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop),
2903              ceph_cap_string(unless));
2904
2905         /* only drop unused, clean caps */
2906         drop &= ~(used | dirty);
2907
2908         cap = __get_cap_for_mds(ci, mds);
2909         if (cap && __cap_is_valid(cap)) {
2910                 if (force ||
2911                     ((cap->issued & drop) &&
2912                      (cap->issued & unless) == 0)) {
2913                         if ((cap->issued & drop) &&
2914                             (cap->issued & unless) == 0) {
2915                                 dout("encode_inode_release %p cap %p %s -> "
2916                                      "%s\n", inode, cap,
2917                                      ceph_cap_string(cap->issued),
2918                                      ceph_cap_string(cap->issued & ~drop));
2919                                 cap->issued &= ~drop;
2920                                 cap->implemented &= ~drop;
2921                                 if (ci->i_ceph_flags & CEPH_I_NODELAY) {
2922                                         int wanted = __ceph_caps_wanted(ci);
2923                                         dout("  wanted %s -> %s (act %s)\n",
2924                                              ceph_cap_string(cap->mds_wanted),
2925                                              ceph_cap_string(cap->mds_wanted &
2926                                                              ~wanted),
2927                                              ceph_cap_string(wanted));
2928                                         cap->mds_wanted &= wanted;
2929                                 }
2930                         } else {
2931                                 dout("encode_inode_release %p cap %p %s"
2932                                      " (force)\n", inode, cap,
2933                                      ceph_cap_string(cap->issued));
2934                         }
2935
2936                         rel->ino = cpu_to_le64(ceph_ino(inode));
2937                         rel->cap_id = cpu_to_le64(cap->cap_id);
2938                         rel->seq = cpu_to_le32(cap->seq);
2939                         rel->issue_seq = cpu_to_le32(cap->issue_seq),
2940                         rel->mseq = cpu_to_le32(cap->mseq);
2941                         rel->caps = cpu_to_le32(cap->issued);
2942                         rel->wanted = cpu_to_le32(cap->mds_wanted);
2943                         rel->dname_len = 0;
2944                         rel->dname_seq = 0;
2945                         *p += sizeof(*rel);
2946                         ret = 1;
2947                 } else {
2948                         dout("encode_inode_release %p cap %p %s\n",
2949                              inode, cap, ceph_cap_string(cap->issued));
2950                 }
2951         }
2952         spin_unlock(&inode->i_lock);
2953         return ret;
2954 }
2955
2956 int ceph_encode_dentry_release(void **p, struct dentry *dentry,
2957                                int mds, int drop, int unless)
2958 {
2959         struct inode *dir = dentry->d_parent->d_inode;
2960         struct ceph_mds_request_release *rel = *p;
2961         struct ceph_dentry_info *di = ceph_dentry(dentry);
2962         int force = 0;
2963         int ret;
2964
2965         /*
2966          * force an record for the directory caps if we have a dentry lease.
2967          * this is racy (can't take i_lock and d_lock together), but it
2968          * doesn't have to be perfect; the mds will revoke anything we don't
2969          * release.
2970          */
2971         spin_lock(&dentry->d_lock);
2972         if (di->lease_session && di->lease_session->s_mds == mds)
2973                 force = 1;
2974         spin_unlock(&dentry->d_lock);
2975
2976         ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
2977
2978         spin_lock(&dentry->d_lock);
2979         if (ret && di->lease_session && di->lease_session->s_mds == mds) {
2980                 dout("encode_dentry_release %p mds%d seq %d\n",
2981                      dentry, mds, (int)di->lease_seq);
2982                 rel->dname_len = cpu_to_le32(dentry->d_name.len);
2983                 memcpy(*p, dentry->d_name.name, dentry->d_name.len);
2984                 *p += dentry->d_name.len;
2985                 rel->dname_seq = cpu_to_le32(di->lease_seq);
2986                 __ceph_mdsc_drop_dentry_lease(dentry);
2987         }
2988         spin_unlock(&dentry->d_lock);
2989         return ret;
2990 }