fs/ceph/locks.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 #include <linux/ceph/ceph_debug.h>
   3
   4 #include <linux/file.h>
   5 #include <linux/namei.h>
   6 #include <linux/random.h>
   7
   8 #include "super.h"
   9 #include "mds_client.h"
  10 #include <linux/ceph/pagelist.h>
  11
/* Mask XORed into lock-owner pointers; filled in by ceph_flock_init(). */
static u64 lock_secret;
/*
 * Forward declaration: ceph_lock_message() installs this as the request's
 * r_wait_for_completion callback for blocking lock requests.
 */
static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
                                         struct ceph_mds_request *req);
  15
  16 static inline u64 secure_addr(void *addr)
  17 {
  18         u64 v = lock_secret ^ (u64)(unsigned long)addr;
  19         /*
  20          * Set the most significant bit, so that MDS knows the 'owner'
  21          * is sufficient to identify the owner of lock. (old code uses
  22          * both 'owner' and 'pid')
  23          */
  24         v |= (1ULL << 63);
  25         return v;
  26 }
  27
/* Fill lock_secret with random bytes; secure_addr() uses it as its XOR mask. */
void __init ceph_flock_init(void)
{
        get_random_bytes(&lock_secret, sizeof(lock_secret));
}
  32
  33 static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
  34 {
  35         struct inode *inode = file_inode(src->fl_file);
  36         atomic_inc(&ceph_inode(inode)->i_filelock_ref);
  37 }
  38
  39 static void ceph_fl_release_lock(struct file_lock *fl)
  40 {
  41         struct inode *inode = file_inode(fl->fl_file);
  42         struct ceph_inode_info *ci = ceph_inode(inode);
  43         if (atomic_dec_and_test(&ci->i_filelock_ref)) {
  44                 /* clear error when all locks are released */
  45                 spin_lock(&ci->i_ceph_lock);
  46                 ci->i_ceph_flags &= ~CEPH_I_ERROR_FILELOCK;
  47                 spin_unlock(&ci->i_ceph_lock);
  48         }
  49 }
  50
/*
 * Lock operations that ceph_lock_message() attaches to a file_lock for
 * SETFILELOCK requests; the hooks keep the inode's i_filelock_ref in step
 * with lock copies and releases.
 */
static const struct file_lock_operations ceph_fl_lock_ops = {
        .fl_copy_lock = ceph_fl_copy_lock,
        .fl_release_private = ceph_fl_release_lock,
};
  55
  56 /**
  57  * Implement fcntl and flock locking functions.
  58  */
  59 static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
  60                              int cmd, u8 wait, struct file_lock *fl)
  61 {
  62         struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
  63         struct ceph_mds_request *req;
  64         int err;
  65         u64 length = 0;
  66         u64 owner;
  67
  68         if (operation == CEPH_MDS_OP_SETFILELOCK) {
  69                 /*
  70                  * increasing i_filelock_ref closes race window between
  71                  * handling request reply and adding file_lock struct to
  72                  * inode. Otherwise, auth caps may get trimmed in the
  73                  * window. Caller function will decrease the counter.
  74                  */
  75                 fl->fl_ops = &ceph_fl_lock_ops;
  76                 atomic_inc(&ceph_inode(inode)->i_filelock_ref);
  77         }
  78
  79         if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK)
  80                 wait = 0;
  81
  82         req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
  83         if (IS_ERR(req))
  84                 return PTR_ERR(req);
  85         req->r_inode = inode;
  86         ihold(inode);
  87         req->r_num_caps = 1;
  88
  89         /* mds requires start and length rather than start and end */
  90         if (LLONG_MAX == fl->fl_end)
  91                 length = 0;
  92         else
  93                 length = fl->fl_end - fl->fl_start + 1;
  94
  95         owner = secure_addr(fl->fl_owner);
  96
  97         dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, "
  98              "start: %llu, length: %llu, wait: %d, type: %d\n", (int)lock_type,
  99              (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length,
 100              wait, fl->fl_type);
 101
 102         req->r_args.filelock_change.rule = lock_type;
 103         req->r_args.filelock_change.type = cmd;
 104         req->r_args.filelock_change.owner = cpu_to_le64(owner);
 105         req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
 106         req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
 107         req->r_args.filelock_change.length = cpu_to_le64(length);
 108         req->r_args.filelock_change.wait = wait;
 109
 110         if (wait)
 111                 req->r_wait_for_completion = ceph_lock_wait_for_completion;
 112
 113         err = ceph_mdsc_do_request(mdsc, inode, req);
 114         if (!err && operation == CEPH_MDS_OP_GETFILELOCK) {
 115                 fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid);
 116                 if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
 117                         fl->fl_type = F_RDLCK;
 118                 else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
 119                         fl->fl_type = F_WRLCK;
 120                 else
 121                         fl->fl_type = F_UNLCK;
 122
 123                 fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start);
 124                 length = le64_to_cpu(req->r_reply_info.filelock_reply->start) +
 125                                                  le64_to_cpu(req->r_reply_info.filelock_reply->length);
 126                 if (length >= 1)
 127                         fl->fl_end = length -1;
 128                 else
 129                         fl->fl_end = 0;
 130
 131         }
 132         ceph_mdsc_put_request(req);
 133         dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
 134              "length: %llu, wait: %d, type: %d, err code %d\n", (int)lock_type,
 135              (int)operation, (u64)fl->fl_pid, fl->fl_start,
 136              length, wait, fl->fl_type, err);
 137         return err;
 138 }
 139
 140 static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
 141                                          struct ceph_mds_request *req)
 142 {
 143         struct ceph_mds_request *intr_req;
 144         struct inode *inode = req->r_inode;
 145         int err, lock_type;
 146
 147         BUG_ON(req->r_op != CEPH_MDS_OP_SETFILELOCK);
 148         if (req->r_args.filelock_change.rule == CEPH_LOCK_FCNTL)
 149                 lock_type = CEPH_LOCK_FCNTL_INTR;
 150         else if (req->r_args.filelock_change.rule == CEPH_LOCK_FLOCK)
 151                 lock_type = CEPH_LOCK_FLOCK_INTR;
 152         else
 153                 BUG_ON(1);
 154         BUG_ON(req->r_args.filelock_change.type == CEPH_LOCK_UNLOCK);
 155
 156         err = wait_for_completion_interruptible(&req->r_completion);
 157         if (!err)
 158                 return 0;
 159
 160         dout("ceph_lock_wait_for_completion: request %llu was interrupted\n",
 161              req->r_tid);
 162
 163         mutex_lock(&mdsc->mutex);
 164         if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
 165                 err = 0;
 166         } else {
 167                 /*
 168                  * ensure we aren't running concurrently with
 169                  * ceph_fill_trace or ceph_readdir_prepopulate, which
 170                  * rely on locks (dir mutex) held by our caller.
 171                  */
 172                 mutex_lock(&req->r_fill_mutex);
 173                 req->r_err = err;
 174                 set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
 175                 mutex_unlock(&req->r_fill_mutex);
 176
 177                 if (!req->r_session) {
 178                         // haven't sent the request
 179                         err = 0;
 180                 }
 181         }
 182         mutex_unlock(&mdsc->mutex);
 183         if (!err)
 184                 return 0;
 185
 186         intr_req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETFILELOCK,
 187                                             USE_AUTH_MDS);
 188         if (IS_ERR(intr_req))
 189                 return PTR_ERR(intr_req);
 190
 191         intr_req->r_inode = inode;
 192         ihold(inode);
 193         intr_req->r_num_caps = 1;
 194
 195         intr_req->r_args.filelock_change = req->r_args.filelock_change;
 196         intr_req->r_args.filelock_change.rule = lock_type;
 197         intr_req->r_args.filelock_change.type = CEPH_LOCK_UNLOCK;
 198
 199         err = ceph_mdsc_do_request(mdsc, inode, intr_req);
 200         ceph_mdsc_put_request(intr_req);
 201
 202         if (err && err != -ERESTARTSYS)
 203                 return err;
 204
 205         wait_for_completion_killable(&req->r_safe_completion);
 206         return 0;
 207 }
 208
 209 /**
 210  * Attempt to set an fcntl lock.
 211  * For now, this just goes away to the server. Later it may be more awesome.
 212  */
 213 int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
 214 {
 215         struct inode *inode = file_inode(file);
 216         struct ceph_inode_info *ci = ceph_inode(inode);
 217         int err = 0;
 218         u16 op = CEPH_MDS_OP_SETFILELOCK;
 219         u8 wait = 0;
 220         u8 lock_cmd;
 221
 222         if (!(fl->fl_flags & FL_POSIX))
 223                 return -ENOLCK;
 224         /* No mandatory locks */
 225         if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
 226                 return -ENOLCK;
 227
 228         dout("ceph_lock, fl_owner: %p\n", fl->fl_owner);
 229
 230         /* set wait bit as appropriate, then make command as Ceph expects it*/
 231         if (IS_GETLK(cmd))
 232                 op = CEPH_MDS_OP_GETFILELOCK;
 233         else if (IS_SETLKW(cmd))
 234                 wait = 1;
 235
 236         spin_lock(&ci->i_ceph_lock);
 237         if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
 238                 err = -EIO;
 239         }
 240         spin_unlock(&ci->i_ceph_lock);
 241         if (err < 0) {
 242                 if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type)
 243                         posix_lock_file(file, fl, NULL);
 244                 return err;
 245         }
 246
 247         if (F_RDLCK == fl->fl_type)
 248                 lock_cmd = CEPH_LOCK_SHARED;
 249         else if (F_WRLCK == fl->fl_type)
 250                 lock_cmd = CEPH_LOCK_EXCL;
 251         else
 252                 lock_cmd = CEPH_LOCK_UNLOCK;
 253
 254         err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl);
 255         if (!err) {
 256                 if (op == CEPH_MDS_OP_SETFILELOCK) {
 257                         dout("mds locked, locking locally\n");
 258                         err = posix_lock_file(file, fl, NULL);
 259                         if (err) {
 260                                 /* undo! This should only happen if
 261                                  * the kernel detects local
 262                                  * deadlock. */
 263                                 ceph_lock_message(CEPH_LOCK_FCNTL, op, inode,
 264                                                   CEPH_LOCK_UNLOCK, 0, fl);
 265                                 dout("got %d on posix_lock_file, undid lock\n",
 266                                      err);
 267                         }
 268                 }
 269         }
 270         return err;
 271 }
 272
 273 int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
 274 {
 275         struct inode *inode = file_inode(file);
 276         struct ceph_inode_info *ci = ceph_inode(inode);
 277         int err = 0;
 278         u8 wait = 0;
 279         u8 lock_cmd;
 280
 281         if (!(fl->fl_flags & FL_FLOCK))
 282                 return -ENOLCK;
 283         /* No mandatory locks */
 284         if (fl->fl_type & LOCK_MAND)
 285                 return -EOPNOTSUPP;
 286
 287         dout("ceph_flock, fl_file: %p\n", fl->fl_file);
 288
 289         spin_lock(&ci->i_ceph_lock);
 290         if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
 291                 err = -EIO;
 292         }
 293         spin_unlock(&ci->i_ceph_lock);
 294         if (err < 0) {
 295                 if (F_UNLCK == fl->fl_type)
 296                         locks_lock_file_wait(file, fl);
 297                 return err;
 298         }
 299
 300         if (IS_SETLKW(cmd))
 301                 wait = 1;
 302
 303         if (F_RDLCK == fl->fl_type)
 304                 lock_cmd = CEPH_LOCK_SHARED;
 305         else if (F_WRLCK == fl->fl_type)
 306                 lock_cmd = CEPH_LOCK_EXCL;
 307         else
 308                 lock_cmd = CEPH_LOCK_UNLOCK;
 309
 310         err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
 311                                 inode, lock_cmd, wait, fl);
 312         if (!err) {
 313                 err = locks_lock_file_wait(file, fl);
 314                 if (err) {
 315                         ceph_lock_message(CEPH_LOCK_FLOCK,
 316                                           CEPH_MDS_OP_SETFILELOCK,
 317                                           inode, CEPH_LOCK_UNLOCK, 0, fl);
 318                         dout("got %d on locks_lock_file_wait, undid lock\n", err);
 319                 }
 320         }
 321         return err;
 322 }
 323
 324 /*
 325  * Fills in the passed counter variables, so you can prepare pagelist metadata
 326  * before calling ceph_encode_locks.
 327  */
 328 void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
 329 {
 330         struct file_lock *lock;
 331         struct file_lock_context *ctx;
 332
 333         *fcntl_count = 0;
 334         *flock_count = 0;
 335
 336         ctx = inode->i_flctx;
 337         if (ctx) {
 338                 spin_lock(&ctx->flc_lock);
 339                 list_for_each_entry(lock, &ctx->flc_posix, fl_list)
 340                         ++(*fcntl_count);
 341                 list_for_each_entry(lock, &ctx->flc_flock, fl_list)
 342                         ++(*flock_count);
 343                 spin_unlock(&ctx->flc_lock);
 344         }
 345         dout("counted %d flock locks and %d fcntl locks\n",
 346              *flock_count, *fcntl_count);
 347 }
 348
 349 /*
 350  * Given a pointer to a lock, convert it to a ceph filelock
 351  */
 352 static int lock_to_ceph_filelock(struct file_lock *lock,
 353                                  struct ceph_filelock *cephlock)
 354 {
 355         int err = 0;
 356         cephlock->start = cpu_to_le64(lock->fl_start);
 357         cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
 358         cephlock->client = cpu_to_le64(0);
 359         cephlock->pid = cpu_to_le64((u64)lock->fl_pid);
 360         cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner));
 361
 362         switch (lock->fl_type) {
 363         case F_RDLCK:
 364                 cephlock->type = CEPH_LOCK_SHARED;
 365                 break;
 366         case F_WRLCK:
 367                 cephlock->type = CEPH_LOCK_EXCL;
 368                 break;
 369         case F_UNLCK:
 370                 cephlock->type = CEPH_LOCK_UNLOCK;
 371                 break;
 372         default:
 373                 dout("Have unknown lock type %d\n", lock->fl_type);
 374                 err = -EINVAL;
 375         }
 376
 377         return err;
 378 }
 379
 380 /**
 381  * Encode the flock and fcntl locks for the given inode into the ceph_filelock
 382  * array. Must be called with inode->i_lock already held.
 383  * If we encounter more of a specific lock type than expected, return -ENOSPC.
 384  */
 385 int ceph_encode_locks_to_buffer(struct inode *inode,
 386                                 struct ceph_filelock *flocks,
 387                                 int num_fcntl_locks, int num_flock_locks)
 388 {
 389         struct file_lock *lock;
 390         struct file_lock_context *ctx = inode->i_flctx;
 391         int err = 0;
 392         int seen_fcntl = 0;
 393         int seen_flock = 0;
 394         int l = 0;
 395
 396         dout("encoding %d flock and %d fcntl locks\n", num_flock_locks,
 397              num_fcntl_locks);
 398
 399         if (!ctx)
 400                 return 0;
 401
 402         spin_lock(&ctx->flc_lock);
 403         list_for_each_entry(lock, &ctx->flc_posix, fl_list) {
 404                 ++seen_fcntl;
 405                 if (seen_fcntl > num_fcntl_locks) {
 406                         err = -ENOSPC;
 407                         goto fail;
 408                 }
 409                 err = lock_to_ceph_filelock(lock, &flocks[l]);
 410                 if (err)
 411                         goto fail;
 412                 ++l;
 413         }
 414         list_for_each_entry(lock, &ctx->flc_flock, fl_list) {
 415                 ++seen_flock;
 416                 if (seen_flock > num_flock_locks) {
 417                         err = -ENOSPC;
 418                         goto fail;
 419                 }
 420                 err = lock_to_ceph_filelock(lock, &flocks[l]);
 421                 if (err)
 422                         goto fail;
 423                 ++l;
 424         }
 425 fail:
 426         spin_unlock(&ctx->flc_lock);
 427         return err;
 428 }
 429
 430 /**
 431  * Copy the encoded flock and fcntl locks into the pagelist.
 432  * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
 433  * sequential flock locks.
 434  * Returns zero on success.
 435  */
 436 int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
 437                            struct ceph_pagelist *pagelist,
 438                            int num_fcntl_locks, int num_flock_locks)
 439 {
 440         int err = 0;
 441         __le32 nlocks;
 442
 443         nlocks = cpu_to_le32(num_fcntl_locks);
 444         err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
 445         if (err)
 446                 goto out_fail;
 447
 448         if (num_fcntl_locks > 0) {
 449                 err = ceph_pagelist_append(pagelist, flocks,
 450                                            num_fcntl_locks * sizeof(*flocks));
 451                 if (err)
 452                         goto out_fail;
 453         }
 454
 455         nlocks = cpu_to_le32(num_flock_locks);
 456         err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
 457         if (err)
 458                 goto out_fail;
 459
 460         if (num_flock_locks > 0) {
 461                 err = ceph_pagelist_append(pagelist, &flocks[num_fcntl_locks],
 462                                            num_flock_locks * sizeof(*flocks));
 463         }
 464 out_fail:
 465         return err;
 466 }