net/sunrpc/xprtrdma/rpc_rdma.c

   1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
   2 /*
   3  * Copyright (c) 2014-2017 Oracle.  All rights reserved.
   4  * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
   5  *
   6  * This software is available to you under a choice of one of two
   7  * licenses.  You may choose to be licensed under the terms of the GNU
   8  * General Public License (GPL) Version 2, available from the file
   9  * COPYING in the main directory of this source tree, or the BSD-type
  10  * license below:
  11  *
  12  * Redistribution and use in source and binary forms, with or without
  13  * modification, are permitted provided that the following conditions
  14  * are met:
  15  *
  16  *      Redistributions of source code must retain the above copyright
  17  *      notice, this list of conditions and the following disclaimer.
  18  *
  19  *      Redistributions in binary form must reproduce the above
  20  *      copyright notice, this list of conditions and the following
  21  *      disclaimer in the documentation and/or other materials provided
  22  *      with the distribution.
  23  *
  24  *      Neither the name of the Network Appliance, Inc. nor the names of
  25  *      its contributors may be used to endorse or promote products
  26  *      derived from this software without specific prior written
  27  *      permission.
  28  *
  29  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  30  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  31  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  32  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  33  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  34  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  35  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  36  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  37  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  38  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  39  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  40  */
  41
  42 /*
  43  * rpc_rdma.c
  44  *
  45  * This file contains the guts of the RPC RDMA protocol, and
  46  * does marshaling/unmarshaling, etc. It is also where interfacing
  47  * to the Linux RPC framework lives.
  48  */
  49
  50 #include <linux/highmem.h>
  51
  52 #include <linux/sunrpc/svc_rdma.h>
  53
  54 #include "xprt_rdma.h"
  55 #include <trace/events/rpcrdma.h>
  56
  57 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
  58 # define RPCDBG_FACILITY        RPCDBG_TRANS
  59 #endif
  60
  61 /* Returns size of largest RPC-over-RDMA header in a Call message
  62  *
  63  * The largest Call header contains a full-size Read list and a
  64  * minimal Reply chunk.
  65  */
  66 static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs)
  67 {
  68         unsigned int size;
  69
  70         /* Fixed header fields and list discriminators */
  71         size = RPCRDMA_HDRLEN_MIN;
  72
  73         /* Maximum Read list size */
  74         size = maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32);
  75
  76         /* Minimal Read chunk size */
  77         size += sizeof(__be32); /* segment count */
  78         size += rpcrdma_segment_maxsz * sizeof(__be32);
  79         size += sizeof(__be32); /* list discriminator */
  80
  81         dprintk("RPC:       %s: max call header size = %u\n",
  82                 __func__, size);
  83         return size;
  84 }
  85
  86 /* Returns size of largest RPC-over-RDMA header in a Reply message
  87  *
  88  * There is only one Write list or one Reply chunk per Reply
  89  * message.  The larger list is the Write list.
  90  */
  91 static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
  92 {
  93         unsigned int size;
  94
  95         /* Fixed header fields and list discriminators */
  96         size = RPCRDMA_HDRLEN_MIN;
  97
  98         /* Maximum Write list size */
  99         size = sizeof(__be32);          /* segment count */
 100         size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32);
 101         size += sizeof(__be32); /* list discriminator */
 102
 103         dprintk("RPC:       %s: max reply header size = %u\n",
 104                 __func__, size);
 105         return size;
 106 }
 107
 108 /**
 109  * rpcrdma_set_max_header_sizes - Initialize inline payload sizes
 110  * @r_xprt: transport instance to initialize
 111  *
 112  * The max_inline fields contain the maximum size of an RPC message
 113  * so the marshaling code doesn't have to repeat this calculation
 114  * for every RPC.
 115  */
 116 void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt)
 117 {
 118         unsigned int maxsegs = r_xprt->rx_ia.ri_max_segs;
 119         struct rpcrdma_ep *ep = &r_xprt->rx_ep;
 120
 121         ep->rep_max_inline_send =
 122                 ep->rep_inline_send - rpcrdma_max_call_header_size(maxsegs);
 123         ep->rep_max_inline_recv =
 124                 ep->rep_inline_recv - rpcrdma_max_reply_header_size(maxsegs);
 125 }
 126
 127 /* The client can send a request inline as long as the RPCRDMA header
 128  * plus the RPC call fit under the transport's inline limit. If the
 129  * combined call message size exceeds that limit, the client must use
 130  * a Read chunk for this operation.
 131  *
 132  * A Read chunk is also required if sending the RPC call inline would
 133  * exceed this device's max_sge limit.
 134  */
 135 static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
 136                                 struct rpc_rqst *rqst)
 137 {
 138         struct xdr_buf *xdr = &rqst->rq_snd_buf;
 139         unsigned int count, remaining, offset;
 140
 141         if (xdr->len > r_xprt->rx_ep.rep_max_inline_send)
 142                 return false;
 143
 144         if (xdr->page_len) {
 145                 remaining = xdr->page_len;
 146                 offset = offset_in_page(xdr->page_base);
 147                 count = RPCRDMA_MIN_SEND_SGES;
 148                 while (remaining) {
 149                         remaining -= min_t(unsigned int,
 150                                            PAGE_SIZE - offset, remaining);
 151                         offset = 0;
 152                         if (++count > r_xprt->rx_ia.ri_max_send_sges)
 153                                 return false;
 154                 }
 155         }
 156
 157         return true;
 158 }
 159
 160 /* The client can't know how large the actual reply will be. Thus it
 161  * plans for the largest possible reply for that particular ULP
 162  * operation. If the maximum combined reply message size exceeds that
 163  * limit, the client must provide a write list or a reply chunk for
 164  * this request.
 165  */
 166 static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
 167                                    struct rpc_rqst *rqst)
 168 {
 169         return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep.rep_max_inline_recv;
 170 }
 171
 172 /* The client is required to provide a Reply chunk if the maximum
 173  * size of the non-payload part of the RPC Reply is larger than
 174  * the inline threshold.
 175  */
 176 static bool
 177 rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt,
 178                           const struct rpc_rqst *rqst)
 179 {
 180         const struct xdr_buf *buf = &rqst->rq_rcv_buf;
 181
 182         return (buf->head[0].iov_len + buf->tail[0].iov_len) <
 183                 r_xprt->rx_ep.rep_max_inline_recv;
 184 }
 185
 186 /* Split @vec on page boundaries into SGEs. FMR registers pages, not
 187  * a byte range. Other modes coalesce these SGEs into a single MR
 188  * when they can.
 189  *
 190  * Returns pointer to next available SGE, and bumps the total number
 191  * of SGEs consumed.
 192  */
 193 static struct rpcrdma_mr_seg *
 194 rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
 195                      unsigned int *n)
 196 {
 197         u32 remaining, page_offset;
 198         char *base;
 199
 200         base = vec->iov_base;
 201         page_offset = offset_in_page(base);
 202         remaining = vec->iov_len;
 203         while (remaining) {
 204                 seg->mr_page = NULL;
 205                 seg->mr_offset = base;
 206                 seg->mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining);
 207                 remaining -= seg->mr_len;
 208                 base += seg->mr_len;
 209                 ++seg;
 210                 ++(*n);
 211                 page_offset = 0;
 212         }
 213         return seg;
 214 }
 215
  216 /* Convert @xdrbuf into SGEs no larger than a page each, starting at
  217  * @seg. The head iovec is included only when @pos is zero. As they
  218  * are registered, these SGEs are then coalesced into RDMA segments
  219  * when the selected memreg mode supports it.
  220  * Returns number of SGEs consumed, or -ENOBUFS or -EIO on failure.
  221  */
  222
  223 static int
  224 rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
  225                      unsigned int pos, enum rpcrdma_chunktype type,
  226                      struct rpcrdma_mr_seg *seg)
  227 {
  228         unsigned long page_base;
  229         unsigned int len, n;
  230         struct page **ppages;
  231
  232         n = 0;
  233         if (pos == 0)   /* the head iovec is part of the chunk */
  234                 seg = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, &n);
  235
  236         len = xdrbuf->page_len;
  237         ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
  238         page_base = offset_in_page(xdrbuf->page_base);
  239         while (len) {
  240                 /* ACL likes to be lazy in allocating pages - ACLs
  241                  * are small by default but can get huge.
  242                  */
  243                 if (unlikely(xdrbuf->flags & XDRBUF_SPARSE_PAGES)) {
  244                         if (!*ppages)
  245                                 *ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN);
  246                         if (!*ppages)
  247                                 return -ENOBUFS;
  248                 }
  249                 seg->mr_page = *ppages;
  250                 seg->mr_offset = (char *)page_base; /* offset in the page */
  251                 seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len);
  252                 len -= seg->mr_len;
  253                 ++ppages;
  254                 ++seg;
  255                 ++n;
  256                 page_base = 0;  /* later pages start at offset zero */
  257         }
  258
  259         /* When encoding a Read chunk, the tail iovec contains an
  260          * XDR pad and may be omitted.
  261          */
  262         if (type == rpcrdma_readch && r_xprt->rx_ia.ri_implicit_roundup)
  263                 goto out;
  264
  265         /* When encoding a Write chunk, some servers need to see an
  266          * extra segment for non-XDR-aligned Write chunks. The upper
  267          * layer provides space in the tail iovec that may be used
  268          * for this purpose.
  269          */
  270         if (type == rpcrdma_writech && r_xprt->rx_ia.ri_implicit_roundup)
  271                 goto out;
  272
  273         if (xdrbuf->tail[0].iov_len)
  274                 seg = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n);
  275
  276 out:    /* NOTE(review): n is checked only after the entries were stored */
  277         if (unlikely(n > RPCRDMA_MAX_SEGS))
  278                 return -EIO;
  279         return n;
  280 }
 281
 282 static inline int
 283 encode_item_present(struct xdr_stream *xdr)
 284 {
 285         __be32 *p;
 286
 287         p = xdr_reserve_space(xdr, sizeof(*p));
 288         if (unlikely(!p))
 289                 return -EMSGSIZE;
 290
 291         *p = xdr_one;
 292         return 0;
 293 }
 294
 295 static inline int
 296 encode_item_not_present(struct xdr_stream *xdr)
 297 {
 298         __be32 *p;
 299
 300         p = xdr_reserve_space(xdr, sizeof(*p));
 301         if (unlikely(!p))
 302                 return -EMSGSIZE;
 303
 304         *p = xdr_zero;
 305         return 0;
 306 }
 307
 308 static void
 309 xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr *mr)
 310 {
 311         *iptr++ = cpu_to_be32(mr->mr_handle);
 312         *iptr++ = cpu_to_be32(mr->mr_length);
 313         xdr_encode_hyper(iptr, mr->mr_offset);
 314 }
 315
 316 static int
 317 encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr)
 318 {
 319         __be32 *p;
 320
 321         p = xdr_reserve_space(xdr, 4 * sizeof(*p));
 322         if (unlikely(!p))
 323                 return -EMSGSIZE;
 324
 325         xdr_encode_rdma_segment(p, mr);
 326         return 0;
 327 }
 328
 329 static int
 330 encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr,
 331                     u32 position)
 332 {
 333         __be32 *p;
 334
 335         p = xdr_reserve_space(xdr, 6 * sizeof(*p));
 336         if (unlikely(!p))
 337                 return -EMSGSIZE;
 338
 339         *p++ = xdr_one;                 /* Item present */
 340         *p++ = cpu_to_be32(position);
 341         xdr_encode_rdma_segment(p, mr);
 342         return 0;
 343 }
 344
 345 /* Register and XDR encode the Read list. Supports encoding a list of read
 346  * segments that belong to a single read chunk.
 347  *
 348  * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
 349  *
 350  *  Read chunklist (a linked list):
 351  *   N elements, position P (same P for all chunks of same arg!):
 352  *    1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
 353  *
 354  * Returns zero on success, or a negative errno if a failure occurred.
 355  * @xdr is advanced to the next position in the stream.
 356  *
 357  * Only a single @pos value is currently supported.
 358  */
 359 static noinline int
 360 rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
 361                          struct rpc_rqst *rqst, enum rpcrdma_chunktype rtype)
 362 {
 363         struct xdr_stream *xdr = &req->rl_stream;
 364         struct rpcrdma_mr_seg *seg;
 365         struct rpcrdma_mr *mr;
 366         unsigned int pos;
 367         int nsegs;
 368
 369         pos = rqst->rq_snd_buf.head[0].iov_len;
 370         if (rtype == rpcrdma_areadch)
 371                 pos = 0;
 372         seg = req->rl_segments;
 373         nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos,
 374                                      rtype, seg);
 375         if (nsegs < 0)
 376                 return nsegs;
 377
 378         do {
 379                 seg = frwr_map(r_xprt, seg, nsegs, false, rqst->rq_xid, &mr);
 380                 if (IS_ERR(seg))
 381                         return PTR_ERR(seg);
 382                 rpcrdma_mr_push(mr, &req->rl_registered);
 383
 384                 if (encode_read_segment(xdr, mr, pos) < 0)
 385                         return -EMSGSIZE;
 386
 387                 trace_xprtrdma_chunk_read(rqst->rq_task, pos, mr, nsegs);
 388                 r_xprt->rx_stats.read_chunk_count++;
 389                 nsegs -= mr->mr_nents;
 390         } while (nsegs);
 391
 392         return 0;
 393 }
 394
 395 /* Register and XDR encode the Write list. Supports encoding a list
 396  * containing one array of plain segments that belong to a single
 397  * write chunk.
 398  *
 399  * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
 400  *
 401  *  Write chunklist (a list of (one) counted array):
 402  *   N elements:
 403  *    1 - N - HLOO - HLOO - ... - HLOO - 0
 404  *
 405  * Returns zero on success, or a negative errno if a failure occurred.
 406  * @xdr is advanced to the next position in the stream.
 407  *
 408  * Only a single Write chunk is currently supported.
 409  */
 410 static noinline int
 411 rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
 412                           struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype)
 413 {
 414         struct xdr_stream *xdr = &req->rl_stream;
 415         struct rpcrdma_mr_seg *seg;
 416         struct rpcrdma_mr *mr;
 417         int nsegs, nchunks;
 418         __be32 *segcount;
 419
 420         seg = req->rl_segments;
 421         nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf,
 422                                      rqst->rq_rcv_buf.head[0].iov_len,
 423                                      wtype, seg);
 424         if (nsegs < 0)
 425                 return nsegs;
 426
 427         if (encode_item_present(xdr) < 0)
 428                 return -EMSGSIZE;
 429         segcount = xdr_reserve_space(xdr, sizeof(*segcount));
 430         if (unlikely(!segcount))
 431                 return -EMSGSIZE;
 432         /* Actual value encoded below */
 433
 434         nchunks = 0;
 435         do {
 436                 seg = frwr_map(r_xprt, seg, nsegs, true, rqst->rq_xid, &mr);
 437                 if (IS_ERR(seg))
 438                         return PTR_ERR(seg);
 439                 rpcrdma_mr_push(mr, &req->rl_registered);
 440
 441                 if (encode_rdma_segment(xdr, mr) < 0)
 442                         return -EMSGSIZE;
 443
 444                 trace_xprtrdma_chunk_write(rqst->rq_task, mr, nsegs);
 445                 r_xprt->rx_stats.write_chunk_count++;
 446                 r_xprt->rx_stats.total_rdma_request += mr->mr_length;
 447                 nchunks++;
 448                 nsegs -= mr->mr_nents;
 449         } while (nsegs);
 450
 451         /* Update count of segments in this Write chunk */
 452         *segcount = cpu_to_be32(nchunks);
 453
 454         return 0;
 455 }
 456
 457 /* Register and XDR encode the Reply chunk. Supports encoding an array
 458  * of plain segments that belong to a single write (reply) chunk.
 459  *
 460  * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
 461  *
 462  *  Reply chunk (a counted array):
 463  *   N elements:
 464  *    1 - N - HLOO - HLOO - ... - HLOO
 465  *
 466  * Returns zero on success, or a negative errno if a failure occurred.
 467  * @xdr is advanced to the next position in the stream.
 468  */
 469 static noinline int
 470 rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
 471                            struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype)
 472 {
 473         struct xdr_stream *xdr = &req->rl_stream;
 474         struct rpcrdma_mr_seg *seg;
 475         struct rpcrdma_mr *mr;
 476         int nsegs, nchunks;
 477         __be32 *segcount;
 478
 479         seg = req->rl_segments;
 480         nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg);
 481         if (nsegs < 0)
 482                 return nsegs;
 483
 484         if (encode_item_present(xdr) < 0)
 485                 return -EMSGSIZE;
 486         segcount = xdr_reserve_space(xdr, sizeof(*segcount));
 487         if (unlikely(!segcount))
 488                 return -EMSGSIZE;
 489         /* Actual value encoded below */
 490
 491         nchunks = 0;
 492         do {
 493                 seg = frwr_map(r_xprt, seg, nsegs, true, rqst->rq_xid, &mr);
 494                 if (IS_ERR(seg))
 495                         return PTR_ERR(seg);
 496                 rpcrdma_mr_push(mr, &req->rl_registered);
 497
 498                 if (encode_rdma_segment(xdr, mr) < 0)
 499                         return -EMSGSIZE;
 500
 501                 trace_xprtrdma_chunk_reply(rqst->rq_task, mr, nsegs);
 502                 r_xprt->rx_stats.reply_chunk_count++;
 503                 r_xprt->rx_stats.total_rdma_request += mr->mr_length;
 504                 nchunks++;
 505                 nsegs -= mr->mr_nents;
 506         } while (nsegs);
 507
 508         /* Update count of segments in the Reply chunk */
 509         *segcount = cpu_to_be32(nchunks);
 510
 511         return 0;
 512 }
 513
 514 static void rpcrdma_sendctx_done(struct kref *kref)
 515 {
 516         struct rpcrdma_req *req =
 517                 container_of(kref, struct rpcrdma_req, rl_kref);
 518         struct rpcrdma_rep *rep = req->rl_reply;
 519
 520         rpcrdma_complete_rqst(rep);
 521         rep->rr_rxprt->rx_stats.reply_waits_for_send++;
 522 }
 523
 524 /**
 525  * rpcrdma_sendctx_unmap - DMA-unmap Send buffer
 526  * @sc: sendctx containing SGEs to unmap
 527  *
 528  */
 529 void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc)
 530 {
 531         struct ib_sge *sge;
 532
 533         if (!sc->sc_unmap_count)
 534                 return;
 535
 536         /* The first two SGEs contain the transport header and
 537          * the inline buffer. These are always left mapped so
 538          * they can be cheaply re-used.
 539          */
 540         for (sge = &sc->sc_sges[2]; sc->sc_unmap_count;
 541              ++sge, --sc->sc_unmap_count)
 542                 ib_dma_unmap_page(sc->sc_device, sge->addr, sge->length,
 543                                   DMA_TO_DEVICE);
 544
 545         kref_put(&sc->sc_req->rl_kref, rpcrdma_sendctx_done);
 546 }
 547
 548 /* Prepare an SGE for the RPC-over-RDMA transport header.
 549  */
 550 static bool rpcrdma_prepare_hdr_sge(struct rpcrdma_xprt *r_xprt,
 551                                     struct rpcrdma_req *req, u32 len)
 552 {
 553         struct rpcrdma_sendctx *sc = req->rl_sendctx;
 554         struct rpcrdma_regbuf *rb = req->rl_rdmabuf;
 555         struct ib_sge *sge = sc->sc_sges;
 556
 557         if (!rpcrdma_regbuf_dma_map(r_xprt, rb))
 558                 goto out_regbuf;
 559         sge->addr = rdmab_addr(rb);
 560         sge->length = len;
 561         sge->lkey = rdmab_lkey(rb);
 562
 563         ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length,
 564                                       DMA_TO_DEVICE);
 565         sc->sc_wr.num_sge++;
 566         return true;
 567
 568 out_regbuf:
 569         pr_err("rpcrdma: failed to DMA map a Send buffer\n");
 570         return false;
 571 }
 572
  573 /* Prepare the Send SGEs: the head and tail iovec, and each page of
  574  * the page list, gets its own SGE. Returns true, or false on failure.
  575  */
  576 static bool rpcrdma_prepare_msg_sges(struct rpcrdma_xprt *r_xprt,
  577                                      struct rpcrdma_req *req,
  578                                      struct xdr_buf *xdr,
  579                                      enum rpcrdma_chunktype rtype)
  580 {
  581         struct rpcrdma_sendctx *sc = req->rl_sendctx;
  582         unsigned int sge_no, page_base, len, remaining;
  583         struct rpcrdma_regbuf *rb = req->rl_sendbuf;
  584         struct ib_sge *sge = sc->sc_sges;
  585         struct page *page, **ppages;
  586
  587         /* The head iovec is straightforward, as it is already
  588          * DMA-mapped. Sync the content that has changed.
  589          */
  590         if (!rpcrdma_regbuf_dma_map(r_xprt, rb))
  591                 goto out_regbuf;
  592         sc->sc_device = rdmab_device(rb);
  593         sge_no = 1;     /* SGE 0 holds the transport header */
  594         sge[sge_no].addr = rdmab_addr(rb);
  595         sge[sge_no].length = xdr->head[0].iov_len;
  596         sge[sge_no].lkey = rdmab_lkey(rb);
  597         ib_dma_sync_single_for_device(rdmab_device(rb), sge[sge_no].addr,
  598                                       sge[sge_no].length, DMA_TO_DEVICE);
  599
  600         /* If there is a Read chunk, the page list is being handled
  601          * via explicit RDMA, and thus is skipped here. However, the
  602          * tail iovec may include an XDR pad for the page list, as
  603          * well as additional content, and may not reside in the
  604          * same page as the head iovec.
  605          */
  606         if (rtype == rpcrdma_readch) {
  607                 len = xdr->tail[0].iov_len;
  608
  609                 /* Do not include the tail if it is only an XDR pad */
  610                 if (len < 4)
  611                         goto out;
  612
  613                 page = virt_to_page(xdr->tail[0].iov_base);
  614                 page_base = offset_in_page(xdr->tail[0].iov_base);
  615
  616                 /* If the content in the page list is an odd length,
  617                  * xdr_write_pages() has added a pad at the beginning
  618                  * of the tail iovec. Force the tail's non-pad content
  619                  * to land at the next XDR position in the Send message.
  620                  */
  621                 page_base += len & 3;
  622                 len -= len & 3;
  623                 goto map_tail;
  624         }
  625
  626         /* If there is a page list present, temporarily DMA map
  627          * and prepare an SGE for each page to be sent.
  628          */
  629         if (xdr->page_len) {
  630                 ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
  631                 page_base = offset_in_page(xdr->page_base);
  632                 remaining = xdr->page_len;
  633                 while (remaining) {
  634                         sge_no++;
  635                         if (sge_no > RPCRDMA_MAX_SEND_SGES - 2)
  636                                 goto out_mapping_overflow;
  637
  638                         len = min_t(u32, PAGE_SIZE - page_base, remaining);
  639                         sge[sge_no].addr =
  640                                 ib_dma_map_page(rdmab_device(rb), *ppages,
  641                                                 page_base, len, DMA_TO_DEVICE);
  642                         if (ib_dma_mapping_error(rdmab_device(rb),
  643                                                  sge[sge_no].addr))
  644                                 goto out_mapping_err;
  645                         sge[sge_no].length = len;
  646                         sge[sge_no].lkey = rdmab_lkey(rb);
  647
  648                         sc->sc_unmap_count++;
  649                         ppages++;
  650                         remaining -= len;
  651                         page_base = 0;
  652                 }
  653         }
  654
  655         /* The tail iovec is not always constructed in the same
  656          * page where the head iovec resides (see, for example,
  657          * gss_wrap_req_priv). To neatly accommodate that case,
  658          * DMA map it separately.
  659          */
  660         if (xdr->tail[0].iov_len) {
  661                 page = virt_to_page(xdr->tail[0].iov_base);
  662                 page_base = offset_in_page(xdr->tail[0].iov_base);
  663                 len = xdr->tail[0].iov_len;
  664
  665 map_tail:
  666                 sge_no++;
  667                 sge[sge_no].addr =
  668                         ib_dma_map_page(rdmab_device(rb), page, page_base, len,
  669                                         DMA_TO_DEVICE);
  670                 if (ib_dma_mapping_error(rdmab_device(rb), sge[sge_no].addr))
  671                         goto out_mapping_err;
  672                 sge[sge_no].length = len;
  673                 sge[sge_no].lkey = rdmab_lkey(rb);
  674                 sc->sc_unmap_count++;
  675         }
  676
  677 out:
  678         sc->sc_wr.num_sge += sge_no;    /* index of the last SGE filled */
  679         if (sc->sc_unmap_count)
  680                 kref_get(&req->rl_kref); /* put by rpcrdma_sendctx_unmap() */
  681         return true;
  682
  683 out_regbuf:
  684         pr_err("rpcrdma: failed to DMA map a Send buffer\n");
  685         return false;
  686
  687 out_mapping_overflow:
  688         rpcrdma_sendctx_unmap(sc);
  689         pr_err("rpcrdma: too many Send SGEs (%u)\n", sge_no);
  690         return false;
  691
  692 out_mapping_err:
  693         rpcrdma_sendctx_unmap(sc);
  694         trace_xprtrdma_dma_maperr(sge[sge_no].addr);
  695         return false;
  696 }
 697
 698 /**
 699  * rpcrdma_prepare_send_sges - Construct SGEs for a Send WR
 700  * @r_xprt: controlling transport
 701  * @req: context of RPC Call being marshalled
 702  * @hdrlen: size of transport header, in bytes
 703  * @xdr: xdr_buf containing RPC Call
 704  * @rtype: chunk type being encoded
 705  *
 706  * Returns 0 on success; otherwise a negative errno is returned.
 707  */
 708 int
 709 rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
 710                           struct rpcrdma_req *req, u32 hdrlen,
 711                           struct xdr_buf *xdr, enum rpcrdma_chunktype rtype)
 712 {
 713         int ret;
 714
 715         ret = -EAGAIN;
 716         req->rl_sendctx = rpcrdma_sendctx_get_locked(r_xprt);
 717         if (!req->rl_sendctx)
 718                 goto err;
 719         req->rl_sendctx->sc_wr.num_sge = 0;
 720         req->rl_sendctx->sc_unmap_count = 0;
 721         req->rl_sendctx->sc_req = req;
 722         kref_init(&req->rl_kref);
 723
 724         ret = -EIO;
 725         if (!rpcrdma_prepare_hdr_sge(r_xprt, req, hdrlen))
 726                 goto err;
 727         if (rtype != rpcrdma_areadch)
 728                 if (!rpcrdma_prepare_msg_sges(r_xprt, req, xdr, rtype))
 729                         goto err;
 730         return 0;
 731
 732 err:
 733         trace_xprtrdma_prepsend_failed(&req->rl_slot, ret);
 734         return ret;
 735 }
 736
 737 /**
 738  * rpcrdma_marshal_req - Marshal and send one RPC request
 739  * @r_xprt: controlling transport
 740  * @rqst: RPC request to be marshaled
 741  *
 742  * For the RPC in "rqst", this function:
 743  *  - Chooses the transfer mode (eg., RDMA_MSG or RDMA_NOMSG)
 744  *  - Registers Read, Write, and Reply chunks
 745  *  - Constructs the transport header
 746  *  - Posts a Send WR to send the transport header and request
 747  *
 748  * Returns:
 749  *      %0 if the RPC was sent successfully,
 750  *      %-ENOTCONN if the connection was lost,
 751  *      %-EAGAIN if the caller should call again with the same arguments,
 752  *      %-ENOBUFS if the caller should call again after a delay,
 753  *      %-EMSGSIZE if the transport header is too small,
 754  *      %-EIO if a permanent problem occurred while marshaling.
 755  */
 756 int
 757 rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
 758 {
 759         struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
 760         struct xdr_stream *xdr = &req->rl_stream;
 761         enum rpcrdma_chunktype rtype, wtype;
 762         bool ddp_allowed;
 763         __be32 *p;
 764         int ret;
 765
 766         rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0);
 767         xdr_init_encode(xdr, &req->rl_hdrbuf, rdmab_data(req->rl_rdmabuf),
 768                         rqst);
 769
 770         /* Fixed header fields */
 771         ret = -EMSGSIZE;
 772         p = xdr_reserve_space(xdr, 4 * sizeof(*p));
 773         if (!p)
 774                 goto out_err;
 775         *p++ = rqst->rq_xid;
 776         *p++ = rpcrdma_version;
 777         *p++ = cpu_to_be32(r_xprt->rx_buf.rb_max_requests);
 778
 779         /* When the ULP employs a GSS flavor that guarantees integrity
 780          * or privacy, direct data placement of individual data items
 781          * is not allowed.
 782          */
 783         ddp_allowed = !(rqst->rq_cred->cr_auth->au_flags &
 784                                                 RPCAUTH_AUTH_DATATOUCH);
 785
 786         /*
 787          * Chunks needed for results?
 788          *
 789          * o If the expected result is under the inline threshold, all ops
 790          *   return as inline.
 791          * o Large read ops return data as write chunk(s), header as
 792          *   inline.
 793          * o Large non-read ops return as a single reply chunk.
 794          */
 795         if (rpcrdma_results_inline(r_xprt, rqst))
 796                 wtype = rpcrdma_noch;
 797         else if ((ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ) &&
 798                  rpcrdma_nonpayload_inline(r_xprt, rqst))
 799                 wtype = rpcrdma_writech;
 800         else
 801                 wtype = rpcrdma_replych;
 802
 803         /*
 804          * Chunks needed for arguments?
 805          *
 806          * o If the total request is under the inline threshold, all ops
 807          *   are sent as inline.
 808          * o Large write ops transmit data as read chunk(s), header as
 809          *   inline.
 810          * o Large non-write ops are sent with the entire message as a
 811          *   single read chunk (protocol 0-position special case).
 812          *
 813          * This assumes that the upper layer does not present a request
 814          * that both has a data payload, and whose non-data arguments
 815          * by themselves are larger than the inline threshold.
 816          */
 817         if (rpcrdma_args_inline(r_xprt, rqst)) {
 818                 *p++ = rdma_msg;
 819                 rtype = rpcrdma_noch;
 820         } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
 821                 *p++ = rdma_msg;
 822                 rtype = rpcrdma_readch;
 823         } else {
 824                 r_xprt->rx_stats.nomsg_call_count++;
 825                 *p++ = rdma_nomsg;
 826                 rtype = rpcrdma_areadch;
 827         }
 828
 829         /* If this is a retransmit, discard previously registered
 830          * chunks. Very likely the connection has been replaced,
 831          * so these registrations are invalid and unusable.
 832          */
 833         while (unlikely(!list_empty(&req->rl_registered))) {
 834                 struct rpcrdma_mr *mr;
 835
 836                 mr = rpcrdma_mr_pop(&req->rl_registered);
 837                 rpcrdma_mr_recycle(mr);
 838         }
 839
 840         /* This implementation supports the following combinations
 841          * of chunk lists in one RPC-over-RDMA Call message:
 842          *
 843          *   - Read list
 844          *   - Write list
 845          *   - Reply chunk
 846          *   - Read list + Reply chunk
 847          *
 848          * It might not yet support the following combinations:
 849          *
 850          *   - Read list + Write list
 851          *
 852          * It does not support the following combinations:
 853          *
 854          *   - Write list + Reply chunk
 855          *   - Read list + Write list + Reply chunk
 856          *
 857          * This implementation supports only a single chunk in each
 858          * Read or Write list. Thus for example the client cannot
 859          * send a Call message with a Position Zero Read chunk and a
 860          * regular Read chunk at the same time.
 861          */
 862         if (rtype != rpcrdma_noch) {
 863                 ret = rpcrdma_encode_read_list(r_xprt, req, rqst, rtype);
 864                 if (ret)
 865                         goto out_err;
 866         }
 867         ret = encode_item_not_present(xdr);
 868         if (ret)
 869                 goto out_err;
 870
 871         if (wtype == rpcrdma_writech) {
 872                 ret = rpcrdma_encode_write_list(r_xprt, req, rqst, wtype);
 873                 if (ret)
 874                         goto out_err;
 875         }
 876         ret = encode_item_not_present(xdr);
 877         if (ret)
 878                 goto out_err;
 879
 880         if (wtype != rpcrdma_replych)
 881                 ret = encode_item_not_present(xdr);
 882         else
 883                 ret = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, wtype);
 884         if (ret)
 885                 goto out_err;
 886
 887         ret = rpcrdma_prepare_send_sges(r_xprt, req, req->rl_hdrbuf.len,
 888                                         &rqst->rq_snd_buf, rtype);
 889         if (ret)
 890                 goto out_err;
 891
 892         trace_xprtrdma_marshal(req, rtype, wtype);
 893         return 0;
 894
 895 out_err:
 896         trace_xprtrdma_marshal_failed(rqst, ret);
 897         r_xprt->rx_stats.failed_marshal_count++;
 898         frwr_reset(req);
 899         return ret;
 900 }
 901
 902 /**
 903  * rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs
 904  * @rqst: controlling RPC request
 905  * @srcp: points to RPC message payload in receive buffer
 906  * @copy_len: remaining length of receive buffer content
 907  * @pad: Write chunk pad bytes needed (zero for pure inline)
 908  *
 909  * The upper layer has set the maximum number of bytes it can
 910  * receive in each component of rq_rcv_buf. These values are set in
 911  * the head.iov_len, page_len, tail.iov_len, and buflen fields.
 912  *
 913  * Unlike the TCP equivalent (xdr_partial_copy_from_skb), in
 914  * many cases this function simply updates iov_base pointers in
 915  * rq_rcv_buf to point directly to the received reply data, to
 916  * avoid copying reply data.
 917  *
 918  * Returns the count of bytes which had to be memcopied.
 919  */
 920 static unsigned long
 921 rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
 922 {
 923         unsigned long fixup_copy_count;
 924         int i, npages, curlen;
 925         char *destp;
 926         struct page **ppages;
 927         int page_base;
 928
 929         /* The head iovec is redirected to the RPC reply message
 930          * in the receive buffer, to avoid a memcopy.
 931          */
 932         rqst->rq_rcv_buf.head[0].iov_base = srcp;
 933         rqst->rq_private_buf.head[0].iov_base = srcp;
 934
 935         /* The contents of the receive buffer that follow
 936          * head.iov_len bytes are copied into the page list.
 937          */
 938         curlen = rqst->rq_rcv_buf.head[0].iov_len;
 939         if (curlen > copy_len)
 940                 curlen = copy_len;
 941         trace_xprtrdma_fixup(rqst, copy_len, curlen);
 942         srcp += curlen;
 943         copy_len -= curlen;
 944
 945         ppages = rqst->rq_rcv_buf.pages +
 946                 (rqst->rq_rcv_buf.page_base >> PAGE_SHIFT);
 947         page_base = offset_in_page(rqst->rq_rcv_buf.page_base);
 948         fixup_copy_count = 0;
 949         if (copy_len && rqst->rq_rcv_buf.page_len) {
 950                 int pagelist_len;
 951
 952                 pagelist_len = rqst->rq_rcv_buf.page_len;
 953                 if (pagelist_len > copy_len)
 954                         pagelist_len = copy_len;
 955                 npages = PAGE_ALIGN(page_base + pagelist_len) >> PAGE_SHIFT;
 956                 for (i = 0; i < npages; i++) {
 957                         curlen = PAGE_SIZE - page_base;
 958                         if (curlen > pagelist_len)
 959                                 curlen = pagelist_len;
 960
 961                         trace_xprtrdma_fixup_pg(rqst, i, srcp,
 962                                                 copy_len, curlen);
 963                         destp = kmap_atomic(ppages[i]);
 964                         memcpy(destp + page_base, srcp, curlen);
 965                         flush_dcache_page(ppages[i]);
 966                         kunmap_atomic(destp);
 967                         srcp += curlen;
 968                         copy_len -= curlen;
 969                         fixup_copy_count += curlen;
 970                         pagelist_len -= curlen;
 971                         if (!pagelist_len)
 972                                 break;
 973                         page_base = 0;
 974                 }
 975
 976                 /* Implicit padding for the last segment in a Write
 977                  * chunk is inserted inline at the front of the tail
 978                  * iovec. The upper layer ignores the content of
 979                  * the pad. Simply ensure inline content in the tail
 980                  * that follows the Write chunk is properly aligned.
 981                  */
 982                 if (pad)
 983                         srcp -= pad;
 984         }
 985
 986         /* The tail iovec is redirected to the remaining data
 987          * in the receive buffer, to avoid a memcopy.
 988          */
 989         if (copy_len || pad) {
 990                 rqst->rq_rcv_buf.tail[0].iov_base = srcp;
 991                 rqst->rq_private_buf.tail[0].iov_base = srcp;
 992         }
 993
 994         return fixup_copy_count;
 995 }
 996
 997 /* By convention, backchannel calls arrive via rdma_msg type
 998  * messages, and never populate the chunk lists. This makes
 999  * the RPC/RDMA header small and fixed in size, so it is
1000  * straightforward to check the RPC header's direction field.
1001  */
1002 static bool
1003 rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep)
1004 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
1005 {
1006         struct xdr_stream *xdr = &rep->rr_stream;
1007         __be32 *p;
1008
1009         if (rep->rr_proc != rdma_msg)
1010                 return false;
1011
1012         /* Peek at stream contents without advancing. */
1013         p = xdr_inline_decode(xdr, 0);
1014
1015         /* Chunk lists */
1016         if (*p++ != xdr_zero)
1017                 return false;
1018         if (*p++ != xdr_zero)
1019                 return false;
1020         if (*p++ != xdr_zero)
1021                 return false;
1022
1023         /* RPC header */
1024         if (*p++ != rep->rr_xid)
1025                 return false;
1026         if (*p != cpu_to_be32(RPC_CALL))
1027                 return false;
1028
1029         /* Now that we are sure this is a backchannel call,
1030          * advance to the RPC header.
1031          */
1032         p = xdr_inline_decode(xdr, 3 * sizeof(*p));
1033         if (unlikely(!p))
1034                 goto out_short;
1035
1036         rpcrdma_bc_receive_call(r_xprt, rep);
1037         return true;
1038
1039 out_short:
1040         pr_warn("RPC/RDMA short backward direction call\n");
1041         return true;
1042 }
1043 #else   /* CONFIG_SUNRPC_BACKCHANNEL */
1044 {
1045         return false;
1046 }
1047 #endif  /* CONFIG_SUNRPC_BACKCHANNEL */
1048
1049 static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length)
1050 {
1051         u32 handle;
1052         u64 offset;
1053         __be32 *p;
1054
1055         p = xdr_inline_decode(xdr, 4 * sizeof(*p));
1056         if (unlikely(!p))
1057                 return -EIO;
1058
1059         handle = be32_to_cpup(p++);
1060         *length = be32_to_cpup(p++);
1061         xdr_decode_hyper(p, &offset);
1062
1063         trace_xprtrdma_decode_seg(handle, *length, offset);
1064         return 0;
1065 }
1066
1067 static int decode_write_chunk(struct xdr_stream *xdr, u32 *length)
1068 {
1069         u32 segcount, seglength;
1070         __be32 *p;
1071
1072         p = xdr_inline_decode(xdr, sizeof(*p));
1073         if (unlikely(!p))
1074                 return -EIO;
1075
1076         *length = 0;
1077         segcount = be32_to_cpup(p);
1078         while (segcount--) {
1079                 if (decode_rdma_segment(xdr, &seglength))
1080                         return -EIO;
1081                 *length += seglength;
1082         }
1083
1084         return 0;
1085 }
1086
1087 /* In RPC-over-RDMA Version One replies, a Read list is never
1088  * expected. This decoder is a stub that returns an error if
1089  * a Read list is present.
1090  */
1091 static int decode_read_list(struct xdr_stream *xdr)
1092 {
1093         __be32 *p;
1094
1095         p = xdr_inline_decode(xdr, sizeof(*p));
1096         if (unlikely(!p))
1097                 return -EIO;
1098         if (unlikely(*p != xdr_zero))
1099                 return -EIO;
1100         return 0;
1101 }
1102
1103 /* Supports only one Write chunk in the Write list
1104  */
1105 static int decode_write_list(struct xdr_stream *xdr, u32 *length)
1106 {
1107         u32 chunklen;
1108         bool first;
1109         __be32 *p;
1110
1111         *length = 0;
1112         first = true;
1113         do {
1114                 p = xdr_inline_decode(xdr, sizeof(*p));
1115                 if (unlikely(!p))
1116                         return -EIO;
1117                 if (*p == xdr_zero)
1118                         break;
1119                 if (!first)
1120                         return -EIO;
1121
1122                 if (decode_write_chunk(xdr, &chunklen))
1123                         return -EIO;
1124                 *length += chunklen;
1125                 first = false;
1126         } while (true);
1127         return 0;
1128 }
1129
1130 static int decode_reply_chunk(struct xdr_stream *xdr, u32 *length)
1131 {
1132         __be32 *p;
1133
1134         p = xdr_inline_decode(xdr, sizeof(*p));
1135         if (unlikely(!p))
1136                 return -EIO;
1137
1138         *length = 0;
1139         if (*p != xdr_zero)
1140                 if (decode_write_chunk(xdr, length))
1141                         return -EIO;
1142         return 0;
1143 }
1144
1145 static int
1146 rpcrdma_decode_msg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
1147                    struct rpc_rqst *rqst)
1148 {
1149         struct xdr_stream *xdr = &rep->rr_stream;
1150         u32 writelist, replychunk, rpclen;
1151         char *base;
1152
1153         /* Decode the chunk lists */
1154         if (decode_read_list(xdr))
1155                 return -EIO;
1156         if (decode_write_list(xdr, &writelist))
1157                 return -EIO;
1158         if (decode_reply_chunk(xdr, &replychunk))
1159                 return -EIO;
1160
1161         /* RDMA_MSG sanity checks */
1162         if (unlikely(replychunk))
1163                 return -EIO;
1164
1165         /* Build the RPC reply's Payload stream in rqst->rq_rcv_buf */
1166         base = (char *)xdr_inline_decode(xdr, 0);
1167         rpclen = xdr_stream_remaining(xdr);
1168         r_xprt->rx_stats.fixup_copy_count +=
1169                 rpcrdma_inline_fixup(rqst, base, rpclen, writelist & 3);
1170
1171         r_xprt->rx_stats.total_rdma_reply += writelist;
1172         return rpclen + xdr_align_size(writelist);
1173 }
1174
1175 static noinline int
1176 rpcrdma_decode_nomsg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep)
1177 {
1178         struct xdr_stream *xdr = &rep->rr_stream;
1179         u32 writelist, replychunk;
1180
1181         /* Decode the chunk lists */
1182         if (decode_read_list(xdr))
1183                 return -EIO;
1184         if (decode_write_list(xdr, &writelist))
1185                 return -EIO;
1186         if (decode_reply_chunk(xdr, &replychunk))
1187                 return -EIO;
1188
1189         /* RDMA_NOMSG sanity checks */
1190         if (unlikely(writelist))
1191                 return -EIO;
1192         if (unlikely(!replychunk))
1193                 return -EIO;
1194
1195         /* Reply chunk buffer already is the reply vector */
1196         r_xprt->rx_stats.total_rdma_reply += replychunk;
1197         return replychunk;
1198 }
1199
1200 static noinline int
1201 rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
1202                      struct rpc_rqst *rqst)
1203 {
1204         struct xdr_stream *xdr = &rep->rr_stream;
1205         __be32 *p;
1206
1207         p = xdr_inline_decode(xdr, sizeof(*p));
1208         if (unlikely(!p))
1209                 return -EIO;
1210
1211         switch (*p) {
1212         case err_vers:
1213                 p = xdr_inline_decode(xdr, 2 * sizeof(*p));
1214                 if (!p)
1215                         break;
1216                 dprintk("RPC:       %s: server reports "
1217                         "version error (%u-%u), xid %08x\n", __func__,
1218                         be32_to_cpup(p), be32_to_cpu(*(p + 1)),
1219                         be32_to_cpu(rep->rr_xid));
1220                 break;
1221         case err_chunk:
1222                 dprintk("RPC:       %s: server reports "
1223                         "header decoding error, xid %08x\n", __func__,
1224                         be32_to_cpu(rep->rr_xid));
1225                 break;
1226         default:
1227                 dprintk("RPC:       %s: server reports "
1228                         "unrecognized error %d, xid %08x\n", __func__,
1229                         be32_to_cpup(p), be32_to_cpu(rep->rr_xid));
1230         }
1231
1232         r_xprt->rx_stats.bad_reply_count++;
1233         return -EREMOTEIO;
1234 }
1235
1236 /* Perform XID lookup, reconstruction of the RPC reply, and
1237  * RPC completion while holding the transport lock to ensure
1238  * the rep, rqst, and rq_task pointers remain stable.
1239  */
1240 void rpcrdma_complete_rqst(struct rpcrdma_rep *rep)
1241 {
1242         struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
1243         struct rpc_xprt *xprt = &r_xprt->rx_xprt;
1244         struct rpc_rqst *rqst = rep->rr_rqst;
1245         int status;
1246
1247         xprt->reestablish_timeout = 0;
1248
1249         switch (rep->rr_proc) {
1250         case rdma_msg:
1251                 status = rpcrdma_decode_msg(r_xprt, rep, rqst);
1252                 break;
1253         case rdma_nomsg:
1254                 status = rpcrdma_decode_nomsg(r_xprt, rep);
1255                 break;
1256         case rdma_error:
1257                 status = rpcrdma_decode_error(r_xprt, rep, rqst);
1258                 break;
1259         default:
1260                 status = -EIO;
1261         }
1262         if (status < 0)
1263                 goto out_badheader;
1264
1265 out:
1266         spin_lock(&xprt->queue_lock);
1267         xprt_complete_rqst(rqst->rq_task, status);
1268         xprt_unpin_rqst(rqst);
1269         spin_unlock(&xprt->queue_lock);
1270         return;
1271
1272 /* If the incoming reply terminated a pending RPC, the next
1273  * RPC call will post a replacement receive buffer as it is
1274  * being marshaled.
1275  */
1276 out_badheader:
1277         trace_xprtrdma_reply_hdr(rep);
1278         r_xprt->rx_stats.bad_reply_count++;
1279         goto out;
1280 }
1281
1282 static void rpcrdma_reply_done(struct kref *kref)
1283 {
1284         struct rpcrdma_req *req =
1285                 container_of(kref, struct rpcrdma_req, rl_kref);
1286
1287         rpcrdma_complete_rqst(req->rl_reply);
1288 }
1289
1290 /**
1291  * rpcrdma_reply_handler - Process received RPC/RDMA messages
1292  * @rep: Incoming rpcrdma_rep object to process
1293  *
1294  * Errors must result in the RPC task either being awakened, or
1295  * allowed to timeout, to discover the errors at that time.
1296  */
1297 void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
1298 {
1299         struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
1300         struct rpc_xprt *xprt = &r_xprt->rx_xprt;
1301         struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1302         struct rpcrdma_req *req;
1303         struct rpc_rqst *rqst;
1304         u32 credits;
1305         __be32 *p;
1306
1307         /* Fixed transport header fields */
1308         xdr_init_decode(&rep->rr_stream, &rep->rr_hdrbuf,
1309                         rep->rr_hdrbuf.head[0].iov_base, NULL);
1310         p = xdr_inline_decode(&rep->rr_stream, 4 * sizeof(*p));
1311         if (unlikely(!p))
1312                 goto out_shortreply;
1313         rep->rr_xid = *p++;
1314         rep->rr_vers = *p++;
1315         credits = be32_to_cpu(*p++);
1316         rep->rr_proc = *p++;
1317
1318         if (rep->rr_vers != rpcrdma_version)
1319                 goto out_badversion;
1320
1321         if (rpcrdma_is_bcall(r_xprt, rep))
1322                 return;
1323
1324         /* Match incoming rpcrdma_rep to an rpcrdma_req to
1325          * get context for handling any incoming chunks.
1326          */
1327         spin_lock(&xprt->queue_lock);
1328         rqst = xprt_lookup_rqst(xprt, rep->rr_xid);
1329         if (!rqst)
1330                 goto out_norqst;
1331         xprt_pin_rqst(rqst);
1332         spin_unlock(&xprt->queue_lock);
1333
1334         if (credits == 0)
1335                 credits = 1;    /* don't deadlock */
1336         else if (credits > buf->rb_max_requests)
1337                 credits = buf->rb_max_requests;
1338         if (buf->rb_credits != credits) {
1339                 spin_lock_bh(&xprt->transport_lock);
1340                 buf->rb_credits = credits;
1341                 xprt->cwnd = credits << RPC_CWNDSHIFT;
1342                 spin_unlock_bh(&xprt->transport_lock);
1343         }
1344
1345         req = rpcr_to_rdmar(rqst);
1346         if (req->rl_reply) {
1347                 trace_xprtrdma_leaked_rep(rqst, req->rl_reply);
1348                 rpcrdma_recv_buffer_put(req->rl_reply);
1349         }
1350         req->rl_reply = rep;
1351         rep->rr_rqst = rqst;
1352
1353         trace_xprtrdma_reply(rqst->rq_task, rep, req, credits);
1354
1355         if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE)
1356                 frwr_reminv(rep, &req->rl_registered);
1357         if (!list_empty(&req->rl_registered))
1358                 frwr_unmap_async(r_xprt, req);
1359                 /* LocalInv completion will complete the RPC */
1360         else
1361                 kref_put(&req->rl_kref, rpcrdma_reply_done);
1362         return;
1363
1364 out_badversion:
1365         trace_xprtrdma_reply_vers(rep);
1366         goto out;
1367
1368 out_norqst:
1369         spin_unlock(&xprt->queue_lock);
1370         trace_xprtrdma_reply_rqst(rep);
1371         goto out;
1372
1373 out_shortreply:
1374         trace_xprtrdma_reply_short(rep);
1375
1376 out:
1377         rpcrdma_recv_buffer_put(rep);
1378 }