drivers/nvme/host/tcp.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * NVMe over Fabrics TCP host.
4  * Copyright (c) 2018 Lightbits Labs. All rights reserved.
5  */
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7 #include <linux/module.h>
8 #include <linux/init.h>
9 #include <linux/slab.h>
10 #include <linux/err.h>
11 #include <linux/nvme-tcp.h>
12 #include <net/sock.h>
13 #include <net/tcp.h>
14 #include <linux/blk-mq.h>
15 #include <crypto/hash.h>
16
17 #include "nvme.h"
18 #include "fabrics.h"
19
20 struct nvme_tcp_queue;
21
22 enum nvme_tcp_send_state {
23         NVME_TCP_SEND_CMD_PDU = 0,
24         NVME_TCP_SEND_H2C_PDU,
25         NVME_TCP_SEND_DATA,
26         NVME_TCP_SEND_DDGST,
27 };
28
29 struct nvme_tcp_request {
30         struct nvme_request     req;
31         void                    *pdu;
32         struct nvme_tcp_queue   *queue;
33         u32                     data_len;
34         u32                     pdu_len;
35         u32                     pdu_sent;
36         u16                     ttag;
37         struct list_head        entry;
38         u32                     ddgst;
39
40         struct bio              *curr_bio;
41         struct iov_iter         iter;
42
43         /* send state */
44         size_t                  offset;
45         size_t                  data_sent;
46         enum nvme_tcp_send_state state;
47 };
48
49 enum nvme_tcp_queue_flags {
50         NVME_TCP_Q_ALLOCATED    = 0,
51         NVME_TCP_Q_LIVE         = 1,
52 };
53
54 enum nvme_tcp_recv_state {
55         NVME_TCP_RECV_PDU = 0,
56         NVME_TCP_RECV_DATA,
57         NVME_TCP_RECV_DDGST,
58 };
59
60 struct nvme_tcp_ctrl;
61 struct nvme_tcp_queue {
62         struct socket           *sock;
63         struct work_struct      io_work;
64         int                     io_cpu;
65
66         spinlock_t              lock;
67         struct list_head        send_list;
68
69         /* recv state */
70         void                    *pdu;
71         int                     pdu_remaining;
72         int                     pdu_offset;
73         size_t                  data_remaining;
74         size_t                  ddgst_remaining;
75
76         /* send state */
77         struct nvme_tcp_request *request;
78
79         int                     queue_size;
80         size_t                  cmnd_capsule_len;
81         struct nvme_tcp_ctrl    *ctrl;
82         unsigned long           flags;
83         bool                    rd_enabled;
84
85         bool                    hdr_digest;
86         bool                    data_digest;
87         struct ahash_request    *rcv_hash;
88         struct ahash_request    *snd_hash;
89         __le32                  exp_ddgst;
90         __le32                  recv_ddgst;
91
92         struct page_frag_cache  pf_cache;
93
94         void (*state_change)(struct sock *);
95         void (*data_ready)(struct sock *);
96         void (*write_space)(struct sock *);
97 };
98
99 struct nvme_tcp_ctrl {
100         /* read only in the hot path */
101         struct nvme_tcp_queue   *queues;
102         struct blk_mq_tag_set   tag_set;
103
104         /* other member variables */
105         struct list_head        list;
106         struct blk_mq_tag_set   admin_tag_set;
107         struct sockaddr_storage addr;
108         struct sockaddr_storage src_addr;
109         struct nvme_ctrl        ctrl;
110
111         struct work_struct      err_work;
112         struct delayed_work     connect_work;
113         struct nvme_tcp_request async_req;
114 };
115
116 static LIST_HEAD(nvme_tcp_ctrl_list);
117 static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
118 static struct workqueue_struct *nvme_tcp_wq;
119 static struct blk_mq_ops nvme_tcp_mq_ops;
120 static struct blk_mq_ops nvme_tcp_admin_mq_ops;
121
122 static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
123 {
124         return container_of(ctrl, struct nvme_tcp_ctrl, ctrl);
125 }
126
127 static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue)
128 {
129         return queue - queue->ctrl->queues;
130 }
131
132 static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue)
133 {
134         u32 queue_idx = nvme_tcp_queue_id(queue);
135
136         if (queue_idx == 0)
137                 return queue->ctrl->admin_tag_set.tags[queue_idx];
138         return queue->ctrl->tag_set.tags[queue_idx - 1];
139 }
140
141 static inline u8 nvme_tcp_hdgst_len(struct nvme_tcp_queue *queue)
142 {
143         return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
144 }
145
146 static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue)
147 {
148         return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
149 }
150
151 static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_queue *queue)
152 {
153         return queue->cmnd_capsule_len - sizeof(struct nvme_command);
154 }
155
156 static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
157 {
158         return req == &req->queue->ctrl->async_req;
159 }
160
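/*
 * A write is sent with in-capsule (inline) data when its payload fits in
 * the capsule space remaining after the SQE itself.
 */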
161 static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
162 {
163         struct request *rq;
164         unsigned int bytes;
165
166         if (unlikely(nvme_tcp_async_req(req)))
167                 return false; /* async events don't have a request */
168
169         rq = blk_mq_rq_from_pdu(req);
170         bytes = blk_rq_payload_bytes(rq);
171
172         return rq_data_dir(rq) == WRITE && bytes &&
173                 bytes <= nvme_tcp_inline_data_size(req->queue);
174 }
175
176 static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
177 {
178         return req->iter.bvec->bv_page;
179 }
180
181 static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req)
182 {
183         return req->iter.bvec->bv_offset + req->iter.iov_offset;
184 }
185
186 static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req)
187 {
188         return min_t(size_t, req->iter.bvec->bv_len - req->iter.iov_offset,
189                         req->pdu_len - req->pdu_sent);
190 }
191
192 static inline size_t nvme_tcp_req_offset(struct nvme_tcp_request *req)
193 {
194         return req->iter.iov_offset;
195 }
196
197 static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req)
198 {
199         return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ?
200                         req->pdu_len - req->pdu_sent : 0;
201 }
202
203 static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
204                 int len)
205 {
206         return nvme_tcp_pdu_data_left(req) <= len;
207 }
208
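/*
 * Point req->iter at the request payload: either the single special
 * payload vector or the bvecs of the current bio.
 */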
209 static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
210                 unsigned int dir)
211 {
212         struct request *rq = blk_mq_rq_from_pdu(req);
213         struct bio_vec *vec;
214         unsigned int size;
215         int nsegs;
216         size_t offset;
217
218         if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
219                 vec = &rq->special_vec;
220                 nsegs = 1;
221                 size = blk_rq_payload_bytes(rq);
222                 offset = 0;
223         } else {
224                 struct bio *bio = req->curr_bio;
225
226                 vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
227                 nsegs = bio_segments(bio);
228                 size = bio->bi_iter.bi_size;
229                 offset = bio->bi_iter.bi_bvec_done;
230         }
231
232         iov_iter_bvec(&req->iter, dir, vec, nsegs, size);
233         req->iter.iov_offset = offset;
234 }
235
236 static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
237                 int len)
238 {
239         req->data_sent += len;
240         req->pdu_sent += len;
241         iov_iter_advance(&req->iter, len);
242         if (!iov_iter_count(&req->iter) &&
243             req->data_sent < req->data_len) {
244                 req->curr_bio = req->curr_bio->bi_next;
245                 nvme_tcp_init_iter(req, WRITE);
246         }
247 }
248
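/* Add the request to the queue's send list and kick the per-queue io_work. */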
249 static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req)
250 {
251         struct nvme_tcp_queue *queue = req->queue;
252
253         spin_lock(&queue->lock);
254         list_add_tail(&req->entry, &queue->send_list);
255         spin_unlock(&queue->lock);
256
257         queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
258 }
259
260 static inline struct nvme_tcp_request *
261 nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
262 {
263         struct nvme_tcp_request *req;
264
265         spin_lock(&queue->lock);
266         req = list_first_entry_or_null(&queue->send_list,
267                         struct nvme_tcp_request, entry);
268         if (req)
269                 list_del(&req->entry);
270         spin_unlock(&queue->lock);
271
272         return req;
273 }
274
275 static inline void nvme_tcp_ddgst_final(struct ahash_request *hash, u32 *dgst)
276 {
277         ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0);
278         crypto_ahash_final(hash);
279 }
280
281 static inline void nvme_tcp_ddgst_update(struct ahash_request *hash,
282                 struct page *page, off_t off, size_t len)
283 {
284         struct scatterlist sg;
285
286         sg_init_marker(&sg, 1);
287         sg_set_page(&sg, page, len, off);
288         ahash_request_set_crypt(hash, &sg, NULL, len);
289         crypto_ahash_update(hash);
290 }
291
292 static inline void nvme_tcp_hdgst(struct ahash_request *hash,
293                 void *pdu, size_t len)
294 {
295         struct scatterlist sg;
296
297         sg_init_one(&sg, pdu, len);
298         ahash_request_set_crypt(hash, &sg, pdu + len, len);
299         crypto_ahash_digest(hash);
300 }
301
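/*
 * Recompute the header digest over the received PDU header and compare it
 * with the digest the controller appended.
 */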
302 static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue,
303                 void *pdu, size_t pdu_len)
304 {
305         struct nvme_tcp_hdr *hdr = pdu;
306         __le32 recv_digest;
307         __le32 exp_digest;
308
309         if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
310                 dev_err(queue->ctrl->ctrl.device,
311                         "queue %d: header digest flag is cleared\n",
312                         nvme_tcp_queue_id(queue));
313                 return -EPROTO;
314         }
315
316         recv_digest = *(__le32 *)(pdu + hdr->hlen);
317         nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len);
318         exp_digest = *(__le32 *)(pdu + hdr->hlen);
319         if (recv_digest != exp_digest) {
320                 dev_err(queue->ctrl->ctrl.device,
321                         "header digest error: recv %#x expected %#x\n",
322                         le32_to_cpu(recv_digest), le32_to_cpu(exp_digest));
323                 return -EIO;
324         }
325
326         return 0;
327 }
328
329 static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu)
330 {
331         struct nvme_tcp_hdr *hdr = pdu;
332         u8 digest_len = nvme_tcp_hdgst_len(queue);
333         u32 len;
334
335         len = le32_to_cpu(hdr->plen) - hdr->hlen -
336                 ((hdr->flags & NVME_TCP_F_HDGST) ? digest_len : 0);
337
338         if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
339                 dev_err(queue->ctrl->ctrl.device,
340                         "queue %d: data digest flag is cleared\n",
341                         nvme_tcp_queue_id(queue));
342                 return -EPROTO;
343         }
344         crypto_ahash_init(queue->rcv_hash);
345
346         return 0;
347 }
348
349 static void nvme_tcp_exit_request(struct blk_mq_tag_set *set,
350                 struct request *rq, unsigned int hctx_idx)
351 {
352         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
353
354         page_frag_free(req->pdu);
355 }
356
357 static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
358                 struct request *rq, unsigned int hctx_idx,
359                 unsigned int numa_node)
360 {
361         struct nvme_tcp_ctrl *ctrl = set->driver_data;
362         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
363         int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
364         struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
365         u8 hdgst = nvme_tcp_hdgst_len(queue);
366
367         req->pdu = page_frag_alloc(&queue->pf_cache,
368                 sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
369                 GFP_KERNEL | __GFP_ZERO);
370         if (!req->pdu)
371                 return -ENOMEM;
372
373         req->queue = queue;
374         nvme_req(rq)->ctrl = &ctrl->ctrl;
375
376         return 0;
377 }
378
379 static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
380                 unsigned int hctx_idx)
381 {
382         struct nvme_tcp_ctrl *ctrl = data;
383         struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1];
384
385         hctx->driver_data = queue;
386         return 0;
387 }
388
389 static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
390                 unsigned int hctx_idx)
391 {
392         struct nvme_tcp_ctrl *ctrl = data;
393         struct nvme_tcp_queue *queue = &ctrl->queues[0];
394
395         hctx->driver_data = queue;
396         return 0;
397 }
398
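/*
 * The receive state is derived from what is still outstanding: PDU header
 * bytes first, then the data digest, otherwise payload data.
 */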
399 static enum nvme_tcp_recv_state
400 nvme_tcp_recv_state(struct nvme_tcp_queue *queue)
401 {
402         return  (queue->pdu_remaining) ? NVME_TCP_RECV_PDU :
403                 (queue->ddgst_remaining) ? NVME_TCP_RECV_DDGST :
404                 NVME_TCP_RECV_DATA;
405 }
406
407 static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
408 {
409         queue->pdu_remaining = sizeof(struct nvme_tcp_rsp_pdu) +
410                                 nvme_tcp_hdgst_len(queue);
411         queue->pdu_offset = 0;
412         queue->data_remaining = -1;
413         queue->ddgst_remaining = 0;
414 }
415
416 static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
417 {
418         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
419                 return;
420
421         queue_work(nvme_wq, &to_tcp_ctrl(ctrl)->err_work);
422 }
423
424 static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
425                 struct nvme_completion *cqe)
426 {
427         struct request *rq;
428
429         rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id);
430         if (!rq) {
431                 dev_err(queue->ctrl->ctrl.device,
432                         "queue %d tag 0x%x not found\n",
433                         nvme_tcp_queue_id(queue), cqe->command_id);
434                 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
435                 return -EINVAL;
436         }
437
438         nvme_end_request(rq, cqe->status, cqe->result);
439
440         return 0;
441 }
442
443 static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
444                 struct nvme_tcp_data_pdu *pdu)
445 {
446         struct request *rq;
447
448         rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
449         if (!rq) {
450                 dev_err(queue->ctrl->ctrl.device,
451                         "queue %d tag %#x not found\n",
452                         nvme_tcp_queue_id(queue), pdu->command_id);
453                 return -ENOENT;
454         }
455
456         if (!blk_rq_payload_bytes(rq)) {
457                 dev_err(queue->ctrl->ctrl.device,
458                         "queue %d tag %#x unexpected data\n",
459                         nvme_tcp_queue_id(queue), rq->tag);
460                 return -EIO;
461         }
462
463         queue->data_remaining = le32_to_cpu(pdu->data_length);
464
465         return 0;
466
467 }
468
469 static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue,
470                 struct nvme_tcp_rsp_pdu *pdu)
471 {
472         struct nvme_completion *cqe = &pdu->cqe;
473         int ret = 0;
474
475         /*
476          * AEN requests are special as they don't time out and can
477          * survive any kind of queue freeze and often don't respond to
478          * aborts.  We don't even bother to allocate a struct request
479          * for them but rather special case them here.
480          */
481         if (unlikely(nvme_tcp_queue_id(queue) == 0 &&
482             cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH))
483                 nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
484                                 &cqe->result);
485         else
486                 ret = nvme_tcp_process_nvme_cqe(queue, cqe);
487
488         return ret;
489 }
490
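/*
 * Prepare the H2CData PDU that answers a controller R2T, after validating
 * the requested length and offset against what has already been sent.
 */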
491 static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req,
492                 struct nvme_tcp_r2t_pdu *pdu)
493 {
494         struct nvme_tcp_data_pdu *data = req->pdu;
495         struct nvme_tcp_queue *queue = req->queue;
496         struct request *rq = blk_mq_rq_from_pdu(req);
497         u8 hdgst = nvme_tcp_hdgst_len(queue);
498         u8 ddgst = nvme_tcp_ddgst_len(queue);
499
500         req->pdu_len = le32_to_cpu(pdu->r2t_length);
501         req->pdu_sent = 0;
502
503         if (unlikely(req->data_sent + req->pdu_len > req->data_len)) {
504                 dev_err(queue->ctrl->ctrl.device,
505                         "req %d r2t len %u exceeded data len %u (%zu sent)\n",
506                         rq->tag, req->pdu_len, req->data_len,
507                         req->data_sent);
508                 return -EPROTO;
509         }
510
511         if (unlikely(le32_to_cpu(pdu->r2t_offset) < req->data_sent)) {
512                 dev_err(queue->ctrl->ctrl.device,
513                         "req %d unexpected r2t offset %u (expected %zu)\n",
514                         rq->tag, le32_to_cpu(pdu->r2t_offset),
515                         req->data_sent);
516                 return -EPROTO;
517         }
518
519         memset(data, 0, sizeof(*data));
520         data->hdr.type = nvme_tcp_h2c_data;
521         data->hdr.flags = NVME_TCP_F_DATA_LAST;
522         if (queue->hdr_digest)
523                 data->hdr.flags |= NVME_TCP_F_HDGST;
524         if (queue->data_digest)
525                 data->hdr.flags |= NVME_TCP_F_DDGST;
526         data->hdr.hlen = sizeof(*data);
527         data->hdr.pdo = data->hdr.hlen + hdgst;
528         data->hdr.plen =
529                 cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst);
530         data->ttag = pdu->ttag;
531         data->command_id = rq->tag;
532         data->data_offset = cpu_to_le32(req->data_sent);
533         data->data_length = cpu_to_le32(req->pdu_len);
534         return 0;
535 }
536
537 static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
538                 struct nvme_tcp_r2t_pdu *pdu)
539 {
540         struct nvme_tcp_request *req;
541         struct request *rq;
542         int ret;
543
544         rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
545         if (!rq) {
546                 dev_err(queue->ctrl->ctrl.device,
547                         "queue %d tag %#x not found\n",
548                         nvme_tcp_queue_id(queue), pdu->command_id);
549                 return -ENOENT;
550         }
551         req = blk_mq_rq_to_pdu(rq);
552
553         ret = nvme_tcp_setup_h2c_data_pdu(req, pdu);
554         if (unlikely(ret))
555                 return ret;
556
557         req->state = NVME_TCP_SEND_H2C_PDU;
558         req->offset = 0;
559
560         nvme_tcp_queue_request(req);
561
562         return 0;
563 }
564
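/*
 * Accumulate the PDU header from the socket. Once it is complete, verify
 * the header digest, prepare data digest reception if negotiated, and
 * dispatch on the PDU type.
 */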
565 static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
566                 unsigned int *offset, size_t *len)
567 {
568         struct nvme_tcp_hdr *hdr;
569         char *pdu = queue->pdu;
570         size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
571         int ret;
572
573         ret = skb_copy_bits(skb, *offset,
574                 &pdu[queue->pdu_offset], rcv_len);
575         if (unlikely(ret))
576                 return ret;
577
578         queue->pdu_remaining -= rcv_len;
579         queue->pdu_offset += rcv_len;
580         *offset += rcv_len;
581         *len -= rcv_len;
582         if (queue->pdu_remaining)
583                 return 0;
584
585         hdr = queue->pdu;
586         if (queue->hdr_digest) {
587                 ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen);
588                 if (unlikely(ret))
589                         return ret;
590         }
591
592
593         if (queue->data_digest) {
594                 ret = nvme_tcp_check_ddgst(queue, queue->pdu);
595                 if (unlikely(ret))
596                         return ret;
597         }
598
599         switch (hdr->type) {
600         case nvme_tcp_c2h_data:
601                 ret = nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
602                 break;
603         case nvme_tcp_rsp:
604                 nvme_tcp_init_recv_ctx(queue);
605                 ret = nvme_tcp_handle_comp(queue, (void *)queue->pdu);
606                 break;
607         case nvme_tcp_r2t:
608                 nvme_tcp_init_recv_ctx(queue);
609                 ret = nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
610                 break;
611         default:
612                 dev_err(queue->ctrl->ctrl.device,
613                         "unsupported pdu type (%d)\n", hdr->type);
614                 return -EINVAL;
615         }
616
617         return ret;
618 }
619
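/*
 * Copy C2HData payload from the skb into the request's bios, folding it
 * into the data digest on the fly when data digest is negotiated.
 */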
620 static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
621                               unsigned int *offset, size_t *len)
622 {
623         struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
624         struct nvme_tcp_request *req;
625         struct request *rq;
626
627         rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
628         if (!rq) {
629                 dev_err(queue->ctrl->ctrl.device,
630                         "queue %d tag %#x not found\n",
631                         nvme_tcp_queue_id(queue), pdu->command_id);
632                 return -ENOENT;
633         }
634         req = blk_mq_rq_to_pdu(rq);
635
636         while (true) {
637                 int recv_len, ret;
638
639                 recv_len = min_t(size_t, *len, queue->data_remaining);
640                 if (!recv_len)
641                         break;
642
643                 if (!iov_iter_count(&req->iter)) {
644                         req->curr_bio = req->curr_bio->bi_next;
645
646                         /*
647                          * If we don't have any bios left, it means the
648                          * controller sent more data than we requested, hence error.
649                          */
650                         if (!req->curr_bio) {
651                                 dev_err(queue->ctrl->ctrl.device,
652                                         "queue %d no space in request %#x\n",
653                                         nvme_tcp_queue_id(queue), rq->tag);
654                                 nvme_tcp_init_recv_ctx(queue);
655                                 return -EIO;
656                         }
657                         nvme_tcp_init_iter(req, READ);
658                 }
659
660                 /* we can read only from what is left in this bio */
661                 recv_len = min_t(size_t, recv_len,
662                                 iov_iter_count(&req->iter));
663
664                 if (queue->data_digest)
665                         ret = skb_copy_and_hash_datagram_iter(skb, *offset,
666                                 &req->iter, recv_len, queue->rcv_hash);
667                 else
668                         ret = skb_copy_datagram_iter(skb, *offset,
669                                         &req->iter, recv_len);
670                 if (ret) {
671                         dev_err(queue->ctrl->ctrl.device,
672                                 "queue %d failed to copy request %#x data\n",
673                                 nvme_tcp_queue_id(queue), rq->tag);
674                         return ret;
675                 }
676
677                 *len -= recv_len;
678                 *offset += recv_len;
679                 queue->data_remaining -= recv_len;
680         }
681
682         if (!queue->data_remaining) {
683                 if (queue->data_digest) {
684                         nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
685                         queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
686                 } else {
687                         nvme_tcp_init_recv_ctx(queue);
688                 }
689         }
690
691         return 0;
692 }
693
694 static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
695                 struct sk_buff *skb, unsigned int *offset, size_t *len)
696 {
697         char *ddgst = (char *)&queue->recv_ddgst;
698         size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
699         off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
700         int ret;
701
702         ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
703         if (unlikely(ret))
704                 return ret;
705
706         queue->ddgst_remaining -= recv_len;
707         *offset += recv_len;
708         *len -= recv_len;
709         if (queue->ddgst_remaining)
710                 return 0;
711
712         if (queue->recv_ddgst != queue->exp_ddgst) {
713                 dev_err(queue->ctrl->ctrl.device,
714                         "data digest error: recv %#x expected %#x\n",
715                         le32_to_cpu(queue->recv_ddgst),
716                         le32_to_cpu(queue->exp_ddgst));
717                 return -EIO;
718         }
719
720         nvme_tcp_init_recv_ctx(queue);
721         return 0;
722 }
723
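/*
 * tcp_read_sock() callback: consume skb data according to the current
 * receive state machine.
 */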
724 static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
725                              unsigned int offset, size_t len)
726 {
727         struct nvme_tcp_queue *queue = desc->arg.data;
728         size_t consumed = len;
729         int result;
730
731         while (len) {
732                 switch (nvme_tcp_recv_state(queue)) {
733                 case NVME_TCP_RECV_PDU:
734                         result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
735                         break;
736                 case NVME_TCP_RECV_DATA:
737                         result = nvme_tcp_recv_data(queue, skb, &offset, &len);
738                         break;
739                 case NVME_TCP_RECV_DDGST:
740                         result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
741                         break;
742                 default:
743                         result = -EFAULT;
744                 }
745                 if (result) {
746                         dev_err(queue->ctrl->ctrl.device,
747                                 "receive failed: %d\n", result);
748                         queue->rd_enabled = false;
749                         nvme_tcp_error_recovery(&queue->ctrl->ctrl);
750                         return result;
751                 }
752         }
753
754         return consumed;
755 }
756
757 static void nvme_tcp_data_ready(struct sock *sk)
758 {
759         struct nvme_tcp_queue *queue;
760
761         read_lock(&sk->sk_callback_lock);
762         queue = sk->sk_user_data;
763         if (likely(queue && queue->rd_enabled))
764                 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
765         read_unlock(&sk->sk_callback_lock);
766 }
767
768 static void nvme_tcp_write_space(struct sock *sk)
769 {
770         struct nvme_tcp_queue *queue;
771
772         read_lock_bh(&sk->sk_callback_lock);
773         queue = sk->sk_user_data;
774         if (likely(queue && sk_stream_is_writeable(sk))) {
775                 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
776                 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
777         }
778         read_unlock_bh(&sk->sk_callback_lock);
779 }
780
781 static void nvme_tcp_state_change(struct sock *sk)
782 {
783         struct nvme_tcp_queue *queue;
784
785         read_lock(&sk->sk_callback_lock);
786         queue = sk->sk_user_data;
787         if (!queue)
788                 goto done;
789
790         switch (sk->sk_state) {
791         case TCP_CLOSE:
792         case TCP_CLOSE_WAIT:
793         case TCP_LAST_ACK:
794         case TCP_FIN_WAIT1:
795         case TCP_FIN_WAIT2:
796                 /* the socket is closing, trigger error recovery */
797                 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
798                 break;
799         default:
800                 dev_info(queue->ctrl->ctrl.device,
801                         "queue %d socket state %d\n",
802                         nvme_tcp_queue_id(queue), sk->sk_state);
803         }
804
805         queue->state_change(sk);
806 done:
807         read_unlock(&sk->sk_callback_lock);
808 }
809
810 static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
811 {
812         queue->request = NULL;
813 }
814
815 static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
816 {
817         union nvme_result res = {};
818
819         nvme_end_request(blk_mq_rq_from_pdu(req),
820                 NVME_SC_DATA_XFER_ERROR, res);
821 }
822
823 static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
824 {
825         struct nvme_tcp_queue *queue = req->queue;
826
827         while (true) {
828                 struct page *page = nvme_tcp_req_cur_page(req);
829                 size_t offset = nvme_tcp_req_cur_offset(req);
830                 size_t len = nvme_tcp_req_cur_length(req);
831                 bool last = nvme_tcp_pdu_last_send(req, len);
832                 int ret, flags = MSG_DONTWAIT;
833
834                 if (last && !queue->data_digest)
835                         flags |= MSG_EOR;
836                 else
837                         flags |= MSG_MORE;
838
839                 ret = kernel_sendpage(queue->sock, page, offset, len, flags);
840                 if (ret <= 0)
841                         return ret;
842
843                 nvme_tcp_advance_req(req, ret);
844                 if (queue->data_digest)
845                         nvme_tcp_ddgst_update(queue->snd_hash, page,
846                                         offset, ret);
847
848                 /* fully successful last write */
849                 if (last && ret == len) {
850                         if (queue->data_digest) {
851                                 nvme_tcp_ddgst_final(queue->snd_hash,
852                                         &req->ddgst);
853                                 req->state = NVME_TCP_SEND_DDGST;
854                                 req->offset = 0;
855                         } else {
856                                 nvme_tcp_done_send_req(queue);
857                         }
858                         return 1;
859                 }
860         }
861         return -EAGAIN;
862 }
863
864 static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
865 {
866         struct nvme_tcp_queue *queue = req->queue;
867         struct nvme_tcp_cmd_pdu *pdu = req->pdu;
868         bool inline_data = nvme_tcp_has_inline_data(req);
869         int flags = MSG_DONTWAIT | (inline_data ? MSG_MORE : MSG_EOR);
870         u8 hdgst = nvme_tcp_hdgst_len(queue);
871         int len = sizeof(*pdu) + hdgst - req->offset;
872         int ret;
873
874         if (queue->hdr_digest && !req->offset)
875                 nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
876
877         ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
878                         offset_in_page(pdu) + req->offset, len, flags);
879         if (unlikely(ret <= 0))
880                 return ret;
881
882         len -= ret;
883         if (!len) {
884                 if (inline_data) {
885                         req->state = NVME_TCP_SEND_DATA;
886                         if (queue->data_digest)
887                                 crypto_ahash_init(queue->snd_hash);
888                         nvme_tcp_init_iter(req, WRITE);
889                 } else {
890                         nvme_tcp_done_send_req(queue);
891                 }
892                 return 1;
893         }
894         req->offset += ret;
895
896         return -EAGAIN;
897 }
898
899 static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
900 {
901         struct nvme_tcp_queue *queue = req->queue;
902         struct nvme_tcp_data_pdu *pdu = req->pdu;
903         u8 hdgst = nvme_tcp_hdgst_len(queue);
904         int len = sizeof(*pdu) - req->offset + hdgst;
905         int ret;
906
907         if (queue->hdr_digest && !req->offset)
908                 nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
909
910         ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
911                         offset_in_page(pdu) + req->offset, len,
912                         MSG_DONTWAIT | MSG_MORE);
913         if (unlikely(ret <= 0))
914                 return ret;
915
916         len -= ret;
917         if (!len) {
918                 req->state = NVME_TCP_SEND_DATA;
919                 if (queue->data_digest)
920                         crypto_ahash_init(queue->snd_hash);
921                 if (!req->data_sent)
922                         nvme_tcp_init_iter(req, WRITE);
923                 return 1;
924         }
925         req->offset += ret;
926
927         return -EAGAIN;
928 }
929
930 static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
931 {
932         struct nvme_tcp_queue *queue = req->queue;
933         int ret;
934         struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };
935         struct kvec iov = {
936                 .iov_base = (u8 *)&req->ddgst + req->offset,
937                 .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
938         };
939
940         ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
941         if (unlikely(ret <= 0))
942                 return ret;
943
944         if (req->offset + ret == NVME_TCP_DIGEST_LENGTH) {
945                 nvme_tcp_done_send_req(queue);
946                 return 1;
947         }
948
949         req->offset += ret;
950         return -EAGAIN;
951 }
952
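/*
 * Drive the queue's current request through its send state machine:
 * command PDU, H2CData PDU (for R2T), payload data, and finally the
 * data digest.
 */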
953 static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
954 {
955         struct nvme_tcp_request *req;
956         int ret = 1;
957
958         if (!queue->request) {
959                 queue->request = nvme_tcp_fetch_request(queue);
960                 if (!queue->request)
961                         return 0;
962         }
963         req = queue->request;
964
965         if (req->state == NVME_TCP_SEND_CMD_PDU) {
966                 ret = nvme_tcp_try_send_cmd_pdu(req);
967                 if (ret <= 0)
968                         goto done;
969                 if (!nvme_tcp_has_inline_data(req))
970                         return ret;
971         }
972
973         if (req->state == NVME_TCP_SEND_H2C_PDU) {
974                 ret = nvme_tcp_try_send_data_pdu(req);
975                 if (ret <= 0)
976                         goto done;
977         }
978
979         if (req->state == NVME_TCP_SEND_DATA) {
980                 ret = nvme_tcp_try_send_data(req);
981                 if (ret <= 0)
982                         goto done;
983         }
984
985         if (req->state == NVME_TCP_SEND_DDGST)
986                 ret = nvme_tcp_try_send_ddgst(req);
987 done:
988         if (ret == -EAGAIN)
989                 ret = 0;
990         return ret;
991 }
992
993 static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
994 {
995         struct sock *sk = queue->sock->sk;
996         read_descriptor_t rd_desc;
997         int consumed;
998
999         rd_desc.arg.data = queue;
1000         rd_desc.count = 1;
1001         lock_sock(sk);
1002         consumed = tcp_read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
1003         release_sock(sk);
1004         return consumed;
1005 }
1006
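/*
 * Per-queue I/O work: alternate between sending queued requests and
 * reaping received data, and requeue itself if work is still pending
 * when the time quota expires.
 */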
1007 static void nvme_tcp_io_work(struct work_struct *w)
1008 {
1009         struct nvme_tcp_queue *queue =
1010                 container_of(w, struct nvme_tcp_queue, io_work);
1011         unsigned long deadline = jiffies + msecs_to_jiffies(1);
1012
1013         do {
1014                 bool pending = false;
1015                 int result;
1016
1017                 result = nvme_tcp_try_send(queue);
1018                 if (result > 0) {
1019                         pending = true;
1020                 } else if (unlikely(result < 0)) {
1021                         dev_err(queue->ctrl->ctrl.device,
1022                                 "failed to send request %d\n", result);
1023                         if (result != -EPIPE)
1024                                 nvme_tcp_fail_request(queue->request);
1025                         nvme_tcp_done_send_req(queue);
1026                         return;
1027                 }
1028
1029                 result = nvme_tcp_try_recv(queue);
1030                 if (result > 0)
1031                         pending = true;
1032
1033                 if (!pending)
1034                         return;
1035
1036         } while (!time_after(jiffies, deadline)); /* quota is exhausted */
1037
1038         queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
1039 }
1040
1041 static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue)
1042 {
1043         struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
1044
1045         ahash_request_free(queue->rcv_hash);
1046         ahash_request_free(queue->snd_hash);
1047         crypto_free_ahash(tfm);
1048 }
1049
1050 static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue)
1051 {
1052         struct crypto_ahash *tfm;
1053
1054         tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
1055         if (IS_ERR(tfm))
1056                 return PTR_ERR(tfm);
1057
1058         queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1059         if (!queue->snd_hash)
1060                 goto free_tfm;
1061         ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
1062
1063         queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1064         if (!queue->rcv_hash)
1065                 goto free_snd_hash;
1066         ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
1067
1068         return 0;
1069 free_snd_hash:
1070         ahash_request_free(queue->snd_hash);
1071 free_tfm:
1072         crypto_free_ahash(tfm);
1073         return -ENOMEM;
1074 }
1075
1076 static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl)
1077 {
1078         struct nvme_tcp_request *async = &ctrl->async_req;
1079
1080         page_frag_free(async->pdu);
1081 }
1082
1083 static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
1084 {
1085         struct nvme_tcp_queue *queue = &ctrl->queues[0];
1086         struct nvme_tcp_request *async = &ctrl->async_req;
1087         u8 hdgst = nvme_tcp_hdgst_len(queue);
1088
1089         async->pdu = page_frag_alloc(&queue->pf_cache,
1090                 sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
1091                 GFP_KERNEL | __GFP_ZERO);
1092         if (!async->pdu)
1093                 return -ENOMEM;
1094
1095         async->queue = &ctrl->queues[0];
1096         return 0;
1097 }
1098
1099 static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
1100 {
1101         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1102         struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1103
1104         if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
1105                 return;
1106
1107         if (queue->hdr_digest || queue->data_digest)
1108                 nvme_tcp_free_crypto(queue);
1109
1110         sock_release(queue->sock);
1111         kfree(queue->pdu);
1112 }
1113
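/*
 * Exchange ICReq/ICResp with the controller and verify that the negotiated
 * parameters (PFV, digests, CPDA) match what was requested.
 */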
1114 static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
1115 {
1116         struct nvme_tcp_icreq_pdu *icreq;
1117         struct nvme_tcp_icresp_pdu *icresp;
1118         struct msghdr msg = {};
1119         struct kvec iov;
1120         bool ctrl_hdgst, ctrl_ddgst;
1121         int ret;
1122
1123         icreq = kzalloc(sizeof(*icreq), GFP_KERNEL);
1124         if (!icreq)
1125                 return -ENOMEM;
1126
1127         icresp = kzalloc(sizeof(*icresp), GFP_KERNEL);
1128         if (!icresp) {
1129                 ret = -ENOMEM;
1130                 goto free_icreq;
1131         }
1132
1133         icreq->hdr.type = nvme_tcp_icreq;
1134         icreq->hdr.hlen = sizeof(*icreq);
1135         icreq->hdr.pdo = 0;
1136         icreq->hdr.plen = cpu_to_le32(icreq->hdr.hlen);
1137         icreq->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
1138         icreq->maxr2t = 0; /* single inflight r2t supported */
1139         icreq->hpda = 0; /* no alignment constraint */
1140         if (queue->hdr_digest)
1141                 icreq->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
1142         if (queue->data_digest)
1143                 icreq->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
1144
1145         iov.iov_base = icreq;
1146         iov.iov_len = sizeof(*icreq);
1147         ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
1148         if (ret < 0)
1149                 goto free_icresp;
1150
1151         memset(&msg, 0, sizeof(msg));
1152         iov.iov_base = icresp;
1153         iov.iov_len = sizeof(*icresp);
1154         ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1155                         iov.iov_len, msg.msg_flags);
1156         if (ret < 0)
1157                 goto free_icresp;
1158
1159         ret = -EINVAL;
1160         if (icresp->hdr.type != nvme_tcp_icresp) {
1161                 pr_err("queue %d: bad type returned %d\n",
1162                         nvme_tcp_queue_id(queue), icresp->hdr.type);
1163                 goto free_icresp;
1164         }
1165
1166         if (le32_to_cpu(icresp->hdr.plen) != sizeof(*icresp)) {
1167                 pr_err("queue %d: bad pdu length returned %d\n",
1168                         nvme_tcp_queue_id(queue), icresp->hdr.plen);
1169                 goto free_icresp;
1170         }
1171
1172         if (icresp->pfv != NVME_TCP_PFV_1_0) {
1173                 pr_err("queue %d: bad pfv returned %d\n",
1174                         nvme_tcp_queue_id(queue), icresp->pfv);
1175                 goto free_icresp;
1176         }
1177
1178         ctrl_ddgst = !!(icresp->digest & NVME_TCP_DATA_DIGEST_ENABLE);
1179         if ((queue->data_digest && !ctrl_ddgst) ||
1180             (!queue->data_digest && ctrl_ddgst)) {
1181                 pr_err("queue %d: data digest mismatch host: %s ctrl: %s\n",
1182                         nvme_tcp_queue_id(queue),
1183                         queue->data_digest ? "enabled" : "disabled",
1184                         ctrl_ddgst ? "enabled" : "disabled");
1185                 goto free_icresp;
1186         }
1187
1188         ctrl_hdgst = !!(icresp->digest & NVME_TCP_HDR_DIGEST_ENABLE);
1189         if ((queue->hdr_digest && !ctrl_hdgst) ||
1190             (!queue->hdr_digest && ctrl_hdgst)) {
1191                 pr_err("queue %d: header digest mismatch host: %s ctrl: %s\n",
1192                         nvme_tcp_queue_id(queue),
1193                         queue->hdr_digest ? "enabled" : "disabled",
1194                         ctrl_hdgst ? "enabled" : "disabled");
1195                 goto free_icresp;
1196         }
1197
1198         if (icresp->cpda != 0) {
1199                 pr_err("queue %d: unsupported cpda returned %d\n",
1200                         nvme_tcp_queue_id(queue), icresp->cpda);
1201                 goto free_icresp;
1202         }
1203
1204         ret = 0;
1205 free_icresp:
1206         kfree(icresp);
1207 free_icreq:
1208         kfree(icreq);
1209         return ret;
1210 }
1211
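/*
 * Create and connect a queue's TCP socket, perform the NVMe/TCP connection
 * initialization, and install the socket callbacks.
 */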
1212 static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
1213                 int qid, size_t queue_size)
1214 {
1215         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1216         struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1217         struct linger sol = { .l_onoff = 1, .l_linger = 0 };
1218         int ret, opt, rcv_pdu_size;
1219
1220         queue->ctrl = ctrl;
1221         INIT_LIST_HEAD(&queue->send_list);
1222         spin_lock_init(&queue->lock);
1223         INIT_WORK(&queue->io_work, nvme_tcp_io_work);
1224         queue->queue_size = queue_size;
1225
1226         if (qid > 0)
1227                 queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16;
1228         else
1229                 queue->cmnd_capsule_len = sizeof(struct nvme_command) +
1230                                                 NVME_TCP_ADMIN_CCSZ;
1231
1232         ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM,
1233                         IPPROTO_TCP, &queue->sock);
1234         if (ret) {
1235                 dev_err(ctrl->ctrl.device,
1236                         "failed to create socket: %d\n", ret);
1237                 return ret;
1238         }
1239
1240         /* Single syn retry */
1241         opt = 1;
1242         ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT,
1243                         (char *)&opt, sizeof(opt));
1244         if (ret) {
1245                 dev_err(ctrl->ctrl.device,
1246                         "failed to set TCP_SYNCNT sock opt %d\n", ret);
1247                 goto err_sock;
1248         }
1249
1250         /* Set TCP no delay */
1251         opt = 1;
1252         ret = kernel_setsockopt(queue->sock, IPPROTO_TCP,
1253                         TCP_NODELAY, (char *)&opt, sizeof(opt));
1254         if (ret) {
1255                 dev_err(ctrl->ctrl.device,
1256                         "failed to set TCP_NODELAY sock opt %d\n", ret);
1257                 goto err_sock;
1258         }
1259
1260         /*
1261          * Cleanup whatever is sitting in the TCP transmit queue on socket
1262          * close. This is done to prevent stale data from being sent should
1263          * the network connection be restored before TCP times out.
1264          */
1265         ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER,
1266                         (char *)&sol, sizeof(sol));
1267         if (ret) {
1268                 dev_err(ctrl->ctrl.device,
1269                         "failed to set SO_LINGER sock opt %d\n", ret);
1270                 goto err_sock;
1271         }
1272
1273         queue->sock->sk->sk_allocation = GFP_ATOMIC;
1274         queue->io_cpu = (qid == 0) ? 0 : qid - 1;
1275         queue->request = NULL;
1276         queue->data_remaining = 0;
1277         queue->ddgst_remaining = 0;
1278         queue->pdu_remaining = 0;
1279         queue->pdu_offset = 0;
1280         sk_set_memalloc(queue->sock->sk);
1281
1282         if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) {
1283                 ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr,
1284                         sizeof(ctrl->src_addr));
1285                 if (ret) {
1286                         dev_err(ctrl->ctrl.device,
1287                                 "failed to bind queue %d socket %d\n",
1288                                 qid, ret);
1289                         goto err_sock;
1290                 }
1291         }
1292
1293         queue->hdr_digest = nctrl->opts->hdr_digest;
1294         queue->data_digest = nctrl->opts->data_digest;
1295         if (queue->hdr_digest || queue->data_digest) {
1296                 ret = nvme_tcp_alloc_crypto(queue);
1297                 if (ret) {
1298                         dev_err(ctrl->ctrl.device,
1299                                 "failed to allocate queue %d crypto\n", qid);
1300                         goto err_sock;
1301                 }
1302         }
1303
1304         rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) +
1305                         nvme_tcp_hdgst_len(queue);
1306         queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL);
1307         if (!queue->pdu) {
1308                 ret = -ENOMEM;
1309                 goto err_crypto;
1310         }
1311
1312         dev_dbg(ctrl->ctrl.device, "connecting queue %d\n",
1313                         nvme_tcp_queue_id(queue));
1314
1315         ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
1316                 sizeof(ctrl->addr), 0);
1317         if (ret) {
1318                 dev_err(ctrl->ctrl.device,
1319                         "failed to connect socket: %d\n", ret);
1320                 goto err_rcv_pdu;
1321         }
1322
1323         ret = nvme_tcp_init_connection(queue);
1324         if (ret)
1325                 goto err_init_connect;
1326
1327         queue->rd_enabled = true;
1328         set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags);
1329         nvme_tcp_init_recv_ctx(queue);
1330
1331         write_lock_bh(&queue->sock->sk->sk_callback_lock);
1332         queue->sock->sk->sk_user_data = queue;
1333         queue->state_change = queue->sock->sk->sk_state_change;
1334         queue->data_ready = queue->sock->sk->sk_data_ready;
1335         queue->write_space = queue->sock->sk->sk_write_space;
1336         queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
1337         queue->sock->sk->sk_state_change = nvme_tcp_state_change;
1338         queue->sock->sk->sk_write_space = nvme_tcp_write_space;
1339         write_unlock_bh(&queue->sock->sk->sk_callback_lock);
1340
1341         return 0;
1342
1343 err_init_connect:
1344         kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1345 err_rcv_pdu:
1346         kfree(queue->pdu);
1347 err_crypto:
1348         if (queue->hdr_digest || queue->data_digest)
1349                 nvme_tcp_free_crypto(queue);
1350 err_sock:
1351         sock_release(queue->sock);
1352         queue->sock = NULL;
1353         return ret;
1354 }
1355
1356 static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue)
1357 {
1358         struct socket *sock = queue->sock;
1359
1360         write_lock_bh(&sock->sk->sk_callback_lock);
1361         sock->sk->sk_user_data  = NULL;
1362         sock->sk->sk_data_ready = queue->data_ready;
1363         sock->sk->sk_state_change = queue->state_change;
1364         sock->sk->sk_write_space  = queue->write_space;
1365         write_unlock_bh(&sock->sk->sk_callback_lock);
1366 }
1367
1368 static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
1369 {
1370         kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1371         nvme_tcp_restore_sock_calls(queue);
1372         cancel_work_sync(&queue->io_work);
1373 }
1374
1375 static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
1376 {
1377         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1378         struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1379
1380         if (!test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
1381                 return;
1382
1383         __nvme_tcp_stop_queue(queue);
1384 }
1385
1386 static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
1387 {
1388         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1389         int ret;
1390
1391         if (idx)
1392                 ret = nvmf_connect_io_queue(nctrl, idx);
1393         else
1394                 ret = nvmf_connect_admin_queue(nctrl);
1395
1396         if (!ret) {
1397                 set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags);
1398         } else {
1399                 __nvme_tcp_stop_queue(&ctrl->queues[idx]);
1400                 dev_err(nctrl->device,
1401                         "failed to connect queue: %d ret=%d\n", idx, ret);
1402         }
1403         return ret;
1404 }
1405
1406 static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
1407                 bool admin)
1408 {
1409         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1410         struct blk_mq_tag_set *set;
1411         int ret;
1412
1413         if (admin) {
1414                 set = &ctrl->admin_tag_set;
1415                 memset(set, 0, sizeof(*set));
1416                 set->ops = &nvme_tcp_admin_mq_ops;
1417                 set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
1418                 set->reserved_tags = 2; /* connect + keep-alive */
1419                 set->numa_node = NUMA_NO_NODE;
1420                 set->cmd_size = sizeof(struct nvme_tcp_request);
1421                 set->driver_data = ctrl;
1422                 set->nr_hw_queues = 1;
1423                 set->timeout = ADMIN_TIMEOUT;
1424         } else {
1425                 set = &ctrl->tag_set;
1426                 memset(set, 0, sizeof(*set));
1427                 set->ops = &nvme_tcp_mq_ops;
1428                 set->queue_depth = nctrl->sqsize + 1;
1429                 set->reserved_tags = 1; /* fabric connect */
1430                 set->numa_node = NUMA_NO_NODE;
1431                 set->flags = BLK_MQ_F_SHOULD_MERGE;
1432                 set->cmd_size = sizeof(struct nvme_tcp_request);
1433                 set->driver_data = ctrl;
1434                 set->nr_hw_queues = nctrl->queue_count - 1;
1435                 set->timeout = NVME_IO_TIMEOUT;
1436         }
1437
1438         ret = blk_mq_alloc_tag_set(set);
1439         if (ret)
1440                 return ERR_PTR(ret);
1441
1442         return set;
1443 }
1444
1445 static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
1446 {
1447         if (to_tcp_ctrl(ctrl)->async_req.pdu) {
1448                 nvme_tcp_free_async_req(to_tcp_ctrl(ctrl));
1449                 to_tcp_ctrl(ctrl)->async_req.pdu = NULL;
1450         }
1451
1452         nvme_tcp_free_queue(ctrl, 0);
1453 }
1454
1455 static void nvme_tcp_free_io_queues(struct nvme_ctrl *ctrl)
1456 {
1457         int i;
1458
1459         for (i = 1; i < ctrl->queue_count; i++)
1460                 nvme_tcp_free_queue(ctrl, i);
1461 }
1462
1463 static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl)
1464 {
1465         int i;
1466
1467         for (i = 1; i < ctrl->queue_count; i++)
1468                 nvme_tcp_stop_queue(ctrl, i);
1469 }
1470
1471 static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl)
1472 {
1473         int i, ret = 0;
1474
1475         for (i = 1; i < ctrl->queue_count; i++) {
1476                 ret = nvme_tcp_start_queue(ctrl, i);
1477                 if (ret)
1478                         goto out_stop_queues;
1479         }
1480
1481         return 0;
1482
1483 out_stop_queues:
1484         for (i--; i >= 1; i--)
1485                 nvme_tcp_stop_queue(ctrl, i);
1486         return ret;
1487 }
1488
1489 static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
1490 {
1491         int ret;
1492
1493         ret = nvme_tcp_alloc_queue(ctrl, 0, NVME_AQ_DEPTH);
1494         if (ret)
1495                 return ret;
1496
1497         ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl));
1498         if (ret)
1499                 goto out_free_queue;
1500
1501         return 0;
1502
1503 out_free_queue:
1504         nvme_tcp_free_queue(ctrl, 0);
1505         return ret;
1506 }
1507
1508 static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1509 {
1510         int i, ret;
1511
1512         for (i = 1; i < ctrl->queue_count; i++) {
1513                 ret = nvme_tcp_alloc_queue(ctrl, i,
1514                                 ctrl->sqsize + 1);
1515                 if (ret)
1516                         goto out_free_queues;
1517         }
1518
1519         return 0;
1520
1521 out_free_queues:
1522         for (i--; i >= 1; i--)
1523                 nvme_tcp_free_queue(ctrl, i);
1524
1525         return ret;
1526 }
1527
1528 static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
1529 {
1530         return min(ctrl->queue_count - 1, num_online_cpus());
1531 }
1532
1533 static int nvme_alloc_io_queues(struct nvme_ctrl *ctrl)
1534 {
1535         unsigned int nr_io_queues;
1536         int ret;
1537
1538         nr_io_queues = nvme_tcp_nr_io_queues(ctrl);
1539         ret = nvme_set_queue_count(ctrl, &nr_io_queues);
1540         if (ret)
1541                 return ret;
1542
1543         ctrl->queue_count = nr_io_queues + 1;
1544         if (ctrl->queue_count < 2)
1545                 return 0;
1546
1547         dev_info(ctrl->device,
1548                 "creating %d I/O queues.\n", nr_io_queues);
1549
1550         return nvme_tcp_alloc_io_queues(ctrl);
1551 }
1552
1553 static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove)
1554 {
1555         nvme_tcp_stop_io_queues(ctrl);
1556         if (remove) {
1557                 if (ctrl->ops->flags & NVME_F_FABRICS)
1558                         blk_cleanup_queue(ctrl->connect_q);
1559                 blk_mq_free_tag_set(ctrl->tagset);
1560         }
1561         nvme_tcp_free_io_queues(ctrl);
1562 }
1563
1564 static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
1565 {
1566         int ret;
1567
1568         ret = nvme_alloc_io_queues(ctrl);
1569         if (ret)
1570                 return ret;
1571
1572         if (new) {
1573                 ctrl->tagset = nvme_tcp_alloc_tagset(ctrl, false);
1574                 if (IS_ERR(ctrl->tagset)) {
1575                         ret = PTR_ERR(ctrl->tagset);
1576                         goto out_free_io_queues;
1577                 }
1578
1579                 if (ctrl->ops->flags & NVME_F_FABRICS) {
1580                         ctrl->connect_q = blk_mq_init_queue(ctrl->tagset);
1581                         if (IS_ERR(ctrl->connect_q)) {
1582                                 ret = PTR_ERR(ctrl->connect_q);
1583                                 goto out_free_tag_set;
1584                         }
1585                 }
1586         } else {
1587                 blk_mq_update_nr_hw_queues(ctrl->tagset,
1588                         ctrl->queue_count - 1);
1589         }
1590
1591         ret = nvme_tcp_start_io_queues(ctrl);
1592         if (ret)
1593                 goto out_cleanup_connect_q;
1594
1595         return 0;
1596
1597 out_cleanup_connect_q:
1598         if (new && (ctrl->ops->flags & NVME_F_FABRICS))
1599                 blk_cleanup_queue(ctrl->connect_q);
1600 out_free_tag_set:
1601         if (new)
1602                 blk_mq_free_tag_set(ctrl->tagset);
1603 out_free_io_queues:
1604         nvme_tcp_free_io_queues(ctrl);
1605         return ret;
1606 }
1607
1608 static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
1609 {
1610         nvme_tcp_stop_queue(ctrl, 0);
1611         if (remove) {
1612                 free_opal_dev(ctrl->opal_dev);
1613                 blk_cleanup_queue(ctrl->admin_q);
1614                 blk_mq_free_tag_set(ctrl->admin_tagset);
1615         }
1616         nvme_tcp_free_admin_queue(ctrl);
1617 }
1618
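/*
 * Bring up the admin queue: allocate queue 0 and the async event request,
 * set up the admin tag set and request queue when the controller is new,
 * start the queue, read CAP to clamp sqsize to MQES, then enable and
 * identify the controller.
 */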
1619 static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
1620 {
1621         int error;
1622
1623         error = nvme_tcp_alloc_admin_queue(ctrl);
1624         if (error)
1625                 return error;
1626
1627         if (new) {
1628                 ctrl->admin_tagset = nvme_tcp_alloc_tagset(ctrl, true);
1629                 if (IS_ERR(ctrl->admin_tagset)) {
1630                         error = PTR_ERR(ctrl->admin_tagset);
1631                         goto out_free_queue;
1632                 }
1633
1634                 ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset);
1635                 if (IS_ERR(ctrl->admin_q)) {
1636                         error = PTR_ERR(ctrl->admin_q);
1637                         goto out_free_tagset;
1638                 }
1639         }
1640
1641         error = nvme_tcp_start_queue(ctrl, 0);
1642         if (error)
1643                 goto out_cleanup_queue;
1644
1645         error = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
1646         if (error) {
1647                 dev_err(ctrl->device,
1648                         "prop_get NVME_REG_CAP failed\n");
1649                 goto out_stop_queue;
1650         }
1651
1652         ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
1653
1654         error = nvme_enable_ctrl(ctrl, ctrl->cap);
1655         if (error)
1656                 goto out_stop_queue;
1657
1658         error = nvme_init_identify(ctrl);
1659         if (error)
1660                 goto out_stop_queue;
1661
1662         return 0;
1663
1664 out_stop_queue:
1665         nvme_tcp_stop_queue(ctrl, 0);
1666 out_cleanup_queue:
1667         if (new)
1668                 blk_cleanup_queue(ctrl->admin_q);
1669 out_free_tagset:
1670         if (new)
1671                 blk_mq_free_tag_set(ctrl->admin_tagset);
1672 out_free_queue:
1673         nvme_tcp_free_admin_queue(ctrl);
1674         return error;
1675 }
1676
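/*
 * Quiesce the admin request queue, stop TCP queue 0, cancel any commands
 * still in flight, then unquiesce and tear the queue down (the tag set and
 * request queue are freed only on controller removal).
 */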
1677 static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
1678                 bool remove)
1679 {
1680         blk_mq_quiesce_queue(ctrl->admin_q);
1681         nvme_tcp_stop_queue(ctrl, 0);
1682         blk_mq_tagset_busy_iter(ctrl->admin_tagset, nvme_cancel_request, ctrl);
1683         blk_mq_unquiesce_queue(ctrl->admin_q);
1684         nvme_tcp_destroy_admin_queue(ctrl, remove);
1685 }
1686
1687 static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
1688                 bool remove)
1689 {
1690         if (ctrl->queue_count <= 1)
1691                 return;
1692         nvme_stop_queues(ctrl);
1693         nvme_tcp_stop_io_queues(ctrl);
1694         blk_mq_tagset_busy_iter(ctrl->tagset, nvme_cancel_request, ctrl);
1695         if (remove)
1696                 nvme_start_queues(ctrl);
1697         nvme_tcp_destroy_io_queues(ctrl, remove);
1698 }
1699
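/*
 * Called after a failed (re)connect or after error recovery: either
 * schedule another connect attempt after reconnect_delay or, once the
 * retry budget is exhausted, delete the controller.
 */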
1700 static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
1701 {
1702         /* If we are resetting/deleting then do nothing */
1703         if (ctrl->state != NVME_CTRL_CONNECTING) {
1704                 WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW ||
1705                         ctrl->state == NVME_CTRL_LIVE);
1706                 return;
1707         }
1708
1709         if (nvmf_should_reconnect(ctrl)) {
1710                 dev_info(ctrl->device, "Reconnecting in %d seconds...\n",
1711                         ctrl->opts->reconnect_delay);
1712                 queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work,
1713                                 ctrl->opts->reconnect_delay * HZ);
1714         } else {
1715                 dev_info(ctrl->device, "Removing controller...\n");
1716                 nvme_delete_ctrl(ctrl);
1717         }
1718 }
1719
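/*
 * Full controller bring-up, shared by the create, reset and reconnect
 * paths: configure the admin queue, validate and clamp the queue
 * parameters against the controller's capabilities, configure the I/O
 * queues, and move the controller to LIVE.
 */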
1720 static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
1721 {
1722         struct nvmf_ctrl_options *opts = ctrl->opts;
1723         int ret = -EINVAL;
1724
1725         ret = nvme_tcp_configure_admin_queue(ctrl, new);
1726         if (ret)
1727                 return ret;
1728
1729         if (ctrl->icdoff) {
                     ret = -EINVAL;
1730                 dev_err(ctrl->device, "icdoff is not supported!\n");
1731                 goto destroy_admin;
1732         }
1733
1734         if (opts->queue_size > ctrl->sqsize + 1)
1735                 dev_warn(ctrl->device,
1736                         "queue_size %zu > ctrl sqsize %u, clamping down\n",
1737                         opts->queue_size, ctrl->sqsize + 1);
1738
1739         if (ctrl->sqsize + 1 > ctrl->maxcmd) {
1740                 dev_warn(ctrl->device,
1741                         "sqsize %u > ctrl maxcmd %u, clamping down\n",
1742                         ctrl->sqsize + 1, ctrl->maxcmd);
1743                 ctrl->sqsize = ctrl->maxcmd - 1;
1744         }
1745
1746         if (ctrl->queue_count > 1) {
1747                 ret = nvme_tcp_configure_io_queues(ctrl, new);
1748                 if (ret)
1749                         goto destroy_admin;
1750         }
1751
1752         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
1753                 /* state change failure is ok if we're in DELETING state */
1754                 WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
1755                 ret = -EINVAL;
1756                 goto destroy_io;
1757         }
1758
1759         nvme_start_ctrl(ctrl);
1760         return 0;
1761
1762 destroy_io:
1763         if (ctrl->queue_count > 1)
1764                 nvme_tcp_destroy_io_queues(ctrl, new);
1765 destroy_admin:
1766         nvme_tcp_stop_queue(ctrl, 0);
1767         nvme_tcp_destroy_admin_queue(ctrl, new);
1768         return ret;
1769 }
1770
1771 static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
1772 {
1773         struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
1774                         struct nvme_tcp_ctrl, connect_work);
1775         struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
1776
1777         ++ctrl->nr_reconnects;
1778
1779         if (nvme_tcp_setup_ctrl(ctrl, false))
1780                 goto requeue;
1781
1782         dev_info(ctrl->device, "Successfully reconnected (%d attempt)\n",
1783                         ctrl->nr_reconnects);
1784
1785         ctrl->nr_reconnects = 0;
1786
1787         return;
1788
1789 requeue:
1790         dev_info(ctrl->device, "Failed reconnect attempt %d\n",
1791                         ctrl->nr_reconnects);
1792         nvme_tcp_reconnect_or_remove(ctrl);
1793 }
1794
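/*
 * Error recovery: stop keep-alive, tear down the I/O and admin queues
 * while keeping the tag sets, fail fast any pending requests, then try to
 * reconnect (or remove the controller if reconnecting is no longer
 * allowed).
 */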
1795 static void nvme_tcp_error_recovery_work(struct work_struct *work)
1796 {
1797         struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
1798                                 struct nvme_tcp_ctrl, err_work);
1799         struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
1800
1801         nvme_stop_keep_alive(ctrl);
1802         nvme_tcp_teardown_io_queues(ctrl, false);
1803         /* unquiesce to fail fast pending requests */
1804         nvme_start_queues(ctrl);
1805         nvme_tcp_teardown_admin_queue(ctrl, false);
1806
1807         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
1808                 /* state change failure is ok if we're in DELETING state */
1809                 WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
1810                 return;
1811         }
1812
1813         nvme_tcp_reconnect_or_remove(ctrl);
1814 }
1815
1816 static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
1817 {
1818         nvme_tcp_teardown_io_queues(ctrl, shutdown);
1819         if (shutdown)
1820                 nvme_shutdown_ctrl(ctrl);
1821         else
1822                 nvme_disable_ctrl(ctrl, ctrl->cap);
1823         nvme_tcp_teardown_admin_queue(ctrl, shutdown);
1824 }
1825
1826 static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
1827 {
1828         nvme_tcp_teardown_ctrl(ctrl, true);
1829 }
1830
1831 static void nvme_reset_ctrl_work(struct work_struct *work)
1832 {
1833         struct nvme_ctrl *ctrl =
1834                 container_of(work, struct nvme_ctrl, reset_work);
1835
1836         nvme_stop_ctrl(ctrl);
1837         nvme_tcp_teardown_ctrl(ctrl, false);
1838
1839         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
1840                 /* state change failure is ok if we're in DELETING state */
1841                 WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
1842                 return;
1843         }
1844
1845         if (nvme_tcp_setup_ctrl(ctrl, false))
1846                 goto out_fail;
1847
1848         return;
1849
1850 out_fail:
1851         ++ctrl->nr_reconnects;
1852         nvme_tcp_reconnect_or_remove(ctrl);
1853 }
1854
1855 static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl)
1856 {
1857         cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
1858         cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
1859 }
1860
1861 static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
1862 {
1863         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1864
1865         if (list_empty(&ctrl->list))
1866                 goto free_ctrl;
1867
1868         mutex_lock(&nvme_tcp_ctrl_mutex);
1869         list_del(&ctrl->list);
1870         mutex_unlock(&nvme_tcp_ctrl_mutex);
1871
1872         nvmf_free_options(nctrl->opts);
1873 free_ctrl:
1874         kfree(ctrl->queues);
1875         kfree(ctrl);
1876 }
1877
1878 static void nvme_tcp_set_sg_null(struct nvme_command *c)
1879 {
1880         struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
1881
1882         sg->addr = 0;
1883         sg->length = 0;
1884         sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
1885                         NVME_SGL_FMT_TRANSPORT_A;
1886 }
1887
1888 static void nvme_tcp_set_sg_inline(struct nvme_tcp_queue *queue,
1889                 struct nvme_command *c, u32 data_len)
1890 {
1891         struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
1892
1893         sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
1894         sg->length = cpu_to_le32(data_len);
1895         sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
1896 }
1897
1898 static void nvme_tcp_set_sg_host_data(struct nvme_command *c,
1899                 u32 data_len)
1900 {
1901         struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
1902
1903         sg->addr = 0;
1904         sg->length = cpu_to_le32(data_len);
1905         sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
1906                         NVME_SGL_FMT_TRANSPORT_A;
1907 }
1908
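/*
 * Build the command PDU for the Async Event Request on the admin queue.
 * The AER uses the reserved tag NVME_AQ_BLK_MQ_DEPTH and carries no data,
 * so only a bare command capsule (plus optional header digest) is sent.
 */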
1909 static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
1910 {
1911         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(arg);
1912         struct nvme_tcp_queue *queue = &ctrl->queues[0];
1913         struct nvme_tcp_cmd_pdu *pdu = ctrl->async_req.pdu;
1914         struct nvme_command *cmd = &pdu->cmd;
1915         u8 hdgst = nvme_tcp_hdgst_len(queue);
1916
1917         memset(pdu, 0, sizeof(*pdu));
1918         pdu->hdr.type = nvme_tcp_cmd;
1919         if (queue->hdr_digest)
1920                 pdu->hdr.flags |= NVME_TCP_F_HDGST;
1921         pdu->hdr.hlen = sizeof(*pdu);
1922         pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
1923
1924         cmd->common.opcode = nvme_admin_async_event;
1925         cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
1926         cmd->common.flags |= NVME_CMD_SGL_METABUF;
1927         nvme_tcp_set_sg_null(cmd);
1928
1929         ctrl->async_req.state = NVME_TCP_SEND_CMD_PDU;
1930         ctrl->async_req.offset = 0;
1931         ctrl->async_req.curr_bio = NULL;
1932         ctrl->async_req.data_len = 0;
1933
1934         nvme_tcp_queue_request(&ctrl->async_req);
1935 }
1936
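/*
 * Request timeout handler.  If the controller is not LIVE (e.g. during
 * reset or connect) the request is completed as aborted right away;
 * otherwise error recovery is kicked off and the timer is restarted so the
 * request gets cleaned up by the teardown path.
 */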
1937 static enum blk_eh_timer_return
1938 nvme_tcp_timeout(struct request *rq, bool reserved)
1939 {
1940         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
1941         struct nvme_tcp_ctrl *ctrl = req->queue->ctrl;
1942         struct nvme_tcp_cmd_pdu *pdu = req->pdu;
1943
1944         dev_dbg(ctrl->ctrl.device,
1945                 "queue %d: timeout request %#x type %d\n",
1946                 nvme_tcp_queue_id(req->queue), rq->tag,
1947                 pdu->hdr.type);
1948
1949         if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
1950                 union nvme_result res = {};
1951
1952                 nvme_req(rq)->flags |= NVME_REQ_CANCELLED;
1953                 nvme_end_request(rq, NVME_SC_ABORT_REQ, res);
1954                 return BLK_EH_DONE;
1955         }
1956
1957         /* queue error recovery */
1958         nvme_tcp_error_recovery(&ctrl->ctrl);
1959
1960         return BLK_EH_RESET_TIMER;
1961 }
1962
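/*
 * Select the SGL type for the command: small writes that fit in the
 * in-capsule data area are sent inline with the command PDU, everything
 * else is described as host-resident transport data and transferred via
 * separate data PDUs.
 */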
1963 static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
1964                         struct request *rq)
1965 {
1966         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
1967         struct nvme_tcp_cmd_pdu *pdu = req->pdu;
1968         struct nvme_command *c = &pdu->cmd;
1969
1970         c->common.flags |= NVME_CMD_SGL_METABUF;
1971
1972         if (rq_data_dir(rq) == WRITE && req->data_len &&
1973             req->data_len <= nvme_tcp_inline_data_size(queue))
1974                 nvme_tcp_set_sg_inline(queue, c, req->data_len);
1975         else
1976                 nvme_tcp_set_sg_host_data(c, req->data_len);
1977
1978         return 0;
1979 }
1980
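/*
 * Initialize the per-request send state and fill in the command PDU
 * header: digest flags, header length, PDU data offset (pdo) and total
 * on-the-wire length (plen) including any inline data and digests.
 */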
1981 static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
1982                 struct request *rq)
1983 {
1984         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
1985         struct nvme_tcp_cmd_pdu *pdu = req->pdu;
1986         struct nvme_tcp_queue *queue = req->queue;
1987         u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0;
1988         blk_status_t ret;
1989
1990         ret = nvme_setup_cmd(ns, rq, &pdu->cmd);
1991         if (ret)
1992                 return ret;
1993
1994         req->state = NVME_TCP_SEND_CMD_PDU;
1995         req->offset = 0;
1996         req->data_sent = 0;
1997         req->pdu_len = 0;
1998         req->pdu_sent = 0;
1999         req->data_len = blk_rq_payload_bytes(rq);
2000         req->curr_bio = rq->bio;
2001
2002         if (rq_data_dir(rq) == WRITE &&
2003             req->data_len <= nvme_tcp_inline_data_size(queue))
2004                 req->pdu_len = req->data_len;
2005         else if (req->curr_bio)
2006                 nvme_tcp_init_iter(req, READ);
2007
2008         pdu->hdr.type = nvme_tcp_cmd;
2009         pdu->hdr.flags = 0;
2010         if (queue->hdr_digest)
2011                 pdu->hdr.flags |= NVME_TCP_F_HDGST;
2012         if (queue->data_digest && req->pdu_len) {
2013                 pdu->hdr.flags |= NVME_TCP_F_DDGST;
2014                 ddgst = nvme_tcp_ddgst_len(queue);
2015         }
2016         pdu->hdr.hlen = sizeof(*pdu);
2017         pdu->hdr.pdo = req->pdu_len ? pdu->hdr.hlen + hdgst : 0;
2018         pdu->hdr.plen =
2019                 cpu_to_le32(pdu->hdr.hlen + hdgst + req->pdu_len + ddgst);
2020
2021         ret = nvme_tcp_map_data(queue, rq);
2022         if (unlikely(ret)) {
2023                 dev_err(queue->ctrl->ctrl.device,
2024                         "Failed to map data (%d)\n", ret);
2025                 return ret;
2026         }
2027
2028         return 0;
2029 }
2030
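/*
 * blk-mq .queue_rq handler: reject commands while the queue is not live
 * (except those the fabrics layer allows), build the command PDU and hand
 * the request to the queue's send path.
 */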
2031 static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
2032                 const struct blk_mq_queue_data *bd)
2033 {
2034         struct nvme_ns *ns = hctx->queue->queuedata;
2035         struct nvme_tcp_queue *queue = hctx->driver_data;
2036         struct request *rq = bd->rq;
2037         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2038         bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags);
2039         blk_status_t ret;
2040
2041         if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
2042                 return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
2043
2044         ret = nvme_tcp_setup_cmd_pdu(ns, rq);
2045         if (unlikely(ret))
2046                 return ret;
2047
2048         blk_mq_start_request(rq);
2049
2050         nvme_tcp_queue_request(req);
2051
2052         return BLK_STS_OK;
2053 }
2054
2055 static struct blk_mq_ops nvme_tcp_mq_ops = {
2056         .queue_rq       = nvme_tcp_queue_rq,
2057         .complete       = nvme_complete_rq,
2058         .init_request   = nvme_tcp_init_request,
2059         .exit_request   = nvme_tcp_exit_request,
2060         .init_hctx      = nvme_tcp_init_hctx,
2061         .timeout        = nvme_tcp_timeout,
2062 };
2063
2064 static struct blk_mq_ops nvme_tcp_admin_mq_ops = {
2065         .queue_rq       = nvme_tcp_queue_rq,
2066         .complete       = nvme_complete_rq,
2067         .init_request   = nvme_tcp_init_request,
2068         .exit_request   = nvme_tcp_exit_request,
2069         .init_hctx      = nvme_tcp_init_admin_hctx,
2070         .timeout        = nvme_tcp_timeout,
2071 };
2072
2073 static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
2074         .name                   = "tcp",
2075         .module                 = THIS_MODULE,
2076         .flags                  = NVME_F_FABRICS,
2077         .reg_read32             = nvmf_reg_read32,
2078         .reg_read64             = nvmf_reg_read64,
2079         .reg_write32            = nvmf_reg_write32,
2080         .free_ctrl              = nvme_tcp_free_ctrl,
2081         .submit_async_event     = nvme_tcp_submit_async_event,
2082         .delete_ctrl            = nvme_tcp_delete_ctrl,
2083         .get_address            = nvmf_get_address,
2084         .stop_ctrl              = nvme_tcp_stop_ctrl,
2085 };
2086
2087 static bool
2088 nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts)
2089 {
2090         struct nvme_tcp_ctrl *ctrl;
2091         bool found = false;
2092
2093         mutex_lock(&nvme_tcp_ctrl_mutex);
2094         list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) {
2095                 found = nvmf_ip_options_match(&ctrl->ctrl, opts);
2096                 if (found)
2097                         break;
2098         }
2099         mutex_unlock(&nvme_tcp_ctrl_mutex);
2100
2101         return found;
2102 }
2103
2104 static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
2105                 struct nvmf_ctrl_options *opts)
2106 {
2107         struct nvme_tcp_ctrl *ctrl;
2108         int ret;
2109
2110         ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
2111         if (!ctrl)
2112                 return ERR_PTR(-ENOMEM);
2113
2114         INIT_LIST_HEAD(&ctrl->list);
2115         ctrl->ctrl.opts = opts;
2116         ctrl->ctrl.queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */
2117         ctrl->ctrl.sqsize = opts->queue_size - 1;
2118         ctrl->ctrl.kato = opts->kato;
2119
2120         INIT_DELAYED_WORK(&ctrl->connect_work,
2121                         nvme_tcp_reconnect_ctrl_work);
2122         INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
2123         INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
2124
2125         if (!(opts->mask & NVMF_OPT_TRSVCID)) {
2126                 opts->trsvcid =
2127                         kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
2128                 if (!opts->trsvcid) {
2129                         ret = -ENOMEM;
2130                         goto out_free_ctrl;
2131                 }
2132                 opts->mask |= NVMF_OPT_TRSVCID;
2133         }
2134
2135         ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2136                         opts->traddr, opts->trsvcid, &ctrl->addr);
2137         if (ret) {
2138                 pr_err("malformed address passed: %s:%s\n",
2139                         opts->traddr, opts->trsvcid);
2140                 goto out_free_ctrl;
2141         }
2142
2143         if (opts->mask & NVMF_OPT_HOST_TRADDR) {
2144                 ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2145                         opts->host_traddr, NULL, &ctrl->src_addr);
2146                 if (ret) {
2147                         pr_err("malformed src address passed: %s\n",
2148                                opts->host_traddr);
2149                         goto out_free_ctrl;
2150                 }
2151         }
2152
2153         if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) {
2154                 ret = -EALREADY;
2155                 goto out_free_ctrl;
2156         }
2157
2158         ctrl->queues = kcalloc(opts->nr_io_queues + 1, sizeof(*ctrl->queues),
2159                                 GFP_KERNEL);
2160         if (!ctrl->queues) {
2161                 ret = -ENOMEM;
2162                 goto out_free_ctrl;
2163         }
2164
2165         ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0);
2166         if (ret)
2167                 goto out_kfree_queues;
2168
2169         if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
2170                 WARN_ON_ONCE(1);
2171                 ret = -EINTR;
2172                 goto out_uninit_ctrl;
2173         }
2174
2175         ret = nvme_tcp_setup_ctrl(&ctrl->ctrl, true);
2176         if (ret)
2177                 goto out_uninit_ctrl;
2178
2179         dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
2180                 ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
2181
2182         nvme_get_ctrl(&ctrl->ctrl);
2183
2184         mutex_lock(&nvme_tcp_ctrl_mutex);
2185         list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
2186         mutex_unlock(&nvme_tcp_ctrl_mutex);
2187
2188         return &ctrl->ctrl;
2189
2190 out_uninit_ctrl:
2191         nvme_uninit_ctrl(&ctrl->ctrl);
2192         nvme_put_ctrl(&ctrl->ctrl);
2193         if (ret > 0)
2194                 ret = -EIO;
2195         return ERR_PTR(ret);
2196 out_kfree_queues:
2197         kfree(ctrl->queues);
2198 out_free_ctrl:
2199         kfree(ctrl);
2200         return ERR_PTR(ret);
2201 }
2202
2203 static struct nvmf_transport_ops nvme_tcp_transport = {
2204         .name           = "tcp",
2205         .module         = THIS_MODULE,
2206         .required_opts  = NVMF_OPT_TRADDR,
2207         .allowed_opts   = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
2208                           NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
2209                           NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST,
2210         .create_ctrl    = nvme_tcp_create_ctrl,
2211 };
2212
2213 static int __init nvme_tcp_init_module(void)
2214 {
2215         nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq",
2216                         WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
2217         if (!nvme_tcp_wq)
2218                 return -ENOMEM;
2219
2220         nvmf_register_transport(&nvme_tcp_transport);
2221         return 0;
2222 }
2223
2224 static void __exit nvme_tcp_cleanup_module(void)
2225 {
2226         struct nvme_tcp_ctrl *ctrl;
2227
2228         nvmf_unregister_transport(&nvme_tcp_transport);
2229
2230         mutex_lock(&nvme_tcp_ctrl_mutex);
2231         list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list)
2232                 nvme_delete_ctrl(&ctrl->ctrl);
2233         mutex_unlock(&nvme_tcp_ctrl_mutex);
2234         flush_workqueue(nvme_delete_wq);
2235
2236         destroy_workqueue(nvme_tcp_wq);
2237 }
2238
2239 module_init(nvme_tcp_init_module);
2240 module_exit(nvme_tcp_cleanup_module);
2241
2242 MODULE_LICENSE("GPL v2");