1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * NVMe over Fabrics TCP target.
4  * Copyright (c) 2018 Lightbits Labs. All rights reserved.
5  */
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7 #include <linux/module.h>
8 #include <linux/init.h>
9 #include <linux/slab.h>
10 #include <linux/err.h>
11 #include <linux/nvme-tcp.h>
12 #include <net/sock.h>
13 #include <net/tcp.h>
14 #include <linux/inet.h>
15 #include <linux/llist.h>
16 #include <crypto/hash.h>
17
18 #include "nvmet.h"
19
20 #define NVMET_TCP_DEF_INLINE_DATA_SIZE  (4 * PAGE_SIZE)
21
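/*
 * Work budgets for nvmet_tcp_io_work(): at most NVMET_TCP_RECV_BUDGET receive
 * operations and NVMET_TCP_SEND_BUDGET send operations per loop iteration,
 * bounded overall by NVMET_TCP_IO_WORK_BUDGET before the work item requeues
 * itself so a busy queue does not monopolize the workqueue.
 */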
22 #define NVMET_TCP_RECV_BUDGET           8
23 #define NVMET_TCP_SEND_BUDGET           8
24 #define NVMET_TCP_IO_WORK_BUDGET        64
25
26 enum nvmet_tcp_send_state {
27         NVMET_TCP_SEND_DATA_PDU,
28         NVMET_TCP_SEND_DATA,
29         NVMET_TCP_SEND_R2T,
30         NVMET_TCP_SEND_DDGST,
31         NVMET_TCP_SEND_RESPONSE
32 };
33
34 enum nvmet_tcp_recv_state {
35         NVMET_TCP_RECV_PDU,
36         NVMET_TCP_RECV_DATA,
37         NVMET_TCP_RECV_DDGST,
38         NVMET_TCP_RECV_ERR,
39 };
40
41 enum {
42         NVMET_TCP_F_INIT_FAILED = (1 << 0),
43 };
44
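/*
 * Per-command context. req.cmd points into the preallocated cmd_pdu receive
 * buffer; rsp_pdu, data_pdu and r2t_pdu are the preallocated headers for the
 * PDUs the target may send back. rbytes_done/wbytes_done track how much of
 * the transfer has been received/sent, and the send-state fields (state,
 * cur_sg, offset) drive nvmet_tcp_try_send_one().
 */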
45 struct nvmet_tcp_cmd {
46         struct nvmet_tcp_queue          *queue;
47         struct nvmet_req                req;
48
49         struct nvme_tcp_cmd_pdu         *cmd_pdu;
50         struct nvme_tcp_rsp_pdu         *rsp_pdu;
51         struct nvme_tcp_data_pdu        *data_pdu;
52         struct nvme_tcp_r2t_pdu         *r2t_pdu;
53
54         u32                             rbytes_done;
55         u32                             wbytes_done;
56
57         u32                             pdu_len;
58         u32                             pdu_recv;
59         int                             sg_idx;
60         int                             nr_mapped;
61         struct msghdr                   recv_msg;
62         struct kvec                     *iov;
63         u32                             flags;
64
65         struct list_head                entry;
66         struct llist_node               lentry;
67
68         /* send state */
69         u32                             offset;
70         struct scatterlist              *cur_sg;
71         enum nvmet_tcp_send_state       state;
72
73         __le32                          exp_ddgst;
74         __le32                          recv_ddgst;
75 };
76
77 enum nvmet_tcp_queue_state {
78         NVMET_TCP_Q_CONNECTING,
79         NVMET_TCP_Q_LIVE,
80         NVMET_TCP_Q_DISCONNECTING,
81 };
82
83 struct nvmet_tcp_queue {
84         struct socket           *sock;
85         struct nvmet_tcp_port   *port;
86         struct work_struct      io_work;
87         int                     cpu;
88         struct nvmet_cq         nvme_cq;
89         struct nvmet_sq         nvme_sq;
90
91         /* send state */
92         struct nvmet_tcp_cmd    *cmds;
93         unsigned int            nr_cmds;
94         struct list_head        free_list;
95         struct llist_head       resp_list;
96         struct list_head        resp_send_list;
97         int                     send_list_len;
98         struct nvmet_tcp_cmd    *snd_cmd;
99
100         /* recv state */
101         int                     offset;
102         int                     left;
103         enum nvmet_tcp_recv_state rcv_state;
104         struct nvmet_tcp_cmd    *cmd;
105         union nvme_tcp_pdu      pdu;
106
107         /* digest state */
108         bool                    hdr_digest;
109         bool                    data_digest;
110         struct ahash_request    *snd_hash;
111         struct ahash_request    *rcv_hash;
112
113         spinlock_t              state_lock;
114         enum nvmet_tcp_queue_state state;
115
116         struct sockaddr_storage sockaddr;
117         struct sockaddr_storage sockaddr_peer;
118         struct work_struct      release_work;
119
120         int                     idx;
121         struct list_head        queue_list;
122
123         struct nvmet_tcp_cmd    connect;
124
125         struct page_frag_cache  pf_cache;
126
127         void (*data_ready)(struct sock *);
128         void (*state_change)(struct sock *);
129         void (*write_space)(struct sock *);
130 };
131
132 struct nvmet_tcp_port {
133         struct socket           *sock;
134         struct work_struct      accept_work;
135         struct nvmet_port       *nport;
136         struct sockaddr_storage addr;
137         int                     last_cpu;
138         void (*data_ready)(struct sock *);
139 };
140
141 static DEFINE_IDA(nvmet_tcp_queue_ida);
142 static LIST_HEAD(nvmet_tcp_queue_list);
143 static DEFINE_MUTEX(nvmet_tcp_queue_mutex);
144
145 static struct workqueue_struct *nvmet_tcp_wq;
146 static struct nvmet_fabrics_ops nvmet_tcp_ops;
147 static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c);
148 static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd);
149
150 static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
151                 struct nvmet_tcp_cmd *cmd)
152 {
153         return cmd - queue->cmds;
154 }
155
156 static inline bool nvmet_tcp_has_data_in(struct nvmet_tcp_cmd *cmd)
157 {
158         return nvme_is_write(cmd->req.cmd) &&
159                 cmd->rbytes_done < cmd->req.transfer_len;
160 }
161
162 static inline bool nvmet_tcp_need_data_in(struct nvmet_tcp_cmd *cmd)
163 {
164         return nvmet_tcp_has_data_in(cmd) && !cmd->req.cqe->status;
165 }
166
167 static inline bool nvmet_tcp_need_data_out(struct nvmet_tcp_cmd *cmd)
168 {
169         return !nvme_is_write(cmd->req.cmd) &&
170                 cmd->req.transfer_len > 0 &&
171                 !cmd->req.cqe->status;
172 }
173
174 static inline bool nvmet_tcp_has_inline_data(struct nvmet_tcp_cmd *cmd)
175 {
176         return nvme_is_write(cmd->req.cmd) && cmd->pdu_len &&
177                 !cmd->rbytes_done;
178 }
179
180 static inline struct nvmet_tcp_cmd *
181 nvmet_tcp_get_cmd(struct nvmet_tcp_queue *queue)
182 {
183         struct nvmet_tcp_cmd *cmd;
184
185         cmd = list_first_entry_or_null(&queue->free_list,
186                                 struct nvmet_tcp_cmd, entry);
187         if (!cmd)
188                 return NULL;
189         list_del_init(&cmd->entry);
190
191         cmd->rbytes_done = cmd->wbytes_done = 0;
192         cmd->pdu_len = 0;
193         cmd->pdu_recv = 0;
194         cmd->iov = NULL;
195         cmd->flags = 0;
196         return cmd;
197 }
198
199 static inline void nvmet_tcp_put_cmd(struct nvmet_tcp_cmd *cmd)
200 {
201         if (unlikely(cmd == &cmd->queue->connect))
202                 return;
203
204         list_add_tail(&cmd->entry, &cmd->queue->free_list);
205 }
206
207 static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue)
208 {
209         return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
210 }
211
212 static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue)
213 {
214         return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
215 }
216
217 static inline void nvmet_tcp_hdgst(struct ahash_request *hash,
218                 void *pdu, size_t len)
219 {
220         struct scatterlist sg;
221
222         sg_init_one(&sg, pdu, len);
223         ahash_request_set_crypt(hash, &sg, pdu + len, len);
224         crypto_ahash_digest(hash);
225 }
226
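/*
 * Verify the header digest: save the digest received right after the PDU
 * header, recompute the digest over the header in place (nvmet_tcp_hdgst()
 * stores its result at pdu + len), and compare the two values.
 */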
227 static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue,
228         void *pdu, size_t len)
229 {
230         struct nvme_tcp_hdr *hdr = pdu;
231         __le32 recv_digest;
232         __le32 exp_digest;
233
234         if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
235                 pr_err("queue %d: header digest enabled but no header digest\n",
236                         queue->idx);
237                 return -EPROTO;
238         }
239
240         recv_digest = *(__le32 *)(pdu + hdr->hlen);
241         nvmet_tcp_hdgst(queue->rcv_hash, pdu, len);
242         exp_digest = *(__le32 *)(pdu + hdr->hlen);
243         if (recv_digest != exp_digest) {
244                 pr_err("queue %d: header digest error: recv %#x expected %#x\n",
245                         queue->idx, le32_to_cpu(recv_digest),
246                         le32_to_cpu(exp_digest));
247                 return -EPROTO;
248         }
249
250         return 0;
251 }
252
253 static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu)
254 {
255         struct nvme_tcp_hdr *hdr = pdu;
256         u8 digest_len = nvmet_tcp_hdgst_len(queue);
257         u32 len;
258
259         len = le32_to_cpu(hdr->plen) - hdr->hlen -
260                 (hdr->flags & NVME_TCP_F_HDGST ? digest_len : 0);
261
262         if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
263                 pr_err("queue %d: data digest flag is cleared\n", queue->idx);
264                 return -EPROTO;
265         }
266
267         return 0;
268 }
269
270 static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd)
271 {
272         struct scatterlist *sg;
273         int i;
274
275         sg = &cmd->req.sg[cmd->sg_idx];
276
277         for (i = 0; i < cmd->nr_mapped; i++)
278                 kunmap(sg_page(&sg[i]));
279 }
280
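/*
 * Build a kvec over the command's data scatterlist so incoming (inline or
 * H2C) data can be received directly into the destination pages with
 * sock_recvmsg(). The starting sg entry and intra-page offset are derived
 * from how many bytes have already been received (rbytes_done).
 */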
281 static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd)
282 {
283         struct kvec *iov = cmd->iov;
284         struct scatterlist *sg;
285         u32 length, offset, sg_offset;
286
287         length = cmd->pdu_len;
288         cmd->nr_mapped = DIV_ROUND_UP(length, PAGE_SIZE);
289         offset = cmd->rbytes_done;
290         cmd->sg_idx = offset / PAGE_SIZE;
291         sg_offset = offset % PAGE_SIZE;
292         sg = &cmd->req.sg[cmd->sg_idx];
293
294         while (length) {
295                 u32 iov_len = min_t(u32, length, sg->length - sg_offset);
296
297                 iov->iov_base = kmap(sg_page(sg)) + sg->offset + sg_offset;
298                 iov->iov_len = iov_len;
299
300                 length -= iov_len;
301                 sg = sg_next(sg);
302                 iov++;
                    /* subsequent sg entries start at offset 0 within the page */
                    sg_offset = 0;
303         }
304
305         iov_iter_kvec(&cmd->recv_msg.msg_iter, READ, cmd->iov,
306                 cmd->nr_mapped, cmd->pdu_len);
307 }
308
309 static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue)
310 {
311         queue->rcv_state = NVMET_TCP_RECV_ERR;
312         if (queue->nvme_sq.ctrl)
313                 nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
314         else
315                 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
316 }
317
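/*
 * Parse the command's SGL descriptor and set up the data buffer: an
 * offset-type data block descriptor means in-capsule (inline) data, which is
 * only valid for writes and must fit within the port's inline_data_size. The
 * scatterlist is allocated for the full transfer and, for host-to-controller
 * data, a kvec array is allocated for receiving into it.
 */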
318 static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd)
319 {
320         struct nvme_sgl_desc *sgl = &cmd->req.cmd->common.dptr.sgl;
321         u32 len = le32_to_cpu(sgl->length);
322
323         if (!cmd->req.data_len)
324                 return 0;
325
326         if (sgl->type == ((NVME_SGL_FMT_DATA_DESC << 4) |
327                           NVME_SGL_FMT_OFFSET)) {
328                 if (!nvme_is_write(cmd->req.cmd))
329                         return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
330
331                 if (len > cmd->req.port->inline_data_size)
332                         return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
333                 cmd->pdu_len = len;
334         }
335         cmd->req.transfer_len += len;
336
337         cmd->req.sg = sgl_alloc(len, GFP_KERNEL, &cmd->req.sg_cnt);
338         if (!cmd->req.sg)
339                 return NVME_SC_INTERNAL;
340         cmd->cur_sg = cmd->req.sg;
341
342         if (nvmet_tcp_has_data_in(cmd)) {
343                 cmd->iov = kmalloc_array(cmd->req.sg_cnt,
344                                 sizeof(*cmd->iov), GFP_KERNEL);
345                 if (!cmd->iov)
346                         goto err;
347         }
348
349         return 0;
350 err:
351         if (cmd->req.sg_cnt)
352                 sgl_free(cmd->req.sg);
353         return NVME_SC_INTERNAL;
354 }
355
356 static void nvmet_tcp_ddgst(struct ahash_request *hash,
357                 struct nvmet_tcp_cmd *cmd)
358 {
359         ahash_request_set_crypt(hash, cmd->req.sg,
360                 (void *)&cmd->exp_ddgst, cmd->req.transfer_len);
361         crypto_ahash_digest(hash);
362 }
363
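/*
 * Prepare a C2H data PDU for a read completion. The data digest, if
 * negotiated, is computed over the whole payload up front and transmitted
 * after the data; the header digest is computed after all header fields
 * (including the DDGST flag) are final.
 */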
364 static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd)
365 {
366         struct nvme_tcp_data_pdu *pdu = cmd->data_pdu;
367         struct nvmet_tcp_queue *queue = cmd->queue;
368         u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
369         u8 ddgst = nvmet_tcp_ddgst_len(cmd->queue);
370
371         cmd->offset = 0;
372         cmd->state = NVMET_TCP_SEND_DATA_PDU;
373
374         pdu->hdr.type = nvme_tcp_c2h_data;
375         pdu->hdr.flags = NVME_TCP_F_DATA_LAST | (queue->nvme_sq.sqhd_disabled ?
376                                                 NVME_TCP_F_DATA_SUCCESS : 0);
377         pdu->hdr.hlen = sizeof(*pdu);
378         pdu->hdr.pdo = pdu->hdr.hlen + hdgst;
379         pdu->hdr.plen =
380                 cpu_to_le32(pdu->hdr.hlen + hdgst +
381                                 cmd->req.transfer_len + ddgst);
382         pdu->command_id = cmd->req.cqe->command_id;
383         pdu->data_length = cpu_to_le32(cmd->req.transfer_len);
384         pdu->data_offset = cpu_to_le32(cmd->wbytes_done);
385
386         if (queue->data_digest) {
387                 pdu->hdr.flags |= NVME_TCP_F_DDGST;
388                 nvmet_tcp_ddgst(queue->snd_hash, cmd);
389         }
390
391         if (cmd->queue->hdr_digest) {
392                 pdu->hdr.flags |= NVME_TCP_F_HDGST;
393                 nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
394         }
395 }
396
397 static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd)
398 {
399         struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu;
400         struct nvmet_tcp_queue *queue = cmd->queue;
401         u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
402
403         cmd->offset = 0;
404         cmd->state = NVMET_TCP_SEND_R2T;
405
406         pdu->hdr.type = nvme_tcp_r2t;
407         pdu->hdr.flags = 0;
408         pdu->hdr.hlen = sizeof(*pdu);
409         pdu->hdr.pdo = 0;
410         pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
411
412         pdu->command_id = cmd->req.cmd->common.command_id;
413         pdu->ttag = nvmet_tcp_cmd_tag(cmd->queue, cmd);
414         pdu->r2t_length = cpu_to_le32(cmd->req.transfer_len - cmd->rbytes_done);
415         pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done);
416         if (cmd->queue->hdr_digest) {
417                 pdu->hdr.flags |= NVME_TCP_F_HDGST;
418                 nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
419         }
420 }
421
422 static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
423 {
424         struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu;
425         struct nvmet_tcp_queue *queue = cmd->queue;
426         u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
427
428         cmd->offset = 0;
429         cmd->state = NVMET_TCP_SEND_RESPONSE;
430
431         pdu->hdr.type = nvme_tcp_rsp;
432         pdu->hdr.flags = 0;
433         pdu->hdr.hlen = sizeof(*pdu);
434         pdu->hdr.pdo = 0;
435         pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
436         if (cmd->queue->hdr_digest) {
437                 pdu->hdr.flags |= NVME_TCP_F_HDGST;
438                 nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
439         }
440 }
441
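/*
 * Completions are queued from arbitrary contexts onto the lockless resp_list
 * (see nvmet_tcp_queue_response()); io_work splices them onto the ordered
 * resp_send_list, from which nvmet_tcp_fetch_cmd() picks the next command to
 * transmit.
 */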
442 static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue)
443 {
444         struct llist_node *node;
445
446         node = llist_del_all(&queue->resp_list);
447         if (!node)
448                 return;
449
450         while (node) {
451                 struct nvmet_tcp_cmd *cmd = llist_entry(node,
452                                         struct nvmet_tcp_cmd, lentry);
453
454                 list_add(&cmd->entry, &queue->resp_send_list);
455                 node = node->next;
456                 queue->send_list_len++;
457         }
458 }
459
460 static struct nvmet_tcp_cmd *nvmet_tcp_fetch_cmd(struct nvmet_tcp_queue *queue)
461 {
462         queue->snd_cmd = list_first_entry_or_null(&queue->resp_send_list,
463                                 struct nvmet_tcp_cmd, entry);
464         if (!queue->snd_cmd) {
465                 nvmet_tcp_process_resp_list(queue);
466                 queue->snd_cmd =
467                         list_first_entry_or_null(&queue->resp_send_list,
468                                         struct nvmet_tcp_cmd, entry);
469                 if (unlikely(!queue->snd_cmd))
470                         return NULL;
471         }
472
473         list_del_init(&queue->snd_cmd->entry);
474         queue->send_list_len--;
475
476         if (nvmet_tcp_need_data_out(queue->snd_cmd))
477                 nvmet_setup_c2h_data_pdu(queue->snd_cmd);
478         else if (nvmet_tcp_need_data_in(queue->snd_cmd))
479                 nvmet_setup_r2t_pdu(queue->snd_cmd);
480         else
481                 nvmet_setup_response_pdu(queue->snd_cmd);
482
483         return queue->snd_cmd;
484 }
485
486 static void nvmet_tcp_queue_response(struct nvmet_req *req)
487 {
488         struct nvmet_tcp_cmd *cmd =
489                 container_of(req, struct nvmet_tcp_cmd, req);
490         struct nvmet_tcp_queue  *queue = cmd->queue;
491
492         llist_add(&cmd->lentry, &queue->resp_list);
493         queue_work_on(cmd->queue->cpu, nvmet_tcp_wq, &cmd->queue->io_work);
494 }
495
496 static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd)
497 {
498         u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
499         int left = sizeof(*cmd->data_pdu) - cmd->offset + hdgst;
500         int ret;
501
502         ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->data_pdu),
503                         offset_in_page(cmd->data_pdu) + cmd->offset,
504                         left, MSG_DONTWAIT | MSG_MORE);
505         if (ret <= 0)
506                 return ret;
507
508         cmd->offset += ret;
509         left -= ret;
510
511         if (left)
512                 return -EAGAIN;
513
514         cmd->state = NVMET_TCP_SEND_DATA;
515         cmd->offset  = 0;
516         return 1;
517 }
518
519 static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd)
520 {
521         struct nvmet_tcp_queue *queue = cmd->queue;
522         int ret;
523
524         while (cmd->cur_sg) {
525                 struct page *page = sg_page(cmd->cur_sg);
526                 u32 left = cmd->cur_sg->length - cmd->offset;
527
528                 ret = kernel_sendpage(cmd->queue->sock, page, cmd->offset,
529                                         left, MSG_DONTWAIT | MSG_MORE);
530                 if (ret <= 0)
531                         return ret;
532
533                 cmd->offset += ret;
534                 cmd->wbytes_done += ret;
535
536                 /* Done with sg? */
537                 if (cmd->offset == cmd->cur_sg->length) {
538                         cmd->cur_sg = sg_next(cmd->cur_sg);
539                         cmd->offset = 0;
540                 }
541         }
542
543         if (queue->data_digest) {
544                 cmd->state = NVMET_TCP_SEND_DDGST;
545                 cmd->offset = 0;
546         } else {
547                 if (queue->nvme_sq.sqhd_disabled) {
548                         cmd->queue->snd_cmd = NULL;
549                         nvmet_tcp_put_cmd(cmd);
550                 } else {
551                         nvmet_setup_response_pdu(cmd);
552                 }
553         }
554
555         if (queue->nvme_sq.sqhd_disabled) {
556                 kfree(cmd->iov);
557                 if (cmd->req.sg_cnt)
558                         sgl_free(cmd->req.sg);
559         }
560
561         return 1;
562
563 }
564
565 static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
566                 bool last_in_batch)
567 {
568         u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
569         int left = sizeof(*cmd->rsp_pdu) - cmd->offset + hdgst;
570         int flags = MSG_DONTWAIT;
571         int ret;
572
573         if (!last_in_batch && cmd->queue->send_list_len)
574                 flags |= MSG_MORE;
575         else
576                 flags |= MSG_EOR;
577
578         ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->rsp_pdu),
579                 offset_in_page(cmd->rsp_pdu) + cmd->offset, left, flags);
580         if (ret <= 0)
581                 return ret;
582         cmd->offset += ret;
583         left -= ret;
584
585         if (left)
586                 return -EAGAIN;
587
588         kfree(cmd->iov);
589         if (cmd->req.sg_cnt)
590                 sgl_free(cmd->req.sg);
591         cmd->queue->snd_cmd = NULL;
592         nvmet_tcp_put_cmd(cmd);
593         return 1;
594 }
595
596 static int nvmet_try_send_r2t(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
597 {
598         u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
599         int left = sizeof(*cmd->r2t_pdu) - cmd->offset + hdgst;
600         int flags = MSG_DONTWAIT;
601         int ret;
602
603         if (!last_in_batch && cmd->queue->send_list_len)
604                 flags |= MSG_MORE;
605         else
606                 flags |= MSG_EOR;
607
608         ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->r2t_pdu),
609                 offset_in_page(cmd->r2t_pdu) + cmd->offset, left, flags);
610         if (ret <= 0)
611                 return ret;
612         cmd->offset += ret;
613         left -= ret;
614
615         if (left)
616                 return -EAGAIN;
617
618         cmd->queue->snd_cmd = NULL;
619         return 1;
620 }
621
622 static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd)
623 {
624         struct nvmet_tcp_queue *queue = cmd->queue;
625         struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
626         struct kvec iov = {
627                 .iov_base = (u8 *)&cmd->exp_ddgst + cmd->offset,
628                 .iov_len = NVME_TCP_DIGEST_LENGTH - cmd->offset
629         };
630         int ret;
631
632         ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
633         if (unlikely(ret <= 0))
634                 return ret;
635
636         cmd->offset += ret;
637
            /* a short send can leave part of the digest pending; retry later */
            if (cmd->offset < NVME_TCP_DIGEST_LENGTH)
                    return -EAGAIN;

638         if (queue->nvme_sq.sqhd_disabled) {
639                 cmd->queue->snd_cmd = NULL;
640                 nvmet_tcp_put_cmd(cmd);
641         } else {
642                 nvmet_setup_response_pdu(cmd);
643         }
644         return 1;
645 }
646
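/*
 * Drive the per-command transmit state machine: a read completion goes
 * C2H data PDU -> data -> (data digest) -> response, while other commands
 * send a standalone R2T or response PDU. Returns 1 when progress was made,
 * 0 when there is nothing to send or the socket would block, and a negative
 * value on hard errors.
 */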
647 static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue,
648                 bool last_in_batch)
649 {
650         struct nvmet_tcp_cmd *cmd = queue->snd_cmd;
651         int ret = 0;
652
653         if (!cmd || queue->state == NVMET_TCP_Q_DISCONNECTING) {
654                 cmd = nvmet_tcp_fetch_cmd(queue);
655                 if (unlikely(!cmd))
656                         return 0;
657         }
658
659         if (cmd->state == NVMET_TCP_SEND_DATA_PDU) {
660                 ret = nvmet_try_send_data_pdu(cmd);
661                 if (ret <= 0)
662                         goto done_send;
663         }
664
665         if (cmd->state == NVMET_TCP_SEND_DATA) {
666                 ret = nvmet_try_send_data(cmd);
667                 if (ret <= 0)
668                         goto done_send;
669         }
670
671         if (cmd->state == NVMET_TCP_SEND_DDGST) {
672                 ret = nvmet_try_send_ddgst(cmd);
673                 if (ret <= 0)
674                         goto done_send;
675         }
676
677         if (cmd->state == NVMET_TCP_SEND_R2T) {
678                 ret = nvmet_try_send_r2t(cmd, last_in_batch);
679                 if (ret <= 0)
680                         goto done_send;
681         }
682
683         if (cmd->state == NVMET_TCP_SEND_RESPONSE)
684                 ret = nvmet_try_send_response(cmd, last_in_batch);
685
686 done_send:
687         if (ret < 0) {
688                 if (ret == -EAGAIN)
689                         return 0;
690                 return ret;
691         }
692
693         return 1;
694 }
695
696 static int nvmet_tcp_try_send(struct nvmet_tcp_queue *queue,
697                 int budget, int *sends)
698 {
699         int i, ret = 0;
700
701         for (i = 0; i < budget; i++) {
702                 ret = nvmet_tcp_try_send_one(queue, i == budget - 1);
703                 if (ret <= 0)
704                         break;
705                 (*sends)++;
706         }
707
708         return ret;
709 }
710
711 static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue)
712 {
713         queue->offset = 0;
714         queue->left = sizeof(struct nvme_tcp_hdr);
715         queue->cmd = NULL;
716         queue->rcv_state = NVMET_TCP_RECV_PDU;
717 }
718
719 static void nvmet_tcp_free_crypto(struct nvmet_tcp_queue *queue)
720 {
721         struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
722
723         ahash_request_free(queue->rcv_hash);
724         ahash_request_free(queue->snd_hash);
725         crypto_free_ahash(tfm);
726 }
727
728 static int nvmet_tcp_alloc_crypto(struct nvmet_tcp_queue *queue)
729 {
730         struct crypto_ahash *tfm;
731
732         tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
733         if (IS_ERR(tfm))
734                 return PTR_ERR(tfm);
735
736         queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
737         if (!queue->snd_hash)
738                 goto free_tfm;
739         ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
740
741         queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
742         if (!queue->rcv_hash)
743                 goto free_snd_hash;
744         ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
745
746         return 0;
747 free_snd_hash:
748         ahash_request_free(queue->snd_hash);
749 free_tfm:
750         crypto_free_ahash(tfm);
751         return -ENOMEM;
752 }
753
754
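/*
 * Handle the connection-establishment ICReq PDU: validate the PDU length,
 * format version and alignment parameters, negotiate header/data digests
 * (allocating crc32c ahash contexts if either is enabled), and reply with an
 * ICResp before moving the queue to LIVE.
 */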
755 static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue)
756 {
757         struct nvme_tcp_icreq_pdu *icreq = &queue->pdu.icreq;
758         struct nvme_tcp_icresp_pdu *icresp = &queue->pdu.icresp;
759         struct msghdr msg = {};
760         struct kvec iov;
761         int ret;
762
763         if (le32_to_cpu(icreq->hdr.plen) != sizeof(struct nvme_tcp_icreq_pdu)) {
764                 pr_err("bad nvme-tcp pdu length (%d)\n",
765                         le32_to_cpu(icreq->hdr.plen));
766                 nvmet_tcp_fatal_error(queue);
                    return -EPROTO;
767         }
768
769         if (icreq->pfv != NVME_TCP_PFV_1_0) {
770                 pr_err("queue %d: bad pfv %d\n", queue->idx, icreq->pfv);
771                 return -EPROTO;
772         }
773
774         if (icreq->hpda != 0) {
775                 pr_err("queue %d: unsupported hpda %d\n", queue->idx,
776                         icreq->hpda);
777                 return -EPROTO;
778         }
779
780         queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE);
781         queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE);
782         if (queue->hdr_digest || queue->data_digest) {
783                 ret = nvmet_tcp_alloc_crypto(queue);
784                 if (ret)
785                         return ret;
786         }
787
788         memset(icresp, 0, sizeof(*icresp));
789         icresp->hdr.type = nvme_tcp_icresp;
790         icresp->hdr.hlen = sizeof(*icresp);
791         icresp->hdr.pdo = 0;
792         icresp->hdr.plen = cpu_to_le32(icresp->hdr.hlen);
793         icresp->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
794         icresp->maxdata = cpu_to_le32(0xffff); /* FIXME: support r2t */
795         icresp->cpda = 0;
796         if (queue->hdr_digest)
797                 icresp->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
798         if (queue->data_digest)
799                 icresp->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
800
801         iov.iov_base = icresp;
802         iov.iov_len = sizeof(*icresp);
803         ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
804         if (ret < 0)
805                 goto free_crypto;
806
807         queue->state = NVMET_TCP_Q_LIVE;
808         nvmet_prepare_receive_pdu(queue);
809         return 0;
810 free_crypto:
811         if (queue->hdr_digest || queue->data_digest)
812                 nvmet_tcp_free_crypto(queue);
813         return ret;
814 }
815
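/*
 * nvmet_req_init() failed for this command (the error response is already
 * queued). If the command carries in-capsule data we still have to receive
 * that data to keep the byte stream in sync, so map the buffer and mark the
 * command with NVMET_TCP_F_INIT_FAILED; otherwise just wait for the next PDU.
 */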
816 static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue,
817                 struct nvmet_tcp_cmd *cmd, struct nvmet_req *req)
818 {
819         int ret;
820
821         /* recover the expected data transfer length */
822         req->data_len = le32_to_cpu(req->cmd->common.dptr.sgl.length);
823
824         if (!nvme_is_write(cmd->req.cmd) ||
825             req->data_len > cmd->req.port->inline_data_size) {
826                 nvmet_prepare_receive_pdu(queue);
827                 return;
828         }
829
830         ret = nvmet_tcp_map_data(cmd);
831         if (unlikely(ret)) {
832                 pr_err("queue %d: failed to map data\n", queue->idx);
833                 nvmet_tcp_fatal_error(queue);
834                 return;
835         }
836
837         queue->rcv_state = NVMET_TCP_RECV_DATA;
838         nvmet_tcp_map_pdu_iovec(cmd);
839         cmd->flags |= NVMET_TCP_F_INIT_FAILED;
840 }
841
842 static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue)
843 {
844         struct nvme_tcp_data_pdu *data = &queue->pdu.data;
845         struct nvmet_tcp_cmd *cmd;
846
        /* the ttag comes off the wire; bound it before indexing the cmd array */
        if (unlikely(data->ttag >= queue->nr_cmds)) {
                pr_err("queue %d: received out of bound ttag %u, nr_cmds %u\n",
                        queue->idx, data->ttag, queue->nr_cmds);
                nvmet_tcp_fatal_error(queue);
                return -EPROTO;
        }

847         cmd = &queue->cmds[data->ttag];
848
849         if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) {
850                 pr_err("ttag %u unexpected data offset %u (expected %u)\n",
851                         data->ttag, le32_to_cpu(data->data_offset),
852                         cmd->rbytes_done);
853                 /* FIXME: use path and transport errors */
854                 nvmet_req_complete(&cmd->req,
855                         NVME_SC_INVALID_FIELD | NVME_SC_DNR);
856                 return -EPROTO;
857         }
858
859         cmd->pdu_len = le32_to_cpu(data->data_length);
860         cmd->pdu_recv = 0;
861         nvmet_tcp_map_pdu_iovec(cmd);
862         queue->cmd = cmd;
863         queue->rcv_state = NVMET_TCP_RECV_DATA;
864
865         return 0;
866 }
867
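/*
 * A complete PDU header (and header digest, if any) has been received:
 * dispatch on the PDU type. ICReq is only legal while connecting, H2C data is
 * routed to its outstanding command via the ttag, and command capsules are
 * initialized and then either executed directly, set up to receive their
 * inline data, or answered with an R2T.
 */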
868 static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue)
869 {
870         struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
871         struct nvme_command *nvme_cmd = &queue->pdu.cmd.cmd;
872         struct nvmet_req *req;
873         int ret;
874
875         if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
876                 if (hdr->type != nvme_tcp_icreq) {
877                         pr_err("unexpected pdu type (%d) before icreq\n",
878                                 hdr->type);
879                         nvmet_tcp_fatal_error(queue);
880                         return -EPROTO;
881                 }
882                 return nvmet_tcp_handle_icreq(queue);
883         }
884
885         if (hdr->type == nvme_tcp_h2c_data) {
886                 ret = nvmet_tcp_handle_h2c_data_pdu(queue);
887                 if (unlikely(ret))
888                         return ret;
889                 return 0;
890         }
891
892         queue->cmd = nvmet_tcp_get_cmd(queue);
893         if (unlikely(!queue->cmd)) {
894                 /* This should never happen */
895                 pr_err("queue %d: out of commands (%d) send_list_len: %d, opcode: %d\n",
896                         queue->idx, queue->nr_cmds, queue->send_list_len,
897                         nvme_cmd->common.opcode);
898                 nvmet_tcp_fatal_error(queue);
899                 return -ENOMEM;
900         }
901
902         req = &queue->cmd->req;
903         memcpy(req->cmd, nvme_cmd, sizeof(*nvme_cmd));
904
905         if (unlikely(!nvmet_req_init(req, &queue->nvme_cq,
906                         &queue->nvme_sq, &nvmet_tcp_ops))) {
907                 pr_err("failed cmd %p id %d opcode %d, data_len: %d\n",
908                         req->cmd, req->cmd->common.command_id,
909                         req->cmd->common.opcode,
910                         le32_to_cpu(req->cmd->common.dptr.sgl.length));
911
912                 nvmet_tcp_handle_req_failure(queue, queue->cmd, req);
913                 return -EAGAIN;
914         }
915
916         ret = nvmet_tcp_map_data(queue->cmd);
917         if (unlikely(ret)) {
918                 pr_err("queue %d: failed to map data\n", queue->idx);
919                 if (nvmet_tcp_has_inline_data(queue->cmd))
920                         nvmet_tcp_fatal_error(queue);
921                 else
922                         nvmet_req_complete(req, ret);
923                 ret = -EAGAIN;
924                 goto out;
925         }
926
927         if (nvmet_tcp_need_data_in(queue->cmd)) {
928                 if (nvmet_tcp_has_inline_data(queue->cmd)) {
929                         queue->rcv_state = NVMET_TCP_RECV_DATA;
930                         nvmet_tcp_map_pdu_iovec(queue->cmd);
931                         return 0;
932                 }
933                 /* send back R2T */
934                 nvmet_tcp_queue_response(&queue->cmd->req);
935                 goto out;
936         }
937
938         nvmet_req_execute(&queue->cmd->req);
939 out:
940         nvmet_prepare_receive_pdu(queue);
941         return ret;
942 }
943
944 static const u8 nvme_tcp_pdu_sizes[] = {
945         [nvme_tcp_icreq]        = sizeof(struct nvme_tcp_icreq_pdu),
946         [nvme_tcp_cmd]          = sizeof(struct nvme_tcp_cmd_pdu),
947         [nvme_tcp_h2c_data]     = sizeof(struct nvme_tcp_data_pdu),
948 };
949
950 static inline u8 nvmet_tcp_pdu_size(u8 type)
951 {
952         size_t idx = type;
953
954         return (idx < ARRAY_SIZE(nvme_tcp_pdu_sizes) &&
955                 nvme_tcp_pdu_sizes[idx]) ?
956                         nvme_tcp_pdu_sizes[idx] : 0;
957 }
958
959 static inline bool nvmet_tcp_pdu_valid(u8 type)
960 {
961         switch (type) {
962         case nvme_tcp_icreq:
963         case nvme_tcp_cmd:
964         case nvme_tcp_h2c_data:
965                 /* fallthru */
966                 return true;
967         }
968
969         return false;
970 }
971
972 static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue)
973 {
974         struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
975         int len;
976         struct kvec iov;
977         struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
978
979 recv:
980         iov.iov_base = (void *)&queue->pdu + queue->offset;
981         iov.iov_len = queue->left;
982         len = kernel_recvmsg(queue->sock, &msg, &iov, 1,
983                         iov.iov_len, msg.msg_flags);
984         if (unlikely(len < 0))
985                 return len;
986
987         queue->offset += len;
988         queue->left -= len;
989         if (queue->left)
990                 return -EAGAIN;
991
992         if (queue->offset == sizeof(struct nvme_tcp_hdr)) {
993                 u8 hdgst = nvmet_tcp_hdgst_len(queue);
994
995                 if (unlikely(!nvmet_tcp_pdu_valid(hdr->type))) {
996                         pr_err("unexpected pdu type %d\n", hdr->type);
997                         nvmet_tcp_fatal_error(queue);
998                         return -EIO;
999                 }
1000
1001                 if (unlikely(hdr->hlen != nvmet_tcp_pdu_size(hdr->type))) {
1002                         pr_err("pdu %d bad hlen %d\n", hdr->type, hdr->hlen);
1003                         return -EIO;
1004                 }
1005
1006                 queue->left = hdr->hlen - queue->offset + hdgst;
1007                 goto recv;
1008         }
1009
1010         if (queue->hdr_digest &&
1011             nvmet_tcp_verify_hdgst(queue, &queue->pdu, hdr->hlen)) {
1012                 nvmet_tcp_fatal_error(queue); /* fatal */
1013                 return -EPROTO;
1014         }
1015
1016         if (queue->data_digest &&
1017             nvmet_tcp_check_ddgst(queue, &queue->pdu)) {
1018                 nvmet_tcp_fatal_error(queue); /* fatal */
1019                 return -EPROTO;
1020         }
1021
1022         return nvmet_tcp_done_recv_pdu(queue);
1023 }
1024
1025 static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd)
1026 {
1027         struct nvmet_tcp_queue *queue = cmd->queue;
1028
1029         nvmet_tcp_ddgst(queue->rcv_hash, cmd);
1030         queue->offset = 0;
1031         queue->left = NVME_TCP_DIGEST_LENGTH;
1032         queue->rcv_state = NVMET_TCP_RECV_DDGST;
1033 }
1034
1035 static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue)
1036 {
1037         struct nvmet_tcp_cmd  *cmd = queue->cmd;
1038         int ret;
1039
1040         while (msg_data_left(&cmd->recv_msg)) {
1041                 ret = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg,
1042                         cmd->recv_msg.msg_flags);
1043                 if (ret <= 0)
1044                         return ret;
1045
1046                 cmd->pdu_recv += ret;
1047                 cmd->rbytes_done += ret;
1048         }
1049
1050         nvmet_tcp_unmap_pdu_iovec(cmd);
1051
1052         if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
1053             cmd->rbytes_done == cmd->req.transfer_len) {
1054                 if (queue->data_digest) {
1055                         nvmet_tcp_prep_recv_ddgst(cmd);
1056                         return 0;
1057                 }
1058                 nvmet_req_execute(&cmd->req);
1059         }
1060
1061         nvmet_prepare_receive_pdu(queue);
1062         return 0;
1063 }
1064
1065 static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue)
1066 {
1067         struct nvmet_tcp_cmd *cmd = queue->cmd;
1068         int ret;
1069         struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
1070         struct kvec iov = {
1071                 .iov_base = (void *)&cmd->recv_ddgst + queue->offset,
1072                 .iov_len = queue->left
1073         };
1074
1075         ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1076                         iov.iov_len, msg.msg_flags);
1077         if (unlikely(ret < 0))
1078                 return ret;
1079
1080         queue->offset += ret;
1081         queue->left -= ret;
1082         if (queue->left)
1083                 return -EAGAIN;
1084
1085         if (queue->data_digest && cmd->exp_ddgst != cmd->recv_ddgst) {
1086                 pr_err("queue %d: cmd %d pdu (%d) data digest error: recv %#x expected %#x\n",
1087                         queue->idx, cmd->req.cmd->common.command_id,
1088                         queue->pdu.cmd.hdr.type, le32_to_cpu(cmd->recv_ddgst),
1089                         le32_to_cpu(cmd->exp_ddgst));
1090                 nvmet_tcp_finish_cmd(cmd);
1091                 nvmet_tcp_fatal_error(queue);
1092                 ret = -EPROTO;
1093                 goto out;
1094         }
1095
1096         if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
1097             cmd->rbytes_done == cmd->req.transfer_len)
1098                 nvmet_req_execute(&cmd->req);
1099         ret = 0;
1100 out:
1101         nvmet_prepare_receive_pdu(queue);
1102         return ret;
1103 }
1104
1105 static int nvmet_tcp_try_recv_one(struct nvmet_tcp_queue *queue)
1106 {
1107         int result = 0;
1108
1109         if (unlikely(queue->rcv_state == NVMET_TCP_RECV_ERR))
1110                 return 0;
1111
1112         if (queue->rcv_state == NVMET_TCP_RECV_PDU) {
1113                 result = nvmet_tcp_try_recv_pdu(queue);
1114                 if (result != 0)
1115                         goto done_recv;
1116         }
1117
1118         if (queue->rcv_state == NVMET_TCP_RECV_DATA) {
1119                 result = nvmet_tcp_try_recv_data(queue);
1120                 if (result != 0)
1121                         goto done_recv;
1122         }
1123
1124         if (queue->rcv_state == NVMET_TCP_RECV_DDGST) {
1125                 result = nvmet_tcp_try_recv_ddgst(queue);
1126                 if (result != 0)
1127                         goto done_recv;
1128         }
1129
1130 done_recv:
1131         if (result < 0) {
1132                 if (result == -EAGAIN)
1133                         return 0;
1134                 return result;
1135         }
1136         return 1;
1137 }
1138
1139 static int nvmet_tcp_try_recv(struct nvmet_tcp_queue *queue,
1140                 int budget, int *recvs)
1141 {
1142         int i, ret = 0;
1143
1144         for (i = 0; i < budget; i++) {
1145                 ret = nvmet_tcp_try_recv_one(queue);
1146                 if (ret <= 0)
1147                         break;
1148                 (*recvs)++;
1149         }
1150
1151         return ret;
1152 }
1153
1154 static void nvmet_tcp_schedule_release_queue(struct nvmet_tcp_queue *queue)
1155 {
1156         spin_lock(&queue->state_lock);
1157         if (queue->state != NVMET_TCP_Q_DISCONNECTING) {
1158                 queue->state = NVMET_TCP_Q_DISCONNECTING;
1159                 schedule_work(&queue->release_work);
1160         }
1161         spin_unlock(&queue->state_lock);
1162 }
1163
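/*
 * Main per-queue work: alternate between receiving from and sending to the
 * socket until no more progress can be made or NVMET_TCP_IO_WORK_BUDGET is
 * exhausted, in which case the work item requeues itself on the same CPU.
 */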
1164 static void nvmet_tcp_io_work(struct work_struct *w)
1165 {
1166         struct nvmet_tcp_queue *queue =
1167                 container_of(w, struct nvmet_tcp_queue, io_work);
1168         bool pending;
1169         int ret, ops = 0;
1170
1171         do {
1172                 pending = false;
1173
1174                 ret = nvmet_tcp_try_recv(queue, NVMET_TCP_RECV_BUDGET, &ops);
1175                 if (ret > 0) {
1176                         pending = true;
1177                 } else if (ret < 0) {
1178                         if (ret == -EPIPE || ret == -ECONNRESET)
1179                                 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1180                         else
1181                                 nvmet_tcp_fatal_error(queue);
1182                         return;
1183                 }
1184
1185                 ret = nvmet_tcp_try_send(queue, NVMET_TCP_SEND_BUDGET, &ops);
1186                 if (ret > 0) {
1187                         /* transmitted message/data */
1188                         pending = true;
1189                 } else if (ret < 0) {
1190                         if (ret == -EPIPE || ret == -ECONNRESET)
1191                                 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1192                         else
1193                                 nvmet_tcp_fatal_error(queue);
1194                         return;
1195                 }
1196
1197         } while (pending && ops < NVMET_TCP_IO_WORK_BUDGET);
1198
1199         /*
1200          * We exhausted our budget; requeue ourselves if work is still pending.
1201          */
1202         if (pending)
1203                 queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
1204 }
1205
1206 static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue,
1207                 struct nvmet_tcp_cmd *c)
1208 {
1209         u8 hdgst = nvmet_tcp_hdgst_len(queue);
1210
1211         c->queue = queue;
1212         c->req.port = queue->port->nport;
1213
1214         c->cmd_pdu = page_frag_alloc(&queue->pf_cache,
1215                         sizeof(*c->cmd_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1216         if (!c->cmd_pdu)
1217                 return -ENOMEM;
1218         c->req.cmd = &c->cmd_pdu->cmd;
1219
1220         c->rsp_pdu = page_frag_alloc(&queue->pf_cache,
1221                         sizeof(*c->rsp_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1222         if (!c->rsp_pdu)
1223                 goto out_free_cmd;
1224         c->req.cqe = &c->rsp_pdu->cqe;
1225
1226         c->data_pdu = page_frag_alloc(&queue->pf_cache,
1227                         sizeof(*c->data_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1228         if (!c->data_pdu)
1229                 goto out_free_rsp;
1230
1231         c->r2t_pdu = page_frag_alloc(&queue->pf_cache,
1232                         sizeof(*c->r2t_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1233         if (!c->r2t_pdu)
1234                 goto out_free_data;
1235
1236         c->recv_msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
1237
1238         list_add_tail(&c->entry, &queue->free_list);
1239
1240         return 0;
1241 out_free_data:
1242         page_frag_free(c->data_pdu);
1243 out_free_rsp:
1244         page_frag_free(c->rsp_pdu);
1245 out_free_cmd:
1246         page_frag_free(c->cmd_pdu);
1247         return -ENOMEM;
1248 }
1249
1250 static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c)
1251 {
1252         page_frag_free(c->r2t_pdu);
1253         page_frag_free(c->data_pdu);
1254         page_frag_free(c->rsp_pdu);
1255         page_frag_free(c->cmd_pdu);
1256 }
1257
1258 static int nvmet_tcp_alloc_cmds(struct nvmet_tcp_queue *queue)
1259 {
1260         struct nvmet_tcp_cmd *cmds;
1261         int i, ret = -EINVAL, nr_cmds = queue->nr_cmds;
1262
1263         cmds = kcalloc(nr_cmds, sizeof(struct nvmet_tcp_cmd), GFP_KERNEL);
1264         if (!cmds)
1265                 goto out;
1266
1267         for (i = 0; i < nr_cmds; i++) {
1268                 ret = nvmet_tcp_alloc_cmd(queue, cmds + i);
1269                 if (ret)
1270                         goto out_free;
1271         }
1272
1273         queue->cmds = cmds;
1274
1275         return 0;
1276 out_free:
1277         while (--i >= 0)
1278                 nvmet_tcp_free_cmd(cmds + i);
1279         kfree(cmds);
1280 out:
1281         return ret;
1282 }
1283
1284 static void nvmet_tcp_free_cmds(struct nvmet_tcp_queue *queue)
1285 {
1286         struct nvmet_tcp_cmd *cmds = queue->cmds;
1287         int i;
1288
1289         for (i = 0; i < queue->nr_cmds; i++)
1290                 nvmet_tcp_free_cmd(cmds + i);
1291
1292         nvmet_tcp_free_cmd(&queue->connect);
1293         kfree(cmds);
1294 }
1295
1296 static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue)
1297 {
1298         struct socket *sock = queue->sock;
1299
1300         write_lock_bh(&sock->sk->sk_callback_lock);
1301         sock->sk->sk_data_ready =  queue->data_ready;
1302         sock->sk->sk_state_change = queue->state_change;
1303         sock->sk->sk_write_space = queue->write_space;
1304         sock->sk->sk_user_data = NULL;
1305         write_unlock_bh(&sock->sk->sk_callback_lock);
1306 }
1307
1308 static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd)
1309 {
1310         nvmet_req_uninit(&cmd->req);
1311         nvmet_tcp_unmap_pdu_iovec(cmd);
1312         kfree(cmd->iov);
1313         if (cmd->req.sg_cnt)
1314                 sgl_free(cmd->req.sg);
1315 }
1316
1317 static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
1318 {
1319         struct nvmet_tcp_cmd *cmd = queue->cmds;
1320         int i;
1321
1322         for (i = 0; i < queue->nr_cmds; i++, cmd++) {
1323                 if (nvmet_tcp_need_data_in(cmd))
1324                         nvmet_tcp_finish_cmd(cmd);
1325         }
1326
1327         if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) {
1328                 /* failed in connect */
1329                 nvmet_tcp_finish_cmd(&queue->connect);
1330         }
1331 }
1332
1333 static void nvmet_tcp_release_queue_work(struct work_struct *w)
1334 {
1335         struct nvmet_tcp_queue *queue =
1336                 container_of(w, struct nvmet_tcp_queue, release_work);
1337
1338         mutex_lock(&nvmet_tcp_queue_mutex);
1339         list_del_init(&queue->queue_list);
1340         mutex_unlock(&nvmet_tcp_queue_mutex);
1341
1342         nvmet_tcp_restore_socket_callbacks(queue);
1343         flush_work(&queue->io_work);
1344
1345         nvmet_tcp_uninit_data_in_cmds(queue);
1346         nvmet_sq_destroy(&queue->nvme_sq);
1347         cancel_work_sync(&queue->io_work);
1348         sock_release(queue->sock);
1349         nvmet_tcp_free_cmds(queue);
1350         if (queue->hdr_digest || queue->data_digest)
1351                 nvmet_tcp_free_crypto(queue);
1352         ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);
1353
1354         kfree(queue);
1355 }
1356
1357 static void nvmet_tcp_data_ready(struct sock *sk)
1358 {
1359         struct nvmet_tcp_queue *queue;
1360
1361         read_lock_bh(&sk->sk_callback_lock);
1362         queue = sk->sk_user_data;
1363         if (likely(queue))
1364                 queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
1365         read_unlock_bh(&sk->sk_callback_lock);
1366 }
1367
1368 static void nvmet_tcp_write_space(struct sock *sk)
1369 {
1370         struct nvmet_tcp_queue *queue;
1371
1372         read_lock_bh(&sk->sk_callback_lock);
1373         queue = sk->sk_user_data;
1374         if (unlikely(!queue))
1375                 goto out;
1376
1377         if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
1378                 queue->write_space(sk);
1379                 goto out;
1380         }
1381
1382         if (sk_stream_is_writeable(sk)) {
1383                 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1384                 queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
1385         }
1386 out:
1387         read_unlock_bh(&sk->sk_callback_lock);
1388 }
1389
1390 static void nvmet_tcp_state_change(struct sock *sk)
1391 {
1392         struct nvmet_tcp_queue *queue;
1393
1394         write_lock_bh(&sk->sk_callback_lock);
1395         queue = sk->sk_user_data;
1396         if (!queue)
1397                 goto done;
1398
1399         switch (sk->sk_state) {
1400         case TCP_FIN_WAIT1:
1401         case TCP_CLOSE_WAIT:
1402         case TCP_CLOSE:
1403                 /* FALLTHRU */
1404                 sk->sk_user_data = NULL;
1405                 nvmet_tcp_schedule_release_queue(queue);
1406                 break;
1407         default:
1408                 pr_warn("queue %d unhandled state %d\n",
1409                         queue->idx, sk->sk_state);
1410         }
1411 done:
1412         write_unlock_bh(&sk->sk_callback_lock);
1413 }
1414
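/*
 * Take over the accepted socket: record the local/peer addresses, arrange for
 * an abortive close (SO_LINGER with a zero timeout) so stale data is not
 * transmitted after teardown, and install our sk callbacks while saving the
 * originals so nvmet_tcp_restore_socket_callbacks() can undo this on release.
 */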
1415 static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
1416 {
1417         struct socket *sock = queue->sock;
1418         struct linger sol = { .l_onoff = 1, .l_linger = 0 };
1419         int ret;
1420
1421         ret = kernel_getsockname(sock,
1422                 (struct sockaddr *)&queue->sockaddr);
1423         if (ret < 0)
1424                 return ret;
1425
1426         ret = kernel_getpeername(sock,
1427                 (struct sockaddr *)&queue->sockaddr_peer);
1428         if (ret < 0)
1429                 return ret;
1430
1431         /*
1432          * Cleanup whatever is sitting in the TCP transmit queue on socket
1433          * close. This is done to prevent stale data from being sent should
1434          * the network connection be restored before TCP times out.
1435          */
1436         ret = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER,
1437                         (char *)&sol, sizeof(sol));
1438         if (ret)
1439                 return ret;
1440
1441         write_lock_bh(&sock->sk->sk_callback_lock);
1442         sock->sk->sk_user_data = queue;
1443         queue->data_ready = sock->sk->sk_data_ready;
1444         sock->sk->sk_data_ready = nvmet_tcp_data_ready;
1445         queue->state_change = sock->sk->sk_state_change;
1446         sock->sk->sk_state_change = nvmet_tcp_state_change;
1447         queue->write_space = sock->sk->sk_write_space;
1448         sock->sk->sk_write_space = nvmet_tcp_write_space;
1449         write_unlock_bh(&sock->sk->sk_callback_lock);
1450
1451         return 0;
1452 }
1453
1454 static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
1455                 struct socket *newsock)
1456 {
1457         struct nvmet_tcp_queue *queue;
1458         int ret;
1459
1460         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
1461         if (!queue)
1462                 return -ENOMEM;
1463
1464         INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work);
1465         INIT_WORK(&queue->io_work, nvmet_tcp_io_work);
1466         queue->sock = newsock;
1467         queue->port = port;
1468         queue->nr_cmds = 0;
1469         spin_lock_init(&queue->state_lock);
1470         queue->state = NVMET_TCP_Q_CONNECTING;
1471         INIT_LIST_HEAD(&queue->free_list);
1472         init_llist_head(&queue->resp_list);
1473         INIT_LIST_HEAD(&queue->resp_send_list);
1474
1475         queue->idx = ida_simple_get(&nvmet_tcp_queue_ida, 0, 0, GFP_KERNEL);
1476         if (queue->idx < 0) {
1477                 ret = queue->idx;
1478                 goto out_free_queue;
1479         }
1480
1481         ret = nvmet_tcp_alloc_cmd(queue, &queue->connect);
1482         if (ret)
1483                 goto out_ida_remove;
1484
1485         ret = nvmet_sq_init(&queue->nvme_sq);
1486         if (ret)
1487                 goto out_free_connect;
1488
1489         port->last_cpu = cpumask_next_wrap(port->last_cpu,
1490                                 cpu_online_mask, -1, false);
1491         queue->cpu = port->last_cpu;
1492         nvmet_prepare_receive_pdu(queue);
1493
1494         mutex_lock(&nvmet_tcp_queue_mutex);
1495         list_add_tail(&queue->queue_list, &nvmet_tcp_queue_list);
1496         mutex_unlock(&nvmet_tcp_queue_mutex);
1497
1498         ret = nvmet_tcp_set_queue_sock(queue);
1499         if (ret)
1500                 goto out_destroy_sq;
1501
1502         queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
1503
1504         return 0;
1505 out_destroy_sq:
1506         mutex_lock(&nvmet_tcp_queue_mutex);
1507         list_del_init(&queue->queue_list);
1508         mutex_unlock(&nvmet_tcp_queue_mutex);
1509         nvmet_sq_destroy(&queue->nvme_sq);
1510 out_free_connect:
1511         nvmet_tcp_free_cmd(&queue->connect);
1512 out_ida_remove:
1513         ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);
1514 out_free_queue:
1515         kfree(queue);
1516         return ret;
1517 }
1518
1519 static void nvmet_tcp_accept_work(struct work_struct *w)
1520 {
1521         struct nvmet_tcp_port *port =
1522                 container_of(w, struct nvmet_tcp_port, accept_work);
1523         struct socket *newsock;
1524         int ret;
1525
1526         while (true) {
1527                 ret = kernel_accept(port->sock, &newsock, O_NONBLOCK);
1528                 if (ret < 0) {
1529                         if (ret != -EAGAIN)
1530                                 pr_warn("failed to accept err=%d\n", ret);
1531                         return;
1532                 }
1533                 ret = nvmet_tcp_alloc_queue(port, newsock);
1534                 if (ret) {
1535                         pr_err("failed to allocate queue\n");
1536                         sock_release(newsock);
1537                 }
1538         }
1539 }
1540
1541 static void nvmet_tcp_listen_data_ready(struct sock *sk)
1542 {
1543         struct nvmet_tcp_port *port;
1544
1545         read_lock_bh(&sk->sk_callback_lock);
1546         port = sk->sk_user_data;
1547         if (!port)
1548                 goto out;
1549
1550         if (sk->sk_state == TCP_LISTEN)
1551                 schedule_work(&port->accept_work);
1552 out:
1553         read_unlock_bh(&sk->sk_callback_lock);
1554 }
1555
1556 static int nvmet_tcp_add_port(struct nvmet_port *nport)
1557 {
1558         struct nvmet_tcp_port *port;
1559         __kernel_sa_family_t af;
1560         int opt, ret;
1561
1562         port = kzalloc(sizeof(*port), GFP_KERNEL);
1563         if (!port)
1564                 return -ENOMEM;
1565
1566         switch (nport->disc_addr.adrfam) {
1567         case NVMF_ADDR_FAMILY_IP4:
1568                 af = AF_INET;
1569                 break;
1570         case NVMF_ADDR_FAMILY_IP6:
1571                 af = AF_INET6;
1572                 break;
1573         default:
1574                 pr_err("address family %d not supported\n",
1575                                 nport->disc_addr.adrfam);
1576                 ret = -EINVAL;
1577                 goto err_port;
1578         }
1579
1580         ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr,
1581                         nport->disc_addr.trsvcid, &port->addr);
1582         if (ret) {
1583                 pr_err("malformed ip/port passed: %s:%s\n",
1584                         nport->disc_addr.traddr, nport->disc_addr.trsvcid);
1585                 goto err_port;
1586         }
1587
1588         port->nport = nport;
1589         port->last_cpu = -1;
1590         INIT_WORK(&port->accept_work, nvmet_tcp_accept_work);
1591         if (port->nport->inline_data_size < 0)
1592                 port->nport->inline_data_size = NVMET_TCP_DEF_INLINE_DATA_SIZE;
1593
1594         ret = sock_create(port->addr.ss_family, SOCK_STREAM,
1595                                 IPPROTO_TCP, &port->sock);
1596         if (ret) {
1597                 pr_err("failed to create a socket\n");
1598                 goto err_port;
1599         }
1600
1601         port->sock->sk->sk_user_data = port;
1602         port->data_ready = port->sock->sk->sk_data_ready;
1603         port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready;
1604
1605         opt = 1;
1606         ret = kernel_setsockopt(port->sock, IPPROTO_TCP,
1607                         TCP_NODELAY, (char *)&opt, sizeof(opt));
1608         if (ret) {
1609                 pr_err("failed to set TCP_NODELAY sock opt %d\n", ret);
1610                 goto err_sock;
1611         }
1612
1613         ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_REUSEADDR,
1614                         (char *)&opt, sizeof(opt));
1615         if (ret) {
1616                 pr_err("failed to set SO_REUSEADDR sock opt %d\n", ret);
1617                 goto err_sock;
1618         }
1619
1620         ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr,
1621                         sizeof(port->addr));
1622         if (ret) {
1623                 pr_err("failed to bind port socket %d\n", ret);
1624                 goto err_sock;
1625         }
1626
1627         ret = kernel_listen(port->sock, 128);
1628         if (ret) {
1629                 pr_err("failed to listen %d on port sock\n", ret);
1630                 goto err_sock;
1631         }
1632
1633         nport->priv = port;
1634         pr_info("enabling port %d (%pISpc)\n",
1635                 le16_to_cpu(nport->disc_addr.portid), &port->addr);
1636
1637         return 0;
1638
1639 err_sock:
1640         sock_release(port->sock);
1641 err_port:
1642         kfree(port);
1643         return ret;
1644 }
1645
1646 static void nvmet_tcp_remove_port(struct nvmet_port *nport)
1647 {
1648         struct nvmet_tcp_port *port = nport->priv;
1649
1650         write_lock_bh(&port->sock->sk->sk_callback_lock);
1651         port->sock->sk->sk_data_ready = port->data_ready;
1652         port->sock->sk->sk_user_data = NULL;
1653         write_unlock_bh(&port->sock->sk->sk_callback_lock);
1654         cancel_work_sync(&port->accept_work);
1655
1656         sock_release(port->sock);
1657         kfree(port);
1658 }
1659
1660 static void nvmet_tcp_delete_ctrl(struct nvmet_ctrl *ctrl)
1661 {
1662         struct nvmet_tcp_queue *queue;
1663
1664         mutex_lock(&nvmet_tcp_queue_mutex);
1665         list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
1666                 if (queue->nvme_sq.ctrl == ctrl)
1667                         kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1668         mutex_unlock(&nvmet_tcp_queue_mutex);
1669 }
1670
1671 static u16 nvmet_tcp_install_queue(struct nvmet_sq *sq)
1672 {
1673         struct nvmet_tcp_queue *queue =
1674                 container_of(sq, struct nvmet_tcp_queue, nvme_sq);
1675
1676         if (sq->qid == 0) {
1677                 /* Let inflight controller teardown complete */
1678                 flush_scheduled_work();
1679         }
1680
1681         queue->nr_cmds = sq->size * 2;
1682         if (nvmet_tcp_alloc_cmds(queue))
1683                 return NVME_SC_INTERNAL;
1684         return 0;
1685 }
1686
1687 static void nvmet_tcp_disc_port_addr(struct nvmet_req *req,
1688                 struct nvmet_port *nport, char *traddr)
1689 {
1690         struct nvmet_tcp_port *port = nport->priv;
1691
1692         if (inet_addr_is_any((struct sockaddr *)&port->addr)) {
1693                 struct nvmet_tcp_cmd *cmd =
1694                         container_of(req, struct nvmet_tcp_cmd, req);
1695                 struct nvmet_tcp_queue *queue = cmd->queue;
1696
1697                 sprintf(traddr, "%pISc", (struct sockaddr *)&queue->sockaddr);
1698         } else {
1699                 memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE);
1700         }
1701 }
1702
1703 static struct nvmet_fabrics_ops nvmet_tcp_ops = {
1704         .owner                  = THIS_MODULE,
1705         .type                   = NVMF_TRTYPE_TCP,
1706         .msdbd                  = 1,
1707         .has_keyed_sgls         = 0,
1708         .add_port               = nvmet_tcp_add_port,
1709         .remove_port            = nvmet_tcp_remove_port,
1710         .queue_response         = nvmet_tcp_queue_response,
1711         .delete_ctrl            = nvmet_tcp_delete_ctrl,
1712         .install_queue          = nvmet_tcp_install_queue,
1713         .disc_traddr            = nvmet_tcp_disc_port_addr,
1714 };
1715
1716 static int __init nvmet_tcp_init(void)
1717 {
1718         int ret;
1719
1720         nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq", WQ_HIGHPRI, 0);
1721         if (!nvmet_tcp_wq)
1722                 return -ENOMEM;
1723
1724         ret = nvmet_register_transport(&nvmet_tcp_ops);
1725         if (ret)
1726                 goto err;
1727
1728         return 0;
1729 err:
1730         destroy_workqueue(nvmet_tcp_wq);
1731         return ret;
1732 }
1733
1734 static void __exit nvmet_tcp_exit(void)
1735 {
1736         struct nvmet_tcp_queue *queue;
1737
1738         nvmet_unregister_transport(&nvmet_tcp_ops);
1739
1740         flush_scheduled_work();
1741         mutex_lock(&nvmet_tcp_queue_mutex);
1742         list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
1743                 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1744         mutex_unlock(&nvmet_tcp_queue_mutex);
1745         flush_scheduled_work();
1746
1747         destroy_workqueue(nvmet_tcp_wq);
1748 }
1749
1750 module_init(nvmet_tcp_init);
1751 module_exit(nvmet_tcp_exit);
1752
1753 MODULE_LICENSE("GPL v2");
1754 MODULE_ALIAS("nvmet-transport-3"); /* 3 == NVMF_TRTYPE_TCP */