net/smc/af_smc.c
1 /*
2  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
3  *
4  *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
5  *  applies to SOCK_STREAM sockets only
6  *  offers an alternative communication option for TCP-protocol sockets
7  *  applicable with RoCE-cards only
8  *
9  *  Initial restrictions:
10  *    - support for alternate links postponed
11  *
12  *  Copyright IBM Corp. 2016, 2018
13  *
14  *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
15  *              based on prototype from Frank Blaschka
16  */
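/*
 * Usage sketch (not part of this file): a minimal user-space client, assuming
 * the AF_SMC address family and the SMCPROTO_SMC / SMCPROTO_SMC6 protocol
 * constants exported via the kernel headers. SMC sockets keep the AF_INET(6)
 * address format, so the only difference from TCP is the socket() call:
 *
 *	int fd = socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC);
 *	struct sockaddr_in sa = { .sin_family = AF_INET,
 *				  .sin_port = htons(12345) };
 *	inet_pton(AF_INET, "192.0.2.1", &sa.sin_addr);
 *	connect(fd, (struct sockaddr *)&sa, sizeof(sa));
 *
 * If the peer turns out not to be SMC capable, the connection transparently
 * falls back to plain TCP (see smc_connect_fallback() below).
 */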
17
18 #define KMSG_COMPONENT "smc"
19 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
20
21 #include <linux/module.h>
22 #include <linux/socket.h>
23 #include <linux/workqueue.h>
24 #include <linux/in.h>
25 #include <linux/sched/signal.h>
26 #include <linux/if_vlan.h>
27
28 #include <net/sock.h>
29 #include <net/tcp.h>
30 #include <net/smc.h>
31 #include <asm/ioctls.h>
32
33 #include <net/net_namespace.h>
34 #include <net/netns/generic.h>
35 #include "smc_netns.h"
36
37 #include "smc.h"
38 #include "smc_clc.h"
39 #include "smc_llc.h"
40 #include "smc_cdc.h"
41 #include "smc_core.h"
42 #include "smc_ib.h"
43 #include "smc_ism.h"
44 #include "smc_pnet.h"
45 #include "smc_tx.h"
46 #include "smc_rx.h"
47 #include "smc_close.h"
48
49 static DEFINE_MUTEX(smc_server_lgr_pending);    /* serialize link group
50                                                  * creation on server
51                                                  */
52 static DEFINE_MUTEX(smc_client_lgr_pending);    /* serialize link group
53                                                  * creation on client
54                                                  */
55
56 static void smc_tcp_listen_work(struct work_struct *);
57 static void smc_connect_work(struct work_struct *);
58
59 static void smc_set_keepalive(struct sock *sk, int val)
60 {
61         struct smc_sock *smc = smc_sk(sk);
62
63         smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
64 }
65
66 static struct smc_hashinfo smc_v4_hashinfo = {
67         .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
68 };
69
70 static struct smc_hashinfo smc_v6_hashinfo = {
71         .lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
72 };
73
74 int smc_hash_sk(struct sock *sk)
75 {
76         struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
77         struct hlist_head *head;
78
79         head = &h->ht;
80
81         write_lock_bh(&h->lock);
82         sk_add_node(sk, head);
83         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
84         write_unlock_bh(&h->lock);
85
86         return 0;
87 }
88 EXPORT_SYMBOL_GPL(smc_hash_sk);
89
90 void smc_unhash_sk(struct sock *sk)
91 {
92         struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
93
94         write_lock_bh(&h->lock);
95         if (sk_del_node_init(sk))
96                 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
97         write_unlock_bh(&h->lock);
98 }
99 EXPORT_SYMBOL_GPL(smc_unhash_sk);
100
101 struct proto smc_proto = {
102         .name           = "SMC",
103         .owner          = THIS_MODULE,
104         .keepalive      = smc_set_keepalive,
105         .hash           = smc_hash_sk,
106         .unhash         = smc_unhash_sk,
107         .obj_size       = sizeof(struct smc_sock),
108         .h.smc_hash     = &smc_v4_hashinfo,
109         .slab_flags     = SLAB_TYPESAFE_BY_RCU,
110 };
111 EXPORT_SYMBOL_GPL(smc_proto);
112
113 struct proto smc_proto6 = {
114         .name           = "SMC6",
115         .owner          = THIS_MODULE,
116         .keepalive      = smc_set_keepalive,
117         .hash           = smc_hash_sk,
118         .unhash         = smc_unhash_sk,
119         .obj_size       = sizeof(struct smc_sock),
120         .h.smc_hash     = &smc_v6_hashinfo,
121         .slab_flags     = SLAB_TYPESAFE_BY_RCU,
122 };
123 EXPORT_SYMBOL_GPL(smc_proto6);
124
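/* close an smc socket from user space: abort a dangling non-blocking connect,
 * actively close the SMC connection (or shut down the TCP clcsock when the
 * socket fell back or is listening), release the internal clcsock and drop
 * the final socket reference
 */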
125 static int smc_release(struct socket *sock)
126 {
127         struct sock *sk = sock->sk;
128         struct smc_sock *smc;
129         int rc = 0;
130
131         if (!sk)
132                 goto out;
133
134         smc = smc_sk(sk);
135
136         /* cleanup for a dangling non-blocking connect */
137         if (smc->connect_nonblock && sk->sk_state == SMC_INIT)
138                 tcp_abort(smc->clcsock->sk, ECONNABORTED);
139         flush_work(&smc->connect_work);
140
141         if (sk->sk_state == SMC_LISTEN)
142                 /* smc_close_non_accepted() is called and acquires
143                  * sock lock for child sockets again
144                  */
145                 lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
146         else
147                 lock_sock(sk);
148
149         if (!smc->use_fallback) {
150                 rc = smc_close_active(smc);
151                 sock_set_flag(sk, SOCK_DEAD);
152                 sk->sk_shutdown |= SHUTDOWN_MASK;
153         } else {
154                 if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT)
155                         sock_put(sk); /* passive closing */
156                 if (sk->sk_state == SMC_LISTEN) {
157                         /* wake up clcsock accept */
158                         rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
159                 }
160                 sk->sk_state = SMC_CLOSED;
161                 sk->sk_state_change(sk);
162         }
163
164         sk->sk_prot->unhash(sk);
165
166         if (sk->sk_state == SMC_CLOSED) {
167                 if (smc->clcsock) {
168                         mutex_lock(&smc->clcsock_release_lock);
169                         sock_release(smc->clcsock);
170                         smc->clcsock = NULL;
171                         mutex_unlock(&smc->clcsock_release_lock);
172                 }
173                 if (!smc->use_fallback)
174                         smc_conn_free(&smc->conn);
175         }
176
177         /* detach socket */
178         sock_orphan(sk);
179         sock->sk = NULL;
180         release_sock(sk);
181
182         sock_put(sk); /* final sock_put */
183 out:
184         return rc;
185 }
186
187 static void smc_destruct(struct sock *sk)
188 {
189         if (sk->sk_state != SMC_CLOSED)
190                 return;
191         if (!sock_flag(sk, SOCK_DEAD))
192                 return;
193
194         sk_refcnt_debug_dec(sk);
195 }
196
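/* allocate and initialize a new smc sock: select the IPv4 or IPv6 proto,
 * set up the connect/listen work items, accept queue and locks, and hash
 * the sock
 */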
197 static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
198                                    int protocol)
199 {
200         struct smc_sock *smc;
201         struct proto *prot;
202         struct sock *sk;
203
204         prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
205         sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
206         if (!sk)
207                 return NULL;
208
209         sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
210         sk->sk_state = SMC_INIT;
211         sk->sk_destruct = smc_destruct;
212         sk->sk_protocol = protocol;
213         smc = smc_sk(sk);
214         INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
215         INIT_WORK(&smc->connect_work, smc_connect_work);
216         INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
217         INIT_LIST_HEAD(&smc->accept_q);
218         spin_lock_init(&smc->accept_q_lock);
219         spin_lock_init(&smc->conn.send_lock);
220         sk->sk_prot->hash(sk);
221         sk_refcnt_debug_inc(sk);
222         mutex_init(&smc->clcsock_release_lock);
223
224         return sk;
225 }
226
227 static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
228                     int addr_len)
229 {
230         struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
231         struct sock *sk = sock->sk;
232         struct smc_sock *smc;
233         int rc;
234
235         smc = smc_sk(sk);
236
237         /* replicate tests from inet_bind(), to be safe wrt. future changes */
238         rc = -EINVAL;
239         if (addr_len < sizeof(struct sockaddr_in))
240                 goto out;
241
242         rc = -EAFNOSUPPORT;
243         if (addr->sin_family != AF_INET &&
244             addr->sin_family != AF_INET6 &&
245             addr->sin_family != AF_UNSPEC)
246                 goto out;
247         /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
248         if (addr->sin_family == AF_UNSPEC &&
249             addr->sin_addr.s_addr != htonl(INADDR_ANY))
250                 goto out;
251
252         lock_sock(sk);
253
254         /* Check if socket is already active */
255         rc = -EINVAL;
256         if (sk->sk_state != SMC_INIT)
257                 goto out_rel;
258
259         smc->clcsock->sk->sk_reuse = sk->sk_reuse;
260         rc = kernel_bind(smc->clcsock, uaddr, addr_len);
261
262 out_rel:
263         release_sock(sk);
264 out:
265         return rc;
266 }
267
268 static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
269                                    unsigned long mask)
270 {
271         /* options we don't get control of via setsockopt */
272         nsk->sk_type = osk->sk_type;
273         nsk->sk_sndbuf = osk->sk_sndbuf;
274         nsk->sk_rcvbuf = osk->sk_rcvbuf;
275         nsk->sk_sndtimeo = osk->sk_sndtimeo;
276         nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
277         nsk->sk_mark = osk->sk_mark;
278         nsk->sk_priority = osk->sk_priority;
279         nsk->sk_rcvlowat = osk->sk_rcvlowat;
280         nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
281         nsk->sk_err = osk->sk_err;
282
283         nsk->sk_flags &= ~mask;
284         nsk->sk_flags |= osk->sk_flags & mask;
285 }
286
287 #define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
288                              (1UL << SOCK_KEEPOPEN) | \
289                              (1UL << SOCK_LINGER) | \
290                              (1UL << SOCK_BROADCAST) | \
291                              (1UL << SOCK_TIMESTAMP) | \
292                              (1UL << SOCK_DBG) | \
293                              (1UL << SOCK_RCVTSTAMP) | \
294                              (1UL << SOCK_RCVTSTAMPNS) | \
295                              (1UL << SOCK_LOCALROUTE) | \
296                              (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
297                              (1UL << SOCK_RXQ_OVFL) | \
298                              (1UL << SOCK_WIFI_STATUS) | \
299                              (1UL << SOCK_NOFCS) | \
300                              (1UL << SOCK_FILTER_LOCKED) | \
301                              (1UL << SOCK_TSTAMP_NEW))
302 /* copy only relevant settings and flags of SOL_SOCKET level from smc to
303  * clc socket (since smc is not called for these options from net/core)
304  */
305 static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
306 {
307         smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
308 }
309
310 #define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
311                              (1UL << SOCK_KEEPOPEN) | \
312                              (1UL << SOCK_LINGER) | \
313                              (1UL << SOCK_DBG))
314 /* copy only settings and flags relevant for smc from clc to smc socket */
315 static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
316 {
317         smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
318 }
319
320 /* register a new rmb, send confirm_rkey msg to register with peer */
321 static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc,
322                        bool conf_rkey)
323 {
324         if (!rmb_desc->wr_reg) {
325                 /* register memory region for new rmb */
326                 if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) {
327                         rmb_desc->regerr = 1;
328                         return -EFAULT;
329                 }
330                 rmb_desc->wr_reg = 1;
331         }
332         if (!conf_rkey)
333                 return 0;
334         /* exchange confirm_rkey msg with peer */
335         if (smc_llc_do_confirm_rkey(link, rmb_desc)) {
336                 rmb_desc->regerr = 1;
337                 return -EFAULT;
338         }
339         return 0;
340 }
341
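/* client side of the one-time LLC handshake on a new link: wait for the
 * server's CONFIRM LINK request, register the local RMB, send the CONFIRM
 * LINK response, and answer the subsequent ADD LINK request (declined, as
 * only a single link is supported so far)
 */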
342 static int smc_clnt_conf_first_link(struct smc_sock *smc)
343 {
344         struct net *net = sock_net(smc->clcsock->sk);
345         struct smc_link_group *lgr = smc->conn.lgr;
346         struct smc_link *link;
347         int rest;
348         int rc;
349
350         link = &lgr->lnk[SMC_SINGLE_LINK];
351         /* receive CONFIRM LINK request from server over RoCE fabric */
352         rest = wait_for_completion_interruptible_timeout(
353                 &link->llc_confirm,
354                 SMC_LLC_WAIT_FIRST_TIME);
355         if (rest <= 0) {
356                 struct smc_clc_msg_decline dclc;
357
358                 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
359                                       SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
360                 return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
361         }
362
363         if (link->llc_confirm_rc)
364                 return SMC_CLC_DECL_RMBE_EC;
365
366         rc = smc_ib_modify_qp_rts(link);
367         if (rc)
368                 return SMC_CLC_DECL_ERR_RDYLNK;
369
370         smc_wr_remember_qp_attr(link);
371
372         if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
373                 return SMC_CLC_DECL_ERR_REGRMB;
374
375         /* send CONFIRM LINK response over RoCE fabric */
376         rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP);
377         if (rc < 0)
378                 return SMC_CLC_DECL_TIMEOUT_CL;
379
380         /* receive ADD LINK request from server over RoCE fabric */
381         rest = wait_for_completion_interruptible_timeout(&link->llc_add,
382                                                          SMC_LLC_WAIT_TIME);
383         if (rest <= 0) {
384                 struct smc_clc_msg_decline dclc;
385
386                 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
387                                       SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
388                 return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc;
389         }
390
391         /* send add link reject message, only one link supported for now */
392         rc = smc_llc_send_add_link(link,
393                                    link->smcibdev->mac[link->ibport - 1],
394                                    link->gid, SMC_LLC_RESP);
395         if (rc < 0)
396                 return SMC_CLC_DECL_TIMEOUT_AL;
397
398         smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
399
400         return 0;
401 }
402
403 static void smcr_conn_save_peer_info(struct smc_sock *smc,
404                                      struct smc_clc_msg_accept_confirm *clc)
405 {
406         int bufsize = smc_uncompress_bufsize(clc->rmbe_size);
407
408         smc->conn.peer_rmbe_idx = clc->rmbe_idx;
409         smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
410         smc->conn.peer_rmbe_size = bufsize;
411         atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
412         smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
413 }
414
415 static void smcd_conn_save_peer_info(struct smc_sock *smc,
416                                      struct smc_clc_msg_accept_confirm *clc)
417 {
418         int bufsize = smc_uncompress_bufsize(clc->dmbe_size);
419
420         smc->conn.peer_rmbe_idx = clc->dmbe_idx;
421         smc->conn.peer_token = clc->token;
422         /* msg header takes up space in the buffer */
423         smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
424         atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
425         smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx;
426 }
427
428 static void smc_conn_save_peer_info(struct smc_sock *smc,
429                                     struct smc_clc_msg_accept_confirm *clc)
430 {
431         if (smc->conn.lgr->is_smcd)
432                 smcd_conn_save_peer_info(smc, clc);
433         else
434                 smcr_conn_save_peer_info(smc, clc);
435 }
436
437 static void smc_link_save_peer_info(struct smc_link *link,
438                                     struct smc_clc_msg_accept_confirm *clc)
439 {
440         link->peer_qpn = ntoh24(clc->qpn);
441         memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
442         memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
443         link->peer_psn = ntoh24(clc->psn);
444         link->peer_mtu = clc->qp_mtu;
445 }
446
447 /* fall back during connect */
448 static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
449 {
450         smc->use_fallback = true;
451         smc->fallback_rsn = reason_code;
452         smc_copy_sock_settings_to_clc(smc);
453         smc->connect_nonblock = 0;
454         if (smc->sk.sk_state == SMC_INIT)
455                 smc->sk.sk_state = SMC_ACTIVE;
456         return 0;
457 }
458
459 /* decline and fall back during connect */
460 static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
461 {
462         int rc;
463
464         if (reason_code < 0) { /* error, fallback is not possible */
465                 if (smc->sk.sk_state == SMC_INIT)
466                         sock_put(&smc->sk); /* passive closing */
467                 return reason_code;
468         }
469         if (reason_code != SMC_CLC_DECL_PEERDECL) {
470                 rc = smc_clc_send_decline(smc, reason_code);
471                 if (rc < 0) {
472                         if (smc->sk.sk_state == SMC_INIT)
473                                 sock_put(&smc->sk); /* passive closing */
474                         return rc;
475                 }
476         }
477         return smc_connect_fallback(smc, reason_code);
478 }
479
480 /* abort connecting */
481 static int smc_connect_abort(struct smc_sock *smc, int reason_code,
482                              int local_contact)
483 {
484         if (local_contact == SMC_FIRST_CONTACT)
485                 smc_lgr_forget(smc->conn.lgr);
486         if (smc->conn.lgr->is_smcd)
487                 /* there is only one lgr role for SMC-D; use server lock */
488                 mutex_unlock(&smc_server_lgr_pending);
489         else
490                 mutex_unlock(&smc_client_lgr_pending);
491
492         smc_conn_free(&smc->conn);
493         smc->connect_nonblock = 0;
494         return reason_code;
495 }
496
497 /* check if there is a rdma device available for this connection. */
498 /* called for connect and listen */
499 static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini)
500 {
501         /* PNET table look up: search active ib_device and port
502          * within same PNETID that also contains the ethernet device
503          * used for the internal TCP socket
504          */
505         smc_pnet_find_roce_resource(smc->clcsock->sk, ini);
506         if (!ini->ib_dev)
507                 return SMC_CLC_DECL_NOSMCRDEV;
508         return 0;
509 }
510
511 /* check if there is an ISM device available for this connection. */
512 /* called for connect and listen */
513 static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini)
514 {
515         /* Find ISM device with same PNETID as connecting interface  */
516         smc_pnet_find_ism_resource(smc->clcsock->sk, ini);
517         if (!ini->ism_dev)
518                 return SMC_CLC_DECL_NOSMCDDEV;
519         return 0;
520 }
521
522 /* Check for VLAN ID and register it on ISM device just for CLC handshake */
523 static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
524                                       struct smc_init_info *ini)
525 {
526         if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev, ini->vlan_id))
527                 return SMC_CLC_DECL_ISMVLANERR;
528         return 0;
529 }
530
531 /* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is
532  * used, the VLAN ID will be registered again during the connection setup.
533  */
534 static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd,
535                                         struct smc_init_info *ini)
536 {
537         if (!is_smcd)
538                 return 0;
539         if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev, ini->vlan_id))
540                 return SMC_CLC_DECL_CNFERR;
541         return 0;
542 }
543
544 /* CLC handshake during connect */
545 static int smc_connect_clc(struct smc_sock *smc, int smc_type,
546                            struct smc_clc_msg_accept_confirm *aclc,
547                            struct smc_init_info *ini)
548 {
549         int rc = 0;
550
551         /* do inband token exchange */
552         rc = smc_clc_send_proposal(smc, smc_type, ini);
553         if (rc)
554                 return rc;
555         /* receive SMC Accept CLC message */
556         return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT,
557                                 CLC_WAIT_TIME);
558 }
559
560 /* setup for RDMA connection of client */
561 static int smc_connect_rdma(struct smc_sock *smc,
562                             struct smc_clc_msg_accept_confirm *aclc,
563                             struct smc_init_info *ini)
564 {
565         struct smc_link *link;
566         int reason_code = 0;
567
568         ini->is_smcd = false;
569         ini->ib_lcl = &aclc->lcl;
570         ini->ib_clcqpn = ntoh24(aclc->qpn);
571         ini->srv_first_contact = aclc->hdr.flag;
572
573         mutex_lock(&smc_client_lgr_pending);
574         reason_code = smc_conn_create(smc, ini);
575         if (reason_code) {
576                 mutex_unlock(&smc_client_lgr_pending);
577                 return reason_code;
578         }
579         link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
580
581         smc_conn_save_peer_info(smc, aclc);
582
583         /* create send buffer and rmb */
584         if (smc_buf_create(smc, false))
585                 return smc_connect_abort(smc, SMC_CLC_DECL_MEM,
586                                          ini->cln_first_contact);
587
588         if (ini->cln_first_contact == SMC_FIRST_CONTACT)
589                 smc_link_save_peer_info(link, aclc);
590
591         if (smc_rmb_rtoken_handling(&smc->conn, aclc))
592                 return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK,
593                                          ini->cln_first_contact);
594
595         smc_close_init(smc);
596         smc_rx_init(smc);
597
598         if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
599                 if (smc_ib_ready_link(link))
600                         return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK,
601                                                  ini->cln_first_contact);
602         } else {
603                 if (smc_reg_rmb(link, smc->conn.rmb_desc, true))
604                         return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB,
605                                                  ini->cln_first_contact);
606         }
607         smc_rmb_sync_sg_for_device(&smc->conn);
608
609         reason_code = smc_clc_send_confirm(smc);
610         if (reason_code)
611                 return smc_connect_abort(smc, reason_code,
612                                          ini->cln_first_contact);
613
614         smc_tx_init(smc);
615
616         if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
617                 /* QP confirmation over RoCE fabric */
618                 reason_code = smc_clnt_conf_first_link(smc);
619                 if (reason_code)
620                         return smc_connect_abort(smc, reason_code,
621                                                  ini->cln_first_contact);
622         }
623         mutex_unlock(&smc_client_lgr_pending);
624
625         smc_copy_sock_settings_to_clc(smc);
626         smc->connect_nonblock = 0;
627         if (smc->sk.sk_state == SMC_INIT)
628                 smc->sk.sk_state = SMC_ACTIVE;
629
630         return 0;
631 }
632
633 /* setup for ISM connection of client */
634 static int smc_connect_ism(struct smc_sock *smc,
635                            struct smc_clc_msg_accept_confirm *aclc,
636                            struct smc_init_info *ini)
637 {
638         int rc = 0;
639
640         ini->is_smcd = true;
641         ini->ism_gid = aclc->gid;
642         ini->srv_first_contact = aclc->hdr.flag;
643
644         /* there is only one lgr role for SMC-D; use server lock */
645         mutex_lock(&smc_server_lgr_pending);
646         rc = smc_conn_create(smc, ini);
647         if (rc) {
648                 mutex_unlock(&smc_server_lgr_pending);
649                 return rc;
650         }
651
652         /* Create send and receive buffers */
653         if (smc_buf_create(smc, true))
654                 return smc_connect_abort(smc, SMC_CLC_DECL_MEM,
655                                          ini->cln_first_contact);
656
657         smc_conn_save_peer_info(smc, aclc);
658         smc_close_init(smc);
659         smc_rx_init(smc);
660         smc_tx_init(smc);
661
662         rc = smc_clc_send_confirm(smc);
663         if (rc)
664                 return smc_connect_abort(smc, rc, ini->cln_first_contact);
665         mutex_unlock(&smc_server_lgr_pending);
666
667         smc_copy_sock_settings_to_clc(smc);
668         smc->connect_nonblock = 0;
669         if (smc->sk.sk_state == SMC_INIT)
670                 smc->sk.sk_state = SMC_ACTIVE;
671
672         return 0;
673 }
674
675 /* perform steps before actually connecting */
676 static int __smc_connect(struct smc_sock *smc)
677 {
678         bool ism_supported = false, rdma_supported = false;
679         struct smc_clc_msg_accept_confirm aclc;
680         struct smc_init_info ini = {0};
681         int smc_type;
682         int rc = 0;
683
684         sock_hold(&smc->sk); /* sock put in passive closing */
685
686         if (smc->use_fallback)
687                 return smc_connect_fallback(smc, smc->fallback_rsn);
688
689         /* if peer has not signalled SMC-capability, fall back */
690         if (!tcp_sk(smc->clcsock->sk)->syn_smc)
691                 return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC);
692
693         /* IPSec connections opt out of SMC-R optimizations */
694         if (using_ipsec(smc))
695                 return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC);
696
697         /* get vlan id from IP device */
698         if (smc_vlan_by_tcpsk(smc->clcsock, &ini))
699                 return smc_connect_decline_fallback(smc,
700                                                     SMC_CLC_DECL_GETVLANERR);
701
702         /* check if there is an ism device available */
703         if (!smc_find_ism_device(smc, &ini) &&
704             !smc_connect_ism_vlan_setup(smc, &ini)) {
705                 /* ISM is supported for this connection */
706                 ism_supported = true;
707                 smc_type = SMC_TYPE_D;
708         }
709
710         /* check if there is a rdma device available */
711         if (!smc_find_rdma_device(smc, &ini)) {
712                 /* RDMA is supported for this connection */
713                 rdma_supported = true;
714                 if (ism_supported)
715                         smc_type = SMC_TYPE_B; /* both */
716                 else
717                         smc_type = SMC_TYPE_R; /* only RDMA */
718         }
719
720         /* if neither ISM nor RDMA are supported, fallback */
721         if (!rdma_supported && !ism_supported)
722                 return smc_connect_decline_fallback(smc, SMC_CLC_DECL_NOSMCDEV);
723
724         /* perform CLC handshake */
725         rc = smc_connect_clc(smc, smc_type, &aclc, &ini);
726         if (rc) {
727                 smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
728                 return smc_connect_decline_fallback(smc, rc);
729         }
730
731         /* depending on previous steps, connect using rdma or ism */
732         if (rdma_supported && aclc.hdr.path == SMC_TYPE_R)
733                 rc = smc_connect_rdma(smc, &aclc, &ini);
734         else if (ism_supported && aclc.hdr.path == SMC_TYPE_D)
735                 rc = smc_connect_ism(smc, &aclc, &ini);
736         else
737                 rc = SMC_CLC_DECL_MODEUNSUPP;
738         if (rc) {
739                 smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
740                 return smc_connect_decline_fallback(smc, rc);
741         }
742
743         smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
744         return 0;
745 }
746
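/* worker for a non-blocking connect: wait for the TCP handshake on the
 * internal clcsock to complete (or fail), then run the SMC handshake via
 * __smc_connect() and wake up the socket
 */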
747 static void smc_connect_work(struct work_struct *work)
748 {
749         struct smc_sock *smc = container_of(work, struct smc_sock,
750                                             connect_work);
751         long timeo = smc->sk.sk_sndtimeo;
752         int rc = 0;
753
754         if (!timeo)
755                 timeo = MAX_SCHEDULE_TIMEOUT;
756         lock_sock(smc->clcsock->sk);
757         if (smc->clcsock->sk->sk_err) {
758                 smc->sk.sk_err = smc->clcsock->sk->sk_err;
759         } else if ((1 << smc->clcsock->sk->sk_state) &
760                                         (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
761                 rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo);
762                 if ((rc == -EPIPE) &&
763                     ((1 << smc->clcsock->sk->sk_state) &
764                                         (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)))
765                         rc = 0;
766         }
767         release_sock(smc->clcsock->sk);
768         lock_sock(&smc->sk);
769         if (rc != 0 || smc->sk.sk_err) {
770                 smc->sk.sk_state = SMC_CLOSED;
771                 if (rc == -EPIPE || rc == -EAGAIN)
772                         smc->sk.sk_err = EPIPE;
773                 else if (signal_pending(current))
774                         smc->sk.sk_err = -sock_intr_errno(timeo);
775                 goto out;
776         }
777
778         rc = __smc_connect(smc);
779         if (rc < 0)
780                 smc->sk.sk_err = -rc;
781
782 out:
783         if (smc->sk.sk_err)
784                 smc->sk.sk_state_change(&smc->sk);
785         else
786                 smc->sk.sk_write_space(&smc->sk);
787         release_sock(&smc->sk);
788 }
789
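/* connect() handler: validate address and state, connect the internal TCP
 * clcsock and then perform the SMC handshake, either inline (blocking) or
 * deferred to smc_connect_work() for O_NONBLOCK sockets
 */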
790 static int smc_connect(struct socket *sock, struct sockaddr *addr,
791                        int alen, int flags)
792 {
793         struct sock *sk = sock->sk;
794         struct smc_sock *smc;
795         int rc = -EINVAL;
796
797         smc = smc_sk(sk);
798
799         /* separate smc parameter checking to be safe */
800         if (alen < sizeof(addr->sa_family))
801                 goto out_err;
802         if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
803                 goto out_err;
804
805         lock_sock(sk);
806         switch (sk->sk_state) {
807         default:
808                 goto out;
809         case SMC_ACTIVE:
810                 rc = -EISCONN;
811                 goto out;
812         case SMC_INIT:
813                 rc = 0;
814                 break;
815         }
816
817         smc_copy_sock_settings_to_clc(smc);
818         tcp_sk(smc->clcsock->sk)->syn_smc = 1;
819         if (smc->connect_nonblock) {
820                 rc = -EALREADY;
821                 goto out;
822         }
823         rc = kernel_connect(smc->clcsock, addr, alen, flags);
824         if (rc && rc != -EINPROGRESS)
825                 goto out;
826         if (flags & O_NONBLOCK) {
827                 if (schedule_work(&smc->connect_work))
828                         smc->connect_nonblock = 1;
829                 rc = -EINPROGRESS;
830         } else {
831                 rc = __smc_connect(smc);
832                 if (rc < 0)
833                         goto out;
834                 else
835                         rc = 0; /* success cases including fallback */
836         }
837
838 out:
839         release_sock(sk);
840 out_err:
841         return rc;
842 }
843
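/* accept one connection on the listening clcsock and wrap it in a freshly
 * allocated smc sock
 */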
844 static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
845 {
846         struct socket *new_clcsock = NULL;
847         struct sock *lsk = &lsmc->sk;
848         struct sock *new_sk;
849         int rc = -EINVAL;
850
851         release_sock(lsk);
852         new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
853         if (!new_sk) {
854                 rc = -ENOMEM;
855                 lsk->sk_err = ENOMEM;
856                 *new_smc = NULL;
857                 lock_sock(lsk);
858                 goto out;
859         }
860         *new_smc = smc_sk(new_sk);
861
862         mutex_lock(&lsmc->clcsock_release_lock);
863         if (lsmc->clcsock)
864                 rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
865         mutex_unlock(&lsmc->clcsock_release_lock);
866         lock_sock(lsk);
867         if (rc < 0)
868                 lsk->sk_err = -rc;
869         if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
870                 if (new_clcsock)
871                         sock_release(new_clcsock);
872                 new_sk->sk_state = SMC_CLOSED;
873                 sock_set_flag(new_sk, SOCK_DEAD);
874                 new_sk->sk_prot->unhash(new_sk);
875                 sock_put(new_sk); /* final */
876                 *new_smc = NULL;
877                 goto out;
878         }
879
880         (*new_smc)->clcsock = new_clcsock;
881 out:
882         return rc;
883 }
884
885 /* add a just created sock to the accept queue of the listen sock as
886  * candidate for a following socket accept call from user space
887  */
888 static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
889 {
890         struct smc_sock *par = smc_sk(parent);
891
892         sock_hold(sk); /* sock_put in smc_accept_unlink() */
893         spin_lock(&par->accept_q_lock);
894         list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
895         spin_unlock(&par->accept_q_lock);
896         sk_acceptq_added(parent);
897 }
898
899 /* remove a socket from the accept queue of its parental listening socket */
900 static void smc_accept_unlink(struct sock *sk)
901 {
902         struct smc_sock *par = smc_sk(sk)->listen_smc;
903
904         spin_lock(&par->accept_q_lock);
905         list_del_init(&smc_sk(sk)->accept_q);
906         spin_unlock(&par->accept_q_lock);
907         sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
908         sock_put(sk); /* sock_hold in smc_accept_enqueue */
909 }
910
911 /* remove a sock from the accept queue to bind it to a new socket created
912  * for a socket accept call from user space
913  */
914 struct sock *smc_accept_dequeue(struct sock *parent,
915                                 struct socket *new_sock)
916 {
917         struct smc_sock *isk, *n;
918         struct sock *new_sk;
919
920         list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
921                 new_sk = (struct sock *)isk;
922
923                 smc_accept_unlink(new_sk);
924                 if (new_sk->sk_state == SMC_CLOSED) {
925                         if (isk->clcsock) {
926                                 sock_release(isk->clcsock);
927                                 isk->clcsock = NULL;
928                         }
929                         new_sk->sk_prot->unhash(new_sk);
930                         sock_put(new_sk); /* final */
931                         continue;
932                 }
933                 if (new_sock)
934                         sock_graft(new_sk, new_sock);
935                 return new_sk;
936         }
937         return NULL;
938 }
939
940 /* clean up for a created but never accepted sock */
941 void smc_close_non_accepted(struct sock *sk)
942 {
943         struct smc_sock *smc = smc_sk(sk);
944
945         lock_sock(sk);
946         if (!sk->sk_lingertime)
947                 /* wait for peer closing */
948                 sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
949         if (!smc->use_fallback) {
950                 smc_close_active(smc);
951                 sock_set_flag(sk, SOCK_DEAD);
952                 sk->sk_shutdown |= SHUTDOWN_MASK;
953         }
954         if (smc->clcsock) {
955                 struct socket *tcp;
956
957                 tcp = smc->clcsock;
958                 smc->clcsock = NULL;
959                 sock_release(tcp);
960         }
961         if (smc->use_fallback) {
962                 sock_put(sk); /* passive closing */
963                 sk->sk_state = SMC_CLOSED;
964         } else {
965                 if (sk->sk_state == SMC_CLOSED)
966                         smc_conn_free(&smc->conn);
967         }
968         release_sock(sk);
969         sk->sk_prot->unhash(sk);
970         sock_put(sk); /* final sock_put */
971 }
972
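/* server side of the one-time LLC handshake on a new link: register the
 * local RMB, send the CONFIRM LINK request and wait for the client's
 * response, then exchange the ADD LINK request/response pair
 */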
973 static int smc_serv_conf_first_link(struct smc_sock *smc)
974 {
975         struct net *net = sock_net(smc->clcsock->sk);
976         struct smc_link_group *lgr = smc->conn.lgr;
977         struct smc_link *link;
978         int rest;
979         int rc;
980
981         link = &lgr->lnk[SMC_SINGLE_LINK];
982
983         if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
984                 return SMC_CLC_DECL_ERR_REGRMB;
985
986         /* send CONFIRM LINK request to client over the RoCE fabric */
987         rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ);
988         if (rc < 0)
989                 return SMC_CLC_DECL_TIMEOUT_CL;
990
991         /* receive CONFIRM LINK response from client over the RoCE fabric */
992         rest = wait_for_completion_interruptible_timeout(
993                 &link->llc_confirm_resp,
994                 SMC_LLC_WAIT_FIRST_TIME);
995         if (rest <= 0) {
996                 struct smc_clc_msg_decline dclc;
997
998                 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
999                                       SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
1000                 return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
1001         }
1002
1003         if (link->llc_confirm_resp_rc)
1004                 return SMC_CLC_DECL_RMBE_EC;
1005
1006         /* send ADD LINK request to client over the RoCE fabric */
1007         rc = smc_llc_send_add_link(link,
1008                                    link->smcibdev->mac[link->ibport - 1],
1009                                    link->gid, SMC_LLC_REQ);
1010         if (rc < 0)
1011                 return SMC_CLC_DECL_TIMEOUT_AL;
1012
1013         /* receive ADD LINK response from client over the RoCE fabric */
1014         rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp,
1015                                                          SMC_LLC_WAIT_TIME);
1016         if (rest <= 0) {
1017                 struct smc_clc_msg_decline dclc;
1018
1019                 rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
1020                                       SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
1021                 return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc;
1022         }
1023
1024         smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
1025
1026         return 0;
1027 }
1028
1029 /* listen worker: finish */
1030 static void smc_listen_out(struct smc_sock *new_smc)
1031 {
1032         struct smc_sock *lsmc = new_smc->listen_smc;
1033         struct sock *newsmcsk = &new_smc->sk;
1034
1035         lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
1036         if (lsmc->sk.sk_state == SMC_LISTEN) {
1037                 smc_accept_enqueue(&lsmc->sk, newsmcsk);
1038         } else { /* no longer listening */
1039                 smc_close_non_accepted(newsmcsk);
1040         }
1041         release_sock(&lsmc->sk);
1042
1043         /* Wake up accept */
1044         lsmc->sk.sk_data_ready(&lsmc->sk);
1045         sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
1046 }
1047
1048 /* listen worker: finish in state connected */
1049 static void smc_listen_out_connected(struct smc_sock *new_smc)
1050 {
1051         struct sock *newsmcsk = &new_smc->sk;
1052
1053         sk_refcnt_debug_inc(newsmcsk);
1054         if (newsmcsk->sk_state == SMC_INIT)
1055                 newsmcsk->sk_state = SMC_ACTIVE;
1056
1057         smc_listen_out(new_smc);
1058 }
1059
1060 /* listen worker: finish in error state */
1061 static void smc_listen_out_err(struct smc_sock *new_smc)
1062 {
1063         struct sock *newsmcsk = &new_smc->sk;
1064
1065         if (newsmcsk->sk_state == SMC_INIT)
1066                 sock_put(&new_smc->sk); /* passive closing */
1067         newsmcsk->sk_state = SMC_CLOSED;
1068         smc_conn_free(&new_smc->conn);
1069
1070         smc_listen_out(new_smc);
1071 }
1072
1073 /* listen worker: decline and fall back if possible */
1074 static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
1075                                int local_contact)
1076 {
1077         /* RDMA setup failed, switch back to TCP */
1078         if (local_contact == SMC_FIRST_CONTACT)
1079                 smc_lgr_forget(new_smc->conn.lgr);
1080         if (reason_code < 0) { /* error, no fallback possible */
1081                 smc_listen_out_err(new_smc);
1082                 return;
1083         }
1084         smc_conn_free(&new_smc->conn);
1085         new_smc->use_fallback = true;
1086         new_smc->fallback_rsn = reason_code;
1087         if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
1088                 if (smc_clc_send_decline(new_smc, reason_code) < 0) {
1089                         smc_listen_out_err(new_smc);
1090                         return;
1091                 }
1092         }
1093         smc_listen_out_connected(new_smc);
1094 }
1095
1096 /* listen worker: check prefixes */
1097 static int smc_listen_prfx_check(struct smc_sock *new_smc,
1098                                  struct smc_clc_msg_proposal *pclc)
1099 {
1100         struct smc_clc_msg_proposal_prefix *pclc_prfx;
1101         struct socket *newclcsock = new_smc->clcsock;
1102
1103         pclc_prfx = smc_clc_proposal_get_prefix(pclc);
1104         if (smc_clc_prfx_match(newclcsock, pclc_prfx))
1105                 return SMC_CLC_DECL_DIFFPREFIX;
1106
1107         return 0;
1108 }
1109
1110 /* listen worker: initialize connection and buffers */
1111 static int smc_listen_rdma_init(struct smc_sock *new_smc,
1112                                 struct smc_init_info *ini)
1113 {
1114         int rc;
1115
1116         /* allocate connection / link group */
1117         rc = smc_conn_create(new_smc, ini);
1118         if (rc)
1119                 return rc;
1120
1121         /* create send buffer and rmb */
1122         if (smc_buf_create(new_smc, false))
1123                 return SMC_CLC_DECL_MEM;
1124
1125         return 0;
1126 }
1127
1128 /* listen worker: initialize connection and buffers for SMC-D */
1129 static int smc_listen_ism_init(struct smc_sock *new_smc,
1130                                struct smc_clc_msg_proposal *pclc,
1131                                struct smc_init_info *ini)
1132 {
1133         struct smc_clc_msg_smcd *pclc_smcd;
1134         int rc;
1135
1136         pclc_smcd = smc_get_clc_msg_smcd(pclc);
1137         ini->ism_gid = pclc_smcd->gid;
1138         rc = smc_conn_create(new_smc, ini);
1139         if (rc)
1140                 return rc;
1141
1142         /* Check if peer can be reached via ISM device */
1143         if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid,
1144                             new_smc->conn.lgr->vlan_id,
1145                             new_smc->conn.lgr->smcd)) {
1146                 if (ini->cln_first_contact == SMC_FIRST_CONTACT)
1147                         smc_lgr_forget(new_smc->conn.lgr);
1148                 smc_conn_free(&new_smc->conn);
1149                 return SMC_CLC_DECL_SMCDNOTALK;
1150         }
1151
1152         /* Create send and receive buffers */
1153         if (smc_buf_create(new_smc, true)) {
1154                 if (ini->cln_first_contact == SMC_FIRST_CONTACT)
1155                         smc_lgr_forget(new_smc->conn.lgr);
1156                 smc_conn_free(&new_smc->conn);
1157                 return SMC_CLC_DECL_MEM;
1158         }
1159
1160         return 0;
1161 }
1162
1163 /* listen worker: register buffers */
1164 static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
1165 {
1166         struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
1167
1168         if (local_contact != SMC_FIRST_CONTACT) {
1169                 if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true))
1170                         return SMC_CLC_DECL_ERR_REGRMB;
1171         }
1172         smc_rmb_sync_sg_for_device(&new_smc->conn);
1173
1174         return 0;
1175 }
1176
1177 /* listen worker: finish RDMA setup */
1178 static int smc_listen_rdma_finish(struct smc_sock *new_smc,
1179                                   struct smc_clc_msg_accept_confirm *cclc,
1180                                   int local_contact)
1181 {
1182         struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
1183         int reason_code = 0;
1184
1185         if (local_contact == SMC_FIRST_CONTACT)
1186                 smc_link_save_peer_info(link, cclc);
1187
1188         if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) {
1189                 reason_code = SMC_CLC_DECL_ERR_RTOK;
1190                 goto decline;
1191         }
1192
1193         if (local_contact == SMC_FIRST_CONTACT) {
1194                 if (smc_ib_ready_link(link)) {
1195                         reason_code = SMC_CLC_DECL_ERR_RDYLNK;
1196                         goto decline;
1197                 }
1198                 /* QP confirmation over RoCE fabric */
1199                 reason_code = smc_serv_conf_first_link(new_smc);
1200                 if (reason_code)
1201                         goto decline;
1202         }
1203         return 0;
1204
1205 decline:
1206         smc_listen_decline(new_smc, reason_code, local_contact);
1207         return reason_code;
1208 }
1209
1210 /* setup for RDMA connection of server */
1211 static void smc_listen_work(struct work_struct *work)
1212 {
1213         struct smc_sock *new_smc = container_of(work, struct smc_sock,
1214                                                 smc_listen_work);
1215         struct socket *newclcsock = new_smc->clcsock;
1216         struct smc_clc_msg_accept_confirm cclc;
1217         struct smc_clc_msg_proposal *pclc;
1218         struct smc_init_info ini = {0};
1219         bool ism_supported = false;
1220         u8 buf[SMC_CLC_MAX_LEN];
1221         int rc = 0;
1222
1223         if (new_smc->use_fallback) {
1224                 smc_listen_out_connected(new_smc);
1225                 return;
1226         }
1227
1228         /* check if peer is smc capable */
1229         if (!tcp_sk(newclcsock->sk)->syn_smc) {
1230                 new_smc->use_fallback = true;
1231                 new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC;
1232                 smc_listen_out_connected(new_smc);
1233                 return;
1234         }
1235
1236         /* do inband token exchange -
1237          * wait for and receive SMC Proposal CLC message
1238          */
1239         pclc = (struct smc_clc_msg_proposal *)&buf;
1240         rc = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN,
1241                               SMC_CLC_PROPOSAL, CLC_WAIT_TIME);
1242         if (rc)
1243                 goto out_decl;
1244
1245         /* IPSec connections opt out of SMC-R optimizations */
1246         if (using_ipsec(new_smc)) {
1247                 rc = SMC_CLC_DECL_IPSEC;
1248                 goto out_decl;
1249         }
1250
1251         /* check for matching IP prefix and subnet length */
1252         rc = smc_listen_prfx_check(new_smc, pclc);
1253         if (rc)
1254                 goto out_decl;
1255
1256         /* get vlan id from IP device */
1257         if (smc_vlan_by_tcpsk(new_smc->clcsock, &ini)) {
1258                 rc = SMC_CLC_DECL_GETVLANERR;
1259                 goto out_decl;
1260         }
1261
1262         mutex_lock(&smc_server_lgr_pending);
1263         smc_close_init(new_smc);
1264         smc_rx_init(new_smc);
1265         smc_tx_init(new_smc);
1266
1267         /* check if ISM is available */
1268         if (pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) {
1269                 ini.is_smcd = true; /* prepare ISM check */
1270                 rc = smc_find_ism_device(new_smc, &ini);
1271                 if (!rc)
1272                         rc = smc_listen_ism_init(new_smc, pclc, &ini);
1273                 if (!rc)
1274                         ism_supported = true;
1275                 else if (pclc->hdr.path == SMC_TYPE_D)
1276                         goto out_unlock; /* skip RDMA and decline */
1277         }
1278
1279         /* check if RDMA is available */
1280         if (!ism_supported) { /* SMC_TYPE_R or SMC_TYPE_B */
1281                 /* prepare RDMA check */
1282                 memset(&ini, 0, sizeof(ini));
1283                 ini.is_smcd = false;
1284                 ini.ib_lcl = &pclc->lcl;
1285                 rc = smc_find_rdma_device(new_smc, &ini);
1286                 if (rc) {
1287                         /* no RDMA device found */
1288                         if (pclc->hdr.path == SMC_TYPE_B)
1289                                 /* neither ISM nor RDMA device found */
1290                                 rc = SMC_CLC_DECL_NOSMCDEV;
1291                         goto out_unlock;
1292                 }
1293                 rc = smc_listen_rdma_init(new_smc, &ini);
1294                 if (rc)
1295                         goto out_unlock;
1296                 rc = smc_listen_rdma_reg(new_smc, ini.cln_first_contact);
1297                 if (rc)
1298                         goto out_unlock;
1299         }
1300
1301         /* send SMC Accept CLC message */
1302         rc = smc_clc_send_accept(new_smc, ini.cln_first_contact);
1303         if (rc)
1304                 goto out_unlock;
1305
1306         /* SMC-D does not need this lock any more */
1307         if (ism_supported)
1308                 mutex_unlock(&smc_server_lgr_pending);
1309
1310         /* receive SMC Confirm CLC message */
1311         rc = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
1312                               SMC_CLC_CONFIRM, CLC_WAIT_TIME);
1313         if (rc) {
1314                 if (!ism_supported)
1315                         goto out_unlock;
1316                 goto out_decl;
1317         }
1318
1319         /* finish worker */
1320         if (!ism_supported) {
1321                 rc = smc_listen_rdma_finish(new_smc, &cclc,
1322                                             ini.cln_first_contact);
1323                 mutex_unlock(&smc_server_lgr_pending);
1324                 if (rc)
1325                         return;
1326         }
1327         smc_conn_save_peer_info(new_smc, &cclc);
1328         smc_listen_out_connected(new_smc);
1329         return;
1330
1331 out_unlock:
1332         mutex_unlock(&smc_server_lgr_pending);
1333 out_decl:
1334         smc_listen_decline(new_smc, rc, ini.cln_first_contact);
1335 }
1336
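/* listen worker: accept incoming TCP connections on the clcsock of a
 * listening smc socket and hand each one over to smc_listen_work() for the
 * SMC handshake
 */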
1337 static void smc_tcp_listen_work(struct work_struct *work)
1338 {
1339         struct smc_sock *lsmc = container_of(work, struct smc_sock,
1340                                              tcp_listen_work);
1341         struct sock *lsk = &lsmc->sk;
1342         struct smc_sock *new_smc;
1343         int rc = 0;
1344
1345         lock_sock(lsk);
1346         while (lsk->sk_state == SMC_LISTEN) {
1347                 rc = smc_clcsock_accept(lsmc, &new_smc);
1348                 if (rc)
1349                         goto out;
1350                 if (!new_smc)
1351                         continue;
1352
1353                 new_smc->listen_smc = lsmc;
1354                 new_smc->use_fallback = lsmc->use_fallback;
1355                 new_smc->fallback_rsn = lsmc->fallback_rsn;
1356                 sock_hold(lsk); /* sock_put in smc_listen_work */
1357                 INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
1358                 smc_copy_sock_settings_to_smc(new_smc);
1359                 new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf;
1360                 new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf;
1361                 sock_hold(&new_smc->sk); /* sock_put in passive closing */
1362                 if (!schedule_work(&new_smc->smc_listen_work))
1363                         sock_put(&new_smc->sk);
1364         }
1365
1366 out:
1367         release_sock(lsk);
1368         sock_put(&lsmc->sk); /* sock_hold in smc_listen */
1369 }
1370
1371 static int smc_listen(struct socket *sock, int backlog)
1372 {
1373         struct sock *sk = sock->sk;
1374         struct smc_sock *smc;
1375         int rc;
1376
1377         smc = smc_sk(sk);
1378         lock_sock(sk);
1379
1380         rc = -EINVAL;
1381         if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
1382                 goto out;
1383
1384         rc = 0;
1385         if (sk->sk_state == SMC_LISTEN) {
1386                 sk->sk_max_ack_backlog = backlog;
1387                 goto out;
1388         }
1389         /* some socket options are handled in core, so we cannot apply
1390          * them to the clc socket -- copy smc socket options to clc socket
1391          */
1392         smc_copy_sock_settings_to_clc(smc);
1393         if (!smc->use_fallback)
1394                 tcp_sk(smc->clcsock->sk)->syn_smc = 1;
1395
1396         rc = kernel_listen(smc->clcsock, backlog);
1397         if (rc)
1398                 goto out;
1399         sk->sk_max_ack_backlog = backlog;
1400         sk->sk_ack_backlog = 0;
1401         sk->sk_state = SMC_LISTEN;
1402         sock_hold(sk); /* sock_hold in tcp_listen_worker */
1403         if (!schedule_work(&smc->tcp_listen_work))
1404                 sock_put(sk);
1405
1406 out:
1407         release_sock(sk);
1408         return rc;
1409 }
1410
1411 static int smc_accept(struct socket *sock, struct socket *new_sock,
1412                       int flags, bool kern)
1413 {
1414         struct sock *sk = sock->sk, *nsk;
1415         DECLARE_WAITQUEUE(wait, current);
1416         struct smc_sock *lsmc;
1417         long timeo;
1418         int rc = 0;
1419
1420         lsmc = smc_sk(sk);
1421         sock_hold(sk); /* sock_put below */
1422         lock_sock(sk);
1423
1424         if (lsmc->sk.sk_state != SMC_LISTEN) {
1425                 rc = -EINVAL;
1426                 release_sock(sk);
1427                 goto out;
1428         }
1429
1430         /* Wait for an incoming connection */
1431         timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1432         add_wait_queue_exclusive(sk_sleep(sk), &wait);
1433         while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
1434                 set_current_state(TASK_INTERRUPTIBLE);
1435                 if (!timeo) {
1436                         rc = -EAGAIN;
1437                         break;
1438                 }
1439                 release_sock(sk);
1440                 timeo = schedule_timeout(timeo);
1441                 /* wakeup by sk_data_ready in smc_listen_work() */
1442                 sched_annotate_sleep();
1443                 lock_sock(sk);
1444                 if (signal_pending(current)) {
1445                         rc = sock_intr_errno(timeo);
1446                         break;
1447                 }
1448         }
1449         set_current_state(TASK_RUNNING);
1450         remove_wait_queue(sk_sleep(sk), &wait);
1451
1452         if (!rc)
1453                 rc = sock_error(nsk);
1454         release_sock(sk);
1455         if (rc)
1456                 goto out;
1457
1458         if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
1459                 /* wait till data arrives on the socket */
1460                 timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
1461                                                                 MSEC_PER_SEC);
1462                 if (smc_sk(nsk)->use_fallback) {
1463                         struct sock *clcsk = smc_sk(nsk)->clcsock->sk;
1464
1465                         lock_sock(clcsk);
1466                         if (skb_queue_empty(&clcsk->sk_receive_queue))
1467                                 sk_wait_data(clcsk, &timeo, NULL);
1468                         release_sock(clcsk);
1469                 } else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
1470                         lock_sock(nsk);
1471                         smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available);
1472                         release_sock(nsk);
1473                 }
1474         }
1475
1476 out:
1477         sock_put(sk); /* sock_hold above */
1478         return rc;
1479 }
1480
1481 static int smc_getname(struct socket *sock, struct sockaddr *addr,
1482                        int peer)
1483 {
1484         struct smc_sock *smc;
1485
1486         if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
1487             (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
1488                 return -ENOTCONN;
1489
1490         smc = smc_sk(sock->sk);
1491
1492         return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
1493 }
1494
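/* smc_sendmsg() is valid in SMC_ACTIVE, SMC_APPCLOSEWAIT1 and SMC_INIT state.
 * MSG_FASTOPEN is only tolerated before the connection is set up (SMC_INIT)
 * and forces a fallback to TCP (reason SMC_CLC_DECL_OPTUNSUPP), since SMC
 * cannot carry data in its handshake. Fallback sockets send through the
 * clcsock, native SMC sockets through smc_tx_sendmsg().
 */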
1495 static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
1496 {
1497         struct sock *sk = sock->sk;
1498         struct smc_sock *smc;
1499         int rc = -EPIPE;
1500
1501         smc = smc_sk(sk);
1502         lock_sock(sk);
1503         if ((sk->sk_state != SMC_ACTIVE) &&
1504             (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1505             (sk->sk_state != SMC_INIT))
1506                 goto out;
1507
1508         if (msg->msg_flags & MSG_FASTOPEN) {
1509                 if (sk->sk_state == SMC_INIT) {
1510                         smc->use_fallback = true;
1511                         smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
1512                 } else {
1513                         rc = -EINVAL;
1514                         goto out;
1515                 }
1516         }
1517
1518         if (smc->use_fallback)
1519                 rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
1520         else
1521                 rc = smc_tx_sendmsg(smc, msg, len);
1522 out:
1523         release_sock(sk);
1524         return rc;
1525 }
1526
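/* smc_recvmsg() returns 0 (EOF) for a socket that was connected before and
 * has shut down reception. Fallback sockets receive through the clcsock,
 * native SMC sockets read from the receive buffer via smc_rx_recvmsg().
 */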
1527 static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
1528                        int flags)
1529 {
1530         struct sock *sk = sock->sk;
1531         struct smc_sock *smc;
1532         int rc = -ENOTCONN;
1533
1534         smc = smc_sk(sk);
1535         lock_sock(sk);
1536         if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
1537                 /* socket was connected before, no more data to read */
1538                 rc = 0;
1539                 goto out;
1540         }
1541         if ((sk->sk_state == SMC_INIT) ||
1542             (sk->sk_state == SMC_LISTEN) ||
1543             (sk->sk_state == SMC_CLOSED))
1544                 goto out;
1545
1546         if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1547                 rc = 0;
1548                 goto out;
1549         }
1550
1551         if (smc->use_fallback) {
1552                 rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
1553         } else {
1554                 msg->msg_namelen = 0;
1555                 rc = smc_rx_recvmsg(smc, msg, NULL, len, flags);
1556         }
1557
1558 out:
1559         release_sock(sk);
1560         return rc;
1561 }
1562
1563 static __poll_t smc_accept_poll(struct sock *parent)
1564 {
1565         struct smc_sock *isk = smc_sk(parent);
1566         __poll_t mask = 0;
1567
1568         spin_lock(&isk->accept_q_lock);
1569         if (!list_empty(&isk->accept_q))
1570                 mask = EPOLLIN | EPOLLRDNORM;
1571         spin_unlock(&isk->accept_q_lock);
1572
1573         return mask;
1574 }
1575
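/* smc_poll(): fallback sockets delegate the whole poll to the clcsock.
 * For native SMC sockets the mask is derived from sk_err, the shutdown
 * state, the accept queue (listen sockets), available send buffer space,
 * bytes ready to receive and pending urgent data.
 */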
1576 static __poll_t smc_poll(struct file *file, struct socket *sock,
1577                              poll_table *wait)
1578 {
1579         struct sock *sk = sock->sk;
1580         struct smc_sock *smc;
1581         __poll_t mask = 0;
1582
1583         if (!sk)
1584                 return EPOLLNVAL;
1585
1586         smc = smc_sk(sock->sk);
1587         if (smc->use_fallback) {
1588                 /* delegate to CLC child sock */
1589                 mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
1590                 sk->sk_err = smc->clcsock->sk->sk_err;
1591         } else {
1592                 if (sk->sk_state != SMC_CLOSED)
1593                         sock_poll_wait(file, sock, wait);
1594                 if (sk->sk_err)
1595                         mask |= EPOLLERR;
1596                 if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
1597                     (sk->sk_state == SMC_CLOSED))
1598                         mask |= EPOLLHUP;
1599                 if (sk->sk_state == SMC_LISTEN) {
1600                         /* woken up by sk_data_ready in smc_listen_work() */
1601                         mask |= smc_accept_poll(sk);
1602                 } else if (smc->use_fallback) { /* as a result of connect_work() */
1603                         mask |= smc->clcsock->ops->poll(file, smc->clcsock,
1604                                                            wait);
1605                         sk->sk_err = smc->clcsock->sk->sk_err;
1606                 } else {
1607                         if ((sk->sk_state != SMC_INIT &&
1608                              atomic_read(&smc->conn.sndbuf_space)) ||
1609                             sk->sk_shutdown & SEND_SHUTDOWN) {
1610                                 mask |= EPOLLOUT | EPOLLWRNORM;
1611                         } else {
1612                                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1613                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1614                         }
1615                         if (atomic_read(&smc->conn.bytes_to_rcv))
1616                                 mask |= EPOLLIN | EPOLLRDNORM;
1617                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1618                                 mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
1619                         if (sk->sk_state == SMC_APPCLOSEWAIT1)
1620                                 mask |= EPOLLIN;
1621                         if (smc->conn.urg_state == SMC_URG_VALID)
1622                                 mask |= EPOLLPRI;
1623                 }
1624         }
1625
1626         return mask;
1627 }
1628
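/* smc_shutdown() accepts SHUT_RD, SHUT_WR and SHUT_RDWR on a connected
 * socket. Fallback sockets simply shut down the clcsock; native SMC sockets
 * run the SMC close protocol (smc_close_active() or
 * smc_close_shutdown_write()) and then shut down the clcsock as well.
 * The "how + 1" mapping turns SHUT_RD/SHUT_WR/SHUT_RDWR (0..2) into the
 * RCV_SHUTDOWN/SEND_SHUTDOWN/SHUTDOWN_MASK bits of sk_shutdown.
 */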
1629 static int smc_shutdown(struct socket *sock, int how)
1630 {
1631         struct sock *sk = sock->sk;
1632         struct smc_sock *smc;
1633         int rc = -EINVAL;
1634         int rc1 = 0;
1635
1636         smc = smc_sk(sk);
1637
1638         if ((how < SHUT_RD) || (how > SHUT_RDWR))
1639                 return rc;
1640
1641         lock_sock(sk);
1642
1643         rc = -ENOTCONN;
1644         if ((sk->sk_state != SMC_ACTIVE) &&
1645             (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
1646             (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
1647             (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1648             (sk->sk_state != SMC_APPCLOSEWAIT2) &&
1649             (sk->sk_state != SMC_APPFINCLOSEWAIT))
1650                 goto out;
1651         if (smc->use_fallback) {
1652                 rc = kernel_sock_shutdown(smc->clcsock, how);
1653                 sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
1654                 if (sk->sk_shutdown == SHUTDOWN_MASK)
1655                         sk->sk_state = SMC_CLOSED;
1656                 goto out;
1657         }
1658         switch (how) {
1659         case SHUT_RDWR:         /* shutdown in both directions */
1660                 rc = smc_close_active(smc);
1661                 break;
1662         case SHUT_WR:
1663                 rc = smc_close_shutdown_write(smc);
1664                 break;
1665         case SHUT_RD:
1666                 rc = 0;
1667                 /* nothing more to do because peer is not involved */
1668                 break;
1669         }
1670         if (smc->clcsock)
1671                 rc1 = kernel_sock_shutdown(smc->clcsock, how);
1672         /* map sock_shutdown_cmd constants to sk_shutdown value range */
1673         sk->sk_shutdown |= how + 1;
1674
1675 out:
1676         release_sock(sk);
1677         return rc ? rc : rc1;
1678 }
1679
1680 static int smc_setsockopt(struct socket *sock, int level, int optname,
1681                           char __user *optval, unsigned int optlen)
1682 {
1683         struct sock *sk = sock->sk;
1684         struct smc_sock *smc;
1685         int val, rc;
1686
1687         smc = smc_sk(sk);
1688
1689         /* generic setsockopts reaching us here always apply to the
1690          * CLC socket
1691          */
1692         rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
1693                                            optval, optlen);
1694         if (smc->clcsock->sk->sk_err) {
1695                 sk->sk_err = smc->clcsock->sk->sk_err;
1696                 sk->sk_error_report(sk);
1697         }
1698         if (rc)
1699                 return rc;
1700
1701         if (optlen < sizeof(int))
1702                 return -EINVAL;
1703         if (get_user(val, (int __user *)optval))
1704                 return -EFAULT;
1705
1706         lock_sock(sk);
1707         switch (optname) {
1708         case TCP_ULP:
1709         case TCP_FASTOPEN:
1710         case TCP_FASTOPEN_CONNECT:
1711         case TCP_FASTOPEN_KEY:
1712         case TCP_FASTOPEN_NO_COOKIE:
1713                 /* option not supported by SMC */
1714                 if (sk->sk_state == SMC_INIT) {
1715                         smc->use_fallback = true;
1716                         smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
1717                 } else {
1718                         if (!smc->use_fallback)
1719                                 rc = -EINVAL;
1720                 }
1721                 break;
1722         case TCP_NODELAY:
1723                 if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
1724                         if (val && !smc->use_fallback)
1725                                 mod_delayed_work(system_wq, &smc->conn.tx_work,
1726                                                  0);
1727                 }
1728                 break;
1729         case TCP_CORK:
1730                 if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
1731                         if (!val && !smc->use_fallback)
1732                                 mod_delayed_work(system_wq, &smc->conn.tx_work,
1733                                                  0);
1734                 }
1735                 break;
1736         case TCP_DEFER_ACCEPT:
1737                 smc->sockopt_defer_accept = val;
1738                 break;
1739         default:
1740                 break;
1741         }
1742         release_sock(sk);
1743
1744         return rc;
1745 }
1746
1747 static int smc_getsockopt(struct socket *sock, int level, int optname,
1748                           char __user *optval, int __user *optlen)
1749 {
1750         struct smc_sock *smc;
1751
1752         smc = smc_sk(sock->sk);
1753         /* socket options apply to the CLC socket */
1754         return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
1755                                              optval, optlen);
1756 }
1757
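/* smc_ioctl(): fallback sockets pass the ioctl straight to the clcsock.
 * Native SMC sockets support SIOCINQ/FIONREAD (bytes ready to read),
 * SIOCOUTQ (bytes still occupying the send buffer, i.e. not yet sent or
 * not yet acknowledged), SIOCOUTQNSD (bytes prepared but not yet sent)
 * and SIOCATMARK (read cursor at the urgent-data mark).
 */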
1758 static int smc_ioctl(struct socket *sock, unsigned int cmd,
1759                      unsigned long arg)
1760 {
1761         union smc_host_cursor cons, urg;
1762         struct smc_connection *conn;
1763         struct smc_sock *smc;
1764         int answ;
1765
1766         smc = smc_sk(sock->sk);
1767         conn = &smc->conn;
1768         lock_sock(&smc->sk);
1769         if (smc->use_fallback) {
1770                 if (!smc->clcsock) {
1771                         release_sock(&smc->sk);
1772                         return -EBADF;
1773                 }
1774                 answ = smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
1775                 release_sock(&smc->sk);
1776                 return answ;
1777         }
1778         switch (cmd) {
1779         case SIOCINQ: /* same as FIONREAD */
1780                 if (smc->sk.sk_state == SMC_LISTEN) {
1781                         release_sock(&smc->sk);
1782                         return -EINVAL;
1783                 }
1784                 if (smc->sk.sk_state == SMC_INIT ||
1785                     smc->sk.sk_state == SMC_CLOSED)
1786                         answ = 0;
1787                 else
1788                         answ = atomic_read(&smc->conn.bytes_to_rcv);
1789                 break;
1790         case SIOCOUTQ:
1791                 /* output queue size (not sent + not acked) */
1792                 if (smc->sk.sk_state == SMC_LISTEN) {
1793                         release_sock(&smc->sk);
1794                         return -EINVAL;
1795                 }
1796                 if (smc->sk.sk_state == SMC_INIT ||
1797                     smc->sk.sk_state == SMC_CLOSED)
1798                         answ = 0;
1799                 else
1800                         answ = smc->conn.sndbuf_desc->len -
1801                                         atomic_read(&smc->conn.sndbuf_space);
1802                 break;
1803         case SIOCOUTQNSD:
1804                 /* output queue size (not yet sent bytes only) */
1805                 if (smc->sk.sk_state == SMC_LISTEN) {
1806                         release_sock(&smc->sk);
1807                         return -EINVAL;
1808                 }
1809                 if (smc->sk.sk_state == SMC_INIT ||
1810                     smc->sk.sk_state == SMC_CLOSED)
1811                         answ = 0;
1812                 else
1813                         answ = smc_tx_prepared_sends(&smc->conn);
1814                 break;
1815         case SIOCATMARK:
1816                 if (smc->sk.sk_state == SMC_LISTEN) {
1817                         release_sock(&smc->sk);
1818                         return -EINVAL;
1819                 }
1820                 if (smc->sk.sk_state == SMC_INIT ||
1821                     smc->sk.sk_state == SMC_CLOSED) {
1822                         answ = 0;
1823                 } else {
1824                         smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
1825                         smc_curs_copy(&urg, &conn->urg_curs, conn);
1826                         answ = smc_curs_diff(conn->rmb_desc->len,
1827                                              &cons, &urg) == 1;
1828                 }
1829                 break;
1830         default:
1831                 release_sock(&smc->sk);
1832                 return -ENOIOCTLCMD;
1833         }
1834         release_sock(&smc->sk);
1835
1836         return put_user(answ, (int __user *)arg);
1837 }
1838
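/* smc_sendpage() is only permitted in SMC_ACTIVE state. Fallback sockets
 * use kernel_sendpage() on the clcsock; native SMC sockets take the generic
 * sock_no_sendpage() route, which ends up in smc_sendmsg().
 */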
1839 static ssize_t smc_sendpage(struct socket *sock, struct page *page,
1840                             int offset, size_t size, int flags)
1841 {
1842         struct sock *sk = sock->sk;
1843         struct smc_sock *smc;
1844         int rc = -EPIPE;
1845
1846         smc = smc_sk(sk);
1847         lock_sock(sk);
1848         if (sk->sk_state != SMC_ACTIVE) {
1849                 release_sock(sk);
1850                 goto out;
1851         }
1852         release_sock(sk);
1853         if (smc->use_fallback)
1854                 rc = kernel_sendpage(smc->clcsock, page, offset,
1855                                      size, flags);
1856         else
1857                 rc = sock_no_sendpage(sock, page, offset, size, flags);
1858
1859 out:
1860         return rc;
1861 }
1862
1863 /* Map the affected portions of the rmbe into an spd, note the number of bytes
1864  * to splice in conn->splice_pending, and press 'go'. Consumer cursor updates
1865  * are delayed until the respective page has been fully processed.
1866  * Note that subsequent recv() calls have to wait till all splice() processing
1867  * has completed.
1868  */
1869 static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
1870                                struct pipe_inode_info *pipe, size_t len,
1871                                unsigned int flags)
1872 {
1873         struct sock *sk = sock->sk;
1874         struct smc_sock *smc;
1875         int rc = -ENOTCONN;
1876
1877         smc = smc_sk(sk);
1878         lock_sock(sk);
1879         if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
1880                 /* socket was connected before, no more data to read */
1881                 rc = 0;
1882                 goto out;
1883         }
1884         if (sk->sk_state == SMC_INIT ||
1885             sk->sk_state == SMC_LISTEN ||
1886             sk->sk_state == SMC_CLOSED)
1887                 goto out;
1888
1889         if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1890                 rc = 0;
1891                 goto out;
1892         }
1893
1894         if (smc->use_fallback) {
1895                 rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
1896                                                     pipe, len, flags);
1897         } else {
1898                 if (*ppos) {
1899                         rc = -ESPIPE;
1900                         goto out;
1901                 }
1902                 if (flags & SPLICE_F_NONBLOCK)
1903                         flags = MSG_DONTWAIT;
1904                 else
1905                         flags = 0;
1906                 rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags);
1907         }
1908 out:
1909         release_sock(sk);
1910
1911         return rc;
1912 }
1913
1914 /* must look like tcp */
1915 static const struct proto_ops smc_sock_ops = {
1916         .family         = PF_SMC,
1917         .owner          = THIS_MODULE,
1918         .release        = smc_release,
1919         .bind           = smc_bind,
1920         .connect        = smc_connect,
1921         .socketpair     = sock_no_socketpair,
1922         .accept         = smc_accept,
1923         .getname        = smc_getname,
1924         .poll           = smc_poll,
1925         .ioctl          = smc_ioctl,
1926         .listen         = smc_listen,
1927         .shutdown       = smc_shutdown,
1928         .setsockopt     = smc_setsockopt,
1929         .getsockopt     = smc_getsockopt,
1930         .sendmsg        = smc_sendmsg,
1931         .recvmsg        = smc_recvmsg,
1932         .mmap           = sock_no_mmap,
1933         .sendpage       = smc_sendpage,
1934         .splice_read    = smc_splice_read,
1935 };
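/* Illustrative userspace usage (a sketch, not part of this source file):
 * an AF_SMC socket is created and used like a TCP socket, e.g.
 *
 *      int fd = socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC); // SMCPROTO_SMC6 for IPv6
 *
 * followed by the usual bind()/listen()/accept() or connect() calls;
 * setups that cannot use SMC fall back to TCP via the internal clcsock.
 */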
1936
1937 static int smc_create(struct net *net, struct socket *sock, int protocol,
1938                       int kern)
1939 {
1940         int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
1941         struct smc_sock *smc;
1942         struct sock *sk;
1943         int rc;
1944
1945         rc = -ESOCKTNOSUPPORT;
1946         if (sock->type != SOCK_STREAM)
1947                 goto out;
1948
1949         rc = -EPROTONOSUPPORT;
1950         if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6)
1951                 goto out;
1952
1953         rc = -ENOBUFS;
1954         sock->ops = &smc_sock_ops;
1955         sk = smc_sock_alloc(net, sock, protocol);
1956         if (!sk)
1957                 goto out;
1958
1959         /* create internal TCP socket for CLC handshake and fallback */
1960         smc = smc_sk(sk);
1961         smc->use_fallback = false; /* assume rdma capability first */
1962         smc->fallback_rsn = 0;
1963         rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
1964                               &smc->clcsock);
1965         if (rc) {
1966                 sk_common_release(sk);
1967                 goto out;
1968         }
1969         smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
1970         smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
1971
1972 out:
1973         return rc;
1974 }
1975
1976 static const struct net_proto_family smc_sock_family_ops = {
1977         .family = PF_SMC,
1978         .owner  = THIS_MODULE,
1979         .create = smc_create,
1980 };
1981
1982 unsigned int smc_net_id;
1983
1984 static __net_init int smc_net_init(struct net *net)
1985 {
1986         return smc_pnet_net_init(net);
1987 }
1988
1989 static void __net_exit smc_net_exit(struct net *net)
1990 {
1991         smc_pnet_net_exit(net);
1992 }
1993
1994 static struct pernet_operations smc_net_ops = {
1995         .init = smc_net_init,
1996         .exit = smc_net_exit,
1997         .id   = &smc_net_id,
1998         .size = sizeof(struct smc_net),
1999 };
2000
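/* Module init: register the pernet subsystem and pnet tables, the LLC and
 * CDC message handlers, the SMC protos for IPv4 and IPv6, the PF_SMC socket
 * family and the IB client, then enable the tcp_have_smc static key so that
 * TCP connection setup can negotiate SMC.
 */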
2001 static int __init smc_init(void)
2002 {
2003         int rc;
2004
2005         rc = register_pernet_subsys(&smc_net_ops);
2006         if (rc)
2007                 return rc;
2008
2009         rc = smc_pnet_init();
2010         if (rc)
2011                 return rc;
2012
2013         rc = smc_llc_init();
2014         if (rc) {
2015                 pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
2016                 goto out_pnet;
2017         }
2018
2019         rc = smc_cdc_init();
2020         if (rc) {
2021                 pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
2022                 goto out_pnet;
2023         }
2024
2025         rc = proto_register(&smc_proto, 1);
2026         if (rc) {
2027                 pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
2028                 goto out_pnet;
2029         }
2030
2031         rc = proto_register(&smc_proto6, 1);
2032         if (rc) {
2033                 pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc);
2034                 goto out_proto;
2035         }
2036
2037         rc = sock_register(&smc_sock_family_ops);
2038         if (rc) {
2039                 pr_err("%s: sock_register fails with %d\n", __func__, rc);
2040                 goto out_proto6;
2041         }
2042         INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
2043         INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);
2044
2045         rc = smc_ib_register_client();
2046         if (rc) {
2047                 pr_err("%s: ib_register fails with %d\n", __func__, rc);
2048                 goto out_sock;
2049         }
2050
2051         static_branch_enable(&tcp_have_smc);
2052         return 0;
2053
2054 out_sock:
2055         sock_unregister(PF_SMC);
2056 out_proto6:
2057         proto_unregister(&smc_proto6);
2058 out_proto:
2059         proto_unregister(&smc_proto);
2060 out_pnet:
2061         smc_pnet_exit();
2062         return rc;
2063 }
2064
2065 static void __exit smc_exit(void)
2066 {
2067         smc_core_exit();
2068         static_branch_disable(&tcp_have_smc);
2069         smc_ib_unregister_client();
2070         sock_unregister(PF_SMC);
2071         proto_unregister(&smc_proto6);
2072         proto_unregister(&smc_proto);
2073         smc_pnet_exit();
2074         unregister_pernet_subsys(&smc_net_ops);
2075 }
2076
2077 module_init(smc_init);
2078 module_exit(smc_exit);
2079
2080 MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
2081 MODULE_DESCRIPTION("smc socket address family");
2082 MODULE_LICENSE("GPL");
2083 MODULE_ALIAS_NETPROTO(PF_SMC);