net/smc/af_smc.c
/*
 *  Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
 *  applies to SOCK_STREAM sockets only
 *  offers an alternative communication option for TCP-protocol sockets
 *  applicable with RoCE-cards only
 *
 *  Initial restrictions:
 *    - support for alternate links postponed
 *
 *  Copyright IBM Corp. 2016, 2018
 *
 *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 *              based on prototype from Frank Blaschka
 */

#define KMSG_COMPONENT "smc"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/module.h>
#include <linux/socket.h>
#include <linux/workqueue.h>
#include <linux/in.h>
#include <linux/sched/signal.h>
#include <linux/if_vlan.h>

#include <net/sock.h>
#include <net/tcp.h>
#include <net/smc.h>
#include <asm/ioctls.h>

#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include "smc_netns.h"

#include "smc.h"
#include "smc_clc.h"
#include "smc_llc.h"
#include "smc_cdc.h"
#include "smc_core.h"
#include "smc_ib.h"
#include "smc_ism.h"
#include "smc_pnet.h"
#include "smc_tx.h"
#include "smc_rx.h"
#include "smc_close.h"

static DEFINE_MUTEX(smc_server_lgr_pending);    /* serialize link group
                                                 * creation on server
                                                 */
static DEFINE_MUTEX(smc_client_lgr_pending);    /* serialize link group
                                                 * creation on client
                                                 */

static void smc_tcp_listen_work(struct work_struct *);
static void smc_connect_work(struct work_struct *);

static void smc_set_keepalive(struct sock *sk, int val)
{
        struct smc_sock *smc = smc_sk(sk);

        smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
}

static struct smc_hashinfo smc_v4_hashinfo = {
        .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
};

static struct smc_hashinfo smc_v6_hashinfo = {
        .lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
};

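/* hash an SMC socket into its protocol's hash table; the table is a single
 * hlist protected by a read/write lock
 */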
int smc_hash_sk(struct sock *sk)
{
        struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
        struct hlist_head *head;

        head = &h->ht;

        write_lock_bh(&h->lock);
        sk_add_node(sk, head);
        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
        write_unlock_bh(&h->lock);

        return 0;
}
EXPORT_SYMBOL_GPL(smc_hash_sk);

void smc_unhash_sk(struct sock *sk)
{
        struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;

        write_lock_bh(&h->lock);
        if (sk_del_node_init(sk))
                sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
        write_unlock_bh(&h->lock);
}
EXPORT_SYMBOL_GPL(smc_unhash_sk);

struct proto smc_proto = {
        .name           = "SMC",
        .owner          = THIS_MODULE,
        .keepalive      = smc_set_keepalive,
        .hash           = smc_hash_sk,
        .unhash         = smc_unhash_sk,
        .obj_size       = sizeof(struct smc_sock),
        .h.smc_hash     = &smc_v4_hashinfo,
        .slab_flags     = SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto);

struct proto smc_proto6 = {
        .name           = "SMC6",
        .owner          = THIS_MODULE,
        .keepalive      = smc_set_keepalive,
        .hash           = smc_hash_sk,
        .unhash         = smc_unhash_sk,
        .obj_size       = sizeof(struct smc_sock),
        .h.smc_hash     = &smc_v6_hashinfo,
        .slab_flags     = SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto6);

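/* close an SMC socket: terminate the SMC connection (or shut down the
 * fallback TCP socket), release the internal CLC socket and drop the
 * final sock reference
 */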
static int smc_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct smc_sock *smc;
        int rc = 0;

        if (!sk)
                goto out;

        smc = smc_sk(sk);

        /* cleanup for a dangling non-blocking connect */
        if (smc->connect_info && sk->sk_state == SMC_INIT)
                tcp_abort(smc->clcsock->sk, ECONNABORTED);
        flush_work(&smc->connect_work);
        kfree(smc->connect_info);
        smc->connect_info = NULL;

        if (sk->sk_state == SMC_LISTEN)
                /* smc_close_non_accepted() is called and acquires
                 * sock lock for child sockets again
                 */
                lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
        else
                lock_sock(sk);

        if (!smc->use_fallback) {
                rc = smc_close_active(smc);
                sock_set_flag(sk, SOCK_DEAD);
                sk->sk_shutdown |= SHUTDOWN_MASK;
        } else {
                if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT)
                        sock_put(sk); /* passive closing */
                if (sk->sk_state == SMC_LISTEN) {
                        /* wake up clcsock accept */
                        rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
                }
                sk->sk_state = SMC_CLOSED;
                sk->sk_state_change(sk);
        }

        sk->sk_prot->unhash(sk);

        if (sk->sk_state == SMC_CLOSED) {
                if (smc->clcsock) {
                        release_sock(sk);
                        smc_clcsock_release(smc);
                        lock_sock(sk);
                }
                if (!smc->use_fallback)
                        smc_conn_free(&smc->conn);
        }

        /* detach socket */
        sock_orphan(sk);
        sock->sk = NULL;
        release_sock(sk);

        sock_put(sk); /* final sock_put */
out:
        return rc;
}

static void smc_destruct(struct sock *sk)
{
        if (sk->sk_state != SMC_CLOSED)
                return;
        if (!sock_flag(sk, SOCK_DEAD))
                return;

        sk_refcnt_debug_dec(sk);
}

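/* allocate and initialize an SMC socket for @protocol and attach it to
 * @sock; the socket starts out in state SMC_INIT
 */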
static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
                                   int protocol)
{
        struct smc_sock *smc;
        struct proto *prot;
        struct sock *sk;

        prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
        sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
        if (!sk)
                return NULL;

        sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
        sk->sk_state = SMC_INIT;
        sk->sk_destruct = smc_destruct;
        sk->sk_protocol = protocol;
        smc = smc_sk(sk);
        INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
        INIT_WORK(&smc->connect_work, smc_connect_work);
        INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
        INIT_LIST_HEAD(&smc->accept_q);
        spin_lock_init(&smc->accept_q_lock);
        spin_lock_init(&smc->conn.send_lock);
        sk->sk_prot->hash(sk);
        sk_refcnt_debug_inc(sk);
        mutex_init(&smc->clcsock_release_lock);

        return sk;
}

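/* bind: replicate the inet_bind() sanity checks, then bind the internal
 * CLC/TCP socket to the given address
 */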
static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
                    int addr_len)
{
        struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
        struct sock *sk = sock->sk;
        struct smc_sock *smc;
        int rc;

        smc = smc_sk(sk);

        /* replicate tests from inet_bind(), to be safe wrt. future changes */
        rc = -EINVAL;
        if (addr_len < sizeof(struct sockaddr_in))
                goto out;

        rc = -EAFNOSUPPORT;
        if (addr->sin_family != AF_INET &&
            addr->sin_family != AF_INET6 &&
            addr->sin_family != AF_UNSPEC)
                goto out;
        /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
        if (addr->sin_family == AF_UNSPEC &&
            addr->sin_addr.s_addr != htonl(INADDR_ANY))
                goto out;

        lock_sock(sk);

        /* Check if socket is already active */
        rc = -EINVAL;
        if (sk->sk_state != SMC_INIT)
                goto out_rel;

        smc->clcsock->sk->sk_reuse = sk->sk_reuse;
        rc = kernel_bind(smc->clcsock, uaddr, addr_len);

out_rel:
        release_sock(sk);
out:
        return rc;
}

static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
                                   unsigned long mask)
{
        /* options we don't get control via setsockopt for */
        nsk->sk_type = osk->sk_type;
        nsk->sk_sndbuf = osk->sk_sndbuf;
        nsk->sk_rcvbuf = osk->sk_rcvbuf;
        nsk->sk_sndtimeo = osk->sk_sndtimeo;
        nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
        nsk->sk_mark = osk->sk_mark;
        nsk->sk_priority = osk->sk_priority;
        nsk->sk_rcvlowat = osk->sk_rcvlowat;
        nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
        nsk->sk_err = osk->sk_err;

        nsk->sk_flags &= ~mask;
        nsk->sk_flags |= osk->sk_flags & mask;
}

#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
                             (1UL << SOCK_KEEPOPEN) | \
                             (1UL << SOCK_LINGER) | \
                             (1UL << SOCK_BROADCAST) | \
                             (1UL << SOCK_TIMESTAMP) | \
                             (1UL << SOCK_DBG) | \
                             (1UL << SOCK_RCVTSTAMP) | \
                             (1UL << SOCK_RCVTSTAMPNS) | \
                             (1UL << SOCK_LOCALROUTE) | \
                             (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
                             (1UL << SOCK_RXQ_OVFL) | \
                             (1UL << SOCK_WIFI_STATUS) | \
                             (1UL << SOCK_NOFCS) | \
                             (1UL << SOCK_FILTER_LOCKED) | \
                             (1UL << SOCK_TSTAMP_NEW))
/* copy only relevant settings and flags of SOL_SOCKET level from smc to
 * clc socket (since smc is not called for these options from net/core)
 */
static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
{
        smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
}

#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
                             (1UL << SOCK_KEEPOPEN) | \
                             (1UL << SOCK_LINGER) | \
                             (1UL << SOCK_DBG))
/* copy only settings and flags relevant for smc from clc to smc socket */
static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
{
        smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
}

/* register a new rmb, send confirm_rkey msg to register with peer */
static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc,
                       bool conf_rkey)
{
        if (!rmb_desc->wr_reg) {
                /* register memory region for new rmb */
                if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) {
                        rmb_desc->regerr = 1;
                        return -EFAULT;
                }
                rmb_desc->wr_reg = 1;
        }
        if (!conf_rkey)
                return 0;
        /* exchange confirm_rkey msg with peer */
        if (smc_llc_do_confirm_rkey(link, rmb_desc)) {
                rmb_desc->regerr = 1;
                return -EFAULT;
        }
        return 0;
}

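/* client part of the CONFIRM LINK / ADD LINK LLC handshake that activates
 * the first (and currently only) link of a new link group
 */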
static int smc_clnt_conf_first_link(struct smc_sock *smc)
{
        struct net *net = sock_net(smc->clcsock->sk);
        struct smc_link_group *lgr = smc->conn.lgr;
        struct smc_link *link;
        int rest;
        int rc;

        link = &lgr->lnk[SMC_SINGLE_LINK];
        /* receive CONFIRM LINK request from server over RoCE fabric */
        rest = wait_for_completion_interruptible_timeout(
                &link->llc_confirm,
                SMC_LLC_WAIT_FIRST_TIME);
        if (rest <= 0) {
                struct smc_clc_msg_decline dclc;

                rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
                                      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
                return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
        }

        if (link->llc_confirm_rc)
                return SMC_CLC_DECL_RMBE_EC;

        rc = smc_ib_modify_qp_rts(link);
        if (rc)
                return SMC_CLC_DECL_ERR_RDYLNK;

        smc_wr_remember_qp_attr(link);

        if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
                return SMC_CLC_DECL_ERR_REGRMB;

        /* send CONFIRM LINK response over RoCE fabric */
        rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP);
        if (rc < 0)
                return SMC_CLC_DECL_TIMEOUT_CL;

        /* receive ADD LINK request from server over RoCE fabric */
        rest = wait_for_completion_interruptible_timeout(&link->llc_add,
                                                         SMC_LLC_WAIT_TIME);
        if (rest <= 0) {
                struct smc_clc_msg_decline dclc;

                rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
                                      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
                return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc;
        }

        /* send add link reject message, only one link supported for now */
        rc = smc_llc_send_add_link(link,
                                   link->smcibdev->mac[link->ibport - 1],
                                   link->gid, SMC_LLC_RESP);
        if (rc < 0)
                return SMC_CLC_DECL_TIMEOUT_AL;

        smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);

        return 0;
}

static void smcr_conn_save_peer_info(struct smc_sock *smc,
                                     struct smc_clc_msg_accept_confirm *clc)
{
        int bufsize = smc_uncompress_bufsize(clc->rmbe_size);

        smc->conn.peer_rmbe_idx = clc->rmbe_idx;
        smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
        smc->conn.peer_rmbe_size = bufsize;
        atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
        smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
}

static void smcd_conn_save_peer_info(struct smc_sock *smc,
                                     struct smc_clc_msg_accept_confirm *clc)
{
        int bufsize = smc_uncompress_bufsize(clc->dmbe_size);

        smc->conn.peer_rmbe_idx = clc->dmbe_idx;
        smc->conn.peer_token = clc->token;
        /* msg header takes up space in the buffer */
        smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
        atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
        smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx;
}

static void smc_conn_save_peer_info(struct smc_sock *smc,
                                    struct smc_clc_msg_accept_confirm *clc)
{
        if (smc->conn.lgr->is_smcd)
                smcd_conn_save_peer_info(smc, clc);
        else
                smcr_conn_save_peer_info(smc, clc);
}

static void smc_link_save_peer_info(struct smc_link *link,
                                    struct smc_clc_msg_accept_confirm *clc)
{
        link->peer_qpn = ntoh24(clc->qpn);
        memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
        memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
        link->peer_psn = ntoh24(clc->psn);
        link->peer_mtu = clc->qp_mtu;
}

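/* switch an SMC socket to TCP fallback mode; let the socket's file work on
 * the internal CLC socket so that data flows through TCP directly
 */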
static void smc_switch_to_fallback(struct smc_sock *smc)
{
        smc->use_fallback = true;
        if (smc->sk.sk_socket && smc->sk.sk_socket->file) {
                smc->clcsock->file = smc->sk.sk_socket->file;
                smc->clcsock->file->private_data = smc->clcsock;
        }
}

/* fall back during connect */
static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
{
        smc_switch_to_fallback(smc);
        smc->fallback_rsn = reason_code;
        smc_copy_sock_settings_to_clc(smc);
        if (smc->sk.sk_state == SMC_INIT)
                smc->sk.sk_state = SMC_ACTIVE;
        return 0;
}

/* decline and fall back during connect */
static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
{
        int rc;

        if (reason_code < 0) { /* error, fallback is not possible */
                if (smc->sk.sk_state == SMC_INIT)
                        sock_put(&smc->sk); /* passive closing */
                return reason_code;
        }
        if (reason_code != SMC_CLC_DECL_PEERDECL) {
                rc = smc_clc_send_decline(smc, reason_code);
                if (rc < 0) {
                        if (smc->sk.sk_state == SMC_INIT)
                                sock_put(&smc->sk); /* passive closing */
                        return rc;
                }
        }
        return smc_connect_fallback(smc, reason_code);
}

/* abort connecting */
static int smc_connect_abort(struct smc_sock *smc, int reason_code,
                             int local_contact)
{
        if (local_contact == SMC_FIRST_CONTACT)
                smc_lgr_forget(smc->conn.lgr);
        if (smc->conn.lgr->is_smcd)
                /* there is only one lgr role for SMC-D; use server lock */
                mutex_unlock(&smc_server_lgr_pending);
        else
                mutex_unlock(&smc_client_lgr_pending);

        smc_conn_free(&smc->conn);
        return reason_code;
}

/* check if there is a rdma device available for this connection. */
/* called for connect and listen */
static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev,
                          u8 *ibport, unsigned short vlan_id, u8 gid[])
{
        int reason_code = 0;

        /* PNET table look up: search active ib_device and port
         * within same PNETID that also contains the ethernet device
         * used for the internal TCP socket
         */
        smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport, vlan_id,
                                    gid);
        if (!(*ibdev))
                reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */

        return reason_code;
}

/* check if there is an ISM device available for this connection. */
/* called for connect and listen */
static int smc_check_ism(struct smc_sock *smc, struct smcd_dev **ismdev)
{
        /* Find ISM device with same PNETID as connecting interface  */
        smc_pnet_find_ism_resource(smc->clcsock->sk, ismdev);
        if (!(*ismdev))
                return SMC_CLC_DECL_CNFERR; /* configuration error */
        return 0;
}

/* Check for VLAN ID and register it on ISM device just for CLC handshake */
static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
                                      struct smcd_dev *ismdev,
                                      unsigned short vlan_id)
{
        if (vlan_id && smc_ism_get_vlan(ismdev, vlan_id))
                return SMC_CLC_DECL_CNFERR;
        return 0;
}

/* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is
 * used, the VLAN ID will be registered again during the connection setup.
 */
static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd,
                                        struct smcd_dev *ismdev,
                                        unsigned short vlan_id)
{
        if (!is_smcd)
                return 0;
        if (vlan_id && smc_ism_put_vlan(ismdev, vlan_id))
                return SMC_CLC_DECL_CNFERR;
        return 0;
}

/* CLC handshake during connect */
static int smc_connect_clc(struct smc_sock *smc, int smc_type,
                           struct smc_clc_msg_accept_confirm *aclc,
                           struct smc_ib_device *ibdev, u8 ibport,
                           u8 gid[], struct smcd_dev *ismdev)
{
        int rc = 0;

        /* do inband token exchange */
        rc = smc_clc_send_proposal(smc, smc_type, ibdev, ibport, gid, ismdev);
        if (rc)
                return rc;
        /* receive SMC Accept CLC message */
        return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT,
                                CLC_WAIT_TIME);
}

/* setup for RDMA connection of client */
static int smc_connect_rdma(struct smc_sock *smc,
                            struct smc_clc_msg_accept_confirm *aclc,
                            struct smc_ib_device *ibdev, u8 ibport)
{
        int local_contact = SMC_FIRST_CONTACT;
        struct smc_link *link;
        int reason_code = 0;

        mutex_lock(&smc_client_lgr_pending);
        local_contact = smc_conn_create(smc, false, aclc->hdr.flag, ibdev,
                                        ibport, ntoh24(aclc->qpn), &aclc->lcl,
                                        NULL, 0);
        if (local_contact < 0) {
                if (local_contact == -ENOMEM)
                        reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
                else if (local_contact == -ENOLINK)
                        reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
                else
                        reason_code = SMC_CLC_DECL_INTERR; /* other error */
                mutex_unlock(&smc_client_lgr_pending);
                return reason_code;
        }
        link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];

        smc_conn_save_peer_info(smc, aclc);

        /* create send buffer and rmb */
        if (smc_buf_create(smc, false))
                return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact);

        if (local_contact == SMC_FIRST_CONTACT)
                smc_link_save_peer_info(link, aclc);

        if (smc_rmb_rtoken_handling(&smc->conn, aclc))
                return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK,
                                         local_contact);

        smc_close_init(smc);
        smc_rx_init(smc);

        if (local_contact == SMC_FIRST_CONTACT) {
                if (smc_ib_ready_link(link))
                        return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK,
                                                 local_contact);
        } else {
                if (smc_reg_rmb(link, smc->conn.rmb_desc, true))
                        return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB,
                                                 local_contact);
        }
        smc_rmb_sync_sg_for_device(&smc->conn);

        reason_code = smc_clc_send_confirm(smc);
        if (reason_code)
                return smc_connect_abort(smc, reason_code, local_contact);

        smc_tx_init(smc);

        if (local_contact == SMC_FIRST_CONTACT) {
                /* QP confirmation over RoCE fabric */
                reason_code = smc_clnt_conf_first_link(smc);
                if (reason_code)
                        return smc_connect_abort(smc, reason_code,
                                                 local_contact);
        }
        mutex_unlock(&smc_client_lgr_pending);

        smc_copy_sock_settings_to_clc(smc);
        if (smc->sk.sk_state == SMC_INIT)
                smc->sk.sk_state = SMC_ACTIVE;

        return 0;
}

/* setup for ISM connection of client */
static int smc_connect_ism(struct smc_sock *smc,
                           struct smc_clc_msg_accept_confirm *aclc,
                           struct smcd_dev *ismdev)
{
        int local_contact = SMC_FIRST_CONTACT;
        int rc = 0;

        /* there is only one lgr role for SMC-D; use server lock */
        mutex_lock(&smc_server_lgr_pending);
        local_contact = smc_conn_create(smc, true, aclc->hdr.flag, NULL, 0, 0,
                                        NULL, ismdev, aclc->gid);
        if (local_contact < 0) {
                mutex_unlock(&smc_server_lgr_pending);
                return SMC_CLC_DECL_MEM;
        }

        /* Create send and receive buffers */
        if (smc_buf_create(smc, true))
                return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact);

        smc_conn_save_peer_info(smc, aclc);
        smc_close_init(smc);
        smc_rx_init(smc);
        smc_tx_init(smc);

        rc = smc_clc_send_confirm(smc);
        if (rc)
                return smc_connect_abort(smc, rc, local_contact);
        mutex_unlock(&smc_server_lgr_pending);

        smc_copy_sock_settings_to_clc(smc);
        if (smc->sk.sk_state == SMC_INIT)
                smc->sk.sk_state = SMC_ACTIVE;

        return 0;
}

/* perform steps before actually connecting */
static int __smc_connect(struct smc_sock *smc)
{
        bool ism_supported = false, rdma_supported = false;
        struct smc_clc_msg_accept_confirm aclc;
        struct smc_ib_device *ibdev;
        struct smcd_dev *ismdev;
        u8 gid[SMC_GID_SIZE];
        unsigned short vlan;
        int smc_type;
        int rc = 0;
        u8 ibport;

        sock_hold(&smc->sk); /* sock put in passive closing */

        if (smc->use_fallback)
                return smc_connect_fallback(smc, smc->fallback_rsn);

        /* if peer has not signalled SMC-capability, fall back */
        if (!tcp_sk(smc->clcsock->sk)->syn_smc)
                return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC);

        /* IPSec connections opt out of SMC-R optimizations */
        if (using_ipsec(smc))
                return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC);

        /* check for VLAN ID */
        if (smc_vlan_by_tcpsk(smc->clcsock, &vlan))
                return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR);

        /* check if there is an ism device available */
        if (!smc_check_ism(smc, &ismdev) &&
            !smc_connect_ism_vlan_setup(smc, ismdev, vlan)) {
                /* ISM is supported for this connection */
                ism_supported = true;
                smc_type = SMC_TYPE_D;
        }

        /* check if there is a rdma device available */
        if (!smc_check_rdma(smc, &ibdev, &ibport, vlan, gid)) {
                /* RDMA is supported for this connection */
                rdma_supported = true;
                if (ism_supported)
                        smc_type = SMC_TYPE_B; /* both */
                else
                        smc_type = SMC_TYPE_R; /* only RDMA */
        }

        /* if neither ISM nor RDMA are supported, fallback */
        if (!rdma_supported && !ism_supported)
                return smc_connect_decline_fallback(smc, SMC_CLC_DECL_NOSMCDEV);

        /* perform CLC handshake */
        rc = smc_connect_clc(smc, smc_type, &aclc, ibdev, ibport, gid, ismdev);
        if (rc) {
                smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
                return smc_connect_decline_fallback(smc, rc);
        }

        /* depending on previous steps, connect using rdma or ism */
        if (rdma_supported && aclc.hdr.path == SMC_TYPE_R)
                rc = smc_connect_rdma(smc, &aclc, ibdev, ibport);
        else if (ism_supported && aclc.hdr.path == SMC_TYPE_D)
                rc = smc_connect_ism(smc, &aclc, ismdev);
        else
                rc = SMC_CLC_DECL_MODEUNSUPP;
        if (rc) {
                smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
                return smc_connect_decline_fallback(smc, rc);
        }

        smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
        return 0;
}

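/* worker for a non-blocking connect: perform the TCP connect on the
 * internal CLC socket first, then run the SMC handshake via __smc_connect()
 */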
static void smc_connect_work(struct work_struct *work)
{
        struct smc_sock *smc = container_of(work, struct smc_sock,
                                            connect_work);
        int rc;

        lock_sock(&smc->sk);
        rc = kernel_connect(smc->clcsock, &smc->connect_info->addr,
                            smc->connect_info->alen, smc->connect_info->flags);
        if (smc->clcsock->sk->sk_err) {
                smc->sk.sk_err = smc->clcsock->sk->sk_err;
                goto out;
        }
        if (rc < 0) {
                smc->sk.sk_err = -rc;
                goto out;
        }

        rc = __smc_connect(smc);
        if (rc < 0)
                smc->sk.sk_err = -rc;

out:
        if (!sock_flag(&smc->sk, SOCK_DEAD)) {
                if (smc->sk.sk_err) {
                        smc->sk.sk_state_change(&smc->sk);
                } else { /* allow polling before and after fallback decision */
                        smc->clcsock->sk->sk_write_space(smc->clcsock->sk);
                        smc->sk.sk_write_space(&smc->sk);
                }
        }
        kfree(smc->connect_info);
        smc->connect_info = NULL;
        release_sock(&smc->sk);
}

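/* connect() entry point: validate the address, then either run the SMC
 * handshake synchronously or, for O_NONBLOCK, defer it to the connect worker
 */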
static int smc_connect(struct socket *sock, struct sockaddr *addr,
                       int alen, int flags)
{
        struct sock *sk = sock->sk;
        struct smc_sock *smc;
        int rc = -EINVAL;

        smc = smc_sk(sk);

        /* separate smc parameter checking to be safe */
        if (alen < sizeof(addr->sa_family))
                goto out_err;
        if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
                goto out_err;

        lock_sock(sk);
        switch (sk->sk_state) {
        default:
                goto out;
        case SMC_ACTIVE:
                rc = -EISCONN;
                goto out;
        case SMC_INIT:
                rc = 0;
                break;
        }

        smc_copy_sock_settings_to_clc(smc);
        tcp_sk(smc->clcsock->sk)->syn_smc = 1;
        if (flags & O_NONBLOCK) {
                if (smc->connect_info) {
                        rc = -EALREADY;
                        goto out;
                }
                smc->connect_info = kzalloc(alen + 2 * sizeof(int), GFP_KERNEL);
                if (!smc->connect_info) {
                        rc = -ENOMEM;
                        goto out;
                }
                smc->connect_info->alen = alen;
                smc->connect_info->flags = flags ^ O_NONBLOCK;
                memcpy(&smc->connect_info->addr, addr, alen);
                schedule_work(&smc->connect_work);
                rc = -EINPROGRESS;
        } else {
                rc = kernel_connect(smc->clcsock, addr, alen, flags);
                if (rc)
                        goto out;

                rc = __smc_connect(smc);
                if (rc < 0)
                        goto out;
                else
                        rc = 0; /* success cases including fallback */
        }

out:
        release_sock(sk);
out_err:
        return rc;
}

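/* accept a connection on the internal CLC/TCP listen socket and allocate a
 * new SMC socket for it
 */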
static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
{
        struct socket *new_clcsock = NULL;
        struct sock *lsk = &lsmc->sk;
        struct sock *new_sk;
        int rc = -EINVAL;

        release_sock(lsk);
        new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
        if (!new_sk) {
                rc = -ENOMEM;
                lsk->sk_err = ENOMEM;
                *new_smc = NULL;
                lock_sock(lsk);
                goto out;
        }
        *new_smc = smc_sk(new_sk);

        mutex_lock(&lsmc->clcsock_release_lock);
        if (lsmc->clcsock)
                rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
        mutex_unlock(&lsmc->clcsock_release_lock);
        lock_sock(lsk);
        if  (rc < 0)
                lsk->sk_err = -rc;
        if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
                new_sk->sk_prot->unhash(new_sk);
                if (new_clcsock)
                        sock_release(new_clcsock);
                new_sk->sk_state = SMC_CLOSED;
                sock_set_flag(new_sk, SOCK_DEAD);
                sock_put(new_sk); /* final */
                *new_smc = NULL;
                goto out;
        }

        (*new_smc)->clcsock = new_clcsock;
out:
        return rc;
}

/* add a just created sock to the accept queue of the listen sock as
 * candidate for a following socket accept call from user space
 */
static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
{
        struct smc_sock *par = smc_sk(parent);

        sock_hold(sk); /* sock_put in smc_accept_unlink () */
        spin_lock(&par->accept_q_lock);
        list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
        spin_unlock(&par->accept_q_lock);
        sk_acceptq_added(parent);
}

/* remove a socket from the accept queue of its parental listening socket */
static void smc_accept_unlink(struct sock *sk)
{
        struct smc_sock *par = smc_sk(sk)->listen_smc;

        spin_lock(&par->accept_q_lock);
        list_del_init(&smc_sk(sk)->accept_q);
        spin_unlock(&par->accept_q_lock);
        sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
        sock_put(sk); /* sock_hold in smc_accept_enqueue */
}

/* remove a sock from the accept queue to bind it to a new socket created
 * for a socket accept call from user space
 */
struct sock *smc_accept_dequeue(struct sock *parent,
                                struct socket *new_sock)
{
        struct smc_sock *isk, *n;
        struct sock *new_sk;

        list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
                new_sk = (struct sock *)isk;

                smc_accept_unlink(new_sk);
                if (new_sk->sk_state == SMC_CLOSED) {
                        new_sk->sk_prot->unhash(new_sk);
                        if (isk->clcsock) {
                                sock_release(isk->clcsock);
                                isk->clcsock = NULL;
                        }
                        sock_put(new_sk); /* final */
                        continue;
                }
                if (new_sock) {
                        sock_graft(new_sk, new_sock);
                        if (isk->use_fallback) {
                                smc_sk(new_sk)->clcsock->file = new_sock->file;
                                isk->clcsock->file->private_data = isk->clcsock;
                        }
                }
                return new_sk;
        }
        return NULL;
}

/* clean up for a created but never accepted sock */
void smc_close_non_accepted(struct sock *sk)
{
        struct smc_sock *smc = smc_sk(sk);

        lock_sock(sk);
        if (!sk->sk_lingertime)
                /* wait for peer closing */
                sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
        if (!smc->use_fallback) {
                smc_close_active(smc);
                sock_set_flag(sk, SOCK_DEAD);
                sk->sk_shutdown |= SHUTDOWN_MASK;
        }
        sk->sk_prot->unhash(sk);
        if (smc->clcsock) {
                struct socket *tcp;

                tcp = smc->clcsock;
                smc->clcsock = NULL;
                sock_release(tcp);
        }
        if (smc->use_fallback) {
                sock_put(sk); /* passive closing */
                sk->sk_state = SMC_CLOSED;
        } else {
                if (sk->sk_state == SMC_CLOSED)
                        smc_conn_free(&smc->conn);
        }
        release_sock(sk);
        sock_put(sk); /* final sock_put */
}

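/* server part of the CONFIRM LINK / ADD LINK LLC handshake for the first
 * link of a new link group
 */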
static int smc_serv_conf_first_link(struct smc_sock *smc)
{
        struct net *net = sock_net(smc->clcsock->sk);
        struct smc_link_group *lgr = smc->conn.lgr;
        struct smc_link *link;
        int rest;
        int rc;

        link = &lgr->lnk[SMC_SINGLE_LINK];

        if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
                return SMC_CLC_DECL_ERR_REGRMB;

        /* send CONFIRM LINK request to client over the RoCE fabric */
        rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ);
        if (rc < 0)
                return SMC_CLC_DECL_TIMEOUT_CL;

        /* receive CONFIRM LINK response from client over the RoCE fabric */
        rest = wait_for_completion_interruptible_timeout(
                &link->llc_confirm_resp,
                SMC_LLC_WAIT_FIRST_TIME);
        if (rest <= 0) {
                struct smc_clc_msg_decline dclc;

                rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
                                      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
                return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
        }

        if (link->llc_confirm_resp_rc)
                return SMC_CLC_DECL_RMBE_EC;

        /* send ADD LINK request to client over the RoCE fabric */
        rc = smc_llc_send_add_link(link,
                                   link->smcibdev->mac[link->ibport - 1],
                                   link->gid, SMC_LLC_REQ);
        if (rc < 0)
                return SMC_CLC_DECL_TIMEOUT_AL;

        /* receive ADD LINK response from client over the RoCE fabric */
        rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp,
                                                         SMC_LLC_WAIT_TIME);
        if (rest <= 0) {
                struct smc_clc_msg_decline dclc;

                rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
                                      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
                return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc;
        }

        smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);

        return 0;
}

/* listen worker: finish */
static void smc_listen_out(struct smc_sock *new_smc)
{
        struct smc_sock *lsmc = new_smc->listen_smc;
        struct sock *newsmcsk = &new_smc->sk;

        if (lsmc->sk.sk_state == SMC_LISTEN) {
                lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
                smc_accept_enqueue(&lsmc->sk, newsmcsk);
                release_sock(&lsmc->sk);
        } else { /* no longer listening */
                smc_close_non_accepted(newsmcsk);
        }

        /* Wake up accept */
        lsmc->sk.sk_data_ready(&lsmc->sk);
        sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
}

/* listen worker: finish in state connected */
static void smc_listen_out_connected(struct smc_sock *new_smc)
{
        struct sock *newsmcsk = &new_smc->sk;

        sk_refcnt_debug_inc(newsmcsk);
        if (newsmcsk->sk_state == SMC_INIT)
                newsmcsk->sk_state = SMC_ACTIVE;

        smc_listen_out(new_smc);
}

/* listen worker: finish in error state */
static void smc_listen_out_err(struct smc_sock *new_smc)
{
        struct sock *newsmcsk = &new_smc->sk;

        if (newsmcsk->sk_state == SMC_INIT)
                sock_put(&new_smc->sk); /* passive closing */
        newsmcsk->sk_state = SMC_CLOSED;
        smc_conn_free(&new_smc->conn);

        smc_listen_out(new_smc);
}

/* listen worker: decline and fall back if possible */
static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
                               int local_contact)
{
        /* RDMA setup failed, switch back to TCP */
        if (local_contact == SMC_FIRST_CONTACT)
                smc_lgr_forget(new_smc->conn.lgr);
        if (reason_code < 0) { /* error, no fallback possible */
                smc_listen_out_err(new_smc);
                return;
        }
        smc_conn_free(&new_smc->conn);
        smc_switch_to_fallback(new_smc);
        new_smc->fallback_rsn = reason_code;
        if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
                if (smc_clc_send_decline(new_smc, reason_code) < 0) {
                        smc_listen_out_err(new_smc);
                        return;
                }
        }
        smc_listen_out_connected(new_smc);
}

/* listen worker: check prefixes */
static int smc_listen_rdma_check(struct smc_sock *new_smc,
                                 struct smc_clc_msg_proposal *pclc)
{
        struct smc_clc_msg_proposal_prefix *pclc_prfx;
        struct socket *newclcsock = new_smc->clcsock;

        pclc_prfx = smc_clc_proposal_get_prefix(pclc);
        if (smc_clc_prfx_match(newclcsock, pclc_prfx))
                return SMC_CLC_DECL_CNFERR;

        return 0;
}

/* listen worker: initialize connection and buffers */
static int smc_listen_rdma_init(struct smc_sock *new_smc,
                                struct smc_clc_msg_proposal *pclc,
                                struct smc_ib_device *ibdev, u8 ibport,
                                int *local_contact)
{
        /* allocate connection / link group */
        *local_contact = smc_conn_create(new_smc, false, 0, ibdev, ibport, 0,
                                         &pclc->lcl, NULL, 0);
        if (*local_contact < 0) {
                if (*local_contact == -ENOMEM)
                        return SMC_CLC_DECL_MEM;/* insufficient memory*/
                return SMC_CLC_DECL_INTERR; /* other error */
        }

        /* create send buffer and rmb */
        if (smc_buf_create(new_smc, false))
                return SMC_CLC_DECL_MEM;

        return 0;
}

/* listen worker: initialize connection and buffers for SMC-D */
static int smc_listen_ism_init(struct smc_sock *new_smc,
                               struct smc_clc_msg_proposal *pclc,
                               struct smcd_dev *ismdev,
                               int *local_contact)
{
        struct smc_clc_msg_smcd *pclc_smcd;

        pclc_smcd = smc_get_clc_msg_smcd(pclc);
        *local_contact = smc_conn_create(new_smc, true, 0, NULL, 0, 0, NULL,
                                         ismdev, pclc_smcd->gid);
        if (*local_contact < 0) {
                if (*local_contact == -ENOMEM)
                        return SMC_CLC_DECL_MEM;/* insufficient memory*/
                return SMC_CLC_DECL_INTERR; /* other error */
        }

        /* Check if peer can be reached via ISM device */
        if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid,
                            new_smc->conn.lgr->vlan_id,
                            new_smc->conn.lgr->smcd)) {
                if (*local_contact == SMC_FIRST_CONTACT)
                        smc_lgr_forget(new_smc->conn.lgr);
                smc_conn_free(&new_smc->conn);
                return SMC_CLC_DECL_CNFERR;
        }

        /* Create send and receive buffers */
        if (smc_buf_create(new_smc, true)) {
                if (*local_contact == SMC_FIRST_CONTACT)
                        smc_lgr_forget(new_smc->conn.lgr);
                smc_conn_free(&new_smc->conn);
                return SMC_CLC_DECL_MEM;
        }

        return 0;
}

/* listen worker: register buffers */
static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
{
        struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];

        if (local_contact != SMC_FIRST_CONTACT) {
                if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true))
                        return SMC_CLC_DECL_ERR_REGRMB;
        }
        smc_rmb_sync_sg_for_device(&new_smc->conn);

        return 0;
}

/* listen worker: finish RDMA setup */
static int smc_listen_rdma_finish(struct smc_sock *new_smc,
                                  struct smc_clc_msg_accept_confirm *cclc,
                                  int local_contact)
{
        struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
        int reason_code = 0;

        if (local_contact == SMC_FIRST_CONTACT)
                smc_link_save_peer_info(link, cclc);

        if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) {
                reason_code = SMC_CLC_DECL_ERR_RTOK;
                goto decline;
        }

        if (local_contact == SMC_FIRST_CONTACT) {
                if (smc_ib_ready_link(link)) {
                        reason_code = SMC_CLC_DECL_ERR_RDYLNK;
                        goto decline;
                }
                /* QP confirmation over RoCE fabric */
                reason_code = smc_serv_conf_first_link(new_smc);
                if (reason_code)
                        goto decline;
        }
        return 0;

decline:
        smc_listen_decline(new_smc, reason_code, local_contact);
        return reason_code;
}

/* setup for RDMA connection of server */
static void smc_listen_work(struct work_struct *work)
{
        struct smc_sock *new_smc = container_of(work, struct smc_sock,
                                                smc_listen_work);
        struct socket *newclcsock = new_smc->clcsock;
        struct smc_clc_msg_accept_confirm cclc;
        struct smc_clc_msg_proposal *pclc;
        struct smc_ib_device *ibdev;
        bool ism_supported = false;
        struct smcd_dev *ismdev;
        u8 buf[SMC_CLC_MAX_LEN];
        int local_contact = 0;
        unsigned short vlan;
        int reason_code = 0;
        int rc = 0;
        u8 ibport;

        if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN)
                return smc_listen_out_err(new_smc);

        if (new_smc->use_fallback) {
                smc_listen_out_connected(new_smc);
                return;
        }

        /* check if peer is smc capable */
        if (!tcp_sk(newclcsock->sk)->syn_smc) {
                smc_switch_to_fallback(new_smc);
                new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC;
                smc_listen_out_connected(new_smc);
                return;
        }

        /* do inband token exchange -
         * wait for and receive SMC Proposal CLC message
         */
        pclc = (struct smc_clc_msg_proposal *)&buf;
        reason_code = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN,
                                       SMC_CLC_PROPOSAL, CLC_WAIT_TIME);
        if (reason_code) {
                smc_listen_decline(new_smc, reason_code, 0);
                return;
        }

        /* IPSec connections opt out of SMC-R optimizations */
        if (using_ipsec(new_smc)) {
                smc_listen_decline(new_smc, SMC_CLC_DECL_IPSEC, 0);
                return;
        }

        mutex_lock(&smc_server_lgr_pending);
        smc_close_init(new_smc);
        smc_rx_init(new_smc);
        smc_tx_init(new_smc);

        /* check if ISM is available */
        if ((pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) &&
            !smc_check_ism(new_smc, &ismdev) &&
            !smc_listen_ism_init(new_smc, pclc, ismdev, &local_contact)) {
                ism_supported = true;
        }

        /* check if RDMA is available */
        if (!ism_supported &&
            ((pclc->hdr.path != SMC_TYPE_R && pclc->hdr.path != SMC_TYPE_B) ||
             smc_vlan_by_tcpsk(new_smc->clcsock, &vlan) ||
             smc_check_rdma(new_smc, &ibdev, &ibport, vlan, NULL) ||
             smc_listen_rdma_check(new_smc, pclc) ||
             smc_listen_rdma_init(new_smc, pclc, ibdev, ibport,
                                  &local_contact) ||
             smc_listen_rdma_reg(new_smc, local_contact))) {
                /* SMC not supported, decline */
                mutex_unlock(&smc_server_lgr_pending);
                smc_listen_decline(new_smc, SMC_CLC_DECL_MODEUNSUPP,
                                   local_contact);
                return;
        }

        /* send SMC Accept CLC message */
        rc = smc_clc_send_accept(new_smc, local_contact);
        if (rc) {
                mutex_unlock(&smc_server_lgr_pending);
                smc_listen_decline(new_smc, rc, local_contact);
                return;
        }

        /* SMC-D does not need this lock any more */
        if (ism_supported)
                mutex_unlock(&smc_server_lgr_pending);

        /* receive SMC Confirm CLC message */
        reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
                                       SMC_CLC_CONFIRM, CLC_WAIT_TIME);
        if (reason_code) {
                if (!ism_supported)
                        mutex_unlock(&smc_server_lgr_pending);
                smc_listen_decline(new_smc, reason_code, local_contact);
                return;
        }

        /* finish worker */
        if (!ism_supported) {
                rc = smc_listen_rdma_finish(new_smc, &cclc, local_contact);
                mutex_unlock(&smc_server_lgr_pending);
                if (rc)
                        return;
        }
        smc_conn_save_peer_info(new_smc, &cclc);
        smc_listen_out_connected(new_smc);
}

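/* worker that accepts incoming connections on the CLC listen socket and
 * schedules smc_listen_work() for each new child socket
 */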
static void smc_tcp_listen_work(struct work_struct *work)
{
        struct smc_sock *lsmc = container_of(work, struct smc_sock,
                                             tcp_listen_work);
        struct sock *lsk = &lsmc->sk;
        struct smc_sock *new_smc;
        int rc = 0;

        lock_sock(lsk);
        while (lsk->sk_state == SMC_LISTEN) {
                rc = smc_clcsock_accept(lsmc, &new_smc);
                if (rc)
                        goto out;
                if (!new_smc)
                        continue;

                new_smc->listen_smc = lsmc;
                new_smc->use_fallback = lsmc->use_fallback;
                new_smc->fallback_rsn = lsmc->fallback_rsn;
                sock_hold(lsk); /* sock_put in smc_listen_work */
                INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
                smc_copy_sock_settings_to_smc(new_smc);
                new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf;
                new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf;
                sock_hold(&new_smc->sk); /* sock_put in passive closing */
                if (!schedule_work(&new_smc->smc_listen_work))
                        sock_put(&new_smc->sk);
        }

out:
        release_sock(lsk);
        sock_put(&lsmc->sk); /* sock_hold in smc_listen */
}

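/* listen() entry point: put the internal CLC/TCP socket into listen state
 * and kick off the TCP listen worker
 */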
static int smc_listen(struct socket *sock, int backlog)
{
        struct sock *sk = sock->sk;
        struct smc_sock *smc;
        int rc;

        smc = smc_sk(sk);
        lock_sock(sk);

        rc = -EINVAL;
        if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
                goto out;

        rc = 0;
        if (sk->sk_state == SMC_LISTEN) {
                sk->sk_max_ack_backlog = backlog;
                goto out;
        }
        /* some socket options are handled in core, so we could not apply
         * them to the clc socket -- copy smc socket options to clc socket
         */
        smc_copy_sock_settings_to_clc(smc);
        if (!smc->use_fallback)
                tcp_sk(smc->clcsock->sk)->syn_smc = 1;

        rc = kernel_listen(smc->clcsock, backlog);
        if (rc)
                goto out;
        sk->sk_max_ack_backlog = backlog;
        sk->sk_ack_backlog = 0;
        sk->sk_state = SMC_LISTEN;
        sock_hold(sk); /* sock_hold in tcp_listen_worker */
        if (!schedule_work(&smc->tcp_listen_work))
                sock_put(sk);

out:
        release_sock(sk);
        return rc;
}

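/* accept() entry point: wait on the accept queue filled by the listen
 * worker and hand a new socket to user space
 */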
1425 static int smc_accept(struct socket *sock, struct socket *new_sock,
1426                       int flags, bool kern)
1427 {
1428         struct sock *sk = sock->sk, *nsk;
1429         DECLARE_WAITQUEUE(wait, current);
1430         struct smc_sock *lsmc;
1431         long timeo;
1432         int rc = 0;
1433
1434         lsmc = smc_sk(sk);
1435         sock_hold(sk); /* sock_put below */
1436         lock_sock(sk);
1437
1438         if (lsmc->sk.sk_state != SMC_LISTEN) {
1439                 rc = -EINVAL;
1440                 release_sock(sk);
1441                 goto out;
1442         }
1443
1444         /* Wait for an incoming connection */
1445         timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1446         add_wait_queue_exclusive(sk_sleep(sk), &wait);
1447         while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
1448                 set_current_state(TASK_INTERRUPTIBLE);
1449                 if (!timeo) {
1450                         rc = -EAGAIN;
1451                         break;
1452                 }
1453                 release_sock(sk);
1454                 timeo = schedule_timeout(timeo);
1455                 /* wakeup by sk_data_ready in smc_listen_work() */
1456                 sched_annotate_sleep();
1457                 lock_sock(sk);
1458                 if (signal_pending(current)) {
1459                         rc = sock_intr_errno(timeo);
1460                         break;
1461                 }
1462         }
1463         set_current_state(TASK_RUNNING);
1464         remove_wait_queue(sk_sleep(sk), &wait);
1465
1466         if (!rc)
1467                 rc = sock_error(nsk);
1468         release_sock(sk);
1469         if (rc)
1470                 goto out;
1471
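        /* emulate TCP_DEFER_ACCEPT: if a defer time was set on the listen
         * socket, do not hand the new socket to user space before data has
         * arrived on it (or the configured time has expired)
         */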
1472         if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
1473                 /* wait till data arrives on the socket */
1474                 timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
1475                                                                 MSEC_PER_SEC);
1476                 if (smc_sk(nsk)->use_fallback) {
1477                         struct sock *clcsk = smc_sk(nsk)->clcsock->sk;
1478
1479                         lock_sock(clcsk);
1480                         if (skb_queue_empty(&clcsk->sk_receive_queue))
1481                                 sk_wait_data(clcsk, &timeo, NULL);
1482                         release_sock(clcsk);
1483                 } else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
1484                         lock_sock(nsk);
1485                         smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available);
1486                         release_sock(nsk);
1487                 }
1488         }
1489
1490 out:
1491         sock_put(sk); /* sock_hold above */
1492         return rc;
1493 }
1494
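/* local and peer addresses are those of the internal CLC/TCP socket, so the
 * request is simply passed on to the clcsock
 */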
1495 static int smc_getname(struct socket *sock, struct sockaddr *addr,
1496                        int peer)
1497 {
1498         struct smc_sock *smc;
1499
1500         if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
1501             (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
1502                 return -ENOTCONN;
1503
1504         smc = smc_sk(sock->sk);
1505
1506         return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
1507 }
1508
1509 static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
1510 {
1511         struct sock *sk = sock->sk;
1512         struct smc_sock *smc;
1513         int rc = -EPIPE;
1514
1515         smc = smc_sk(sk);
1516         lock_sock(sk);
1517         if ((sk->sk_state != SMC_ACTIVE) &&
1518             (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1519             (sk->sk_state != SMC_INIT))
1520                 goto out;
1521
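        /* MSG_FASTOPEN would carry data in the TCP SYN, which cannot be
         * combined with the SMC handshake; fall back to plain TCP while the
         * socket is still unconnected, otherwise reject the flag
         */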
1522         if (msg->msg_flags & MSG_FASTOPEN) {
1523                 if (sk->sk_state == SMC_INIT) {
1524                         smc_switch_to_fallback(smc);
1525                         smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
1526                 } else {
1527                         rc = -EINVAL;
1528                         goto out;
1529                 }
1530         }
1531
1532         if (smc->use_fallback)
1533                 rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
1534         else
1535                 rc = smc_tx_sendmsg(smc, msg, len);
1536 out:
1537         release_sock(sk);
1538         return rc;
1539 }
1540
1541 static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
1542                        int flags)
1543 {
1544         struct sock *sk = sock->sk;
1545         struct smc_sock *smc;
1546         int rc = -ENOTCONN;
1547
1548         smc = smc_sk(sk);
1549         lock_sock(sk);
1550         if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
1551                 /* socket was connected before, no more data to read */
1552                 rc = 0;
1553                 goto out;
1554         }
1555         if ((sk->sk_state == SMC_INIT) ||
1556             (sk->sk_state == SMC_LISTEN) ||
1557             (sk->sk_state == SMC_CLOSED))
1558                 goto out;
1559
1560         if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1561                 rc = 0;
1562                 goto out;
1563         }
1564
1565         if (smc->use_fallback) {
1566                 rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
1567         } else {
1568                 msg->msg_namelen = 0;
1569                 rc = smc_rx_recvmsg(smc, msg, NULL, len, flags);
1570         }
1571
1572 out:
1573         release_sock(sk);
1574         return rc;
1575 }
1576
1577 static __poll_t smc_accept_poll(struct sock *parent)
1578 {
1579         struct smc_sock *isk = smc_sk(parent);
1580         __poll_t mask = 0;
1581
1582         spin_lock(&isk->accept_q_lock);
1583         if (!list_empty(&isk->accept_q))
1584                 mask = EPOLLIN | EPOLLRDNORM;
1585         spin_unlock(&isk->accept_q_lock);
1586
1587         return mask;
1588 }
1589
1590 static __poll_t smc_poll(struct file *file, struct socket *sock,
1591                              poll_table *wait)
1592 {
1593         struct sock *sk = sock->sk;
1594         __poll_t mask = 0;
1595         struct smc_sock *smc;
1596
1597         if (!sk)
1598                 return EPOLLNVAL;
1599
1600         smc = smc_sk(sock->sk);
1601         if (smc->use_fallback) {
1602                 /* delegate to CLC child sock */
1603                 mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
1604                 sk->sk_err = smc->clcsock->sk->sk_err;
1605                 if (sk->sk_err)
1606                         mask |= EPOLLERR;
1607         } else {
1608                 if (sk->sk_state != SMC_CLOSED)
1609                         sock_poll_wait(file, sock, wait);
1610                 if (sk->sk_err)
1611                         mask |= EPOLLERR;
1612                 if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
1613                     (sk->sk_state == SMC_CLOSED))
1614                         mask |= EPOLLHUP;
1615                 if (sk->sk_state == SMC_LISTEN) {
1616                         /* woken up by sk_data_ready in smc_listen_work() */
1617                         mask = smc_accept_poll(sk);
1618                 } else {
1619                         if (atomic_read(&smc->conn.sndbuf_space) ||
1620                             sk->sk_shutdown & SEND_SHUTDOWN) {
1621                                 mask |= EPOLLOUT | EPOLLWRNORM;
1622                         } else {
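                                /* no send buffer space left: arm the
                                 * write-space notification bits so the task
                                 * is woken once sndbuf space is available
                                 */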
1623                                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1624                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1625                         }
1626                         if (atomic_read(&smc->conn.bytes_to_rcv))
1627                                 mask |= EPOLLIN | EPOLLRDNORM;
1628                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1629                                 mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
1630                         if (sk->sk_state == SMC_APPCLOSEWAIT1)
1631                                 mask |= EPOLLIN;
1632                         if (smc->conn.urg_state == SMC_URG_VALID)
1633                                 mask |= EPOLLPRI;
1634                 }
1635         }
1636
1637         return mask;
1638 }
1639
1640 static int smc_shutdown(struct socket *sock, int how)
1641 {
1642         struct sock *sk = sock->sk;
1643         struct smc_sock *smc;
1644         int rc = -EINVAL;
1645         int rc1 = 0;
1646
1647         smc = smc_sk(sk);
1648
1649         if ((how < SHUT_RD) || (how > SHUT_RDWR))
1650                 return rc;
1651
1652         lock_sock(sk);
1653
1654         rc = -ENOTCONN;
1655         if ((sk->sk_state != SMC_ACTIVE) &&
1656             (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
1657             (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
1658             (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1659             (sk->sk_state != SMC_APPCLOSEWAIT2) &&
1660             (sk->sk_state != SMC_APPFINCLOSEWAIT))
1661                 goto out;
1662         if (smc->use_fallback) {
1663                 rc = kernel_sock_shutdown(smc->clcsock, how);
1664                 sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
1665                 if (sk->sk_shutdown == SHUTDOWN_MASK)
1666                         sk->sk_state = SMC_CLOSED;
1667                 goto out;
1668         }
1669         switch (how) {
1670         case SHUT_RDWR:         /* shutdown in both directions */
1671                 rc = smc_close_active(smc);
1672                 break;
1673         case SHUT_WR:
1674                 rc = smc_close_shutdown_write(smc);
1675                 break;
1676         case SHUT_RD:
1677                 rc = 0;
1678                 /* nothing more to do because peer is not involved */
1679                 break;
1680         }
1681         if (smc->clcsock)
1682                 rc1 = kernel_sock_shutdown(smc->clcsock, how);
1683         /* map sock_shutdown_cmd constants to sk_shutdown value range */
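        /* i.e. SHUT_RD (0) -> RCV_SHUTDOWN (1), SHUT_WR (1) -> SEND_SHUTDOWN (2),
         * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
         */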
1684         sk->sk_shutdown |= how + 1;
1685
1686 out:
1687         release_sock(sk);
1688         return rc ? rc : rc1;
1689 }
1690
1691 static int smc_setsockopt(struct socket *sock, int level, int optname,
1692                           char __user *optval, unsigned int optlen)
1693 {
1694         struct sock *sk = sock->sk;
1695         struct smc_sock *smc;
1696         int val, rc;
1697
1698         smc = smc_sk(sk);
1699
1700         /* generic setsockopts reaching us here always apply to the
1701          * CLC socket
1702          */
1703         rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
1704                                            optval, optlen);
1705         if (smc->clcsock->sk->sk_err) {
1706                 sk->sk_err = smc->clcsock->sk->sk_err;
1707                 sk->sk_error_report(sk);
1708         }
1709         if (rc)
1710                 return rc;
1711
1712         if (optlen < sizeof(int))
1713                 return -EINVAL;
1714         if (get_user(val, (int __user *)optval))
1715                 return -EFAULT;
1716
1717         lock_sock(sk);
1718         switch (optname) {
1719         case TCP_ULP:
1720         case TCP_FASTOPEN:
1721         case TCP_FASTOPEN_CONNECT:
1722         case TCP_FASTOPEN_KEY:
1723         case TCP_FASTOPEN_NO_COOKIE:
1724                 /* option not supported by SMC */
1725                 if (sk->sk_state == SMC_INIT) {
1726                         smc_switch_to_fallback(smc);
1727                         smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
1728                 } else {
1729                         if (!smc->use_fallback)
1730                                 rc = -EINVAL;
1731                 }
1732                 break;
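        /* TCP_NODELAY and TCP_CORK are emulated for the SMC data path: once
         * coalescing is switched off, kick the transmit worker so that already
         * queued data is sent out immediately
         */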
1733         case TCP_NODELAY:
1734                 if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
1735                         if (val && !smc->use_fallback)
1736                                 mod_delayed_work(system_wq, &smc->conn.tx_work,
1737                                                  0);
1738                 }
1739                 break;
1740         case TCP_CORK:
1741                 if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
1742                         if (!val && !smc->use_fallback)
1743                                 mod_delayed_work(system_wq, &smc->conn.tx_work,
1744                                                  0);
1745                 }
1746                 break;
1747         case TCP_DEFER_ACCEPT:
1748                 smc->sockopt_defer_accept = val;
1749                 break;
1750         default:
1751                 break;
1752         }
1753         release_sock(sk);
1754
1755         return rc;
1756 }
1757
1758 static int smc_getsockopt(struct socket *sock, int level, int optname,
1759                           char __user *optval, int __user *optlen)
1760 {
1761         struct smc_sock *smc;
1762
1763         smc = smc_sk(sock->sk);
1764         /* socket options apply to the CLC socket */
1765         return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
1766                                              optval, optlen);
1767 }
1768
1769 static int smc_ioctl(struct socket *sock, unsigned int cmd,
1770                      unsigned long arg)
1771 {
1772         union smc_host_cursor cons, urg;
1773         struct smc_connection *conn;
1774         struct smc_sock *smc;
1775         int answ;
1776
1777         smc = smc_sk(sock->sk);
1778         conn = &smc->conn;
1779         lock_sock(&smc->sk);
1780         if (smc->use_fallback) {
1781                 if (!smc->clcsock) {
1782                         release_sock(&smc->sk);
1783                         return -EBADF;
1784                 }
1785                 answ = smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
1786                 release_sock(&smc->sk);
1787                 return answ;
1788         }
1789         switch (cmd) {
1790         case SIOCINQ: /* same as FIONREAD */
1791                 if (smc->sk.sk_state == SMC_LISTEN) {
1792                         release_sock(&smc->sk);
1793                         return -EINVAL;
1794                 }
1795                 if (smc->sk.sk_state == SMC_INIT ||
1796                     smc->sk.sk_state == SMC_CLOSED)
1797                         answ = 0;
1798                 else
1799                         answ = atomic_read(&smc->conn.bytes_to_rcv);
1800                 break;
1801         case SIOCOUTQ:
1802                 /* output queue size (not sent + not acked) */
1803                 if (smc->sk.sk_state == SMC_LISTEN) {
1804                         release_sock(&smc->sk);
1805                         return -EINVAL;
1806                 }
1807                 if (smc->sk.sk_state == SMC_INIT ||
1808                     smc->sk.sk_state == SMC_CLOSED)
1809                         answ = 0;
1810                 else
1811                         answ = smc->conn.sndbuf_desc->len -
1812                                         atomic_read(&smc->conn.sndbuf_space);
1813                 break;
1814         case SIOCOUTQNSD:
1815                 /* output queue size (not sent only) */
1816                 if (smc->sk.sk_state == SMC_LISTEN) {
1817                         release_sock(&smc->sk);
1818                         return -EINVAL;
1819                 }
1820                 if (smc->sk.sk_state == SMC_INIT ||
1821                     smc->sk.sk_state == SMC_CLOSED)
1822                         answ = 0;
1823                 else
1824                         answ = smc_tx_prepared_sends(&smc->conn);
1825                 break;
1826         case SIOCATMARK:
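                /* report whether the next byte to be read is the byte marked
                 * as urgent data, by comparing the consumer cursor with the
                 * stored urgent cursor
                 */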
1827                 if (smc->sk.sk_state == SMC_LISTEN) {
1828                         release_sock(&smc->sk);
1829                         return -EINVAL;
1830                 }
1831                 if (smc->sk.sk_state == SMC_INIT ||
1832                     smc->sk.sk_state == SMC_CLOSED) {
1833                         answ = 0;
1834                 } else {
1835                         smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
1836                         smc_curs_copy(&urg, &conn->urg_curs, conn);
1837                         answ = smc_curs_diff(conn->rmb_desc->len,
1838                                              &cons, &urg) == 1;
1839                 }
1840                 break;
1841         default:
1842                 release_sock(&smc->sk);
1843                 return -ENOIOCTLCMD;
1844         }
1845         release_sock(&smc->sk);
1846
1847         return put_user(answ, (int __user *)arg);
1848 }
1849
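/* For fallback sockets the page is handed to kernel_sendpage() on the TCP
 * clcsock; native SMC connections use sock_no_sendpage(), i.e. the data is
 * copied through the regular sendmsg() path instead of being sent zero-copy.
 */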
1850 static ssize_t smc_sendpage(struct socket *sock, struct page *page,
1851                             int offset, size_t size, int flags)
1852 {
1853         struct sock *sk = sock->sk;
1854         struct smc_sock *smc;
1855         int rc = -EPIPE;
1856
1857         smc = smc_sk(sk);
1858         lock_sock(sk);
1859         if (sk->sk_state != SMC_ACTIVE) {
1860                 release_sock(sk);
1861                 goto out;
1862         }
1863         release_sock(sk);
1864         if (smc->use_fallback)
1865                 rc = kernel_sendpage(smc->clcsock, page, offset,
1866                                      size, flags);
1867         else
1868                 rc = sock_no_sendpage(sock, page, offset, size, flags);
1869
1870 out:
1871         return rc;
1872 }
1873
1874 /* Map the affected portions of the rmbe into an spd, note the number of bytes
1875  * to splice in conn->splice_pending, and press 'go'. Consumer cursor updates
1876  * are delayed until the respective page has been fully processed.
1877  * Note that subsequent recv() calls have to wait until all splice() processing
1878  * has completed.
1879  */
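/* Illustrative user-space usage (hypothetical descriptors, not part of this
 * file): splice data from a connected SMC socket into a pipe; SPLICE_F_NONBLOCK
 * is mapped to MSG_DONTWAIT below.
 *
 *	int pfd[2];
 *	ssize_t n;
 *
 *	pipe(pfd);
 *	n = splice(smc_fd, NULL, pfd[1], NULL, 65536, SPLICE_F_NONBLOCK);
 */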
1880 static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
1881                                struct pipe_inode_info *pipe, size_t len,
1882                                unsigned int flags)
1883 {
1884         struct sock *sk = sock->sk;
1885         struct smc_sock *smc;
1886         int rc = -ENOTCONN;
1887
1888         smc = smc_sk(sk);
1889         lock_sock(sk);
1890         if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
1891                 /* socket was connected before, no more data to read */
1892                 rc = 0;
1893                 goto out;
1894         }
1895         if (sk->sk_state == SMC_INIT ||
1896             sk->sk_state == SMC_LISTEN ||
1897             sk->sk_state == SMC_CLOSED)
1898                 goto out;
1899
1900         if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1901                 rc = 0;
1902                 goto out;
1903         }
1904
1905         if (smc->use_fallback) {
1906                 rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
1907                                                     pipe, len, flags);
1908         } else {
1909                 if (*ppos) {
1910                         rc = -ESPIPE;
1911                         goto out;
1912                 }
1913                 if (flags & SPLICE_F_NONBLOCK)
1914                         flags = MSG_DONTWAIT;
1915                 else
1916                         flags = 0;
1917                 rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags);
1918         }
1919 out:
1920         release_sock(sk);
1921
1922         return rc;
1923 }
1924
1925 /* must look like tcp */
1926 static const struct proto_ops smc_sock_ops = {
1927         .family         = PF_SMC,
1928         .owner          = THIS_MODULE,
1929         .release        = smc_release,
1930         .bind           = smc_bind,
1931         .connect        = smc_connect,
1932         .socketpair     = sock_no_socketpair,
1933         .accept         = smc_accept,
1934         .getname        = smc_getname,
1935         .poll           = smc_poll,
1936         .ioctl          = smc_ioctl,
1937         .listen         = smc_listen,
1938         .shutdown       = smc_shutdown,
1939         .setsockopt     = smc_setsockopt,
1940         .getsockopt     = smc_getsockopt,
1941         .sendmsg        = smc_sendmsg,
1942         .recvmsg        = smc_recvmsg,
1943         .mmap           = sock_no_mmap,
1944         .sendpage       = smc_sendpage,
1945         .splice_read    = smc_splice_read,
1946 };
1947
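/* Illustrative user-space usage (sketch; AF_SMC and SMCPROTO_SMC/SMCPROTO_SMC6
 * may have to be defined by the application if its headers do not provide
 * them): an SMC socket is created like a TCP socket, only family and protocol
 * differ, and bind()/connect() take the usual sockaddr_in/sockaddr_in6.
 *
 *	int fd = socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC);	// or SMCPROTO_SMC6
 */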
1948 static int smc_create(struct net *net, struct socket *sock, int protocol,
1949                       int kern)
1950 {
1951         int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
1952         struct smc_sock *smc;
1953         struct sock *sk;
1954         int rc;
1955
1956         rc = -ESOCKTNOSUPPORT;
1957         if (sock->type != SOCK_STREAM)
1958                 goto out;
1959
1960         rc = -EPROTONOSUPPORT;
1961         if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6)
1962                 goto out;
1963
1964         rc = -ENOBUFS;
1965         sock->ops = &smc_sock_ops;
1966         sk = smc_sock_alloc(net, sock, protocol);
1967         if (!sk)
1968                 goto out;
1969
1970         /* create internal TCP socket for CLC handshake and fallback */
1971         smc = smc_sk(sk);
1972         smc->use_fallback = false; /* assume RDMA capability first */
1973         smc->fallback_rsn = 0;
1974         rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
1975                               &smc->clcsock);
1976         if (rc) {
1977                 sk_common_release(sk);
1978                 goto out;
1979         }
1980         smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
1981         smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
1982
1983 out:
1984         return rc;
1985 }
1986
1987 static const struct net_proto_family smc_sock_family_ops = {
1988         .family = PF_SMC,
1989         .owner  = THIS_MODULE,
1990         .create = smc_create,
1991 };
1992
1993 unsigned int smc_net_id;
1994
1995 static __net_init int smc_net_init(struct net *net)
1996 {
1997         return smc_pnet_net_init(net);
1998 }
1999
2000 static void __net_exit smc_net_exit(struct net *net)
2001 {
2002         smc_pnet_net_exit(net);
2003 }
2004
2005 static struct pernet_operations smc_net_ops = {
2006         .init = smc_net_init,
2007         .exit = smc_net_exit,
2008         .id   = &smc_net_id,
2009         .size = sizeof(struct smc_net),
2010 };
2011
2012 static int __init smc_init(void)
2013 {
2014         int rc;
2015
2016         rc = register_pernet_subsys(&smc_net_ops);
2017         if (rc)
2018                 return rc;
2019
2020         rc = smc_pnet_init();
2021         if (rc)
2022                 return rc;
2023
2024         rc = smc_llc_init();
2025         if (rc) {
2026                 pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
2027                 goto out_pnet;
2028         }
2029
2030         rc = smc_cdc_init();
2031         if (rc) {
2032                 pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
2033                 goto out_pnet;
2034         }
2035
2036         rc = proto_register(&smc_proto, 1);
2037         if (rc) {
2038                 pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
2039                 goto out_pnet;
2040         }
2041
2042         rc = proto_register(&smc_proto6, 1);
2043         if (rc) {
2044                 pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc);
2045                 goto out_proto;
2046         }
2047
2048         rc = sock_register(&smc_sock_family_ops);
2049         if (rc) {
2050                 pr_err("%s: sock_register fails with %d\n", __func__, rc);
2051                 goto out_proto6;
2052         }
2053         INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
2054         INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);
2055
2056         rc = smc_ib_register_client();
2057         if (rc) {
2058                 pr_err("%s: ib_register fails with %d\n", __func__, rc);
2059                 goto out_sock;
2060         }
2061
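        /* arm the static key so that the TCP stack starts to generate and
         * evaluate the SMC capability option during connection setup
         */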
2062         static_branch_enable(&tcp_have_smc);
2063         return 0;
2064
2065 out_sock:
2066         sock_unregister(PF_SMC);
2067 out_proto6:
2068         proto_unregister(&smc_proto6);
2069 out_proto:
2070         proto_unregister(&smc_proto);
2071 out_pnet:
2072         smc_pnet_exit();
2073         return rc;
2074 }
2075
2076 static void __exit smc_exit(void)
2077 {
2078         smc_core_exit();
2079         static_branch_disable(&tcp_have_smc);
2080         smc_ib_unregister_client();
2081         sock_unregister(PF_SMC);
2082         proto_unregister(&smc_proto6);
2083         proto_unregister(&smc_proto);
2084         smc_pnet_exit();
2085         unregister_pernet_subsys(&smc_net_ops);
2086 }
2087
2088 module_init(smc_init);
2089 module_exit(smc_exit);
2090
2091 MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
2092 MODULE_DESCRIPTION("smc socket address family");
2093 MODULE_LICENSE("GPL");
2094 MODULE_ALIAS_NETPROTO(PF_SMC);