net/core/sock.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Generic socket support routines. Memory allocators, socket lock/release
   7  *              handler for protocols to use and generic option handler.
   8  *
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  *
  85  *
  86  *              This program is free software; you can redistribute it and/or
  87  *              modify it under the terms of the GNU General Public License
  88  *              as published by the Free Software Foundation; either version
  89  *              2 of the License, or (at your option) any later version.
  90  */
  91
  92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94 #include <linux/capability.h>
  95 #include <linux/errno.h>
  96 #include <linux/errqueue.h>
  97 #include <linux/types.h>
  98 #include <linux/socket.h>
  99 #include <linux/in.h>
 100 #include <linux/kernel.h>
 101 #include <linux/module.h>
 102 #include <linux/proc_fs.h>
 103 #include <linux/seq_file.h>
 104 #include <linux/sched.h>
 105 #include <linux/timer.h>
 106 #include <linux/string.h>
 107 #include <linux/sockios.h>
 108 #include <linux/net.h>
 109 #include <linux/mm.h>
 110 #include <linux/slab.h>
 111 #include <linux/interrupt.h>
 112 #include <linux/poll.h>
 113 #include <linux/tcp.h>
 114 #include <linux/init.h>
 115 #include <linux/highmem.h>
 116 #include <linux/user_namespace.h>
 117 #include <linux/static_key.h>
 118 #include <linux/memcontrol.h>
 119 #include <linux/prefetch.h>
 120
 121 #include <linux/uaccess.h>
 122
 123 #include <linux/netdevice.h>
 124 #include <net/protocol.h>
 125 #include <linux/skbuff.h>
 126 #include <net/net_namespace.h>
 127 #include <net/request_sock.h>
 128 #include <net/sock.h>
 129 #include <linux/net_tstamp.h>
 130 #include <net/xfrm.h>
 131 #include <linux/ipsec.h>
 132 #include <net/cls_cgroup.h>
 133 #include <net/netprio_cgroup.h>
 134 #include <linux/sock_diag.h>
 135
 136 #include <linux/filter.h>
 137 #include <net/sock_reuseport.h>
 138
 139 #include <trace/events/sock.h>
 140
 141 #ifdef CONFIG_INET
 142 #include <net/tcp.h>
 143 #endif
 144
 145 #include <net/busy_poll.h>
 146
 147 static DEFINE_MUTEX(proto_list_mutex);
 148 static LIST_HEAD(proto_list);
 149
 150 /**
 151  * sk_ns_capable - General socket capability test
 152  * @sk: Socket to use a capability on or through
 153  * @user_ns: The user namespace of the capability to use
 154  * @cap: The capability to use
 155  *
 156  * Test to see if the opener of the socket had when the socket was
 157  * created and the current process has the capability @cap in the user
 158  * namespace @user_ns.
 159  */
 160 bool sk_ns_capable(const struct sock *sk,
 161                    struct user_namespace *user_ns, int cap)
 162 {
 163         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 164                 ns_capable(user_ns, cap);
 165 }
 166 EXPORT_SYMBOL(sk_ns_capable);
 167
 168 /**
 169  * sk_capable - Socket global capability test
 170  * @sk: Socket to use a capability on or through
 171  * @cap: The global capability to use
 172  *
 173  * Test to see if the opener of the socket had when the socket was
 174  * created and the current process has the capability @cap in all user
 175  * namespaces.
 176  */
 177 bool sk_capable(const struct sock *sk, int cap)
 178 {
 179         return sk_ns_capable(sk, &init_user_ns, cap);
 180 }
 181 EXPORT_SYMBOL(sk_capable);
 182
 183 /**
 184  * sk_net_capable - Network namespace socket capability test
 185  * @sk: Socket to use a capability on or through
 186  * @cap: The capability to use
 187  *
 188  * Test to see if the opener of the socket had when the socket was created
 189  * and the current process has the capability @cap over the network namespace
 190  * the socket is a member of.
 191  */
 192 bool sk_net_capable(const struct sock *sk, int cap)
 193 {
 194         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 195 }
 196 EXPORT_SYMBOL(sk_net_capable);
 197
  198 /*
  199  * Each address family might have different locking rules, so we have
  200  * one lock class key and one slock class key per address family
       * (two arrays of AF_MAX entries each):
  201  */
  202 static struct lock_class_key af_family_keys[AF_MAX];
  203 static struct lock_class_key af_family_slock_keys[AF_MAX];
 204
 205 /*
 206  * Make lock validator output more readable. (we pre-construct these
 207  * strings build-time, so that runtime initialization of socket
 208  * locks is fast):
 209  */
 210 static const char *const af_family_key_strings[AF_MAX+1] = {
 211   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
 212   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
 213   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
 214   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
 215   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
 216   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
 217   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
 218   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
 219   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
 220   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
 221   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
 222   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
 223   "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
 224   "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_KCM"      ,
 225   "sk_lock-AF_QIPCRTR", "sk_lock-AF_SMC"     , "sk_lock-AF_MAX"
 226 };
 227 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 228   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
 229   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
 230   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
 231   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
 232   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
 233   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
 234   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
 235   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
 236   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
 237   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
 238   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
 239   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
 240   "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
 241   "slock-AF_NFC"   , "slock-AF_VSOCK"    ,"slock-AF_KCM"       ,
 242   "slock-AF_QIPCRTR", "slock-AF_SMC"     , "slock-AF_MAX"
 243 };
 244 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 245   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
 246   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
 247   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
 248   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
 249   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
 250   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
 251   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
 252   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
 253   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
 254   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
 255   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
 256   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
 257   "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
 258   "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_KCM"      ,
 259   "clock-AF_QIPCRTR", "clock-AF_SMC"     , "clock-AF_MAX"
 260 };
 261 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 262   "rlock-AF_UNSPEC", "rlock-AF_UNIX"     , "rlock-AF_INET"     ,
 263   "rlock-AF_AX25"  , "rlock-AF_IPX"      , "rlock-AF_APPLETALK",
 264   "rlock-AF_NETROM", "rlock-AF_BRIDGE"   , "rlock-AF_ATMPVC"   ,
 265   "rlock-AF_X25"   , "rlock-AF_INET6"    , "rlock-AF_ROSE"     ,
 266   "rlock-AF_DECnet", "rlock-AF_NETBEUI"  , "rlock-AF_SECURITY" ,
 267   "rlock-AF_KEY"   , "rlock-AF_NETLINK"  , "rlock-AF_PACKET"   ,
 268   "rlock-AF_ASH"   , "rlock-AF_ECONET"   , "rlock-AF_ATMSVC"   ,
 269   "rlock-AF_RDS"   , "rlock-AF_SNA"      , "rlock-AF_IRDA"     ,
 270   "rlock-AF_PPPOX" , "rlock-AF_WANPIPE"  , "rlock-AF_LLC"      ,
 271   "rlock-27"       , "rlock-28"          , "rlock-AF_CAN"      ,
 272   "rlock-AF_TIPC"  , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV"     ,
 273   "rlock-AF_RXRPC" , "rlock-AF_ISDN"     , "rlock-AF_PHONET"   ,
 274   "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG"      ,
 275   "rlock-AF_NFC"   , "rlock-AF_VSOCK"    , "rlock-AF_KCM"      ,
 276   "rlock-AF_QIPCRTR", "rlock-AF_SMC"     , "rlock-AF_MAX"
 277 };
 278 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 279   "wlock-AF_UNSPEC", "wlock-AF_UNIX"     , "wlock-AF_INET"     ,
 280   "wlock-AF_AX25"  , "wlock-AF_IPX"      , "wlock-AF_APPLETALK",
 281   "wlock-AF_NETROM", "wlock-AF_BRIDGE"   , "wlock-AF_ATMPVC"   ,
 282   "wlock-AF_X25"   , "wlock-AF_INET6"    , "wlock-AF_ROSE"     ,
 283   "wlock-AF_DECnet", "wlock-AF_NETBEUI"  , "wlock-AF_SECURITY" ,
 284   "wlock-AF_KEY"   , "wlock-AF_NETLINK"  , "wlock-AF_PACKET"   ,
 285   "wlock-AF_ASH"   , "wlock-AF_ECONET"   , "wlock-AF_ATMSVC"   ,
 286   "wlock-AF_RDS"   , "wlock-AF_SNA"      , "wlock-AF_IRDA"     ,
 287   "wlock-AF_PPPOX" , "wlock-AF_WANPIPE"  , "wlock-AF_LLC"      ,
 288   "wlock-27"       , "wlock-28"          , "wlock-AF_CAN"      ,
 289   "wlock-AF_TIPC"  , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV"     ,
 290   "wlock-AF_RXRPC" , "wlock-AF_ISDN"     , "wlock-AF_PHONET"   ,
 291   "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG"      ,
 292   "wlock-AF_NFC"   , "wlock-AF_VSOCK"    , "wlock-AF_KCM"      ,
 293   "wlock-AF_QIPCRTR", "wlock-AF_SMC"     , "wlock-AF_MAX"
 294 };
 295 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 296   "elock-AF_UNSPEC", "elock-AF_UNIX"     , "elock-AF_INET"     ,
 297   "elock-AF_AX25"  , "elock-AF_IPX"      , "elock-AF_APPLETALK",
 298   "elock-AF_NETROM", "elock-AF_BRIDGE"   , "elock-AF_ATMPVC"   ,
 299   "elock-AF_X25"   , "elock-AF_INET6"    , "elock-AF_ROSE"     ,
 300   "elock-AF_DECnet", "elock-AF_NETBEUI"  , "elock-AF_SECURITY" ,
 301   "elock-AF_KEY"   , "elock-AF_NETLINK"  , "elock-AF_PACKET"   ,
 302   "elock-AF_ASH"   , "elock-AF_ECONET"   , "elock-AF_ATMSVC"   ,
 303   "elock-AF_RDS"   , "elock-AF_SNA"      , "elock-AF_IRDA"     ,
 304   "elock-AF_PPPOX" , "elock-AF_WANPIPE"  , "elock-AF_LLC"      ,
 305   "elock-27"       , "elock-28"          , "elock-AF_CAN"      ,
 306   "elock-AF_TIPC"  , "elock-AF_BLUETOOTH", "elock-AF_IUCV"     ,
 307   "elock-AF_RXRPC" , "elock-AF_ISDN"     , "elock-AF_PHONET"   ,
 308   "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG"      ,
 309   "elock-AF_NFC"   , "elock-AF_VSOCK"    , "elock-AF_KCM"      ,
 310   "elock-AF_QIPCRTR", "elock-AF_SMC"     , "elock-AF_MAX"
 311 };
 312
  313 /*
  314  * The locking rules for sk_callback_lock and for the sk queues are
  315  * per-address-family, so split the lock classes by using a per-AF key
       * (each array below holds AF_MAX keys):
  316  */
  317 static struct lock_class_key af_callback_keys[AF_MAX];
  318 static struct lock_class_key af_rlock_keys[AF_MAX];
  319 static struct lock_class_key af_wlock_keys[AF_MAX];
  320 static struct lock_class_key af_elock_keys[AF_MAX];
 321
  322 /* Take into consideration the size of the struct sk_buff overhead in the
  323  * determination of these values, since that is non-constant across
  324  * platforms.  This makes socket queueing behavior and performance
  325  * not depend upon such differences.
       *
       * Both limits work out to _SK_MEM_PACKETS buffers of SKB_TRUESIZE(256)
       * each.
  326  */
  327 #define _SK_MEM_PACKETS         256
  328 #define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
  329 #define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
  330 #define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
  331
  332 /* Run time adjustable parameters; the max and default values all start
       * out at the compile-time limits defined above.
       */
  333 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
  334 EXPORT_SYMBOL(sysctl_wmem_max);
  335 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
  336 EXPORT_SYMBOL(sysctl_rmem_max);
  337 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
  338 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
  339
  340 /* Maximum space consumed by iovec or ancillary data, plus some slack */
  341 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
  342 EXPORT_SYMBOL(sysctl_optmem_max);
  343
  344 int sysctl_tstamp_allow_data __read_mostly = 1;
  345
      /* Static key, initially false; sk_set_memalloc() increments it and
       * sk_clear_memalloc() decrements it.
       */
  346 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
  347 EXPORT_SYMBOL_GPL(memalloc_socks);
 348
 349 /**
 350  * sk_set_memalloc - sets %SOCK_MEMALLOC
 351  * @sk: socket to set it on
 352  *
 353  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 354  * It's the responsibility of the admin to adjust min_free_kbytes
 355  * to meet the requirements
 356  */
 357 void sk_set_memalloc(struct sock *sk)
 358 {
 359         sock_set_flag(sk, SOCK_MEMALLOC);
 360         sk->sk_allocation |= __GFP_MEMALLOC;
 361         static_key_slow_inc(&memalloc_socks);
 362 }
 363 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 364
 365 void sk_clear_memalloc(struct sock *sk)
 366 {
 367         sock_reset_flag(sk, SOCK_MEMALLOC);
 368         sk->sk_allocation &= ~__GFP_MEMALLOC;
 369         static_key_slow_dec(&memalloc_socks);
 370
 371         /*
 372          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 373          * progress of swapping. SOCK_MEMALLOC may be cleared while
 374          * it has rmem allocations due to the last swapfile being deactivated
 375          * but there is a risk that the socket is unusable due to exceeding
 376          * the rmem limits. Reclaim the reserves and obey rmem limits again.
 377          */
 378         sk_mem_reclaim(sk);
 379 }
 380 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 381
 382 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 383 {
 384         int ret;
 385         unsigned long pflags = current->flags;
 386
 387         /* these should have been dropped before queueing */
 388         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 389
 390         current->flags |= PF_MEMALLOC;
 391         ret = sk->sk_backlog_rcv(sk, skb);
 392         tsk_restore_flags(current, pflags, PF_MEMALLOC);
 393
 394         return ret;
 395 }
 396 EXPORT_SYMBOL(__sk_backlog_rcv);
 397
 398 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 399 {
 400         struct timeval tv;
 401
 402         if (optlen < sizeof(tv))
 403                 return -EINVAL;
 404         if (copy_from_user(&tv, optval, sizeof(tv)))
 405                 return -EFAULT;
 406         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 407                 return -EDOM;
 408
 409         if (tv.tv_sec < 0) {
 410                 static int warned __read_mostly;
 411
 412                 *timeo_p = 0;
 413                 if (warned < 10 && net_ratelimit()) {
 414                         warned++;
 415                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 416                                 __func__, current->comm, task_pid_nr(current));
 417                 }
 418                 return 0;
 419         }
 420         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 421         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 422                 return 0;
 423         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 424                 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
 425         return 0;
 426 }
 427
 428 static void sock_warn_obsolete_bsdism(const char *name)
 429 {
 430         static int warned;
 431         static char warncomm[TASK_COMM_LEN];
 432         if (strcmp(warncomm, current->comm) && warned < 5) {
 433                 strcpy(warncomm,  current->comm);
 434                 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 435                         warncomm, name);
 436                 warned++;
 437         }
 438 }
 439
 440 static bool sock_needs_netstamp(const struct sock *sk)
 441 {
 442         switch (sk->sk_family) {
 443         case AF_UNSPEC:
 444         case AF_UNIX:
 445                 return false;
 446         default:
 447                 return true;
 448         }
 449 }
 450
 451 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 452 {
 453         if (sk->sk_flags & flags) {
 454                 sk->sk_flags &= ~flags;
 455                 if (sock_needs_netstamp(sk) &&
 456                     !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 457                         net_disable_timestamp();
 458         }
 459 }
 460
 461
 462 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 463 {
 464         unsigned long flags;
 465         struct sk_buff_head *list = &sk->sk_receive_queue;
 466
 467         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 468                 atomic_inc(&sk->sk_drops);
 469                 trace_sock_rcvqueue_full(sk, skb);
 470                 return -ENOMEM;
 471         }
 472
 473         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 474                 atomic_inc(&sk->sk_drops);
 475                 return -ENOBUFS;
 476         }
 477
 478         skb->dev = NULL;
 479         skb_set_owner_r(skb, sk);
 480
 481         /* we escape from rcu protected region, make sure we dont leak
 482          * a norefcounted dst
 483          */
 484         skb_dst_force(skb);
 485
 486         spin_lock_irqsave(&list->lock, flags);
 487         sock_skb_set_dropcount(sk, skb);
 488         __skb_queue_tail(list, skb);
 489         spin_unlock_irqrestore(&list->lock, flags);
 490
 491         if (!sock_flag(sk, SOCK_DEAD))
 492                 sk->sk_data_ready(sk);
 493         return 0;
 494 }
 495 EXPORT_SYMBOL(__sock_queue_rcv_skb);
 496
 497 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 498 {
 499         int err;
 500
 501         err = sk_filter(sk, skb);
 502         if (err)
 503                 return err;
 504
 505         return __sock_queue_rcv_skb(sk, skb);
 506 }
 507 EXPORT_SYMBOL(sock_queue_rcv_skb);
 508
 509 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 510                      const int nested, unsigned int trim_cap, bool refcounted)
 511 {
 512         int rc = NET_RX_SUCCESS;
 513
 514         if (sk_filter_trim_cap(sk, skb, trim_cap))
 515                 goto discard_and_relse;
 516
 517         skb->dev = NULL;
 518
 519         if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 520                 atomic_inc(&sk->sk_drops);
 521                 goto discard_and_relse;
 522         }
 523         if (nested)
 524                 bh_lock_sock_nested(sk);
 525         else
 526                 bh_lock_sock(sk);
 527         if (!sock_owned_by_user(sk)) {
 528                 /*
 529                  * trylock + unlock semantics:
 530                  */
 531                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 532
 533                 rc = sk_backlog_rcv(sk, skb);
 534
 535                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 536         } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 537                 bh_unlock_sock(sk);
 538                 atomic_inc(&sk->sk_drops);
 539                 goto discard_and_relse;
 540         }
 541
 542         bh_unlock_sock(sk);
 543 out:
 544         if (refcounted)
 545                 sock_put(sk);
 546         return rc;
 547 discard_and_relse:
 548         kfree_skb(skb);
 549         goto out;
 550 }
 551 EXPORT_SYMBOL(__sk_receive_skb);
 552
 553 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 554 {
 555         struct dst_entry *dst = __sk_dst_get(sk);
 556
 557         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 558                 sk_tx_queue_clear(sk);
 559                 sk->sk_dst_pending_confirm = 0;
 560                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 561                 dst_release(dst);
 562                 return NULL;
 563         }
 564
 565         return dst;
 566 }
 567 EXPORT_SYMBOL(__sk_dst_check);
 568
 569 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 570 {
 571         struct dst_entry *dst = sk_dst_get(sk);
 572
 573         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 574                 sk_dst_reset(sk);
 575                 dst_release(dst);
 576                 return NULL;
 577         }
 578
 579         return dst;
 580 }
 581 EXPORT_SYMBOL(sk_dst_check);
 582
 583 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 584                                 int optlen)
 585 {
 586         int ret = -ENOPROTOOPT;
 587 #ifdef CONFIG_NETDEVICES
 588         struct net *net = sock_net(sk);
 589         char devname[IFNAMSIZ];
 590         int index;
 591
 592         /* Sorry... */
 593         ret = -EPERM;
 594         if (!ns_capable(net->user_ns, CAP_NET_RAW))
 595                 goto out;
 596
 597         ret = -EINVAL;
 598         if (optlen < 0)
 599                 goto out;
 600
 601         /* Bind this socket to a particular device like "eth0",
 602          * as specified in the passed interface name. If the
 603          * name is "" or the option length is zero the socket
 604          * is not bound.
 605          */
 606         if (optlen > IFNAMSIZ - 1)
 607                 optlen = IFNAMSIZ - 1;
 608         memset(devname, 0, sizeof(devname));
 609
 610         ret = -EFAULT;
 611         if (copy_from_user(devname, optval, optlen))
 612                 goto out;
 613
 614         index = 0;
 615         if (devname[0] != '\0') {
 616                 struct net_device *dev;
 617
 618                 rcu_read_lock();
 619                 dev = dev_get_by_name_rcu(net, devname);
 620                 if (dev)
 621                         index = dev->ifindex;
 622                 rcu_read_unlock();
 623                 ret = -ENODEV;
 624                 if (!dev)
 625                         goto out;
 626         }
 627
 628         lock_sock(sk);
 629         sk->sk_bound_dev_if = index;
 630         sk_dst_reset(sk);
 631         release_sock(sk);
 632
 633         ret = 0;
 634
 635 out:
 636 #endif
 637
 638         return ret;
 639 }
 640
 641 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 642                                 int __user *optlen, int len)
 643 {
 644         int ret = -ENOPROTOOPT;
 645 #ifdef CONFIG_NETDEVICES
 646         struct net *net = sock_net(sk);
 647         char devname[IFNAMSIZ];
 648
 649         if (sk->sk_bound_dev_if == 0) {
 650                 len = 0;
 651                 goto zero;
 652         }
 653
 654         ret = -EINVAL;
 655         if (len < IFNAMSIZ)
 656                 goto out;
 657
 658         ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 659         if (ret)
 660                 goto out;
 661
 662         len = strlen(devname) + 1;
 663
 664         ret = -EFAULT;
 665         if (copy_to_user(optval, devname, len))
 666                 goto out;
 667
 668 zero:
 669         ret = -EFAULT;
 670         if (put_user(len, optlen))
 671                 goto out;
 672
 673         ret = 0;
 674
 675 out:
 676 #endif
 677
 678         return ret;
 679 }
 680
 681 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 682 {
 683         if (valbool)
 684                 sock_set_flag(sk, bit);
 685         else
 686                 sock_reset_flag(sk, bit);
 687 }
 688
 689 bool sk_mc_loop(struct sock *sk)
 690 {
 691         if (dev_recursion_level())
 692                 return false;
 693         if (!sk)
 694                 return true;
 695         switch (sk->sk_family) {
 696         case AF_INET:
 697                 return inet_sk(sk)->mc_loop;
 698 #if IS_ENABLED(CONFIG_IPV6)
 699         case AF_INET6:
 700                 return inet6_sk(sk)->mc_loop;
 701 #endif
 702         }
 703         WARN_ON(1);
 704         return true;
 705 }
 706 EXPORT_SYMBOL(sk_mc_loop);
 707
 708 /*
 709  *      This is meant for all protocols to use and covers goings on
 710  *      at the socket level. Everything here is generic.
 711  */
 712
 713 int sock_setsockopt(struct socket *sock, int level, int optname,
 714                     char __user *optval, unsigned int optlen)
 715 {
 716         struct sock *sk = sock->sk;
 717         int val;
 718         int valbool;
 719         struct linger ling;
 720         int ret = 0;
 721
 722         /*
 723          *      Options without arguments
 724          */
 725
 726         if (optname == SO_BINDTODEVICE)
 727                 return sock_setbindtodevice(sk, optval, optlen);
 728
 729         if (optlen < sizeof(int))
 730                 return -EINVAL;
 731
 732         if (get_user(val, (int __user *)optval))
 733                 return -EFAULT;
 734
 735         valbool = val ? 1 : 0;
 736
 737         lock_sock(sk);
 738
 739         switch (optname) {
 740         case SO_DEBUG:
 741                 if (val && !capable(CAP_NET_ADMIN))
 742                         ret = -EACCES;
 743                 else
 744                         sock_valbool_flag(sk, SOCK_DBG, valbool);
 745                 break;
 746         case SO_REUSEADDR:
 747                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 748                 break;
 749         case SO_REUSEPORT:
 750                 sk->sk_reuseport = valbool;
 751                 break;
 752         case SO_TYPE:
 753         case SO_PROTOCOL:
 754         case SO_DOMAIN:
 755         case SO_ERROR:
 756                 ret = -ENOPROTOOPT;
 757                 break;
 758         case SO_DONTROUTE:
 759                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 760                 break;
 761         case SO_BROADCAST:
 762                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 763                 break;
 764         case SO_SNDBUF:
 765                 /* Don't error on this BSD doesn't and if you think
 766                  * about it this is right. Otherwise apps have to
 767                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 768                  * are treated in BSD as hints
 769                  */
 770                 val = min_t(u32, val, sysctl_wmem_max);
 771 set_sndbuf:
 772                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 773                 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 774                 /* Wake up sending tasks if we upped the value. */
 775                 sk->sk_write_space(sk);
 776                 break;
 777
 778         case SO_SNDBUFFORCE:
 779                 if (!capable(CAP_NET_ADMIN)) {
 780                         ret = -EPERM;
 781                         break;
 782                 }
 783                 goto set_sndbuf;
 784
 785         case SO_RCVBUF:
 786                 /* Don't error on this BSD doesn't and if you think
 787                  * about it this is right. Otherwise apps have to
 788                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 789                  * are treated in BSD as hints
 790                  */
 791                 val = min_t(u32, val, sysctl_rmem_max);
 792 set_rcvbuf:
 793                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 794                 /*
 795                  * We double it on the way in to account for
 796                  * "struct sk_buff" etc. overhead.   Applications
 797                  * assume that the SO_RCVBUF setting they make will
 798                  * allow that much actual data to be received on that
 799                  * socket.
 800                  *
 801                  * Applications are unaware that "struct sk_buff" and
 802                  * other overheads allocate from the receive buffer
 803                  * during socket buffer allocation.
 804                  *
 805                  * And after considering the possible alternatives,
 806                  * returning the value we actually used in getsockopt
 807                  * is the most desirable behavior.
 808                  */
 809                 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
 810                 break;
 811
 812         case SO_RCVBUFFORCE:
 813                 if (!capable(CAP_NET_ADMIN)) {
 814                         ret = -EPERM;
 815                         break;
 816                 }
 817                 goto set_rcvbuf;
 818
 819         case SO_KEEPALIVE:
 820                 if (sk->sk_prot->keepalive)
 821                         sk->sk_prot->keepalive(sk, valbool);
 822                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 823                 break;
 824
 825         case SO_OOBINLINE:
 826                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 827                 break;
 828
 829         case SO_NO_CHECK:
 830                 sk->sk_no_check_tx = valbool;
 831                 break;
 832
 833         case SO_PRIORITY:
 834                 if ((val >= 0 && val <= 6) ||
 835                     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 836                         sk->sk_priority = val;
 837                 else
 838                         ret = -EPERM;
 839                 break;
 840
 841         case SO_LINGER:
 842                 if (optlen < sizeof(ling)) {
 843                         ret = -EINVAL;  /* 1003.1g */
 844                         break;
 845                 }
 846                 if (copy_from_user(&ling, optval, sizeof(ling))) {
 847                         ret = -EFAULT;
 848                         break;
 849                 }
 850                 if (!ling.l_onoff)
 851                         sock_reset_flag(sk, SOCK_LINGER);
 852                 else {
 853 #if (BITS_PER_LONG == 32)
 854                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 855                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 856                         else
 857 #endif
 858                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 859                         sock_set_flag(sk, SOCK_LINGER);
 860                 }
 861                 break;
 862
 863         case SO_BSDCOMPAT:
 864                 sock_warn_obsolete_bsdism("setsockopt");
 865                 break;
 866
 867         case SO_PASSCRED:
 868                 if (valbool)
 869                         set_bit(SOCK_PASSCRED, &sock->flags);
 870                 else
 871                         clear_bit(SOCK_PASSCRED, &sock->flags);
 872                 break;
 873
 874         case SO_TIMESTAMP:
 875         case SO_TIMESTAMPNS:
 876                 if (valbool)  {
 877                         if (optname == SO_TIMESTAMP)
 878                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 879                         else
 880                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 881                         sock_set_flag(sk, SOCK_RCVTSTAMP);
 882                         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 883                 } else {
 884                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
 885                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 886                 }
 887                 break;
 888
 889         case SO_TIMESTAMPING:
 890                 if (val & ~SOF_TIMESTAMPING_MASK) {
 891                         ret = -EINVAL;
 892                         break;
 893                 }
 894
 895                 if (val & SOF_TIMESTAMPING_OPT_ID &&
 896                     !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 897                         if (sk->sk_protocol == IPPROTO_TCP &&
 898                             sk->sk_type == SOCK_STREAM) {
 899                                 if ((1 << sk->sk_state) &
 900                                     (TCPF_CLOSE | TCPF_LISTEN)) {
 901                                         ret = -EINVAL;
 902                                         break;
 903                                 }
 904                                 sk->sk_tskey = tcp_sk(sk)->snd_una;
 905                         } else {
 906                                 sk->sk_tskey = 0;
 907                         }
 908                 }
 909
 910                 if (val & SOF_TIMESTAMPING_OPT_STATS &&
 911                     !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
 912                         ret = -EINVAL;
 913                         break;
 914                 }
 915
 916                 sk->sk_tsflags = val;
 917                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 918                         sock_enable_timestamp(sk,
 919                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
 920                 else
 921                         sock_disable_timestamp(sk,
 922                                                (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 923                 break;
 924
 925         case SO_RCVLOWAT:
 926                 if (val < 0)
 927                         val = INT_MAX;
 928                 sk->sk_rcvlowat = val ? : 1;
 929                 break;
 930
 931         case SO_RCVTIMEO:
 932                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 933                 break;
 934
 935         case SO_SNDTIMEO:
 936                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 937                 break;
 938
 939         case SO_ATTACH_FILTER:
 940                 ret = -EINVAL;
 941                 if (optlen == sizeof(struct sock_fprog)) {
 942                         struct sock_fprog fprog;
 943
 944                         ret = -EFAULT;
 945                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 946                                 break;
 947
 948                         ret = sk_attach_filter(&fprog, sk);
 949                 }
 950                 break;
 951
 952         case SO_ATTACH_BPF:
 953                 ret = -EINVAL;
 954                 if (optlen == sizeof(u32)) {
 955                         u32 ufd;
 956
 957                         ret = -EFAULT;
 958                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
 959                                 break;
 960
 961                         ret = sk_attach_bpf(ufd, sk);
 962                 }
 963                 break;
 964
 965         case SO_ATTACH_REUSEPORT_CBPF:
 966                 ret = -EINVAL;
 967                 if (optlen == sizeof(struct sock_fprog)) {
 968                         struct sock_fprog fprog;
 969
 970                         ret = -EFAULT;
 971                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 972                                 break;
 973
 974                         ret = sk_reuseport_attach_filter(&fprog, sk);
 975                 }
 976                 break;
 977
 978         case SO_ATTACH_REUSEPORT_EBPF:
 979                 ret = -EINVAL;
 980                 if (optlen == sizeof(u32)) {
 981                         u32 ufd;
 982
 983                         ret = -EFAULT;
 984                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
 985                                 break;
 986
 987                         ret = sk_reuseport_attach_bpf(ufd, sk);
 988                 }
 989                 break;
 990
 991         case SO_DETACH_FILTER:
 992                 ret = sk_detach_filter(sk);
 993                 break;
 994
 995         case SO_LOCK_FILTER:
 996                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 997                         ret = -EPERM;
 998                 else
 999                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1000                 break;
1001
1002         case SO_PASSSEC:
1003                 if (valbool)
1004                         set_bit(SOCK_PASSSEC, &sock->flags);
1005                 else
1006                         clear_bit(SOCK_PASSSEC, &sock->flags);
1007                 break;
1008         case SO_MARK:
1009                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1010                         ret = -EPERM;
1011                 else
1012                         sk->sk_mark = val;
1013                 break;
1014
1015         case SO_RXQ_OVFL:
1016                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1017                 break;
1018
1019         case SO_WIFI_STATUS:
1020                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1021                 break;
1022
1023         case SO_PEEK_OFF:
1024                 if (sock->ops->set_peek_off)
1025                         ret = sock->ops->set_peek_off(sk, val);
1026                 else
1027                         ret = -EOPNOTSUPP;
1028                 break;
1029
1030         case SO_NOFCS:
1031                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1032                 break;
1033
1034         case SO_SELECT_ERR_QUEUE:
1035                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1036                 break;
1037
1038 #ifdef CONFIG_NET_RX_BUSY_POLL
1039         case SO_BUSY_POLL:
1040                 /* allow unprivileged users to decrease the value */
1041                 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1042                         ret = -EPERM;
1043                 else {
1044                         if (val < 0)
1045                                 ret = -EINVAL;
1046                         else
1047                                 sk->sk_ll_usec = val;
1048                 }
1049                 break;
1050 #endif
1051
1052         case SO_MAX_PACING_RATE:
1053                 sk->sk_max_pacing_rate = val;
1054                 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1055                                          sk->sk_max_pacing_rate);
1056                 break;
1057
1058         case SO_INCOMING_CPU:
1059                 sk->sk_incoming_cpu = val;
1060                 break;
1061
1062         case SO_CNX_ADVICE:
1063                 if (val == 1)
1064                         dst_negative_advice(sk);
1065                 break;
1066         default:
1067                 ret = -ENOPROTOOPT;
1068                 break;
1069         }
1070         release_sock(sk);
1071         return ret;
1072 }
1073 EXPORT_SYMBOL(sock_setsockopt);
1074
1075
1076 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1077                           struct ucred *ucred)
1078 {
1079         ucred->pid = pid_vnr(pid);
1080         ucred->uid = ucred->gid = -1;
1081         if (cred) {
1082                 struct user_namespace *current_ns = current_user_ns();
1083
1084                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1085                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1086         }
1087 }
1088
1089 int sock_getsockopt(struct socket *sock, int level, int optname,
1090                     char __user *optval, int __user *optlen)
1091 {
1092         struct sock *sk = sock->sk;
1093
1094         union {
1095                 int val;
1096                 struct linger ling;
1097                 struct timeval tm;
1098         } v;
1099
1100         int lv = sizeof(int);
1101         int len;
1102
1103         if (get_user(len, optlen))
1104                 return -EFAULT;
1105         if (len < 0)
1106                 return -EINVAL;
1107
1108         memset(&v, 0, sizeof(v));
1109
1110         switch (optname) {
1111         case SO_DEBUG:
1112                 v.val = sock_flag(sk, SOCK_DBG);
1113                 break;
1114
1115         case SO_DONTROUTE:
1116                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1117                 break;
1118
1119         case SO_BROADCAST:
1120                 v.val = sock_flag(sk, SOCK_BROADCAST);
1121                 break;
1122
1123         case SO_SNDBUF:
1124                 v.val = sk->sk_sndbuf;
1125                 break;
1126
1127         case SO_RCVBUF:
1128                 v.val = sk->sk_rcvbuf;
1129                 break;
1130
1131         case SO_REUSEADDR:
1132                 v.val = sk->sk_reuse;
1133                 break;
1134
1135         case SO_REUSEPORT:
1136                 v.val = sk->sk_reuseport;
1137                 break;
1138
1139         case SO_KEEPALIVE:
1140                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1141                 break;
1142
1143         case SO_TYPE:
1144                 v.val = sk->sk_type;
1145                 break;
1146
1147         case SO_PROTOCOL:
1148                 v.val = sk->sk_protocol;
1149                 break;
1150
1151         case SO_DOMAIN:
1152                 v.val = sk->sk_family;
1153                 break;
1154
1155         case SO_ERROR:
1156                 v.val = -sock_error(sk);
1157                 if (v.val == 0)
1158                         v.val = xchg(&sk->sk_err_soft, 0);
1159                 break;
1160
1161         case SO_OOBINLINE:
1162                 v.val = sock_flag(sk, SOCK_URGINLINE);
1163                 break;
1164
1165         case SO_NO_CHECK:
1166                 v.val = sk->sk_no_check_tx;
1167                 break;
1168
1169         case SO_PRIORITY:
1170                 v.val = sk->sk_priority;
1171                 break;
1172
1173         case SO_LINGER:
1174                 lv              = sizeof(v.ling);
1175                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1176                 v.ling.l_linger = sk->sk_lingertime / HZ;
1177                 break;
1178
1179         case SO_BSDCOMPAT:
1180                 sock_warn_obsolete_bsdism("getsockopt");
1181                 break;
1182
1183         case SO_TIMESTAMP:
1184                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1185                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1186                 break;
1187
1188         case SO_TIMESTAMPNS:
1189                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1190                 break;
1191
1192         case SO_TIMESTAMPING:
1193                 v.val = sk->sk_tsflags;
1194                 break;
1195
1196         case SO_RCVTIMEO:
1197                 lv = sizeof(struct timeval);
1198                 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1199                         v.tm.tv_sec = 0;
1200                         v.tm.tv_usec = 0;
1201                 } else {
1202                         v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1203                         v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1204                 }
1205                 break;
1206
1207         case SO_SNDTIMEO:
1208                 lv = sizeof(struct timeval);
1209                 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1210                         v.tm.tv_sec = 0;
1211                         v.tm.tv_usec = 0;
1212                 } else {
1213                         v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1214                         v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1215                 }
1216                 break;
1217
1218         case SO_RCVLOWAT:
1219                 v.val = sk->sk_rcvlowat;
1220                 break;
1221
1222         case SO_SNDLOWAT:
1223                 v.val = 1;
1224                 break;
1225
1226         case SO_PASSCRED:
1227                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1228                 break;
1229
1230         case SO_PEERCRED:
1231         {
1232                 struct ucred peercred;
1233                 if (len > sizeof(peercred))
1234                         len = sizeof(peercred);
1235                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1236                 if (copy_to_user(optval, &peercred, len))
1237                         return -EFAULT;
1238                 goto lenout;
1239         }
1240
1241         case SO_PEERNAME:
1242         {
1243                 char address[128];
1244
1245                 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1246                         return -ENOTCONN;
1247                 if (lv < len)
1248                         return -EINVAL;
1249                 if (copy_to_user(optval, address, len))
1250                         return -EFAULT;
1251                 goto lenout;
1252         }
1253
1254         /* Dubious BSD thing... Probably nobody even uses it, but
1255          * the UNIX standard wants it for whatever reason... -DaveM
1256          */
1257         case SO_ACCEPTCONN:
1258                 v.val = sk->sk_state == TCP_LISTEN;
1259                 break;
1260
1261         case SO_PASSSEC:
1262                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1263                 break;
1264
1265         case SO_PEERSEC:
1266                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1267
1268         case SO_MARK:
1269                 v.val = sk->sk_mark;
1270                 break;
1271
1272         case SO_RXQ_OVFL:
1273                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1274                 break;
1275
1276         case SO_WIFI_STATUS:
1277                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1278                 break;
1279
1280         case SO_PEEK_OFF:
1281                 if (!sock->ops->set_peek_off)
1282                         return -EOPNOTSUPP;
1283
1284                 v.val = sk->sk_peek_off;
1285                 break;
1286         case SO_NOFCS:
1287                 v.val = sock_flag(sk, SOCK_NOFCS);
1288                 break;
1289
1290         case SO_BINDTODEVICE:
1291                 return sock_getbindtodevice(sk, optval, optlen, len);
1292
1293         case SO_GET_FILTER:
1294                 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1295                 if (len < 0)
1296                         return len;
1297
1298                 goto lenout;
1299
1300         case SO_LOCK_FILTER:
1301                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1302                 break;
1303
1304         case SO_BPF_EXTENSIONS:
1305                 v.val = bpf_tell_extensions();
1306                 break;
1307
1308         case SO_SELECT_ERR_QUEUE:
1309                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1310                 break;
1311
1312 #ifdef CONFIG_NET_RX_BUSY_POLL
1313         case SO_BUSY_POLL:
1314                 v.val = sk->sk_ll_usec;
1315                 break;
1316 #endif
1317
1318         case SO_MAX_PACING_RATE:
1319                 v.val = sk->sk_max_pacing_rate;
1320                 break;
1321
1322         case SO_INCOMING_CPU:
1323                 v.val = sk->sk_incoming_cpu;
1324                 break;
1325
1326         default:
1327                 /* We implement the SO_SNDLOWAT etc to not be settable
1328                  * (1003.1g 7).
1329                  */
1330                 return -ENOPROTOOPT;
1331         }
1332
1333         if (len > lv)
1334                 len = lv;
1335         if (copy_to_user(optval, &v, len))
1336                 return -EFAULT;
1337 lenout:
1338         if (put_user(len, optlen))
1339                 return -EFAULT;
1340         return 0;
1341 }
1342
1343 /*
1344  * Initialize an sk_lock.
1345  *
1346  * (We also register the sk_lock with the lock validator.)
1347  */
1348 static inline void sock_lock_init(struct sock *sk)
1349 {
1350         sock_lock_init_class_and_name(sk,
1351                         af_family_slock_key_strings[sk->sk_family],
1352                         af_family_slock_keys + sk->sk_family,
1353                         af_family_key_strings[sk->sk_family],
1354                         af_family_keys + sk->sk_family);
1355 }
1356
1357 /*
1358  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1359  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1360  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1361  */
1362 static void sock_copy(struct sock *nsk, const struct sock *osk)
1363 {
1364 #ifdef CONFIG_SECURITY_NETWORK
1365         void *sptr = nsk->sk_security;
1366 #endif
1367         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1368
1369         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1370                osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1371
1372 #ifdef CONFIG_SECURITY_NETWORK
1373         nsk->sk_security = sptr;
1374         security_sk_clone(osk, nsk);
1375 #endif
1376 }
1377
1378 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1379                 int family)
1380 {
1381         struct sock *sk;
1382         struct kmem_cache *slab;
1383
1384         slab = prot->slab;
1385         if (slab != NULL) {
1386                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1387                 if (!sk)
1388                         return sk;
1389                 if (priority & __GFP_ZERO)
1390                         sk_prot_clear_nulls(sk, prot->obj_size);
1391         } else
1392                 sk = kmalloc(prot->obj_size, priority);
1393
1394         if (sk != NULL) {
1395                 kmemcheck_annotate_bitfield(sk, flags);
1396
1397                 if (security_sk_alloc(sk, family, priority))
1398                         goto out_free;
1399
1400                 if (!try_module_get(prot->owner))
1401                         goto out_free_sec;
1402                 sk_tx_queue_clear(sk);
1403         }
1404
1405         return sk;
1406
1407 out_free_sec:
1408         security_sk_free(sk);
1409 out_free:
1410         if (slab != NULL)
1411                 kmem_cache_free(slab, sk);
1412         else
1413                 kfree(sk);
1414         return NULL;
1415 }
1416
1417 static void sk_prot_free(struct proto *prot, struct sock *sk)
1418 {
1419         struct kmem_cache *slab;
1420         struct module *owner;
1421
1422         owner = prot->owner;
1423         slab = prot->slab;
1424
1425         cgroup_sk_free(&sk->sk_cgrp_data);
1426         mem_cgroup_sk_free(sk);
1427         security_sk_free(sk);
1428         if (slab != NULL)
1429                 kmem_cache_free(slab, sk);
1430         else
1431                 kfree(sk);
1432         module_put(owner);
1433 }
1434
1435 /**
1436  *      sk_alloc - All socket objects are allocated here
1437  *      @net: the applicable net namespace
1438  *      @family: protocol family
1439  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1440  *      @prot: struct proto associated with this new sock instance
1441  *      @kern: is this to be a kernel socket?
1442  */
1443 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1444                       struct proto *prot, int kern)
1445 {
1446         struct sock *sk;
1447
1448         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1449         if (sk) {
1450                 sk->sk_family = family;
1451                 /*
1452                  * See comment in struct sock definition to understand
1453                  * why we need sk_prot_creator -acme
1454                  */
1455                 sk->sk_prot = sk->sk_prot_creator = prot;
1456                 sock_lock_init(sk);
1457                 sk->sk_net_refcnt = kern ? 0 : 1;
1458                 if (likely(sk->sk_net_refcnt))
1459                         get_net(net);
1460                 sock_net_set(sk, net);
1461                 atomic_set(&sk->sk_wmem_alloc, 1);
1462
1463                 mem_cgroup_sk_alloc(sk);
1464                 cgroup_sk_alloc(&sk->sk_cgrp_data);
1465                 sock_update_classid(&sk->sk_cgrp_data);
1466                 sock_update_netprioidx(&sk->sk_cgrp_data);
1467         }
1468
1469         return sk;
1470 }
1471 EXPORT_SYMBOL(sk_alloc);
1472
1473 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1474  * grace period. This is the case for UDP sockets and TCP listeners.
1475  */
1476 static void __sk_destruct(struct rcu_head *head)
1477 {
1478         struct sock *sk = container_of(head, struct sock, sk_rcu);
1479         struct sk_filter *filter;
1480
1481         if (sk->sk_destruct)
1482                 sk->sk_destruct(sk);
1483
1484         filter = rcu_dereference_check(sk->sk_filter,
1485                                        atomic_read(&sk->sk_wmem_alloc) == 0);
1486         if (filter) {
1487                 sk_filter_uncharge(sk, filter);
1488                 RCU_INIT_POINTER(sk->sk_filter, NULL);
1489         }
1490         if (rcu_access_pointer(sk->sk_reuseport_cb))
1491                 reuseport_detach_sock(sk);
1492
1493         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1494
1495         if (atomic_read(&sk->sk_omem_alloc))
1496                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1497                          __func__, atomic_read(&sk->sk_omem_alloc));
1498
1499         if (sk->sk_peer_cred)
1500                 put_cred(sk->sk_peer_cred);
1501         put_pid(sk->sk_peer_pid);
1502         if (likely(sk->sk_net_refcnt))
1503                 put_net(sock_net(sk));
1504         sk_prot_free(sk->sk_prot_creator, sk);
1505 }
1506
1507 void sk_destruct(struct sock *sk)
1508 {
1509         if (sock_flag(sk, SOCK_RCU_FREE))
1510                 call_rcu(&sk->sk_rcu, __sk_destruct);
1511         else
1512                 __sk_destruct(&sk->sk_rcu);
1513 }
1514
1515 static void __sk_free(struct sock *sk)
1516 {
1517         if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
1518                 sock_diag_broadcast_destroy(sk);
1519         else
1520                 sk_destruct(sk);
1521 }
1522
1523 void sk_free(struct sock *sk)
1524 {
1525         /*
1526          * We subtract one from sk_wmem_alloc and can know if
1527          * some packets are still in some tx queue.
1528          * If not null, sock_wfree() will call __sk_free(sk) later
1529          */
1530         if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1531                 __sk_free(sk);
1532 }
1533 EXPORT_SYMBOL(sk_free);
1534
1535 static void sk_init_common(struct sock *sk)
1536 {
1537         skb_queue_head_init(&sk->sk_receive_queue);
1538         skb_queue_head_init(&sk->sk_write_queue);
1539         skb_queue_head_init(&sk->sk_error_queue);
1540
1541         rwlock_init(&sk->sk_callback_lock);
1542         lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1543                         af_rlock_keys + sk->sk_family,
1544                         af_family_rlock_key_strings[sk->sk_family]);
1545         lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1546                         af_wlock_keys + sk->sk_family,
1547                         af_family_wlock_key_strings[sk->sk_family]);
1548         lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1549                         af_elock_keys + sk->sk_family,
1550                         af_family_elock_key_strings[sk->sk_family]);
1551         lockdep_set_class_and_name(&sk->sk_callback_lock,
1552                         af_callback_keys + sk->sk_family,
1553                         af_family_clock_key_strings[sk->sk_family]);
1554 }
1555
1556 /**
1557  *      sk_clone_lock - clone a socket, and lock its clone
1558  *      @sk: the socket to clone
1559  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1560  *
1561  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1562  */
1563 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1564 {
1565         struct sock *newsk;
1566         bool is_charged = true;
1567
1568         newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1569         if (newsk != NULL) {
1570                 struct sk_filter *filter;
1571
1572                 sock_copy(newsk, sk);
1573
1574                 /* SANITY */
1575                 if (likely(newsk->sk_net_refcnt))
1576                         get_net(sock_net(newsk));
1577                 sk_node_init(&newsk->sk_node);
1578                 sock_lock_init(newsk);
1579                 bh_lock_sock(newsk);
1580                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1581                 newsk->sk_backlog.len = 0;
1582
1583                 atomic_set(&newsk->sk_rmem_alloc, 0);
1584                 /*
1585                  * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1586                  */
1587                 atomic_set(&newsk->sk_wmem_alloc, 1);
1588                 atomic_set(&newsk->sk_omem_alloc, 0);
1589                 sk_init_common(newsk);
1590
1591                 newsk->sk_dst_cache     = NULL;
1592                 newsk->sk_dst_pending_confirm = 0;
1593                 newsk->sk_wmem_queued   = 0;
1594                 newsk->sk_forward_alloc = 0;
1595                 atomic_set(&newsk->sk_drops, 0);
1596                 newsk->sk_send_head     = NULL;
1597                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1598
1599                 sock_reset_flag(newsk, SOCK_DONE);
1600
1601                 filter = rcu_dereference_protected(newsk->sk_filter, 1);
1602                 if (filter != NULL)
1603                         /* though it's an empty new sock, the charging may fail
1604                          * if sysctl_optmem_max was changed between creation of
1605                          * original socket and cloning
1606                          */
1607                         is_charged = sk_filter_charge(newsk, filter);
1608
1609                 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1610                         sk_free_unlock_clone(newsk);
1611                         newsk = NULL;
1612                         goto out;
1613                 }
1614                 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1615
1616                 newsk->sk_err      = 0;
1617                 newsk->sk_err_soft = 0;
1618                 newsk->sk_priority = 0;
1619                 newsk->sk_incoming_cpu = raw_smp_processor_id();
1620                 atomic64_set(&newsk->sk_cookie, 0);
1621
1622                 mem_cgroup_sk_alloc(newsk);
1623                 cgroup_sk_alloc(&newsk->sk_cgrp_data);
1624
1625                 /*
1626                  * Before updating sk_refcnt, we must commit prior changes to memory
1627                  * (Documentation/RCU/rculist_nulls.txt for details)
1628                  */
1629                 smp_wmb();
1630                 atomic_set(&newsk->sk_refcnt, 2);
1631
1632                 /*
1633                  * Increment the counter in the same struct proto as the master
1634                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1635                  * is the same as sk->sk_prot->socks, as this field was copied
1636                  * with memcpy).
1637                  *
1638                  * This _changes_ the previous behaviour, where
1639                  * tcp_create_openreq_child always was incrementing the
1640                  * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1641                  * to be taken into account in all callers. -acme
1642                  */
1643                 sk_refcnt_debug_inc(newsk);
1644                 sk_set_socket(newsk, NULL);
1645                 newsk->sk_wq = NULL;
1646
1647                 if (newsk->sk_prot->sockets_allocated)
1648                         sk_sockets_allocated_inc(newsk);
1649
1650                 if (sock_needs_netstamp(sk) &&
1651                     newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1652                         net_enable_timestamp();
1653         }
1654 out:
1655         return newsk;
1656 }
1657 EXPORT_SYMBOL_GPL(sk_clone_lock);
1658
1659 void sk_free_unlock_clone(struct sock *sk)
1660 {
1661         /* It is still raw copy of parent, so invalidate
1662          * destructor and make plain sk_free() */
1663         sk->sk_destruct = NULL;
1664         bh_unlock_sock(sk);
1665         sk_free(sk);
1666 }
1667 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1668
1669 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1670 {
1671         u32 max_segs = 1;
1672
1673         sk_dst_set(sk, dst);
1674         sk->sk_route_caps = dst->dev->features;
1675         if (sk->sk_route_caps & NETIF_F_GSO)
1676                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1677         sk->sk_route_caps &= ~sk->sk_route_nocaps;
1678         if (sk_can_gso(sk)) {
1679                 if (dst->header_len) {
1680                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1681                 } else {
1682                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1683                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1684                         max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1685                 }
1686         }
1687         sk->sk_gso_max_segs = max_segs;
1688 }
1689 EXPORT_SYMBOL_GPL(sk_setup_caps);
1690
1691 /*
1692  *      Simple resource managers for sockets.
1693  */
1694
1695
1696 /*
1697  * Write buffer destructor automatically called from kfree_skb.
1698  */
1699 void sock_wfree(struct sk_buff *skb)
1700 {
1701         struct sock *sk = skb->sk;
1702         unsigned int len = skb->truesize;
1703
1704         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1705                 /*
1706                  * Keep a reference on sk_wmem_alloc, this will be released
1707                  * after sk_write_space() call
1708                  */
1709                 atomic_sub(len - 1, &sk->sk_wmem_alloc);
1710                 sk->sk_write_space(sk);
1711                 len = 1;
1712         }
1713         /*
1714          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1715          * could not do because of in-flight packets
1716          */
1717         if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1718                 __sk_free(sk);
1719 }
1720 EXPORT_SYMBOL(sock_wfree);
1721
1722 /* This variant of sock_wfree() is used by TCP,
1723  * since it sets SOCK_USE_WRITE_QUEUE.
1724  */
1725 void __sock_wfree(struct sk_buff *skb)
1726 {
1727         struct sock *sk = skb->sk;
1728
1729         if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1730                 __sk_free(sk);
1731 }
1732
1733 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1734 {
1735         skb_orphan(skb);
1736         skb->sk = sk;
1737 #ifdef CONFIG_INET
1738         if (unlikely(!sk_fullsock(sk))) {
1739                 skb->destructor = sock_edemux;
1740                 sock_hold(sk);
1741                 return;
1742         }
1743 #endif
1744         skb->destructor = sock_wfree;
1745         skb_set_hash_from_sk(skb, sk);
1746         /*
1747          * We used to take a refcount on sk, but following operation
1748          * is enough to guarantee sk_free() wont free this sock until
1749          * all in-flight packets are completed
1750          */
1751         atomic_add(skb->truesize, &sk->sk_wmem_alloc);
1752 }
1753 EXPORT_SYMBOL(skb_set_owner_w);
1754
1755 /* This helper is used by netem, as it can hold packets in its
1756  * delay queue. We want to allow the owner socket to send more
1757  * packets, as if they were already TX completed by a typical driver.
1758  * But we also want to keep skb->sk set because some packet schedulers
1759  * rely on it (sch_fq for example). So we set skb->truesize to a small
1760  * amount (1) and decrease sk_wmem_alloc accordingly.
1761  */
1762 void skb_orphan_partial(struct sk_buff *skb)
1763 {
1764         /* If this skb is a TCP pure ACK or already went here,
1765          * we have nothing to do. 2 is already a very small truesize.
1766          */
1767         if (skb->truesize <= 2)
1768                 return;
1769
1770         /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
1771          * so we do not completely orphan skb, but transfert all
1772          * accounted bytes but one, to avoid unexpected reorders.
1773          */
1774         if (skb->destructor == sock_wfree
1775 #ifdef CONFIG_INET
1776             || skb->destructor == tcp_wfree
1777 #endif
1778                 ) {
1779                 atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
1780                 skb->truesize = 1;
1781         } else {
1782                 skb_orphan(skb);
1783         }
1784 }
1785 EXPORT_SYMBOL(skb_orphan_partial);
1786
1787 /*
1788  * Read buffer destructor automatically called from kfree_skb.
1789  */
1790 void sock_rfree(struct sk_buff *skb)
1791 {
1792         struct sock *sk = skb->sk;
1793         unsigned int len = skb->truesize;
1794
1795         atomic_sub(len, &sk->sk_rmem_alloc);
1796         sk_mem_uncharge(sk, len);
1797 }
1798 EXPORT_SYMBOL(sock_rfree);
1799
1800 /*
1801  * Buffer destructor for skbs that are not used directly in read or write
1802  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1803  */
1804 void sock_efree(struct sk_buff *skb)
1805 {
1806         sock_put(skb->sk);
1807 }
1808 EXPORT_SYMBOL(sock_efree);
1809
1810 kuid_t sock_i_uid(struct sock *sk)
1811 {
1812         kuid_t uid;
1813
1814         read_lock_bh(&sk->sk_callback_lock);
1815         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1816         read_unlock_bh(&sk->sk_callback_lock);
1817         return uid;
1818 }
1819 EXPORT_SYMBOL(sock_i_uid);
1820
1821 unsigned long sock_i_ino(struct sock *sk)
1822 {
1823         unsigned long ino;
1824
1825         read_lock_bh(&sk->sk_callback_lock);
1826         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1827         read_unlock_bh(&sk->sk_callback_lock);
1828         return ino;
1829 }
1830 EXPORT_SYMBOL(sock_i_ino);
1831
1832 /*
1833  * Allocate a skb from the socket's send buffer.
1834  */
1835 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1836                              gfp_t priority)
1837 {
1838         if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1839                 struct sk_buff *skb = alloc_skb(size, priority);
1840                 if (skb) {
1841                         skb_set_owner_w(skb, sk);
1842                         return skb;
1843                 }
1844         }
1845         return NULL;
1846 }
1847 EXPORT_SYMBOL(sock_wmalloc);
1848
1849 /*
1850  * Allocate a memory block from the socket's option memory buffer.
1851  */
1852 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1853 {
1854         if ((unsigned int)size <= sysctl_optmem_max &&
1855             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1856                 void *mem;
1857                 /* First do the add, to avoid the race if kmalloc
1858                  * might sleep.
1859                  */
1860                 atomic_add(size, &sk->sk_omem_alloc);
1861                 mem = kmalloc(size, priority);
1862                 if (mem)
1863                         return mem;
1864                 atomic_sub(size, &sk->sk_omem_alloc);
1865         }
1866         return NULL;
1867 }
1868 EXPORT_SYMBOL(sock_kmalloc);
1869
1870 /* Free an option memory block. Note, we actually want the inline
1871  * here as this allows gcc to detect the nullify and fold away the
1872  * condition entirely.
1873  */
1874 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1875                                   const bool nullify)
1876 {
1877         if (WARN_ON_ONCE(!mem))
1878                 return;
1879         if (nullify)
1880                 kzfree(mem);
1881         else
1882                 kfree(mem);
1883         atomic_sub(size, &sk->sk_omem_alloc);
1884 }
1885
1886 void sock_kfree_s(struct sock *sk, void *mem, int size)
1887 {
1888         __sock_kfree_s(sk, mem, size, false);
1889 }
1890 EXPORT_SYMBOL(sock_kfree_s);
1891
1892 void sock_kzfree_s(struct sock *sk, void *mem, int size)
1893 {
1894         __sock_kfree_s(sk, mem, size, true);
1895 }
1896 EXPORT_SYMBOL(sock_kzfree_s);
1897
1898 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1899    I think, these locks should be removed for datagram sockets.
1900  */
1901 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1902 {
1903         DEFINE_WAIT(wait);
1904
1905         sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1906         for (;;) {
1907                 if (!timeo)
1908                         break;
1909                 if (signal_pending(current))
1910                         break;
1911                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1912                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1913                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1914                         break;
1915                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1916                         break;
1917                 if (sk->sk_err)
1918                         break;
1919                 timeo = schedule_timeout(timeo);
1920         }
1921         finish_wait(sk_sleep(sk), &wait);
1922         return timeo;
1923 }
1924
1925
1926 /*
1927  *      Generic send/receive buffer handlers
1928  */
1929
1930 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1931                                      unsigned long data_len, int noblock,
1932                                      int *errcode, int max_page_order)
1933 {
1934         struct sk_buff *skb;
1935         long timeo;
1936         int err;
1937
1938         timeo = sock_sndtimeo(sk, noblock);
1939         for (;;) {
1940                 err = sock_error(sk);
1941                 if (err != 0)
1942                         goto failure;
1943
1944                 err = -EPIPE;
1945                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1946                         goto failure;
1947
1948                 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1949                         break;
1950
1951                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1952                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1953                 err = -EAGAIN;
1954                 if (!timeo)
1955                         goto failure;
1956                 if (signal_pending(current))
1957                         goto interrupted;
1958                 timeo = sock_wait_for_wmem(sk, timeo);
1959         }
1960         skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1961                                    errcode, sk->sk_allocation);
1962         if (skb)
1963                 skb_set_owner_w(skb, sk);
1964         return skb;
1965
1966 interrupted:
1967         err = sock_intr_errno(timeo);
1968 failure:
1969         *errcode = err;
1970         return NULL;
1971 }
1972 EXPORT_SYMBOL(sock_alloc_send_pskb);
1973
/**
 * sock_alloc_send_skb - allocate a linear send skb
 * @sk: sending socket
 * @size: header length handed to sock_alloc_send_pskb()
 * @noblock: forwarded to sock_alloc_send_pskb()
 * @errcode: receives the error code on failure
 *
 * Shorthand for sock_alloc_send_pskb() with data_len and max_page_order
 * both set to 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long data_len = 0;
	const int max_page_order = 0;

	return sock_alloc_send_pskb(sk, size, data_len, noblock, errcode,
				    max_page_order);
}
1979 EXPORT_SYMBOL(sock_alloc_send_skb);
1980
1981 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
1982                      struct sockcm_cookie *sockc)
1983 {
1984         u32 tsflags;
1985
1986         switch (cmsg->cmsg_type) {
1987         case SO_MARK:
1988                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1989                         return -EPERM;
1990                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1991                         return -EINVAL;
1992                 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
1993                 break;
1994         case SO_TIMESTAMPING:
1995                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1996                         return -EINVAL;
1997
1998                 tsflags = *(u32 *)CMSG_DATA(cmsg);
1999                 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2000                         return -EINVAL;
2001
2002                 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2003                 sockc->tsflags |= tsflags;
2004                 break;
2005         /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2006         case SCM_RIGHTS:
2007         case SCM_CREDENTIALS:
2008                 break;
2009         default:
2010                 return -EINVAL;
2011         }
2012         return 0;
2013 }
2014 EXPORT_SYMBOL(__sock_cmsg_send);
2015
2016 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2017                    struct sockcm_cookie *sockc)
2018 {
2019         struct cmsghdr *cmsg;
2020         int ret;
2021
2022         for_each_cmsghdr(cmsg, msg) {
2023                 if (!CMSG_OK(msg, cmsg))
2024                         return -EINVAL;
2025                 if (cmsg->cmsg_level != SOL_SOCKET)
2026                         continue;
2027                 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2028                 if (ret)
2029                         return ret;
2030         }
2031         return 0;
2032 }
2033 EXPORT_SYMBOL(sock_cmsg_send);
2034
2035 /* On 32bit arches, an skb frag is limited to 2^15 */
2036 #define SKB_FRAG_PAGE_ORDER     get_order(32768)
2037
2038 /**
2039  * skb_page_frag_refill - check that a page_frag contains enough room
2040  * @sz: minimum size of the fragment we want to get
2041  * @pfrag: pointer to page_frag
2042  * @gfp: priority for memory allocation
2043  *
2044  * Note: While this allocator tries to use high order pages, there is
2045  * no guarantee that allocations succeed. Therefore, @sz MUST be
2046  * less or equal than PAGE_SIZE.
2047  */
2048 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2049 {
2050         if (pfrag->page) {
2051                 if (page_ref_count(pfrag->page) == 1) {
2052                         pfrag->offset = 0;
2053                         return true;
2054                 }
2055                 if (pfrag->offset + sz <= pfrag->size)
2056                         return true;
2057                 put_page(pfrag->page);
2058         }
2059
2060         pfrag->offset = 0;
2061         if (SKB_FRAG_PAGE_ORDER) {
2062                 /* Avoid direct reclaim but allow kswapd to wake */
2063                 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2064                                           __GFP_COMP | __GFP_NOWARN |
2065                                           __GFP_NORETRY,
2066                                           SKB_FRAG_PAGE_ORDER);
2067                 if (likely(pfrag->page)) {
2068                         pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2069                         return true;
2070                 }
2071         }
2072         pfrag->page = alloc_page(gfp);
2073         if (likely(pfrag->page)) {
2074                 pfrag->size = PAGE_SIZE;
2075                 return true;
2076         }
2077         return false;
2078 }
2079 EXPORT_SYMBOL(skb_page_frag_refill);
2080
2081 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2082 {
2083         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2084                 return true;
2085
2086         sk_enter_memory_pressure(sk);
2087         sk_stream_moderate_sndbuf(sk);
2088         return false;
2089 }
2090 EXPORT_SYMBOL(sk_page_frag_refill);
2091
2092 static void __lock_sock(struct sock *sk)
2093         __releases(&sk->sk_lock.slock)
2094         __acquires(&sk->sk_lock.slock)
2095 {
2096         DEFINE_WAIT(wait);
2097
2098         for (;;) {
2099                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2100                                         TASK_UNINTERRUPTIBLE);
2101                 spin_unlock_bh(&sk->sk_lock.slock);
2102                 schedule();
2103                 spin_lock_bh(&sk->sk_lock.slock);
2104                 if (!sock_owned_by_user(sk))
2105                         break;
2106         }
2107         finish_wait(&sk->sk_lock.wq, &wait);
2108 }
2109
2110 static void __release_sock(struct sock *sk)
2111         __releases(&sk->sk_lock.slock)
2112         __acquires(&sk->sk_lock.slock)
2113 {
2114         struct sk_buff *skb, *next;
2115
2116         while ((skb = sk->sk_backlog.head) != NULL) {
2117                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2118
2119                 spin_unlock_bh(&sk->sk_lock.slock);
2120
2121                 do {
2122                         next = skb->next;
2123                         prefetch(next);
2124                         WARN_ON_ONCE(skb_dst_is_noref(skb));
2125                         skb->next = NULL;
2126                         sk_backlog_rcv(sk, skb);
2127
2128                         cond_resched();
2129
2130                         skb = next;
2131                 } while (skb != NULL);
2132
2133                 spin_lock_bh(&sk->sk_lock.slock);
2134         }
2135
2136         /*
2137          * Doing the zeroing here guarantee we can not loop forever
2138          * while a wild producer attempts to flood us.
2139          */
2140         sk->sk_backlog.len = 0;
2141 }
2142
2143 void __sk_flush_backlog(struct sock *sk)
2144 {
2145         spin_lock_bh(&sk->sk_lock.slock);
2146         __release_sock(sk);
2147         spin_unlock_bh(&sk->sk_lock.slock);
2148 }
2149
2150 /**
2151  * sk_wait_data - wait for data to arrive at sk_receive_queue
2152  * @sk:    sock to wait on
2153  * @timeo: for how long
2154  * @skb:   last skb seen on sk_receive_queue
2155  *
2156  * Now socket state including sk->sk_err is changed only under lock,
2157  * hence we may omit checks after joining wait queue.
2158  * We check receive queue before schedule() only as optimization;
2159  * it is very likely that release_sock() added new data.
2160  */
2161 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2162 {
2163         DEFINE_WAIT_FUNC(wait, woken_wake_function);
2164         int rc;
2165
2166         add_wait_queue(sk_sleep(sk), &wait);
2167         sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2168         rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2169         sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2170         remove_wait_queue(sk_sleep(sk), &wait);
2171         return rc;
2172 }
2173 EXPORT_SYMBOL(sk_wait_data);
2174
2175 /**
2176  *      __sk_mem_raise_allocated - increase memory_allocated
2177  *      @sk: socket
2178  *      @size: memory size to allocate
2179  *      @amt: pages to allocate
2180  *      @kind: allocation type
2181  *
2182  *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2183  */
2184 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2185 {
2186         struct proto *prot = sk->sk_prot;
2187         long allocated = sk_memory_allocated_add(sk, amt);
2188
2189         if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2190             !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2191                 goto suppress_allocation;
2192
2193         /* Under limit. */
2194         if (allocated <= sk_prot_mem_limits(sk, 0)) {
2195                 sk_leave_memory_pressure(sk);
2196                 return 1;
2197         }
2198
2199         /* Under pressure. */
2200         if (allocated > sk_prot_mem_limits(sk, 1))
2201                 sk_enter_memory_pressure(sk);
2202
2203         /* Over hard limit. */
2204         if (allocated > sk_prot_mem_limits(sk, 2))
2205                 goto suppress_allocation;
2206
2207         /* guarantee minimum buffer size under pressure */
2208         if (kind == SK_MEM_RECV) {
2209                 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2210                         return 1;
2211
2212         } else { /* SK_MEM_SEND */
2213                 if (sk->sk_type == SOCK_STREAM) {
2214                         if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2215                                 return 1;
2216                 } else if (atomic_read(&sk->sk_wmem_alloc) <
2217                            prot->sysctl_wmem[0])
2218                                 return 1;
2219         }
2220
2221         if (sk_has_memory_pressure(sk)) {
2222                 int alloc;
2223
2224                 if (!sk_under_memory_pressure(sk))
2225                         return 1;
2226                 alloc = sk_sockets_allocated_read_positive(sk);
2227                 if (sk_prot_mem_limits(sk, 2) > alloc *
2228                     sk_mem_pages(sk->sk_wmem_queued +
2229                                  atomic_read(&sk->sk_rmem_alloc) +
2230                                  sk->sk_forward_alloc))
2231                         return 1;
2232         }
2233
2234 suppress_allocation:
2235
2236         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2237                 sk_stream_moderate_sndbuf(sk);
2238
2239                 /* Fail only if socket is _under_ its sndbuf.
2240                  * In this case we cannot block, so that we have to fail.
2241                  */
2242                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2243                         return 1;
2244         }
2245
2246         trace_sock_exceed_buf_limit(sk, prot, allocated);
2247
2248         sk_memory_allocated_sub(sk, amt);
2249
2250         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2251                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2252
2253         return 0;
2254 }
2255 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2256
2257 /**
2258  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2259  *      @sk: socket
2260  *      @size: memory size to allocate
2261  *      @kind: allocation type
2262  *
2263  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2264  *      rmem allocation. This function assumes that protocols which have
2265  *      memory_pressure use sk_wmem_queued as write buffer accounting.
2266  */
2267 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2268 {
2269         int ret, amt = sk_mem_pages(size);
2270
2271         sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2272         ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2273         if (!ret)
2274                 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2275         return ret;
2276 }
2277 EXPORT_SYMBOL(__sk_mem_schedule);
2278
2279 /**
2280  *      __sk_mem_reduce_allocated - reclaim memory_allocated
2281  *      @sk: socket
2282  *      @amount: number of quanta
2283  *
2284  *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2285  */
2286 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2287 {
2288         sk_memory_allocated_sub(sk, amount);
2289
2290         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2291                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2292
2293         if (sk_under_memory_pressure(sk) &&
2294             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2295                 sk_leave_memory_pressure(sk);
2296 }
2297 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2298
2299 /**
2300  *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2301  *      @sk: socket
2302  *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2303  */
2304 void __sk_mem_reclaim(struct sock *sk, int amount)
2305 {
2306         amount >>= SK_MEM_QUANTUM_SHIFT;
2307         sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2308         __sk_mem_reduce_allocated(sk, amount);
2309 }
2310 EXPORT_SYMBOL(__sk_mem_reclaim);
2311
2312 int sk_set_peek_off(struct sock *sk, int val)
2313 {
2314         if (val < 0)
2315                 return -EINVAL;
2316
2317         sk->sk_peek_off = val;
2318         return 0;
2319 }
2320 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2321
2322 /*
2323  * Set of default routines for initialising struct proto_ops when
2324  * the protocol does not support a particular function. In certain
2325  * cases where it makes no sense for a protocol to have a "do nothing"
2326  * function, some default processing is provided.
2327  */
2328
/* Default bind() handler: always returns -EOPNOTSUPP; arguments are unused. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	const int err = -EOPNOTSUPP;

	return err;
}
2333 EXPORT_SYMBOL(sock_no_bind);
2334
/* Default connect() handler: always returns -EOPNOTSUPP; arguments are
 * unused.
 */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	const int err = -EOPNOTSUPP;

	return err;
}
2340 EXPORT_SYMBOL(sock_no_connect);
2341
/* Default socketpair() handler: always returns -EOPNOTSUPP; arguments are
 * unused.
 */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	const int err = -EOPNOTSUPP;

	return err;
}
2346 EXPORT_SYMBOL(sock_no_socketpair);
2347
/* Default accept() handler: always returns -EOPNOTSUPP; arguments are
 * unused.
 */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	const int err = -EOPNOTSUPP;

	return err;
}
2352 EXPORT_SYMBOL(sock_no_accept);
2353
/* Default getname() handler: always returns -EOPNOTSUPP and never writes
 * through @saddr or @len.
 */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	const int err = -EOPNOTSUPP;

	return err;
}
2359 EXPORT_SYMBOL(sock_no_getname);
2360
2361 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2362 {
2363         return 0;
2364 }
2365 EXPORT_SYMBOL(sock_no_poll);
2366
/* Default ioctl() handler: always returns -EOPNOTSUPP; arguments are
 * unused.
 */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	const int err = -EOPNOTSUPP;

	return err;
}
2371 EXPORT_SYMBOL(sock_no_ioctl);
2372
/* Default listen() handler: always returns -EOPNOTSUPP; arguments are
 * unused.
 */
int sock_no_listen(struct socket *sock, int backlog)
{
	const int err = -EOPNOTSUPP;

	return err;
}
2377 EXPORT_SYMBOL(sock_no_listen);
2378
/* Default shutdown() handler: always returns -EOPNOTSUPP; arguments are
 * unused.
 */
int sock_no_shutdown(struct socket *sock, int how)
{
	const int err = -EOPNOTSUPP;

	return err;
}
2383 EXPORT_SYMBOL(sock_no_shutdown);
2384
2385 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2386                     char __user *optval, unsigned int optlen)
2387 {
2388         return -EOPNOTSUPP;
2389 }
2390 EXPORT_SYMBOL(sock_no_setsockopt);
2391
2392 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2393                     char __user *optval, int __user *optlen)
2394 {
2395         return -EOPNOTSUPP;
2396 }
2397 EXPORT_SYMBOL(sock_no_getsockopt);
2398
/* Default sendmsg() handler: always returns -EOPNOTSUPP; arguments are
 * unused.
 */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	const int err = -EOPNOTSUPP;

	return err;
}
2403 EXPORT_SYMBOL(sock_no_sendmsg);
2404
/* Default recvmsg() handler: always returns -EOPNOTSUPP; arguments are
 * unused.
 */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	const int err = -EOPNOTSUPP;

	return err;
}
2410 EXPORT_SYMBOL(sock_no_recvmsg);
2411
/* Default mmap() handler: always returns -ENODEV, mirroring the error
 * code of a missing mmap method; arguments are unused.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	const int err = -ENODEV;

	return err;
}
2417 EXPORT_SYMBOL(sock_no_mmap);
2418
2419 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2420 {
2421         ssize_t res;
2422         struct msghdr msg = {.msg_flags = flags};
2423         struct kvec iov;
2424         char *kaddr = kmap(page);
2425         iov.iov_base = kaddr + offset;
2426         iov.iov_len = size;
2427         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2428         kunmap(page);
2429         return res;
2430 }
2431 EXPORT_SYMBOL(sock_no_sendpage);
2432
2433 /*
2434  *      Default Socket Callbacks
2435  */
2436
2437 static void sock_def_wakeup(struct sock *sk)
2438 {
2439         struct socket_wq *wq;
2440
2441         rcu_read_lock();
2442         wq = rcu_dereference(sk->sk_wq);
2443         if (skwq_has_sleeper(wq))
2444                 wake_up_interruptible_all(&wq->wait);
2445         rcu_read_unlock();
2446 }
2447
2448 static void sock_def_error_report(struct sock *sk)
2449 {
2450         struct socket_wq *wq;
2451
2452         rcu_read_lock();
2453         wq = rcu_dereference(sk->sk_wq);
2454         if (skwq_has_sleeper(wq))
2455                 wake_up_interruptible_poll(&wq->wait, POLLERR);
2456         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2457         rcu_read_unlock();
2458 }
2459
2460 static void sock_def_readable(struct sock *sk)
2461 {
2462         struct socket_wq *wq;
2463
2464         rcu_read_lock();
2465         wq = rcu_dereference(sk->sk_wq);
2466         if (skwq_has_sleeper(wq))
2467                 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2468                                                 POLLRDNORM | POLLRDBAND);
2469         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2470         rcu_read_unlock();
2471 }
2472
2473 static void sock_def_write_space(struct sock *sk)
2474 {
2475         struct socket_wq *wq;
2476
2477         rcu_read_lock();
2478
2479         /* Do not wake up a writer until he can make "significant"
2480          * progress.  --DaveM
2481          */
2482         if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2483                 wq = rcu_dereference(sk->sk_wq);
2484                 if (skwq_has_sleeper(wq))
2485                         wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2486                                                 POLLWRNORM | POLLWRBAND);
2487
2488                 /* Should agree with poll, otherwise some programs break */
2489                 if (sock_writeable(sk))
2490                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2491         }
2492
2493         rcu_read_unlock();
2494 }
2495
/* Empty destructor: does nothing with @sk. */
static void sock_def_destruct(struct sock *sk)
{
	/* Intentionally empty */
}
2499
2500 void sk_send_sigurg(struct sock *sk)
2501 {
2502         if (sk->sk_socket && sk->sk_socket->file)
2503                 if (send_sigurg(&sk->sk_socket->file->f_owner))
2504                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2505 }
2506 EXPORT_SYMBOL(sk_send_sigurg);
2507
/**
 * sk_reset_timer - (re)arm a socket timer
 * @sk: socket owning the timer
 * @timer: timer to modify
 * @expires: new expiry passed to mod_timer()
 *
 * Takes a reference on @sk through sock_hold() when mod_timer() returns 0.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires))
		return;

	sock_hold(sk);
}
2514 EXPORT_SYMBOL(sk_reset_timer);
2515
/*
 * Delete @timer and, if del_timer() reports that it removed one, drop
 * a socket reference with __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (!del_timer(timer))
		return;

	__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
2522
2523 void sock_init_data(struct socket *sock, struct sock *sk)
2524 {
2525         sk_init_common(sk);
2526         sk->sk_send_head        =       NULL;
2527
2528         init_timer(&sk->sk_timer);
2529
2530         sk->sk_allocation       =       GFP_KERNEL;
2531         sk->sk_rcvbuf           =       sysctl_rmem_default;
2532         sk->sk_sndbuf           =       sysctl_wmem_default;
2533         sk->sk_state            =       TCP_CLOSE;
2534         sk_set_socket(sk, sock);
2535
2536         sock_set_flag(sk, SOCK_ZAPPED);
2537
2538         if (sock) {
2539                 sk->sk_type     =       sock->type;
2540                 sk->sk_wq       =       sock->wq;
2541                 sock->sk        =       sk;
2542                 sk->sk_uid      =       SOCK_INODE(sock)->i_uid;
2543         } else {
2544                 sk->sk_wq       =       NULL;
2545                 sk->sk_uid      =       make_kuid(sock_net(sk)->user_ns, 0);
2546         }
2547
2548         sk->sk_state_change     =       sock_def_wakeup;
2549         sk->sk_data_ready       =       sock_def_readable;
2550         sk->sk_write_space      =       sock_def_write_space;
2551         sk->sk_error_report     =       sock_def_error_report;
2552         sk->sk_destruct         =       sock_def_destruct;
2553
2554         sk->sk_frag.page        =       NULL;
2555         sk->sk_frag.offset      =       0;
2556         sk->sk_peek_off         =       -1;
2557
2558         sk->sk_peer_pid         =       NULL;
2559         sk->sk_peer_cred        =       NULL;
2560         sk->sk_write_pending    =       0;
2561         sk->sk_rcvlowat         =       1;
2562         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2563         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2564
2565         sk->sk_stamp = ktime_set(-1L, 0);
2566
2567 #ifdef CONFIG_NET_RX_BUSY_POLL
2568         sk->sk_napi_id          =       0;
2569         sk->sk_ll_usec          =       sysctl_net_busy_read;
2570 #endif
2571
2572         sk->sk_max_pacing_rate = ~0U;
2573         sk->sk_pacing_rate = ~0U;
2574         sk->sk_incoming_cpu = -1;
2575         /*
2576          * Before updating sk_refcnt, we must commit prior changes to memory
2577          * (Documentation/RCU/rculist_nulls.txt for details)
2578          */
2579         smp_wmb();
2580         atomic_set(&sk->sk_refcnt, 1);
2581         atomic_set(&sk->sk_drops, 0);
2582 }
2583 EXPORT_SYMBOL(sock_init_data);
2584
2585 void lock_sock_nested(struct sock *sk, int subclass)
2586 {
2587         might_sleep();
2588         spin_lock_bh(&sk->sk_lock.slock);
2589         if (sk->sk_lock.owned)
2590                 __lock_sock(sk);
2591         sk->sk_lock.owned = 1;
2592         spin_unlock(&sk->sk_lock.slock);
2593         /*
2594          * The sk_lock has mutex_lock() semantics here:
2595          */
2596         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2597         local_bh_enable();
2598 }
2599 EXPORT_SYMBOL(lock_sock_nested);
2600
2601 void release_sock(struct sock *sk)
2602 {
2603         spin_lock_bh(&sk->sk_lock.slock);
2604         if (sk->sk_backlog.tail)
2605                 __release_sock(sk);
2606
2607         /* Warning : release_cb() might need to release sk ownership,
2608          * ie call sock_release_ownership(sk) before us.
2609          */
2610         if (sk->sk_prot->release_cb)
2611                 sk->sk_prot->release_cb(sk);
2612
2613         sock_release_ownership(sk);
2614         if (waitqueue_active(&sk->sk_lock.wq))
2615                 wake_up(&sk->sk_lock.wq);
2616         spin_unlock_bh(&sk->sk_lock.slock);
2617 }
2618 EXPORT_SYMBOL(release_sock);
2619
2620 /**
2621  * lock_sock_fast - fast version of lock_sock
2622  * @sk: socket
2623  *
2624  * This version should be used for very small section, where process wont block
2625  * return false if fast path is taken
2626  *   sk_lock.slock locked, owned = 0, BH disabled
2627  * return true if slow path is taken
2628  *   sk_lock.slock unlocked, owned = 1, BH enabled
2629  */
2630 bool lock_sock_fast(struct sock *sk)
2631 {
2632         might_sleep();
2633         spin_lock_bh(&sk->sk_lock.slock);
2634
2635         if (!sk->sk_lock.owned)
2636                 /*
2637                  * Note : We must disable BH
2638                  */
2639                 return false;
2640
2641         __lock_sock(sk);
2642         sk->sk_lock.owned = 1;
2643         spin_unlock(&sk->sk_lock.slock);
2644         /*
2645          * The sk_lock has mutex_lock() semantics here:
2646          */
2647         mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2648         local_bh_enable();
2649         return true;
2650 }
2651 EXPORT_SYMBOL(lock_sock_fast);
2652
2653 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2654 {
2655         struct timeval tv;
2656         if (!sock_flag(sk, SOCK_TIMESTAMP))
2657                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2658         tv = ktime_to_timeval(sk->sk_stamp);
2659         if (tv.tv_sec == -1)
2660                 return -ENOENT;
2661         if (tv.tv_sec == 0) {
2662                 sk->sk_stamp = ktime_get_real();
2663                 tv = ktime_to_timeval(sk->sk_stamp);
2664         }
2665         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2666 }
2667 EXPORT_SYMBOL(sock_get_timestamp);
2668
2669 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2670 {
2671         struct timespec ts;
2672         if (!sock_flag(sk, SOCK_TIMESTAMP))
2673                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2674         ts = ktime_to_timespec(sk->sk_stamp);
2675         if (ts.tv_sec == -1)
2676                 return -ENOENT;
2677         if (ts.tv_sec == 0) {
2678                 sk->sk_stamp = ktime_get_real();
2679                 ts = ktime_to_timespec(sk->sk_stamp);
2680         }
2681         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2682 }
2683 EXPORT_SYMBOL(sock_get_timestampns);
2684
2685 void sock_enable_timestamp(struct sock *sk, int flag)
2686 {
2687         if (!sock_flag(sk, flag)) {
2688                 unsigned long previous_flags = sk->sk_flags;
2689
2690                 sock_set_flag(sk, flag);
2691                 /*
2692                  * we just set one of the two flags which require net
2693                  * time stamping, but time stamping might have been on
2694                  * already because of the other one
2695                  */
2696                 if (sock_needs_netstamp(sk) &&
2697                     !(previous_flags & SK_FLAGS_TIMESTAMP))
2698                         net_enable_timestamp();
2699         }
2700 }
2701
2702 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2703                        int level, int type)
2704 {
2705         struct sock_exterr_skb *serr;
2706         struct sk_buff *skb;
2707         int copied, err;
2708
2709         err = -EAGAIN;
2710         skb = sock_dequeue_err_skb(sk);
2711         if (skb == NULL)
2712                 goto out;
2713
2714         copied = skb->len;
2715         if (copied > len) {
2716                 msg->msg_flags |= MSG_TRUNC;
2717                 copied = len;
2718         }
2719         err = skb_copy_datagram_msg(skb, 0, msg, copied);
2720         if (err)
2721                 goto out_free_skb;
2722
2723         sock_recv_timestamp(msg, sk, skb);
2724
2725         serr = SKB_EXT_ERR(skb);
2726         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2727
2728         msg->msg_flags |= MSG_ERRQUEUE;
2729         err = copied;
2730
2731 out_free_skb:
2732         kfree_skb(skb);
2733 out:
2734         return err;
2735 }
2736 EXPORT_SYMBOL(sock_recv_errqueue);
2737
2738 /*
2739  *      Get a socket option on an socket.
2740  *
2741  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
2742  *      asynchronous errors should be reported by getsockopt. We assume
2743  *      this means if you specify SO_ERROR (otherwise whats the point of it).
2744  */
2745 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2746                            char __user *optval, int __user *optlen)
2747 {
2748         struct sock *sk = sock->sk;
2749
2750         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2751 }
2752 EXPORT_SYMBOL(sock_common_getsockopt);
2753
2754 #ifdef CONFIG_COMPAT
2755 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2756                                   char __user *optval, int __user *optlen)
2757 {
2758         struct sock *sk = sock->sk;
2759
2760         if (sk->sk_prot->compat_getsockopt != NULL)
2761                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2762                                                       optval, optlen);
2763         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2764 }
2765 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2766 #endif
2767
2768 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2769                         int flags)
2770 {
2771         struct sock *sk = sock->sk;
2772         int addr_len = 0;
2773         int err;
2774
2775         err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2776                                    flags & ~MSG_DONTWAIT, &addr_len);
2777         if (err >= 0)
2778                 msg->msg_namelen = addr_len;
2779         return err;
2780 }
2781 EXPORT_SYMBOL(sock_common_recvmsg);
2782
2783 /*
2784  *      Set socket options on an inet socket.
2785  */
2786 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2787                            char __user *optval, unsigned int optlen)
2788 {
2789         struct sock *sk = sock->sk;
2790
2791         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2792 }
2793 EXPORT_SYMBOL(sock_common_setsockopt);
2794
2795 #ifdef CONFIG_COMPAT
2796 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2797                                   char __user *optval, unsigned int optlen)
2798 {
2799         struct sock *sk = sock->sk;
2800
2801         if (sk->sk_prot->compat_setsockopt != NULL)
2802                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2803                                                       optval, optlen);
2804         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2805 }
2806 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2807 #endif
2808
2809 void sk_common_release(struct sock *sk)
2810 {
2811         if (sk->sk_prot->destroy)
2812                 sk->sk_prot->destroy(sk);
2813
2814         /*
2815          * Observation: when sock_common_release is called, processes have
2816          * no access to socket. But net still has.
2817          * Step one, detach it from networking:
2818          *
2819          * A. Remove from hash tables.
2820          */
2821
2822         sk->sk_prot->unhash(sk);
2823
2824         /*
2825          * In this point socket cannot receive new packets, but it is possible
2826          * that some packets are in flight because some CPU runs receiver and
2827          * did hash table lookup before we unhashed socket. They will achieve
2828          * receive queue and will be purged by socket destructor.
2829          *
2830          * Also we still have packets pending on receive queue and probably,
2831          * our own packets waiting in device queues. sock_destroy will drain
2832          * receive queue, but transmitted packets will delay socket destruction
2833          * until the last reference will be released.
2834          */
2835
2836         sock_orphan(sk);
2837
2838         xfrm_sk_free_policy(sk);
2839
2840         sk_refcnt_debug_release(sk);
2841
2842         if (sk->sk_frag.page) {
2843                 put_page(sk->sk_frag.page);
2844                 sk->sk_frag.page = NULL;
2845         }
2846
2847         sock_put(sk);
2848 }
2849 EXPORT_SYMBOL(sk_common_release);
2850
2851 #ifdef CONFIG_PROC_FS
2852 #define PROTO_INUSE_NR  64      /* should be enough for the first time */
2853 struct prot_inuse {
2854         int val[PROTO_INUSE_NR];
2855 };
2856
2857 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2858
2859 #ifdef CONFIG_NET_NS
2860 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2861 {
2862         __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2863 }
2864 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2865
2866 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2867 {
2868         int cpu, idx = prot->inuse_idx;
2869         int res = 0;
2870
2871         for_each_possible_cpu(cpu)
2872                 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2873
2874         return res >= 0 ? res : 0;
2875 }
2876 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2877
2878 static int __net_init sock_inuse_init_net(struct net *net)
2879 {
2880         net->core.inuse = alloc_percpu(struct prot_inuse);
2881         return net->core.inuse ? 0 : -ENOMEM;
2882 }
2883
2884 static void __net_exit sock_inuse_exit_net(struct net *net)
2885 {
2886         free_percpu(net->core.inuse);
2887 }
2888
2889 static struct pernet_operations net_inuse_ops = {
2890         .init = sock_inuse_init_net,
2891         .exit = sock_inuse_exit_net,
2892 };
2893
2894 static __init int net_inuse_init(void)
2895 {
2896         if (register_pernet_subsys(&net_inuse_ops))
2897                 panic("Cannot initialize net inuse counters");
2898
2899         return 0;
2900 }
2901
2902 core_initcall(net_inuse_init);
2903 #else
2904 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2905
2906 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2907 {
2908         __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2909 }
2910 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2911
2912 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2913 {
2914         int cpu, idx = prot->inuse_idx;
2915         int res = 0;
2916
2917         for_each_possible_cpu(cpu)
2918                 res += per_cpu(prot_inuse, cpu).val[idx];
2919
2920         return res >= 0 ? res : 0;
2921 }
2922 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2923 #endif
2924
2925 static void assign_proto_idx(struct proto *prot)
2926 {
2927         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2928
2929         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2930                 pr_err("PROTO_INUSE_NR exhausted\n");
2931                 return;
2932         }
2933
2934         set_bit(prot->inuse_idx, proto_inuse_idx);
2935 }
2936
2937 static void release_proto_idx(struct proto *prot)
2938 {
2939         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2940                 clear_bit(prot->inuse_idx, proto_inuse_idx);
2941 }
2942 #else
/* No in-use counters without CONFIG_PROC_FS: nothing to assign. */
static inline void assign_proto_idx(struct proto *prot)
{
	/* intentionally empty */
}
2946
/* No in-use counters without CONFIG_PROC_FS: nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
	/* intentionally empty */
}
2950 #endif
2951
2952 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
2953 {
2954         if (!rsk_prot)
2955                 return;
2956         kfree(rsk_prot->slab_name);
2957         rsk_prot->slab_name = NULL;
2958         kmem_cache_destroy(rsk_prot->slab);
2959         rsk_prot->slab = NULL;
2960 }
2961
2962 static int req_prot_init(const struct proto *prot)
2963 {
2964         struct request_sock_ops *rsk_prot = prot->rsk_prot;
2965
2966         if (!rsk_prot)
2967                 return 0;
2968
2969         rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
2970                                         prot->name);
2971         if (!rsk_prot->slab_name)
2972                 return -ENOMEM;
2973
2974         rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
2975                                            rsk_prot->obj_size, 0,
2976                                            prot->slab_flags, NULL);
2977
2978         if (!rsk_prot->slab) {
2979                 pr_crit("%s: Can't create request sock SLAB cache!\n",
2980                         prot->name);
2981                 return -ENOMEM;
2982         }
2983         return 0;
2984 }
2985
2986 int proto_register(struct proto *prot, int alloc_slab)
2987 {
2988         if (alloc_slab) {
2989                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2990                                         SLAB_HWCACHE_ALIGN | prot->slab_flags,
2991                                         NULL);
2992
2993                 if (prot->slab == NULL) {
2994                         pr_crit("%s: Can't create sock SLAB cache!\n",
2995                                 prot->name);
2996                         goto out;
2997                 }
2998
2999                 if (req_prot_init(prot))
3000                         goto out_free_request_sock_slab;
3001
3002                 if (prot->twsk_prot != NULL) {
3003                         prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3004
3005                         if (prot->twsk_prot->twsk_slab_name == NULL)
3006                                 goto out_free_request_sock_slab;
3007
3008                         prot->twsk_prot->twsk_slab =
3009                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3010                                                   prot->twsk_prot->twsk_obj_size,
3011                                                   0,
3012                                                   prot->slab_flags,
3013                                                   NULL);
3014                         if (prot->twsk_prot->twsk_slab == NULL)
3015                                 goto out_free_timewait_sock_slab_name;
3016                 }
3017         }
3018
3019         mutex_lock(&proto_list_mutex);
3020         list_add(&prot->node, &proto_list);
3021         assign_proto_idx(prot);
3022         mutex_unlock(&proto_list_mutex);
3023         return 0;
3024
3025 out_free_timewait_sock_slab_name:
3026         kfree(prot->twsk_prot->twsk_slab_name);
3027 out_free_request_sock_slab:
3028         req_prot_cleanup(prot->rsk_prot);
3029
3030         kmem_cache_destroy(prot->slab);
3031         prot->slab = NULL;
3032 out:
3033         return -ENOBUFS;
3034 }
3035 EXPORT_SYMBOL(proto_register);
3036
3037 void proto_unregister(struct proto *prot)
3038 {
3039         mutex_lock(&proto_list_mutex);
3040         release_proto_idx(prot);
3041         list_del(&prot->node);
3042         mutex_unlock(&proto_list_mutex);
3043
3044         kmem_cache_destroy(prot->slab);
3045         prot->slab = NULL;
3046
3047         req_prot_cleanup(prot->rsk_prot);
3048
3049         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3050                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3051                 kfree(prot->twsk_prot->twsk_slab_name);
3052                 prot->twsk_prot->twsk_slab = NULL;
3053         }
3054 }
3055 EXPORT_SYMBOL(proto_unregister);
3056
3057 #ifdef CONFIG_PROC_FS
3058 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3059         __acquires(proto_list_mutex)
3060 {
3061         mutex_lock(&proto_list_mutex);
3062         return seq_list_start_head(&proto_list, *pos);
3063 }
3064
3065 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3066 {
3067         return seq_list_next(v, &proto_list, pos);
3068 }
3069
3070 static void proto_seq_stop(struct seq_file *seq, void *v)
3071         __releases(proto_list_mutex)
3072 {
3073         mutex_unlock(&proto_list_mutex);
3074 }
3075
/* Map a method pointer to 'y' (set) or 'n' (NULL). */
static char proto_method_implemented(const void *method)
{
	if (method != NULL)
		return 'y';
	return 'n';
}
3080 static long sock_prot_memory_allocated(struct proto *proto)
3081 {
3082         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3083 }
3084
3085 static char *sock_prot_memory_pressure(struct proto *proto)
3086 {
3087         return proto->memory_pressure != NULL ?
3088         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3089 }
3090
3091 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3092 {
3093
3094         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3095                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3096                    proto->name,
3097                    proto->obj_size,
3098                    sock_prot_inuse_get(seq_file_net(seq), proto),
3099                    sock_prot_memory_allocated(proto),
3100                    sock_prot_memory_pressure(proto),
3101                    proto->max_header,
3102                    proto->slab == NULL ? "no" : "yes",
3103                    module_name(proto->owner),
3104                    proto_method_implemented(proto->close),
3105                    proto_method_implemented(proto->connect),
3106                    proto_method_implemented(proto->disconnect),
3107                    proto_method_implemented(proto->accept),
3108                    proto_method_implemented(proto->ioctl),
3109                    proto_method_implemented(proto->init),
3110                    proto_method_implemented(proto->destroy),
3111                    proto_method_implemented(proto->shutdown),
3112                    proto_method_implemented(proto->setsockopt),
3113                    proto_method_implemented(proto->getsockopt),
3114                    proto_method_implemented(proto->sendmsg),
3115                    proto_method_implemented(proto->recvmsg),
3116                    proto_method_implemented(proto->sendpage),
3117                    proto_method_implemented(proto->bind),
3118                    proto_method_implemented(proto->backlog_rcv),
3119                    proto_method_implemented(proto->hash),
3120                    proto_method_implemented(proto->unhash),
3121                    proto_method_implemented(proto->get_port),
3122                    proto_method_implemented(proto->enter_memory_pressure));
3123 }
3124
3125 static int proto_seq_show(struct seq_file *seq, void *v)
3126 {
3127         if (v == &proto_list)
3128                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3129                            "protocol",
3130                            "size",
3131                            "sockets",
3132                            "memory",
3133                            "press",
3134                            "maxhdr",
3135                            "slab",
3136                            "module",
3137                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3138         else
3139                 proto_seq_printf(seq, list_entry(v, struct proto, node));
3140         return 0;
3141 }
3142
3143 static const struct seq_operations proto_seq_ops = {
3144         .start  = proto_seq_start,
3145         .next   = proto_seq_next,
3146         .stop   = proto_seq_stop,
3147         .show   = proto_seq_show,
3148 };
3149
3150 static int proto_seq_open(struct inode *inode, struct file *file)
3151 {
3152         return seq_open_net(inode, file, &proto_seq_ops,
3153                             sizeof(struct seq_net_private));
3154 }
3155
3156 static const struct file_operations proto_seq_fops = {
3157         .owner          = THIS_MODULE,
3158         .open           = proto_seq_open,
3159         .read           = seq_read,
3160         .llseek         = seq_lseek,
3161         .release        = seq_release_net,
3162 };
3163
3164 static __net_init int proto_init_net(struct net *net)
3165 {
3166         if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
3167                 return -ENOMEM;
3168
3169         return 0;
3170 }
3171
3172 static __net_exit void proto_exit_net(struct net *net)
3173 {
3174         remove_proc_entry("protocols", net->proc_net);
3175 }
3176
3177
3178 static __net_initdata struct pernet_operations proto_net_ops = {
3179         .init = proto_init_net,
3180         .exit = proto_exit_net,
3181 };
3182
3183 static int __init proto_init(void)
3184 {
3185         return register_pernet_subsys(&proto_net_ops);
3186 }
3187
3188 subsys_initcall(proto_init);
3189
3190 #endif /* PROC_FS */