diff --git a/include/net/sock.h b/include/net/sock.h
index 92b269709b9a8a7e5d69c55ac66b834501a2931c..c4f5e6fca17cf4e0029080410202cb66ce0fad37 100644 (file)
@@ -343,6 +343,9 @@ struct sock {
 #define sk_rxhash              __sk_common.skc_rxhash
 
        socket_lock_t           sk_lock;
+       atomic_t                sk_drops;
+       int                     sk_rcvlowat;
+       struct sk_buff_head     sk_error_queue;
        struct sk_buff_head     sk_receive_queue;
        /*
         * The backlog queue is special, it is always used with
@@ -359,14 +362,13 @@ struct sock {
                struct sk_buff  *tail;
        } sk_backlog;
 #define sk_rmem_alloc sk_backlog.rmem_alloc
-       int                     sk_forward_alloc;
 
-       __u32                   sk_txhash;
+       int                     sk_forward_alloc;
 #ifdef CONFIG_NET_RX_BUSY_POLL
-       unsigned int            sk_napi_id;
        unsigned int            sk_ll_usec;
+       /* ===== mostly read cache line ===== */
+       unsigned int            sk_napi_id;
 #endif
-       atomic_t                sk_drops;
        int                     sk_rcvbuf;
 
        struct sk_filter __rcu  *sk_filter;
@@ -379,16 +381,50 @@ struct sock {
 #endif
        struct dst_entry        *sk_rx_dst;
        struct dst_entry __rcu  *sk_dst_cache;
-       /* Note: 32bit hole on 64bit arches */
-       atomic_t                sk_wmem_alloc;
        atomic_t                sk_omem_alloc;
        int                     sk_sndbuf;
+
+       /* ===== cache line for TX ===== */
+       int                     sk_wmem_queued;
+       atomic_t                sk_wmem_alloc;
+       unsigned long           sk_tsq_flags;
+       struct sk_buff          *sk_send_head;
        struct sk_buff_head     sk_write_queue;
+       __s32                   sk_peek_off;
+       int                     sk_write_pending;
+       long                    sk_sndtimeo;
+       struct timer_list       sk_timer;
+       __u32                   sk_priority;
+       __u32                   sk_mark;
+       u32                     sk_pacing_rate; /* bytes per second */
+       u32                     sk_max_pacing_rate;
+       struct page_frag        sk_frag;
+       netdev_features_t       sk_route_caps;
+       netdev_features_t       sk_route_nocaps;
+       int                     sk_gso_type;
+       unsigned int            sk_gso_max_size;
+       gfp_t                   sk_allocation;
+       __u32                   sk_txhash;
 
        /*
         * Because of non atomicity rules, all
         * changes are protected by socket lock.
         */
+       unsigned int            __sk_flags_offset[0];
+#ifdef __BIG_ENDIAN_BITFIELD
+#define SK_FL_PROTO_SHIFT  16
+#define SK_FL_PROTO_MASK   0x00ff0000
+
+#define SK_FL_TYPE_SHIFT   0
+#define SK_FL_TYPE_MASK    0x0000ffff
+#else
+#define SK_FL_PROTO_SHIFT  8
+#define SK_FL_PROTO_MASK   0x0000ff00
+
+#define SK_FL_TYPE_SHIFT   16
+#define SK_FL_TYPE_MASK    0xffff0000
+#endif
+
        kmemcheck_bitfield_begin(flags);
        unsigned int            sk_padding : 2,
                                sk_no_check_tx : 1,
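
The zero-length __sk_flags_offset[] marker and the endian-dependent SK_FL_* masks above describe where the sk_type/sk_protocol bitfields sit inside one 32-bit word, so they can be read with a plain load. A minimal sketch of that access pattern (illustrative only, not part of this patch; example_sk_type() is a hypothetical helper):

	static inline u16 example_sk_type(const struct sock *sk)
	{
		/* Read the 32-bit word that starts at __sk_flags_offset and
		 * pick out sk_type with the endian-aware mask and shift.
		 */
		u32 word = *(const u32 *)((const u8 *)sk +
					  offsetof(struct sock, __sk_flags_offset));

		return (word & SK_FL_TYPE_MASK) >> SK_FL_TYPE_SHIFT;
	}
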
@@ -399,41 +435,24 @@ struct sock {
 #define SK_PROTOCOL_MAX U8_MAX
        kmemcheck_bitfield_end(flags);
 
-       int                     sk_wmem_queued;
-       gfp_t                   sk_allocation;
-       u32                     sk_pacing_rate; /* bytes per second */
-       u32                     sk_max_pacing_rate;
-       netdev_features_t       sk_route_caps;
-       netdev_features_t       sk_route_nocaps;
-       int                     sk_gso_type;
-       unsigned int            sk_gso_max_size;
        u16                     sk_gso_max_segs;
-       int                     sk_rcvlowat;
        unsigned long           sk_lingertime;
-       struct sk_buff_head     sk_error_queue;
        struct proto            *sk_prot_creator;
        rwlock_t                sk_callback_lock;
        int                     sk_err,
                                sk_err_soft;
        u32                     sk_ack_backlog;
        u32                     sk_max_ack_backlog;
-       __u32                   sk_priority;
-       __u32                   sk_mark;
+       kuid_t                  sk_uid;
        struct pid              *sk_peer_pid;
        const struct cred       *sk_peer_cred;
        long                    sk_rcvtimeo;
-       long                    sk_sndtimeo;
-       struct timer_list       sk_timer;
        ktime_t                 sk_stamp;
        u16                     sk_tsflags;
        u8                      sk_shutdown;
        u32                     sk_tskey;
        struct socket           *sk_socket;
        void                    *sk_user_data;
-       struct page_frag        sk_frag;
-       struct sk_buff          *sk_send_head;
-       __s32                   sk_peek_off;
-       int                     sk_write_pending;
 #ifdef CONFIG_SECURITY
        void                    *sk_security;
 #endif
@@ -894,7 +913,20 @@ static inline void sock_rps_record_flow_hash(__u32 hash)
 static inline void sock_rps_record_flow(const struct sock *sk)
 {
 #ifdef CONFIG_RPS
-       sock_rps_record_flow_hash(sk->sk_rxhash);
+       if (static_key_false(&rfs_needed)) {
+               /* Reading sk->sk_rxhash might incur an expensive cache line
+                * miss.
+                *
+                * TCP_ESTABLISHED does cover almost all states where RFS
+                * might be useful, and is cheaper [1] than testing :
+                *      IPv4: inet_sk(sk)->inet_daddr
+                *      IPv6: ipv6_addr_any(&sk->sk_v6_daddr)
+                * OR   an additional socket flag
+                * [1] : sk_state and sk_prot are in the same cache line.
+                */
+               if (sk->sk_state == TCP_ESTABLISHED)
+                       sock_rps_record_flow_hash(sk->sk_rxhash);
+       }
 #endif
 }
 
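
For context, rfs_needed is a static key that is presumably flipped when an RFS sock flow table is installed or removed (e.g. from the RFS sysctl path), so the test above compiles down to a patched-out branch while RFS is unused. A minimal sketch under that assumption (example_update_rfs_key() is hypothetical):

	static void example_update_rfs_key(bool table_installed)
	{
		/* Assumption: called when the RFS sock flow table is set up
		 * or torn down; flips the jump label gating the fast path.
		 */
		if (table_installed)
			static_key_slow_inc(&rfs_needed);
		else
			static_key_slow_dec(&rfs_needed);
	}
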
@@ -914,14 +946,16 @@ static inline void sock_rps_reset_rxhash(struct sock *sk)
 #endif
 }
 
-#define sk_wait_event(__sk, __timeo, __condition)                      \
+#define sk_wait_event(__sk, __timeo, __condition, __wait)              \
        ({      int __rc;                                               \
                release_sock(__sk);                                     \
                __rc = __condition;                                     \
                if (!__rc) {                                            \
-                       *(__timeo) = schedule_timeout(*(__timeo));      \
+                       *(__timeo) = wait_woken(__wait,                 \
+                                               TASK_INTERRUPTIBLE,     \
+                                               *(__timeo));            \
                }                                                       \
-               sched_annotate_sleep();                                         \
+               sched_annotate_sleep();                                 \
                lock_sock(__sk);                                        \
                __rc = __condition;                                     \
                __rc;                                                   \
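
Callers of the reworked macro now own the wait queue entry and pass it in, instead of relying on a bare schedule_timeout(). A simplified sketch of the expected calling pattern (illustrative only; real users follow the same DEFINE_WAIT_FUNC()/add_wait_queue()/remove_wait_queue() shape):

	static long example_wait_for_data(struct sock *sk, long timeo)
	{
		DEFINE_WAIT_FUNC(wait, woken_wake_function);

		add_wait_queue(sk_sleep(sk), &wait);
		while (skb_queue_empty(&sk->sk_receive_queue)) {
			if (!timeo || signal_pending(current))
				break;
			/* Drops and re-takes the socket lock; sleeps via wait_woken(). */
			sk_wait_event(sk, &timeo,
				      !skb_queue_empty(&sk->sk_receive_queue), &wait);
		}
		remove_wait_queue(sk_sleep(sk), &wait);
		return timeo;
	}
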
@@ -1162,11 +1196,6 @@ static inline void sk_enter_memory_pressure(struct sock *sk)
        sk->sk_prot->enter_memory_pressure(sk);
 }
 
-static inline long sk_prot_mem_limits(const struct sock *sk, int index)
-{
-       return sk->sk_prot->sysctl_mem[index];
-}
-
 static inline long
 sk_memory_allocated(const struct sock *sk)
 {
@@ -1276,14 +1305,32 @@ static inline struct inode *SOCK_INODE(struct socket *socket)
 /*
  * Functions for memory accounting
  */
+int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind);
 int __sk_mem_schedule(struct sock *sk, int size, int kind);
+void __sk_mem_reduce_allocated(struct sock *sk, int amount);
 void __sk_mem_reclaim(struct sock *sk, int amount);
 
-#define SK_MEM_QUANTUM ((int)PAGE_SIZE)
+/* We used to have PAGE_SIZE here, but systems with 64KB pages
+ * do not necessarily have 16x more memory than 4KB ones.
+ */
+#define SK_MEM_QUANTUM 4096
 #define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM)
 #define SK_MEM_SEND    0
 #define SK_MEM_RECV    1
 
+/* sysctl_mem values are in pages; we convert them to SK_MEM_QUANTUM units */
+static inline long sk_prot_mem_limits(const struct sock *sk, int index)
+{
+       long val = sk->sk_prot->sysctl_mem[index];
+
+#if PAGE_SIZE > SK_MEM_QUANTUM
+       val <<= PAGE_SHIFT - SK_MEM_QUANTUM_SHIFT;
+#elif PAGE_SIZE < SK_MEM_QUANTUM
+       val >>= SK_MEM_QUANTUM_SHIFT - PAGE_SHIFT;
+#endif
+       return val;
+}
+
 static inline int sk_mem_pages(int amt)
 {
        return (amt + SK_MEM_QUANTUM - 1) >> SK_MEM_QUANTUM_SHIFT;
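
A worked example of the page-to-quantum conversion, assuming a 64KB-page architecture and illustrative sysctl values:

	/* Illustrative: with 64KB pages, PAGE_SHIFT = 16 and
	 * SK_MEM_QUANTUM_SHIFT = 12, so sk_prot_mem_limits() shifts left by 4:
	 *
	 *	sysctl_mem[] = { 180, 240, 360 }    pages of 64KB each
	 *	           ->  { 2880, 3840, 5760 } SK_MEM_QUANTUM (4KB) units
	 *
	 * The byte budget is unchanged; only the unit differs. On 4KB-page
	 * systems neither preprocessor branch applies and the value is
	 * returned as is.
	 */
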
@@ -1651,6 +1698,7 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
        sk->sk_wq = parent->wq;
        parent->sk = sk;
        sk_set_socket(sk, parent);
+       sk->sk_uid = SOCK_INODE(parent)->i_uid;
        security_sock_graft(sk, parent);
        write_unlock_bh(&sk->sk_callback_lock);
 }
@@ -1658,6 +1706,11 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
 kuid_t sock_i_uid(struct sock *sk);
 unsigned long sock_i_ino(struct sock *sk);
 
+static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk)
+{
+       return sk ? sk->sk_uid : make_kuid(net->user_ns, 0);
+}
+
 static inline u32 net_tx_rndhash(void)
 {
        u32 v = prandom_u32();
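
sock_net_uid() gives callers a well-defined owner even when no socket is attached, e.g. for kernel-generated replies. A minimal sketch (example_reply_uid() is a hypothetical wrapper):

	static kuid_t example_reply_uid(const struct net *net, const struct sock *sk)
	{
		/* sk may be NULL here (no originating socket); fall back to
		 * the root uid of the network namespace.
		 */
		return sock_net_uid(net, sk);
	}
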
@@ -1783,13 +1836,13 @@ static inline int skb_do_copy_data_nocache(struct sock *sk, struct sk_buff *skb,
 {
        if (skb->ip_summed == CHECKSUM_NONE) {
                __wsum csum = 0;
-               if (csum_and_copy_from_iter(to, copy, &csum, from) != copy)
+               if (!csum_and_copy_from_iter_full(to, copy, &csum, from))
                        return -EFAULT;
                skb->csum = csum_block_add(skb->csum, csum, offset);
        } else if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) {
-               if (copy_from_iter_nocache(to, copy, from) != copy)
+               if (!copy_from_iter_full_nocache(to, copy, from))
                        return -EFAULT;
-       } else if (copy_from_iter(to, copy, from) != copy)
+       } else if (!copy_from_iter_full(to, copy, from))
                return -EFAULT;
 
        return 0;
@@ -1952,6 +2005,10 @@ void sk_reset_timer(struct sock *sk, struct timer_list *timer,
 
 void sk_stop_timer(struct sock *sk, struct timer_list *timer);
 
+int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb,
+                       unsigned int flags,
+                       void (*destructor)(struct sock *sk,
+                                          struct sk_buff *skb));
 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
 
@@ -2108,7 +2165,8 @@ struct sock_skb_cb {
 static inline void
 sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb)
 {
-       SOCK_SKB_CB(skb)->dropcount = atomic_read(&sk->sk_drops);
+       SOCK_SKB_CB(skb)->dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ?
+                                               atomic_read(&sk->sk_drops) : 0;
 }
 
 static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb)
@@ -2137,8 +2195,8 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
         */
        if (sock_flag(sk, SOCK_RCVTSTAMP) ||
            (sk->sk_tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) ||
-           (kt.tv64 && sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) ||
-           (hwtstamps->hwtstamp.tv64 &&
+           (kt && sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) ||
+           (hwtstamps->hwtstamp &&
             (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)))
                __sock_recv_timestamp(msg, sk, skb);
        else