diff --git a/include/net/sock.h b/include/net/sock.h
index 92b269709b9a8a7e5d69c55ac66b834501a2931c..c4f5e6fca17cf4e0029080410202cb66ce0fad37 100644 (file)
@@ -343,6 +343,9 @@ struct sock {
 #define sk_rxhash              __sk_common.skc_rxhash
 
        socket_lock_t           sk_lock;
+       atomic_t                sk_drops;
+       int                     sk_rcvlowat;
+       struct sk_buff_head     sk_error_queue;
        struct sk_buff_head     sk_receive_queue;
        /*
         * The backlog queue is special, it is always used with
@@ -359,14 +362,13 @@ struct sock {
                struct sk_buff  *tail;
        } sk_backlog;
 #define sk_rmem_alloc sk_backlog.rmem_alloc
-       int                     sk_forward_alloc;
 
-       __u32                   sk_txhash;
+       int                     sk_forward_alloc;
 #ifdef CONFIG_NET_RX_BUSY_POLL
-       unsigned int            sk_napi_id;
        unsigned int            sk_ll_usec;
+       /* ===== mostly read cache line ===== */
+       unsigned int            sk_napi_id;
 #endif
-       atomic_t                sk_drops;
        int                     sk_rcvbuf;
 
        struct sk_filter __rcu  *sk_filter;
@@ -379,16 +381,50 @@ struct sock {
 #endif
        struct dst_entry        *sk_rx_dst;
        struct dst_entry __rcu  *sk_dst_cache;
-       /* Note: 32bit hole on 64bit arches */
-       atomic_t                sk_wmem_alloc;
        atomic_t                sk_omem_alloc;
        int                     sk_sndbuf;
+
+       /* ===== cache line for TX ===== */
+       int                     sk_wmem_queued;
+       atomic_t                sk_wmem_alloc;
+       unsigned long           sk_tsq_flags;
+       struct sk_buff          *sk_send_head;
        struct sk_buff_head     sk_write_queue;
+       __s32                   sk_peek_off;
+       int                     sk_write_pending;
+       long                    sk_sndtimeo;
+       struct timer_list       sk_timer;
+       __u32                   sk_priority;
+       __u32                   sk_mark;
+       u32                     sk_pacing_rate; /* bytes per second */
+       u32                     sk_max_pacing_rate;
+       struct page_frag        sk_frag;
+       netdev_features_t       sk_route_caps;
+       netdev_features_t       sk_route_nocaps;
+       int                     sk_gso_type;
+       unsigned int            sk_gso_max_size;
+       gfp_t                   sk_allocation;
+       __u32                   sk_txhash;
 
        /*
         * Because of non atomicity rules, all
         * changes are protected by socket lock.
         */
+       unsigned int            __sk_flags_offset[0];
+#ifdef __BIG_ENDIAN_BITFIELD
+#define SK_FL_PROTO_SHIFT  16
+#define SK_FL_PROTO_MASK   0x00ff0000
+
+#define SK_FL_TYPE_SHIFT   0
+#define SK_FL_TYPE_MASK    0x0000ffff
+#else
+#define SK_FL_PROTO_SHIFT  8
+#define SK_FL_PROTO_MASK   0x0000ff00
+
+#define SK_FL_TYPE_SHIFT   16
+#define SK_FL_TYPE_MASK    0xffff0000
+#endif
+
        kmemcheck_bitfield_begin(flags);
        unsigned int            sk_padding : 2,
                                sk_no_check_tx : 1,
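
The zero-length __sk_flags_offset[] marker and the endian-dependent SK_FL_* masks above describe where the sk_type/sk_protocol bitfields sit inside one 32-bit word, so they can be read with a plain load. A minimal sketch of that access pattern (illustrative only, not part of this patch; example_sk_type() is a hypothetical helper):

	static inline u16 example_sk_type(const struct sock *sk)
	{
		/* Read the 32-bit word that starts at __sk_flags_offset and
		 * pick out sk_type with the endian-aware mask and shift.
		 */
		u32 word = *(const u32 *)((const u8 *)sk +
					  offsetof(struct sock, __sk_flags_offset));

		return (word & SK_FL_TYPE_MASK) >> SK_FL_TYPE_SHIFT;
	}
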
@@ -399,41 +435,24 @@ struct sock {
 #define SK_PROTOCOL_MAX U8_MAX
        kmemcheck_bitfield_end(flags);
 
-       int                     sk_wmem_queued;
-       gfp_t                   sk_allocation;
-       u32                     sk_pacing_rate; /* bytes per second */
-       u32                     sk_max_pacing_rate;
-       netdev_features_t       sk_route_caps;
-       netdev_features_t       sk_route_nocaps;
-       int                     sk_gso_type;
-       unsigned int            sk_gso_max_size;
        u16                     sk_gso_max_segs;
-       int                     sk_rcvlowat;
        unsigned long           sk_lingertime;
-       struct sk_buff_head     sk_error_queue;
        struct proto            *sk_prot_creator;
        rwlock_t                sk_callback_lock;
        int                     sk_err,
                                sk_err_soft;
        u32                     sk_ack_backlog;
        u32                     sk_max_ack_backlog;
-       __u32                   sk_priority;
-       __u32                   sk_mark;
+       kuid_t                  sk_uid;
        struct pid              *sk_peer_pid;
        const struct cred       *sk_peer_cred;
        long                    sk_rcvtimeo;
-       long                    sk_sndtimeo;
-       struct timer_list       sk_timer;
        ktime_t                 sk_stamp;
        u16                     sk_tsflags;
        u8                      sk_shutdown;
        u32                     sk_tskey;
        struct socket           *sk_socket;
        void                    *sk_user_data;
-       struct page_frag        sk_frag;
-       struct sk_buff          *sk_send_head;
-       __s32                   sk_peek_off;
-       int                     sk_write_pending;
 #ifdef CONFIG_SECURITY
        void                    *sk_security;
 #endif
@@ -894,7 +913,20 @@ static inline void sock_rps_record_flow_hash(__u32 hash)
 static inline void sock_rps_record_flow(const struct sock *sk)
 {
 #ifdef CONFIG_RPS
-       sock_rps_record_flow_hash(sk->sk_rxhash);
+       if (static_key_false(&rfs_needed)) {
+               /* Reading sk->sk_rxhash might incur an expensive cache line
+                * miss.
+                *
+                * TCP_ESTABLISHED does cover almost all states where RFS
+                * might be useful, and is cheaper [1] than testing :
+                *      IPv4: inet_sk(sk)->inet_daddr
+                *      IPv6: ipv6_addr_any(&sk->sk_v6_daddr)
+                * OR   an additional socket flag
+                * [1] : sk_state and sk_prot are in the same cache line.
+                */
+               if (sk->sk_state == TCP_ESTABLISHED)
+                       sock_rps_record_flow_hash(sk->sk_rxhash);
+       }
 #endif
 }
 
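
For context, rfs_needed is a static key that is presumably flipped when an RFS sock flow table is installed or removed (e.g. from the RFS sysctl path), so the test above compiles down to a patched-out branch while RFS is unused. A minimal sketch under that assumption (example_update_rfs_key() is hypothetical):

	static void example_update_rfs_key(bool table_installed)
	{
		/* Assumption: called when the RFS sock flow table is set up
		 * or torn down; flips the jump label gating the fast path.
		 */
		if (table_installed)
			static_key_slow_inc(&rfs_needed);
		else
			static_key_slow_dec(&rfs_needed);
	}
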
@@ -914,14 +946,16 @@ static inline void sock_rps_reset_rxhash(struct sock *sk)
 #endif
 }
 
-#define sk_wait_event(__sk, __timeo, __condition)                      \
+#define sk_wait_event(__sk, __timeo, __condition, __wait)              \
        ({      int __rc;                                               \
                release_sock(__sk);                                     \
                __rc = __condition;                                     \
                if (!__rc) {                                            \
-                       *(__timeo) = schedule_timeout(*(__timeo));      \
+                       *(__timeo) = wait_woken(__wait,                 \
+                                               TASK_INTERRUPTIBLE,     \
+                                               *(__timeo));            \
                }                                                       \
-               sched_annotate_sleep();                                         \
+               sched_annotate_sleep();                                 \
                lock_sock(__sk);                                        \
                __rc = __condition;                                     \
                __rc;                                                   \
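
Callers of the reworked macro now own the wait queue entry and pass it in, instead of relying on a bare schedule_timeout(). A simplified sketch of the expected calling pattern (illustrative only; real users follow the same DEFINE_WAIT_FUNC()/add_wait_queue()/remove_wait_queue() shape):

	static long example_wait_for_data(struct sock *sk, long timeo)
	{
		DEFINE_WAIT_FUNC(wait, woken_wake_function);

		add_wait_queue(sk_sleep(sk), &wait);
		while (skb_queue_empty(&sk->sk_receive_queue)) {
			if (!timeo || signal_pending(current))
				break;
			/* Drops and re-takes the socket lock; sleeps via wait_woken(). */
			sk_wait_event(sk, &timeo,
				      !skb_queue_empty(&sk->sk_receive_queue), &wait);
		}
		remove_wait_queue(sk_sleep(sk), &wait);
		return timeo;
	}
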
@@ -1162,11 +1196,6 @@ static inline void sk_enter_memory_pressure(struct sock *sk)
        sk->sk_prot->enter_memory_pressure(sk);
 }
 
-static inline long sk_prot_mem_limits(const struct sock *sk, int index)
-{
-       return sk->sk_prot->sysctl_mem[index];
-}
-
 static inline long
 sk_memory_allocated(const struct sock *sk)
 {
@@ -1276,14 +1305,32 @@ static inline struct inode *SOCK_INODE(struct socket *socket)
 /*
  * Functions for memory accounting
  */
+int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind);
 int __sk_mem_schedule(struct sock *sk, int size, int kind);
+void __sk_mem_reduce_allocated(struct sock *sk, int amount);
 void __sk_mem_reclaim(struct sock *sk, int amount);
 
-#define SK_MEM_QUANTUM ((int)PAGE_SIZE)
+/* We used to have PAGE_SIZE here, but systems with 64KB pages
+ * do not necessarily have 16x more memory than 4KB ones.
+ */
+#define SK_MEM_QUANTUM 4096
 #define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM)
 #define SK_MEM_SEND    0
 #define SK_MEM_RECV    1
 
+/* sysctl_mem values are in pages; we convert them to SK_MEM_QUANTUM units */
+static inline long sk_prot_mem_limits(const struct sock *sk, int index)
+{
+       long val = sk->sk_prot->sysctl_mem[index];
+
+#if PAGE_SIZE > SK_MEM_QUANTUM
+       val <<= PAGE_SHIFT - SK_MEM_QUANTUM_SHIFT;
+#elif PAGE_SIZE < SK_MEM_QUANTUM
+       val >>= SK_MEM_QUANTUM_SHIFT - PAGE_SHIFT;
+#endif
+       return val;
+}
+
 static inline int sk_mem_pages(int amt)
 {
        return (amt + SK_MEM_QUANTUM - 1) >> SK_MEM_QUANTUM_SHIFT;
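
A worked example of the page-to-quantum conversion, assuming a 64KB-page architecture and illustrative sysctl values:

	/* Illustrative: with 64KB pages, PAGE_SHIFT = 16 and
	 * SK_MEM_QUANTUM_SHIFT = 12, so sk_prot_mem_limits() shifts left by 4:
	 *
	 *	sysctl_mem[] = { 180, 240, 360 }    pages of 64KB each
	 *	           ->  { 2880, 3840, 5760 } SK_MEM_QUANTUM (4KB) units
	 *
	 * The byte budget is unchanged; only the unit differs. On 4KB-page
	 * systems neither preprocessor branch applies and the value is
	 * returned as is.
	 */
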
@@ -1651,6 +1698,7 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
        sk->sk_wq = parent->wq;
        parent->sk = sk;
        sk_set_socket(sk, parent);
+       sk->sk_uid = SOCK_INODE(parent)->i_uid;
        security_sock_graft(sk, parent);
        write_unlock_bh(&sk->sk_callback_lock);
 }
@@ -1658,6 +1706,11 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
 kuid_t sock_i_uid(struct sock *sk);
 unsigned long sock_i_ino(struct sock *sk);
 
+static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk)
+{
+       return sk ? sk->sk_uid : make_kuid(net->user_ns, 0);
+}
+
 static inline u32 net_tx_rndhash(void)
 {
        u32 v = prandom_u32();
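
sock_net_uid() gives callers a well-defined owner even when no socket is attached, e.g. for kernel-generated replies. A minimal sketch (example_reply_uid() is a hypothetical wrapper):

	static kuid_t example_reply_uid(const struct net *net, const struct sock *sk)
	{
		/* sk may be NULL here (no originating socket); fall back to
		 * the root uid of the network namespace.
		 */
		return sock_net_uid(net, sk);
	}
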
@@ -1783,13 +1836,13 @@ static inline int skb_do_copy_data_nocache(struct sock *sk, struct sk_buff *skb,
 {
        if (skb->ip_summed == CHECKSUM_NONE) {
                __wsum csum = 0;
-               if (csum_and_copy_from_iter(to, copy, &csum, from) != copy)
+               if (!csum_and_copy_from_iter_full(to, copy, &csum, from))
                        return -EFAULT;
                skb->csum = csum_block_add(skb->csum, csum, offset);
        } else if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) {
-               if (copy_from_iter_nocache(to, copy, from) != copy)
+               if (!copy_from_iter_full_nocache(to, copy, from))
                        return -EFAULT;
-       } else if (copy_from_iter(to, copy, from) != copy)
+       } else if (!copy_from_iter_full(to, copy, from))
                return -EFAULT;
 
        return 0;
@@ -1952,6 +2005,10 @@ void sk_reset_timer(struct sock *sk, struct timer_list *timer,
 
 void sk_stop_timer(struct sock *sk, struct timer_list *timer);
 
+int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb,
+                       unsigned int flags,
+                       void (*destructor)(struct sock *sk,
+                                          struct sk_buff *skb));
 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
 
@@ -2108,7 +2165,8 @@ struct sock_skb_cb {
 static inline void
 sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb)
 {
-       SOCK_SKB_CB(skb)->dropcount = atomic_read(&sk->sk_drops);
+       SOCK_SKB_CB(skb)->dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ?
+                                               atomic_read(&sk->sk_drops) : 0;
 }
 
 static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb)
@@ -2137,8 +2195,8 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
         */
        if (sock_flag(sk, SOCK_RCVTSTAMP) ||
            (sk->sk_tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) ||
-           (kt.tv64 && sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) ||
-           (hwtstamps->hwtstamp.tv64 &&
+           (kt && sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) ||
+           (hwtstamps->hwtstamp &&
             (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)))
                __sock_recv_timestamp(msg, sk, skb);
        else