]> asedeno.scripts.mit.edu Git - linux.git/blob - net/netfilter/ipvs/ip_vs_sync.c
Merge ath-next from git://git.kernel.org/pub/scm/linux/kernel/git/kvalo/ath.git
[linux.git] / net / netfilter / ipvs / ip_vs_sync.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * IPVS         An implementation of the IP virtual server support for the
4  *              LINUX operating system.  IPVS is now implemented as a module
5  *              over the NetFilter framework. IPVS can be used to build a
6  *              high-performance and highly available server based on a
7  *              cluster of servers.
8  *
9  * Version 1,   is capable of handling both version 0 and 1 messages.
10  *              Version 0 is the plain old format.
11  *              Note Version 0 receivers will just drop Ver 1 messages.
12  *              Version 1 is capable of handle IPv6, Persistence data,
13  *              time-outs, and firewall marks.
14  *              In ver.1 "ip_vs_sync_conn_options" will be sent in netw. order.
15  *              Ver. 0 can be turned on by sysctl -w net.ipv4.vs.sync_version=0
16  *
17  * Definitions  Message: is a complete datagram
18  *              Sync_conn: is a part of a Message
19  *              Param Data is an option to a Sync_conn.
20  *
21  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
22  *
23  * ip_vs_sync:  sync connection info from master load balancer to backups
24  *              through multicast
25  *
26  * Changes:
27  *      Alexandre Cassen        :       Added master & backup support at a time.
28  *      Alexandre Cassen        :       Added SyncID support for incoming sync
29  *                                      messages filtering.
30  *      Justin Ossevoort        :       Fix endian problem on sync message size.
31  *      Hans Schillstrom        :       Added Version 1: i.e. IPv6,
32  *                                      Persistence support, fwmark and time-out.
33  */
34
35 #define KMSG_COMPONENT "IPVS"
36 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
37
38 #include <linux/module.h>
39 #include <linux/slab.h>
40 #include <linux/inetdevice.h>
41 #include <linux/net.h>
42 #include <linux/completion.h>
43 #include <linux/delay.h>
44 #include <linux/skbuff.h>
45 #include <linux/in.h>
46 #include <linux/igmp.h>                 /* for ip_mc_join_group */
47 #include <linux/udp.h>
48 #include <linux/err.h>
49 #include <linux/kthread.h>
50 #include <linux/wait.h>
51 #include <linux/kernel.h>
52 #include <linux/sched/signal.h>
53
54 #include <asm/unaligned.h>              /* Used for ntoh_seq and hton_seq */
55
56 #include <net/ip.h>
57 #include <net/sock.h>
58
59 #include <net/ip_vs.h>
60
61 #define IP_VS_SYNC_GROUP 0xe0000051    /* multicast addr - 224.0.0.81 */
62 #define IP_VS_SYNC_PORT  8848          /* multicast port */
63
64 #define SYNC_PROTO_VER  1               /* Protocol version in header */
65
66 static struct lock_class_key __ipvs_sync_key;
/*
 *	IPVS sync connection entry
 *	Version 0, i.e. original version.
 *	This is the on-wire layout of one connection in a version 0 sync
 *	message; field order and sizes must stay exactly as receivers expect.
 */
struct ip_vs_sync_conn_v0 {
	__u8			reserved;

	/* Protocol, addresses and port numbers */
	__u8			protocol;	/* Which protocol (TCP/UDP) */
	__be16			cport;		/* client port */
	__be16			vport;		/* virtual service port */
	__be16			dport;		/* real server (dest) port */
	__be32			caddr;		/* client address */
	__be32			vaddr;		/* virtual address */
	__be32			daddr;		/* destination address */

	/* Flags and state transition */
	__be16			flags;		/* status flags */
	__be16			state;		/* state info */

	/* The sequence options start here */
};
89
/* TCP sequence-number adjustment data, appended after the connection
 * entry when IP_VS_CONN_F_SEQ_MASK is set.  Per the header comment,
 * version 1 sends these in network byte order.
 */
struct ip_vs_sync_conn_options {
	struct ip_vs_seq	in_seq;		/* incoming seq. struct */
	struct ip_vs_seq	out_seq;	/* outgoing seq. struct */
};
94
95 /*
96      Sync Connection format (sync_conn)
97
98        0                   1                   2                   3
99        0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
100       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101       |    Type       |    Protocol   | Ver.  |        Size           |
102       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
103       |                             Flags                             |
104       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
105       |            State              |         cport                 |
106       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
107       |            vport              |         dport                 |
108       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
109       |                             fwmark                            |
110       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
111       |                             timeout  (in sec.)                |
112       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
113       |                              ...                              |
114       |                        IP-Addresses  (v4 or v6)               |
115       |                              ...                              |
116       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
117   Optional Parameters.
118       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
119       | Param. Type    | Param. Length |   Param. data                |
120       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+                               |
121       |                              ...                              |
122       |                               +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
123       |                               | Param Type    | Param. Length |
124       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
125       |                           Param  data                         |
126       |         Last Param data should be padded for 32 bit alignment |
127       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
128 */
129
/*
 *  Type 0, IPv4 sync connection format (see the Sync Connection format
 *  diagram above).  On-wire layout; do not reorder or resize fields.
 */
struct ip_vs_sync_v4 {
	__u8			type;		/* STYPE_F_INET6 clear => IPv4 */
	__u8			protocol;	/* Which protocol (TCP/UDP) */
	__be16			ver_size;	/* Version msb 4 bits, size in low 12 */
	/* Flags and state transition */
	__be32			flags;		/* status flags */
	__be16			state;		/* state info	*/
	/* Protocol, addresses and port numbers */
	__be16			cport;		/* client port */
	__be16			vport;		/* virtual service port */
	__be16			dport;		/* real server (dest) port */
	__be32			fwmark;		/* Firewall mark from skb */
	__be32			timeout;	/* cp timeout, in seconds */
	__be32			caddr;		/* client address */
	__be32			vaddr;		/* virtual address */
	__be32			daddr;		/* destination address */
	/* The sequence options start here */
	/* PE data padded to 32bit alignment after seq. options */
};
/*
 * Type 2 messages IPv6: same layout as ip_vs_sync_v4 but with
 * 128-bit addresses.  On-wire layout; do not reorder or resize fields.
 */
struct ip_vs_sync_v6 {
	__u8			type;		/* STYPE_F_INET6 set => IPv6 */
	__u8			protocol;	/* Which protocol (TCP/UDP) */
	__be16			ver_size;	/* Version msb 4 bits, size in low 12 */
	/* Flags and state transition */
	__be32			flags;		/* status flags */
	__be16			state;		/* state info	*/
	/* Protocol, addresses and port numbers */
	__be16			cport;		/* client port */
	__be16			vport;		/* virtual service port */
	__be16			dport;		/* real server (dest) port */
	__be32			fwmark;		/* Firewall mark from skb */
	__be32			timeout;	/* cp timeout, in seconds */
	struct in6_addr		caddr;		/* client address */
	struct in6_addr		vaddr;		/* virtual address */
	struct in6_addr		daddr;		/* destination address */
	/* The sequence options start here */
	/* PE data padded to 32bit alignment after seq. options */
};
174
/* A version 1 sync connection entry: the first byte (type) selects
 * between the IPv4 and IPv6 variants (see STYPE_F_INET6).
 */
union ip_vs_sync_conn {
	struct ip_vs_sync_v4	v4;
	struct ip_vs_sync_v6	v6;
};
179
180 /* Bits in Type field in above */
181 #define STYPE_INET6             0
182 #define STYPE_F_INET6           (1 << STYPE_INET6)
183
184 #define SVER_SHIFT              12              /* Shift to get version */
185 #define SVER_MASK               0x0fff          /* Mask to strip version */
186
187 #define IPVS_OPT_SEQ_DATA       1
188 #define IPVS_OPT_PE_DATA        2
189 #define IPVS_OPT_PE_NAME        3
190 #define IPVS_OPT_PARAM          7
191
192 #define IPVS_OPT_F_SEQ_DATA     (1 << (IPVS_OPT_SEQ_DATA-1))
193 #define IPVS_OPT_F_PE_DATA      (1 << (IPVS_OPT_PE_DATA-1))
194 #define IPVS_OPT_F_PE_NAME      (1 << (IPVS_OPT_PE_NAME-1))
195 #define IPVS_OPT_F_PARAM        (1 << (IPVS_OPT_PARAM-1))
196
/* Per-kthread state passed to the master/backup sync threads. */
struct ip_vs_sync_thread_data {
	struct netns_ipvs *ipvs;	/* owning network namespace */
	struct socket *sock;		/* multicast socket used by this thread */
	char *buf;			/* message buffer */
	int id;				/* thread index */
};
203
204 /* Version 0 definition of packet sizes */
205 #define SIMPLE_CONN_SIZE  (sizeof(struct ip_vs_sync_conn_v0))
206 #define FULL_CONN_SIZE  \
207 (sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options))
208
209
210 /*
211   The master mulitcasts messages (Datagrams) to the backup load balancers
212   in the following format.
213
214  Version 1:
215   Note, first byte should be Zero, so ver 0 receivers will drop the packet.
216
217        0                   1                   2                   3
218        0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
219       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
220       |      0        |    SyncID     |            Size               |
221       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
222       |  Count Conns  |    Version    |    Reserved, set to Zero      |
223       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
224       |                                                               |
225       |                    IPVS Sync Connection (1)                   |
226       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
227       |                            .                                  |
228       ~                            .                                  ~
229       |                            .                                  |
230       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
231       |                                                               |
232       |                    IPVS Sync Connection (n)                   |
233       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
234
235  Version 0 Header
236        0                   1                   2                   3
237        0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
238       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
239       |  Count Conns  |    SyncID     |            Size               |
240       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
241       |                    IPVS Sync Connection (1)                   |
242 */
243
244 #define SYNC_MESG_HEADER_LEN    4
245 #define MAX_CONNS_PER_SYNCBUFF  255 /* nr_conns in ip_vs_sync_mesg is 8 bit */
246
/* Version 0 header — matches the "Version 0 Header" diagram above.
 * On-wire layout; do not change.
 */
struct ip_vs_sync_mesg_v0 {
	__u8			nr_conns;	/* number of conn entries */
	__u8			syncid;		/* sync daemon instance id */
	__be16			size;		/* total message size, bytes */

	/* ip_vs_sync_conn entries start here */
};
255
/* Version 1 header — matches the "Version 1" diagram above.
 * The first byte must be zero so that version 0 receivers, which
 * interpret it as nr_conns, drop version 1 messages.
 */
struct ip_vs_sync_mesg {
	__u8			reserved;	/* must be zero */
	__u8			syncid;		/* sync daemon instance id */
	__be16			size;		/* total message size, bytes */
	__u8			nr_conns;	/* number of conn entries */
	__s8			version;	/* SYNC_PROTO_VER  */
	__u16			spare;		/* reserved, set to zero */
	/* ip_vs_sync_conn entries start here */
};
266
/* IPv4/IPv6 socket address, used for the sync multicast endpoints. */
union ipvs_sockaddr {
	struct sockaddr_in	in;
	struct sockaddr_in6	in6;
};
271
/* A sync message under construction, or queued for transmission. */
struct ip_vs_sync_buff {
	struct list_head	list;		/* link in ms->sync_queue */
	unsigned long		firstuse;	/* jiffies when created */

	/* pointers for the message data */
	struct ip_vs_sync_mesg	*mesg;		/* message header + payload */
	unsigned char		*head;		/* current write position */
	unsigned char		*end;		/* end of allocated area */
};
281
282 /*
283  * Copy of struct ip_vs_seq
284  * From unaligned network order to aligned host order
285  */
286 static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho)
287 {
288         memset(ho, 0, sizeof(*ho));
289         ho->init_seq       = get_unaligned_be32(&no->init_seq);
290         ho->delta          = get_unaligned_be32(&no->delta);
291         ho->previous_delta = get_unaligned_be32(&no->previous_delta);
292 }
293
294 /*
295  * Copy of struct ip_vs_seq
296  * From Aligned host order to unaligned network order
297  */
298 static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no)
299 {
300         put_unaligned_be32(ho->init_seq, &no->init_seq);
301         put_unaligned_be32(ho->delta, &no->delta);
302         put_unaligned_be32(ho->previous_delta, &no->previous_delta);
303 }
304
305 static inline struct ip_vs_sync_buff *
306 sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)
307 {
308         struct ip_vs_sync_buff *sb;
309
310         spin_lock_bh(&ipvs->sync_lock);
311         if (list_empty(&ms->sync_queue)) {
312                 sb = NULL;
313                 __set_current_state(TASK_INTERRUPTIBLE);
314         } else {
315                 sb = list_entry(ms->sync_queue.next, struct ip_vs_sync_buff,
316                                 list);
317                 list_del(&sb->list);
318                 ms->sync_queue_len--;
319                 if (!ms->sync_queue_len)
320                         ms->sync_queue_delay = 0;
321         }
322         spin_unlock_bh(&ipvs->sync_lock);
323
324         return sb;
325 }
326
327 /*
328  * Create a new sync buffer for Version 1 proto.
329  */
330 static inline struct ip_vs_sync_buff *
331 ip_vs_sync_buff_create(struct netns_ipvs *ipvs, unsigned int len)
332 {
333         struct ip_vs_sync_buff *sb;
334
335         if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
336                 return NULL;
337
338         len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg),
339                     ipvs->mcfg.sync_maxlen);
340         sb->mesg = kmalloc(len, GFP_ATOMIC);
341         if (!sb->mesg) {
342                 kfree(sb);
343                 return NULL;
344         }
345         sb->mesg->reserved = 0;  /* old nr_conns i.e. must be zero now */
346         sb->mesg->version = SYNC_PROTO_VER;
347         sb->mesg->syncid = ipvs->mcfg.syncid;
348         sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg));
349         sb->mesg->nr_conns = 0;
350         sb->mesg->spare = 0;
351         sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg);
352         sb->end = (unsigned char *)sb->mesg + len;
353
354         sb->firstuse = jiffies;
355         return sb;
356 }
357
/* Free a sync buffer together with its message payload. */
static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
{
	kfree(sb->mesg);
	kfree(sb);
}
363
/*
 *	Move the current sync buffer (ms->sync_buff) to the tail of the
 *	transmit queue, or drop it when master mode is off or the queue
 *	limit is exceeded.
 *
 *	Callers (ip_vs_sync_conn / ip_vs_sync_conn_v0) already hold
 *	sync_buff_lock with BHs disabled, hence the plain spin_lock()
 *	on sync_lock here.
 */
static inline void sb_queue_tail(struct netns_ipvs *ipvs,
				 struct ipvs_master_sync_state *ms)
{
	struct ip_vs_sync_buff *sb = ms->sync_buff;

	spin_lock(&ipvs->sync_lock);
	if (ipvs->sync_state & IP_VS_STATE_MASTER &&
	    ms->sync_queue_len < sysctl_sync_qlen_max(ipvs)) {
		/* First buffer on an empty queue: arm the delayed wakeup
		 * so it gets sent even if no further conns are queued.
		 */
		if (!ms->sync_queue_len)
			schedule_delayed_work(&ms->master_wakeup_work,
					      max(IPVS_SYNC_SEND_DELAY, 1));
		ms->sync_queue_len++;
		list_add_tail(&sb->list, &ms->sync_queue);
		/* Wake the master thread directly after every
		 * IPVS_SYNC_WAKEUP_RATE queued buffers.
		 */
		if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE)
			wake_up_process(ms->master_thread);
	} else
		ip_vs_sync_buff_release(sb);
	spin_unlock(&ipvs->sync_lock);
}
383
384 /*
385  *      Get the current sync buffer if it has been created for more
386  *      than the specified time or the specified time is zero.
387  */
388 static inline struct ip_vs_sync_buff *
389 get_curr_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms,
390                    unsigned long time)
391 {
392         struct ip_vs_sync_buff *sb;
393
394         spin_lock_bh(&ipvs->sync_buff_lock);
395         sb = ms->sync_buff;
396         if (sb && time_after_eq(jiffies - sb->firstuse, time)) {
397                 ms->sync_buff = NULL;
398                 __set_current_state(TASK_RUNNING);
399         } else
400                 sb = NULL;
401         spin_unlock_bh(&ipvs->sync_buff_lock);
402         return sb;
403 }
404
405 static inline int
406 select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp)
407 {
408         return ((long) cp >> (1 + ilog2(sizeof(*cp)))) & ipvs->threads_mask;
409 }
410
411 /*
412  * Create a new sync buffer for Version 0 proto.
413  */
414 static inline struct ip_vs_sync_buff *
415 ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs, unsigned int len)
416 {
417         struct ip_vs_sync_buff *sb;
418         struct ip_vs_sync_mesg_v0 *mesg;
419
420         if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
421                 return NULL;
422
423         len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg_v0),
424                     ipvs->mcfg.sync_maxlen);
425         sb->mesg = kmalloc(len, GFP_ATOMIC);
426         if (!sb->mesg) {
427                 kfree(sb);
428                 return NULL;
429         }
430         mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg;
431         mesg->nr_conns = 0;
432         mesg->syncid = ipvs->mcfg.syncid;
433         mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0));
434         sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0);
435         sb->end = (unsigned char *)mesg + len;
436         sb->firstuse = jiffies;
437         return sb;
438 }
439
440 /* Check if connection is controlled by persistence */
441 static inline bool in_persistence(struct ip_vs_conn *cp)
442 {
443         for (cp = cp->control; cp; cp = cp->control) {
444                 if (cp->flags & IP_VS_CONN_F_TEMPLATE)
445                         return true;
446         }
447         return false;
448 }
449
/* Check if conn should be synced.
 * pkts: conn packets, use sysctl_sync_threshold to avoid packet check
 * - (1) sync_refresh_period: reduce sync rate. Additionally, retry
 *	sync_retries times with period of sync_refresh_period/8
 * - (2) if both sync_refresh_period and sync_period are 0 send sync only
 *	for state changes or only once when pkts matches sync_threshold
 * - (3) templates: rate can be reduced only with sync_refresh_period or
 *	with (2)
 *
 * Returns non-zero when a sync message should be sent now.
 */
static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs,
				  struct ip_vs_conn *cp, int pkts)
{
	unsigned long orig = READ_ONCE(cp->sync_endtime);
	unsigned long now = jiffies;
	/* Proposed new sync_endtime, aligned down to 4 jiffies: the low
	 * two bits of sync_endtime carry the retry counter (orig & 3).
	 */
	unsigned long n = (now + cp->timeout) & ~3UL;
	unsigned int sync_refresh_period;
	int sync_period;
	int force;

	/* Check if we sync in current state */
	if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE))
		force = 0;
	else if (unlikely(sysctl_sync_persist_mode(ipvs) && in_persistence(cp)))
		return 0;
	else if (likely(cp->protocol == IPPROTO_TCP)) {
		/* Only these TCP states are worth syncing */
		if (!((1 << cp->state) &
		      ((1 << IP_VS_TCP_S_ESTABLISHED) |
		       (1 << IP_VS_TCP_S_FIN_WAIT) |
		       (1 << IP_VS_TCP_S_CLOSE) |
		       (1 << IP_VS_TCP_S_CLOSE_WAIT) |
		       (1 << IP_VS_TCP_S_TIME_WAIT))))
			return 0;
		force = cp->state != cp->old_state;
		/* State changes away from ESTABLISHED sync unconditionally */
		if (force && cp->state != IP_VS_TCP_S_ESTABLISHED)
			goto set;
	} else if (unlikely(cp->protocol == IPPROTO_SCTP)) {
		/* Only these SCTP states are worth syncing */
		if (!((1 << cp->state) &
		      ((1 << IP_VS_SCTP_S_ESTABLISHED) |
		       (1 << IP_VS_SCTP_S_SHUTDOWN_SENT) |
		       (1 << IP_VS_SCTP_S_SHUTDOWN_RECEIVED) |
		       (1 << IP_VS_SCTP_S_SHUTDOWN_ACK_SENT) |
		       (1 << IP_VS_SCTP_S_CLOSED))))
			return 0;
		force = cp->state != cp->old_state;
		if (force && cp->state != IP_VS_SCTP_S_ESTABLISHED)
			goto set;
	} else {
		/* UDP or another protocol with single state */
		force = 0;
	}

	sync_refresh_period = sysctl_sync_refresh_period(ipvs);
	if (sync_refresh_period > 0) {
		long diff = n - orig;
		long min_diff = max(cp->timeout >> 1, 10UL * HZ);

		/* Avoid sync if difference is below sync_refresh_period
		 * and below the half timeout.
		 */
		if (abs(diff) < min_t(long, sync_refresh_period, min_diff)) {
			/* Retry counter lives in the low 2 bits */
			int retries = orig & 3;

			if (retries >= sysctl_sync_retries(ipvs))
				return 0;
			/* Space retries sync_refresh_period/8 apart */
			if (time_before(now, orig - cp->timeout +
					(sync_refresh_period >> 3)))
				return 0;
			n |= retries + 1;
		}
	}
	sync_period = sysctl_sync_period(ipvs);
	if (sync_period > 0) {
		if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) &&
		    pkts % sync_period != sysctl_sync_threshold(ipvs))
			return 0;
	} else if (!sync_refresh_period &&
		   pkts != sysctl_sync_threshold(ipvs))
		return 0;

set:
	cp->old_state = cp->state;
	/* Race to update sync_endtime: only the CPU whose cmpxchg
	 * succeeds (or a forced state change) triggers the sync.
	 */
	n = cmpxchg(&cp->sync_endtime, orig, n);
	return n == orig || force;
}
534
/*
 *	Version 0 , could be switched in by sys_ctl.
 *	Add an ip_vs_conn information into the current sync_buff.
 *	IPv4 only; IPv6 connections cannot be expressed in the v0 format.
 */
static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp,
			       int pkts)
{
	struct ip_vs_sync_mesg_v0 *m;
	struct ip_vs_sync_conn_v0 *s;
	struct ip_vs_sync_buff *buff;
	struct ipvs_master_sync_state *ms;
	int id;
	unsigned int len;

	if (unlikely(cp->af != AF_INET))
		return;
	/* Do not sync ONE PACKET */
	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
		return;

	if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
		return;

	spin_lock_bh(&ipvs->sync_buff_lock);
	if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
		spin_unlock_bh(&ipvs->sync_buff_lock);
		return;
	}

	id = select_master_thread_id(ipvs, cp);
	ms = &ipvs->ms[id];
	buff = ms->sync_buff;
	/* Entry size: sequence options are appended only when present */
	len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
		SIMPLE_CONN_SIZE;
	if (buff) {
		m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
		/* Send buffer if it is for v1: a v1 header has zero in the
		 * first byte (reserved), while a v0 buffer always has at
		 * least one conn counted here.  Also flush when full.
		 */
		if (buff->head + len > buff->end || !m->nr_conns) {
			sb_queue_tail(ipvs, ms);
			ms->sync_buff = NULL;
			buff = NULL;
		}
	}
	if (!buff) {
		buff = ip_vs_sync_buff_create_v0(ipvs, len);
		if (!buff) {
			spin_unlock_bh(&ipvs->sync_buff_lock);
			pr_err("ip_vs_sync_buff_create failed.\n");
			return;
		}
		ms->sync_buff = buff;
	}

	m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
	s = (struct ip_vs_sync_conn_v0 *) buff->head;

	/* copy members */
	s->reserved = 0;
	s->protocol = cp->protocol;
	s->cport = cp->cport;
	s->vport = cp->vport;
	s->dport = cp->dport;
	s->caddr = cp->caddr.ip;
	s->vaddr = cp->vaddr.ip;
	s->daddr = cp->daddr.ip;
	/* HASHED is a local property, never transmitted */
	s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
	s->state = htons(cp->state);
	if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
		struct ip_vs_sync_conn_options *opt =
			(struct ip_vs_sync_conn_options *)&s[1];
		/* v0 sends in_seq+out_seq as-is, in host order */
		memcpy(opt, &cp->in_seq, sizeof(*opt));
	}

	m->nr_conns++;
	m->size = htons(ntohs(m->size) + len);
	buff->head += len;
	spin_unlock_bh(&ipvs->sync_buff_lock);

	/* synchronize its controller if it has */
	cp = cp->control;
	if (cp) {
		if (cp->flags & IP_VS_CONN_F_TEMPLATE)
			pkts = atomic_add_return(1, &cp->in_pkts);
		else
			pkts = sysctl_sync_threshold(ipvs);
		ip_vs_sync_conn(ipvs, cp, pkts);
	}
}
623
/*
 *	Add an ip_vs_conn information into the current sync_buff.
 *	Called by ip_vs_in.
 *	Sending Version 1 messages; falls back to the v0 path when
 *	net.ipv4.vs.sync_version is 0.  Also syncs the whole ->control
 *	chain by looping back to sloop.
 */
void ip_vs_sync_conn(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts)
{
	struct ip_vs_sync_mesg *m;
	union ip_vs_sync_conn *s;
	struct ip_vs_sync_buff *buff;
	struct ipvs_master_sync_state *ms;
	int id;
	__u8 *p;
	unsigned int len, pe_name_len, pad;

	/* Handle old version of the protocol */
	if (sysctl_sync_ver(ipvs) == 0) {
		ip_vs_sync_conn_v0(ipvs, cp, pkts);
		return;
	}
	/* Do not sync ONE PACKET */
	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
		goto control;
sloop:
	if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
		goto control;

	/* Sanity checks */
	pe_name_len = 0;
	if (cp->pe_data_len) {
		if (!cp->pe_data || !cp->dest) {
			IP_VS_ERR_RL("SYNC, connection pe_data invalid\n");
			return;
		}
		pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN);
	}

	spin_lock_bh(&ipvs->sync_buff_lock);
	if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
		spin_unlock_bh(&ipvs->sync_buff_lock);
		return;
	}

	id = select_master_thread_id(ipvs, cp);
	ms = &ipvs->ms[id];

	/* Base entry size depends on address family ... */
#ifdef CONFIG_IP_VS_IPV6
	if (cp->af == AF_INET6)
		len = sizeof(struct ip_vs_sync_v6);
	else
#endif
		len = sizeof(struct ip_vs_sync_v4);

	/* ... plus optional parameters, each with a 2-byte type+len hdr */
	if (cp->flags & IP_VS_CONN_F_SEQ_MASK)
		len += sizeof(struct ip_vs_sync_conn_options) + 2;

	if (cp->pe_data_len)
		len += cp->pe_data_len + 2;	/* + Param hdr field */
	if (pe_name_len)
		len += pe_name_len + 2;

	/* check if there is a space for this one  */
	pad = 0;
	buff = ms->sync_buff;
	if (buff) {
		m = buff->mesg;
		/* Bytes needed to 32-bit align the entry start */
		pad = (4 - (size_t) buff->head) & 3;
		/* Send buffer if it is for v0 (m->reserved is nr_conns
		 * there, non-zero) or if the new entry won't fit.
		 */
		if (buff->head + len + pad > buff->end || m->reserved) {
			sb_queue_tail(ipvs, ms);
			ms->sync_buff = NULL;
			buff = NULL;
			pad = 0;
		}
	}

	if (!buff) {
		buff = ip_vs_sync_buff_create(ipvs, len);
		if (!buff) {
			spin_unlock_bh(&ipvs->sync_buff_lock);
			pr_err("ip_vs_sync_buff_create failed.\n");
			return;
		}
		ms->sync_buff = buff;
		m = buff->mesg;
	}

	p = buff->head;
	buff->head += pad + len;
	m->size = htons(ntohs(m->size) + pad + len);
	/* Add ev. padding from prev. sync_conn */
	while (pad--)
		*(p++) = 0;

	s = (union ip_vs_sync_conn *)p;

	/* Set message type  & copy members */
	s->v4.type = (cp->af == AF_INET6 ? STYPE_F_INET6 : 0);
	s->v4.ver_size = htons(len & SVER_MASK);	/* Version 0 */
	/* HASHED is a local property, never transmitted */
	s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED);
	s->v4.state = htons(cp->state);
	s->v4.protocol = cp->protocol;
	s->v4.cport = cp->cport;
	s->v4.vport = cp->vport;
	s->v4.dport = cp->dport;
	s->v4.fwmark = htonl(cp->fwmark);
	s->v4.timeout = htonl(cp->timeout / HZ);
	m->nr_conns++;

#ifdef CONFIG_IP_VS_IPV6
	if (cp->af == AF_INET6) {
		p += sizeof(struct ip_vs_sync_v6);
		s->v6.caddr = cp->caddr.in6;
		s->v6.vaddr = cp->vaddr.in6;
		s->v6.daddr = cp->daddr.in6;
	} else
#endif
	{
		p += sizeof(struct ip_vs_sync_v4);	/* options ptr */
		s->v4.caddr = cp->caddr.ip;
		s->v4.vaddr = cp->vaddr.ip;
		s->v4.daddr = cp->daddr.ip;
	}
	/* Optional params follow: 1-byte type, 1-byte length, data */
	if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
		*(p++) = IPVS_OPT_SEQ_DATA;
		*(p++) = sizeof(struct ip_vs_sync_conn_options);
		hton_seq((struct ip_vs_seq *)p, &cp->in_seq);
		p += sizeof(struct ip_vs_seq);
		hton_seq((struct ip_vs_seq *)p, &cp->out_seq);
		p += sizeof(struct ip_vs_seq);
	}
	/* Handle pe data */
	if (cp->pe_data_len && cp->pe_data) {
		*(p++) = IPVS_OPT_PE_DATA;
		*(p++) = cp->pe_data_len;
		memcpy(p, cp->pe_data, cp->pe_data_len);
		p += cp->pe_data_len;
		if (pe_name_len) {
			/* Add PE_NAME */
			*(p++) = IPVS_OPT_PE_NAME;
			*(p++) = pe_name_len;
			memcpy(p, cp->pe->name, pe_name_len);
			p += pe_name_len;
		}
	}

	spin_unlock_bh(&ipvs->sync_buff_lock);

control:
	/* synchronize its controller if it has */
	cp = cp->control;
	if (!cp)
		return;
	if (cp->flags & IP_VS_CONN_F_TEMPLATE)
		pkts = atomic_add_return(1, &cp->in_pkts);
	else
		pkts = sysctl_sync_threshold(ipvs);
	goto sloop;
}
783
/*
 *  fill_param used by version 1
 *
 *  Fill @p from a received (version 1) sync connection @sc, for family
 *  @af, and attach any persistence-engine (PE) options.
 *
 *  Returns 0 on success; 1 if the PE parameters are invalid (data sent
 *  without a name, or the named engine is not loaded); -ENOMEM if the
 *  PE data cannot be duplicated.
 */
static inline int
ip_vs_conn_fill_param_sync(struct netns_ipvs *ipvs, int af, union ip_vs_sync_conn *sc,
                           struct ip_vs_conn_param *p,
                           __u8 *pe_data, unsigned int pe_data_len,
                           __u8 *pe_name, unsigned int pe_name_len)
{
#ifdef CONFIG_IP_VS_IPV6
        if (af == AF_INET6)
                ip_vs_conn_fill_param(ipvs, af, sc->v6.protocol,
                                      (const union nf_inet_addr *)&sc->v6.caddr,
                                      sc->v6.cport,
                                      (const union nf_inet_addr *)&sc->v6.vaddr,
                                      sc->v6.vport, p);
        else
#endif
                ip_vs_conn_fill_param(ipvs, af, sc->v4.protocol,
                                      (const union nf_inet_addr *)&sc->v4.caddr,
                                      sc->v4.cport,
                                      (const union nf_inet_addr *)&sc->v4.vaddr,
                                      sc->v4.vport, p);
        /* Handle pe data */
        if (pe_data_len) {
                if (pe_name_len) {
                        char buff[IP_VS_PENAME_MAXLEN+1];

                        /* pe_name_len was bounded to IP_VS_PENAME_MAXLEN by
                         * the option parser (ip_vs_proc_str), so buff cannot
                         * overflow.
                         */
                        memcpy(buff, pe_name, pe_name_len);
                        buff[pe_name_len]=0;
                        p->pe = __ip_vs_pe_getbyname(buff);
                        if (!p->pe) {
                                IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n",
                                             buff);
                                return 1;
                        }
                } else {
                        /* PE data without an accompanying PE name is bogus */
                        IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n");
                        return 1;
                }

                p->pe_data = kmemdup(pe_data, pe_data_len, GFP_ATOMIC);
                if (!p->pe_data) {
                        /* release the PE (and its module ref) before failing */
                        module_put(p->pe->module);
                        return -ENOMEM;
                }
                p->pe_data_len = pe_data_len;
        }
        return 0;
}
834
/*
 *  Connection Add / Update.
 *  Common for version 0 and 1 reception of backup sync_conns.
 *  Param: ...
 *         timeout is in sec.
 *
 *  Looks up an existing connection (or connection template, when
 *  IP_VS_CONN_F_TEMPLATE is set in @flags) matching @param and updates
 *  its flags/state/timeout, or creates a new one bound to the real
 *  server given by @daddr/@dport.  Takes ownership of param->pe_data:
 *  it is either freed here or handed over to a new template connection.
 */
static void ip_vs_proc_conn(struct netns_ipvs *ipvs, struct ip_vs_conn_param *param,
                            unsigned int flags, unsigned int state,
                            unsigned int protocol, unsigned int type,
                            const union nf_inet_addr *daddr, __be16 dport,
                            unsigned long timeout, __u32 fwmark,
                            struct ip_vs_sync_conn_options *opt)
{
        struct ip_vs_dest *dest;
        struct ip_vs_conn *cp;

        if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
                cp = ip_vs_conn_in_get(param);
                /* Same client tuple but a different real server: the
                 * master has rescheduled this connection.
                 */
                if (cp && ((cp->dport != dport) ||
                           !ip_vs_addr_equal(cp->daf, &cp->daddr, daddr))) {
                        if (!(flags & IP_VS_CONN_F_INACTIVE)) {
                                /* Drop the stale conn so a fresh one is
                                 * created below for the new destination.
                                 */
                                ip_vs_conn_expire_now(cp);
                                __ip_vs_conn_put(cp);
                                cp = NULL;
                        } else {
                                /* This is the expiration message for the
                                 * connection that was already replaced, so we
                                 * just ignore it.
                                 */
                                __ip_vs_conn_put(cp);
                                kfree(param->pe_data);
                                return;
                        }
                }
        } else {
                cp = ip_vs_ct_in_get(param);
        }

        if (cp) {
                /* Free pe_data */
                kfree(param->pe_data);

                dest = cp->dest;
                spin_lock_bh(&cp->lock);
                /* Active <-> inactive transition: keep the destination's
                 * activeconns/inactconns counters in sync.
                 */
                if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE &&
                    !(flags & IP_VS_CONN_F_TEMPLATE) && dest) {
                        if (flags & IP_VS_CONN_F_INACTIVE) {
                                atomic_dec(&dest->activeconns);
                                atomic_inc(&dest->inactconns);
                        } else {
                                atomic_inc(&dest->activeconns);
                                atomic_dec(&dest->inactconns);
                        }
                }
                /* Only the BACKUP_UPD_MASK bits come from the sync msg;
                 * all other flag bits are preserved from the local conn.
                 */
                flags &= IP_VS_CONN_F_BACKUP_UPD_MASK;
                flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK;
                cp->flags = flags;
                spin_unlock_bh(&cp->lock);
                if (!dest)
                        ip_vs_try_bind_dest(cp);
        } else {
                /*
                 * Find the appropriate destination for the connection.
                 * If it is not found the connection will remain unbound
                 * but still handled.
                 */
                rcu_read_lock();
                /* This function is only invoked by the synchronization
                 * code. We do not currently support heterogeneous pools
                 * with synchronization, so we can make the assumption that
                 * the svc_af is the same as the dest_af
                 */
                dest = ip_vs_find_dest(ipvs, type, type, daddr, dport,
                                       param->vaddr, param->vport, protocol,
                                       fwmark, flags);

                cp = ip_vs_conn_new(param, type, daddr, dport, flags, dest,
                                    fwmark);
                rcu_read_unlock();
                if (!cp) {
                        kfree(param->pe_data);
                        IP_VS_DBG(2, "BACKUP, add new conn. failed\n");
                        return;
                }
                /* Templates keep pe_data (taken over by ip_vs_conn_new);
                 * normal conns do not, so free it here.
                 */
                if (!(flags & IP_VS_CONN_F_TEMPLATE))
                        kfree(param->pe_data);
        }

        if (opt) {
                cp->in_seq = opt->in_seq;
                cp->out_seq = opt->out_seq;
        }
        /* Pre-load in_pkts so the backup does not sync this conn back */
        atomic_set(&cp->in_pkts, sysctl_sync_threshold(ipvs));
        cp->state = state;
        cp->old_state = cp->state;
        /*
         * For Ver 0 messages style
         *  - Not possible to recover the right timeout for templates
         *  - can not find the right fwmark
         *    virtual service. If needed, we can do it for
         *    non-fwmark persistent services.
         * Ver 1 messages style.
         *  - No problem.
         */
        if (timeout) {
                if (timeout > MAX_SCHEDULE_TIMEOUT / HZ)
                        timeout = MAX_SCHEDULE_TIMEOUT / HZ;
                cp->timeout = timeout*HZ;
        } else {
                struct ip_vs_proto_data *pd;

                pd = ip_vs_proto_data_get(ipvs, protocol);
                if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table)
                        cp->timeout = pd->timeout_table[state];
                else
                        cp->timeout = (3*60*HZ);
        }
        ip_vs_conn_put(cp);
}
954
/*
 *  Process received multicast message for Version 0
 *
 *  Walks the nr_conns fixed-size sync_conn_v0 records that follow the
 *  v0 header in @buffer (total length @buflen, already validated by the
 *  caller) and feeds each one to ip_vs_proc_conn().  Bounds are checked
 *  before every record and before optional seq data; on any bogus
 *  length the rest of the buffer is dropped.
 */
static void ip_vs_process_message_v0(struct netns_ipvs *ipvs, const char *buffer,
                                     const size_t buflen)
{
        struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer;
        struct ip_vs_sync_conn_v0 *s;
        struct ip_vs_sync_conn_options *opt;
        struct ip_vs_protocol *pp;
        struct ip_vs_conn_param param;
        char *p;
        int i;

        p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0);
        for (i=0; i<m->nr_conns; i++) {
                unsigned int flags, state;

                /* Make sure a full (optionless) record fits */
                if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
                        IP_VS_ERR_RL("BACKUP v0, bogus conn\n");
                        return;
                }
                s = (struct ip_vs_sync_conn_v0 *) p;
                flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC;
                flags &= ~IP_VS_CONN_F_HASHED;
                if (flags & IP_VS_CONN_F_SEQ_MASK) {
                        /* seq options immediately follow the record */
                        opt = (struct ip_vs_sync_conn_options *)&s[1];
                        p += FULL_CONN_SIZE;
                        if (p > buffer+buflen) {
                                IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n");
                                return;
                        }
                } else {
                        opt = NULL;
                        p += SIMPLE_CONN_SIZE;
                }

                state = ntohs(s->state);
                if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
                        pp = ip_vs_proto_get(s->protocol);
                        if (!pp) {
                                IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n",
                                        s->protocol);
                                continue;
                        }
                        if (state >= pp->num_states) {
                                IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n",
                                        pp->name, state);
                                continue;
                        }
                } else {
                        /* protocol in templates is not used for state/timeout */
                        if (state > 0) {
                                IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n",
                                        state);
                                state = 0;
                        }
                }

                /* v0 carries IPv4 addresses only */
                ip_vs_conn_fill_param(ipvs, AF_INET, s->protocol,
                                      (const union nf_inet_addr *)&s->caddr,
                                      s->cport,
                                      (const union nf_inet_addr *)&s->vaddr,
                                      s->vport, &param);

                /* Send timeout as Zero */
                ip_vs_proc_conn(ipvs, &param, flags, state, s->protocol, AF_INET,
                                (union nf_inet_addr *)&s->daddr, s->dport,
                                0, 0, opt);
        }
}
1026
1027 /*
1028  * Handle options
1029  */
1030 static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen,
1031                                     __u32 *opt_flags,
1032                                     struct ip_vs_sync_conn_options *opt)
1033 {
1034         struct ip_vs_sync_conn_options *topt;
1035
1036         topt = (struct ip_vs_sync_conn_options *)p;
1037
1038         if (plen != sizeof(struct ip_vs_sync_conn_options)) {
1039                 IP_VS_DBG(2, "BACKUP, bogus conn options length\n");
1040                 return -EINVAL;
1041         }
1042         if (*opt_flags & IPVS_OPT_F_SEQ_DATA) {
1043                 IP_VS_DBG(2, "BACKUP, conn options found twice\n");
1044                 return -EINVAL;
1045         }
1046         ntoh_seq(&topt->in_seq, &opt->in_seq);
1047         ntoh_seq(&topt->out_seq, &opt->out_seq);
1048         *opt_flags |= IPVS_OPT_F_SEQ_DATA;
1049         return 0;
1050 }
1051
1052 static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len,
1053                           __u8 **data, unsigned int maxlen,
1054                           __u32 *opt_flags, __u32 flag)
1055 {
1056         if (plen > maxlen) {
1057                 IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen);
1058                 return -EINVAL;
1059         }
1060         if (*opt_flags & flag) {
1061                 IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag);
1062                 return -EINVAL;
1063         }
1064         *data_len = plen;
1065         *data = p;
1066         *opt_flags |= flag;
1067         return 0;
1068 }
/*
 *   Process a Version 1 sync. connection
 *
 *   @p points at one sync_conn inside a v1 message; @msg_end is the
 *   first byte past it.  Parses the fixed v4/v6 part, then the TLV
 *   options (seq data, PE data, PE name), and hands the result to
 *   ip_vs_proc_conn().
 *
 *   Returns 0 on success; a negative value for malformed input that
 *   should drop the whole buffer; a positive value for a single
 *   sync_conn that is merely skipped.
 */
static inline int ip_vs_proc_sync_conn(struct netns_ipvs *ipvs, __u8 *p, __u8 *msg_end)
{
        struct ip_vs_sync_conn_options opt;
        union  ip_vs_sync_conn *s;
        struct ip_vs_protocol *pp;
        struct ip_vs_conn_param param;
        __u32 flags;
        unsigned int af, state, pe_data_len=0, pe_name_len=0;
        __u8 *pe_data=NULL, *pe_name=NULL;
        __u32 opt_flags=0;
        int retc=0;

        s = (union ip_vs_sync_conn *) p;

        /* Fixed part: family decides its size */
        if (s->v6.type & STYPE_F_INET6) {
#ifdef CONFIG_IP_VS_IPV6
                af = AF_INET6;
                p += sizeof(struct ip_vs_sync_v6);
#else
                IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n");
                retc = 10;
                goto out;
#endif
        } else if (!s->v4.type) {
                af = AF_INET;
                p += sizeof(struct ip_vs_sync_v4);
        } else {
                return -10;
        }
        if (p > msg_end)
                return -20;

        /* Process optional params check Type & Len. */
        while (p < msg_end) {
                int ptype;
                int plen;

                /* need room for the 2-byte type/len header */
                if (p+2 > msg_end)
                        return -30;
                ptype = *(p++);
                plen  = *(p++);

                if (!plen || ((p + plen) > msg_end))
                        return -40;
                /* Handle seq option  p = param data */
                switch (ptype & ~IPVS_OPT_F_PARAM) {
                case IPVS_OPT_SEQ_DATA:
                        if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt))
                                return -50;
                        break;

                case IPVS_OPT_PE_DATA:
                        if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data,
                                           IP_VS_PEDATA_MAXLEN, &opt_flags,
                                           IPVS_OPT_F_PE_DATA))
                                return -60;
                        break;

                case IPVS_OPT_PE_NAME:
                        if (ip_vs_proc_str(p, plen,&pe_name_len, &pe_name,
                                           IP_VS_PENAME_MAXLEN, &opt_flags,
                                           IPVS_OPT_F_PE_NAME))
                                return -70;
                        break;

                default:
                        /* Param data mandatory ? */
                        if (!(ptype & IPVS_OPT_F_PARAM)) {
                                IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n",
                                          ptype & ~IPVS_OPT_F_PARAM);
                                retc = 20;
                                goto out;
                        }
                }
                p += plen;  /* Next option */
        }

        /* Get flags and Mask off unsupported */
        flags  = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK;
        flags |= IP_VS_CONN_F_SYNC;
        state = ntohs(s->v4.state);

        if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
                pp = ip_vs_proto_get(s->v4.protocol);
                if (!pp) {
                        IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n",
                                s->v4.protocol);
                        retc = 30;
                        goto out;
                }
                if (state >= pp->num_states) {
                        IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n",
                                pp->name, state);
                        retc = 40;
                        goto out;
                }
        } else {
                /* protocol in templates is not used for state/timeout */
                if (state > 0) {
                        IP_VS_DBG(3, "BACKUP, Invalid template state %u\n",
                                state);
                        state = 0;
                }
        }
        if (ip_vs_conn_fill_param_sync(ipvs, af, s, &param, pe_data,
                                       pe_data_len, pe_name, pe_name_len)) {
                retc = 50;
                goto out;
        }
        /* If only IPv4, just silent skip IPv6 */
        if (af == AF_INET)
                ip_vs_proc_conn(ipvs, &param, flags, state, s->v4.protocol, af,
                                (union nf_inet_addr *)&s->v4.daddr, s->v4.dport,
                                ntohl(s->v4.timeout), ntohl(s->v4.fwmark),
                                (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
                                );
#ifdef CONFIG_IP_VS_IPV6
        else
                ip_vs_proc_conn(ipvs, &param, flags, state, s->v6.protocol, af,
                                (union nf_inet_addr *)&s->v6.daddr, s->v6.dport,
                                ntohl(s->v6.timeout), ntohl(s->v6.fwmark),
                                (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
                                );
#endif
        /* drop the PE ref taken in ip_vs_conn_fill_param_sync() */
        ip_vs_pe_put(param.pe);
        return 0;
        /* Error exit */
out:
        IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc);
        return retc;

}
1204 /*
1205  *      Process received multicast message and create the corresponding
1206  *      ip_vs_conn entries.
1207  *      Handles Version 0 & 1
1208  */
1209 static void ip_vs_process_message(struct netns_ipvs *ipvs, __u8 *buffer,
1210                                   const size_t buflen)
1211 {
1212         struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer;
1213         __u8 *p, *msg_end;
1214         int i, nr_conns;
1215
1216         if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) {
1217                 IP_VS_DBG(2, "BACKUP, message header too short\n");
1218                 return;
1219         }
1220
1221         if (buflen != ntohs(m2->size)) {
1222                 IP_VS_DBG(2, "BACKUP, bogus message size\n");
1223                 return;
1224         }
1225         /* SyncID sanity check */
1226         if (ipvs->bcfg.syncid != 0 && m2->syncid != ipvs->bcfg.syncid) {
1227                 IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid);
1228                 return;
1229         }
1230         /* Handle version 1  message */
1231         if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0)
1232             && (m2->spare == 0)) {
1233
1234                 msg_end = buffer + sizeof(struct ip_vs_sync_mesg);
1235                 nr_conns = m2->nr_conns;
1236
1237                 for (i=0; i<nr_conns; i++) {
1238                         union ip_vs_sync_conn *s;
1239                         unsigned int size;
1240                         int retc;
1241
1242                         p = msg_end;
1243                         if (p + sizeof(s->v4) > buffer+buflen) {
1244                                 IP_VS_ERR_RL("BACKUP, Dropping buffer, to small\n");
1245                                 return;
1246                         }
1247                         s = (union ip_vs_sync_conn *)p;
1248                         size = ntohs(s->v4.ver_size) & SVER_MASK;
1249                         msg_end = p + size;
1250                         /* Basic sanity checks */
1251                         if (msg_end  > buffer+buflen) {
1252                                 IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n");
1253                                 return;
1254                         }
1255                         if (ntohs(s->v4.ver_size) >> SVER_SHIFT) {
1256                                 IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n",
1257                                               ntohs(s->v4.ver_size) >> SVER_SHIFT);
1258                                 return;
1259                         }
1260                         /* Process a single sync_conn */
1261                         retc = ip_vs_proc_sync_conn(ipvs, p, msg_end);
1262                         if (retc < 0) {
1263                                 IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n",
1264                                              retc);
1265                                 return;
1266                         }
1267                         /* Make sure we have 32 bit alignment */
1268                         msg_end = p + ((size + 3) & ~3);
1269                 }
1270         } else {
1271                 /* Old type of message */
1272                 ip_vs_process_message_v0(ipvs, buffer, buflen);
1273                 return;
1274         }
1275 }
1276
1277
1278 /*
1279  *      Setup sndbuf (mode=1) or rcvbuf (mode=0)
1280  */
1281 static void set_sock_size(struct sock *sk, int mode, int val)
1282 {
1283         /* setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val)); */
1284         /* setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)); */
1285         lock_sock(sk);
1286         if (mode) {
1287                 val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2,
1288                               sysctl_wmem_max);
1289                 sk->sk_sndbuf = val * 2;
1290                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1291         } else {
1292                 val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2,
1293                               sysctl_rmem_max);
1294                 sk->sk_rcvbuf = val * 2;
1295                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
1296         }
1297         release_sock(sk);
1298 }
1299
1300 /*
1301  *      Setup loopback of outgoing multicasts on a sending socket
1302  */
1303 static void set_mcast_loop(struct sock *sk, u_char loop)
1304 {
1305         struct inet_sock *inet = inet_sk(sk);
1306
1307         /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
1308         lock_sock(sk);
1309         inet->mc_loop = loop ? 1 : 0;
1310 #ifdef CONFIG_IP_VS_IPV6
1311         if (sk->sk_family == AF_INET6) {
1312                 struct ipv6_pinfo *np = inet6_sk(sk);
1313
1314                 /* IPV6_MULTICAST_LOOP */
1315                 np->mc_loop = loop ? 1 : 0;
1316         }
1317 #endif
1318         release_sock(sk);
1319 }
1320
1321 /*
1322  *      Specify TTL for outgoing multicasts on a sending socket
1323  */
1324 static void set_mcast_ttl(struct sock *sk, u_char ttl)
1325 {
1326         struct inet_sock *inet = inet_sk(sk);
1327
1328         /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
1329         lock_sock(sk);
1330         inet->mc_ttl = ttl;
1331 #ifdef CONFIG_IP_VS_IPV6
1332         if (sk->sk_family == AF_INET6) {
1333                 struct ipv6_pinfo *np = inet6_sk(sk);
1334
1335                 /* IPV6_MULTICAST_HOPS */
1336                 np->mcast_hops = ttl;
1337         }
1338 #endif
1339         release_sock(sk);
1340 }
1341
1342 /* Control fragmentation of messages */
1343 static void set_mcast_pmtudisc(struct sock *sk, int val)
1344 {
1345         struct inet_sock *inet = inet_sk(sk);
1346
1347         /* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */
1348         lock_sock(sk);
1349         inet->pmtudisc = val;
1350 #ifdef CONFIG_IP_VS_IPV6
1351         if (sk->sk_family == AF_INET6) {
1352                 struct ipv6_pinfo *np = inet6_sk(sk);
1353
1354                 /* IPV6_MTU_DISCOVER */
1355                 np->pmtudisc = val;
1356         }
1357 #endif
1358         release_sock(sk);
1359 }
1360
1361 /*
1362  *      Specifiy default interface for outgoing multicasts
1363  */
1364 static int set_mcast_if(struct sock *sk, struct net_device *dev)
1365 {
1366         struct inet_sock *inet = inet_sk(sk);
1367
1368         if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
1369                 return -EINVAL;
1370
1371         lock_sock(sk);
1372         inet->mc_index = dev->ifindex;
1373         /*  inet->mc_addr  = 0; */
1374 #ifdef CONFIG_IP_VS_IPV6
1375         if (sk->sk_family == AF_INET6) {
1376                 struct ipv6_pinfo *np = inet6_sk(sk);
1377
1378                 /* IPV6_MULTICAST_IF */
1379                 np->mcast_oif = dev->ifindex;
1380         }
1381 #endif
1382         release_sock(sk);
1383
1384         return 0;
1385 }
1386
1387
1388 /*
1389  *      Join a multicast group.
1390  *      the group is specified by a class D multicast address 224.0.0.0/8
1391  *      in the in_addr structure passed in as a parameter.
1392  */
1393 static int
1394 join_mcast_group(struct sock *sk, struct in_addr *addr, struct net_device *dev)
1395 {
1396         struct ip_mreqn mreq;
1397         int ret;
1398
1399         memset(&mreq, 0, sizeof(mreq));
1400         memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
1401
1402         if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
1403                 return -EINVAL;
1404
1405         mreq.imr_ifindex = dev->ifindex;
1406
1407         lock_sock(sk);
1408         ret = ip_mc_join_group(sk, &mreq);
1409         release_sock(sk);
1410
1411         return ret;
1412 }
1413
#ifdef CONFIG_IP_VS_IPV6
/* IPv6 counterpart of join_mcast_group(): join @addr on @dev, refusing
 * if the socket is already bound to a different device.
 */
static int join_mcast_group6(struct sock *sk, struct in6_addr *addr,
                             struct net_device *dev)
{
        int ret;

        if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
                return -EINVAL;

        lock_sock(sk);
        ret = ipv6_sock_mc_join(sk, dev->ifindex, addr);
        release_sock(sk);

        return ret;
}
#endif
1430
1431 static int bind_mcastif_addr(struct socket *sock, struct net_device *dev)
1432 {
1433         __be32 addr;
1434         struct sockaddr_in sin;
1435
1436         addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1437         if (!addr)
1438                 pr_err("You probably need to specify IP address on "
1439                        "multicast interface.\n");
1440
1441         IP_VS_DBG(7, "binding socket with (%s) %pI4\n",
1442                   dev->name, &addr);
1443
1444         /* Now bind the socket with the address of multicast interface */
1445         sin.sin_family       = AF_INET;
1446         sin.sin_addr.s_addr  = addr;
1447         sin.sin_port         = 0;
1448
1449         return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
1450 }
1451
1452 static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen,
1453                                struct ipvs_sync_daemon_cfg *c, int id)
1454 {
1455         if (AF_INET6 == c->mcast_af) {
1456                 sa->in6 = (struct sockaddr_in6) {
1457                         .sin6_family = AF_INET6,
1458                         .sin6_port = htons(c->mcast_port + id),
1459                 };
1460                 sa->in6.sin6_addr = c->mcast_group.in6;
1461                 *salen = sizeof(sa->in6);
1462         } else {
1463                 sa->in = (struct sockaddr_in) {
1464                         .sin_family = AF_INET,
1465                         .sin_port = htons(c->mcast_port + id),
1466                 };
1467                 sa->in.sin_addr = c->mcast_group.in;
1468                 *salen = sizeof(sa->in);
1469         }
1470 }
1471
/*
 *      Set up sending multicast socket over UDP
 *
 *      Creates the per-thread (@id) master socket, ties it to @dev,
 *      applies loop/TTL/pmtudisc/bufsize settings from mcfg, binds the
 *      interface address (IPv4 only) and connects to the multicast
 *      group.  On success *sock_ret holds the socket and 0 is returned;
 *      on failure a negative errno is returned (the caller releases
 *      *sock_ret, which is set as soon as the socket exists).
 */
static int make_send_sock(struct netns_ipvs *ipvs, int id,
                          struct net_device *dev, struct socket **sock_ret)
{
        /* multicast addr */
        union ipvs_sockaddr mcast_addr;
        struct socket *sock;
        int result, salen;

        /* First create a socket */
        result = sock_create_kern(ipvs->net, ipvs->mcfg.mcast_af, SOCK_DGRAM,
                                  IPPROTO_UDP, &sock);
        if (result < 0) {
                pr_err("Error during creation of socket; terminating\n");
                goto error;
        }
        *sock_ret = sock;
        result = set_mcast_if(sock->sk, dev);
        if (result < 0) {
                pr_err("Error setting outbound mcast interface\n");
                goto error;
        }

        set_mcast_loop(sock->sk, 0);
        set_mcast_ttl(sock->sk, ipvs->mcfg.mcast_ttl);
        /* Allow fragmentation if MTU changes */
        set_mcast_pmtudisc(sock->sk, IP_PMTUDISC_DONT);
        result = sysctl_sync_sock_size(ipvs);
        if (result > 0)
                set_sock_size(sock->sk, 1, result);

        /* For IPv6 the interface address binding is skipped */
        if (AF_INET == ipvs->mcfg.mcast_af)
                result = bind_mcastif_addr(sock, dev);
        else
                result = 0;
        if (result < 0) {
                pr_err("Error binding address of the mcast interface\n");
                goto error;
        }

        get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id);
        result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr,
                                    salen, 0);
        if (result < 0) {
                pr_err("Error connecting to the multicast addr\n");
                goto error;
        }

        return 0;

error:
        return result;
}
1527
1528
/*
 *      Set up receiving multicast socket over UDP
 *
 *      Creates the per-thread (@id) backup socket, enables address
 *      reuse, binds it to the multicast address on @dev and joins the
 *      multicast group (IPv4 or IPv6 per bcfg).  On success *sock_ret
 *      holds the socket and 0 is returned; on failure a negative errno
 *      is returned (the caller releases *sock_ret, which is set as
 *      soon as the socket exists).
 */
static int make_receive_sock(struct netns_ipvs *ipvs, int id,
                             struct net_device *dev, struct socket **sock_ret)
{
        /* multicast addr */
        union ipvs_sockaddr mcast_addr;
        struct socket *sock;
        int result, salen;

        /* First create a socket */
        result = sock_create_kern(ipvs->net, ipvs->bcfg.mcast_af, SOCK_DGRAM,
                                  IPPROTO_UDP, &sock);
        if (result < 0) {
                pr_err("Error during creation of socket; terminating\n");
                goto error;
        }
        *sock_ret = sock;
        /* it is equivalent to the REUSEADDR option in user-space */
        sock->sk->sk_reuse = SK_CAN_REUSE;
        result = sysctl_sync_sock_size(ipvs);
        if (result > 0)
                set_sock_size(sock->sk, 0, result);

        get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id);
        sock->sk->sk_bound_dev_if = dev->ifindex;
        result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen);
        if (result < 0) {
                pr_err("Error binding to the multicast addr\n");
                goto error;
        }

        /* join the multicast group */
#ifdef CONFIG_IP_VS_IPV6
        if (ipvs->bcfg.mcast_af == AF_INET6)
                result = join_mcast_group6(sock->sk, &mcast_addr.in6.sin6_addr,
                                           dev);
        else
#endif
                result = join_mcast_group(sock->sk, &mcast_addr.in.sin_addr,
                                          dev);
        if (result < 0) {
                pr_err("Error joining to the multicast group\n");
                goto error;
        }

        return 0;

error:
        return result;
}
1581
1582
1583 static int
1584 ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
1585 {
1586         struct msghdr   msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
1587         struct kvec     iov;
1588         int             len;
1589
1590         EnterFunction(7);
1591         iov.iov_base     = (void *)buffer;
1592         iov.iov_len      = length;
1593
1594         len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
1595
1596         LeaveFunction(7);
1597         return len;
1598 }
1599
1600 static int
1601 ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
1602 {
1603         int msize;
1604         int ret;
1605
1606         msize = ntohs(msg->size);
1607
1608         ret = ip_vs_send_async(sock, (char *)msg, msize);
1609         if (ret >= 0 || ret == -EAGAIN)
1610                 return ret;
1611         pr_err("ip_vs_send_async error %d\n", ret);
1612         return 0;
1613 }
1614
1615 static int
1616 ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
1617 {
1618         struct msghdr           msg = {NULL,};
1619         struct kvec             iov = {buffer, buflen};
1620         int                     len;
1621
1622         EnterFunction(7);
1623
1624         /* Receive a packet */
1625         iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, buflen);
1626         len = sock_recvmsg(sock, &msg, MSG_DONTWAIT);
1627         if (len < 0)
1628                 return len;
1629
1630         LeaveFunction(7);
1631         return len;
1632 }
1633
1634 /* Wakeup the master thread for sending */
1635 static void master_wakeup_work_handler(struct work_struct *work)
1636 {
1637         struct ipvs_master_sync_state *ms =
1638                 container_of(work, struct ipvs_master_sync_state,
1639                              master_wakeup_work.work);
1640         struct netns_ipvs *ipvs = ms->ipvs;
1641
1642         spin_lock_bh(&ipvs->sync_lock);
1643         if (ms->sync_queue_len &&
1644             ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) {
1645                 ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE;
1646                 wake_up_process(ms->master_thread);
1647         }
1648         spin_unlock_bh(&ipvs->sync_lock);
1649 }
1650
1651 /* Get next buffer to send */
1652 static inline struct ip_vs_sync_buff *
1653 next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)
1654 {
1655         struct ip_vs_sync_buff *sb;
1656
1657         sb = sb_dequeue(ipvs, ms);
1658         if (sb)
1659                 return sb;
1660         /* Do not delay entries in buffer for more than 2 seconds */
1661         return get_curr_sync_buff(ipvs, ms, IPVS_SYNC_FLUSH_TIME);
1662 }
1663
/* Master sync kthread: dequeue sync buffers and multicast them to the
 * backups.  Runs until kthread_stop(); on exit it drains and frees all
 * pending buffers and releases the sending socket.  tinfo is owned by
 * this thread and freed here.
 */
static int sync_thread_master(void *data)
{
	struct ip_vs_sync_thread_data *tinfo = data;
	struct netns_ipvs *ipvs = tinfo->ipvs;
	struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id];
	struct sock *sk = tinfo->sock->sk;
	struct ip_vs_sync_buff *sb;

	pr_info("sync thread started: state = MASTER, mcast_ifn = %s, "
		"syncid = %d, id = %d\n",
		ipvs->mcfg.mcast_ifn, ipvs->mcfg.syncid, tinfo->id);

	for (;;) {
		sb = next_sync_buff(ipvs, ms);
		if (unlikely(kthread_should_stop()))
			break;
		if (!sb) {
			/* Nothing ready yet; nap and poll again */
			schedule_timeout(IPVS_SYNC_CHECK_PERIOD);
			continue;
		}
		/* Retry until the whole message went out (or we are told
		 * to stop); ip_vs_send_sync_msg() returns < 0 only for
		 * -EAGAIN, i.e. the socket send buffer is full.
		 */
		while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) {
			/* (Ab)use interruptible sleep to avoid increasing
			 * the load avg.
			 */
			__wait_event_interruptible(*sk_sleep(sk),
						   sock_writeable(sk) ||
						   kthread_should_stop());
			if (unlikely(kthread_should_stop()))
				goto done;
		}
		ip_vs_sync_buff_release(sb);
	}

done:
	__set_current_state(TASK_RUNNING);
	/* sb may still hold an unsent buffer when we broke out early */
	if (sb)
		ip_vs_sync_buff_release(sb);

	/* clean up the sync_buff queue */
	while ((sb = sb_dequeue(ipvs, ms)))
		ip_vs_sync_buff_release(sb);
	__set_current_state(TASK_RUNNING);

	/* clean up the current sync_buff */
	sb = get_curr_sync_buff(ipvs, ms, 0);
	if (sb)
		ip_vs_sync_buff_release(sb);

	/* release the sending multicast socket */
	sock_release(tinfo->sock);
	kfree(tinfo);

	return 0;
}
1718
1719
/* Backup sync kthread: wait for datagrams on the receiving multicast
 * socket and feed each one to ip_vs_process_message().  Runs until
 * kthread_stop(); tinfo (including its receive buffer) is owned by this
 * thread and freed here.
 */
static int sync_thread_backup(void *data)
{
	struct ip_vs_sync_thread_data *tinfo = data;
	struct netns_ipvs *ipvs = tinfo->ipvs;
	int len;

	pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
		"syncid = %d, id = %d\n",
		ipvs->bcfg.mcast_ifn, ipvs->bcfg.syncid, tinfo->id);

	while (!kthread_should_stop()) {
		/* Sleep until a packet arrives or we are asked to stop */
		wait_event_interruptible(*sk_sleep(tinfo->sock->sk),
			 !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue)
			 || kthread_should_stop());

		/* do we have data now? */
		while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
			len = ip_vs_receive(tinfo->sock, tinfo->buf,
					ipvs->bcfg.sync_maxlen);
			if (len <= 0) {
				/* -EAGAIN just means the queue drained
				 * between the check and the read
				 */
				if (len != -EAGAIN)
					pr_err("receiving message error\n");
				break;
			}

			ip_vs_process_message(ipvs, tinfo->buf, len);
		}
	}

	/* release the receiving multicast socket */
	sock_release(tinfo->sock);
	kfree(tinfo->buf);
	kfree(tinfo);

	return 0;
}
1756
1757
/* Start the master or backup sync daemon for this netns.
 *
 * ipvs:  per-netns IPVS state
 * c:     daemon configuration (interface, mcast group/port/ttl, ...);
 *        unset fields are filled in with defaults here, so *c is updated
 * state: IP_VS_STATE_MASTER or IP_VS_STATE_BACKUP
 *
 * Spawns one kthread (and one socket) per sync "port"; the thread count
 * is fixed by the first daemon started (sysctl_sync_ports) and reused
 * afterwards.  Returns 0 on success or a negative errno.
 */
int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
		      int state)
{
	struct ip_vs_sync_thread_data *tinfo = NULL;
	struct task_struct **array = NULL, *task;
	struct net_device *dev;
	char *name;
	int (*threadfn)(void *data);
	int id = 0, count, hlen;
	int result = -ENOMEM;
	u16 mtu, min_mtu;

	IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
	IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %zd bytes\n",
		  sizeof(struct ip_vs_sync_conn_v0));

	/* Do not hold one mutex and then to block on another */
	/* Acquire BOTH rtnl_lock and sync_mutex without a fixed order by
	 * trylocking the second one and retrying, to avoid an ABBA
	 * deadlock with paths that take them in the other order.
	 */
	for (;;) {
		rtnl_lock();
		if (mutex_trylock(&ipvs->sync_mutex))
			break;
		rtnl_unlock();
		mutex_lock(&ipvs->sync_mutex);
		if (rtnl_trylock())
			break;
		mutex_unlock(&ipvs->sync_mutex);
	}

	/* Number of threads is decided once, by the first daemon started */
	if (!ipvs->sync_state) {
		count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX);
		ipvs->threads_mask = count - 1;
	} else
		count = ipvs->threads_mask + 1;

	/* Fill in configuration defaults */
	if (c->mcast_af == AF_UNSPEC) {
		c->mcast_af = AF_INET;
		c->mcast_group.ip = cpu_to_be32(IP_VS_SYNC_GROUP);
	}
	if (!c->mcast_port)
		c->mcast_port = IP_VS_SYNC_PORT;
	if (!c->mcast_ttl)
		c->mcast_ttl = 1;

	dev = __dev_get_by_name(ipvs->net, c->mcast_ifn);
	if (!dev) {
		pr_err("Unknown mcast interface: %s\n", c->mcast_ifn);
		result = -ENODEV;
		goto out_early;
	}
	/* Derive the sync message size limit from the device MTU minus the
	 * IP(v4/v6)+UDP header overhead, unless explicitly configured.
	 */
	hlen = (AF_INET6 == c->mcast_af) ?
	       sizeof(struct ipv6hdr) + sizeof(struct udphdr) :
	       sizeof(struct iphdr) + sizeof(struct udphdr);
	mtu = (state == IP_VS_STATE_BACKUP) ?
		  clamp(dev->mtu, 1500U, 65535U) : 1500U;
	min_mtu = (state == IP_VS_STATE_BACKUP) ? 1024 : 1;

	if (c->sync_maxlen)
		c->sync_maxlen = clamp_t(unsigned int,
					 c->sync_maxlen, min_mtu,
					 65535 - hlen);
	else
		c->sync_maxlen = mtu - hlen;

	/* Refuse to start a second daemon of the same kind */
	if (state == IP_VS_STATE_MASTER) {
		result = -EEXIST;
		if (ipvs->ms)
			goto out_early;

		ipvs->mcfg = *c;
		name = "ipvs-m:%d:%d";
		threadfn = sync_thread_master;
	} else if (state == IP_VS_STATE_BACKUP) {
		result = -EEXIST;
		if (ipvs->backup_threads)
			goto out_early;

		ipvs->bcfg = *c;
		name = "ipvs-b:%d:%d";
		threadfn = sync_thread_backup;
	} else {
		result = -EINVAL;
		goto out_early;
	}

	/* Per-thread bookkeeping: master keeps ipvs->ms[], backup keeps an
	 * array of task pointers so the threads can be stopped later.
	 */
	if (state == IP_VS_STATE_MASTER) {
		struct ipvs_master_sync_state *ms;

		result = -ENOMEM;
		ipvs->ms = kcalloc(count, sizeof(ipvs->ms[0]), GFP_KERNEL);
		if (!ipvs->ms)
			goto out;
		ms = ipvs->ms;
		for (id = 0; id < count; id++, ms++) {
			INIT_LIST_HEAD(&ms->sync_queue);
			ms->sync_queue_len = 0;
			ms->sync_queue_delay = 0;
			INIT_DELAYED_WORK(&ms->master_wakeup_work,
					  master_wakeup_work_handler);
			ms->ipvs = ipvs;
		}
	} else {
		array = kcalloc(count, sizeof(struct task_struct *),
				GFP_KERNEL);
		result = -ENOMEM;
		if (!array)
			goto out;
	}

	/* Create one socket + kthread per sync port; on any failure 'id'
	 * tells the error path how many threads were already started.
	 */
	for (id = 0; id < count; id++) {
		result = -ENOMEM;
		tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
		if (!tinfo)
			goto out;
		tinfo->ipvs = ipvs;
		tinfo->sock = NULL;
		if (state == IP_VS_STATE_BACKUP) {
			tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen,
					     GFP_KERNEL);
			if (!tinfo->buf)
				goto out;
		} else {
			tinfo->buf = NULL;
		}
		tinfo->id = id;
		if (state == IP_VS_STATE_MASTER)
			result = make_send_sock(ipvs, id, dev, &tinfo->sock);
		else
			result = make_receive_sock(ipvs, id, dev, &tinfo->sock);
		if (result < 0)
			goto out;

		task = kthread_run(threadfn, tinfo, name, ipvs->gen, id);
		if (IS_ERR(task)) {
			result = PTR_ERR(task);
			goto out;
		}
		/* Ownership of tinfo passed to the thread */
		tinfo = NULL;
		if (state == IP_VS_STATE_MASTER)
			ipvs->ms[id].master_thread = task;
		else
			array[id] = task;
	}

	/* mark as active */

	if (state == IP_VS_STATE_BACKUP)
		ipvs->backup_threads = array;
	spin_lock_bh(&ipvs->sync_buff_lock);
	ipvs->sync_state |= state;
	spin_unlock_bh(&ipvs->sync_buff_lock);

	mutex_unlock(&ipvs->sync_mutex);
	rtnl_unlock();

	/* increase the module use count */
	ip_vs_use_count_inc();

	return 0;

out:
	/* We do not need RTNL lock anymore, release it here so that
	 * sock_release below and in the kthreads can use rtnl_lock
	 * to leave the mcast group.
	 */
	rtnl_unlock();
	/* Stop the threads that were already started */
	count = id;
	while (count-- > 0) {
		if (state == IP_VS_STATE_MASTER)
			kthread_stop(ipvs->ms[count].master_thread);
		else
			kthread_stop(array[count]);
	}
	/* ipvs->ms was allocated above (sync_state bit not yet set), or a
	 * master daemon may be running independently — free only if not.
	 */
	if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
		kfree(ipvs->ms);
		ipvs->ms = NULL;
	}
	mutex_unlock(&ipvs->sync_mutex);
	/* tinfo is non-NULL only when the current iteration failed before
	 * handing it to a thread.
	 */
	if (tinfo) {
		if (tinfo->sock)
			sock_release(tinfo->sock);
		kfree(tinfo->buf);
		kfree(tinfo);
	}
	kfree(array);
	return result;

out_early:
	mutex_unlock(&ipvs->sync_mutex);
	rtnl_unlock();
	return result;
}
1949
1950
/* Stop the master or backup sync daemon for this netns.
 *
 * Caller is expected to hold ipvs->sync_mutex (see ip_vs_sync_net_cleanup,
 * which takes it around these calls).  Returns 0 on success, -ESRCH when
 * no such daemon is running, or the first kthread_stop() error.
 */
int stop_sync_thread(struct netns_ipvs *ipvs, int state)
{
	struct task_struct **array;
	int id;
	int retc = -EINVAL;

	IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));

	if (state == IP_VS_STATE_MASTER) {
		if (!ipvs->ms)
			return -ESRCH;

		/*
		 * The lock synchronizes with sb_queue_tail(), so that we don't
		 * add sync buffers to the queue, when we are already in
		 * progress of stopping the master sync daemon.
		 */

		spin_lock_bh(&ipvs->sync_buff_lock);
		spin_lock(&ipvs->sync_lock);
		ipvs->sync_state &= ~IP_VS_STATE_MASTER;
		spin_unlock(&ipvs->sync_lock);
		spin_unlock_bh(&ipvs->sync_buff_lock);

		retc = 0;
		for (id = ipvs->threads_mask; id >= 0; id--) {
			struct ipvs_master_sync_state *ms = &ipvs->ms[id];
			int ret;

			pr_info("stopping master sync thread %d ...\n",
				task_pid_nr(ms->master_thread));
			/* Flush the wakeup work before stopping its target */
			cancel_delayed_work_sync(&ms->master_wakeup_work);
			ret = kthread_stop(ms->master_thread);
			/* Keep the first error, but stop all threads anyway */
			if (retc >= 0)
				retc = ret;
		}
		kfree(ipvs->ms);
		ipvs->ms = NULL;
	} else if (state == IP_VS_STATE_BACKUP) {
		if (!ipvs->backup_threads)
			return -ESRCH;

		ipvs->sync_state &= ~IP_VS_STATE_BACKUP;
		array = ipvs->backup_threads;
		retc = 0;
		for (id = ipvs->threads_mask; id >= 0; id--) {
			int ret;

			pr_info("stopping backup sync thread %d ...\n",
				task_pid_nr(array[id]));
			ret = kthread_stop(array[id]);
			if (retc >= 0)
				retc = ret;
		}
		kfree(array);
		ipvs->backup_threads = NULL;
	}

	/* decrease the module use count */
	ip_vs_use_count_dec();

	return retc;
}
2014
2015 /*
2016  * Initialize data struct for each netns
2017  */
2018 int __net_init ip_vs_sync_net_init(struct netns_ipvs *ipvs)
2019 {
2020         __mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key);
2021         spin_lock_init(&ipvs->sync_lock);
2022         spin_lock_init(&ipvs->sync_buff_lock);
2023         return 0;
2024 }
2025
2026 void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs)
2027 {
2028         int retc;
2029
2030         mutex_lock(&ipvs->sync_mutex);
2031         retc = stop_sync_thread(ipvs, IP_VS_STATE_MASTER);
2032         if (retc && retc != -ESRCH)
2033                 pr_err("Failed to stop Master Daemon\n");
2034
2035         retc = stop_sync_thread(ipvs, IP_VS_STATE_BACKUP);
2036         if (retc && retc != -ESRCH)
2037                 pr_err("Failed to stop Backup Daemon\n");
2038         mutex_unlock(&ipvs->sync_mutex);
2039 }