1 /* This file is part of the Project Athena Zephyr Notification System.
2 * It contains functions for communication with other servers.
4 * Created by: John T. Kohl
6 * $Source: /afs/dev.mit.edu/source/repository/athena/lib/zephyr/server/server.c,v $
9 * Copyright (c) 1987, 1991 by the Massachusetts Institute of Technology.
10 * For copying and distribution information, see the file
14 #include <zephyr/mit-copyright.h>
16 #include <sys/socket.h>
20 static const char rcsid_server_c[] = "$Id$";
24 #define SRV_NACKTAB_HASHSIZE 1023
25 #define SRV_NACKTAB_HASHVAL(which, uid) (((which) ^ (uid).zuid_addr.s_addr ^ \
26 (uid).tv.tv_sec ^ (uid).tv.tv_usec) \
27 % SRV_NACKTAB_HASHSIZE)
29 * Server manager. Deal with traffic to and from other servers.
33 * void server_shutdown()
35 * void server_timo(which)
38 * void server_dispatch(notice, auth, who)
41 * struct sockaddr_in *who;
43 * void server_recover(client)
46 * void server_adispatch(notice, auth, who, server)
49 * struct sockaddr_in *who;
52 * void server_forward(notice, auth, who)
55 * struct sockaddr_in *who;
57 * Server *server_which_server(who)
58 * struct sockaddr_in *who;
60 * void server_kill_clt(client);
63 * void server_dump_servers(fp);
66 * void server_reset();
69 static void server_flush __P((Server *));
70 static void hello_respond __P((struct sockaddr_in *, int, int));
71 static void srv_responded __P((struct sockaddr_in *));
72 static void send_msg __P((struct sockaddr_in *, char *, int));
73 static void send_msg_list __P((struct sockaddr_in *, char *, char **, int,
75 static void srv_nack_cancel __P((ZNotice_t *, struct sockaddr_in *));
76 static void srv_nack_release __P((Server *));
77 static void srv_nack_renumber __P((int *));
78 static void send_stats __P((struct sockaddr_in *));
79 static void server_queue __P((Server *, int, void *, int,
80 struct sockaddr_in *));
81 static void server_hello __P((Server *, int));
82 static void setup_server __P((Server *, struct in_addr *));
83 static void srv_rexmit __P((void *));
84 static void server_forw_reliable __P((Server *, caddr_t, int, ZNotice_t *));
85 static Code_t admin_dispatch __P((ZNotice_t *, int, struct sockaddr_in *,
87 static Code_t kill_clt __P((ZNotice_t *, Server *));
88 static Code_t extract_addr __P((ZNotice_t *, struct sockaddr_in *));
91 static Code_t server_register();
94 static struct in_addr *get_server_addrs __P((int *number));
95 static char **get_server_list __P((char *file));
96 static char **get_single_server __P((void));
97 static void free_server_list __P((char **list));
99 static Unacked *srv_nacktab[SRV_NACKTAB_HASHSIZE];
100 Server *otherservers; /* points to an array of the known
102 int nservers; /* number of other servers */
103 int me_server_idx; /* # of my entry in the array */
105 #define ADJUST (1) /* adjust timeout on hello input */
106 #define DONT_ADJUST (0) /* don't adjust timeout */
108 /* parameters controlling the transitions of the FSM's--patchable with adb */
109 long timo_up = TIMO_UP;
110 long timo_tardy = TIMO_TARDY;
111 long timo_dead = TIMO_DEAD;
113 /* counters to measure old protocol use */
115 int old_compat_count_uloc = 0;
116 int old_compat_count_ulocate = 0;
117 int old_compat_count_subscr = 0;
118 #endif /* OLD_COMPAT */
120 int new_compat_count_uloc = 0;
121 int new_compat_count_subscr = 0;
122 #endif /* NEW_COMPAT */
128 * Initialize the array of servers. The `limbo' server goes in the first
129 * slot (otherservers[0]).
130 * Contact Hesiod to find all the other servers, allocate space for the
131 * structure, initialize them all to SERV_DEAD with expired timeouts.
132 * Set up a list header for server_forward retransmits.
139 struct in_addr *serv_addr, *server_addrs, limbo_addr;
141 /* we don't need to mask SIGFPE here since when we are called,
142 the signal handler isn't set up yet. */
144 /* talk to hesiod here, set nservers */
145 server_addrs = get_server_addrs(&nservers);
147 syslog(LOG_ERR, "No servers?!?");
156 /* increment servers to make room for 'limbo' */
159 otherservers = (Server *) malloc(nservers * sizeof(Server));
163 limbo_addr.s_addr = 0;
164 setup_server(otherservers, &limbo_addr);
165 timer_reset(otherservers[0].timer);
166 otherservers[0].timer = NULL;
167 otherservers[0].queue = NULL;
168 otherservers[0].dumping = 0;
170 for (serv_addr = server_addrs, i = 1; i < nservers; serv_addr++, i++) {
171 setup_server(&otherservers[i], serv_addr);
173 if (serv_addr->s_addr == my_addr.s_addr) {
175 otherservers[i].state = SERV_UP;
176 timer_reset(otherservers[i].timer);
177 otherservers[i].timer = NULL;
178 otherservers[i].queue = NULL;
179 otherservers[i].dumping = 0;
181 zdbug((LOG_DEBUG,"found myself"));
186 /* free up the addresses */
189 if (me_server_idx == -1) {
190 syslog(LOG_WARNING, "I'm a renegade server!");
191 otherservers = (Server *) realloc(otherservers,
192 ++nservers * sizeof(Server));
194 syslog(LOG_CRIT, "renegade realloc");
197 setup_server(&otherservers[nservers - 1], &my_addr);
199 otherservers[nservers - 1].state = SERV_UP;
201 /* I don't send hello's to myself--cancel the timer */
202 timer_reset(otherservers[nservers - 1].timer);
203 otherservers[nservers - 1].timer = NULL;
205 /* cancel and reschedule all the timers--pointers need
207 /* don't reschedule limbo's timer, so start i=1 */
208 for (i = 1; i < nservers - 1; i++) {
209 timer_reset(otherservers[i].timer);
210 /* all the HELLO's are due now */
211 otherservers[i].timer = timer_set_rel(0L, server_timo,
214 me_server_idx = nservers - 1;
220 * server_reset: re-initializes otherservers array by refreshing from Hesiod
223 * If any server is no longer named in the new list, and that server is in
224 * state SERV_DEAD, it is dropped from the server list.
225 * All other currently-known servers are retained.
226 * Any additional servers not previously known are added to the table.
228 * WARNING: Don't call this routine if any of the ancestor procedures have a
229 * handle on a particular server other than by indexing on otherservers[].
235 struct in_addr *server_addrs;
236 struct in_addr *serv_addr;
239 int *ok_list_new, *ok_list_old;
243 zdbug((LOG_DEBUG, "server_reset"));
247 syslog(LOG_INFO, "server_reset while alone, punt");
252 /* Find out what servers are supposed to be known. */
253 server_addrs = get_server_addrs(&num_servers);
255 syslog(LOG_ERR, "server_reset no servers. nothing done.");
258 ok_list_new = (int *) malloc(num_servers * sizeof(int));
260 syslog(LOG_ERR, "server_reset no mem new");
263 ok_list_old = (int *) malloc(nservers * sizeof(int));
265 syslog(LOG_ERR, "server_reset no mem old");
270 memset(ok_list_old, 0, nservers * sizeof(int));
271 memset(ok_list_new, 0, num_servers * sizeof(int));
273 /* reset timers--pointers will move */
274 for (j = 1; j < nservers; j++) { /* skip limbo */
275 if (j == me_server_idx)
277 timer_reset(otherservers[j].timer);
278 otherservers[j].timer = NULL;
281 /* check off entries on new list which are on old list.
282 check off entries on old list which are on new list. */
284 /* count limbo as "OK" */
286 ok_list_old[0] = 1; /* limbo is OK */
288 for (serv_addr = server_addrs, i = 0; i < num_servers; serv_addr++, i++) {
289 for (j = 1; j < nservers; j++) { /* j = 1 since we skip limbo */
290 if (otherservers[j].addr.sin_addr.s_addr == serv_addr->s_addr) {
291 /* if server is on both lists, mark */
295 break; /* for j loop */
300 /* remove any dead servers on old list not on new list. */
301 if (num_ok < nservers) {
304 new_num = 1; /* limbo */
305 /* count number of servers to keep */
306 for (j = 1; j < nservers; j++) {
307 /* since we are never SERV_DEAD, the following
308 test prevents removing ourself from the list */
309 if (ok_list_old[j] || (otherservers[j].state != SERV_DEAD)) {
310 syslog(LOG_INFO, "keeping server %s",
311 otherservers[j].addr_str);
315 if (new_num < nservers) {
316 servers = (Server *) malloc(new_num * sizeof(Server));
318 syslog(LOG_CRIT, "server_reset server malloc");
322 servers[0] = otherservers[0]; /* copy limbo */
324 srv = (int *) malloc(nservers * sizeof(int));
325 memset(srv, 0, nservers * sizeof(int));
327 /* copy the kept servers */
328 for (j = 1; j < nservers; j++) { /* skip limbo */
329 if (ok_list_old[j] ||
330 otherservers[j].state != SERV_DEAD) {
331 servers[i] = otherservers[j];
335 syslog(LOG_INFO, "flushing server %s",
336 otherservers[j].addr_str);
337 server_flush(&otherservers[j]);
342 srv_nack_renumber(srv);
346 otherservers = servers;
351 /* add any new servers on new list not on old list. */
353 for (i = 0; i < num_servers; i++) {
358 /* new_num is number of extras. */
360 otherservers = (Server *) realloc(otherservers, nservers * sizeof(Server));
362 syslog(LOG_CRIT, "server_reset realloc");
367 for (j = 1; j < nservers - new_num; j++) {
368 if (otherservers[j].addr.sin_addr.s_addr == my_addr.s_addr) {
373 if (!me_server_idx) {
374 syslog(LOG_CRIT, "can't find myself");
378 /* fill in otherservers with the new servers */
379 for (i = 0; i < num_servers; i++) {
380 if (!ok_list_new[i]) {
381 setup_server(&otherservers[nservers - (new_num--)],
383 syslog(LOG_INFO, "adding server %s", inet_ntoa(server_addrs[i]));
388 /* reset timers, to go off now.
389 We can't get a time-left indication (bleagh!)
390 so we expire them all now. This will generally
391 be non-destructive. We assume that when this code is
392 entered via a SIGHUP trigger that a system wizard
393 is watching the goings-on to make sure things straighten
396 for (i = 1; i < nservers; i++) { /* skip limbo */
397 if (i != me_server_idx && !otherservers[i].timer) {
398 otherservers[i].timer =
399 timer_set_rel(0L, server_timo, &otherservers[i]);
401 zdbug((LOG_DEBUG, "reset timer for %s",
402 otherservers[i].addr_str));
410 zdbug((LOG_DEBUG, "server_reset: %d servers now", nservers));
414 /* note: these must match the order given in zserver.h */
431 * A server timeout has expired. If enough hello's have been unanswered,
432 * change state and act accordingly. Send a "hello" and reset the timer,
433 * incrementing the number of hello's sent.
435 * See the FSM in the Zephyr document for a better picture of what's
443 Server *which = (Server *) arg;
447 zdbug((LOG_DEBUG,"srv_timo: %s", which->addr_str));
449 /* change state and reset if appropriate */
450 switch(which->state) {
451 case SERV_DEAD: /* leave him dead */
455 case SERV_UP: /* he's now tardy */
456 which->state = SERV_TARDY;
457 which->num_hello_sent = 0;
458 which->timeout = timo_tardy;
463 if (which->num_hello_sent >= ((which->state == SERV_TARDY) ?
466 /* he hasn't answered, assume DEAD */
467 which->state = SERV_DEAD;
468 which->num_hello_sent = 0;
469 which->timeout = timo_dead;
470 srv_nack_release(which);
475 syslog(LOG_ERR,"Bad server state, server 0x%x\n",which);
478 /* now he's either TARDY, STARTING, or DEAD
479 We send a "hello," which increments the counter */
481 zdbug((LOG_DEBUG, "srv %s is %s", which->addr_str,
482 srv_states[which->state]));
484 server_hello(which, auth);
485 /* reschedule the timer */
486 which->timer = timer_set_rel(which->timeout, server_timo, which);
490 * Dispatch a notice from some other server
/* server_dispatch: handle a notice that arrived from another server.
   A SERVACK goes to srv_nack_cancel(); any other notice is routed to
   realm_dispatch() or to the admin, control, ulogin or ulocate handler
   selected by the notice class.
   notice - the received notice
   auth   - passed through unchanged to the handlers
   who    - address the packet was received from */
495 server_dispatch(notice, auth, who)
498 struct sockaddr_in *who;
501 struct sockaddr_in newwho;
503 String *notice_class;
506 zdbug((LOG_DEBUG, "server_dispatch"));
/* an ack from a peer only cancels the matching not-yet-acked entry */
509 if (notice->z_kind == SERVACK) {
510 srv_nack_cancel(notice, who);
514 /* set up a who for the real origin */
515 memset(&newwho, 0, sizeof(newwho));
516 newwho.sin_family = AF_INET;
517 newwho.sin_addr.s_addr = notice->z_sender_addr.s_addr;
518 newwho.sin_port = notice->z_port;
/* server identifies the forwarding peer; newwho identifies the sender
   recorded inside the notice */
520 server = server_which_server(who);
522 /* we can dispatch to routines safely here, since they will
523 return ZSRV_REQUEUE if appropriate. We bounce this back
524 to the caller, and the caller will re-queue the message
525 for us to process later. */
527 notice_class = make_string(notice->z_class, 1);
529 if (realm_which_realm(&newwho))
530 status = realm_dispatch(notice, auth, &newwho, server);
531 else if (class_is_admin(notice_class)) {
532 /* admins don't get acked, else we get a packet loop */
533 /* will return requeue if bdump request and dumping */
/* NOTE(review): this return does not reach the free_string(notice_class)
   call at the end of the routine -- confirm notice_class is released on
   this path.  Note also that the admin handler gets who, not newwho. */
535 return admin_dispatch(notice, auth, who, server);
536 } else if (class_is_control(notice_class)) {
537 status = control_dispatch(notice, auth, &newwho, server);
539 } else if (class_is_ulogin(notice_class)) {
540 status = ulogin_dispatch(notice, auth, &newwho, server);
542 } else if (class_is_ulocate(notice_class)) {
543 status = ulocate_dispatch(notice, auth, &newwho, server);
546 /* shouldn't come from another server */
547 syslog(LOG_WARNING, "srv_disp: pkt cls %s", notice->z_class);
548 status = ZERR_NONE; /* XXX */
/* a requeued notice is not acked; it will be processed (and acked) later */
550 if (status != ZSRV_REQUEUE)
551 ack(notice, who); /* acknowledge it if processed */
552 free_string(notice_class);
558 * Register a new server (one not in our list). This MUST be authenticated.
/* server_register: add the server at who to the otherservers table.
   A sender whose port differs from srv_addr.sin_port is logged as
   srv_wrong port (presumably rejected -- confirm).  Otherwise a table one
   entry larger is allocated, the old entries are copied, the timers of the
   existing servers are re-armed, and the new last entry is set up in state
   SERV_STARTING with the timo_tardy timeout. */
563 server_register(notice, auth, who)
566 struct sockaddr_in *who;
572 if (who->sin_port != srv_addr.sin_port) {
574 zdbug((LOG_DEBUG, "srv_wrong port %d", ntohs(who->sin_port)));
578 /* Not yet... talk to ken about authenticators */
582 zdbug((LOG_DEBUG, "srv_unauth"));
587 /* OK, go ahead and set him up. */
588 temp = (Server *) malloc((nservers + 1) * sizeof(Server));
590 syslog(LOG_CRIT, "srv_reg malloc");
594 memcpy(temp, otherservers, nservers * sizeof(Server));
/* NOTE(review): the loop below indexes otherservers, so otherservers
   presumably already points at temp by this point -- confirm */
597 /* don't reschedule limbo's timer, so start i=1 */
598 for (i = 1; i < nservers; i++) {
599 if (i == me_server_idx) /* don't reset myself */
601 /* reschedule the timers--we moved otherservers */
/* keep each timer's original expiry: read it, cancel, re-arm at that time */
602 timerval = timer_when(otherservers[i].timer);
603 timer_reset(otherservers[i].timer);
604 otherservers[i].timer = timer_set_abs(timerval, server_timo,
/* the newcomer occupies the slot just past the old entries */
607 setup_server(&otherservers[nservers], &who->sin_addr);
608 otherservers[nservers].state = SERV_STARTING;
609 otherservers[nservers].timeout = timo_tardy;
/* NOTE(review): this field is named update_queue, while setup_server and
   server_queue use a field named queue -- confirm both exist in Server */
610 otherservers[nservers].update_queue = NULL;
611 otherservers[nservers].dumping = 0;
615 zdbug((LOG_DEBUG, "srv %s is %s", otherservers[nservers].addr_str,
616 srv_states[otherservers[nservers].state]));
624 * Tell the other servers that this client died.
/* server_kill_clt: tell the other servers that client has died.
   Builds an ACKED ZEPHYR_ADMIN_CLASS notice with opcode ADMIN_KILL_CLT and
   a two-field body: lyst[0] is the client address in dotted form and buf
   holds the client port in decimal (presumably referenced by lyst[1]).
   The notice is formatted once per server and handed to
   server_forw_reliable(), starting at index 1 so limbo is never told. */
628 server_kill_clt(client)
632 char buf[512], *lyst[2];
634 ZNotice_t *pnotice; /* speed hack */
/* two separate statements: address string first, then the port text */
639 lyst[0] = inet_ntoa(client->addr.sin_addr);
640 sprintf(buf, "%d", ntohs(client->addr.sin_port));
644 zdbug((LOG_DEBUG, "server kill clt %s/%s", lyst[0], lyst[1]));
/* start from an all-zero notice */
649 memset (&notice, 0, sizeof(notice));
651 pnotice->z_kind = ACKED;
653 pnotice->z_port = srv_addr.sin_port;
654 pnotice->z_class = ZEPHYR_ADMIN_CLASS;
655 pnotice->z_class_inst = "";
656 pnotice->z_opcode = ADMIN_KILL_CLT;
657 pnotice->z_sender = myname; /* myname is the hostname */
658 pnotice->z_recipient = "";
659 pnotice->z_default_format = "";
660 pnotice->z_num_other_fields = 0;
665 /* don't tell limbo to flush, start at 1*/
666 for (i = 1; i < nservers; i++) {
667 if (i == me_server_idx) /* don't xmit to myself */
/* servers in SERV_DEAD are tested for here (presumably skipped) */
669 if (otherservers[i].state == SERV_DEAD)
/* a fresh packet per destination: server_forw_reliable keeps the buffer
   in its not-yet-acked entry */
672 retval = ZFormatNoticeList(pnotice, lyst, 2, &pack, &packlen,
673 auth ? ZAUTH : ZNOAUTH);
674 if (retval != ZERR_NONE) {
675 syslog(LOG_WARNING, "kill_clt format: %s", error_message(retval));
678 server_forw_reliable(&otherservers[i], pack, packlen, pnotice);
683 * A client has died. remove it
/* kill_clt: act on an ADMIN_KILL_CLT notice from server.
   The client address and port are parsed from the notice body with
   extract_addr(), the client is looked up with client_find(), and a client
   that is found is deregistered together with its locations.
   A notice that cannot be parsed yields ZERR_NONE; so does the path that
   logs an unknown client. */
687 kill_clt(notice, server)
691 struct sockaddr_in who;
695 zdbug((LOG_DEBUG, "kill_clt"));
697 if (extract_addr(notice, &who) != ZERR_NONE)
698 return ZERR_NONE; /* XXX */
/* extract_addr() stores the parsed port in notice->z_port as well as in
   who.sin_port, so z_port here is the client port */
699 client = client_find(&who.sin_addr, notice->z_port);
701 syslog(LOG_NOTICE, "kill_clt: no such client (%s/%d) from %s",
702 inet_ntoa(who.sin_addr), ntohs(who.sin_port),
704 return ZERR_NONE; /* XXX */
708 syslog(LOG_DEBUG, "kill_clt clt_dereg %s/%d from %s",
709 inet_ntoa(who.sin_addr), ntohs(who.sin_port), server->addr_str);
713 /* remove the locations, too */
714 client_deregister(client, 1);
719 * extract a sockaddr_in from a message body
/* extract_addr: fill in who from the body of notice.
   The body is read as two consecutive NUL-terminated strings: an address
   accepted by inet_addr(), then a decimal port.  The port is stored in
   network byte order in both who->sin_port and notice->z_port.
   An empty body is logged as bad addr pkt and a body with no second string
   as short addr pkt (presumably both return an error -- confirm). */
723 extract_addr(notice, who)
725 struct sockaddr_in *who;
727 char *cp = notice->z_message;
729 if (!notice->z_message_len) {
730 syslog(LOG_WARNING, "bad addr pkt");
733 who->sin_addr.s_addr = inet_addr(notice->z_message);
/* step over the address string and its terminating NUL.
   NOTE(review): strlen runs before the length check below, so this relies
   on a NUL being present inside the message body -- confirm */
735 cp += strlen(cp) + 1;
736 if (cp >= notice->z_message + notice->z_message_len) {
737 syslog(LOG_WARNING, "short addr pkt");
/* NOTE(review): atoi gives no indication of malformed port text */
740 who->sin_port = notice->z_port = htons((u_short) atoi(cp));
741 who->sin_family = AF_INET;
743 zdbug((LOG_DEBUG,"ext %s/%d", inet_ntoa(who->sin_addr),
744 ntohs(who->sin_port)));
750 * Flush all data associated with the server which
759 syslog(LOG_DEBUG, "server_flush %s", which->addr_str);
761 srv_nack_release(which);
765 * send a hello to which, updating the count of hello's sent
766 * Authenticate if auth is set.
/* server_hello: send an ADMIN_HELLO to which through send_msg(), passing
   auth along, and count it in which->num_hello_sent. */
770 server_hello(which, auth)
774 send_msg(&which->addr, ADMIN_HELLO, auth);
/* counted whether or not the send succeeded: send_msg reports nothing back */
775 which->num_hello_sent++;
779 * Handle an ADMIN message from a server
784 admin_dispatch(notice, auth, who, server)
787 struct sockaddr_in *who;
790 char *opcode = notice->z_opcode;
791 Code_t status = ZERR_NONE;
794 zdbug((LOG_DEBUG, "ADMIN received"));
797 if (strcmp(opcode, ADMIN_HELLO) == 0) {
798 hello_respond(who, ADJUST, auth);
799 } else if (strcmp(opcode, ADMIN_IMHERE) == 0) {
801 } else if (strcmp(opcode, ADMIN_SHUTDOWN) == 0) {
803 zdbug((LOG_DEBUG, "server shutdown"));
806 srv_nack_release(server);
807 server->state = SERV_DEAD;
808 server->timeout = timo_dead;
809 /* don't worry about the timer, it will
810 be set appropriately on the next send */
812 zdbug((LOG_DEBUG, "srv %s is %s", server->addr_str,
813 srv_states[server->state]));
816 } else if (strcmp(opcode, ADMIN_BDUMP) == 0) {
817 /* Ignore a brain dump request if this is a brain dump packet
818 * or a packet being processed concurrently during a brain
820 if (bdumping || bdump_concurrent)
822 bdump_get(notice, auth, who, server);
823 } else if (strcmp(opcode, ADMIN_KILL_CLT) == 0) {
824 status = kill_clt(notice, server);
825 if (status == ZERR_NONE)
828 syslog(LOG_WARNING, "ADMIN unknown opcode %s",opcode);
835 * Handle an ADMIN message from some random client.
836 * For now, assume it's a registration-type message from some other
837 * previously unknown server
842 server_adispatch(notice, auth, who, server)
845 struct sockaddr_in *who;
849 /* this had better be a HELLO message--start of acquisition
850 protocol, OR a status req packet */
852 if (strcmp(notice->z_opcode, ADMIN_STATUS) == 0) {
859 syslog(LOG_INFO, "disp: new server?");
860 if (server_register(notice, auth, who) != ZERR_NONE) {
861 syslog(LOG_INFO, "new server failed");
863 syslog(LOG_INFO, "new server %s, %d", inet_ntoa(who->sin_addr),
864 ntohs(who->sin_port));
865 hello_respond(who, DONT_ADJUST, auth);
868 syslog(LOG_INFO, "srv_adisp: server attempt from %s",
869 inet_ntoa(who->sin_addr));
877 struct sockaddr_in *who;
883 char *vers, *pkts, *upt;
887 #define NUM_FIXED 3 /* 3 fixed fields, plus server info */
888 /* well, not really...but for
889 backward compatibility, we gotta
891 vers = get_version();
893 sprintf(buf, "%d pkts", npackets);
895 sprintf(buf, "%d seconds operational",NOW - uptime);
899 if (old_compat_count_uloc)
901 if (old_compat_count_ulocate)
903 if (old_compat_count_subscr)
905 #endif /* OLD_COMPAT */
907 if (new_compat_count_uloc)
909 if (new_compat_count_subscr)
911 #endif /* NEW_COMPAT */
912 extrafields += nrealms;
913 responses = (char **) malloc((NUM_FIXED + nservers + extrafields) *
919 num_resp = NUM_FIXED;
920 /* start at 1 and ignore limbo */
921 for (i = 1; i < nservers ; i++) {
922 sprintf(buf, "%s/%s%s", otherservers[i].addr_str,
923 srv_states[(int) otherservers[i].state],
924 otherservers[i].dumping ? " (DUMPING)" : "");
925 responses[num_resp++] = strsave(buf);
928 if (old_compat_count_uloc) {
929 sprintf(buf, "%d old old location requests", old_compat_count_uloc);
930 responses[num_resp++] = strsave(buf);
932 if (old_compat_count_ulocate) {
933 sprintf(buf, "%d old old loc lookup requests",
934 old_compat_count_ulocate);
935 responses[num_resp++] = strsave(buf);
937 if (old_compat_count_subscr) {
938 sprintf(buf, "%d old old subscr requests", old_compat_count_subscr);
939 responses[num_resp++] = strsave(buf);
941 #endif /* OLD_COMPAT */
943 if (new_compat_count_uloc) {
944 sprintf(buf, "%d new old location requests", new_compat_count_uloc);
945 responses[num_resp++] = strsave(buf);
947 if (new_compat_count_subscr) {
948 sprintf(buf, "%d new old subscr requests", new_compat_count_subscr);
949 responses[num_resp++] = strsave(buf);
951 #endif /* NEW_COMPAT */
952 for (realm = otherrealms, i = 0; i < nrealms ; i++, realm++) {
953 sprintf(buf, "%s(%s)/%s", realm->name,
954 inet_ntoa((realm->addrs[realm->idx]).sin_addr),
955 rlm_states[(int) realm->state]);
956 responses[num_resp++] = strsave(buf);
959 send_msg_list(who, ADMIN_STATUS, responses, num_resp, 0);
961 /* Start at one; don't try to free static version string */
962 for (i = 1; i < num_resp; i++)
968 * Get a list of server addresses.
970 * This list is retrieved from Hesiod.
972 * This list is read from a file.
974 * Return a pointer to an array of allocated storage. This storage is
975 * freed by the caller.
/* get_server_addrs: turn the list of server host names into an array of
   struct in_addr.
   The names come from get_server_list(list_file), from
   hes_resolve("zephyr","sloc") or from get_single_server(); the conditions
   choosing between them are not part of the statements below.
   number - RETURN parameter (presumably the count of addresses -- confirm)
   Each name is resolved with gethostbyname(); a name that fails to resolve
   is logged as hostname failed. */
978 static struct in_addr *
979 get_server_addrs(number)
980 int *number; /* RETURN */
983 char **server_hosts = NULL;
984 char **server_hosts_free = NULL;
986 struct in_addr *addrs;
987 struct in_addr *addr;
/* server_hosts_free remembers a list that free_server_list() must release */
990 server_hosts = get_server_list(list_file);
991 server_hosts_free = server_hosts;
/* the hes_resolve result is not recorded in server_hosts_free, so it is
   never passed to free_server_list() */
994 server_hosts = hes_resolve("zephyr","sloc");
997 server_hosts = get_single_server();
998 server_hosts_free = server_hosts;
/* first pass walks the NULL-terminated name list (presumably counting
   entries into i, which sizes the allocation below) */
1004 for (cpp = server_hosts; *cpp; cpp++)
1007 addrs = (struct in_addr *) malloc(i * sizeof(struct in_addr));
1009 /* Convert to in_addr's */
1010 for (cpp = server_hosts, addr = addrs, i = 0; *cpp; cpp++) {
1011 hp = gethostbyname(*cpp);
1013 memcpy(addr, hp->h_addr, sizeof(struct in_addr));
1016 syslog(LOG_WARNING, "hostname failed, %s", *cpp);
1020 if (server_hosts_free)
1021 free_server_list(server_hosts_free);
1025 static int nhosts = 0;
1028 * read "file" to get a list of names of hosts to peer with.
1029 * The file should contain a list of host names, one per line.
/* get_server_list: read host names, one per line, from file and return them
   as a NULL-terminated array of strsave()d strings.
   The pointer array starts with nhosts entries and is doubled whenever the
   next name plus the terminating NULL would no longer fit.  The storage is
   released by free_server_list(). */
1033 get_server_list(file)
1037 char buf[MAXHOSTNAMELEN];
1042 fp = fopen(file, "r");
1045 /* start with 16, realloc if necessary */
1047 ret_list = (char **) malloc(nhosts * sizeof(char *));
1051 while (fgets(buf, MAXHOSTNAMELEN, fp)) {
1052 /* nuke the newline, being careful not to overrun
1053 the buffer searching for it with strlen() */
1054 buf[MAXHOSTNAMELEN - 1] = '\0';
1055 newline = strchr(buf, '\n');
1059 if (nused + 1 >= nhosts) {
1060 /* get more pointer space if necessary */
1061 /* +1 to leave room for null pointer */
/* realloc takes a byte count: scale the doubled entry count by the pointer
   size, exactly as the initial malloc does */
1062 ret_list = (char **) realloc(ret_list, nhosts * 2 * sizeof(char *));
1063 nhosts = nhosts * 2;
1065 ret_list[nused++] = strsave(buf);
/* terminate the list so callers can walk it without a count */
1072 ret_list[nused] = NULL;
1079 char buf[MAXHOSTNAMELEN];
1083 ret_list = (char **) malloc(nhosts * sizeof(char *));
1086 if (gethostname(buf, sizeof(buf)) < 0) {
1090 ret_list[nused++] = strsave(buf);
1091 ret_list[nused] = NULL;
1096 * free storage allocated by get_server_list
/* free_server_list: release a host-name list built by get_server_list.
   list is a NULL-terminated array of strings.  orig_list keeps the start of
   the array while list is advanced over the entries (presumably so the
   array itself can be freed afterwards -- confirm). */
1099 free_server_list(list)
1102 char **orig_list = list;
1104 if (!nhosts) /* nothing allocated */
1106 for (; *list; list++)
1113 * initialize the server structure for address addr, and set a timer
1114 * to go off immediately to send hello's to other servers.
/* setup_server: initialize the table entry server for the host at addr.
   The entry starts in SERV_DEAD with the timo_dead timeout, no hellos
   counted, an empty queue and dumping cleared.  A zero-delay timer running
   server_timo on this entry is armed; callers that do not want it (limbo,
   our own entry) cancel it right after the call. */
1118 setup_server(server, addr)
1120 struct in_addr *addr;
1122 server->state = SERV_DEAD;
1123 server->timeout = timo_dead;
1124 server->num_hello_sent = 0;
1125 server->addr.sin_family = AF_INET;
1126 /* he listens to the same port we do */
1127 server->addr.sin_port = srv_addr.sin_port;
1128 server->addr.sin_addr = *addr;
/* keep a printable copy of the address for log messages */
1129 strcpy(server->addr_str, inet_ntoa(*addr));
/* the timer argument is the entry itself, so the timer must be re-armed
   whenever the otherservers array is moved */
1130 server->timer = timer_set_rel(0L, server_timo, server);
1131 server->queue = NULL;
1132 server->dumping = 0;
1136 * Someone sent us a hello message, respond to them.
/* hello_respond: answer a hello from who with an ADMIN_IMHERE message and,
   for a sender we had given up on, re-arm its timer with zero delay so a
   HELLO goes out to it at once.
   adj  - ADJUST or DONT_ADJUST, as passed by admin_dispatch and
          server_adispatch; how it gates the timer handling is not part of
          the statements below
   auth - passed to send_msg() */
1140 hello_respond(who, adj, auth)
1141 struct sockaddr_in *who;
1148 zdbug((LOG_DEBUG, "hello from %s", inet_ntoa(who->sin_addr)));
1151 send_msg(who, ADMIN_IMHERE, auth);
1155 /* If we think he's down, schedule an immediate HELLO. */
1157 which = server_which_server(who);
1161 switch (which->state) {
1163 /* he said hello, we thought he was dead.
1164 reschedule his hello for now. */
1165 timer_reset(which->timer);
1166 which->timer = timer_set_rel(0L, server_timo, which);
1177 * return the server descriptor for server at who
/* server_which_server: look up the otherservers entry for the host at who.
   Only a sender using our own server port (srv_addr.sin_port) is
   considered, and the match is on IP address alone.  Slot 0 (limbo) is
   never examined.  Presumably NULL is returned when nothing matches --
   callers log such packets as coming from a non-server. */
1181 server_which_server(who)
1182 struct sockaddr_in *who;
1187 if (who->sin_port != srv_addr.sin_port)
1190 /* don't check limbo */
1191 for (server = &otherservers[1], i = 1; i < nservers; i++, server++) {
1192 if (server->addr.sin_addr.s_addr == who->sin_addr.s_addr)
1199 * We received a response to a hello packet or an ack. Adjust server state
1204 struct sockaddr_in *who;
1206 Server *which = server_which_server(who);
1209 zdbug((LOG_DEBUG, "srv_responded %s", inet_ntoa(who->sin_addr)));
1213 syslog(LOG_ERR, "hello input from non-server?!");
1217 switch (which->state) {
1219 /* he responded, we thought he was dead. mark as starting
1221 which->state = SERV_STARTING;
1222 which->timeout = timo_tardy;
1223 timer_reset(which->timer);
1224 which->timer = timer_set_rel(0L, server_timo, which);
1227 /* here we negotiate and set up a braindump */
1228 if (bdump_socket < 0)
1233 which->state = SERV_UP;
1237 /* reset the timer and counts */
1238 which->num_hello_sent = 0;
1239 which->timeout = timo_up;
1240 timer_reset(which->timer);
1241 which->timer = timer_set_rel(which->timeout, server_timo, which);
1245 zdbug((LOG_DEBUG, "srv %s is %s", which->addr_str,
1246 srv_states[which->state]));
1251 * Send each of the other servers a shutdown message.
1259 /* don't tell limbo to go away, start at 1*/
1260 for (i = 1; i < nservers; i++)
1261 send_msg(&otherservers[i].addr, ADMIN_SHUTDOWN, 1);
1265 * send a message to who with admin class and the opcode specified; the class instance is always empty.
1266 * auth is set if we want to send authenticated
/* send_msg: send a bodyless ZEPHYR_ADMIN_CLASS notice to who.
   who    - destination address
   opcode - stored as the notice opcode (ADMIN_HELLO, ADMIN_IMHERE, ...)
   auth   - selects ZAUTH or ZNOAUTH when the notice is formatted
   The notice is of kind ACKED, carries our server port and myname as the
   sender, and is transmitted once with ZSendPacket.  Formatting, addressing
   and transmit failures are reported through syslog only. */
1270 send_msg(who, opcode, auth)
1271 struct sockaddr_in *who;
1276 ZNotice_t *pnotice; /* speed hack */
/* start from an all-zero notice */
1283 memset (&notice, 0, sizeof(notice));
1285 pnotice->z_kind = ACKED;
1287 pnotice->z_port = srv_addr.sin_port;
1288 pnotice->z_class = ZEPHYR_ADMIN_CLASS;
1289 pnotice->z_class_inst = "";
1290 pnotice->z_opcode = opcode;
1291 pnotice->z_sender = myname; /* myname is the hostname */
1292 pnotice->z_recipient = "";
1293 pnotice->z_default_format = "";
1294 pnotice->z_message = NULL;
1295 pnotice->z_message_len = 0;
1296 pnotice->z_num_other_fields = 0;
1298 /* XXX for now, we don't do authentication */
1301 retval = ZFormatNotice(pnotice, &pack, &packlen, auth ? ZAUTH : ZNOAUTH);
1302 if (retval != ZERR_NONE) {
1303 syslog(LOG_WARNING, "snd_msg format: %s", error_message(retval));
1306 retval = ZSetDestAddr(who);
1307 if (retval != ZERR_NONE) {
1308 syslog(LOG_WARNING, "snd_msg set addr: %s", error_message(retval));
1312 /* don't wait for ack */
1313 retval = ZSendPacket(pack, packlen, 0);
1314 if (retval != ZERR_NONE)
1315 syslog(LOG_WARNING, "snd_msg xmit: %s", error_message(retval));
1320 * send a notice with a message to who with admin class and opcode and
1321 * message body as specified.
1322 * auth is set if we want to send authenticated
1323 * (stale? the remark that server_idx is -1 when sending to a client, or the
1324 * server index when sending to a server, matches no argument below -- confirm)
/* send_msg_list: send a ZEPHYR_ADMIN_CLASS notice whose body is a list of
   strings.
   who    - destination address
   opcode - stored as the notice opcode
   lyst   - the body fields
   num    - number of entries in lyst
   auth   - selects ZAUTH or ZNOAUTH when the notice is formatted
   The notice is of kind UNSAFE and is handed to xmit_frag() for
   transmission.  Formatting and addressing failures are reported through
   syslog only. */
1328 send_msg_list(who, opcode, lyst, num, auth)
1329 struct sockaddr_in *who;
/* start from an all-zero notice */
1341 memset (&notice, 0, sizeof(notice));
1343 notice.z_kind = UNSAFE;
1344 notice.z_port = srv_addr.sin_port;
1345 notice.z_class = ZEPHYR_ADMIN_CLASS;
1346 notice.z_class_inst = "";
1347 notice.z_opcode = opcode;
1348 notice.z_sender = myname; /* myname is the hostname */
1349 notice.z_recipient = "";
1350 notice.z_default_format = "";
1351 notice.z_message = NULL;
1352 notice.z_message_len = 0;
1353 notice.z_num_other_fields = 0;
1355 /* XXX for now, we don't do authentication */
1358 retval = ZFormatNoticeList(&notice, lyst, num, &pack, &packlen,
1359 auth ? ZAUTH : ZNOAUTH);
1360 if (retval != ZERR_NONE) {
1361 syslog(LOG_WARNING, "snd_msg_lst format: %s", error_message(retval));
1364 retval = ZSetDestAddr(who);
1365 if (retval != ZERR_NONE) {
1366 syslog(LOG_WARNING, "snd_msg_lst set addr: %s", error_message(retval));
/* xmit_frag rather than ZSendPacket (used by send_msg): presumably it
   splits a list too large for one packet -- confirm */
1370 xmit_frag(&notice, pack, packlen, 0);
1375 * Forward the notice to the other servers
/* server_forward: pass notice on to each of the other servers.
   Limbo (index 0) and our own entry are left out.  A server in SERV_DEAD
   that is not being dumped to is tested for separately (presumably
   skipped).  For every remaining server the notice is formatted into a
   newly allocated ZPacket_t and then either queued on the server (while a
   dump to it is in progress) or sent with server_forw_reliable().
   auth and who are stored with a queued packet by server_queue(). */
1379 server_forward(notice, auth, who)
1382 struct sockaddr_in *who;
1390 zdbug((LOG_DEBUG, "srv_forw"));
1392 /* don't send to limbo */
1393 for (i = 1; i < nservers; i++) {
1394 if (i == me_server_idx) /* don't xmit to myself */
1396 if (otherservers[i].state == SERV_DEAD &&
1397 otherservers[i].dumping == 0) {
1398 /* if we are dumping to him, we want to
1399 queue it, even if he's dead */
/* one buffer per destination: both server_queue and server_forw_reliable
   keep the pointer they are given */
1403 pack = malloc(sizeof(ZPacket_t));
1405 syslog(LOG_CRIT, "srv_fwd malloc");
1408 retval = ZNewFormatSmallRawNotice(notice, pack, &packlen);
1409 if (retval != ZERR_NONE) {
1410 syslog(LOG_WARNING, "srv_fwd format: %s", error_message(retval));
1413 if (otherservers[i].dumping) {
1414 server_queue(&otherservers[i], packlen, pack, auth, who);
1417 server_forw_reliable(&otherservers[i], pack, packlen, notice);
/* server_forw_reliable: send pack to server and remember it until acked.
   server  - destination entry in otherservers
   pack    - formatted packet; the pointer is stored in the not-yet-acked
             entry
   packlen - length of pack
   notice  - source of the uid that a later SERVACK is matched against
   After the send an Unacked entry is allocated, a retransmit timer running
   srv_rexmit is armed for rexmit_times[0], and the entry is inserted into
   srv_nacktab under the hash of (server index, uid). */
1422 server_forw_reliable(server, pack, packlen, notice)
1432 retval = ZSetDestAddr(&server->addr);
1433 if (retval != ZERR_NONE) {
1434 syslog(LOG_WARNING, "srv_fwd_rel set addr: %s", error_message(retval));
1438 retval = ZSendPacket(pack, packlen, 0);
1439 if (retval != ZERR_NONE) {
1440 syslog(LOG_WARNING, "srv_fwd xmit: %s", error_message(retval));
1444 /* now we've sent it, mark it as not ack'ed */
1446 nacked = (Unacked *) malloc(sizeof(Unacked));
1448 /* no space: just punt */
1449 syslog(LOG_ERR, "srv_forw_rel nack malloc");
1454 nacked->client = NULL;
1455 nacked->rexmits = 0;
1456 nacked->packet = pack;
/* the destination is kept as an index into otherservers, not a pointer, so
   it must be renumbered when the table is compacted (srv_nack_renumber) */
1457 nacked->dest.srv_idx = server - otherservers;
1458 nacked->packsz = packlen;
1459 nacked->uid = notice->z_uid;
1460 nacked->timer = timer_set_rel(rexmit_times[0], srv_rexmit, nacked);
/* same hash inputs as srv_nack_cancel uses for its lookup */
1461 hashval = SRV_NACKTAB_HASHVAL(nacked->dest.srv_idx, nacked->uid);
1462 LIST_INSERT(&srv_nacktab[hashval], nacked);
1466 * send all the queued messages for the server.
/* server_send_queue: drain the hold queue of server.
   Each pending entry is taken off with server_dequeue(), its packet is
   parsed back into a notice, and the packet is sent with
   server_forw_reliable().  A packet that does not parse is logged with the
   address of its original sender. */
1470 server_send_queue(server)
1477 while (server->queue) {
1478 pending = server_dequeue(server);
/* the parsed notice supplies the uid that server_forw_reliable records */
1479 status = ZParseNotice(pending->packet, pending->len, &notice);
1480 if (status != ZERR_NONE) {
1481 syslog(LOG_ERR, "ssq bad notice parse (%s): %s",
1482 inet_ntoa(pending->who.sin_addr), error_message(status));
1484 server_forw_reliable(server, pending->packet, pending->len,
1487 /* ACK handling routines will free the packet */
1493 * a server has acknowledged a message we sent to him; remove it from
1494 * server unacked queue
/* srv_nack_cancel: a server acknowledged a packet; drop its not-yet-acked
   entry.
   notice - the SERVACK; its z_uid identifies the packet being acked
   who    - address of the acknowledging server
   The entry is looked up in the srv_nacktab chain selected by the hash of
   (server index, uid).  On a match its retransmit timer is cancelled, the
   saved packet is freed and the entry is unlinked.  A sender that is not a
   known server is logged as non-server ack; an ack with no matching entry
   is only noted in the debug log. */
1498 srv_nack_cancel(notice, who)
1500 struct sockaddr_in *who;
1502 Server *server = server_which_server(who);
1507 syslog(LOG_ERR, "non-server ack?");
/* same hash inputs as server_forw_reliable used on insertion */
1510 hashval = SRV_NACKTAB_HASHVAL(server - otherservers, notice->z_uid);
1511 for (nacked = srv_nacktab[hashval]; nacked; nacked = nacked->next) {
1512 if (nacked->dest.srv_idx == server - otherservers
1513 && ZCompareUID(&nacked->uid, &notice->z_uid)) {
1514 timer_reset(nacked->timer);
1515 free(nacked->packet);
1516 LIST_DELETE(nacked);
1522 zdbug((LOG_DEBUG, "srv_nack not found"));
1527 * retransmit a message to another server
1534 Unacked *packet = (Unacked *) arg;
1536 /* retransmit the packet */
1539 zdbug((LOG_DEBUG,"srv_rexmit to %s/%d",
1540 otherservers[packet->dest.srv_idx].addr_str,
1541 ntohs(otherservers[packet->dest.srv_idx].addr.sin_port)));
1543 if (otherservers[packet->dest.srv_idx].state == SERV_DEAD) {
1545 zdbug((LOG_DEBUG, "cancelling send to dead server"));
1547 LIST_DELETE(packet);
1548 free(packet->packet);
1549 srv_nack_release(&otherservers[packet->dest.srv_idx]);
1553 retval = ZSetDestAddr(&otherservers[packet->dest.srv_idx].addr);
1554 if (retval != ZERR_NONE) {
1555 syslog(LOG_WARNING, "srv_rexmit set addr: %s", error_message(retval));
1557 retval = ZSendPacket(packet->packet, packet->packsz, 0);
1558 if (retval != ZERR_NONE)
1559 syslog(LOG_WARNING, "srv_rexmit xmit: %s",
1560 error_message(retval));
1563 /* reset the timer */
1564 if (rexmit_times[packet->rexmits + 1] != -1)
1566 packet->timer = timer_set_rel(rexmit_times[packet->rexmits], srv_rexmit,
1571 * Clean up the not-yet-acked queue and release anything destined
/* srv_nack_release: discard every not-yet-acked packet addressed to server.
   All SRV_NACKTAB_HASHSIZE chains are scanned; for each entry whose
   destination index is that of server the retransmit timer is cancelled,
   the entry is unlinked and its packet is freed. */
1576 srv_nack_release(server)
1580 Unacked *nacked, *next;
1582 for (i = 0; i < SRV_NACKTAB_HASHSIZE; i++) {
1583 for (nacked = srv_nacktab[i]; nacked; nacked = next) {
/* fetch the successor first: the current entry may be unlinked below */
1584 next = nacked->next;
1585 if (nacked->dest.srv_idx == server - otherservers) {
1586 timer_reset(nacked->timer);
1587 LIST_DELETE(nacked);
1588 free(nacked->packet);
1596 * Adjust indices of not-yet-acked packets sent to other servers to
1597 * continue to refer to the correct server.
/* srv_nack_renumber: rewrite the server index stored in every
   not-yet-acked entry after the otherservers table has been compacted.
   new_idx - array indexed by old server index giving the new index
   An unusable mapping is logged as srv_nack_renumber error (the test that
   detects it is not among the statements below). */
1601 srv_nack_renumber (new_idx)
1604 /* XXX release any private queue for this server */
1608 /* walk every not-yet-acked entry and replace its server index with
1609 the value new_idx[] holds for the old index. */
1610 for (i = 0; i < SRV_NACKTAB_HASHSIZE; i++) {
1611 for (nacked = srv_nacktab[i]; nacked; nacked = nacked->next) {
1612 idx = new_idx[nacked->dest.srv_idx];
1614 syslog(LOG_ERR, "srv_nack_renumber error: [%d]=%d",
1615 nacked->dest.srv_idx, idx);
1618 nacked->dest.srv_idx = idx;
1624 * Queue this notice to be transmitted to the server when it is ready.
/* server_queue: append a formatted packet to the hold queue of server.
   server - entry whose queue receives the packet
   len    - packet length
   pack   - the packet; the pointer itself is stored, not a copy
   auth   - saved with the entry
   who    - original sender; the structure is copied into the entry
   An allocation failure is logged as update_queue malloc. */
1627 server_queue(server, len, pack, auth, who)
1632 struct sockaddr_in *who;
1636 pending = (Pending *) malloc(sizeof(Pending));
1638 syslog(LOG_CRIT, "update_queue malloc");
1641 pending->packet = pack;
1643 pending->auth = auth;
1644 pending->who = *who;
1645 pending->next = NULL;
1647 /* put it on the end of the list */
/* two cases: link behind the current tail, or start a new list when the
   queue is empty (the test between them is not shown here) */
1649 server->queue_last->next = pending;
1651 server->queue = server->queue_last = pending;
1655 * Pull a notice off the hold queue.
/* server_dequeue: detach the first entry of the hold queue of server.
   server_send_queue only calls this while server->queue is non-null;
   nothing in the statements below handles an empty queue. */
1659 server_dequeue(server)
1666 pending = server->queue;
1667 server->queue = pending->next;
1672 * free storage used by a pending queue entry.
/* server_pending_free: release a hold-queue entry; the packet it carries is
   freed here (presumably followed by the entry itself -- confirm). */
1676 server_pending_free(pending)
1679 free(pending->packet);
1685 * Queue something to be handled later by this server.
/* server_self_queue: hold a notice for later processing by this server.
   The notice is formatted into a raw packet and appended, with auth and
   who, to the queue of me_server through server_queue().  A formatting
   failure is logged as srv_self_queue format. */
1689 server_self_queue(notice, auth, who)
1692 struct sockaddr_in * who;
1698 retval = ZFormatRawNotice(notice, &pack, &packlen);
1699 if (retval != ZERR_NONE) {
1700 syslog(LOG_CRIT, "srv_self_queue format: %s", error_message(retval));
1703 server_queue(me_server, packlen, pack, auth, who);
1707 * dump info about servers onto the fp.
1708 * assumed to be called with SIGFPE blocked
1709 * (true if called from signal handler)
/* server_dump_servers: write one line per otherservers entry to fp, in the
   form index:address/state, with (DUMPING) appended while a dump to that
   server is in progress.
   Unlike the other loops over the table this one starts at index 0, so the
   limbo entry is listed too. */
1712 server_dump_servers(fp)
1717 for (i = 0; i < nservers ; i++) {
1718 fprintf(fp, "%d:%s/%s%s\n", i, otherservers[i].addr_str,
1719 srv_states[otherservers[i].state],
1720 otherservers[i].dumping ? " (DUMPING)" : "");