From: Jorrit Herder Date: Thu, 20 Oct 2005 20:31:18 +0000 (+0000) Subject: New Reincarnation Server functionality. X-Git-Tag: v3.1.2a~593 X-Git-Url: http://zhaoyanbai.com/repos/%22http:/www.isc.org/icons/zlib_tech.html?a=commitdiff_plain;h=2a98fed5150872842c43a08f68b1481328d30abc;p=minix.git New Reincarnation Server functionality. - service refresh: to cleanly stop and restart a server or driver - binary exponential backoff: don't restart in a loop --- diff --git a/servers/rs/main.c b/servers/rs/main.c index d168ae9d4..81a81844e 100644 --- a/servers/rs/main.c +++ b/servers/rs/main.c @@ -7,7 +7,7 @@ * Created: * Jul 22, 2005 by Jorrit N. Herder */ -#include "rs.h" +#include "inc.h" #include #include "../../kernel/const.h" #include "../../kernel/type.h" @@ -16,7 +16,6 @@ FORWARD _PROTOTYPE(void init_server, (void) ); FORWARD _PROTOTYPE(void get_work, (message *m) ); FORWARD _PROTOTYPE(void reply, (int whom, int result) ); -FORWARD _PROTOTYPE(int do_getsysinfo, (message *m) ); /* Data buffers to retrieve info during initialization. */ PRIVATE struct boot_image image[NR_BOOT_PROCS]; @@ -85,13 +84,16 @@ PUBLIC int main(void) */ else { switch(call_nr) { - case SRV_UP: - result = do_start(&m); + case RS_UP: + result = do_up(&m); break; - case SRV_DOWN: - result = do_stop(&m); + case RS_DOWN: + result = do_down(&m); break; - case SRV_SHUTDOWN: + case RS_REFRESH: + result = do_refresh(&m); + break; + case RS_SHUTDOWN: result = do_shutdown(&m); break; case GETSYSINFO: @@ -148,7 +150,7 @@ PRIVATE void init_server(void) ip = &image[s]; if (ip->proc_nr >= 0) { nr_in_use ++; - rproc[s].r_flags = IN_USE; + rproc[s].r_flags = RS_IN_USE; rproc[s].r_proc_nr = ip->proc_nr; rproc[s].r_pid = getnpid(ip->proc_nr); for(t=0; t< NR_DEVICES; t++) @@ -162,39 +164,12 @@ PRIVATE void init_server(void) } /* Set alarm to periodically check driver status. 
*/ - if (OK != (s=sys_setalarm(HZ, 0))) + if (OK != (s=sys_setalarm(RS_DELTA_T, 0))) panic("RS", "couldn't set alarm", s); } -/*===========================================================================* - * do_getsysinfo * - *===========================================================================*/ -PRIVATE int do_getsysinfo(m_ptr) -message *m_ptr; -{ - vir_bytes src_addr, dst_addr; - int dst_proc; - size_t len; - int s; - - switch(m_ptr->m1_i1) { - case SI_PROC_TAB: - src_addr = (vir_bytes) rproc; - len = sizeof(struct rproc) * NR_SYS_PROCS; - break; - default: - return(EINVAL); - } - - dst_proc = m_ptr->m_source; - dst_addr = (vir_bytes) m_ptr->m1_p1; - if (OK != (s=sys_datacopy(SELF, src_addr, dst_proc, dst_addr, len))) - return(s); - return(OK); -} - /*===========================================================================* * get_work * *===========================================================================*/ diff --git a/servers/rs/manager.c b/servers/rs/manager.c index d34bc0b96..2c0041b5f 100644 --- a/servers/rs/manager.c +++ b/servers/rs/manager.c @@ -3,7 +3,7 @@ * Jul 22, 2005: Created (Jorrit N. Herder) */ -#include "rs.h" +#include "inc.h" #include #include #include @@ -17,16 +17,16 @@ extern int errno; /* error status */ /* Prototypes for internal functions that do the hard work. */ FORWARD _PROTOTYPE( int start_service, (struct rproc *rp) ); -FORWARD _PROTOTYPE( int stop_service, (struct rproc *rp) ); +FORWARD _PROTOTYPE( int stop_service, (struct rproc *rp,int how) ); PRIVATE int shutting_down = FALSE; #define EXEC_FAILED 49 /* recognizable status */ /*===========================================================================* - * do_start * + * do_up * *===========================================================================*/ -PUBLIC int do_start(m_ptr) +PUBLIC int do_up(m_ptr) message *m_ptr; /* request message pointer */ { /* A request was made to start a new system service. 
Dismember the request @@ -44,7 +44,7 @@ message *m_ptr; /* request message pointer */ if (nr_in_use >= NR_SYS_PROCS) return(EAGAIN); for (slot_nr = 0; slot_nr < NR_SYS_PROCS; slot_nr++) { rp = &rproc[slot_nr]; /* get pointer to slot */ - if (! rp->r_flags & IN_USE) /* check if available */ + if (! rp->r_flags & RS_IN_USE) /* check if available */ break; } nr_in_use ++; /* update administration */ @@ -52,10 +52,10 @@ message *m_ptr; /* request message pointer */ /* Obtain command name and parameters. This is a space-separated string * that looks like "/sbin/service arg1 arg2 ...". Arguments are optional. */ - if (m_ptr->SRV_CMD_LEN > MAX_COMMAND_LEN) return(E2BIG); - if (OK!=(s=sys_datacopy(m_ptr->m_source, (vir_bytes) m_ptr->SRV_CMD_ADDR, - SELF, (vir_bytes) rp->r_cmd, m_ptr->SRV_CMD_LEN))) return(s); - rp->r_cmd[m_ptr->SRV_CMD_LEN] = '\0'; /* ensure it is terminated */ + if (m_ptr->RS_CMD_LEN > MAX_COMMAND_LEN) return(E2BIG); + if (OK!=(s=sys_datacopy(m_ptr->m_source, (vir_bytes) m_ptr->RS_CMD_ADDR, + SELF, (vir_bytes) rp->r_cmd, m_ptr->RS_CMD_LEN))) return(s); + rp->r_cmd[m_ptr->RS_CMD_LEN] = '\0'; /* ensure it is terminated */ if (rp->r_cmd[0] != '/') return(EINVAL); /* insist on absolute path */ /* Build argument vector to be passed to execute call. The format of the @@ -77,10 +77,11 @@ message *m_ptr; /* request message pointer */ rp->r_argv[arg_count] = NULL; /* end with NULL pointer */ rp->r_argc = arg_count; - /* Check if a heartbeat period was given. */ - rp->r_period = m_ptr->SRV_PERIOD; - rp->r_dev_nr = m_ptr->SRV_DEV_MAJOR; + /* Initialize some fields. */ + rp->r_period = m_ptr->RS_PERIOD; + rp->r_dev_nr = m_ptr->RS_DEV_MAJOR; rp->r_dev_style = STYLE_DEV; + rp->r_restarts = -1; /* will be incremented */ /* All information was gathered. Now try to start the system service. 
*/ return(start_service(rp)); @@ -88,21 +89,49 @@ message *m_ptr; /* request message pointer */ /*===========================================================================* - * do_stop * + * do_down * *===========================================================================*/ -PUBLIC int do_stop(message *m_ptr) +PUBLIC int do_down(message *m_ptr) { register struct rproc *rp; - pid_t pid = (pid_t) m_ptr->SRV_PID; + pid_t pid = (pid_t) m_ptr->RS_PID; for (rp=BEG_RPROC_ADDR; rpr_flags & IN_USE && rp->r_pid == pid) { - printf("stopping %d (%d)\n", pid, m_ptr->SRV_PID); - stop_service(rp); + if (rp->r_flags & RS_IN_USE && rp->r_pid == pid) { +#if VERBOSE + printf("stopping %d (%d)\n", pid, m_ptr->RS_PID); +#endif + stop_service(rp,RS_EXITING); return(OK); } } - printf("not found %d (%d)\n", pid, m_ptr->SRV_PID); +#if VERBOSE + printf("not found %d (%d)\n", pid, m_ptr->RS_PID); +#endif + return(ESRCH); +} + + +/*===========================================================================* + * do_refresh * + *===========================================================================*/ +PUBLIC int do_refresh(message *m_ptr) +{ + register struct rproc *rp; + pid_t pid = (pid_t) m_ptr->RS_PID; + + for (rp=BEG_RPROC_ADDR; rpr_flags & RS_IN_USE && rp->r_pid == pid) { +#if VERBOSE + printf("refreshing %d (%d)\n", pid, m_ptr->RS_PID); +#endif + stop_service(rp,RS_REFRESHING); + return(OK); + } + } +#if VERBOSE + printf("not found %d (%d)\n", pid, m_ptr->RS_PID); +#endif return(ESRCH); } @@ -151,24 +180,37 @@ PUBLIC void do_exit(message *m_ptr) * This should always succeed. */ for (rp=BEG_RPROC_ADDR; rpr_flags & IN_USE) && rp->r_pid == exit_pid) { + if ((rp->r_flags & RS_IN_USE) && rp->r_pid == exit_pid) { - printf("Slot found!\n"); rproc_ptr[rp->r_proc_nr] = NULL; /* invalidate */ - if ((rp->r_flags & EXIT_PENDING) || shutting_down) { - printf("Expected exit. 
Doing nothing.\n"); + if ((rp->r_flags & RS_EXITING) || shutting_down) { rp->r_flags = 0; /* release slot */ rproc_ptr[rp->r_proc_nr] = NULL; } + else if(rp->r_flags & RS_REFRESHING) { + rp->r_restarts = -1; /* reset counter */ + start_service(rp); /* direct restart */ + } else if (WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == EXEC_FAILED) { - printf("Exit because EXEC() failed. Doing nothing.\n"); rp->r_flags = 0; /* release slot */ } else { +#if VERBOSE printf("Unexpected exit. Restarting %s\n", rp->r_cmd); - start_service(rp); /* restart */ +#endif + /* Determine what to do. If this is the first unexpected + * exit, immediately restart this service. Otherwise use + * a binary exponential backoff. + */ + if (rp->r_restarts > 0) { + rp->r_backoff = 1 << MIN(rp->r_restarts,(BACKOFF_BITS-1)); + rp->r_backoff = MIN(rp->r_backoff,MAX_BACKOFF); + } + else { + start_service(rp); /* direct restart */ + } } break; } @@ -188,10 +230,30 @@ message *m_ptr; /* Search system services table. Only check slots that are in use. */ for (rp=BEG_RPROC_ADDR; rpr_flags & IN_USE) { + if (rp->r_flags & RS_IN_USE) { + + /* If the service is to be revived (because it repeatedly exited, + * and was not directly restarted), the binary backoff field is + * greater than zero. + */ + if (rp->r_backoff > 0) { + rp->r_backoff -= 1; + if (rp->r_backoff == 0) { + start_service(rp); + } + } + + /* If the service was signaled with a SIGTERM and fails to respond, + * kill the system service with a SIGKILL signal. + */ + else if (rp->r_stop_tm > 0 && now - rp->r_stop_tm > 2*RS_DELTA_T) { + kill(rp->r_pid, SIGKILL); /* terminate */ + } - /* If the service has a period assigned check its status. */ - if (rp->r_period > 0) { + /* There seem to be no special conditions. If the service has a + * period assigned check its status. + */ + else if (rp->r_period > 0) { /* Check if an answer to a status request is still pending. 
If * the driver didn't respond within time, kill it to simulate @@ -218,18 +280,11 @@ message *m_ptr; rp->r_check_tm = now; /* mark time */ } } - - /* If the service was signaled with a SIGTERM and fails to respond, - * kill the system service with a SIGKILL signal. - */ - if (rp->r_stop_tm > 0 && now - rp->r_stop_tm > 2*HZ) { - kill(rp->r_pid, SIGKILL); /* terminate */ - } } } /* Reschedule a synchronous alarm for the next period. */ - if (OK != (s=sys_setalarm(HZ, 0))) + if (OK != (s=sys_setalarm(RS_DELTA_T, 0))) panic("RS", "couldn't set alarm", s); } @@ -274,7 +329,7 @@ struct rproc *rp; if ((s=mapdriver(child_proc_nr, rp->r_dev_nr, rp->r_dev_style)) < 0) { report("RS", "couldn't map driver", errno); kill(child_pid, SIGKILL); /* kill driver */ - rp->r_flags |= EXIT_PENDING; /* expect exit */ + rp->r_flags |= RS_EXITING; /* expect exit */ return(s); /* return error */ } } @@ -287,7 +342,7 @@ struct rproc *rp; if ((s = _taskcall(SYSTEM, SYS_PRIVCTL, &m)) < 0) { /* set privileges */ report("RS","call to SYSTEM failed", s); /* to let child run */ kill(child_pid, SIGKILL); /* kill driver */ - rp->r_flags |= EXIT_PENDING; /* expect exit */ + rp->r_flags |= RS_EXITING; /* expect exit */ return(s); /* return error */ } @@ -301,7 +356,8 @@ struct rproc *rp; * thing that can go wrong now, is that execution fails at the child. If * that's the case, the child will exit. 
*/ - rp->r_flags = IN_USE; /* mark slot in use */ + rp->r_flags = RS_IN_USE; /* mark slot in use */ + rp->r_restarts += 1; /* raise nr of restarts */ rp->r_proc_nr = child_proc_nr; /* set child details */ rp->r_pid = child_pid; rp->r_check_tm = 0; /* not check yet */ @@ -314,16 +370,49 @@ struct rproc *rp; /*===========================================================================* * stop_service * *===========================================================================*/ -PRIVATE int stop_service(rp) +PRIVATE int stop_service(rp,how) struct rproc *rp; +int how; { - printf("RS tries to stop %s (pid %d)\n", rp->r_cmd, rp->r_pid); /* Try to stop the system service. First send a SIGTERM signal to ask the * system service to terminate. If the service didn't install a signal * handler, it will be killed. If it did and ignores the signal, we'll * find out because we record the time here and send a SIGKILL. */ - rp->r_flags |= EXIT_PENDING; /* expect exit */ +#if VERBOSE + printf("RS tries to stop %s (pid %d)\n", rp->r_cmd, rp->r_pid); +#endif + + rp->r_flags |= how; /* what to do on exit? 
*/ kill(rp->r_pid, SIGTERM); /* first try friendly */ getuptime(&rp->r_stop_tm); /* record current time */ } + + +/*===========================================================================* + * do_getsysinfo * + *===========================================================================*/ +PUBLIC int do_getsysinfo(m_ptr) +message *m_ptr; +{ + vir_bytes src_addr, dst_addr; + int dst_proc; + size_t len; + int s; + + switch(m_ptr->m1_i1) { + case SI_PROC_TAB: + src_addr = (vir_bytes) rproc; + len = sizeof(struct rproc) * NR_SYS_PROCS; + break; + default: + return(EINVAL); + } + + dst_proc = m_ptr->m_source; + dst_addr = (vir_bytes) m_ptr->m1_p1; + if (OK != (s=sys_datacopy(SELF, src_addr, dst_proc, dst_addr, len))) + return(s); + return(OK); +} + diff --git a/servers/rs/manager.h b/servers/rs/manager.h new file mode 100644 index 000000000..7fe936dc0 --- /dev/null +++ b/servers/rs/manager.h @@ -0,0 +1,51 @@ +/* This table has one slot per system process. It contains information for + * servers and driver needed by the reincarnation server to keep track of + * each process' status. + */ + +/* Space reserved for program and arguments. */ +#define MAX_COMMAND_LEN 512 /* maximum argument string length */ +#define MAX_NR_ARGS 4 /* maximum number of arguments */ + +/* Definition of the system process table. This table only has entries for + * the servers and drivers, and thus is not directly indexed by slot number. 
+ */ +extern struct rproc { + int r_proc_nr; /* process slot number */ + pid_t r_pid; /* process id */ + dev_t r_dev_nr; /* major device number */ + int r_dev_style; /* device style */ + + int r_restarts; /* number of restarts (initially zero) */ + long r_backoff; /* number of periods to wait before revive */ + unsigned r_flags; /* status and policy flags */ + + long r_period; /* heartbeat period (or zero) */ + clock_t r_check_tm; /* timestamp of last check */ + clock_t r_alive_tm; /* timestamp of last heartbeat */ + clock_t r_stop_tm; /* timestamp of SIGTERM signal */ + + char r_cmd[MAX_COMMAND_LEN]; /* raw command plus arguments */ + char *r_argv[MAX_NR_ARGS+2]; /* parsed arguments vector */ + int r_argc; /* number of arguments */ +} rproc[NR_SYS_PROCS]; + +/* Mapping for fast access to the system process table. */ +extern struct rproc *rproc_ptr[NR_PROCS]; +extern int nr_in_use; + +/* Flag values. */ +#define RS_IN_USE 0x001 /* set when process slot is in use */ +#define RS_EXITING 0x002 /* set when exit is expected */ +#define RS_REFRESHING 0x004 /* set when refresh must be done */ + +/* Constants determining RS period and binary exponential backoff. */ +#define RS_DELTA_T 60 /* check every T ticks */ +#define BACKOFF_BITS (sizeof(long)*8) /* bits in backoff field */ +#define MAX_BACKOFF 30 /* max backoff in RS_DELTA_T */ + +/* Magic process table addresses. 
*/ +#define BEG_RPROC_ADDR (&rproc[0]) +#define END_RPROC_ADDR (&rproc[NR_SYS_PROCS]) +#define NIL_RPROC ((struct mproc *) 0) + diff --git a/servers/rs/proto.h b/servers/rs/proto.h index 26e7cc115..5cfb9c779 100644 --- a/servers/rs/proto.h +++ b/servers/rs/proto.h @@ -4,10 +4,12 @@ _PROTOTYPE( int main, (void)); /* manager.c */ -_PROTOTYPE( int do_start, (message *m)); -_PROTOTYPE( int do_stop, (message *m)); +_PROTOTYPE( int do_up, (message *m)); +_PROTOTYPE( int do_down, (message *m)); +_PROTOTYPE( int do_refresh, (message *m)); _PROTOTYPE( int do_shutdown, (message *m)); _PROTOTYPE( void do_period, (message *m)); _PROTOTYPE( void do_exit, (message *m)); +_PROTOTYPE( int do_getsysinfo, (message *m)); diff --git a/servers/rs/rs.h b/servers/rs/rs.h deleted file mode 100644 index edd81fb51..000000000 --- a/servers/rs/rs.h +++ /dev/null @@ -1,37 +0,0 @@ -/* Header file for the system service manager server. - * - * Created: - * Jul 22, 2005 by Jorrit N. Herder - */ - -#define _SYSTEM 1 /* get OK and negative error codes */ -#define _MINIX 1 /* tell headers to include MINIX stuff */ - -#define VERBOSE 1 /* display diagnostics */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "proto.h" -#include "rproc.h" - diff --git a/servers/rs/service.c b/servers/rs/service.c index c2a5b120f..e437983df 100644 --- a/servers/rs/service.c +++ b/servers/rs/service.c @@ -23,6 +23,7 @@ PRIVATE char *known_requests[] = { "up", "down", + "refresh", "shutdown", "catch for illegal requests" }; @@ -72,6 +73,7 @@ PRIVATE void print_usage(char *app_name, char *problem) printf(" %s up [%s ] [%s ] [%s ]\n", app_name, ARG_ARGS, ARG_DEV, ARG_PERIOD); printf(" %s down \n", app_name); + printf(" %s refresh \n", app_name); printf(" %s shutdown\n", app_name); printf("\n"); } @@ -109,9 +111,9 @@ PRIVATE int parse_arguments(int argc, 
char **argv) print_usage(argv[ARG_NAME], "illegal request type"); exit(ENOSYS); } + req_nr = RS_RQ_BASE + req_type; - req_nr = SRV_RQ_BASE + req_type; - if (req_nr == SRV_UP) { + if (req_nr == RS_UP) { /* Verify argument count. */ if (argc - 1 < ARG_PATH) { @@ -171,7 +173,7 @@ PRIVATE int parse_arguments(int argc, char **argv) } } } - else if (req_nr == SRV_DOWN) { + else if (req_nr == RS_DOWN || req_nr == RS_REFRESH) { /* Verify argument count. */ if (argc - 1 < ARG_PID) { @@ -183,7 +185,7 @@ PRIVATE int parse_arguments(int argc, char **argv) exit(EINVAL); } } - else if (req_nr == SRV_SHUTDOWN) { + else if (req_nr == RS_SHUTDOWN) { /* no extra arguments required */ } @@ -198,6 +200,7 @@ PUBLIC int main(int argc, char **argv) { message m; int result; + int request; int s; /* Verify and parse the command line arguments. All arguments are checked @@ -205,34 +208,35 @@ PUBLIC int main(int argc, char **argv) * all needed parameters to perform the request are extracted and stored * global variables. */ - parse_arguments(argc, argv); + request = parse_arguments(argc, argv); /* Arguments seem fine. Try to perform the request. Only valid requests * should end up here. The default is used for not yet supported requests. */ - switch(req_type+SRV_RQ_BASE) { - case SRV_UP: + switch(request) { + case RS_UP: /* Build space-separated command string to be passed to RS server. */ strcpy(command, req_path); command[strlen(req_path)] = ' '; strcpy(command+strlen(req_path)+1, req_args); /* Build request message and send the request. 
*/ - m.SRV_CMD_ADDR = command; - m.SRV_CMD_LEN = strlen(command); - m.SRV_DEV_MAJOR = req_major; - m.SRV_PERIOD = req_period; - if (OK != (s=_taskcall(RS_PROC_NR, SRV_UP, &m))) + m.RS_CMD_ADDR = command; + m.RS_CMD_LEN = strlen(command); + m.RS_DEV_MAJOR = req_major; + m.RS_PERIOD = req_period; + if (OK != (s=_taskcall(RS_PROC_NR, request, &m))) failure(s); result = m.m_type; break; - case SRV_DOWN: - m.SRV_PID = req_pid; - if (OK != (s=_taskcall(RS_PROC_NR, SRV_DOWN, &m))) + case RS_DOWN: + case RS_REFRESH: + m.RS_PID = req_pid; + if (OK != (s=_taskcall(RS_PROC_NR, request, &m))) failure(s); break; - case SRV_SHUTDOWN: - if (OK != (s=_taskcall(RS_PROC_NR, SRV_SHUTDOWN, &m))) + case RS_SHUTDOWN: + if (OK != (s=_taskcall(RS_PROC_NR, request, &m))) failure(s); break; default: