From: David van Moolenbroek Date: Tue, 14 Jul 2015 05:42:48 +0000 (+0200) Subject: RS/VM: proper preparation for multi-VM live update X-Git-Url: http://zhaoyanbai.com/repos/%22http:/www.isc.org/icons/zpipe.c?a=commitdiff_plain;h=abf8a7e7b3a56fc9b8ce81cb3a69d8531aa90ce5;p=minix.git RS/VM: proper preparation for multi-VM live update Due to changed VM internals, more elaborate preparation is required before a live update with multiple components including VM can take place. This patch adds the essential preparation infrastructure to VM and adapts RS to make use of it. As a side effect, it is no longer necessary to supply RS as the last component (if at all) during the set-up of a multicomponent live update operation. Change-Id: If069fd3f93f96f9d5433998e4615f861465ef448 --- diff --git a/etc/system.conf b/etc/system.conf index b920a6993..c1e67bd3d 100644 --- a/etc/system.conf +++ b/etc/system.conf @@ -11,7 +11,8 @@ service rs RS_SET_PRIV # 37 RS_UPDATE # 41 RS_MEMCTL # 42 - PROCCTL + PROCCTL # 45 + RS_PREPARE # 48 ; io NONE; # No I/O range allowed irq NONE; # No IRQ allowed diff --git a/minix/commands/service/parse.c b/minix/commands/service/parse.c index 26a91a124..2cf52599f 100644 --- a/minix/commands/service/parse.c +++ b/minix/commands/service/parse.c @@ -754,6 +754,7 @@ struct { "CLEARCACHE", VM_CLEARCACHE }, { "VFS_MMAP", VM_VFS_MMAP }, { "VFS_REPLY", VM_VFS_REPLY }, + { "RS_PREPARE", VM_RS_PREPARE }, { NULL, 0 }, }; diff --git a/minix/include/minix/com.h b/minix/include/minix/com.h index 26415097f..df3a71006 100644 --- a/minix/include/minix/com.h +++ b/minix/include/minix/com.h @@ -751,8 +751,10 @@ #define VM_GETRUSAGE (VM_RQ_BASE+47) +#define VM_RS_PREPARE (VM_RQ_BASE+48) + /* Total. 
*/ -#define NR_VM_CALLS 48 +#define NR_VM_CALLS 49 #define VM_CALL_MASK_SIZE BITMAP_CHUNKS(NR_VM_CALLS) /* not handled as a normal VM call, thus at the end of the reserved rage */ diff --git a/minix/include/minix/vm.h b/minix/include/minix/vm.h index f1b75259a..6047e0ed6 100644 --- a/minix/include/minix/vm.h +++ b/minix/include/minix/vm.h @@ -19,6 +19,7 @@ int vm_notify_sig(endpoint_t ep, endpoint_t ipc_ep); int vm_set_priv(endpoint_t ep, void *buf, int sys_proc); int vm_update(endpoint_t src_e, endpoint_t dst_e, int flags); int vm_memctl(endpoint_t ep, int req, void** addr, size_t *len); +int vm_prepare(endpoint_t src_e, endpoint_t dst_e, int flags); int vm_query_exit(endpoint_t *endpt); int vm_watch_exit(endpoint_t ep); int minix_vfs_mmap(endpoint_t who, off_t offset, size_t len, diff --git a/minix/lib/libsys/Makefile b/minix/lib/libsys/Makefile index cb6874f5b..1e5d7fcd0 100644 --- a/minix/lib/libsys/Makefile +++ b/minix/lib/libsys/Makefile @@ -100,6 +100,7 @@ SRCS+= \ vm_map_phys.c \ vm_memctl.c \ vm_notify_sig.c \ + vm_prepare.c \ vm_procctl.c \ vm_query_exit.c \ vm_set_priv.c \ diff --git a/minix/lib/libsys/vm_prepare.c b/minix/lib/libsys/vm_prepare.c new file mode 100644 index 000000000..9cf3d048f --- /dev/null +++ b/minix/lib/libsys/vm_prepare.c @@ -0,0 +1,17 @@ +#include "syslib.h" + +#include <unistd.h> +#include <string.h> + +int +vm_prepare(endpoint_t src_e, endpoint_t dst_e, int flags) +{ + message m; + + memset(&m, 0, sizeof(m)); + m.m_lsys_vm_update.src = src_e; + m.m_lsys_vm_update.dst = dst_e; + m.m_lsys_vm_update.flags = flags; + + return _taskcall(VM_PROC_NR, VM_RS_PREPARE, &m); +} diff --git a/minix/servers/rs/const.h b/minix/servers/rs/const.h index 050c3007d..d5a3bc895 100644 --- a/minix/servers/rs/const.h +++ b/minix/servers/rs/const.h @@ -87,21 +87,18 @@ #define RUPDATE_INIT() memset(&rupdate, 0, sizeof(rupdate)) #define RUPDATE_CLEAR() RUPDATE_INIT() +/* Note that we have 'B' last in order to allow 'continue' statements */ #define RUPDATE_ITER(HEAD, RPUPD_PREV, 
RPUPD, B) do { \ - RPUPD = HEAD; \ - RPUPD_PREV = NULL; \ - while(RPUPD) { \ + for(RPUPD = HEAD, RPUPD_PREV = NULL; RPUPD != NULL; \ + RPUPD_PREV = RPUPD, RPUPD = RPUPD->next_rpupd) { \ B \ - RPUPD_PREV = RPUPD; \ - RPUPD = RPUPD->next_rpupd; \ } \ } while(0) #define RUPDATE_REV_ITER(TAIL, RPUPD_PREV, RPUPD, B) do { \ RPUPD = TAIL; \ - while(RPUPD) { \ + for(RPUPD = TAIL; RPUPD != NULL; RPUPD = RPUPD->prev_rpupd) { \ RPUPD_PREV = RPUPD->prev_rpupd; \ B \ - RPUPD = RPUPD->prev_rpupd; \ } \ } while(0) diff --git a/minix/servers/rs/request.c b/minix/servers/rs/request.c index 88436d6c7..5babfeb0c 100644 --- a/minix/servers/rs/request.c +++ b/minix/servers/rs/request.c @@ -667,10 +667,6 @@ int do_update(message *m_ptr) printf("RS: the specified process is already part of the currently scheduled update\n"); return EINVAL; } - if(rupdate.last_rpupd->rp->r_pub->endpoint == RS_PROC_NR) { - printf("RS: RS should always be the last service to update in a multi-component update\n"); - return EINVAL; - } } /* Prepare-only update for VM, PM, and VFS is only supported with an unreachable state. */ diff --git a/minix/servers/rs/update.c b/minix/servers/rs/update.c index f6c6f200e..8dedfa5b6 100644 --- a/minix/servers/rs/update.c +++ b/minix/servers/rs/update.c @@ -23,35 +23,62 @@ void rupdate_clear_upds() void rupdate_add_upd(struct rprocupd* rpupd) { /* Add an update descriptor to the update chain. */ - struct rprocupd* prev_rpupd; + struct rprocupd *prev_rpupd, *walk_rpupd; + endpoint_t ep; int lu_flags; - rpupd->prev_rpupd = rupdate.last_rpupd; - if(rupdate.num_rpupds == 0) { - rupdate.first_rpupd = rpupd; - rupdate.curr_rpupd = rpupd; - } - else { - rupdate.last_rpupd->next_rpupd = rpupd; + /* In order to allow multicomponent-with-VM live updates to be processed + * correctly, we perform partial sorting on the chain: RS is to be last (if + * present), VM is to be right before it (if present), and all the other + * processes are to be at the start of the chain. 
+ */ + + ep = rpupd->rp->r_pub->endpoint; + + assert(rpupd->next_rpupd == NULL); + assert(rpupd->prev_rpupd == NULL); + + /* Determine what element to insert after, if not at the head. */ + prev_rpupd = rupdate.last_rpupd; + if (prev_rpupd != NULL && ep != RS_PROC_NR && + prev_rpupd->rp->r_pub->endpoint == RS_PROC_NR) + prev_rpupd = prev_rpupd->prev_rpupd; + if (prev_rpupd != NULL && ep != RS_PROC_NR && ep != VM_PROC_NR && + prev_rpupd->rp->r_pub->endpoint == VM_PROC_NR) + prev_rpupd = prev_rpupd->prev_rpupd; + + /* Perform the insertion. */ + if (prev_rpupd == NULL) { + rpupd->next_rpupd = rupdate.first_rpupd; + rupdate.first_rpupd = rupdate.curr_rpupd = rpupd; + } else { + rpupd->next_rpupd = prev_rpupd->next_rpupd; + rpupd->prev_rpupd = prev_rpupd; + prev_rpupd->next_rpupd = rpupd; } - rupdate.last_rpupd = rpupd; + + if (rpupd->next_rpupd != NULL) + rpupd->next_rpupd->prev_rpupd = rpupd; + else + rupdate.last_rpupd = rpupd; + rupdate.num_rpupds++; /* Propagate relevant flags from the new descriptor. */ lu_flags = rpupd->lu_flags & (SEF_LU_INCLUDES_VM|SEF_LU_INCLUDES_RS|SEF_LU_UNSAFE|SEF_LU_MULTI); if(lu_flags) { - RUPDATE_ITER(rupdate.first_rpupd, prev_rpupd, rpupd, - rpupd->lu_flags |= lu_flags; - rpupd->init_flags |= lu_flags; + RUPDATE_ITER(rupdate.first_rpupd, prev_rpupd, walk_rpupd, + walk_rpupd->lu_flags |= lu_flags; + walk_rpupd->init_flags |= lu_flags; ); } /* Set VM/RS update descriptor pointers. 
*/ if(!rupdate.vm_rpupd && (lu_flags & SEF_LU_INCLUDES_VM)) { - rupdate.vm_rpupd = rupdate.last_rpupd; + rupdate.vm_rpupd = rpupd; } else if(!rupdate.rs_rpupd && (lu_flags & SEF_LU_INCLUDES_RS)) { - rupdate.rs_rpupd = rupdate.last_rpupd; + rupdate.rs_rpupd = rpupd; } } @@ -419,14 +446,6 @@ int start_update_prepare(int allow_retries) if(rpupd->lu_flags & SEF_LU_NOMMAP) { rp->r_pub->sys_flags |= SF_VM_NOMMAP; } - if(!(rpupd->lu_flags & SEF_LU_UNSAFE)) { - if(rs_verbose) - printf("RS: %s pinning memory\n", srv_to_string(rp)); - vm_memctl(rp->r_pub->new_endpoint, VM_RS_MEM_PIN, 0, 0); - if(rs_verbose) - printf("RS: %s pinning memory\n", srv_to_string(new_rp)); - vm_memctl(new_rp->r_pub->endpoint, VM_RS_MEM_PIN, 0, 0); - } } } ); @@ -448,7 +467,9 @@ int start_update_prepare(int allow_retries) struct rprocupd* start_update_prepare_next() { /* Request the next service in the update chain to prepare for the update. */ - struct rprocupd *rpupd = NULL; + struct rprocupd *rpupd, *prev_rpupd, *walk_rpupd; + struct rproc *rp, *new_rp; + if(!RUPDATE_IS_UPDATING()) { rpupd = rupdate.first_rpupd; } @@ -458,6 +479,34 @@ struct rprocupd* start_update_prepare_next() if(!rpupd) { return NULL; } + + if (RUPDATE_IS_UPD_VM_MULTI() && rpupd == rupdate.vm_rpupd) { + /* We are doing a multicomponent live update that includes VM, and all + * services are now ready (and thereby stopped) except VM and possibly + * RS. This is the last point in time, and therefore also the best, that + * we can ask the (old) VM instance to do stuff for us, before we ask it + * to get ready as well: preallocate and pin memory, and copy over + * memory-mapped regions. Do this now, for all services except VM + * itself. In particular, also do it for RS, as we know that RS (yes, + * this service) is not going to create problems from here on. 
+ */ + RUPDATE_ITER(rupdate.first_rpupd, prev_rpupd, walk_rpupd, + if (UPD_IS_PREPARING_ONLY(walk_rpupd)) + continue; /* skip prepare-only processes */ + if (walk_rpupd == rupdate.vm_rpupd) + continue; /* skip VM */ + rp = walk_rpupd->rp; + new_rp = rp->r_new_rp; + assert(rp && new_rp); + if (rs_verbose) + printf("RS: preparing VM for %s -> %s\n", srv_to_string(rp), + srv_to_string(new_rp)); + /* Ask VM to prepare the new instance based on the old instance. */ + vm_prepare(rp->r_pub->new_endpoint, new_rp->r_pub->endpoint, + rp->r_pub->sys_flags); + ); + } + rupdate.flags |= RS_UPDATING; while(1) { diff --git a/minix/servers/vm/main.c b/minix/servers/vm/main.c index 6f37e676c..7488cdc2f 100644 --- a/minix/servers/vm/main.c +++ b/minix/servers/vm/main.c @@ -554,6 +554,7 @@ void init_vm(void) /* Calls from RS */ CALLMAP(VM_RS_SET_PRIV, do_rs_set_priv); + CALLMAP(VM_RS_PREPARE, do_rs_prepare); CALLMAP(VM_RS_UPDATE, do_rs_update); CALLMAP(VM_RS_MEMCTL, do_rs_memctl); diff --git a/minix/servers/vm/proto.h b/minix/servers/vm/proto.h index 546fe1734..c46eb5d06 100644 --- a/minix/servers/vm/proto.h +++ b/minix/servers/vm/proto.h @@ -49,6 +49,7 @@ int do_info(message *); int swap_proc_slot(struct vmproc *src_vmp, struct vmproc *dst_vmp); int swap_proc_dyn_data(struct vmproc *src_vmp, struct vmproc *dst_vmp, int sys_upd_flags); +int map_proc_dyn_data(struct vmproc *src_vmp, struct vmproc *dst_vmp); void adjust_proc_refs(void); int do_getrusage(message *m); @@ -192,6 +193,7 @@ void map_sanitycheck(const char *file, int line); /* rs.c */ int do_rs_set_priv(message *m); +int do_rs_prepare(message *m); int do_rs_update(message *m); int do_rs_memctl(message *m); diff --git a/minix/servers/vm/rs.c b/minix/servers/vm/rs.c index d6c2ac8be..9f7298cb5 100644 --- a/minix/servers/vm/rs.c +++ b/minix/servers/vm/rs.c @@ -66,6 +66,85 @@ int do_rs_set_priv(message *m) return OK; } +/*===========================================================================* + * do_rs_prepare * + 
*===========================================================================*/ +int do_rs_prepare(message *m_ptr) +{ + /* Prepare a new instance of a service for an upcoming live-update + * switch, based on the old instance of this service. This call is + * used only by RS and only for a multicomponent live update which + * includes VM. In this case, all processes need to be prepared such + * that they don't require the new VM instance to perform actions + * during live update that cannot be undone in the case of a rollback. + */ + endpoint_t src_e, dst_e; + int src_p, dst_p; + struct vmproc *src_vmp, *dst_vmp; + struct vir_region *src_data_vr, *dst_data_vr; + vir_bytes src_addr, dst_addr; + int sys_upd_flags; + + src_e = m_ptr->m_lsys_vm_update.src; + dst_e = m_ptr->m_lsys_vm_update.dst; + sys_upd_flags = m_ptr->m_lsys_vm_update.flags; + + /* Lookup slots for source and destination process. */ + if(vm_isokendpt(src_e, &src_p) != OK) { + printf("VM: do_rs_prepare: bad src endpoint %d\n", src_e); + return EINVAL; + } + src_vmp = &vmproc[src_p]; + if(vm_isokendpt(dst_e, &dst_p) != OK) { + printf("VM: do_rs_prepare: bad dst endpoint %d\n", dst_e); + return EINVAL; + } + dst_vmp = &vmproc[dst_p]; + + /* Pin memory for the source process. */ + map_pin_memory(src_vmp); + + /* See if the source process has a larger heap than the destination + * process. If so, extend the heap of the destination process to + * match the source's. While this may end up wasting quite some + * memory, it is absolutely essential that the destination process + * does not run out of heap memory during the live update window, + * and since most processes will be doing an identity transfer, they + * are likely to require as much heap as their previous instances. + * Better safe than sorry. TODO: prevent wasting memory somehow; + * this seems particularly relevant for RS. 
+ */ + src_data_vr = region_search(&src_vmp->vm_regions_avl, VM_MMAPBASE, + AVL_LESS); + assert(src_data_vr); + dst_data_vr = region_search(&dst_vmp->vm_regions_avl, VM_MMAPBASE, + AVL_LESS); + assert(dst_data_vr); + + src_addr = src_data_vr->vaddr + src_data_vr->length; + dst_addr = dst_data_vr->vaddr + dst_data_vr->length; + if (src_addr > dst_addr) + real_brk(dst_vmp, src_addr); + + /* Now also pin memory for the destination process. */ + map_pin_memory(dst_vmp); + + /* Finally, map the source process's memory-mapped regions into the + * destination process. This needs to happen now, because VM may not + * allocate any objects during the live update window, since this + * would prevent successful rollback of VM afterwards. The + * destination may not actually touch these regions during the live + * update window either, because they are mapped copy-on-write and a + * pagefault would also cause object allocation. Objects are pages, + * slab objects, anything in the new VM instance to which changes are + * visible in the old VM basically. + */ + if (!(sys_upd_flags & SF_VM_NOMMAP)) + map_proc_dyn_data(src_vmp, dst_vmp); + + return OK; +} + /*===========================================================================* * do_rs_update * *===========================================================================*/ diff --git a/minix/servers/vm/utility.c b/minix/servers/vm/utility.c index 8583b032c..08bc28849 100644 --- a/minix/servers/vm/utility.c +++ b/minix/servers/vm/utility.c @@ -223,12 +223,13 @@ int swap_proc_slot(struct vmproc *src_vmp, struct vmproc *dst_vmp) * Transfer memory mapped regions, using CoW sharing, from 'src_vmp' to * 'dst_vmp', for the source process's address range of 'start_addr' * (inclusive) to 'end_addr' (exclusive). Return OK or an error code. + * If the regions seem to have been transferred already, do nothing. 
*/ static int -transfer_mmap_regions(struct vmproc *dst_vmp, struct vmproc *src_vmp, +transfer_mmap_regions(struct vmproc *src_vmp, struct vmproc *dst_vmp, vir_bytes start_addr, vir_bytes end_addr) { - struct vir_region *start_vr, *end_vr; + struct vir_region *start_vr, *check_vr, *end_vr; start_vr = region_search(&src_vmp->vm_regions_avl, start_addr, AVL_GREATER_EQUAL); @@ -236,6 +237,31 @@ transfer_mmap_regions(struct vmproc *dst_vmp, struct vmproc *src_vmp, if (start_vr == NULL || start_vr->vaddr >= end_addr) return OK; /* nothing to do */ + /* In the case of multicomponent live update that includes VM, this + * function may be called for the same process more than once, for the + * sake of keeping code paths as little divergent as possible while at + * the same time ensuring that the regions are copied early enough. + * + * To compensate for these multiple calls, we perform a very simple + * check here to see if the region to transfer is already present in + * the target process. If so, we can safely skip copying the regions + * again, because there is no other possible explanation for the + * region being present already. Things would go horribly wrong if we + * tried copying anyway, but this check is not good enough to detect + * all such problems, since we do a check on the base address only. 
+ */ + check_vr = region_search(&dst_vmp->vm_regions_avl, start_vr->vaddr, + AVL_EQUAL); + if (check_vr != NULL) { +#if LU_DEBUG + printf("VM: transfer_mmap_regions: skipping transfer from " + "%d to %d (0x%lx already present)\n", + src_vmp->vm_endpoint, dst_vmp->vm_endpoint, + start_vr->vaddr); +#endif + return OK; + } + end_vr = region_search(&src_vmp->vm_regions_avl, end_addr, AVL_LESS); assert(end_vr != NULL); assert(start_vr->vaddr <= end_vr->vaddr); @@ -249,6 +275,38 @@ transfer_mmap_regions(struct vmproc *dst_vmp, struct vmproc *src_vmp, return map_proc_copy_range(dst_vmp, src_vmp, start_vr, end_vr); } +/* + * Create copy-on-write mappings in process 'dst_vmp' for all memory-mapped + * regions present in 'src_vmp'. Return OK on success, or an error otherwise. + * In the case of failure, successfully created mappings are not undone. + */ +int +map_proc_dyn_data(struct vmproc *src_vmp, struct vmproc *dst_vmp) +{ + int r; + +#if LU_DEBUG + printf("VM: mapping dynamic data from %d to %d\n", + src_vmp->vm_endpoint, dst_vmp->vm_endpoint); +#endif + + /* Transfer memory mapped regions now. To sandbox the new instance and + * prevent state corruption on rollback, we share all the regions + * between the two instances as COW. + */ + r = transfer_mmap_regions(src_vmp, dst_vmp, VM_MMAPBASE, VM_MMAPTOP); + + /* If the stack is not mapped at the VM_DATATOP, there might be some + * more regions hiding above the stack. We also have to transfer + * those. + */ + if (r == OK && VM_STACKTOP < VM_DATATOP) + r = transfer_mmap_regions(src_vmp, dst_vmp, VM_STACKTOP, + VM_DATATOP); + + return r; +} + /*===========================================================================* * swap_proc_dyn_data * *===========================================================================*/ @@ -297,22 +355,8 @@ int swap_proc_dyn_data(struct vmproc *src_vmp, struct vmproc *dst_vmp, /* Make sure regions are consistent. 
*/ assert(region_search_root(&src_vmp->vm_regions_avl) && region_search_root(&dst_vmp->vm_regions_avl)); - /* Transfer memory mapped regions now. To sandbox the new instance and - * prevent state corruption on rollback, we share all the regions - * between the two instances as COW. Source and destination are - * intentionally swapped in these calls! - */ - r = transfer_mmap_regions(src_vmp, dst_vmp, VM_MMAPBASE, VM_MMAPTOP); - - /* If the stack is not mapped at the VM_DATATOP, there might be some - * more regions hiding above the stack. We also have to transfer - * those. - */ - if (r == OK && VM_STACKTOP < VM_DATATOP) - r = transfer_mmap_regions(src_vmp, dst_vmp, VM_STACKTOP, - VM_DATATOP); - - return r; + /* Source and destination are intentionally swapped here! */ + return map_proc_dyn_data(dst_vmp, src_vmp); } void *mmap(void *addr, size_t len, int f, int f2, int f3, off_t o)