RS_SET_PRIV # 37
RS_UPDATE # 41
RS_MEMCTL # 42
- PROCCTL
+ PROCCTL # 45
+ RS_PREPARE # 48
;
io NONE; # No I/O range allowed
irq NONE; # No IRQ allowed
{ "CLEARCACHE", VM_CLEARCACHE },
{ "VFS_MMAP", VM_VFS_MMAP },
{ "VFS_REPLY", VM_VFS_REPLY },
+ { "RS_PREPARE", VM_RS_PREPARE },
{ NULL, 0 },
};
#define VM_GETRUSAGE (VM_RQ_BASE+47)
+#define VM_RS_PREPARE (VM_RQ_BASE+48)
+
/* Total. */
-#define NR_VM_CALLS 48
+#define NR_VM_CALLS 49
#define VM_CALL_MASK_SIZE BITMAP_CHUNKS(NR_VM_CALLS)
/* not handled as a normal VM call, thus at the end of the reserved range */
int vm_set_priv(endpoint_t ep, void *buf, int sys_proc);
int vm_update(endpoint_t src_e, endpoint_t dst_e, int flags);
int vm_memctl(endpoint_t ep, int req, void** addr, size_t *len);
+int vm_prepare(endpoint_t src_e, endpoint_t dst_e, int flags);
int vm_query_exit(endpoint_t *endpt);
int vm_watch_exit(endpoint_t ep);
int minix_vfs_mmap(endpoint_t who, off_t offset, size_t len,
vm_map_phys.c \
vm_memctl.c \
vm_notify_sig.c \
+ vm_prepare.c \
vm_procctl.c \
vm_query_exit.c \
vm_set_priv.c \
--- /dev/null
+#include "syslib.h"
+
+#include <unistd.h>
+#include <string.h>
+
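+/*
+ * Ask VM to prepare the new instance 'dst_e' of a service for an upcoming
+ * live-update switch, based on the old instance 'src_e'.  Used by RS only,
+ * and only for a multicomponent live update that includes VM.  The message
+ * layout is shared with vm_update(), hence m_lsys_vm_update.
+ */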
+int
+vm_prepare(endpoint_t src_e, endpoint_t dst_e, int flags)
+{
+ message m;
+
+ memset(&m, 0, sizeof(m));
+ m.m_lsys_vm_update.src = src_e;
+ m.m_lsys_vm_update.dst = dst_e;
+ m.m_lsys_vm_update.flags = flags;
+
+ return _taskcall(VM_PROC_NR, VM_RS_PREPARE, &m);
+}
#define RUPDATE_INIT() memset(&rupdate, 0, sizeof(rupdate))
#define RUPDATE_CLEAR() RUPDATE_INIT()
+/* Note that 'B' comes last in the loop body, so that a 'continue' in 'B' still runs the for-loop update expressions */
#define RUPDATE_ITER(HEAD, RPUPD_PREV, RPUPD, B) do { \
- RPUPD = HEAD; \
- RPUPD_PREV = NULL; \
- while(RPUPD) { \
+ for(RPUPD = HEAD, RPUPD_PREV = NULL; RPUPD != NULL; \
+ RPUPD_PREV = RPUPD, RPUPD = RPUPD->next_rpupd) { \
B \
- RPUPD_PREV = RPUPD; \
- RPUPD = RPUPD->next_rpupd; \
} \
} while(0)
#define RUPDATE_REV_ITER(TAIL, RPUPD_PREV, RPUPD, B) do { \
- RPUPD = TAIL; \
- while(RPUPD) { \
+ for(RPUPD = TAIL; RPUPD != NULL; RPUPD = RPUPD->prev_rpupd) { \
RPUPD_PREV = RPUPD->prev_rpupd; \
B \
- RPUPD = RPUPD->prev_rpupd; \
} \
} while(0)
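With 'B' expanding at the end of the loop body, a 'continue' inside 'B' now falls through to the for-loop update expressions; in the old while-loop form it would have skipped the manual advance and looped forever. A minimal usage sketch (the flag chosen here is hypothetical; the iterator calls elsewhere in this patch follow the same pattern):

    struct rprocupd *prev_rpupd, *rpupd;

    RUPDATE_ITER(rupdate.first_rpupd, prev_rpupd, rpupd,
        if (UPD_IS_PREPARING_ONLY(rpupd))
            continue;   /* safe: the loop update expressions still run */
        rpupd->lu_flags |= SEF_LU_MULTI;        /* hypothetical action */
    );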
printf("RS: the specified process is already part of the currently scheduled update\n");
return EINVAL;
}
- if(rupdate.last_rpupd->rp->r_pub->endpoint == RS_PROC_NR) {
- printf("RS: RS should always be the last service to update in a multi-component update\n");
- return EINVAL;
- }
}
/* Prepare-only update for VM, PM, and VFS is only supported with an unreachable state. */
void rupdate_add_upd(struct rprocupd* rpupd)
{
/* Add an update descriptor to the update chain. */
- struct rprocupd* prev_rpupd;
+ struct rprocupd *prev_rpupd, *walk_rpupd;
+ endpoint_t ep;
int lu_flags;
- rpupd->prev_rpupd = rupdate.last_rpupd;
- if(rupdate.num_rpupds == 0) {
- rupdate.first_rpupd = rpupd;
- rupdate.curr_rpupd = rpupd;
- }
- else {
- rupdate.last_rpupd->next_rpupd = rpupd;
+ /* In order to allow multicomponent-with-VM live updates to be processed
+ * correctly, we perform partial sorting on the chain: RS is to be last (if
+ * present), VM is to be right before it (if present), and all the other
+ * processes are to be at the start of the chain.
+ */
+
+ ep = rpupd->rp->r_pub->endpoint;
+
+ assert(rpupd->next_rpupd == NULL);
+ assert(rpupd->prev_rpupd == NULL);
+
+ /* Determine what element to insert after, if not at the head. */
+ prev_rpupd = rupdate.last_rpupd;
+ if (prev_rpupd != NULL && ep != RS_PROC_NR &&
+ prev_rpupd->rp->r_pub->endpoint == RS_PROC_NR)
+ prev_rpupd = prev_rpupd->prev_rpupd;
+ if (prev_rpupd != NULL && ep != RS_PROC_NR && ep != VM_PROC_NR &&
+ prev_rpupd->rp->r_pub->endpoint == VM_PROC_NR)
+ prev_rpupd = prev_rpupd->prev_rpupd;
+
+ /* Perform the insertion. */
+ if (prev_rpupd == NULL) {
+ rpupd->next_rpupd = rupdate.first_rpupd;
+ rupdate.first_rpupd = rupdate.curr_rpupd = rpupd;
+ } else {
+ rpupd->next_rpupd = prev_rpupd->next_rpupd;
+ rpupd->prev_rpupd = prev_rpupd;
+ prev_rpupd->next_rpupd = rpupd;
}
- rupdate.last_rpupd = rpupd;
+
+ if (rpupd->next_rpupd != NULL)
+ rpupd->next_rpupd->prev_rpupd = rpupd;
+ else
+ rupdate.last_rpupd = rpupd;
+
rupdate.num_rpupds++;
/* Propagate relevant flags from the new descriptor. */
lu_flags = rpupd->lu_flags & (SEF_LU_INCLUDES_VM|SEF_LU_INCLUDES_RS|SEF_LU_UNSAFE|SEF_LU_MULTI);
if(lu_flags) {
- RUPDATE_ITER(rupdate.first_rpupd, prev_rpupd, rpupd,
- rpupd->lu_flags |= lu_flags;
- rpupd->init_flags |= lu_flags;
+ RUPDATE_ITER(rupdate.first_rpupd, prev_rpupd, walk_rpupd,
+ walk_rpupd->lu_flags |= lu_flags;
+ walk_rpupd->init_flags |= lu_flags;
);
}
/* Set VM/RS update descriptor pointers. */
if(!rupdate.vm_rpupd && (lu_flags & SEF_LU_INCLUDES_VM)) {
- rupdate.vm_rpupd = rupdate.last_rpupd;
+ rupdate.vm_rpupd = rpupd;
}
else if(!rupdate.rs_rpupd && (lu_flags & SEF_LU_INCLUDES_RS)) {
- rupdate.rs_rpupd = rupdate.last_rpupd;
+ rupdate.rs_rpupd = rpupd;
}
}
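As a worked example of the partial sort (submission order chosen for illustration): adding descriptors for VFS, then RS, then VM yields the chain below. RS stays last, and VM lands right before it, because the first step-back check matches when VM is inserted:

    add(VFS):  VFS
    add(RS):   VFS -> RS
    add(VM):   VFS -> VM -> RS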
if(rpupd->lu_flags & SEF_LU_NOMMAP) {
rp->r_pub->sys_flags |= SF_VM_NOMMAP;
}
- if(!(rpupd->lu_flags & SEF_LU_UNSAFE)) {
- if(rs_verbose)
- printf("RS: %s pinning memory\n", srv_to_string(rp));
- vm_memctl(rp->r_pub->new_endpoint, VM_RS_MEM_PIN, 0, 0);
- if(rs_verbose)
- printf("RS: %s pinning memory\n", srv_to_string(new_rp));
- vm_memctl(new_rp->r_pub->endpoint, VM_RS_MEM_PIN, 0, 0);
- }
}
}
);
struct rprocupd* start_update_prepare_next()
{
/* Request the next service in the update chain to prepare for the update. */
- struct rprocupd *rpupd = NULL;
+ struct rprocupd *rpupd = NULL, *prev_rpupd, *walk_rpupd;
+ struct rproc *rp, *new_rp;
+
if(!RUPDATE_IS_UPDATING()) {
rpupd = rupdate.first_rpupd;
}
if(!rpupd) {
return NULL;
}
+
+ if (RUPDATE_IS_UPD_VM_MULTI() && rpupd == rupdate.vm_rpupd) {
+ /* We are doing a multicomponent live update that includes VM, and all
+ * services are now ready (and thereby stopped) except VM and possibly
+ * RS. This is the last point in time, and therefore also the best, at
+ * which we can ask the (old) VM instance to do work for us, before we
+ * ask it to get ready as well: preallocate and pin memory, and copy
+ * over memory-mapped regions. Do this now, for all services except VM
+ * itself. In particular, also do it for RS, as we know that RS (yes,
+ * this service) is not going to create problems from here on.
+ */
+ RUPDATE_ITER(rupdate.first_rpupd, prev_rpupd, walk_rpupd,
+ if (UPD_IS_PREPARING_ONLY(walk_rpupd))
+ continue; /* skip prepare-only processes */
+ if (walk_rpupd == rupdate.vm_rpupd)
+ continue; /* skip VM */
+ rp = walk_rpupd->rp;
+ assert(rp != NULL);
+ new_rp = rp->r_new_rp;
+ assert(new_rp != NULL);
+ if (rs_verbose)
+ printf("RS: preparing VM for %s -> %s\n", srv_to_string(rp),
+ srv_to_string(new_rp));
+ /* Ask VM to prepare the new instance based on the old instance. */
+ vm_prepare(rp->r_pub->new_endpoint, new_rp->r_pub->endpoint,
+ rp->r_pub->sys_flags);
+ );
+ }
+
rupdate.flags |= RS_UPDATING;
while(1) {
/* Calls from RS */
CALLMAP(VM_RS_SET_PRIV, do_rs_set_priv);
+ CALLMAP(VM_RS_PREPARE, do_rs_prepare);
CALLMAP(VM_RS_UPDATE, do_rs_update);
CALLMAP(VM_RS_MEMCTL, do_rs_memctl);
int swap_proc_slot(struct vmproc *src_vmp, struct vmproc *dst_vmp);
int swap_proc_dyn_data(struct vmproc *src_vmp, struct vmproc *dst_vmp,
int sys_upd_flags);
+int map_proc_dyn_data(struct vmproc *src_vmp, struct vmproc *dst_vmp);
void adjust_proc_refs(void);
int do_getrusage(message *m);
/* rs.c */
int do_rs_set_priv(message *m);
+int do_rs_prepare(message *m);
int do_rs_update(message *m);
int do_rs_memctl(message *m);
return OK;
}
+/*===========================================================================*
+ * do_rs_prepare *
+ *===========================================================================*/
+int do_rs_prepare(message *m_ptr)
+{
+ /* Prepare a new instance of a service for an upcoming live-update
+ * switch, based on the old instance of this service. This call is
+ * used only by RS and only for a multicomponent live update which
+ * includes VM. In this case, all processes need to be prepared such
+ * that they don't require the new VM instance to perform actions
+ * during live update that cannot be undone in the case of a rollback.
+ */
+ endpoint_t src_e, dst_e;
+ int src_p, dst_p;
+ struct vmproc *src_vmp, *dst_vmp;
+ struct vir_region *src_data_vr, *dst_data_vr;
+ vir_bytes src_addr, dst_addr;
+ int sys_upd_flags;
+
+ src_e = m_ptr->m_lsys_vm_update.src;
+ dst_e = m_ptr->m_lsys_vm_update.dst;
+ sys_upd_flags = m_ptr->m_lsys_vm_update.flags;
+
+ /* Lookup slots for source and destination process. */
+ if(vm_isokendpt(src_e, &src_p) != OK) {
+ printf("VM: do_rs_prepare: bad src endpoint %d\n", src_e);
+ return EINVAL;
+ }
+ src_vmp = &vmproc[src_p];
+ if(vm_isokendpt(dst_e, &dst_p) != OK) {
+ printf("VM: do_rs_prepare: bad dst endpoint %d\n", dst_e);
+ return EINVAL;
+ }
+ dst_vmp = &vmproc[dst_p];
+
+ /* Pin memory for the source process. */
+ map_pin_memory(src_vmp);
+
+ /* See if the source process has a larger heap than the destination
+ * process. If so, extend the heap of the destination process to
+ * match the source's. While this may end up wasting quite some
+ * memory, it is absolutely essential that the destination process
+ * does not run out of heap memory during the live update window,
+ * and since most processes will be doing an identity transfer, they
+ * are likely to require as much heap as their previous instances.
+ * Better safe than sorry. TODO: prevent wasting memory somehow;
+ * this seems particularly relevant for RS.
+ */
+ src_data_vr = region_search(&src_vmp->vm_regions_avl, VM_MMAPBASE,
+ AVL_LESS);
+ assert(src_data_vr);
+ dst_data_vr = region_search(&dst_vmp->vm_regions_avl, VM_MMAPBASE,
+ AVL_LESS);
+ assert(dst_data_vr);
+
+ src_addr = src_data_vr->vaddr + src_data_vr->length;
+ dst_addr = dst_data_vr->vaddr + dst_data_vr->length;
+ if (src_addr > dst_addr)
+ real_brk(dst_vmp, src_addr);
+
+ /* Now also pin memory for the destination process. */
+ map_pin_memory(dst_vmp);
+
+ /* Finally, map the source process's memory-mapped regions into the
+ * destination process. This needs to happen now, because VM may not
+ * allocate any objects during the live update window, since this
+ * would prevent successful rollback of VM afterwards. The
+ * destination may not actually touch these regions during the live
+ * update window either, because they are mapped copy-on-write and a
+ * pagefault would also cause object allocation. Here, "objects" means
+ * pages, slab objects, and in general anything in the new VM instance
+ * to which changes would be visible in the old VM instance.
+ */
+ if (!(sys_upd_flags & SF_VM_NOMMAP))
+ map_proc_dyn_data(src_vmp, dst_vmp);
+
+ return OK;
+}
+
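The heap-matching step above compares the end addresses of the two instances' data regions (numbers purely illustrative): if the old instance's heap ends at 0x1c000 and the new one's at 0x14000, real_brk() grows the new instance's heap up to 0x1c000, so that it cannot run out of heap memory during the live update window.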
/*===========================================================================*
* do_rs_update *
*===========================================================================*/
* Transfer memory mapped regions, using CoW sharing, from 'src_vmp' to
* 'dst_vmp', for the source process's address range of 'start_addr'
* (inclusive) to 'end_addr' (exclusive). Return OK or an error code.
+ * If the regions seem to have been transferred already, do nothing.
*/
static int
-transfer_mmap_regions(struct vmproc *dst_vmp, struct vmproc *src_vmp,
+transfer_mmap_regions(struct vmproc *src_vmp, struct vmproc *dst_vmp,
vir_bytes start_addr, vir_bytes end_addr)
{
- struct vir_region *start_vr, *end_vr;
+ struct vir_region *start_vr, *check_vr, *end_vr;
start_vr = region_search(&src_vmp->vm_regions_avl, start_addr,
AVL_GREATER_EQUAL);
if (start_vr == NULL || start_vr->vaddr >= end_addr)
return OK; /* nothing to do */
+ /* In the case of a multicomponent live update that includes VM, this
+ * function may be called more than once for the same process: this keeps
+ * the code paths as uniform as possible while still ensuring that the
+ * regions are copied early enough.
+ *
+ * To compensate for these multiple calls, we perform a very simple
+ * check here to see if the region to transfer is already present in
+ * the target process. If so, we can safely skip copying the regions
+ * again, because there is no other possible explanation for the
+ * region being present already. Things would go horribly wrong if we
+ * tried copying anyway, but this check is not good enough to detect
+ * all such problems, since we check only the base address.
+ */
+ check_vr = region_search(&dst_vmp->vm_regions_avl, start_vr->vaddr,
+ AVL_EQUAL);
+ if (check_vr != NULL) {
+#if LU_DEBUG
+ printf("VM: transfer_mmap_regions: skipping transfer from "
+ "%d to %d (0x%lx already present)\n",
+ src_vmp->vm_endpoint, dst_vmp->vm_endpoint,
+ start_vr->vaddr);
+#endif
+ return OK;
+ }
+
end_vr = region_search(&src_vmp->vm_regions_avl, end_addr, AVL_LESS);
assert(end_vr != NULL);
assert(start_vr->vaddr <= end_vr->vaddr);
return map_proc_copy_range(dst_vmp, src_vmp, start_vr, end_vr);
}
+/*
+ * Create copy-on-write mappings in process 'dst_vmp' for all memory-mapped
+ * regions present in 'src_vmp'. Return OK on success, or an error otherwise.
+ * In the case of failure, successfully created mappings are not undone.
+ */
+int
+map_proc_dyn_data(struct vmproc *src_vmp, struct vmproc *dst_vmp)
+{
+ int r;
+
+#if LU_DEBUG
+ printf("VM: mapping dynamic data from %d to %d\n",
+ src_vmp->vm_endpoint, dst_vmp->vm_endpoint);
+#endif
+
+ /* Transfer memory mapped regions now. To sandbox the new instance and
+ * prevent state corruption on rollback, we share all the regions
+ * between the two instances as COW.
+ */
+ r = transfer_mmap_regions(src_vmp, dst_vmp, VM_MMAPBASE, VM_MMAPTOP);
+
+ /* If the stack is not mapped at the VM_DATATOP, there might be some
+ * more regions hiding above the stack. We also have to transfer
+ * those.
+ */
+ if (r == OK && VM_STACKTOP < VM_DATATOP)
+ r = transfer_mmap_regions(src_vmp, dst_vmp, VM_STACKTOP,
+ VM_DATATOP);
+
+ return r;
+}
+
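Note on call sites: do_rs_prepare() invokes map_proc_dyn_data() with source and destination in their natural order, while swap_proc_dyn_data() below passes them deliberately swapped, presumably because the two processes' slots have already been exchanged by swap_proc_slot() by that point.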
/*===========================================================================*
* swap_proc_dyn_data *
*===========================================================================*/
/* Make sure regions are consistent. */
assert(region_search_root(&src_vmp->vm_regions_avl) && region_search_root(&dst_vmp->vm_regions_avl));
- /* Transfer memory mapped regions now. To sandbox the new instance and
- * prevent state corruption on rollback, we share all the regions
- * between the two instances as COW. Source and destination are
- * intentionally swapped in these calls!
- */
- r = transfer_mmap_regions(src_vmp, dst_vmp, VM_MMAPBASE, VM_MMAPTOP);
-
- /* If the stack is not mapped at the VM_DATATOP, there might be some
- * more regions hiding above the stack. We also have to transfer
- * those.
- */
- if (r == OK && VM_STACKTOP < VM_DATATOP)
- r = transfer_mmap_regions(src_vmp, dst_vmp, VM_STACKTOP,
- VM_DATATOP);
-
- return r;
+ /* Source and destination are intentionally swapped here! */
+ return map_proc_dyn_data(dst_vmp, src_vmp);
}
void *mmap(void *addr, size_t len, int f, int f2, int f3, off_t o)