From: Ben Gras Date: Tue, 7 May 2013 12:36:09 +0000 (+0000) Subject: vm: mmap support X-Git-Tag: v3.3.0~942 X-Git-Url: http://zhaoyanbai.com/repos/%22http:/www.isc.org/icons/zlib_tech.html?a=commitdiff_plain;h=refs%2Fchanges%2F44%2F544%2F15;p=minix.git vm: mmap support . test74 for mmap functionality . vm: add a mem_file memory type that specifies an mmap()ped memory range, backed by a file . add fdref, an object that keeps track of FD references within VM per process and so knows how to de-duplicate the use of FD's by various mmap()ped ranges; there can be many more than there can be FD's . turned off for now, enable with 'filemap=1' as boot option Change-Id: I640b1126cdaa522a0560301cf6732b7661555672 --- diff --git a/distrib/sets/lists/minix/mi b/distrib/sets/lists/minix/mi index 1e502071b..d34e16667 100644 --- a/distrib/sets/lists/minix/mi +++ b/distrib/sets/lists/minix/mi @@ -4643,6 +4643,7 @@ ./usr/tests/minix-posix/test71 minix-sys ./usr/tests/minix-posix/test72 minix-sys ./usr/tests/minix-posix/test73 minix-sys +./usr/tests/minix-posix/test74 minix-sys ./usr/tests/minix-posix/test7 minix-sys ./usr/tests/minix-posix/test8 minix-sys ./usr/tests/minix-posix/test9 minix-sys diff --git a/etc/boot.cfg.default b/etc/boot.cfg.default index 2ad2cc404..23e805f05 100644 --- a/etc/boot.cfg.default +++ b/etc/boot.cfg.default @@ -4,5 +4,6 @@ default=2 menu=Start MINIX 3:load_mods /boot/minix_default/mod*;multiboot /boot/minix_default/kernel rootdevname=$rootdevname $args menu=Start latest MINIX 3:load_mods /boot/minix_latest/mod*;multiboot /boot/minix_latest/kernel rootdevname=$rootdevname $args menu=Start latest MINIX 3 in single user mode:load_mods /boot/minix_latest/mod*;multiboot /boot/minix_latest/kernel rootdevname=$rootdevname bootopts=-s $args +menu=Start latest MINIX 3 with file mmap:load_mods /boot/minix_latest/mod*;multiboot /boot/minix_latest/kernel rootdevname=$rootdevname filemap=1 $args menu=Edit menu option:edit menu=Drop to boot prompt:prompt diff --git 
a/servers/vm/Makefile b/servers/vm/Makefile index ea9a67f37..f24cf428f 100644 --- a/servers/vm/Makefile +++ b/servers/vm/Makefile @@ -6,7 +6,7 @@ SRCS= main.c alloc.c utility.c exit.c fork.c break.c \ mmap.c slaballoc.c region.c pagefaults.c \ rs.c queryexit.c pb.c regionavl.c \ mem_anon.c mem_directphys.c mem_anon_contig.c mem_shared.c \ - mem_cache.c cache.c + mem_cache.c cache.c vfs.c mem_file.c fdref.c .if ${MACHINE_ARCH} == "earm" LDFLAGS+= -T ${.CURDIR}/arch/${MACHINE_ARCH}/vm.lds diff --git a/servers/vm/arch/i386/pagetable.c b/servers/vm/arch/i386/pagetable.c index 0f6cca9ce..c9e6d8fc3 100644 --- a/servers/vm/arch/i386/pagetable.c +++ b/servers/vm/arch/i386/pagetable.c @@ -1330,10 +1330,6 @@ int pt_bind(pt_t *pt, struct vmproc *who) pdeslot * ARCH_PAGEDIR_SIZE); #endif -#if 0 - printf("VM: slot %d endpoint %d has pde val 0x%lx at kernel address 0x%lx\n", - slot, who->vm_endpoint, page_directories[slot], pdes); -#endif /* Tell kernel about new page table root. */ return sys_vmctl_set_addrspace(who->vm_endpoint, pt->pt_dir_phys, pdes); } diff --git a/servers/vm/fdref.c b/servers/vm/fdref.c new file mode 100644 index 000000000..d72c5faf0 --- /dev/null +++ b/servers/vm/fdref.c @@ -0,0 +1,177 @@ + +/* File that implements the 'fdref' data structure. It keeps track + * of how many times a particular fd (per process) is referenced by + * mmapped objects. + * + * This is used to + * - have many references to the same file, without needing an FD each + * - deciding when we have to close an FD (last reference disappears) + * + * Examples: + * - if a file-mmapped region is split, the refcount increases; there are + * now two regions referencing the same FD. We can't simply close the + * FD once either region is unmapped, as the pagefaults for the other + * would stop working. So we increase the refcount to that fd. + * - if a new file-mapped region is requested, we might find out it's the + * same dev/inode the same process already has referenced.
we could + * decide to close the new reference and use an existing one, so + * references to the same file aren't fd-limited. + * - if a file-mapped region is copied, we have to create a new + * fdref object, as the source process might disappear; we have to + * use the new process' fd for it. + */ + +#include +#include + +#include + +#include "proto.h" +#include "vm.h" +#include "fdref.h" +#include "vmproc.h" +#include "glo.h" + +static struct fdref *fdrefs; + +void fdref_sanitycheck(void) +{ + struct vmproc *vmp; + region_iter v_iter; + struct fdref *fr; + static int prevopen = 0; + int openfd = 0; + + for(fr = fdrefs; fr; fr = fr->next) { + struct fdref *fr2; + for(fr2 = fdrefs; fr2; fr2 = fr2->next) { + if(fr == fr2) continue; + if(fr->fd == fr2->fd) { + printf("equal fd omg\n"); + util_stacktrace(); + } + if(fr->ino == fr2->ino && fr->dev == fr2->dev) { + printf("equal metadata omg\n"); + util_stacktrace(); + } + } + openfd++; + } + + for(fr = fdrefs; fr; fr = fr->next) { + fr->counting = 0; + } + + for(vmp = vmproc; vmp < &vmproc[VMP_NR]; vmp++) { + struct vir_region *vr; + if(!(vmp->vm_flags & VMF_INUSE)) + continue; + region_start_iter_least(&vmp->vm_regions_avl, &v_iter); + while((vr = region_get_iter(&v_iter))) { + if(vr->def_memtype == &mem_type_mappedfile && vr->param.file.inited) { + vr->param.file.fdref->counting++; + } + region_incr_iter(&v_iter); + } + + } + + for(fr = fdrefs; fr; fr = fr->next) { + if(fr->counting != fr->refcount) { + printf("counting %d != refcount %d\n", + fr->counting, fr->refcount); + util_stacktrace(); + } + } + + if(prevopen != openfd && openfd > 100) { + printf("%d open\n", openfd); + prevopen = openfd; + } +} + +struct fdref *fdref_new(struct vmproc *owner, ino_t ino, dev_t dev, int fd) +{ + struct fdref *fdref; + + if(!SLABALLOC(fdref)) return NULL; + + fdref->fd = fd; + fdref->refcount = 0; + fdref->dev = dev; + fdref->ino = ino; + fdref->next = fdrefs; + fdrefs = fdref; + + return fdref; +} + +void fdref_ref(struct fdref 
*ref, struct vir_region *region) +{ + assert(ref); + region->param.file.fdref = ref; + ref->refcount++; +} + +void fdref_deref(struct vir_region *region) +{ + struct fdref *ref = region->param.file.fdref; + int fd; + + assert(ref); + assert(ref->refcount > 0); + + fd = ref->fd; + region->param.file.fdref = NULL; + ref->refcount--; + assert(ref->refcount >= 0); + if(ref->refcount > 0) return; + + if(fdrefs == ref) fdrefs = ref->next; + else { + struct fdref *r; + for(r = fdrefs; r->next != ref; r = r->next) + ; + assert(r); + assert(r->next == ref); + r->next = ref->next; + } + + SLABFREE(ref); + ref = NULL; + + /* If the last reference has disappeared, free the + * ref object and asynchronously close the fd in VFS. + * + * We don't need a callback as a close failing, although + * unexpected, isn't a problem and can't be handled. VFS + * will print a diagnostic. + */ + if(vfs_request(VMVFSREQ_FDCLOSE, fd, region->parent, + 0, 0, NULL, NULL, NULL, 0) != OK) { + panic("fdref_deref: could not send close request"); + } +} + +struct fdref *fdref_dedup_or_new(struct vmproc *owner, + ino_t ino, dev_t dev, int fd, int mayclose) +{ + struct fdref *fr; + + for(fr = fdrefs; fr; fr = fr->next) { + if(ino == fr->ino && dev == fr->dev) { + if(fd == fr->fd) { + return fr; + } + if(!mayclose) continue; + if(vfs_request(VMVFSREQ_FDCLOSE, fd, owner, + 0, 0, NULL, NULL, NULL, 0) != OK) { + printf("fdref_dedup_or_new: could not close\n"); + } + return fr; + } + } + + return fdref_new(owner, ino, dev, fd); +} + diff --git a/servers/vm/fdref.h b/servers/vm/fdref.h new file mode 100644 index 000000000..603baf4e7 --- /dev/null +++ b/servers/vm/fdref.h @@ -0,0 +1,29 @@ + +#ifndef _FDREF_H +#define _FDREF_H 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct fdref { + int fd; + int refcount; + dev_t dev; + ino_t ino; + struct fdref *next; + int counting; /* sanity check */ +} *fdref; + +#endif + diff 
--git a/servers/vm/glo.h b/servers/vm/glo.h index 4a5e77fc3..f4d4002b5 100644 --- a/servers/vm/glo.h +++ b/servers/vm/glo.h @@ -18,6 +18,8 @@ EXTERN struct vmproc vmproc[VMP_NR]; +long enable_filemap; + EXTERN kinfo_t kernel_boot_info; #if SANITYCHECKS diff --git a/servers/vm/main.c b/servers/vm/main.c index caf148de5..3f494033a 100644 --- a/servers/vm/main.c +++ b/servers/vm/main.c @@ -323,6 +323,9 @@ void init_vm(void) panic("couldn't get bootinfo: %d", s); } + /* Turn file mmap on? */ + env_parse("filemap", "d", 0, &enable_filemap, 0, 1); + /* Sanity check */ assert(kernel_boot_info.mmap_size > 0); assert(kernel_boot_info.mods_with_kernel > 0); @@ -414,6 +417,10 @@ void init_vm(void) CALLMAP(VM_WILLEXIT, do_willexit); CALLMAP(VM_NOTIFY_SIG, do_notify_sig); + /* Calls from VFS. */ + CALLMAP(VM_VFS_REPLY, do_vfs_reply); + CALLMAP(VM_VFS_MMAP, do_vfs_mmap); + /* Calls from RS */ CALLMAP(VM_RS_SET_PRIV, do_rs_set_priv); CALLMAP(VM_RS_UPDATE, do_rs_update); diff --git a/servers/vm/mem_cache.c b/servers/vm/mem_cache.c index b5fd758c8..7bc0bc24e 100644 --- a/servers/vm/mem_cache.c +++ b/servers/vm/mem_cache.c @@ -118,7 +118,6 @@ do_mapcache(message *msg) printf("VM: map_pf failed\n"); return ENOMEM; } - assert(!vr->param.pb_cache); } diff --git a/servers/vm/mem_file.c b/servers/vm/mem_file.c new file mode 100644 index 000000000..93b546225 --- /dev/null +++ b/servers/vm/mem_file.c @@ -0,0 +1,251 @@ + +/* This file implements the methods of memory-mapped files. */ + +#include + +#include "proto.h" +#include "vm.h" +#include "region.h" +#include "glo.h" +#include "cache.h" + +/* These functions are static so as to not pollute the + * global namespace, and are accessed through their function + * pointers. 
+ */ + +static void mappedfile_split(struct vmproc *vmp, struct vir_region *vr, + struct vir_region *r1, struct vir_region *r2); +static int mappedfile_unreference(struct phys_region *pr); +static int mappedfile_pagefault(struct vmproc *vmp, struct vir_region *region, + struct phys_region *ph, int write, vfs_callback_t callback, void *, int); +static int mappedfile_sanitycheck(struct phys_region *pr, char *file, int line); +static int mappedfile_writable(struct phys_region *pr); +static int mappedfile_copy(struct vir_region *vr, struct vir_region *newvr); +static int mappedfile_lowshrink(struct vir_region *vr, vir_bytes len); +static void mappedfile_delete(struct vir_region *region); + +struct mem_type mem_type_mappedfile = { + .name = "file-mapped memory", + .ev_unreference = mappedfile_unreference, + .ev_pagefault = mappedfile_pagefault, + .ev_sanitycheck = mappedfile_sanitycheck, + .ev_copy = mappedfile_copy, + .writable = mappedfile_writable, + .ev_split = mappedfile_split, + .ev_lowshrink = mappedfile_lowshrink, + .ev_delete = mappedfile_delete, +}; + +static int mappedfile_unreference(struct phys_region *pr) +{ + assert(pr->ph->refcount == 0); + if(pr->ph->phys != MAP_NONE) + free_mem(ABS2CLICK(pr->ph->phys), 1); + return OK; +} + +static int cow_block(struct vmproc *vmp, struct vir_region *region, + struct phys_region *ph, u16_t clearend) +{ + int r; + + if((r=mem_cow(region, ph, MAP_NONE, MAP_NONE)) != OK) { + printf("mappedfile_pagefault: COW failed\n"); + return r; + } + + /* After COW we are a normal piece of anonymous memory. 
*/ + ph->memtype = &mem_type_anon; + + if(clearend) { + phys_bytes phaddr = ph->ph->phys, po = VM_PAGE_SIZE-clearend; + assert(clearend < VM_PAGE_SIZE); + phaddr += po; + if(sys_memset(NONE, 0, phaddr, clearend) != OK) { + panic("cow_block: clearend failed\n"); + } + } + + return OK; +} + +static int mappedfile_pagefault(struct vmproc *vmp, struct vir_region *region, + struct phys_region *ph, int write, vfs_callback_t cb, + void *state, int statelen) +{ + u32_t allocflags; + int procfd = region->param.file.fdref->fd; + + allocflags = vrallocflags(region->flags); + + assert(ph->ph->refcount > 0); + assert(region->param.file.inited); + assert(region->param.file.fdref); + assert(region->param.file.fdref->dev != NO_DEV); + + /* Totally new block? Create it. */ + if(ph->ph->phys == MAP_NONE) { + struct cached_page *cp; + u64_t referenced_offset = + region->param.file.offset + ph->offset; + if(region->param.file.fdref->ino == VMC_NO_INODE) { + cp = find_cached_page_bydev(region->param.file.fdref->dev, + referenced_offset, VMC_NO_INODE, 0, 1); + } else { + cp = find_cached_page_byino(region->param.file.fdref->dev, + region->param.file.fdref->ino, referenced_offset, 1); + } + if(cp) { + int result = OK; + pb_unreferenced(region, ph, 0); + pb_link(ph, cp->page, ph->offset, region); + + if(roundup(ph->offset+region->param.file.clearend, + VM_PAGE_SIZE) >= region->length) { + result = cow_block(vmp, region, ph, + region->param.file.clearend); + } else if(result == OK && write) { + result = cow_block(vmp, region, ph, 0); + } + + return result; + } + + if(!cb) { + printf("VM: mem_file: no callback, returning EFAULT\n"); + sys_sysctl_stacktrace(vmp->vm_endpoint); + return EFAULT; + } + + if(vfs_request(VMVFSREQ_FDIO, procfd, vmp, referenced_offset, + VM_PAGE_SIZE, cb, NULL, state, statelen) != OK) { + printf("VM: mappedfile_pagefault: vfs_request failed\n"); + return ENOMEM; + } + + return SUSPEND; + } + + if(!write) { + printf("mappedfile_pagefault: nonwrite fault?\n"); + 
return EFAULT; + } + + return cow_block(vmp, region, ph, 0); +} + +static int mappedfile_sanitycheck(struct phys_region *pr, char *file, int line) +{ + MYASSERT(usedpages_add(pr->ph->phys, VM_PAGE_SIZE) == OK); + return OK; +} + +static int mappedfile_writable(struct phys_region *pr) +{ + /* We are never writable. */ + return 0; +} + +int mappedfile_copy(struct vir_region *vr, struct vir_region *newvr) +{ + assert(vr->param.file.inited); + mappedfile_setfile(newvr->parent, newvr, vr->param.file.fdref->fd, + vr->param.file.offset, + vr->param.file.fdref->dev, vr->param.file.fdref->ino, + vr->param.file.clearend, 0, 0); + assert(newvr->param.file.inited); + + return OK; +} + +int mappedfile_setfile(struct vmproc *owner, + struct vir_region *region, int fd, u64_t offset, + dev_t dev, ino_t ino, u16_t clearend, int prefill, int mayclosefd) +{ + vir_bytes vaddr; + struct fdref *newref; + + newref = fdref_dedup_or_new(owner, ino, dev, fd, mayclosefd); + + assert(newref); + assert(!region->param.file.inited); + assert(dev != NO_DEV); + fdref_ref(newref, region); + region->param.file.offset = offset; + region->param.file.clearend = clearend; + region->param.file.inited = 1; + + if(!prefill) return OK; + + for(vaddr = 0; vaddr < region->length; vaddr+=VM_PAGE_SIZE) { + struct cached_page *cp = NULL; + struct phys_region *pr; + u64_t referenced_offset = offset + vaddr; + + if(roundup(vaddr+region->param.file.clearend, + VM_PAGE_SIZE) >= region->length) { + break; + } + + if(ino == VMC_NO_INODE) { + cp = find_cached_page_bydev(dev, referenced_offset, + VMC_NO_INODE, 0, 1); + } else { + cp = find_cached_page_byino(dev, ino, + referenced_offset, 1); + } + if(!cp) continue; + if(!(pr = pb_reference(cp->page, vaddr, region, + &mem_type_mappedfile))) { + printf("mappedfile_setfile: pb_reference failed\n"); + break; + } + if(map_ph_writept(region->parent, region, pr) != OK) { + printf("mappedfile_setfile: map_ph_writept failed\n"); + break; + } + } + + return OK; +} + +static void 
mappedfile_split(struct vmproc *vmp, struct vir_region *vr, + struct vir_region *r1, struct vir_region *r2) +{ + assert(!r1->param.file.inited); + assert(!r2->param.file.inited); + assert(vr->param.file.inited); + assert(r1->length + r2->length == vr->length); + assert(vr->def_memtype == &mem_type_mappedfile); + assert(r1->def_memtype == &mem_type_mappedfile); + assert(r2->def_memtype == &mem_type_mappedfile); + + r1->param.file = vr->param.file; + r2->param.file = vr->param.file; + + fdref_ref(vr->param.file.fdref, r1); + fdref_ref(vr->param.file.fdref, r2); + + r1->param.file.clearend = 0; + r2->param.file.offset += r1->length; + + assert(r1->param.file.inited); + assert(r2->param.file.inited); +} + +static int mappedfile_lowshrink(struct vir_region *vr, vir_bytes len) +{ + assert(vr->param.file.inited); + vr->param.file.offset += len; + return OK; +} + +static void mappedfile_delete(struct vir_region *region) +{ + assert(region->def_memtype == &mem_type_mappedfile); + assert(region->param.file.inited); + assert(region->param.file.fdref); + fdref_deref(region); + region->param.file.inited = 0; +} diff --git a/servers/vm/mmap.c b/servers/vm/mmap.c index 4340f94b5..3cc3707c6 100644 --- a/servers/vm/mmap.c +++ b/servers/vm/mmap.c @@ -81,6 +81,127 @@ static struct vir_region *mmap_region(struct vmproc *vmp, vir_bytes addr, return vr; } +static int mmap_file(struct vmproc *vmp, + int vmfd, u32_t off_lo, u32_t off_hi, int flags, + ino_t ino, dev_t dev, u64_t filesize, vir_bytes addr, vir_bytes len, + vir_bytes *retaddr, u16_t clearend, int writable, int mayclosefd) +{ +/* VFS has replied to a VMVFSREQ_FDLOOKUP request. 
*/ + struct vir_region *vr; + u64_t file_offset, page_offset; + int result = OK; + u32_t vrflags = 0; + + if(writable) vrflags |= VR_WRITABLE; + + if(flags & MAP_THIRDPARTY) { + file_offset = off_lo; + } else { + file_offset = make64(off_lo, off_hi); + if(off_hi && !off_lo) { + /* XXX clang compatibility hack */ + off_hi = file_offset = 0; + } + } + + /* Do some page alignments. */ + if((page_offset = (file_offset % VM_PAGE_SIZE))) { + file_offset -= page_offset; + len += page_offset; + } + + len = roundup(len, VM_PAGE_SIZE); + + /* All numbers should be page-aligned now. */ + assert(!(len % VM_PAGE_SIZE)); + assert(!(filesize % VM_PAGE_SIZE)); + assert(!(file_offset % VM_PAGE_SIZE)); + +#if 0 + /* XXX ld.so relies on longer-than-file mapping */ + if((u64_t) len + file_offset > filesize) { + printf("VM: truncating mmap dev 0x%x ino %d beyond file size in %d; offset %llu, len %lu, size %llu; ", + dev, ino, vmp->vm_endpoint, + file_offset, len, filesize); + len = filesize - file_offset; + return EINVAL; + } +#endif + + if(!(vr = mmap_region(vmp, addr, flags, len, + vrflags, &mem_type_mappedfile, 0))) { + result = ENOMEM; + } else { + *retaddr = vr->vaddr + page_offset; + result = OK; + + mappedfile_setfile(vmp, vr, vmfd, + file_offset, dev, ino, clearend, 1, mayclosefd); + } + + return result; +} + +int do_vfs_mmap(message *m) +{ + vir_bytes v; + struct vmproc *vmp; + int r, n; + u16_t clearend, flags = 0; + + /* It might be disabled */ + if(!enable_filemap) return ENXIO; + + clearend = (m->m_u.m_vm_vfs.clearend_and_flags & MVM_LENMASK); + flags = (m->m_u.m_vm_vfs.clearend_and_flags & MVM_FLAGSMASK); + + if((r=vm_isokendpt(m->m_u.m_vm_vfs.who, &n)) != OK) + panic("bad ep %d from vfs", m->m_u.m_vm_vfs.who); + vmp = &vmproc[n]; + + return mmap_file(vmp, m->m_u.m_vm_vfs.fd, m->m_u.m_vm_vfs.offset, 0, + MAP_PRIVATE | MAP_FIXED, + m->m_u.m_vm_vfs.ino, m->m_u.m_vm_vfs.dev, + (u64_t) LONG_MAX * VM_PAGE_SIZE, + m->m_u.m_vm_vfs.vaddr, m->m_u.m_vm_vfs.len, &v, + clearend,
flags, 0); +} + +static void mmap_file_cont(struct vmproc *vmp, message *replymsg, void *cbarg, + void *origmsg_v) +{ + message *origmsg = (message *) origmsg_v; + message mmap_reply; + int result; + int writable = 0; + vir_bytes v = (vir_bytes) MAP_FAILED; + + if(origmsg->VMM_PROT & PROT_WRITE) + writable = 1; + + if(replymsg->VMV_RESULT != OK) { + printf("VM: VFS reply failed (%d)\n", replymsg->VMV_RESULT); + sys_sysctl_stacktrace(vmp->vm_endpoint); + result = origmsg->VMV_RESULT; + } else { + /* Finish mmap */ + result = mmap_file(vmp, replymsg->VMV_FD, origmsg->VMM_OFFSET_LO, + origmsg->VMM_OFFSET_HI, origmsg->VMM_FLAGS, + replymsg->VMV_INO, replymsg->VMV_DEV, + (u64_t) replymsg->VMV_SIZE_PAGES*PAGE_SIZE, + origmsg->VMM_ADDR, + origmsg->VMM_LEN, &v, 0, writable, 1); + } + + /* Unblock requesting process. */ + memset(&mmap_reply, 0, sizeof(mmap_reply)); + mmap_reply.m_type = result; + mmap_reply.VMM_ADDR = v; + + if(send(vmp->vm_endpoint, &mmap_reply) != OK) + panic("VM: mmap_file_cont: send() failed"); +} + /*===========================================================================* * do_mmap * *===========================================================================*/ @@ -111,11 +232,16 @@ int do_mmap(message *m) vmp = &vmproc[n]; + /* "SUSv3 specifies that mmap() should fail if length is 0" */ + if(len <= 0) { + return EINVAL; + } + if(m->VMM_FD == -1 || (m->VMM_FLAGS & MAP_ANON)) { /* actual memory in some form */ mem_type_t *mt = NULL; - if(m->VMM_FD != -1 || len <= 0) { + if(m->VMM_FD != -1) { printf("VM: mmap: fd %d, len 0x%x\n", m->VMM_FD, len); return EINVAL; } @@ -134,7 +260,23 @@ int do_mmap(message *m) return ENOMEM; } } else { - return ENXIO; + /* File mapping might be disabled */ + if(!enable_filemap) return ENXIO; + + /* files get private copies of pages on writes. 
*/ + if(!(m->VMM_FLAGS & MAP_PRIVATE)) { + printf("VM: mmap file must MAP_PRIVATE\n"); + return ENXIO; + } + + if(vfs_request(VMVFSREQ_FDLOOKUP, m->VMM_FD, vmp, 0, 0, + mmap_file_cont, NULL, m, sizeof(*m)) != OK) { + printf("VM: vfs_request for mmap failed\n"); + return ENXIO; + } + + /* request queued; don't reply. */ + return SUSPEND; } /* Return mapping, as seen from process. */ diff --git a/servers/vm/proto.h b/servers/vm/proto.h index 2d5018a50..f162045d2 100644 --- a/servers/vm/proto.h +++ b/servers/vm/proto.h @@ -228,5 +228,14 @@ int vfs_request(int reqno, int fd, struct vmproc *vmp, u64_t offset, int do_vfs_reply(message *m); /* mem_file.c */ -void mappedfile_setfile(struct vir_region *region, int fd, u64_t offset, - dev_t dev, ino_t ino, u16_t clearend, int prefill); +int mappedfile_setfile(struct vmproc *owner, struct vir_region *region, + int fd, u64_t offset, + dev_t dev, ino_t ino, u16_t clearend, int prefill, int mayclose); + +/* fdref.c */ +struct fdref *fdref_new(struct vmproc *owner, ino_t ino, dev_t dev, int fd); +struct fdref *fdref_dedup_or_new(struct vmproc *owner, ino_t ino, dev_t dev, + int fd, int mayclose); +void fdref_ref(struct fdref *ref, struct vir_region *region); +void fdref_deref(struct vir_region *region); +void fdref_sanitycheck(void); diff --git a/servers/vm/region.c b/servers/vm/region.c index d98caf5cb..500056b93 100644 --- a/servers/vm/region.c +++ b/servers/vm/region.c @@ -833,6 +833,8 @@ struct vir_region *map_copy_region(struct vmproc *vmp, struct vir_region *vr) if(!(newvr = region_new(vr->parent, vr->vaddr, vr->length, vr->flags, vr->def_memtype))) return NULL; + USE(newvr, newvr->parent = vmp;); + if(vr->def_memtype->ev_copy && (r=vr->def_memtype->ev_copy(vr, newvr)) != OK) { map_free(newvr); printf("VM: memtype-specific copy failed (%d)\n", r); @@ -980,7 +982,6 @@ struct vir_region *start_src_vr; map_free_proc(dst); return ENOMEM; } - USE(newvr, newvr->parent = dst;); region_insert(&dst->vm_regions_avl, newvr); 
assert(vr->length == newvr->length); diff --git a/servers/vm/region.h b/servers/vm/region.h index 9d615ef03..070c0d467 100644 --- a/servers/vm/region.h +++ b/servers/vm/region.h @@ -19,6 +19,7 @@ #include "phys_region.h" #include "memtype.h" #include "vm.h" +#include "fdref.h" struct phys_block { #if SANITYCHECKS @@ -53,11 +54,9 @@ typedef struct vir_region { } shared; struct phys_block *pb_cache; struct { - int procfd; /* cloned fd in proc for mmap */ - dev_t dev; - ino_t ino; - u64_t offset; int inited; + struct fdref *fdref; + u64_t offset; u16_t clearend; } file; } param; diff --git a/servers/vm/vfs.c b/servers/vm/vfs.c new file mode 100644 index 000000000..274c2afb6 --- /dev/null +++ b/servers/vm/vfs.c @@ -0,0 +1,144 @@ + +/* Sending requests to VFS and handling the replies. */ + +#define _SYSTEM 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "proto.h" +#include "glo.h" +#include "util.h" +#include "region.h" +#include "sanitycheck.h" + +#define STATELEN 50 + +static struct vfs_request_node { + message reqmsg; + char reqstate[STATELEN]; + void *opaque; + endpoint_t who; + u32_t req_id; + vfs_callback_t callback; + struct vfs_request_node *next; +} *first_queued, *active; + +static void activate(void) +{ + assert(!active); + assert(first_queued); + + active = first_queued; + first_queued = first_queued->next; + + if(asynsend3(VFS_PROC_NR, &active->reqmsg, AMF_NOREPLY) != OK) + panic("VM: asynsend to VFS failed"); +} + +/*===========================================================================* + * vfs_request * + *===========================================================================*/ +int vfs_request(int reqno, int fd, struct vmproc *vmp, u64_t offset, u32_t len, + vfs_callback_t reply_callback, void *cbarg, void *state, int statelen) +{ +/* Perform an asynchronous request to VFS. 
+ * We send a message of type VFS_VMCALL to VFS. VFS will respond + * with message type VM_VFS_REPLY. We send the request asynchronously + * and then handle the reply as if it were a VM_VFS_REPLY request. + */ + message *m; + static u32_t reqid = 0; + struct vfs_request_node *reqnode; + + reqid++; + + assert(statelen <= STATELEN); + + if(!SLABALLOC(reqnode)) { + printf("vfs_request: no memory for request node\n"); + return ENOMEM; + } + + m = &reqnode->reqmsg; + m->m_type = VFS_VMCALL; + m->VFS_VMCALL_REQ = reqno; + m->VFS_VMCALL_FD = fd; + m->VFS_VMCALL_REQID = reqid; + m->VFS_VMCALL_ENDPOINT = vmp->vm_endpoint; + m->VFS_VMCALL_OFFSET_LO = ex64lo(offset); + m->VFS_VMCALL_OFFSET_HI = ex64hi(offset); + m->VFS_VMCALL_LENGTH = len; + + reqnode->who = vmp->vm_endpoint; + reqnode->req_id = reqid; + reqnode->next = first_queued; + reqnode->callback = reply_callback; + reqnode->opaque = cbarg; + if(state) memcpy(reqnode->reqstate, state, statelen); + first_queued = reqnode; + + /* Send the request message if none pending. */ + if(!active) + activate(); + + return OK; +} + +/*===========================================================================* + * do_vfs_reply * + *===========================================================================*/ +int do_vfs_reply(message *m) +{ +/* VFS has handled a VM request and VFS has replied. It must be the + * active request. + */ + struct vfs_request_node *orignode = active; + vfs_callback_t req_callback; + void *cbarg; + int n; + struct vmproc *vmp; + if(m->m_source != VFS_PROC_NR) + return ENOSYS; + + assert(active); + assert(active->req_id == m->VMV_REQID); + + /* the endpoint may have exited */ + if(vm_isokendpt(m->VMV_ENDPOINT, &n) != OK) + vmp = NULL; + else vmp = &vmproc[n]; + + req_callback = active->callback; + cbarg = active->opaque; + active = NULL; + + /* Invoke requested reply-callback within VM.
*/ + if(req_callback) req_callback(vmp, m, cbarg, orignode->reqstate); + + SLABFREE(orignode); + + /* Send the next request message if any. */ + if(first_queued) + activate(); + + return SUSPEND; /* don't reply to the reply */ +} + diff --git a/test/Makefile b/test/Makefile index 7d4762dc4..87ee03dd6 100644 --- a/test/Makefile +++ b/test/Makefile @@ -34,6 +34,7 @@ OBJS.test57= test57loop.o # Cache testing programs OBJS.test71+= testcache.o OBJS.test72+= testcache.o +OBJS.test74+= testcache.o LDADD.test72+= -lminixfs PROGS += testvm @@ -47,7 +48,7 @@ MINIX_TESTS= \ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 \ 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 \ 41 42 43 44 45 46 48 49 50 52 53 54 55 56 58 59 60 \ -61 64 65 66 67 68 69 70 71 72 73 +61 64 65 66 67 68 69 70 71 72 73 74 .if ${MACHINE_ARCH} == "i386" MINIX_TESTS+= \ diff --git a/test/common.c b/test/common.c index ca6725843..d0bebb75b 100644 --- a/test/common.c +++ b/test/common.c @@ -13,8 +13,7 @@ #include "common.h" int common_test_nr = -1, errct = 0, subtest; - -int quietflag = 1; +int quietflag = 1, bigflag = 0; /* provide a default max_error symbol as Max_error with a value * of 5. The test program can override it wit its own max_error @@ -30,6 +29,11 @@ int test_nr; char buf[64]; int i; + /* if this variable is set, specify to tests we are running + * in 'overnight' mode + */ + bigflag = !!getenv(BIGVARNAME); + common_test_nr = test_nr; printf("Test %2d ", test_nr); fflush(stdout); /* since stdout is probably line buffered */ diff --git a/test/run b/test/run index 41330fdc4..0abb58b84 100755 --- a/test/run +++ b/test/run @@ -28,6 +28,11 @@ alltests=" 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 \ sh1.sh sh2.sh interp.sh" tests_no=`expr 0` +# test mmap only if enabled in sysenv +if sysenv filemap >/dev/null +then alltests="$alltests 74" +fi + # If root, make sure the setuid tests have the correct permissions # and make the dir bin-owned. 
if [ "$ROOT" ] diff --git a/test/test71.c b/test/test71.c index 94b7b4b42..954b972c8 100644 --- a/test/test71.c +++ b/test/test71.c @@ -23,32 +23,6 @@ #include "common.h" #include "testcache.h" -/* we want to flexibly split this test over multiple files - * - for big working sets we might run over the 2GB MFS file limit - * - we might want to test the FS being able to handle lots of - * files / unusual metadata situations - */ -#define MBPERFILE 100 -#define MB (1024*1024) -#define MAXFILES ((u64_t) MAXBLOCKS * MAXBLOCKSIZE / MB / MBPERFILE + 1) - -static int fds[MAXFILES]; - -static void -get_fd_offset(int b, int blocksize, u64_t *file_offset, int *fd) -{ - u64_t offset = (u64_t) b * blocksize; - int filenumber; - - filenumber = offset / MB / MBPERFILE; - - assert(filenumber >= 0 && filenumber < MAXFILES); - assert(fds[filenumber] > 0); - - *fd = fds[filenumber]; - *file_offset = offset - (filenumber * MBPERFILE * MB); -} - int dowriteblock(int b, int blocksize, u32_t seed, char *data) { @@ -86,19 +60,14 @@ void testend(void) { } int main(int argc, char *argv[]) { - int f, big = !!getenv(BIGVARNAME), iter = 2; + int iter = 2; start(71); - cachequiet(!big); - if(big) iter = 3; + cachequiet(!bigflag); + if(bigflag) iter = 3; - for(f = 0; f < MAXFILES; f++) { - char tempfilename[] = "cachetest.XXXXXXXX"; - fds[f] = mkstemp(tempfilename); - if(fds[f] < 0) { perror("mkstemp"); e(20); return 1; } - assert(fds[f] > 0); - } + makefiles(MAXFILES); /* Try various combinations working set sizes * and block sizes in order to specifically @@ -112,18 +81,13 @@ main(int argc, char *argv[]) if(dotest(PAGE_SIZE*3, 100, iter)) e(3); if(dotest(PAGE_SIZE, 20000, iter)) e(5); - if(big) { + if(bigflag) { u32_t totalmem, freemem, cachedmem; if(dotest(PAGE_SIZE, 150000, iter)) e(5); getmem(&totalmem, &freemem, &cachedmem); if(dotest(PAGE_SIZE, totalmem*1.5, iter)) e(6); } - for(f = 0; f < MAXFILES; f++) { - assert(fds[f] > 0); - close(fds[f]); - } - quit(); return 0; diff --git 
a/test/test74.c b/test/test74.c new file mode 100644 index 000000000..8ccdaaaa0 --- /dev/null +++ b/test/test74.c @@ -0,0 +1,113 @@ +/* Test 74 - mmap functionality test. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" +#include "testcache.h" + +int +dowriteblock(int b, int blocksize, u32_t seed, char *data) +{ + u64_t offset; + int fd; + + get_fd_offset(b, blocksize, &offset, &fd); + + if(pwrite(fd, data, blocksize, offset) < blocksize) { + perror("pwrite"); + return -1; + } + + return blocksize; +} + +int +readblock(int b, int blocksize, u32_t seed, char *data) +{ + u64_t offset; + int fd; + char *mmapdata; + int pread_first = random() % 2; + + get_fd_offset(b, blocksize, &offset, &fd); + + if(pread_first) { + if(pread(fd, data, blocksize, offset) < blocksize) { + perror("pread"); + return -1; + } + } + + if((mmapdata = minix_mmap(NULL, blocksize, PROT_READ, MAP_PRIVATE | MAP_FILE, + fd, offset)) == MAP_FAILED) { + perror("mmap"); + return -1; + } + + if(!pread_first) { + if(pread(fd, data, blocksize, offset) < blocksize) { + perror("pread"); + return -1; + } + } + + if(memcmp(mmapdata, data, blocksize)) { + fprintf(stderr, "readblock: mmap, pread mismatch\n"); + return -1; + } + + if(minix_munmap(mmapdata, blocksize) < 0) { + perror("munmap"); + return -1; + } + + return blocksize; +} + +void testend(void) { } + +int +main(int argc, char *argv[]) +{ + int iter = 2; + + start(74); + + makefiles(MAXFILES); + + cachequiet(!bigflag); + if(bigflag) iter = 3; + + /* Try various combinations working set sizes + * and block sizes in order to specifically + * target the primary cache, then primary+secondary + * cache, then primary+secondary cache+secondary + * cache eviction. 
+ */ + + if(dotest(PAGE_SIZE, 100, iter)) e(5); + if(dotest(PAGE_SIZE*2, 100, iter)) e(2); + if(dotest(PAGE_SIZE*3, 100, iter)) e(3); + if(dotest(PAGE_SIZE, 20000, iter)) e(5); + + if(bigflag) { + u32_t totalmem, freemem, cachedmem; + if(dotest(PAGE_SIZE, 150000, iter)) e(5); + getmem(&totalmem, &freemem, &cachedmem); + if(dotest(PAGE_SIZE, totalmem*1.5, iter)) e(6); + } + + quit(); + + return 0; +} + diff --git a/test/testcache.c b/test/testcache.c index b25141a27..e58925067 100644 --- a/test/testcache.c +++ b/test/testcache.c @@ -23,6 +23,8 @@ extern int quietflag; +int fds[MAXFILES]; + static void genblock(int b, char *blockdata, int blocksize, u32_t seed) { @@ -210,6 +212,37 @@ dotest(int blocksize, int nblocks, int iterations) return 0; } +void +get_fd_offset(int b, int blocksize, u64_t *file_offset, int *fd) +{ + u64_t offset = (u64_t) b * blocksize; + int filenumber; + + filenumber = offset / MB / MBPERFILE; + + assert(filenumber >= 0 && filenumber < MAXFILES); + assert(fds[filenumber] > 0); + + *fd = fds[filenumber]; + *file_offset = offset - (filenumber * MBPERFILE * MB); +} + +void +makefiles(int n) +{ + int f; + for(f = 0; f < n; f++) { + char tempfilename[] = "cachetest.XXXXXXXX"; + fds[f] = mkstemp(tempfilename); + if(fds[f] < 0) { + perror("mkstemp"); + fprintf(stderr, "mkstemp %d/%d failed\n", f, n); + exit(1); + } + assert(fds[f] > 0); + } +} + void cachequiet(int quiet) { quietflag = quiet; diff --git a/test/testcache.h b/test/testcache.h index de735f6ae..01ad939db 100644 --- a/test/testcache.h +++ b/test/testcache.h @@ -14,5 +14,21 @@ int readblock(int b, int blocksize, u32_t seed, char *block); void testend(void); int dotest(int blocksize, int nblocks, int iterations); void cachequiet(int quiet); +void get_fd_offset(int b, int blocksize, u64_t *file_offset, int *fd); +void makefiles(int n); #define OK_BLOCK_GONE -999 + +/* for file-oriented tests: + * + * we want to flexibly split tests over multiple files + * - for big working sets we might run 
over the 2GB MFS file limit + * - we might want to test the FS being able to handle lots of + * files / unusual metadata situations + */ +#define MBPERFILE 2000 +#define MB (1024*1024) +#define MAXFILES ((u64_t) MAXBLOCKS * MAXBLOCKSIZE / MB / MBPERFILE + 1) + +extern int fds[MAXFILES], bigflag; +