From: David van Moolenbroek Date: Sat, 15 Nov 2014 10:14:00 +0000 (+0000) Subject: libfsdriver: support mmap on FSes with no device X-Git-Url: http://zhaoyanbai.com/repos/man.dnssec-verify.html?a=commitdiff_plain;h=e321f6558257ab50a8b750451c96d1a3ca19e300;p=minix.git libfsdriver: support mmap on FSes with no device This patch adds (very limited) support for memory-mapping pages on file systems that are mounted on the special "none" device and that do not implement PEEK support by themselves. This includes hgfs, vbfs, and procfs. The solution is implemented in libfsdriver, and consists of allocating pages, filling them with content by calling the file system's READ functionality, passing the pages to VM, and freeing them again. A new VM flag is used to indicate that these pages should be mapped in only once, and thus not cached beyond their single use. This prevents stale data from getting mapped in without the involvement of the file system, which would be problematic on file systems where file contents may become outdated at any time. No VM caching means no sharing and poor performance, but mmap no longer fails on these file systems. Compared to a libc-based approach, this patch retains the on-demand nature of mmap. Especially tail(1) is known to map in a large file area only to use a small portion of it. All file systems now need to be given permission for the SETCACHEPAGE and CLEARCACHE calls to VM. A very basic regression test is added to test74. 
Change-Id: I17afc4cb97315b515cad1542521b98f293b6b559 --- diff --git a/etc/system.conf b/etc/system.conf index bc1962dd9..6f09d66f4 100644 --- a/etc/system.conf +++ b/etc/system.conf @@ -385,6 +385,8 @@ service procfs ; vm INFO + SETCACHEPAGE + CLEARCACHE ; uid 0; }; @@ -402,6 +404,10 @@ service hgfs ipc SYSTEM pm vfs rs vm ; + vm + SETCACHEPAGE + CLEARCACHE + ; }; service vbfs @@ -409,6 +415,10 @@ service vbfs ipc SYSTEM pm vfs rs ds vm vbox ; + vm + SETCACHEPAGE + CLEARCACHE + ; }; service printer @@ -554,6 +564,10 @@ service hello service devman { uid 0; + vm + SETCACHEPAGE + CLEARCACHE + ; }; service mmc @@ -589,6 +603,10 @@ service gpio IRQCTL # 19 PADCONF # 57 ; + vm + SETCACHEPAGE + CLEARCACHE + ; irq 29 # GPIO module 1 (dm37xx) 30 # GPIO module 2 (dm37xx) diff --git a/minix/include/minix/vm.h b/minix/include/minix/vm.h index 6dd94cb0f..9775d7133 100644 --- a/minix/include/minix/vm.h +++ b/minix/include/minix/vm.h @@ -71,7 +71,8 @@ int vm_procctl_clear(endpoint_t ep); int vm_procctl_handlemem(endpoint_t ep, vir_bytes m1, vir_bytes m2, int wr); int vm_set_cacheblock(void *block, dev_t dev, off_t dev_offset, - ino_t ino, off_t ino_offset, u32_t *flags, int blocksize); + ino_t ino, off_t ino_offset, u32_t *flags, int blocksize, + int setflags); void *vm_map_cacheblock(dev_t dev, off_t dev_offset, ino_t ino, off_t ino_offset, u32_t *flags, int blocksize); @@ -87,5 +88,8 @@ int vm_clear_cache(dev_t dev); /* special inode number for vm cache functions */ #define VMC_NO_INODE 0 /* to reference a disk block, no associated file */ +/* setflags for vm_set_cacheblock, also used internally in VM */ +#define VMSF_ONCE 0x01 /* discard block after one-time use */ + #endif /* _MINIX_VM_H */ diff --git a/minix/lib/libfsdriver/call.c b/minix/lib/libfsdriver/call.c index cb40015b6..1f6189da1 100644 --- a/minix/lib/libfsdriver/call.c +++ b/minix/lib/libfsdriver/call.c @@ -1,6 +1,7 @@ #include "fsdriver.h" #include +#include /* * Process a READSUPER request from VFS. 
@@ -43,7 +44,8 @@ fsdriver_readsuper(const struct fsdriver * __restrict fdp, if (r == OK) { /* This one we can set on the file system's behalf. */ - if (fdp->fdr_peek != NULL && fdp->fdr_bpeek != NULL) + if ((fdp->fdr_peek != NULL && fdp->fdr_bpeek != NULL) || + major(dev) == NONE_MAJOR) res_flags |= RES_HASPEEK; m_out->m_fs_vfs_readsuper.inode = root_node.fn_ino_nr; @@ -74,6 +76,10 @@ fsdriver_unmount(const struct fsdriver * __restrict fdp, if (fdp->fdr_unmount != NULL) fdp->fdr_unmount(); + /* If we used mmap emulation, clear any cached blocks from VM. */ + if (fdp->fdr_peek == NULL && major(fsdriver_device) == NONE_MAJOR) + vm_clear_cache(fsdriver_device); + /* Update library-local state. */ fsdriver_mounted = FALSE; @@ -206,6 +212,61 @@ fsdriver_write(const struct fsdriver * __restrict fdp, return read_write(fdp, m_in, m_out, FSC_WRITE); } +/* + * A read-based peek implementation. This allows file systems that do not have + * a buffer cache and do not implement peek, to support a limited form of mmap. + * We map in a block, fill it by calling the file system's read function, tell + * VM about the page, and then unmap the block again. We tell VM not to cache + * the block beyond its immediate use for the mmap request, so as to prevent + * potentially stale data from being cached--at the cost of performance. 
+ */ +static ssize_t +builtin_peek(const struct fsdriver * __restrict fdp, ino_t ino_nr, + size_t nbytes, off_t pos) +{ + static u32_t flags = 0; /* storage for the VMMC_ flags of all blocks */ + static off_t dev_off = 0; /* fake device offset, see below */ + struct fsdriver_data data; + char *buf; + ssize_t r; + + if ((buf = mmap(NULL, nbytes, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE, -1, 0)) == MAP_FAILED) + return ENOMEM; + + data.endpt = SELF; + data.grant = (cp_grant_id_t)buf; + data.size = nbytes; + + r = fdp->fdr_read(ino_nr, &data, nbytes, pos, FSC_READ); + + if (r >= 0) { + if ((size_t)r < nbytes) + memset(&buf[r], 0, nbytes - r); + + /* + * VM uses serialized communication to VFS. Since the page is + * to be used only once, VM will use and then discard it before + * sending a new peek request. Thus, it should be safe to + * reuse the same device offset all the time. However, relying + * on assumptions in protocols elsewhere is a bit dangerous, so we + * use an ever-increasing device offset just to be safe. + */ + r = vm_set_cacheblock(buf, fsdriver_device, dev_off, ino_nr, + pos, &flags, nbytes, VMSF_ONCE); + + if (r == OK) { + dev_off += nbytes; + + r = nbytes; + } + } + + munmap(buf, nbytes); + + return r; +} + /* * Process a PEEK request from VFS. */ @@ -222,13 +283,22 @@ fsdriver_peek(const struct fsdriver * __restrict fdp, pos = m_in->m_vfs_fs_readwrite.seek_pos; nbytes = m_in->m_vfs_fs_readwrite.nbytes; - if (fdp->fdr_peek == NULL) - return ENOSYS; - if (pos < 0 || nbytes > SSIZE_MAX) return EINVAL; - r = fdp->fdr_peek(ino_nr, NULL /*data*/, nbytes, pos, FSC_PEEK); + if (fdp->fdr_peek == NULL) { + if (major(fsdriver_device) != NONE_MAJOR) + return ENOSYS; + + /* + * For file systems that have no backing device, emulate peek + * support by reading into temporary buffers and passing these + * to VM. 
+ */ + r = builtin_peek(fdp, ino_nr, nbytes, pos); + } else + r = fdp->fdr_peek(ino_nr, NULL /*data*/, nbytes, pos, + FSC_PEEK); /* Do not return a new position. */ if (r >= 0) { diff --git a/minix/lib/libminixfs/cache.c b/minix/lib/libminixfs/cache.c index 697b360a9..408c26175 100644 --- a/minix/lib/libminixfs/cache.c +++ b/minix/lib/libminixfs/cache.c @@ -469,7 +469,7 @@ void lmfs_put_block( if(vmcache && bp->lmfs_needsetcache && dev != NO_DEV) { if((r=vm_set_cacheblock(bp->data, dev, dev_off, bp->lmfs_inode, bp->lmfs_inode_offset, - &bp->lmfs_flags, fs_block_size)) != OK) { + &bp->lmfs_flags, fs_block_size, 0)) != OK) { if(r == ENOSYS) { printf("libminixfs: ENOSYS, disabling VM calls\n"); vmcache = 0; diff --git a/minix/lib/libsys/vm_cache.c b/minix/lib/libsys/vm_cache.c index c94c1e2f8..be31981c2 100644 --- a/minix/lib/libsys/vm_cache.c +++ b/minix/lib/libsys/vm_cache.c @@ -14,7 +14,7 @@ static int vm_cachecall(message *m, int call, void *addr, dev_t dev, off_t dev_offset, ino_t ino, off_t ino_offset, u32_t *flags, - int blocksize) + int blocksize, int setflags) { if(blocksize % PAGE_SIZE) panic("blocksize %d should be a multiple of pagesize %d\n", @@ -39,7 +39,7 @@ static int vm_cachecall(message *m, int call, void *addr, dev_t dev, m->m_vmmcp.flags_ptr = flags; m->m_vmmcp.dev = dev; m->m_vmmcp.pages = blocksize / PAGE_SIZE; - m->m_vmmcp.flags = 0; + m->m_vmmcp.flags = setflags; return _taskcall(VM_PROC_NR, call, m); } @@ -50,19 +50,19 @@ void *vm_map_cacheblock(dev_t dev, off_t dev_offset, message m; if(vm_cachecall(&m, VM_MAPCACHEPAGE, NULL, dev, dev_offset, - ino, ino_offset, flags, blocksize) != OK) + ino, ino_offset, flags, blocksize, 0) != OK) return MAP_FAILED; return m.m_vmmcp_reply.addr; } int vm_set_cacheblock(void *block, dev_t dev, off_t dev_offset, - ino_t ino, off_t ino_offset, u32_t *flags, int blocksize) + ino_t ino, off_t ino_offset, u32_t *flags, int blocksize, int setflags) { message m; return vm_cachecall(&m, VM_SETCACHEPAGE, block, dev, 
dev_offset, - ino, ino_offset, flags, blocksize); + ino, ino_offset, flags, blocksize, setflags); } int diff --git a/minix/servers/vm/cache.c b/minix/servers/vm/cache.c index 6935357d0..ecd7529ee 100644 --- a/minix/servers/vm/cache.c +++ b/minix/servers/vm/cache.c @@ -213,7 +213,8 @@ struct cached_page *find_cached_page_byino(dev_t dev, ino_t ino, u64_t ino_off, return NULL; } -int addcache(dev_t dev, u64_t dev_off, ino_t ino, u64_t ino_off, struct phys_block *pb) +int addcache(dev_t dev, u64_t dev_off, ino_t ino, u64_t ino_off, int flags, + struct phys_block *pb) { int hv_dev; struct cached_page *hb; @@ -237,6 +238,7 @@ int addcache(dev_t dev, u64_t dev_off, ino_t ino, u64_t ino_off, struct phys_blo hb->dev_offset = dev_off; hb->ino = ino; hb->ino_offset = ino_off; + hb->flags = flags & VMSF_ONCE; hb->page = pb; hb->page->refcount++; /* block also referenced by cache now */ hb->page->flags |= PBF_INCACHE; diff --git a/minix/servers/vm/cache.h b/minix/servers/vm/cache.h index 581349af8..0b5301d18 100644 --- a/minix/servers/vm/cache.h +++ b/minix/servers/vm/cache.h @@ -12,6 +12,7 @@ struct cached_page { ino_t ino; /* which ino is it about */ u64_t ino_offset; /* offset within ino */ + int flags; /* currently only VMSF_ONCE or 0 */ struct phys_block *page; /* page ptr */ struct cached_page *older; /* older in lru chain */ struct cached_page *newer; /* newer in lru chain */ diff --git a/minix/servers/vm/mem_cache.c b/minix/servers/vm/mem_cache.c index b61ac939e..6b1ace42a 100644 --- a/minix/servers/vm/mem_cache.c +++ b/minix/servers/vm/mem_cache.c @@ -175,6 +175,7 @@ do_setcache(message *msg) dev_t dev = msg->m_vmmcp.dev; off_t dev_off = msg->m_vmmcp.dev_offset; off_t ino_off = msg->m_vmmcp.ino_offset; + int flags = msg->m_vmmcp.flags; int n; struct vmproc *caller; phys_bytes offset; @@ -209,7 +210,8 @@ do_setcache(message *msg) if((hb=find_cached_page_bydev(dev, dev_off + offset, msg->m_vmmcp.ino, ino_off + offset, 1))) { /* block inode info updated */ - if(hb->page 
!= phys_region->ph) { + if(hb->page != phys_region->ph || + (hb->flags & VMSF_ONCE)) { /* previous cache entry has become * obsolete; make a new one. rmcache * removes it from the cache and frees @@ -236,8 +238,8 @@ do_setcache(message *msg) phys_region->memtype = &mem_type_cache; - if((r=addcache(dev, dev_off + offset, - msg->m_vmmcp.ino, ino_off + offset, phys_region->ph)) != OK) { + if((r=addcache(dev, dev_off + offset, msg->m_vmmcp.ino, + ino_off + offset, flags, phys_region->ph)) != OK) { printf("VM: addcache failed\n"); return r; } diff --git a/minix/servers/vm/mem_file.c b/minix/servers/vm/mem_file.c index ef25cff55..d73c062d4 100644 --- a/minix/servers/vm/mem_file.c +++ b/minix/servers/vm/mem_file.c @@ -107,7 +107,17 @@ static int mappedfile_pagefault(struct vmproc *vmp, struct vir_region *region, cp = find_cached_page_byino(region->param.file.fdref->dev, region->param.file.fdref->ino, referenced_offset, 1); } - if(cp) { + /* + * Normally, a cache hit saves a round-trip to the file system + * to load the page. However, if the page in the VM cache is + * marked for one-time use, then force a round-trip through the + * file system anyway, so that the FS can update the page by + * re-adding it to the cache. Thus, for one-time use pages, + * no caching is performed. This approach is correct even in + * the light of concurrent requests and disappearing processes + * but relies on VM requests to VFS being fully serialized. + */ + if(cp && (!cb || !(cp->flags & VMSF_ONCE))) { int result = OK; pb_unreferenced(region, ph, 0); pb_link(ph, cp->page, ph->offset, region); @@ -120,6 +130,10 @@ static int mappedfile_pagefault(struct vmproc *vmp, struct vir_region *region, result = cow_block(vmp, region, ph, 0); } + /* Discard one-use pages after mapping them in. 
*/ + if (result == OK && (cp->flags & VMSF_ONCE)) + rmcache(cp); + return result; } @@ -210,7 +224,14 @@ int mappedfile_setfile(struct vmproc *owner, cp = find_cached_page_byino(dev, ino, referenced_offset, 1); } - if(!cp) continue; + /* + * If we get a hit for a page that is to be used only once, + * then either we found a stale page (due to a process dying + * before a requested once-page could be mapped in) or this is + * a rare case of concurrent requests for the same page. In + * both cases, force the page to be obtained from its FS later. + */ + if(!cp || (cp->flags & VMSF_ONCE)) continue; if(!(pr = pb_reference(cp->page, vaddr, region, &mem_type_mappedfile))) { printf("mappedfile_setfile: pb_reference failed\n"); diff --git a/minix/servers/vm/proto.h b/minix/servers/vm/proto.h index 0ad9998cf..e61ea9f74 100644 --- a/minix/servers/vm/proto.h +++ b/minix/servers/vm/proto.h @@ -227,7 +227,8 @@ int do_clearcache(message *m); struct cached_page *find_cached_page_bydev(dev_t dev, u64_t dev_off, ino_t ino, u64_t ino_off, int touchlru); struct cached_page *find_cached_page_byino(dev_t dev, ino_t ino, u64_t ino_off, int touchlru); -int addcache(dev_t dev, u64_t def_off, ino_t ino, u64_t ino_off, struct phys_block *pb); +int addcache(dev_t dev, u64_t def_off, ino_t ino, u64_t ino_off, int flags, + struct phys_block *pb); void cache_sanitycheck_internal(void); int cache_freepages(int pages); void get_stats_info(struct vm_stats_info *vsi); diff --git a/minix/tests/test72.c b/minix/tests/test72.c index 5a9e6289a..1723bced4 100644 --- a/minix/tests/test72.c +++ b/minix/tests/test72.c @@ -230,7 +230,7 @@ u32_t sqrt_approx(u32_t v) } int vm_set_cacheblock(void *block, dev_t dev, off_t dev_offset, - ino_t ino, off_t ino_offset, u32_t *flags, int blocksize) + ino_t ino, off_t ino_offset, u32_t *flags, int blocksize, int setflags) { return ENOSYS; } diff --git a/minix/tests/test74.c b/minix/tests/test74.c index a3cd76a82..fbfb4f6ca 100644 --- a/minix/tests/test74.c +++ 
b/minix/tests/test74.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -461,6 +462,63 @@ static void basic_regression(void) } +/* + * Test mmap on none-dev file systems - file systems that do not have a buffer + * cache and therefore have to fake mmap support. We use procfs as target. + * The idea is that while we succeed in mapping in /proc/uptime, we also get + * a new uptime value every time we map in the page -- VM must not cache it. + */ +static void +nonedev_regression(void) +{ + int fd; + char *buf; + unsigned long uptime1, uptime2, uptime3; + + subtest++; + + if ((fd = open(_PATH_PROC "uptime", O_RDONLY)) < 0) e(1); + + buf = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE | MAP_FILE, fd, 0); + if (buf == MAP_FAILED) e(2); + + if (buf[4095] != 0) e(3); + + if ((uptime1 = atoi(buf)) == 0) e(4); + + if (munmap(buf, 4096) != 0) e(5); + + sleep(2); + + buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FILE, + fd, 0); + if (buf == MAP_FAILED) e(6); + + if (buf[4095] != 0) e(7); + + if ((uptime2 = atoi(buf)) == 0) e(8); + + if (uptime1 == uptime2) e(9); + + if (munmap(buf, 4096) != 0) e(10); + + sleep(2); + + buf = mmap(NULL, 4096, PROT_READ, MAP_SHARED | MAP_FILE, fd, 0); + if (buf == MAP_FAILED) e(11); + + if (buf[4095] != 0) e(12); + + if ((uptime3 = atoi(buf)) == 0) e(13); + + if (uptime1 == uptime3) e(14); + if (uptime2 == uptime3) e(15); + + if (munmap(buf, 4096) != 0) e(16); + + close(fd); +} + int main(int argc, char *argv[]) { @@ -470,6 +528,8 @@ main(int argc, char *argv[]) basic_regression(); + nonedev_regression(); + test_memory_types_vs_operations(); makefiles(MAXFILES); diff --git a/minix/tests/testvm.c b/minix/tests/testvm.c index 824e4d39e..1f5f8e989 100644 --- a/minix/tests/testvm.c +++ b/minix/tests/testvm.c @@ -44,7 +44,7 @@ int dowriteblock(int b, int blocksize, u32_t seed, char *block) memcpy(bdata, block, blocksize); if(mustset && (r=vm_set_cacheblock(bdata, MYDEV, dev_off, - VMC_NO_INODE, 0, NULL, 
blocksize)) != OK) { + VMC_NO_INODE, 0, NULL, blocksize, 0)) != OK) { printf("dowriteblock: vm_set_cacheblock failed %d\n", r); exit(1); }