libminixfs/VM: fix memory-mapped file corruption

author David van Moolenbroek <david@minix3.org>

Thu, 13 Aug 2015 11:29:33 +0000 (11:29 +0000)

committer David van Moolenbroek <david@minix3.org>

Thu, 13 Aug 2015 13:46:46 +0000 (13:46 +0000)
author David van Moolenbroek <david@minix3.org>
Thu, 13 Aug 2015 11:29:33 +0000 (11:29 +0000)
committer David van Moolenbroek <david@minix3.org>
Thu, 13 Aug 2015 13:46:46 +0000 (13:46 +0000)
diff --git a/etc/system.conf b/etc/system.conf

index 15f37ff1f18ff753569276556994d5f9b1ba7f9d..b920a69934cf0f9648ae776b8769ad365eb872c7 100644 (file)
--- a/etc/system.conf
+++ b/etc/system.conf
@@ -109,7 +109,7 @@ service mfs
  {
         ipc     ALL_SYS;        # All system ipc targets allowed
         system  BASIC;          # Only basic kernel calls allowed
-       vm      MAPCACHEPAGE SETCACHEPAGE CLEARCACHE;
+       vm      MAPCACHEPAGE SETCACHEPAGE FORGETCACHEPAGE CLEARCACHE;
         io      NONE;           # No I/O range allowed
         irq     NONE;           # No IRQ allowed
         sigmgr          rs;     # Signal manager is RS
@@ -136,7 +136,7 @@ service ext2
  {
         ipc     ALL_SYS;        # All system ipc targets allowed
         system  BASIC;          # Only basic kernel calls allowed
-       vm      MAPCACHEPAGE SETCACHEPAGE CLEARCACHE;
+       vm      MAPCACHEPAGE SETCACHEPAGE FORGETCACHEPAGE CLEARCACHE;
         io      NONE;           # No I/O range allowed
         irq     NONE;           # No IRQ allowed
         sigmgr          rs;     # Signal manager is RS
@@ -149,7 +149,6 @@ service pfs
  {
         ipc     ALL_SYS;        # All system ipc targets allowed
         system  BASIC;          # Only basic kernel calls allowed
-       vm      MAPCACHEPAGE SETCACHEPAGE CLEARCACHE;
         io      NONE;           # No I/O range allowed
         irq     NONE;           # No IRQ allowed
         sigmgr          rs;     # Signal manager is RS
diff --git a/minix/commands/service/parse.c b/minix/commands/service/parse.c

index 264fb432b4a6cc3f71c155a1520ed6b7958adf04..26a91a1241a48c6f9a05fbcb961dda328b9acc29 100644 (file)
--- a/minix/commands/service/parse.c
+++ b/minix/commands/service/parse.c
@@ -750,6 +750,7 @@ struct
         { "PROCCTL",            VM_PROCCTL },
         { "MAPCACHEPAGE",       VM_MAPCACHEPAGE },
         { "SETCACHEPAGE",       VM_SETCACHEPAGE },
+       { "FORGETCACHEPAGE",    VM_FORGETCACHEPAGE },
         { "CLEARCACHE",         VM_CLEARCACHE },
         { "VFS_MMAP",           VM_VFS_MMAP },
         { "VFS_REPLY",          VM_VFS_REPLY },
diff --git a/minix/fs/ext2/balloc.c b/minix/fs/ext2/balloc.c

index bef14964c94d37c1f7590c89fe29d92c82076253..947e9f538b377c809e62c4f21b5c0cc47513c9ad 100644 (file)
--- a/minix/fs/ext2/balloc.c
+++ b/minix/fs/ext2/balloc.c
@@ -329,6 +329,13 @@ void free_block(struct super_block *sp, bit_t bit_returned)
  
    if (bit_returned < sp->s_bsearch)
         sp->s_bsearch = bit_returned;
+
+  /* Also tell libminixfs, so that 1) if it has this block in its cache, it can
+   * mark it as clean, thus reducing useless writes, and 2) it can tell VM that
+   * any previous inode association is to be broken for this block, so that the
+   * block will not be mapped in erroneously later on.
+   */
+  lmfs_free_block(sp->s_dev, (block_t)bit_returned);
  }
  
  
diff --git a/minix/fs/ext2/write.c b/minix/fs/ext2/write.c

index 389ca17160d3e758a3fe82186ba372a923de86d9..638bf4e423e38e7b70da076c3665583e953adea9 100644 (file)
--- a/minix/fs/ext2/write.c
+++ b/minix/fs/ext2/write.c
@@ -226,20 +226,16 @@ int op;                           /* special actions */
                 rip->i_blocks += rip->i_sp->s_sectors_in_block;
         }
         /* b1 equals NO_BLOCK only when we are freeing up the indirect block. */
-       if(b1 == NO_BLOCK)
-               lmfs_markclean(bp);
-       else
+       if(b1 != NO_BLOCK)
                 lmfs_markdirty(bp);
         put_block(bp, INDIRECT_BLOCK);
    }
  
    /* If the single indirect block isn't there (or was just freed),
     * see if we have to keep the double indirect block, if any.
-   * If we don't have to keep it, don't bother writing it out.
     */
    if (b1 == NO_BLOCK && !single && b2 != NO_BLOCK &&
       empty_indir(bp_dindir, rip->i_sp)) {
-       lmfs_markclean(bp_dindir);
         free_block(rip->i_sp, b2);
         rip->i_blocks -= rip->i_sp->s_sectors_in_block;
         b2 = NO_BLOCK;
@@ -252,11 +248,9 @@ int op;                            /* special actions */
    }
    /* If the double indirect block isn't there (or was just freed),
     * see if we have to keep the triple indirect block, if any.
-   * If we don't have to keep it, don't bother writing it out.
     */
    if (b2 == NO_BLOCK && triple && b3 != NO_BLOCK &&
       empty_indir(bp_tindir, rip->i_sp)) {
-       lmfs_markclean(bp_tindir);
         free_block(rip->i_sp, b3);
         rip->i_blocks -= rip->i_sp->s_sectors_in_block;
         rip->i_block[EXT2_TIND_BLOCK] = NO_BLOCK;
diff --git a/minix/fs/mfs/cache.c b/minix/fs/mfs/cache.c

index 147332c40e7e233b2cb3d43fabc79b86866cf936..ed8300851174a671366a5a4f28cf3bc15e0d4d92 100644 (file)
--- a/minix/fs/mfs/cache.c
+++ b/minix/fs/mfs/cache.c
@@ -88,6 +88,12 @@ void free_zone(
    bit = (bit_t) (numb - (zone_t) (sp->s_firstdatazone - 1));
    free_bit(sp, ZMAP, bit);
    if (bit < sp->s_zsearch) sp->s_zsearch = bit;
-}
-
  
+  /* Also tell libminixfs, so that 1) if it has a block for this bit, it can
+   * mark it as clean, thus reducing useless writes, and 2) it can tell VM that
+   * any previous inode association is to be broken for this block, so that the
+   * block will not be mapped in erroneously later on.
+   */
+  assert(sp->s_log_zone_size == 0); /* otherwise we need a loop here.. */
+  lmfs_free_block(dev, (block_t)numb);
+}
diff --git a/minix/fs/mfs/write.c b/minix/fs/mfs/write.c

index 3015b8b3d5563f7f11a95bcd33c0094479551fab..3a87dd56f9b8bfa690c079e125580b9f0483bcdf 100644 (file)
--- a/minix/fs/mfs/write.c
+++ b/minix/fs/mfs/write.c
@@ -165,7 +165,7 @@ int op;                             /* special actions */
                 wr_indir(bp, ex, new_zone);
         }
         /* z1 equals NO_ZONE only when we are freeing up the indirect block. */
-       if(z1 == NO_ZONE) { MARKCLEAN(bp); } else { MARKDIRTY(bp); }
+       if(z1 != NO_ZONE) MARKDIRTY(bp);
         put_block(bp, INDIRECT_BLOCK);
    }
  
@@ -175,7 +175,6 @@ int op;                             /* special actions */
     */
    if(z1 == NO_ZONE && !single && z2 != NO_ZONE &&
       empty_indir(bp_dindir, rip->i_sp)) {
-       MARKCLEAN(bp_dindir);
         free_zone(rip->i_dev, z2);
         rip->i_zone[zones+1] = NO_ZONE;
    }
diff --git a/minix/include/minix/com.h b/minix/include/minix/com.h

index c3d42cc7706dfe61b739a6a485257c008ececc08..fe8267513cb26f4971d5c7d55be0303c70f3bea3 100644 (file)
--- a/minix/include/minix/com.h
+++ b/minix/include/minix/com.h
@@ -643,8 +643,11 @@
  /* To VM: identify cache block in FS */
  #define VM_SETCACHEPAGE                (VM_RQ_BASE+27)
  
+/* To VM: forget cache block in FS */
+#define VM_FORGETCACHEPAGE     (VM_RQ_BASE+28)
+
  /* To VM: clear all cache blocks for a device */
-#define VM_CLEARCACHE          (VM_RQ_BASE+28)
+#define VM_CLEARCACHE          (VM_RQ_BASE+29)
  
  /* To VFS: fields for request from VM. */
  #      define VFS_VMCALL_REQ           m10_i1
diff --git a/minix/include/minix/libminixfs.h b/minix/include/minix/libminixfs.h

index 8b361aa84066848a5ea67eaf1ea30226214b2657..207ca83d326837eb7482c0b65d6201c655556419 100644 (file)
--- a/minix/include/minix/libminixfs.h
+++ b/minix/include/minix/libminixfs.h
@@ -45,8 +45,9 @@ void lmfs_buf_pool(int new_nr_bufs);
  struct buf *lmfs_get_block(dev_t dev, block64_t block,int only_search);
  struct buf *lmfs_get_block_ino(dev_t dev, block64_t block,int only_search,
         ino_t ino, u64_t off);
-void lmfs_invalidate(dev_t device);
  void lmfs_put_block(struct buf *bp, int block_type);
+void lmfs_free_block(dev_t dev, block64_t block);
+void lmfs_invalidate(dev_t device);
  void lmfs_rw_scattered(dev_t, struct buf **, int, int);
  void lmfs_setquiet(int q);
  void lmfs_cache_reevaluate(dev_t dev);
diff --git a/minix/include/minix/vm.h b/minix/include/minix/vm.h

index 9775d7133d9c891e5991f12bf54254088245c7a0..b8a1c399ed967d6ea1c8388ec9636ed514c6d37f 100644 (file)
--- a/minix/include/minix/vm.h
+++ b/minix/include/minix/vm.h
@@ -21,8 +21,6 @@ int vm_update(endpoint_t src_e, endpoint_t dst_e);
  int vm_memctl(endpoint_t ep, int req);
  int vm_query_exit(endpoint_t *endpt);
  int vm_watch_exit(endpoint_t ep);
-int vm_forgetblock(u64_t id);
-void vm_forgetblocks(void);
  int minix_vfs_mmap(endpoint_t who, off_t offset, size_t len,
          dev_t dev, ino_t ino, int fd, u32_t vaddr, u16_t clearend, u16_t
         flags);
@@ -73,10 +71,9 @@ int vm_procctl_handlemem(endpoint_t ep, vir_bytes m1, vir_bytes m2, int wr);
  int vm_set_cacheblock(void *block, dev_t dev, off_t dev_offset,
          ino_t ino, off_t ino_offset, u32_t *flags, int blocksize,
          int setflags);
-
  void *vm_map_cacheblock(dev_t dev, off_t dev_offset,
          ino_t ino, off_t ino_offset, u32_t *flags, int blocksize);
-
+int vm_forget_cacheblock(dev_t dev, off_t dev_offset, int blocksize);
  int vm_clear_cache(dev_t dev);
  
  /* flags for vm cache functions */
diff --git a/minix/lib/libminixfs/cache.c b/minix/lib/libminixfs/cache.c

index 9c6533d5475b8e3e1fdf9cdc2dd6fdede7df3904..0b2ad2b1bc409dc9c03d7992e54f5f1c8345cc40 100644 (file)
--- a/minix/lib/libminixfs/cache.c
+++ b/minix/lib/libminixfs/cache.c
@@ -239,6 +239,27 @@ static void freeblock(struct buf *bp)
    } else assert(!bp->data);
  }
  
+/*===========================================================================*
+ *                             find_block                                   *
+ *===========================================================================*/
+static struct buf *find_block(dev_t dev, block64_t block)
+{
+/* Search the hash chain for (dev, block). Return the buffer structure if
+ * found, or NULL otherwise.
+ */
+  struct buf *bp;
+  int b;
+
+  assert(dev != NO_DEV);
+
+  b = BUFHASH(block);
+  for (bp = buf_hash[b]; bp != NULL; bp = bp->lmfs_hash)
+       if (bp->lmfs_blocknr == block && bp->lmfs_dev == dev)
+               return bp;
+
+  return NULL;
+}
+
  /*===========================================================================*
   *                             lmfs_get_block_ino                           *
   *===========================================================================*/
@@ -284,50 +305,46 @@ struct buf *lmfs_get_block_ino(dev_t dev, block64_t block, int only_search,
         util_stacktrace();
    }
  
-  /* Search the hash chain for (dev, block). */
-  b = BUFHASH(block);
-  bp = buf_hash[b];
-  while (bp != NULL) {
-       if (bp->lmfs_blocknr == block && bp->lmfs_dev == dev) {
-               if(bp->lmfs_flags & VMMC_EVICTED) {
-                       /* We had it but VM evicted it; invalidate it. */
-                       ASSERT(bp->lmfs_count == 0);
-                       ASSERT(!(bp->lmfs_flags & VMMC_BLOCK_LOCKED));
-                       ASSERT(!(bp->lmfs_flags & VMMC_DIRTY));
-                       bp->lmfs_dev = NO_DEV;
-                       bp->lmfs_bytes = 0;
-                       bp->data = NULL;
-                       break;
-               }
-               /* Block needed has been found. */
-               if (bp->lmfs_count == 0) {
-                       rm_lru(bp);
-                       ASSERT(bp->lmfs_needsetcache == 0);
-                       ASSERT(!(bp->lmfs_flags & VMMC_BLOCK_LOCKED));
-                       bp->lmfs_flags |= VMMC_BLOCK_LOCKED;
-               }
-               raisecount(bp);
-               ASSERT(bp->lmfs_bytes == fs_block_size);
-               ASSERT(bp->lmfs_dev == dev);
-               ASSERT(bp->lmfs_dev != NO_DEV);
-               ASSERT(bp->lmfs_flags & VMMC_BLOCK_LOCKED);
-               ASSERT(bp->data);
-
-               if(ino != VMC_NO_INODE) {
-                       if(bp->lmfs_inode == VMC_NO_INODE
-                       || bp->lmfs_inode != ino
-                       || bp->lmfs_inode_offset != ino_off) {
-                               bp->lmfs_inode = ino;
-                               bp->lmfs_inode_offset = ino_off;
-                               bp->lmfs_needsetcache = 1;
-                       }
+  /* See if the block is in the cache. If so, we can return it right away. */
+  bp = find_block(dev, block);
+  if (bp != NULL && !(bp->lmfs_flags & VMMC_EVICTED)) {
+       /* Block needed has been found. */
+       if (bp->lmfs_count == 0) {
+               rm_lru(bp);
+               ASSERT(bp->lmfs_needsetcache == 0);
+               ASSERT(!(bp->lmfs_flags & VMMC_BLOCK_LOCKED));
+               /* FIXME: race condition against the VMMC_EVICTED check */
+               bp->lmfs_flags |= VMMC_BLOCK_LOCKED;
+       }
+       raisecount(bp);
+       ASSERT(bp->lmfs_bytes == fs_block_size);
+       ASSERT(bp->lmfs_dev == dev);
+       ASSERT(bp->lmfs_dev != NO_DEV);
+       ASSERT(bp->lmfs_flags & VMMC_BLOCK_LOCKED);
+       ASSERT(bp->data);
+
+       if(ino != VMC_NO_INODE) {
+               if(bp->lmfs_inode == VMC_NO_INODE
+               || bp->lmfs_inode != ino
+               || bp->lmfs_inode_offset != ino_off) {
+                       bp->lmfs_inode = ino;
+                       bp->lmfs_inode_offset = ino_off;
+                       bp->lmfs_needsetcache = 1;
                 }
+       }
  
-               return(bp);
-       } else {
-               /* This block is not the one sought. */
-               bp = bp->lmfs_hash; /* move to next block on hash chain */
-       }
+       return(bp);
+  }
+
+  /* We had the block in the cache but VM evicted it; invalidate it. */
+  if (bp != NULL) {
+       assert(bp->lmfs_flags & VMMC_EVICTED);
+       ASSERT(bp->lmfs_count == 0);
+       ASSERT(!(bp->lmfs_flags & VMMC_BLOCK_LOCKED));
+       ASSERT(!(bp->lmfs_flags & VMMC_DIRTY));
+       bp->lmfs_dev = NO_DEV;
+       bp->lmfs_bytes = 0;
+       bp->data = NULL;
    }
  
    /* Desired block is not on available chain. Find a free block to use. */
@@ -440,7 +457,7 @@ void lmfs_put_block(
    if (bp->lmfs_count != 0) return;     /* block is still in use */
  
    /* Put this block back on the LRU chain.  */
-  if (dev == DEV_RAM || (block_type & ONE_SHOT)) {
+  if (dev == NO_DEV || dev == DEV_RAM || (block_type & ONE_SHOT)) {
         /* Block probably won't be needed quickly. Put it on front of chain.
          * It will be the next block to be evicted from the cache.
          */
@@ -483,7 +500,48 @@ void lmfs_put_block(
         }
    }
    bp->lmfs_needsetcache = 0;
+}
+
+/*===========================================================================*
+ *                             lmfs_free_block                              *
+ *===========================================================================*/
+void lmfs_free_block(dev_t dev, block64_t block)
+{
+/* The file system has just freed the given block. The block may previously
+ * have been in use as data block for an inode. Therefore, we now need to tell
+ * VM that the block is no longer associated with an inode. If we fail to do so
+ * and the inode now has a hole at this location, mapping in the hole would
+ * yield the old block contents rather than a zeroed page. In addition, if the
+ * block is in the cache, it will be removed, even if it was dirty.
+ */
+  struct buf *bp;
+  int r;
  
+  /* Tell VM to forget about the block. The primary purpose of this call is to
+   * break the inode association, but since the block is part of a mounted file
+   * system, it is not expected to be accessed directly anyway. So, save some
+   * cache memory by throwing it out of the VM cache altogether.
+   */
+  if (vmcache) {
+       if ((r = vm_forget_cacheblock(dev, block * fs_block_size,
+           fs_block_size)) != OK)
+               printf("libminixfs: vm_forget_cacheblock failed (%d)\n", r);
+  }
+
+  if ((bp = find_block(dev, block)) != NULL) {
+       lmfs_markclean(bp);
+
+       /* Invalidate the block. The block may or may not be in use right now,
+        * so don't be smart about freeing memory or repositioning in the LRU.
+        */
+       bp->lmfs_dev = NO_DEV;
+  }
+
+  /* Note that this is *not* the right place to implement TRIM support. Even
+   * though the block is freed, on the device it may still be part of a
+   * previous checkpoint or snapshot of some sort. Only the file system can
+   * be trusted to decide which blocks can be reused on the device!
+   */
  }
  
  void lmfs_cache_reevaluate(dev_t dev)
@@ -577,6 +635,10 @@ void lmfs_invalidate(
         }
    }
  
+  /* Clear the cache even if VM caching is disabled for the file system:
+   * caching may be disabled as side effect of an error, leaving blocks behind
+   * in the actual VM cache.
+   */
    vm_clear_cache(device);
  }
  
diff --git a/minix/lib/libsys/vm_cache.c b/minix/lib/libsys/vm_cache.c

index be31981c20434ed5ab7284c6fac61c0a9216c622..c91e4e396c12665ff553361dfeaf428c5bc7cb18 100644 (file)
--- a/minix/lib/libsys/vm_cache.c
+++ b/minix/lib/libsys/vm_cache.c
@@ -65,6 +65,14 @@ int vm_set_cacheblock(void *block, dev_t dev, off_t dev_offset,
                 ino, ino_offset, flags, blocksize, setflags);
  }
  
+int vm_forget_cacheblock(dev_t dev, off_t dev_offset, int blocksize)
+{
+       message m;
+
+       return vm_cachecall(&m, VM_FORGETCACHEPAGE, NULL, dev, dev_offset,
+               VMC_NO_INODE, 0, 0, blocksize, 0);
+}
+
  int
  vm_clear_cache(dev_t dev)
  {
diff --git a/minix/servers/vm/main.c b/minix/servers/vm/main.c

index 73cf0e2c422889d6d623169fedf2a43eb52799c6..0fd120372227ad696a6b1498e1182383d30c9dd4 100644 (file)
--- a/minix/servers/vm/main.c
+++ b/minix/servers/vm/main.c
@@ -507,6 +507,7 @@ void init_vm(void)
         /* Cache blocks. */
         CALLMAP(VM_MAPCACHEPAGE, do_mapcache);
         CALLMAP(VM_SETCACHEPAGE, do_setcache);
+       CALLMAP(VM_FORGETCACHEPAGE, do_forgetcache);
         CALLMAP(VM_CLEARCACHE, do_clearcache);
  
         /* getrusage */
diff --git a/minix/servers/vm/mem_cache.c b/minix/servers/vm/mem_cache.c

index 6b1ace42a8d37dc4b5ed37dcff1da2ed086dcf43..1f6942da5e3a9e38b9b2becfa98fb83036c9cca7 100644 (file)
--- a/minix/servers/vm/mem_cache.c
+++ b/minix/servers/vm/mem_cache.c
@@ -88,7 +88,7 @@ int
  do_mapcache(message *msg)
  {
         dev_t dev = msg->m_vmmcp.dev;
-       off_t dev_off = msg->m_vmmcp.dev_offset;
+       uint64_t dev_off = msg->m_vmmcp.dev_offset;
         off_t ino_off = msg->m_vmmcp.ino_offset;
         int n;
         phys_bytes bytes = msg->m_vmmcp.pages * VM_PAGE_SIZE;
@@ -173,7 +173,7 @@ do_setcache(message *msg)
  {
         int r;
         dev_t dev = msg->m_vmmcp.dev;
-       off_t dev_off = msg->m_vmmcp.dev_offset;
+       uint64_t dev_off = msg->m_vmmcp.dev_offset;
         off_t ino_off = msg->m_vmmcp.ino_offset;
         int flags = msg->m_vmmcp.flags;
         int n;
@@ -252,6 +252,38 @@ do_setcache(message *msg)
         return OK;
  }
  
+/*
+ * Forget all pages associated to a particular block in the cache.
+ */
+int
+do_forgetcache(message *msg)
+{
+       struct cached_page *hb;
+       dev_t dev;
+       uint64_t dev_off;
+       phys_bytes bytes, offset;
+
+       dev = msg->m_vmmcp.dev;
+       dev_off = msg->m_vmmcp.dev_offset;
+       bytes = msg->m_vmmcp.pages * VM_PAGE_SIZE;
+
+       if (bytes < VM_PAGE_SIZE)
+               return EINVAL;
+
+       if (dev_off % PAGE_SIZE) {
+               printf("VM: unaligned cache operation\n");
+               return EFAULT;
+       }
+
+       for (offset = 0; offset < bytes; offset += VM_PAGE_SIZE) {
+               if ((hb = find_cached_page_bydev(dev, dev_off + offset,
+                   VMC_NO_INODE, 0 /*ino_off*/, 0 /*touchlru*/)) != NULL)
+                       rmcache(hb);
+       }
+
+       return OK;
+}
+
  /*
   * A file system wants to invalidate all pages belonging to a certain device.
   */
diff --git a/minix/servers/vm/proto.h b/minix/servers/vm/proto.h

index e61ea9f7489a92b6e047137f71f317328feaee2e..b96c31e2763db29a7715090346fc8796c4a5c08e 100644 (file)
--- a/minix/servers/vm/proto.h
+++ b/minix/servers/vm/proto.h
@@ -221,6 +221,7 @@ void shared_setsource(struct vir_region *vr, endpoint_t ep, struct vir_region *s
  /* mem_cache.c */
  int do_mapcache(message *m);
  int do_setcache(message *m);
+int do_forgetcache(message *m);
  int do_clearcache(message *m);
  
  /* cache.c */
diff --git a/minix/tests/test72.c b/minix/tests/test72.c

index 1723bced4aefed1807c33e9241141c246bda1855..9e13f16dcd4c65b843a1979747ffc077c2430825 100644 (file)
--- a/minix/tests/test72.c
+++ b/minix/tests/test72.c
@@ -241,6 +241,11 @@ void *vm_map_cacheblock(dev_t dev, off_t dev_offset,
         return MAP_FAILED;
  }
  
+int vm_forget_cacheblock(dev_t dev, off_t dev_offset, int blocksize)
+{
+       return 0;
+}
+
  int vm_clear_cache(dev_t dev)
  {
         return 0;
diff --git a/minix/tests/test74.c b/minix/tests/test74.c

index a0b3b1b4090b9d4c6585a99f5f62b26f27ada468..b8cd36283a39459bfc4bb1c355c3d2a3b83c18ac 100644 (file)
--- a/minix/tests/test74.c
+++ b/minix/tests/test74.c
@@ -532,10 +532,168 @@ nonedev_regression(void)
         close(fd);
  }
  
+/*
+ * Regression test for a nasty memory-mapped file corruption bug, which is not
+ * easy to reproduce but, before being solved, did occur in practice every once
+ * in a while.  The executive summary is that through stale inode associations,
+ * VM could end up using an old block to satisfy a memory mapping.
+ *
+ * This subtest relies on a number of assumptions regarding allocation and
+ * reuse of inode numbers and blocks.  These assumptions hold for MFS but
+ * possibly no other file system.  However, if the subtest's assumptions are
+ * not met, it will simply succeed.
+ */
+static void
+corruption_regression(void)
+{
+       char *ptr, *buf;
+       struct statvfs sf;
+       struct stat st;
+       size_t block_size;
+       off_t size;
+       int fd, fd2;
+
+       subtest = 1;
+
+       if (statvfs(".", &sf) != 0) e(0);
+       block_size = sf.f_bsize;
+
+       if ((buf = malloc(block_size * 2)) == NULL) e(0);
+
+       /*
+        * We first need a file that is just large enough that it requires the
+        * allocation of a metadata block - an indirect block - when more data
+        * is written to it.  This is fileA.  We keep it open throughout the
+        * test so we can unlink it immediately.
+        */
+       if ((fd = open("fileA", O_CREAT | O_TRUNC | O_WRONLY, 0600)) == -1)
+               e(0);
+       if (unlink("fileA") != 0) e(0);
+
+       /*
+        * Write to fileA until its next block requires the allocation of an
+        * additional metadata block - an indirect block.
+        */
+       size = 0;
+       memset(buf, 'A', block_size);
+       do {
+               /*
+                * Repeatedly write an extra block, until the file consists of
+                * more blocks than just the file data.
+                */
+               if (write(fd, buf, block_size) != block_size) e(0);
+               size += block_size;
+               if (size >= block_size * 64) {
+                       /*
+                        * It doesn't look like this is going to work.
+                        * Skip this subtest altogether.
+                        */
+                       if (close(fd) != 0) e(0);
+                       free(buf);
+
+                       return;
+               }
+               if (fstat(fd, &st) != 0) e(0);
+       } while (st.st_blocks * 512 == size);
+
+       /* Once we get there, go one step back by truncating by one block. */
+       size -= block_size; /* for MFS, size will end up being 7*block_size */
+       if (ftruncate(fd, size) != 0) e(0);
+
+       /*
+        * Create a first file, fileB, and write two blocks to it.  FileB's
+        * blocks are going to end up in the secondary VM cache, associated to
+        * fileB's inode number (and two different offsets within the file).
+        * The block cache does not know about files getting deleted, so we can
+        * unlink fileB immediately after creating it.  So far so good.
+        */
+       if ((fd2 = open("fileB", O_CREAT | O_TRUNC | O_WRONLY, 0600)) == -1)
+               e(0);
+       if (unlink("fileB") != 0) e(0);
+       memset(buf, 'B', block_size * 2);
+       if (write(fd2, buf, block_size * 2) != block_size * 2) e(0);
+       if (close(fd2) != 0) e(0);
+
+       /*
+        * Write one extra block to fileA, hoping that this causes allocation
+        * of a metadata block as well.  This is why we tried to get fileA to
+        * the point that one more block would also require the allocation of a
+        * metadata block.  Our intent is to recycle the blocks that we just
+        * allocated and freed for fileB.  As of writing, for the metadata
+        * block, this will *not* break the association with fileB's inode,
+        * which by itself is not a problem, yet crucial to reproducing
+        * the actual problem a bit later.  Note that the test does not rely on
+        * whether the file system allocates the data block or the metadata
+        * block first, although it does need reverse deallocation (see below).
+        */
+       memset(buf, 'A', block_size);
+       if (write(fd, buf, block_size) != block_size) e(0);
+
+       /*
+        * Create a new file, fileC, which recycles the inode number of fileB,
+        * but uses two new blocks to store its data.  These new blocks will
+        * get associated to the fileB inode number, and one of them will
+        * thereby eclipse (but not remove) the association of fileA's metadata
+        * block to the inode of fileB.
+        */
+       if ((fd2 = open("fileC", O_CREAT | O_TRUNC | O_WRONLY, 0600)) == -1)
+               e(0);
+       if (unlink("fileC") != 0) e(0);
+       memset(buf, 'C', block_size * 2);
+       if (write(fd2, buf, block_size * 2) != block_size * 2) e(0);
+       if (close(fd2) != 0) e(0);
+
+       /*
+        * Free up the extra fileA blocks for reallocation, in particular
+        * including the metadata block.  Again, this will not affect the
+        * contents of the VM cache in any way.  FileA's metadata block remains
+        * cached in VM, with the inode association for fileB's block.
+        */
+       if (ftruncate(fd, size) != 0) e(0);
+
+       /*
+        * Now create yet one more file, fileD, which also recycles the inode
+        * number of fileB and fileC.  Write two blocks to it; these blocks
+        * should recycle the blocks we just freed.  One of these is fileA's
+        * just-freed metadata block, for which the new inode association will
+        * be equal to the inode association it had already (as long as blocks
+        * are freed in reverse order of their allocation, which happens to be
+        * the case for MFS).  As a result, the block is not updated in the VM
+        * cache, and VM will therefore continue to see the inode association
+        * for the corresponding block of fileC which is still in the VM cache.
+        */
+       if ((fd2 = open("fileD", O_CREAT | O_TRUNC | O_RDWR, 0600)) == -1)
+               e(0);
+       memset(buf, 'D', block_size * 2);
+       if (write(fd2, buf, block_size * 2) != block_size * 2) e(0);
+
+       ptr = mmap(NULL, block_size * 2, PROT_READ, MAP_FILE, fd2, 0);
+       if (ptr == MAP_FAILED) e(0);
+
+       /*
+        * Finally, we can test the issue.  Since fileC's block is still the
+        * block for which VM has the corresponding inode association, VM will
+        * now find and map in fileC's block, instead of fileD's block.  The
+        * result is that we get a memory-mapped area with stale contents,
+        * different from those of the underlying file.
+        */
+       if (memcmp(buf, ptr, block_size * 2)) e(0);
+
+       /* Clean up. */
+       if (munmap(ptr, block_size * 2) != 0) e(0);
+
+       if (close(fd2) != 0) e(0);
+       if (unlink("fileD") != 0) e(0);
+
+       if (close(fd) != 0) e(0);
+
+       free(buf);
+}
+
  int
  main(int argc, char *argv[])
  {
-       int iter = 2;
+       int i, iter = 2;
  
         start(74);
  
@@ -543,6 +701,14 @@ main(int argc, char *argv[])
  
         nonedev_regression();
  
+       /*
+        * Any inode or block allocation happening concurrently with this
+        * subtest will make the subtest succeed without testing the actual
+        * issue.  Thus, repeat the subtest a fair number of times.
+        */
+       for (i = 0; i < 10; i++)
+               corruption_regression();
+
         test_memory_types_vs_operations();
  
         makefiles(MAXFILES);
diff --git a/minix/tests/testvm.conf b/minix/tests/testvm.conf

index ec42c0e252b030f4f6e8d03152b65cd645f5b6ae..7be4436f834241acaaafded75de0e52f1314b7e9 100644 (file)
--- a/minix/tests/testvm.conf
+++ b/minix/tests/testvm.conf
@@ -1,7 +1,7 @@
  service testvm {
         ipc     ALL;    # All system ipc targets allowed
         system  BASIC;          # Only basic kernel calls allowed
-       vm      MAPCACHEPAGE SETCACHEPAGE CLEARCACHE;
+       vm      MAPCACHEPAGE SETCACHEPAGE FORGETCACHEPAGE CLEARCACHE;
         io      NONE;           # No I/O range allowed
         irq     NONE;           # No IRQ allowed
         sigmgr          rs;     # Signal manager is RS
author	David van Moolenbroek <david@minix3.org>
	Thu, 13 Aug 2015 11:29:33 +0000 (11:29 +0000)
committer	David van Moolenbroek <david@minix3.org>
	Thu, 13 Aug 2015 13:46:46 +0000 (13:46 +0000)
etc/system.conf		patch \| blob \| history
minix/commands/service/parse.c		patch \| blob \| history
minix/fs/ext2/balloc.c		patch \| blob \| history
minix/fs/ext2/write.c		patch \| blob \| history
minix/fs/mfs/cache.c		patch \| blob \| history
minix/fs/mfs/write.c		patch \| blob \| history
minix/include/minix/com.h		patch \| blob \| history
minix/include/minix/libminixfs.h		patch \| blob \| history
minix/include/minix/vm.h		patch \| blob \| history
minix/lib/libminixfs/cache.c		patch \| blob \| history
minix/lib/libsys/vm_cache.c		patch \| blob \| history
minix/servers/vm/main.c		patch \| blob \| history
minix/servers/vm/mem_cache.c		patch \| blob \| history
minix/servers/vm/proto.h		patch \| blob \| history
minix/tests/test72.c		patch \| blob \| history
minix/tests/test74.c		patch \| blob \| history
minix/tests/testvm.conf		patch \| blob \| history