printf("ext2fs: fsdriver_zero failed\n");
}
return r;
+ } else if (call == FSC_PEEK) {
+ /* Peeking a nonexistent block. Report to VM. */
+ lmfs_zero_block_ino(dev, ino, ino_off);
+ return OK;
} else {
- /* Writing to or peeking a nonexistent block.
+ /* Writing to a nonexistent block.
* Create and enter in inode.
*/
if ((bp = new_block(rip, (off_t) ex64lo(position))) == NULL)
printf("MFS: fsdriver_zero failed\n");
}
return r;
+ } else if (call == FSC_PEEK) {
+ /* Peeking a nonexistent block. Report to VM. */
+ lmfs_zero_block_ino(dev, ino, ino_off);
+ return OK;
} else {
- /* Writing to or peeking a nonexistent block.
+ /* Writing to a nonexistent block.
* Create and enter in inode.
*/
if ((bp = new_block(rip, (off_t) ex64lo(position))) == NULL)
ino_t ino, u64_t off);
void lmfs_put_block(struct buf *bp, int block_type);
void lmfs_free_block(dev_t dev, block64_t block);
+void lmfs_zero_block_ino(dev_t dev, ino_t ino, u64_t off);
void lmfs_invalidate(dev_t device);
void lmfs_rw_scattered(dev_t, struct buf **, int, int);
void lmfs_setquiet(int q);
*/
dev_t dev;
uint64_t dev_off;
- int r;
+ int r, setflags;
if (bp == NULL) return; /* it is easier to check here than in caller */
/* block has sensible content - if necessary, identify it to VM */
if(vmcache && bp->lmfs_needsetcache && dev != NO_DEV) {
- if((r=vm_set_cacheblock(bp->data, dev, dev_off,
- bp->lmfs_inode, bp->lmfs_inode_offset,
- &bp->lmfs_flags, fs_block_size, 0)) != OK) {
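+ /* Map the cache layer's ONE_SHOT hint onto VM's VMSF_ONCE flag, so that
+ * VM does not keep the block cached for reuse after it has been mapped in.
+ */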
+ setflags = (block_type & ONE_SHOT) ? VMSF_ONCE : 0;
+ if ((r = vm_set_cacheblock(bp->data, dev, dev_off, bp->lmfs_inode,
+ bp->lmfs_inode_offset, &bp->lmfs_flags, fs_block_size,
+ setflags)) != OK) {
if(r == ENOSYS) {
printf("libminixfs: ENOSYS, disabling VM calls\n");
vmcache = 0;
}
}
bp->lmfs_needsetcache = 0;
+
+ /* Now that we (may) have given the block to VM, invalidate the block if it
+ * is a one-shot block. Otherwise, it could be reobtained from our own cache
+ * right away, without being identified to VM again, even though VM may
+ * already have forgotten about it by then.
+ */
+ if (block_type & ONE_SHOT)
+ bp->lmfs_dev = NO_DEV;
}
/*===========================================================================*
*/
}
+/*===========================================================================*
+ * lmfs_zero_block_ino *
+ *===========================================================================*/
+void lmfs_zero_block_ino(dev_t dev, ino_t ino, u64_t ino_off)
+{
+/* Files may have holes. From an application perspective, these are just file
+ * regions filled with zeroes. From a file system perspective, however, holes
+ * may represent unallocated regions on disk. Such holes have no corresponding
+ * blocks on the disk, and thus no block number either, which means we cannot
+ * simply use lmfs_get_block_ino() for them. For reads this is not a problem,
+ * since the file system can just zero out the target application buffer
+ * instead. For mapped pages, however, this *is* a problem, since the VM cache
+ * needs to be told about the corresponding block, and VM does not accept
+ * blocks without a device offset. The role of this function is therefore to
+ * tell VM about the hole using a fake device offset. The device offsets are
+ * picked so that the VM cache will see a block memory-mapped for the hole in
+ * the file, while the same block is not visible when memory-mapping the block
+ * device.
+ */
+ struct buf *bp;
+ static block64_t fake_block = 0;
+
+ if (!vmcache)
+ return;
+
+ assert(fs_block_size > 0);
+
+ /* Pick a block number which is above the threshold of what can possibly be
+ * mapped in by mmap'ing the device, since off_t is signed, and it is safe to
+ * say that it will take a while before we have 8-exabyte devices. Pick a
+ * different block number each time to avoid possible concurrency issues.
+ * FIXME: it does not seem like VM actually verifies mmap offsets, though.
+ */
+ if (fake_block == 0 || ++fake_block >= UINT64_MAX / fs_block_size)
+ fake_block = ((uint64_t)INT64_MAX + 1) / fs_block_size;
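+
+ /* For example, with 4 kB blocks, fake block numbers start at 2^63 / 4096 =
+ * 2^51, i.e. at device byte offset 2^63, just past the maximum (signed)
+ * off_t value. The upper bound on fake_block keeps the resulting byte
+ * offset from overflowing 64 bits.
+ */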
+
+ /* Obtain a block. NO_READ ensures that no attempt will be made to read the
+ * contents of the fake block from the device, as it does not exist there.
+ */
+ bp = lmfs_get_block_ino(dev, fake_block, NO_READ, ino, ino_off);
+ assert(bp != NULL);
+ assert(bp->lmfs_dev != NO_DEV);
+
+ /* The block is already zeroed, as it has just been allocated with mmap. File
+ * systems do not rely on this assumption yet, so if VM ever gets changed to
+ * not clear the blocks we allocate (e.g., by recycling pages in the VM cache
+ * for the same process, which would be safe), we need to add a memset here.
+ */
+
+ /* Release the block. We don't expect it to be accessed ever again. Moreover,
+ * if we keep the block around in the VM cache, it may erroneously be mapped
+ * in beyond the file end later. Hence, use VMSF_ONCE when passing it to VM.
+ * TODO: tell VM that it is an all-zeroes block, so that VM can deduplicate
+ * all such pages in its cache.
+ */
+ lmfs_put_block(bp, ONE_SHOT);
+}
+
void lmfs_cache_reevaluate(dev_t dev)
{
if(bufs_in_use == 0 && dev != NO_DEV) {
free(buf);
}
+/*
+ * Test mmap on file holes. Holes are a tricky case with the current VM
+ * implementation. There are two main issues. First, whenever a file data
+ * block is freed, VM has to know about this, or it will later blindly map in
+ * the old data. Thus, file systems explicitly tell VM (through libminixfs)
+ * whenever a block is freed, upon which the VM cache forgets the block. Second,
+ * blocks are accessed primarily by a <dev,dev_off> pair and only additionally
+ * by a <ino,ino_off> pair. Holes have no meaningful value for the first pair,
+ * but do need to be registered in VM with the second pair, or accessing them
+ * will generate a segmentation fault. Thus, file systems explicitly tell VM
+ * (through libminixfs) when a hole is being peeked; libminixfs currently fakes
+ * a device offset to make this work.
+ */
+static void
+hole_regression(void)
+{
+ struct statvfs st;
+ size_t block_size;
+ char *buf;
+ int fd;
+
+ if (statvfs(".", &st) < 0) e(1);
+
+ block_size = st.f_bsize;
+
+ if ((buf = malloc(block_size)) == NULL) e(2);
+
+ if ((fd = open("testfile", O_CREAT | O_TRUNC | O_RDWR, 0644)) < 0) e(3);
+
+ if (unlink("testfile") != 0) e(4);
+
+ /*
+ * We perform the test twice, in a not-so-perfect attempt to test the
+ * two aspects independently. The first part immediately creates a
+ * hole, and is supposed to fail only if reporting holes to VM does not
+ * work. However, it may also fail if a page for a previous file with
+ * the same inode number as "testfile" is still in the VM cache.
+ */
+ memset(buf, 12, block_size);
+
+ if (write(fd, buf, block_size) != block_size) e(5);
+
+ if (lseek(fd, block_size * 2, SEEK_CUR) != block_size * 3) e(6);
+
+ memset(buf, 78, block_size);
+
+ if (write(fd, buf, block_size) != block_size) e(7);
+
+ free(buf);
+
+ if ((buf = mmap(NULL, 4 * block_size, PROT_READ, MAP_SHARED | MAP_FILE,
+ fd, 0)) == MAP_FAILED) e(8);
+
+ if (buf[0 * block_size] != 12 || buf[1 * block_size - 1] != 12) e(9);
+ if (buf[1 * block_size] != 0 || buf[2 * block_size - 1] != 0) e(10);
+ if (buf[2 * block_size] != 0 || buf[3 * block_size - 1] != 0) e(11);
+ if (buf[3 * block_size] != 78 || buf[4 * block_size - 1] != 78) e(12);
+
+ if (munmap(buf, 4 * block_size) != 0) e(13);
+
+ /*
+ * The second part first creates file content and only turns part of it
+ * into a file hole, thus ensuring that VM has previously cached pages
+ * for the blocks that are freed. The test will fail if VM keeps the
+ * pages around in its cache.
+ */
+ if ((buf = malloc(block_size)) == NULL) e(14);
+
+ if (lseek(fd, block_size, SEEK_SET) != block_size) e(15);
+
+ memset(buf, 34, block_size);
+
+ if (write(fd, buf, block_size) != block_size) e(16);
+
+ memset(buf, 56, block_size);
+
+ if (write(fd, buf, block_size) != block_size) e(17);
+
+ if (ftruncate(fd, block_size) != 0) e(18);
+
+ if (lseek(fd, block_size * 3, SEEK_SET) != block_size * 3) e(19);
+
+ memset(buf, 78, block_size);
+
+ if (write(fd, buf, block_size) != block_size) e(20);
+
+ free(buf);
+
+ if ((buf = mmap(NULL, 4 * block_size, PROT_READ, MAP_SHARED | MAP_FILE,
+ fd, 0)) == MAP_FAILED) e(21);
+
+ if (buf[0 * block_size] != 12 || buf[1 * block_size - 1] != 12) e(22);
+ if (buf[1 * block_size] != 0 || buf[2 * block_size - 1] != 0) e(23);
+ if (buf[2 * block_size] != 0 || buf[3 * block_size - 1] != 0) e(24);
+ if (buf[3 * block_size] != 78 || buf[4 * block_size - 1] != 78) e(25);
+
+ if (munmap(buf, 4 * block_size) != 0) e(26);
+
+ close(fd);
+}
+
int
main(int argc, char *argv[])
{
for (i = 0; i < 10; i++)
corruption_regression();
+ hole_regression();
+
test_memory_types_vs_operations();
makefiles(MAXFILES);