From: David van Moolenbroek
Date: Sat, 4 Apr 2015 15:55:48 +0000 (+0000)
Subject: libminixfs: rework prefetch API

libminixfs: rework prefetch API

This patch changes the prefetch API so that file systems must now provide a
set of block numbers, rather than a set of buffers. The result is a leaner
and better-defined API; linear computation of the range of blocks to
prefetch; duplicates no longer interfering with the prefetch process;
guaranteed inclusion of the block needed next into the prefetch range; and,
limits and policy decisions that are better established by libminixfs now
actually being moved into libminixfs.

Change-Id: I7e44daf2d2d164bc5e2f1473ad717f3ff0f0a77f
---

diff --git a/minix/fs/ext2/misc.c b/minix/fs/ext2/misc.c
index 585db14fa..4f79cf931 100644
--- a/minix/fs/ext2/misc.c
+++ b/minix/fs/ext2/misc.c
@@ -19,8 +19,6 @@ void fs_sync(void)
 */
 struct inode *rip;
- assert(lmfs_nr_bufs() > 0);
-
 if (superblock->s_rd_only)
 return; /* nothing to sync */
diff --git a/minix/fs/ext2/path.c b/minix/fs/ext2/path.c
index 0fb0c3711..c8e4c2635 100644
--- a/minix/fs/ext2/path.c
+++ b/minix/fs/ext2/path.c
@@ -215,7 +215,6 @@ int ftype; /* used when ENTER and INCOMPAT_FILETYPE */
 /* 'flag' is LOOK_UP */
 *numb = (ino_t) conv4(le_CPU, dp->d_ino);
 }
- assert(lmfs_dev(bp) != NO_DEV);
 put_block(bp);
 return(r);
 }
@@ -250,7 +249,6 @@ int ftype; /* used when ENTER and INCOMPAT_FILETYPE */
 }
 /* The whole block has been searched or ENTER has a free slot. */
- assert(lmfs_dev(bp) != NO_DEV);
 if (e_hit) break; /* e_hit set if ENTER can be performed now */
 put_block(bp); /* otherwise, continue searching dir */
 }
diff --git a/minix/fs/ext2/read.c b/minix/fs/ext2/read.c
index 4a38ff2fe..26595c510 100644
--- a/minix/fs/ext2/read.c
+++ b/minix/fs/ext2/read.c
@@ -252,8 +252,6 @@ int opportunistic;
 b = rip->i_block[EXT2_TIND_BLOCK];
 if (b == NO_BLOCK) return(NO_BLOCK);
 bp = get_block(rip->i_dev, b, NORMAL); /* get triple ind block */
- ASSERT(lmfs_dev(bp) != NO_DEV);
- ASSERT(lmfs_dev(bp) == rip->i_dev);
 excess = block_pos - triple_ind_s;
 mindex = excess / addr_in_block2;
 b = rd_indir(bp, mindex); /* num of double ind block */
@@ -264,8 +262,6 @@ int opportunistic;
 bp = get_block(rip->i_dev, b, iomode); /* get double indirect block */
 if (bp == NULL)
 return NO_BLOCK; /* peeking failed */
- ASSERT(lmfs_dev(bp) != NO_DEV);
- ASSERT(lmfs_dev(bp) == rip->i_dev);
 mindex = excess / addr_in_block;
 b = rd_indir(bp, mindex); /* num of single ind block */
 put_block(bp); /* release double ind block */
@@ -276,8 +272,6 @@ int opportunistic;
 if (bp == NULL)
 return NO_BLOCK; /* peeking failed */
- ASSERT(lmfs_dev(bp) != NO_DEV);
- ASSERT(lmfs_dev(bp) == rip->i_dev);
 b = rd_indir(bp, mindex);
 put_block(bp); /* release single ind block */
@@ -332,34 +326,16 @@ unsigned bytes_ahead; /* bytes beyond position for immediate use */
 * flag on all reads to allow this.
 */
 /* Minimum number of blocks to prefetch. */
-# define BLOCKS_MINIMUM (nr_bufs < 50 ? 18 : 32)
-	int nr_bufs = lmfs_nr_bufs();
+# define BLOCKS_MINIMUM 32
 int r, read_q_size;
 unsigned int blocks_ahead, fragment, block_size;
 block_t block, blocks_left;
 off_t ind1_pos;
 dev_t dev;
 struct buf *bp = NULL;
- static unsigned int readqsize = 0;
- static struct buf **read_q = NULL;
+ static block64_t read_q[LMFS_MAX_PREFETCH];
 u64_t position_running;
- if(readqsize != nr_bufs) {
- if(readqsize > 0) {
- assert(read_q != NULL);
- free(read_q);
- read_q = NULL;
- readqsize = 0;
- }
-
- assert(readqsize == 0);
- assert(read_q == NULL);
-
- if(!(read_q = malloc(sizeof(read_q[0])*nr_bufs)))
- panic("couldn't allocate read_q");
- readqsize = nr_bufs;
- }
-
 dev = rip->i_dev;
 assert(dev != NO_DEV);
 block_size = get_block_size(dev);
@@ -372,11 +348,11 @@ unsigned bytes_ahead; /* bytes beyond position for immediate use */
 bytes_ahead += fragment;
 blocks_ahead = (bytes_ahead + block_size - 1) / block_size;
- r = lmfs_get_block_ino(&bp, dev, block, PREFETCH, rip->i_num, position);
- if (r != OK)
+ r = lmfs_get_block_ino(&bp, dev, block, PEEK, rip->i_num, position);
+ if (r == OK)
+ return(bp);
+ if (r != ENOENT)
 panic("ext2: error getting block (%llu,%u): %d", dev, block, r);
- assert(bp != NULL);
- if (lmfs_dev(bp) != NO_DEV) return(bp);
 /* The best guess for the number of blocks to prefetch: A lot.
 * It is impossible to tell what the device looks like, so we don't even
@@ -408,9 +384,6 @@ unsigned bytes_ahead; /* bytes beyond position for immediate use */
 blocks_left++;
 }
- /* No more than the maximum request. */
- if (blocks_ahead > NR_IOREQS) blocks_ahead = NR_IOREQS;
-
 /* Read at least the minimum number of blocks, but not after a seek. */
 if (blocks_ahead < BLOCKS_MINIMUM && rip->i_seek == NO_SEEK)
 blocks_ahead = BLOCKS_MINIMUM;
@@ -418,38 +391,39 @@ unsigned bytes_ahead; /* bytes beyond position for immediate use */
 /* Can't go past end of file. */
 if (blocks_ahead > blocks_left) blocks_ahead = blocks_left;
+ /* No more than the maximum request. */
+ if (blocks_ahead > LMFS_MAX_PREFETCH) blocks_ahead = LMFS_MAX_PREFETCH;
+
+ read_q_size = 0;
+
 /* Acquire block buffers. */
 for (;;) {
 block_t thisblock;
- read_q[read_q_size++] = bp;
+ read_q[read_q_size++] = block;
 if (--blocks_ahead == 0) break;
- /* Don't trash the cache, leave 4 free. */
- if (lmfs_bufs_in_use() >= nr_bufs - 4) break;
-
 block++;
 position_running += block_size;
 thisblock = read_map(rip, (off_t) ex64lo(position_running), 1);
 if (thisblock != NO_BLOCK) {
- r = lmfs_get_block_ino(&bp, dev, thisblock, PREFETCH,
- rip->i_num, position_running);
- if (r != OK)
- panic("ext2: error getting block (%llu,%u): %d",
- dev, thisblock, r);
- } else {
- bp = get_block(dev, block, PREFETCH);
- }
- if (lmfs_dev(bp) != NO_DEV) {
+ r = lmfs_get_block_ino(&bp, dev, thisblock, PEEK, rip->i_num,
+ position_running);
+ block = thisblock;
+ } else
+ r = lmfs_get_block(&bp, dev, block, PEEK);
+
+ if (r == OK) {
 /* Oops, block already in the cache, get out. */
 put_block(bp);
 break;
 }
+ if (r != ENOENT)
+ panic("ext2: error getting block (%llu,%u): %d", dev, block,
+ r);
 }
- lmfs_rw_scattered(dev, read_q, read_q_size, READING);
+ lmfs_prefetch(dev, read_q, read_q_size);
 r = lmfs_get_block_ino(&bp, dev, baseblock, NORMAL, rip->i_num, position);
 if (r != OK)
diff --git a/minix/fs/mfs/cache.c b/minix/fs/mfs/cache.c
index bb3a4a4ac..4b3fcc223 100644
--- a/minix/fs/mfs/cache.c
+++ b/minix/fs/mfs/cache.c
@@ -57,7 +57,7 @@ zone_t alloc_zone(
 * z = b + sp->s_firstdatazone - 1
 * Alloc_bit() never returns 0, since this is used for NO_BIT (failure). 
*/ - sp = get_super(dev); + sp = &superblock; /* If z is 0, skip initial part of the map known to be fully in use. */ if (z == sp->s_firstdatazone) { @@ -93,7 +93,7 @@ void free_zone( bit_t bit; /* Locate the appropriate super_block and return bit. */ - sp = get_super(dev); + sp = &superblock; if (numb < sp->s_firstdatazone || numb >= sp->s_zones) return; bit = (bit_t) (numb - (zone_t) (sp->s_firstdatazone - 1)); free_bit(sp, ZMAP, bit); diff --git a/minix/fs/mfs/clean.h b/minix/fs/mfs/clean.h index 427696b2a..5d367116d 100644 --- a/minix/fs/mfs/clean.h +++ b/minix/fs/mfs/clean.h @@ -2,10 +2,13 @@ #ifndef _MFS_CLEAN_H #define _MFS_CLEAN_H 1 -#define MARKDIRTY(b) do { if(superblock.s_dev == lmfs_dev(b) && superblock.s_rd_only) { printf("%s:%d: dirty block on rofs! ", __FILE__, __LINE__); util_stacktrace(); } else { lmfs_markdirty(b); } } while(0) -#define MARKCLEAN(b) lmfs_markclean(b) - -#define ISDIRTY(b) (!lmfs_isclean(b)) -#define ISCLEAN(b) (lmfs_isclean(b)) +#define MARKDIRTY(b) do { \ + if (superblock.s_rd_only) { \ + printf("%s:%d: dirty block on rofs! ", __FILE__, __LINE__); \ + util_stacktrace(); \ + } else { \ + lmfs_markdirty(b); \ + } \ +} while(0) #endif diff --git a/minix/fs/mfs/inode.c b/minix/fs/mfs/inode.c index 65c5e3d65..2827c047e 100644 --- a/minix/fs/mfs/inode.c +++ b/minix/fs/mfs/inode.c @@ -258,7 +258,7 @@ struct inode *alloc_inode(dev_t dev, mode_t bits, uid_t uid, gid_t gid) int major, minor, inumb; bit_t b; - sp = get_super(dev); /* get pointer to super_block */ + sp = &superblock; if (sp->s_rd_only) { /* can't allocate an inode on a read only device. */ err_code = EROFS; return(NULL); @@ -335,8 +335,7 @@ static void free_inode( register struct super_block *sp; bit_t b; - /* Locate the appropriate super_block. */ - sp = get_super(dev); + sp = &superblock; if (inumb == NO_ENTRY || inumb > sp->s_ninodes) return; b = (bit_t) inumb; free_bit(sp, IMAP, b); @@ -385,7 +384,7 @@ int rw_flag; /* READING or WRITING */ block_t b, offset; /* Get the block where the inode resides. */ - sp = get_super(rip->i_dev); /* get pointer to super block */ + sp = &superblock; rip->i_sp = sp; /* inode must contain super block pointer */ offset = START_BLOCK + sp->s_imap_blocks + sp->s_zmap_blocks; b = (block_t) (rip->i_num - 1)/sp->s_inodes_per_block + offset; diff --git a/minix/fs/mfs/misc.c b/minix/fs/mfs/misc.c index 48d85ec49..99f54ff26 100644 --- a/minix/fs/mfs/misc.c +++ b/minix/fs/mfs/misc.c @@ -15,8 +15,6 @@ void fs_sync(void) */ struct inode *rip; - assert(lmfs_nr_bufs() > 0); - /* Write all the dirty inodes to the disk. */ for(rip = &inode[0]; rip < &inode[NR_INODES]; rip++) if(rip->i_count > 0 && IN_ISDIRTY(rip)) rw_inode(rip, WRITING); diff --git a/minix/fs/mfs/path.c b/minix/fs/mfs/path.c index 2eae77e58..034fdcfd9 100644 --- a/minix/fs/mfs/path.c +++ b/minix/fs/mfs/path.c @@ -140,7 +140,6 @@ int flag; /* LOOK_UP, ENTER, DELETE or IS_EMPTY */ assert(ldir_ptr->i_dev != NO_DEV); assert(bp != NULL); - assert(lmfs_dev(bp) != NO_DEV); /* Search a directory block. */ for (dp = &b_dir(bp)[0]; @@ -185,7 +184,6 @@ int flag; /* LOOK_UP, ENTER, DELETE or IS_EMPTY */ *numb = (ino_t) conv4(sp->s_native, (int) dp->mfs_d_ino); } - assert(lmfs_dev(bp) != NO_DEV); put_block(bp); return(r); } @@ -199,7 +197,6 @@ int flag; /* LOOK_UP, ENTER, DELETE or IS_EMPTY */ /* The whole block has been searched or ENTER has a free slot. 
*/ if (e_hit) break; /* e_hit set if ENTER can be performed now */ - assert(lmfs_dev(bp) != NO_DEV); put_block(bp); /* otherwise, continue searching dir */ } diff --git a/minix/fs/mfs/proto.h b/minix/fs/mfs/proto.h index a519cf203..d0598e897 100644 --- a/minix/fs/mfs/proto.h +++ b/minix/fs/mfs/proto.h @@ -84,7 +84,6 @@ int fs_statvfs(struct statvfs *st); bit_t alloc_bit(struct super_block *sp, int map, bit_t origin); void free_bit(struct super_block *sp, int map, bit_t bit_returned); unsigned int get_block_size(dev_t dev); -struct super_block *get_super(dev_t dev); int read_super(struct super_block *sp); int write_super(struct super_block *sp); diff --git a/minix/fs/mfs/read.c b/minix/fs/mfs/read.c index af4b76064..e4c0ed9ab 100644 --- a/minix/fs/mfs/read.c +++ b/minix/fs/mfs/read.c @@ -260,8 +260,6 @@ int opportunistic; /* if nonzero, only use cache for metadata */ bp = get_block(rip->i_dev, b, iomode); /* get double indirect block */ if (bp == NULL) return NO_BLOCK; /* peeking failed */ - ASSERT(lmfs_dev(bp) != NO_DEV); - ASSERT(lmfs_dev(bp) == rip->i_dev); z = rd_indir(bp, index); /* z= zone for single*/ put_block(bp); /* release double ind block */ excess = excess % nr_indirects; /* index into single ind blk */ @@ -310,7 +308,7 @@ int index; /* index into *bp */ if(bp == NULL) panic("rd_indir() on NULL"); - sp = get_super(lmfs_dev(bp)); /* need super block to find file sys type */ + sp = &superblock; /* read a zone from an indirect block */ assert(sp->s_version == V3); @@ -343,28 +341,15 @@ unsigned bytes_ahead; /* bytes beyond position for immediate use */ * flag on all reads to allow this. */ /* Minimum number of blocks to prefetch. */ - int nr_bufs = lmfs_nr_bufs(); -# define BLOCKS_MINIMUM (nr_bufs < 50 ? 18 : 32) +# define BLOCKS_MINIMUM 32 int r, scale, read_q_size; unsigned int blocks_ahead, fragment, block_size; block_t block, blocks_left; off_t ind1_pos; dev_t dev; struct buf *bp; - static unsigned int readqsize = 0; - static struct buf **read_q; + static block64_t read_q[LMFS_MAX_PREFETCH]; u64_t position_running; - int inuse_before = lmfs_bufs_in_use(); - - if(readqsize != nr_bufs) { - if(readqsize > 0) { - assert(read_q != NULL); - free(read_q); - } - if(!(read_q = malloc(sizeof(read_q[0])*nr_bufs))) - panic("couldn't allocate read_q"); - readqsize = nr_bufs; - } dev = rip->i_dev; assert(dev != NO_DEV); @@ -379,12 +364,11 @@ unsigned bytes_ahead; /* bytes beyond position for immediate use */ bytes_ahead += fragment; blocks_ahead = (bytes_ahead + block_size - 1) / block_size; - r = lmfs_get_block_ino(&bp, dev, block, PREFETCH, rip->i_num, position); - if (r != OK) + r = lmfs_get_block_ino(&bp, dev, block, PEEK, rip->i_num, position); + if (r == OK) + return(bp); + if (r != ENOENT) panic("MFS: error getting block (%llu,%u): %d", dev, block, r); - assert(bp != NULL); - assert(bp->lmfs_count > 0); - if (lmfs_dev(bp) != NO_DEV) return(bp); /* The best guess for the number of blocks to prefetch: A lot. * It is impossible to tell what the device looks like, so we don't even @@ -417,9 +401,6 @@ unsigned bytes_ahead; /* bytes beyond position for immediate use */ blocks_left++; } - /* No more than the maximum request. */ - if (blocks_ahead > NR_IOREQS) blocks_ahead = NR_IOREQS; - /* Read at least the minimum number of blocks, but not after a seek. */ if (blocks_ahead < BLOCKS_MINIMUM && rip->i_seek == NO_SEEK) blocks_ahead = BLOCKS_MINIMUM; @@ -427,43 +408,38 @@ unsigned bytes_ahead; /* bytes beyond position for immediate use */ /* Can't go past end of file. 
*/
 if (blocks_ahead > blocks_left) blocks_ahead = blocks_left;
+ /* No more than the maximum request. */
+ if (blocks_ahead > LMFS_MAX_PREFETCH) blocks_ahead = LMFS_MAX_PREFETCH;
+
+ read_q_size = 0;
+
 /* Acquire block buffers. */
 for (;;) {
 block_t thisblock;
- assert(bp->lmfs_count > 0);
- read_q[read_q_size++] = bp;
+ read_q[read_q_size++] = block;
 if (--blocks_ahead == 0) break;
- /* Don't trash the cache, leave 4 free. */
- if (lmfs_bufs_in_use() >= nr_bufs - 4) break;
-
 block++;
 position_running += block_size;
 thisblock = read_map(rip, (off_t) ex64lo(position_running), 1);
 if (thisblock != NO_BLOCK) {
- r = lmfs_get_block_ino(&bp, dev, thisblock, PREFETCH,
- rip->i_num, position_running);
- if (r != OK)
- panic("MFS: error getting block (%llu,%u): %d",
- dev, thisblock, r);
- } else {
- bp = get_block(dev, block, PREFETCH);
- }
- assert(bp);
- assert(bp->lmfs_count > 0);
- if (lmfs_dev(bp) != NO_DEV) {
+ r = lmfs_get_block_ino(&bp, dev, thisblock, PEEK, rip->i_num,
+ position_running);
+ block = thisblock;
+ } else
+ r = lmfs_get_block(&bp, dev, block, PEEK);
+
+ if (r == OK) {
 /* Oops, block already in the cache, get out. */
 put_block(bp);
 break;
 }
+ if (r != ENOENT)
+ panic("MFS: error getting block (%llu,%u): %d", dev, block, r);
 }
- lmfs_rw_scattered(dev, read_q, read_q_size, READING);
-
- assert(inuse_before == lmfs_bufs_in_use());
+ lmfs_prefetch(dev, read_q, read_q_size);
 r = lmfs_get_block_ino(&bp, dev, baseblock, NORMAL, rip->i_num, position);
 if (r != OK)
diff --git a/minix/fs/mfs/stadir.c b/minix/fs/mfs/stadir.c
index 2e53ee959..8a429fb2d 100644
--- a/minix/fs/mfs/stadir.c
+++ b/minix/fs/mfs/stadir.c
@@ -85,7 +85,7 @@ int fs_statvfs(struct statvfs *st)
 struct super_block *sp;
 int scale;
- sp = get_super(fs_dev);
+ sp = &superblock;
 scale = sp->s_log_zone_size;
diff --git a/minix/fs/mfs/super.c b/minix/fs/mfs/super.c
index f5e736a4b..32d30fdad 100644
--- a/minix/fs/mfs/super.c
+++ b/minix/fs/mfs/super.c
@@ -6,7 +6,6 @@
 * The entry points into this file are
 * alloc_bit: somebody wants to allocate a zone or inode; find one
 * free_bit: indicate that a zone or inode is available for allocation
- * get_super: search the 'superblock' table for a device
 * mounted: tells if file inode is on mounted (or ROOT) file system
 * read_super: read a superblock
 */
@@ -156,23 +155,6 @@ bit_t bit_returned; /* number of bit to insert into the map */
 }
 }
-/*===========================================================================*
- * get_super *
- *===========================================================================*/
-struct super_block *get_super(
- dev_t dev /* device number whose super_block is sought */
-)
-{
- if (dev == NO_DEV)
- panic("request for super_block of NO_DEV");
-
- if(superblock.s_dev != dev)
- panic("wrong superblock: 0x%x", (int) dev);
-
- return(&superblock);
-}
-
-
 /*===========================================================================*
 * get_block_size *
 *===========================================================================*/
diff --git a/minix/fs/mfs/write.c b/minix/fs/mfs/write.c
index 51a4a1db3..dce03d0aa 100644
--- a/minix/fs/mfs/write.c
+++ b/minix/fs/mfs/write.c
@@ -200,7 +200,7 @@ zone_t zone; /* zone to write */
 if(bp == NULL)
 panic("wr_indir() on NULL");
- sp = get_super(lmfs_dev(bp)); /* need super block to find file sys type */
+ sp = &superblock;
 /* write a zone into an indirect block */
 assert(sp->s_version == V3);
diff --git a/minix/include/minix/libminixfs.h b/minix/include/minix/libminixfs.h
index 05d1aee45..f63041cc5 100644
--- a/minix/include/minix/libminixfs.h
+++ b/minix/include/minix/libminixfs.h
@@ -5,6 +5,9 @@
 #include
+/* Maximum number of blocks that will be considered by lmfs_prefetch() */
+#define LMFS_MAX_PREFETCH NR_IOREQS
+
 struct buf {
 /* Data portion of the buffer. */
 void *data;
@@ -30,9 +33,6 @@ struct buf {
 void lmfs_markdirty(struct buf *bp);
 void lmfs_markclean(struct buf *bp);
 int lmfs_isclean(struct buf *bp);
-dev_t lmfs_dev(struct buf *bp);
-int lmfs_bufs_in_use(void);
-int lmfs_nr_bufs(void);
 void lmfs_flushall(void);
 void lmfs_flushdev(dev_t dev);
 size_t lmfs_fs_block_size(void);
@@ -46,7 +46,7 @@ void lmfs_put_block(struct buf *bp);
 void lmfs_free_block(dev_t dev, block64_t block);
 void lmfs_zero_block_ino(dev_t dev, ino_t ino, u64_t off);
 void lmfs_invalidate(dev_t device);
-void lmfs_rw_scattered(dev_t, struct buf **, int, int);
+void lmfs_prefetch(dev_t dev, const block64_t *blockset, unsigned int nblocks);
 void lmfs_setquiet(int q);
 void lmfs_set_blockusage(fsblkcnt_t btotal, fsblkcnt_t bused);
 void lmfs_change_blockusage(int delta);
@@ -54,8 +54,7 @@ void lmfs_change_blockusage(int delta);
 /* get_block arguments */
 #define NORMAL 0 /* forces get_block to do disk read */
 #define NO_READ 1 /* prevents get_block from doing disk read */
-#define PREFETCH 2 /* tells get_block not to read or mark dev */
-#define PEEK 3 /* returns ENOENT if not in cache */
+#define PEEK 2 /* returns ENOENT if not in cache */
 /* Block I/O helper functions. */
 void lmfs_driver(dev_t dev, char *label);
diff --git a/minix/lib/libminixfs/bio.c b/minix/lib/libminixfs/bio.c
index 0cbaf10ef..0e2d3d59a 100644
--- a/minix/lib/libminixfs/bio.c
+++ b/minix/lib/libminixfs/bio.c
@@ -10,7 +10,8 @@
 * o it must initialize this library in order to set up a buffer pool for
 * use by these functions, using the lmfs_buf_pool function; the
 * recommended number of blocks for *non*-disk-backed file systems is
- * NR_IOREQS buffers (disk-backed file systems typically use many more);
+ * LMFS_MAX_PREFETCH buffers (disk-backed file systems typically use many
+ * more);
 * o it must enable VM caching in order to support memory mapping of block
 * devices, using the lmfs_may_use_vmcache function;
 * o it must either use lmfs_flushall as implementation for the fdr_sync
@@ -64,12 +65,15 @@ static void
 block_prefetch(dev_t dev, block64_t block, unsigned int nblocks,
 size_t block_size, size_t last_size)
 {
- struct buf *bp, *bufs[NR_IOREQS];
- unsigned int count;
+ struct buf *bp;
+ unsigned int count, limit;
 int r;
- if (nblocks > NR_IOREQS) {
- nblocks = NR_IOREQS;
+ limit = lmfs_readahead_limit();
+ assert(limit >= 1 && limit <= LMFS_MAX_PREFETCH);
+
+ if (nblocks > limit) {
+ nblocks = limit;
 last_size = block_size;
 }
@@ -77,24 +81,21 @@ block_prefetch(dev_t dev, block64_t block, unsigned int nblocks,
 for (count = 0; count < nblocks; count++) {
 if (count == nblocks - 1 && last_size < block_size)
 r = lmfs_get_partial_block(&bp, dev, block + count,
- PREFETCH, last_size);
+ PEEK, last_size);
 else
- r = lmfs_get_block(&bp, dev, block + count, PREFETCH);
-
- if (r != OK)
- panic("libminixfs: get_block PREFETCH error: %d\n", r);
+ r = lmfs_get_block(&bp, dev, block + count, PEEK);
- if (lmfs_dev(bp) != NO_DEV) {
+ if (r == OK) {
 lmfs_put_block(bp);
+ last_size = block_size;
+ break;
 }
-
- bufs[count] = bp;
 }
 if (count > 0)
- lmfs_rw_scattered(dev, bufs, count, READING);
+ lmfs_readahead(dev, block, count, last_size);
 }
 /*
@@ -206,8 +207,6 @@ lmfs_bio(dev_t dev, struct fsdriver_data * data, size_t bytes, off_t pos,
 /* Perform the actual copy. 
*/
 if (r == OK && data != NULL) {
- assert(lmfs_dev(bp) != NO_DEV);
-
 if (write) {
 r = fsdriver_copyin(data, off,
 (char *)bp->data + block_off, chunk);
diff --git a/minix/lib/libminixfs/cache.c b/minix/lib/libminixfs/cache.c
index 2dc377c08..ffb7aadc5 100644
--- a/minix/lib/libminixfs/cache.c
+++ b/minix/lib/libminixfs/cache.c
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include <minix/bitmap.h>
 #include "inc.h"
@@ -173,11 +174,6 @@ int lmfs_isclean(struct buf *bp)
 return !(bp->lmfs_flags & VMMC_DIRTY);
 }
-dev_t lmfs_dev(struct buf *bp)
-{
- return bp->lmfs_dev;
-}
-
 static void free_unused_blocks(void)
 {
 struct buf *bp;
@@ -319,10 +315,8 @@ static int get_block_ino(struct buf **bpp, dev_t dev, block64_t block, int how,
 * disk (if 'how' is NORMAL). If 'how' is NO_READ, the caller intends to
 * overwrite the requested block in its entirety, so it is only necessary to
 * see if it is in the cache; if it is not, any free buffer will do. If 'how'
- * is PREFETCH, the block need not be read from the disk, and the device is not
- * to be marked on the block (i.e., set to NO_DEV), so callers can tell if the
- * block returned is valid. If 'how' is PEEK, the function returns the block
- * if it is in the cache or the VM cache, and an ENOENT error code otherwise.
+ * is PEEK, the function returns the block if it is in the cache or the VM
+ * cache, and an ENOENT error code otherwise.
 * In addition to the LRU chain, there is also a hash chain to link together
 * blocks whose block numbers end with the same bit strings, for fast lookup.
 */
@@ -441,12 +435,16 @@ static int get_block_ino(struct buf **bpp, dev_t dev, block64_t block, int how,
 assert(dev != NO_DEV);
- /* Block is not found in our cache, but we do want it
- * if it's in the vm cache.
+ /* The block is not found in our cache, but we do want it if it's in the VM
+ * cache. The exception is NO_READ, purely for context switching performance
+ * reasons. NO_READ is used for 1) newly allocated blocks, 2) blocks being
+ * prefetched, and 3) blocks about to be fully overwritten. In the first two
+ * cases, VM will not have the block in its cache anyway, and for the third
+ * we save on one VM call only if the block is in the VM cache.
 */
 assert(!bp->data);
 assert(!bp->lmfs_bytes);
- if(vmcache) {
+ if (how != NO_READ && vmcache) {
 if((bp->data = vm_map_cacheblock(dev, dev_off, ino, ino_off,
 &bp->lmfs_flags, roundup(block_size, PAGE_SIZE))) != MAP_FAILED) {
 bp->lmfs_bytes = block_size;
@@ -476,10 +474,7 @@ static int get_block_ino(struct buf **bpp, dev_t dev, block64_t block, int how,
 assert(bp->data);
- if(how == PREFETCH) {
- /* PREFETCH: don't do i/o. */
- bp->lmfs_dev = NO_DEV;
- } else if (how == NORMAL) {
+ if (how == NORMAL) {
 /* Try to read the block. Return an error code on failure. */
 if ((r = read_block(bp, block_size)) != OK) {
 put_block(bp, 0);
@@ -812,68 +807,59 @@ void lmfs_invalidate(
 }
 /*===========================================================================*
- * lmfs_flushdev *
+ * sort_blocks *
 *===========================================================================*/
-void lmfs_flushdev(dev_t dev)
+static void sort_blocks(struct buf **bufq, unsigned int bufqsize)
 {
-/* Flush all dirty blocks for one device. 
*/ - - register struct buf *bp; - static struct buf **dirty; - static unsigned int dirtylistsize = 0; - int ndirty; + struct buf *bp; + int i, j, gap; - if(dirtylistsize != nr_bufs) { - if(dirtylistsize > 0) { - assert(dirty != NULL); - free(dirty); - } - if(!(dirty = malloc(sizeof(dirty[0])*nr_bufs))) - panic("couldn't allocate dirty buf list"); - dirtylistsize = nr_bufs; - } + gap = 1; + do + gap = 3 * gap + 1; + while ((unsigned int)gap <= bufqsize); - for (bp = &buf[0], ndirty = 0; bp < &buf[nr_bufs]; bp++) { - /* Do not flush dirty blocks that are in use (lmfs_count>0): the file - * system may mark the block as dirty before changing its contents, in - * which case the new contents could end up being lost. - */ - if (!lmfs_isclean(bp) && bp->lmfs_dev == dev && bp->lmfs_count == 0) { - dirty[ndirty++] = bp; + while (gap != 1) { + gap /= 3; + for (j = gap; (unsigned int)j < bufqsize; j++) { + for (i = j - gap; i >= 0 && + bufq[i]->lmfs_blocknr > bufq[i + gap]->lmfs_blocknr; + i -= gap) { + bp = bufq[i]; + bufq[i] = bufq[i + gap]; + bufq[i + gap] = bp; + } } } - - lmfs_rw_scattered(dev, dirty, ndirty, WRITING); } /*===========================================================================* - * lmfs_rw_scattered * + * rw_scattered * *===========================================================================*/ -void lmfs_rw_scattered( +static void rw_scattered( dev_t dev, /* major-minor device number */ struct buf **bufq, /* pointer to array of buffers */ - int bufqsize, /* number of buffers */ + unsigned int bufqsize, /* number of buffers */ int rw_flag /* READING or WRITING */ ) { /* Read or write scattered data from a device. */ register struct buf *bp; - int gap; - register int i; register iovec_t *iop; static iovec_t iovec[NR_IOREQS]; off_t pos; - int iov_per_block; + unsigned int i, iov_per_block; unsigned int start_in_use = bufs_in_use, start_bufqsize = bufqsize; - assert(bufqsize >= 0); if(bufqsize == 0) return; /* for READING, check all buffers on the list are obtained and held * (count > 0) */ if (rw_flag == READING) { + assert(bufqsize <= LMFS_MAX_PREFETCH); + for(i = 0; i < bufqsize; i++) { assert(bufq[i] != NULL); assert(bufq[i]->lmfs_count > 0); @@ -887,40 +873,26 @@ void lmfs_rw_scattered( assert(fs_block_size > 0); assert(howmany(fs_block_size, PAGE_SIZE) <= NR_IOREQS); - /* (Shell) sort buffers on lmfs_blocknr. */ - gap = 1; - do - gap = 3 * gap + 1; - while (gap <= bufqsize); - while (gap != 1) { - int j; - gap /= 3; - for (j = gap; j < bufqsize; j++) { - for (i = j - gap; - i >= 0 && bufq[i]->lmfs_blocknr > bufq[i + gap]->lmfs_blocknr; - i -= gap) { - bp = bufq[i]; - bufq[i] = bufq[i + gap]; - bufq[i + gap] = bp; - } - } - } + /* For WRITING, (Shell) sort buffers on lmfs_blocknr. + * For READING, the buffers are already sorted. + */ + if (rw_flag == WRITING) + sort_blocks(bufq, bufqsize); /* Set up I/O vector and do I/O. The result of bdev I/O is OK if everything * went fine, otherwise the error code for the first failed transfer. 
*/
 while (bufqsize > 0) {
- int nblocks = 0, niovecs = 0;
+ unsigned int p, nblocks = 0, niovecs = 0;
 int r;
 for (iop = iovec; nblocks < bufqsize; nblocks++) {
- int p;
 vir_bytes vdata, blockrem;
 bp = bufq[nblocks];
 if (bp->lmfs_blocknr != bufq[0]->lmfs_blocknr + nblocks)
 break;
 blockrem = bp->lmfs_bytes;
 iov_per_block = howmany(blockrem, PAGE_SIZE);
- if(niovecs >= NR_IOREQS-iov_per_block) break;
+ if (niovecs > NR_IOREQS - iov_per_block) break;
 vdata = (vir_bytes) bp->data;
 for(p = 0; p < iov_per_block; p++) {
 vir_bytes chunk =
@@ -937,7 +909,7 @@ void lmfs_rw_scattered(
 }
 assert(nblocks > 0);
- assert(niovecs > 0);
+ assert(niovecs > 0 && niovecs <= NR_IOREQS);
 pos = (off_t)bufq[0]->lmfs_blocknr * fs_block_size;
 if (rw_flag == READING)
@@ -963,7 +935,6 @@ void lmfs_rw_scattered(
 break;
 }
 if (rw_flag == READING) {
- bp->lmfs_dev = dev; /* validate block */
 lmfs_put_block(bp);
 } else {
 MARKCLEAN(bp);
@@ -979,7 +950,9 @@ void lmfs_rw_scattered(
 * give at this time. Don't forget to release those extras.
 */
 while (bufqsize > 0) {
- lmfs_put_block(*bufq++);
+ bp = *bufq++;
+ bp->lmfs_dev = NO_DEV; /* invalidate block */
+ lmfs_put_block(bp);
 bufqsize--;
 }
 }
@@ -1001,6 +974,190 @@ void lmfs_rw_scattered(
 }
 }
+/*===========================================================================*
+ * lmfs_readahead *
+ *===========================================================================*/
+void lmfs_readahead(dev_t dev, block64_t base_block, unsigned int nblocks,
+ size_t last_size)
+{
+/* Read ahead 'nblocks' blocks starting from the block 'base_block' on device
+ * 'dev'. The number of blocks must be between 1 and LMFS_MAX_PREFETCH,
+ * inclusive. All blocks have the file system's block size, possibly except the
+ * last block in the range, which is of size 'last_size'. The caller must
+ * ensure that none of the blocks in the range are already in the cache.
+ * However, the caller must also not rely on all or even any of the blocks to
+ * be present in the cache afterwards--failures are (deliberately!) ignored.
+ */
+ static struct buf *bufq[LMFS_MAX_PREFETCH]; /* static because of size only */
+ struct buf *bp;
+ unsigned int count;
+ int r;
+
+ assert(nblocks >= 1 && nblocks <= LMFS_MAX_PREFETCH);
+
+ for (count = 0; count < nblocks; count++) {
+ if (count == nblocks - 1)
+ r = lmfs_get_partial_block(&bp, dev, base_block + count,
+ NO_READ, last_size);
+ else
+ r = lmfs_get_block(&bp, dev, base_block + count, NO_READ);
+
+ if (r != OK)
+ break;
+
+ /* We could add a flag that makes the get_block() calls fail if the
+ * block is already in the cache, but it is not a major concern if it
+ * is: we just perform a useless read in that case. However, if the
+ * block is cached *and* dirty, we are about to lose its new contents.
+ */
+ assert(lmfs_isclean(bp));
+
+ bufq[count] = bp;
+ }
+
+ rw_scattered(dev, bufq, count, READING);
+}
+
+/*===========================================================================*
+ * lmfs_readahead_limit *
+ *===========================================================================*/
+unsigned int lmfs_readahead_limit(void)
+{
+/* Return the maximum number of blocks that should be read ahead at once. The
+ * return value is guaranteed to be between 1 and LMFS_MAX_PREFETCH, inclusive. 
+ */ + unsigned int max_transfer, max_bufs; + + /* The returned value is the minimum of two factors: the maximum number of + * blocks that can be transferred in a single I/O gather request (see how + * rw_scattered() generates I/O requests), and a policy limit on the number + * of buffers that any read-ahead operation may use (that is, thrash). + */ + max_transfer = NR_IOREQS / MAX(fs_block_size / PAGE_SIZE, 1); + + /* The constants have been imported from MFS as is, and may need tuning. */ + if (nr_bufs < 50) + max_bufs = 18; + else + max_bufs = nr_bufs - 4; + + return MIN(max_transfer, max_bufs); +} + +/*===========================================================================* + * lmfs_prefetch * + *===========================================================================*/ +void lmfs_prefetch(dev_t dev, const block64_t *blockset, unsigned int nblocks) +{ +/* The given set of blocks is expected to be needed soon, so prefetch a + * convenient subset. The blocks are expected to be sorted by likelihood of + * being accessed soon, making the first block of the set the most important + * block to prefetch right now. The caller must have made sure that the blocks + * are not in the cache already. The array may have duplicate block numbers. + */ + bitchunk_t blocks_before[BITMAP_CHUNKS(LMFS_MAX_PREFETCH)]; + bitchunk_t blocks_after[BITMAP_CHUNKS(LMFS_MAX_PREFETCH)]; + block64_t block, base_block; + unsigned int i, bit, nr_before, nr_after, span, limit, nr_blocks; + + if (nblocks == 0) + return; + + /* Here is the deal. We are going to prefetch one range only, because seeking + * is too expensive for just prefetching. The range we select should at least + * include the first ("base") block of the given set, since that is the block + * the caller is primarily interested in. Thus, the rest of the range is + * going to have to be directly around this base block. We first check which + * blocks from the set fall just before and after the base block, which then + * allows us to construct a contiguous range of desired blocks directly + * around the base block, in O(n) time. As a natural part of this, we ignore + * duplicate blocks in the given set. We then read from the beginning of this + * range, in order to maximize the chance that a next prefetch request will + * continue from the last disk position without requiring a seek. However, we + * do correct for the maximum number of blocks we can (or should) read in at + * once, such that we will still end up reading the base block. + */ + base_block = blockset[0]; + + memset(blocks_before, 0, sizeof(blocks_before)); + memset(blocks_after, 0, sizeof(blocks_after)); + + for (i = 1; i < nblocks; i++) { + block = blockset[i]; + + if (block < base_block && block + LMFS_MAX_PREFETCH >= base_block) { + bit = base_block - block - 1; + assert(bit < LMFS_MAX_PREFETCH); + SET_BIT(blocks_before, bit); + } else if (block > base_block && + block - LMFS_MAX_PREFETCH <= base_block) { + bit = block - base_block - 1; + assert(bit < LMFS_MAX_PREFETCH); + SET_BIT(blocks_after, bit); + } + } + + for (nr_before = 0; nr_before < LMFS_MAX_PREFETCH; nr_before++) + if (!GET_BIT(blocks_before, nr_before)) + break; + + for (nr_after = 0; nr_after < LMFS_MAX_PREFETCH; nr_after++) + if (!GET_BIT(blocks_after, nr_after)) + break; + + /* The number of blocks to prefetch is the minimum of two factors: the number + * of blocks in the range around the base block, and the maximum number of + * blocks that should be read ahead at once at all. 
+ */ + span = nr_before + 1 + nr_after; + limit = lmfs_readahead_limit(); + + nr_blocks = MIN(span, limit); + assert(nr_blocks >= 1 && nr_blocks <= LMFS_MAX_PREFETCH); + + /* Start prefetching from the lowest block within the contiguous range, but + * make sure that we read at least the original base block itself, too. + */ + base_block -= MIN(nr_before, nr_blocks - 1); + + lmfs_readahead(dev, base_block, nr_blocks, fs_block_size); +} + +/*===========================================================================* + * lmfs_flushdev * + *===========================================================================*/ +void lmfs_flushdev(dev_t dev) +{ +/* Flush all dirty blocks for one device. */ + + register struct buf *bp; + static struct buf **dirty; + static unsigned int dirtylistsize = 0; + unsigned int ndirty; + + if(dirtylistsize != nr_bufs) { + if(dirtylistsize > 0) { + assert(dirty != NULL); + free(dirty); + } + if(!(dirty = malloc(sizeof(dirty[0])*nr_bufs))) + panic("couldn't allocate dirty buf list"); + dirtylistsize = nr_bufs; + } + + for (bp = &buf[0], ndirty = 0; bp < &buf[nr_bufs]; bp++) { + /* Do not flush dirty blocks that are in use (lmfs_count>0): the file + * system may mark the block as dirty before changing its contents, in + * which case the new contents could end up being lost. + */ + if (!lmfs_isclean(bp) && bp->lmfs_dev == dev && bp->lmfs_count == 0) { + dirty[ndirty++] = bp; + } + } + + rw_scattered(dev, dirty, ndirty, WRITING); +} + /*===========================================================================* * rm_lru * *===========================================================================*/ @@ -1128,16 +1285,6 @@ void lmfs_buf_pool(int new_nr_bufs) buf_hash[0] = front; } -int lmfs_bufs_in_use(void) -{ - return bufs_in_use; -} - -int lmfs_nr_bufs(void) -{ - return nr_bufs; -} - void lmfs_flushall(void) { struct buf *bp; diff --git a/minix/lib/libminixfs/inc.h b/minix/lib/libminixfs/inc.h index 0f73e26e7..ea01e99d0 100644 --- a/minix/lib/libminixfs/inc.h +++ b/minix/lib/libminixfs/inc.h @@ -3,5 +3,8 @@ int lmfs_get_partial_block(struct buf **bpp, dev_t dev, block64_t block, int how, size_t block_size); +void lmfs_readahead(dev_t dev, block64_t base_block, unsigned int nblocks, + size_t last_size); +unsigned int lmfs_readahead_limit(void); #endif /* !_LIBMINIXFS_INC_H */
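As an aside to this patch: the range selection performed by lmfs_prefetch() is
compact but subtle, so below is a self-contained toy model of the same policy
that can be compiled and run on any host. It is a sketch only: MAX_PREFETCH
stands in for LMFS_MAX_PREFETCH, the 'limit' parameter stands in for
lmfs_readahead_limit(), plain bool arrays replace the bitchunk_t bitmaps, and
select_range() is a hypothetical helper rather than part of the libminixfs API.

/* Toy model of the range selection in lmfs_prefetch(); illustration only. */
#include <stdio.h>
#include <stdbool.h>

#define MAX_PREFETCH 64			/* stand-in for LMFS_MAX_PREFETCH */

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Compute the contiguous range that would be read for the given block set.
 * set[0] is the base block; 'limit' must be between 1 and MAX_PREFETCH.
 * Returns the number of blocks and stores the first block in *firstp.
 */
static unsigned int select_range(const unsigned long long *set,
	unsigned int nblocks, unsigned int limit, unsigned long long *firstp)
{
	bool before[MAX_PREFETCH] = { false }, after[MAX_PREFETCH] = { false };
	unsigned long long base = set[0], block;
	unsigned int i, nr_before, nr_after, span, nr_blocks;

	/* Mark which set members fall just before or after the base block;
	 * duplicates in the set merely set the same flag twice.
	 */
	for (i = 1; i < nblocks; i++) {
		block = set[i];
		if (block < base && block + MAX_PREFETCH >= base)
			before[base - block - 1] = true;
		else if (block > base && block - MAX_PREFETCH <= base)
			after[block - base - 1] = true;
	}

	/* Extend the range in both directions while blocks remain wanted. */
	for (nr_before = 0; nr_before < MAX_PREFETCH; nr_before++)
		if (!before[nr_before]) break;
	for (nr_after = 0; nr_after < MAX_PREFETCH; nr_after++)
		if (!after[nr_after]) break;

	span = nr_before + 1 + nr_after;
	nr_blocks = MIN(span, limit);

	/* Read from the lowest wanted block, but never exclude the base. */
	*firstp = base - MIN(nr_before, nr_blocks - 1);
	return nr_blocks;
}

int main(void)
{
	/* Base block 100; 98 and 99 wanted before it, 101 and 103 after. */
	unsigned long long set[] = { 100, 98, 103, 99, 101 }, first;
	unsigned int count = select_range(set, 5, 32, &first);

	printf("read %u blocks starting at %llu\n", count, first); /* 4, 98 */
	return 0;
}

For the example set {100, 98, 103, 99, 101}, the model reads 4 blocks starting
at block 98: blocks 98 through 101 form the contiguous stretch around base
block 100, and block 103 is excluded because block 102 was never requested.
Starting the read at the low end of the range, rather than at the base block,
matches the rationale in the lmfs_prefetch() comment above: a subsequent
prefetch request is then more likely to continue from the last disk position
without requiring a seek.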