From: David van Moolenbroek
Date: Sat, 4 Apr 2015 15:55:48 +0000 (+0000)
Subject: libminixfs: rework prefetch API

libminixfs: rework prefetch API

This patch changes the prefetch API so that file systems must now provide a
set of block numbers, rather than a set of buffers. The result is a leaner
and better-defined API; linear computation of the range of blocks to
prefetch; duplicates no longer interfering with the prefetch process;
guaranteed inclusion of the block needed next into the prefetch range; and,
limits and policy decisions that are better established by libminixfs now
actually being moved into libminixfs.

Change-Id: I7e44daf2d2d164bc5e2f1473ad717f3ff0f0a77f
---

diff --git a/minix/fs/ext2/misc.c b/minix/fs/ext2/misc.c
index 585db14fa..4f79cf931 100644
--- a/minix/fs/ext2/misc.c
+++ b/minix/fs/ext2/misc.c
@@ -19,8 +19,6 @@ void fs_sync(void)
 */
 struct inode *rip;
- assert(lmfs_nr_bufs() > 0);
-
 if (superblock->s_rd_only)
 return; /* nothing to sync */
diff --git a/minix/fs/ext2/path.c b/minix/fs/ext2/path.c
index 0fb0c3711..c8e4c2635 100644
--- a/minix/fs/ext2/path.c
+++ b/minix/fs/ext2/path.c
@@ -215,7 +215,6 @@ int ftype; /* used when ENTER and INCOMPAT_FILETYPE */
 /* 'flag' is LOOK_UP */
 *numb = (ino_t) conv4(le_CPU, dp->d_ino);
 }
- assert(lmfs_dev(bp) != NO_DEV);
 put_block(bp);
 return(r);
 }
@@ -250,7 +249,6 @@ int ftype; /* used when ENTER and INCOMPAT_FILETYPE */
 }
 /* The whole block has been searched or ENTER has a free slot. */
- assert(lmfs_dev(bp) != NO_DEV);
 if (e_hit) break; /* e_hit set if ENTER can be performed now */
 put_block(bp); /* otherwise, continue searching dir */
 }
diff --git a/minix/fs/ext2/read.c b/minix/fs/ext2/read.c
index 4a38ff2fe..26595c510 100644
--- a/minix/fs/ext2/read.c
+++ b/minix/fs/ext2/read.c
@@ -252,8 +252,6 @@ int opportunistic;
 b = rip->i_block[EXT2_TIND_BLOCK];
 if (b == NO_BLOCK) return(NO_BLOCK);
 bp = get_block(rip->i_dev, b, NORMAL); /* get triple ind block */
- ASSERT(lmfs_dev(bp) != NO_DEV);
- ASSERT(lmfs_dev(bp) == rip->i_dev);
 excess = block_pos - triple_ind_s;
 mindex = excess / addr_in_block2;
 b = rd_indir(bp, mindex); /* num of double ind block */
@@ -264,8 +262,6 @@ int opportunistic;
 bp = get_block(rip->i_dev, b, iomode); /* get double indirect block */
 if (bp == NULL)
 return NO_BLOCK; /* peeking failed */
- ASSERT(lmfs_dev(bp) != NO_DEV);
- ASSERT(lmfs_dev(bp) == rip->i_dev);
 mindex = excess / addr_in_block;
 b = rd_indir(bp, mindex); /* num of single ind block */
 put_block(bp); /* release double ind block */
@@ -276,8 +272,6 @@ int opportunistic;
 if (bp == NULL)
 return NO_BLOCK; /* peeking failed */
- ASSERT(lmfs_dev(bp) != NO_DEV);
- ASSERT(lmfs_dev(bp) == rip->i_dev);
 b = rd_indir(bp, mindex);
 put_block(bp); /* release single ind block */
@@ -332,34 +326,16 @@ unsigned bytes_ahead; /* bytes beyond position for immediate use */
 * flag on all reads to allow this.
 */
 /* Minimum number of blocks to prefetch. */
-# define BLOCKS_MINIMUM (nr_bufs < 50 ? 18 : 32)
-	int nr_bufs = lmfs_nr_bufs();
+# define BLOCKS_MINIMUM 32
 int r, read_q_size;
 unsigned int blocks_ahead, fragment, block_size;
 block_t block, blocks_left;
 off_t ind1_pos;
 dev_t dev;
 struct buf *bp = NULL;
- static unsigned int readqsize = 0;
- static struct buf **read_q = NULL;
+ static block64_t read_q[LMFS_MAX_PREFETCH];
 u64_t position_running;
- if(readqsize != nr_bufs) {
- if(readqsize > 0) {
- assert(read_q != NULL);
- free(read_q);
- read_q = NULL;
- readqsize = 0;
- }
-
- assert(readqsize == 0);
- assert(read_q == NULL);
-
- if(!(read_q = malloc(sizeof(read_q[0])*nr_bufs)))
- panic("couldn't allocate read_q");
- readqsize = nr_bufs;
- }
-
 dev = rip->i_dev;
 assert(dev != NO_DEV);
 block_size = get_block_size(dev);
@@ -372,11 +348,11 @@ unsigned bytes_ahead; /* bytes beyond position for immediate use */
 bytes_ahead += fragment;
 blocks_ahead = (bytes_ahead + block_size - 1) / block_size;
- r = lmfs_get_block_ino(&bp, dev, block, PREFETCH, rip->i_num, position);
- if (r != OK)
+ r = lmfs_get_block_ino(&bp, dev, block, PEEK, rip->i_num, position);
+ if (r == OK)
+ return(bp);
+ if (r != ENOENT)
 panic("ext2: error getting block (%llu,%u): %d", dev, block, r);
- assert(bp != NULL);
- if (lmfs_dev(bp) != NO_DEV) return(bp);
 /* The best guess for the number of blocks to prefetch: A lot.
 * It is impossible to tell what the device looks like, so we don't even
@@ -408,9 +384,6 @@ unsigned bytes_ahead; /* bytes beyond position for immediate use */
 blocks_left++;
 }
- /* No more than the maximum request. */
- if (blocks_ahead > NR_IOREQS) blocks_ahead = NR_IOREQS;
-
 /* Read at least the minimum number of blocks, but not after a seek. */
 if (blocks_ahead < BLOCKS_MINIMUM && rip->i_seek == NO_SEEK)
 blocks_ahead = BLOCKS_MINIMUM;
@@ -418,38 +391,39 @@ unsigned bytes_ahead; /* bytes beyond position for immediate use */
 /* Can't go past end of file. */
 if (blocks_ahead > blocks_left) blocks_ahead = blocks_left;
+ /* No more than the maximum request. */
+ if (blocks_ahead > LMFS_MAX_PREFETCH) blocks_ahead = LMFS_MAX_PREFETCH;
+
+ read_q_size = 0;
+
 /* Acquire block buffers. */
 for (;;) {
 block_t thisblock;
- read_q[read_q_size++] = bp;
+ read_q[read_q_size++] = block;
 if (--blocks_ahead == 0) break;
- /* Don't trash the cache, leave 4 free. */
- if (lmfs_bufs_in_use() >= nr_bufs - 4) break;
-
 block++;
 position_running += block_size;
 thisblock = read_map(rip, (off_t) ex64lo(position_running), 1);
 if (thisblock != NO_BLOCK) {
- r = lmfs_get_block_ino(&bp, dev, thisblock, PREFETCH,
- rip->i_num, position_running);
- if (r != OK)
- panic("ext2: error getting block (%llu,%u): %d",
- dev, thisblock, r);
- } else {
- bp = get_block(dev, block, PREFETCH);
- }
- if (lmfs_dev(bp) != NO_DEV) {
+ r = lmfs_get_block_ino(&bp, dev, thisblock, PEEK, rip->i_num,
+ position_running);
+ block = thisblock;
+ } else
+ r = lmfs_get_block(&bp, dev, block, PEEK);
+
+ if (r == OK) {
 /* Oops, block already in the cache, get out. */
 put_block(bp);
 break;
 }
+ if (r != ENOENT)
+ panic("ext2: error getting block (%llu,%u): %d", dev, block,
+ r);
 }
- lmfs_rw_scattered(dev, read_q, read_q_size, READING);
+ lmfs_prefetch(dev, read_q, read_q_size);
 r = lmfs_get_block_ino(&bp, dev, baseblock, NORMAL, rip->i_num, position);
 if (r != OK)
diff --git a/minix/fs/mfs/cache.c b/minix/fs/mfs/cache.c
index bb3a4a4ac..4b3fcc223 100644
--- a/minix/fs/mfs/cache.c
+++ b/minix/fs/mfs/cache.c
@@ -57,7 +57,7 @@ zone_t alloc_zone(
 * z = b + sp->s_firstdatazone - 1
 * Alloc_bit() never returns 0, since this is used for NO_BIT (failure). 
*/ - sp = get_super(dev); + sp = &superblock; /* If z is 0, skip initial part of the map known to be fully in use. */ if (z == sp->s_firstdatazone) { @@ -93,7 +93,7 @@ void free_zone( bit_t bit; /* Locate the appropriate super_block and return bit. */ - sp = get_super(dev); + sp = &superblock; if (numb < sp->s_firstdatazone || numb >= sp->s_zones) return; bit = (bit_t) (numb - (zone_t) (sp->s_firstdatazone - 1)); free_bit(sp, ZMAP, bit); diff --git a/minix/fs/mfs/clean.h b/minix/fs/mfs/clean.h index 427696b2a..5d367116d 100644 --- a/minix/fs/mfs/clean.h +++ b/minix/fs/mfs/clean.h @@ -2,10 +2,13 @@ #ifndef _MFS_CLEAN_H #define _MFS_CLEAN_H 1 -#define MARKDIRTY(b) do { if(superblock.s_dev == lmfs_dev(b) && superblock.s_rd_only) { printf("%s:%d: dirty block on rofs! ", __FILE__, __LINE__); util_stacktrace(); } else { lmfs_markdirty(b); } } while(0) -#define MARKCLEAN(b) lmfs_markclean(b) - -#define ISDIRTY(b) (!lmfs_isclean(b)) -#define ISCLEAN(b) (lmfs_isclean(b)) +#define MARKDIRTY(b) do { \ + if (superblock.s_rd_only) { \ + printf("%s:%d: dirty block on rofs! ", __FILE__, __LINE__); \ + util_stacktrace(); \ + } else { \ + lmfs_markdirty(b); \ + } \ +} while(0) #endif diff --git a/minix/fs/mfs/inode.c b/minix/fs/mfs/inode.c index 65c5e3d65..2827c047e 100644 --- a/minix/fs/mfs/inode.c +++ b/minix/fs/mfs/inode.c @@ -258,7 +258,7 @@ struct inode *alloc_inode(dev_t dev, mode_t bits, uid_t uid, gid_t gid) int major, minor, inumb; bit_t b; - sp = get_super(dev); /* get pointer to super_block */ + sp = &superblock; if (sp->s_rd_only) { /* can't allocate an inode on a read only device. */ err_code = EROFS; return(NULL); @@ -335,8 +335,7 @@ static void free_inode( register struct super_block *sp; bit_t b; - /* Locate the appropriate super_block. */ - sp = get_super(dev); + sp = &superblock; if (inumb == NO_ENTRY || inumb > sp->s_ninodes) return; b = (bit_t) inumb; free_bit(sp, IMAP, b); @@ -385,7 +384,7 @@ int rw_flag; /* READING or WRITING */ block_t b, offset; /* Get the block where the inode resides. */ - sp = get_super(rip->i_dev); /* get pointer to super block */ + sp = &superblock; rip->i_sp = sp; /* inode must contain super block pointer */ offset = START_BLOCK + sp->s_imap_blocks + sp->s_zmap_blocks; b = (block_t) (rip->i_num - 1)/sp->s_inodes_per_block + offset; diff --git a/minix/fs/mfs/misc.c b/minix/fs/mfs/misc.c index 48d85ec49..99f54ff26 100644 --- a/minix/fs/mfs/misc.c +++ b/minix/fs/mfs/misc.c @@ -15,8 +15,6 @@ void fs_sync(void) */ struct inode *rip; - assert(lmfs_nr_bufs() > 0); - /* Write all the dirty inodes to the disk. */ for(rip = &inode[0]; rip < &inode[NR_INODES]; rip++) if(rip->i_count > 0 && IN_ISDIRTY(rip)) rw_inode(rip, WRITING); diff --git a/minix/fs/mfs/path.c b/minix/fs/mfs/path.c index 2eae77e58..034fdcfd9 100644 --- a/minix/fs/mfs/path.c +++ b/minix/fs/mfs/path.c @@ -140,7 +140,6 @@ int flag; /* LOOK_UP, ENTER, DELETE or IS_EMPTY */ assert(ldir_ptr->i_dev != NO_DEV); assert(bp != NULL); - assert(lmfs_dev(bp) != NO_DEV); /* Search a directory block. */ for (dp = &b_dir(bp)[0]; @@ -185,7 +184,6 @@ int flag; /* LOOK_UP, ENTER, DELETE or IS_EMPTY */ *numb = (ino_t) conv4(sp->s_native, (int) dp->mfs_d_ino); } - assert(lmfs_dev(bp) != NO_DEV); put_block(bp); return(r); } @@ -199,7 +197,6 @@ int flag; /* LOOK_UP, ENTER, DELETE or IS_EMPTY */ /* The whole block has been searched or ENTER has a free slot. 
*/ if (e_hit) break; /* e_hit set if ENTER can be performed now */ - assert(lmfs_dev(bp) != NO_DEV); put_block(bp); /* otherwise, continue searching dir */ } diff --git a/minix/fs/mfs/proto.h b/minix/fs/mfs/proto.h index a519cf203..d0598e897 100644 --- a/minix/fs/mfs/proto.h +++ b/minix/fs/mfs/proto.h @@ -84,7 +84,6 @@ int fs_statvfs(struct statvfs *st); bit_t alloc_bit(struct super_block *sp, int map, bit_t origin); void free_bit(struct super_block *sp, int map, bit_t bit_returned); unsigned int get_block_size(dev_t dev); -struct super_block *get_super(dev_t dev); int read_super(struct super_block *sp); int write_super(struct super_block *sp); diff --git a/minix/fs/mfs/read.c b/minix/fs/mfs/read.c index af4b76064..e4c0ed9ab 100644 --- a/minix/fs/mfs/read.c +++ b/minix/fs/mfs/read.c @@ -260,8 +260,6 @@ int opportunistic; /* if nonzero, only use cache for metadata */ bp = get_block(rip->i_dev, b, iomode); /* get double indirect block */ if (bp == NULL) return NO_BLOCK; /* peeking failed */ - ASSERT(lmfs_dev(bp) != NO_DEV); - ASSERT(lmfs_dev(bp) == rip->i_dev); z = rd_indir(bp, index); /* z= zone for single*/ put_block(bp); /* release double ind block */ excess = excess % nr_indirects; /* index into single ind blk */ @@ -310,7 +308,7 @@ int index; /* index into *bp */ if(bp == NULL) panic("rd_indir() on NULL"); - sp = get_super(lmfs_dev(bp)); /* need super block to find file sys type */ + sp = &superblock; /* read a zone from an indirect block */ assert(sp->s_version == V3); @@ -343,28 +341,15 @@ unsigned bytes_ahead; /* bytes beyond position for immediate use */ * flag on all reads to allow this. */ /* Minimum number of blocks to prefetch. */ - int nr_bufs = lmfs_nr_bufs(); -# define BLOCKS_MINIMUM (nr_bufs < 50 ? 18 : 32) +# define BLOCKS_MINIMUM 32 int r, scale, read_q_size; unsigned int blocks_ahead, fragment, block_size; block_t block, blocks_left; off_t ind1_pos; dev_t dev; struct buf *bp; - static unsigned int readqsize = 0; - static struct buf **read_q; + static block64_t read_q[LMFS_MAX_PREFETCH]; u64_t position_running; - int inuse_before = lmfs_bufs_in_use(); - - if(readqsize != nr_bufs) { - if(readqsize > 0) { - assert(read_q != NULL); - free(read_q); - } - if(!(read_q = malloc(sizeof(read_q[0])*nr_bufs))) - panic("couldn't allocate read_q"); - readqsize = nr_bufs; - } dev = rip->i_dev; assert(dev != NO_DEV); @@ -379,12 +364,11 @@ unsigned bytes_ahead; /* bytes beyond position for immediate use */ bytes_ahead += fragment; blocks_ahead = (bytes_ahead + block_size - 1) / block_size; - r = lmfs_get_block_ino(&bp, dev, block, PREFETCH, rip->i_num, position); - if (r != OK) + r = lmfs_get_block_ino(&bp, dev, block, PEEK, rip->i_num, position); + if (r == OK) + return(bp); + if (r != ENOENT) panic("MFS: error getting block (%llu,%u): %d", dev, block, r); - assert(bp != NULL); - assert(bp->lmfs_count > 0); - if (lmfs_dev(bp) != NO_DEV) return(bp); /* The best guess for the number of blocks to prefetch: A lot. * It is impossible to tell what the device looks like, so we don't even @@ -417,9 +401,6 @@ unsigned bytes_ahead; /* bytes beyond position for immediate use */ blocks_left++; } - /* No more than the maximum request. */ - if (blocks_ahead > NR_IOREQS) blocks_ahead = NR_IOREQS; - /* Read at least the minimum number of blocks, but not after a seek. */ if (blocks_ahead < BLOCKS_MINIMUM && rip->i_seek == NO_SEEK) blocks_ahead = BLOCKS_MINIMUM; @@ -427,43 +408,38 @@ unsigned bytes_ahead; /* bytes beyond position for immediate use */ /* Can't go past end of file. 
*/
 if (blocks_ahead > blocks_left) blocks_ahead = blocks_left;
+ /* No more than the maximum request. */
+ if (blocks_ahead > LMFS_MAX_PREFETCH) blocks_ahead = LMFS_MAX_PREFETCH;
+
+ read_q_size = 0;
+
 /* Acquire block buffers. */
 for (;;) {
 block_t thisblock;
- assert(bp->lmfs_count > 0);
- read_q[read_q_size++] = bp;
+ read_q[read_q_size++] = block;
 if (--blocks_ahead == 0) break;
- /* Don't trash the cache, leave 4 free. */
- if (lmfs_bufs_in_use() >= nr_bufs - 4) break;
-
 block++;
 position_running += block_size;
 thisblock = read_map(rip, (off_t) ex64lo(position_running), 1);
 if (thisblock != NO_BLOCK) {
- r = lmfs_get_block_ino(&bp, dev, thisblock, PREFETCH,
- rip->i_num, position_running);
- if (r != OK)
- panic("MFS: error getting block (%llu,%u): %d",
- dev, thisblock, r);
- } else {
- bp = get_block(dev, block, PREFETCH);
- }
- assert(bp);
- assert(bp->lmfs_count > 0);
- if (lmfs_dev(bp) != NO_DEV) {
+ r = lmfs_get_block_ino(&bp, dev, thisblock, PEEK, rip->i_num,
+ position_running);
+ block = thisblock;
+ } else
+ r = lmfs_get_block(&bp, dev, block, PEEK);
+
+ if (r == OK) {
 /* Oops, block already in the cache, get out. */
 put_block(bp);
 break;
 }
+ if (r != ENOENT)
+ panic("MFS: error getting block (%llu,%u): %d", dev, block, r);
 }
- lmfs_rw_scattered(dev, read_q, read_q_size, READING);
-
- assert(inuse_before == lmfs_bufs_in_use());
+ lmfs_prefetch(dev, read_q, read_q_size);
 r = lmfs_get_block_ino(&bp, dev, baseblock, NORMAL, rip->i_num, position);
 if (r != OK)
diff --git a/minix/fs/mfs/stadir.c b/minix/fs/mfs/stadir.c
index 2e53ee959..8a429fb2d 100644
--- a/minix/fs/mfs/stadir.c
+++ b/minix/fs/mfs/stadir.c
@@ -85,7 +85,7 @@ int fs_statvfs(struct statvfs *st)
 struct super_block *sp;
 int scale;
- sp = get_super(fs_dev);
+ sp = &superblock;
 scale = sp->s_log_zone_size;
diff --git a/minix/fs/mfs/super.c b/minix/fs/mfs/super.c
index f5e736a4b..32d30fdad 100644
--- a/minix/fs/mfs/super.c
+++ b/minix/fs/mfs/super.c
@@ -6,7 +6,6 @@
 * The entry points into this file are
 * alloc_bit: somebody wants to allocate a zone or inode; find one
 * free_bit: indicate that a zone or inode is available for allocation
- * get_super: search the 'superblock' table for a device
 * mounted: tells if file inode is on mounted (or ROOT) file system
 * read_super: read a superblock
 */
@@ -156,23 +155,6 @@ bit_t bit_returned; /* number of bit to insert into the map */
 }
 }
-/*===========================================================================*
- * get_super *
- *===========================================================================*/
-struct super_block *get_super(
- dev_t dev /* device number whose super_block is sought */
-)
-{
- if (dev == NO_DEV)
- panic("request for super_block of NO_DEV");
-
- if(superblock.s_dev != dev)
- panic("wrong superblock: 0x%x", (int) dev);
-
- return(&superblock);
-}
-
-
 /*===========================================================================*
 * get_block_size *
 *===========================================================================*/
diff --git a/minix/fs/mfs/write.c b/minix/fs/mfs/write.c
index 51a4a1db3..dce03d0aa 100644
--- a/minix/fs/mfs/write.c
+++ b/minix/fs/mfs/write.c
@@ -200,7 +200,7 @@ zone_t zone; /* zone to write */
 if(bp == NULL)
 panic("wr_indir() on NULL");
- sp = get_super(lmfs_dev(bp)); /* need super block to find file sys type */
+ sp = &superblock;
 /* write a zone into an indirect block */
 assert(sp->s_version == V3);
diff --git a/minix/include/minix/libminixfs.h b/minix/include/minix/libminixfs.h
index 05d1aee45..f63041cc5 100644
--- a/minix/include/minix/libminixfs.h
+++ b/minix/include/minix/libminixfs.h
@@ -5,6 +5,9 @@
 #include
+/* Maximum number of blocks that will be considered by lmfs_prefetch() */
+#define LMFS_MAX_PREFETCH NR_IOREQS
+
 struct buf {
 /* Data portion of the buffer. */
 void *data;
@@ -30,9 +33,6 @@ struct buf {
 void lmfs_markdirty(struct buf *bp);
 void lmfs_markclean(struct buf *bp);
 int lmfs_isclean(struct buf *bp);
-dev_t lmfs_dev(struct buf *bp);
-int lmfs_bufs_in_use(void);
-int lmfs_nr_bufs(void);
 void lmfs_flushall(void);
 void lmfs_flushdev(dev_t dev);
 size_t lmfs_fs_block_size(void);
@@ -46,7 +46,7 @@ void lmfs_put_block(struct buf *bp);
 void lmfs_free_block(dev_t dev, block64_t block);
 void lmfs_zero_block_ino(dev_t dev, ino_t ino, u64_t off);
 void lmfs_invalidate(dev_t device);
-void lmfs_rw_scattered(dev_t, struct buf **, int, int);
+void lmfs_prefetch(dev_t dev, const block64_t *blockset, unsigned int nblocks);
 void lmfs_setquiet(int q);
 void lmfs_set_blockusage(fsblkcnt_t btotal, fsblkcnt_t bused);
 void lmfs_change_blockusage(int delta);
@@ -54,8 +54,7 @@ void lmfs_change_blockusage(int delta);
 /* get_block arguments */
 #define NORMAL 0 /* forces get_block to do disk read */
 #define NO_READ 1 /* prevents get_block from doing disk read */
-#define PREFETCH 2 /* tells get_block not to read or mark dev */
-#define PEEK 3 /* returns ENOENT if not in cache */
+#define PEEK 2 /* returns ENOENT if not in cache */
 /* Block I/O helper functions. */
 void lmfs_driver(dev_t dev, char *label);
diff --git a/minix/lib/libminixfs/bio.c b/minix/lib/libminixfs/bio.c
index 0cbaf10ef..0e2d3d59a 100644
--- a/minix/lib/libminixfs/bio.c
+++ b/minix/lib/libminixfs/bio.c
@@ -10,7 +10,8 @@
 * o it must initialize this library in order to set up a buffer pool for
 * use by these functions, using the lmfs_buf_pool function; the
 * recommended number of blocks for *non*-disk-backed file systems is
- * NR_IOREQS buffers (disk-backed file systems typically use many more);
+ * LMFS_MAX_PREFETCH buffers (disk-backed file systems typically use many
+ * more);
 * o it must enable VM caching in order to support memory mapping of block
 * devices, using the lmfs_may_use_vmcache function;
 * o it must either use lmfs_flushall as implementation for the fdr_sync
@@ -64,12 +65,15 @@ static void
 block_prefetch(dev_t dev, block64_t block, unsigned int nblocks,
 size_t block_size, size_t last_size)
 {
- struct buf *bp, *bufs[NR_IOREQS];
- unsigned int count;
+ struct buf *bp;
+ unsigned int count, limit;
 int r;
- if (nblocks > NR_IOREQS) {
- nblocks = NR_IOREQS;
+ limit = lmfs_readahead_limit();
+ assert(limit >= 1 && limit <= LMFS_MAX_PREFETCH);
+
+ if (nblocks > limit) {
+ nblocks = limit;
 last_size = block_size;
 }
@@ -77,24 +81,21 @@ block_prefetch(dev_t dev, block64_t block, unsigned int nblocks,
 for (count = 0; count < nblocks; count++) {
 if (count == nblocks - 1 && last_size < block_size)
 r = lmfs_get_partial_block(&bp, dev, block + count,
- PREFETCH, last_size);
+ PEEK, last_size);
 else
- r = lmfs_get_block(&bp, dev, block + count, PREFETCH);
-
- if (r != OK)
- panic("libminixfs: get_block PREFETCH error: %d\n", r);
+ r = lmfs_get_block(&bp, dev, block + count, PEEK);
- if (lmfs_dev(bp) != NO_DEV) {
+ if (r == OK) {
 lmfs_put_block(bp);
+ last_size = block_size;
+ break;
 }
-
- bufs[count] = bp;
 }
 if (count > 0)
- lmfs_rw_scattered(dev, bufs, count, READING);
+ lmfs_readahead(dev, block, count, last_size);
 }
 /*
@@ -206,8 +207,6 @@ lmfs_bio(dev_t dev, struct fsdriver_data * data, size_t bytes, off_t pos,
 /* Perform the actual copy. 
*/
 if (r == OK && data != NULL) {
- assert(lmfs_dev(bp) != NO_DEV);
-
 if (write) {
 r = fsdriver_copyin(data, off,
 (char *)bp->data + block_off, chunk);
diff --git a/minix/lib/libminixfs/cache.c b/minix/lib/libminixfs/cache.c
index 2dc377c08..ffb7aadc5 100644
--- a/minix/lib/libminixfs/cache.c
+++ b/minix/lib/libminixfs/cache.c
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include <minix/bitmap.h>
 #include "inc.h"
@@ -173,11 +174,6 @@ int lmfs_isclean(struct buf *bp)
 return !(bp->lmfs_flags & VMMC_DIRTY);
 }
-dev_t lmfs_dev(struct buf *bp)
-{
- return bp->lmfs_dev;
-}
-
 static void free_unused_blocks(void)
 {
 struct buf *bp;
@@ -319,10 +315,8 @@ static int get_block_ino(struct buf **bpp, dev_t dev, block64_t block, int how,
 * disk (if 'how' is NORMAL). If 'how' is NO_READ, the caller intends to
 * overwrite the requested block in its entirety, so it is only necessary to
 * see if it is in the cache; if it is not, any free buffer will do. If 'how'
- * is PREFETCH, the block need not be read from the disk, and the device is not
- * to be marked on the block (i.e., set to NO_DEV), so callers can tell if the
- * block returned is valid. If 'how' is PEEK, the function returns the block
- * if it is in the cache or the VM cache, and an ENOENT error code otherwise.
+ * is PEEK, the function returns the block if it is in the cache or the VM
+ * cache, and an ENOENT error code otherwise.
 * In addition to the LRU chain, there is also a hash chain to link together
 * blocks whose block numbers end with the same bit strings, for fast lookup.
 */
@@ -441,12 +435,16 @@ static int get_block_ino(struct buf **bpp, dev_t dev, block64_t block, int how,
 assert(dev != NO_DEV);
- /* Block is not found in our cache, but we do want it
- * if it's in the vm cache.
+ /* The block is not found in our cache, but we do want it if it's in the VM
+ * cache. The exception is NO_READ, purely for context switching performance
+ * reasons. NO_READ is used for 1) newly allocated blocks, 2) blocks being
+ * prefetched, and 3) blocks about to be fully overwritten. In the first two
+ * cases, VM will not have the block in its cache anyway, and for the third
+ * we save on one VM call only if the block is in the VM cache.
 */
 assert(!bp->data);
 assert(!bp->lmfs_bytes);
- if(vmcache) {
+ if (how != NO_READ && vmcache) {
 if((bp->data = vm_map_cacheblock(dev, dev_off, ino, ino_off,
 &bp->lmfs_flags, roundup(block_size, PAGE_SIZE))) != MAP_FAILED) {
 bp->lmfs_bytes = block_size;
@@ -476,10 +474,7 @@ static int get_block_ino(struct buf **bpp, dev_t dev, block64_t block, int how,
 assert(bp->data);
- if(how == PREFETCH) {
- /* PREFETCH: don't do i/o. */
- bp->lmfs_dev = NO_DEV;
- } else if (how == NORMAL) {
+ if (how == NORMAL) {
 /* Try to read the block. Return an error code on failure. */
 if ((r = read_block(bp, block_size)) != OK) {
 put_block(bp, 0);
@@ -812,68 +807,59 @@ void lmfs_invalidate(
 }
 /*===========================================================================*
- * lmfs_flushdev *
+ * sort_blocks *
 *===========================================================================*/
-void lmfs_flushdev(dev_t dev)
+static void sort_blocks(struct buf **bufq, unsigned int bufqsize)
 {
-/* Flush all dirty blocks for one device. 
*/ - - register struct buf *bp; - static struct buf **dirty; - static unsigned int dirtylistsize = 0; - int ndirty; + struct buf *bp; + int i, j, gap; - if(dirtylistsize != nr_bufs) { - if(dirtylistsize > 0) { - assert(dirty != NULL); - free(dirty); - } - if(!(dirty = malloc(sizeof(dirty[0])*nr_bufs))) - panic("couldn't allocate dirty buf list"); - dirtylistsize = nr_bufs; - } + gap = 1; + do + gap = 3 * gap + 1; + while ((unsigned int)gap <= bufqsize); - for (bp = &buf[0], ndirty = 0; bp < &buf[nr_bufs]; bp++) { - /* Do not flush dirty blocks that are in use (lmfs_count>0): the file - * system may mark the block as dirty before changing its contents, in - * which case the new contents could end up being lost. - */ - if (!lmfs_isclean(bp) && bp->lmfs_dev == dev && bp->lmfs_count == 0) { - dirty[ndirty++] = bp; + while (gap != 1) { + gap /= 3; + for (j = gap; (unsigned int)j < bufqsize; j++) { + for (i = j - gap; i >= 0 && + bufq[i]->lmfs_blocknr > bufq[i + gap]->lmfs_blocknr; + i -= gap) { + bp = bufq[i]; + bufq[i] = bufq[i + gap]; + bufq[i + gap] = bp; + } } } - - lmfs_rw_scattered(dev, dirty, ndirty, WRITING); } /*===========================================================================* - * lmfs_rw_scattered * + * rw_scattered * *===========================================================================*/ -void lmfs_rw_scattered( +static void rw_scattered( dev_t dev, /* major-minor device number */ struct buf **bufq, /* pointer to array of buffers */ - int bufqsize, /* number of buffers */ + unsigned int bufqsize, /* number of buffers */ int rw_flag /* READING or WRITING */ ) { /* Read or write scattered data from a device. */ register struct buf *bp; - int gap; - register int i; register iovec_t *iop; static iovec_t iovec[NR_IOREQS]; off_t pos; - int iov_per_block; + unsigned int i, iov_per_block; unsigned int start_in_use = bufs_in_use, start_bufqsize = bufqsize; - assert(bufqsize >= 0); if(bufqsize == 0) return; /* for READING, check all buffers on the list are obtained and held * (count > 0) */ if (rw_flag == READING) { + assert(bufqsize <= LMFS_MAX_PREFETCH); + for(i = 0; i < bufqsize; i++) { assert(bufq[i] != NULL); assert(bufq[i]->lmfs_count > 0); @@ -887,40 +873,26 @@ void lmfs_rw_scattered( assert(fs_block_size > 0); assert(howmany(fs_block_size, PAGE_SIZE) <= NR_IOREQS); - /* (Shell) sort buffers on lmfs_blocknr. */ - gap = 1; - do - gap = 3 * gap + 1; - while (gap <= bufqsize); - while (gap != 1) { - int j; - gap /= 3; - for (j = gap; j < bufqsize; j++) { - for (i = j - gap; - i >= 0 && bufq[i]->lmfs_blocknr > bufq[i + gap]->lmfs_blocknr; - i -= gap) { - bp = bufq[i]; - bufq[i] = bufq[i + gap]; - bufq[i + gap] = bp; - } - } - } + /* For WRITING, (Shell) sort buffers on lmfs_blocknr. + * For READING, the buffers are already sorted. + */ + if (rw_flag == WRITING) + sort_blocks(bufq, bufqsize); /* Set up I/O vector and do I/O. The result of bdev I/O is OK if everything * went fine, otherwise the error code for the first failed transfer. 
*/
 while (bufqsize > 0) {
- int nblocks = 0, niovecs = 0;
+ unsigned int p, nblocks = 0, niovecs = 0;
 int r;
 for (iop = iovec; nblocks < bufqsize; nblocks++) {
- int p;
 vir_bytes vdata, blockrem;
 bp = bufq[nblocks];
 if (bp->lmfs_blocknr != bufq[0]->lmfs_blocknr + nblocks)
 break;
 blockrem = bp->lmfs_bytes;
 iov_per_block = howmany(blockrem, PAGE_SIZE);
- if(niovecs >= NR_IOREQS-iov_per_block) break;
+ if (niovecs > NR_IOREQS - iov_per_block) break;
 vdata = (vir_bytes) bp->data;
 for(p = 0; p < iov_per_block; p++) {
 vir_bytes chunk =
@@ -937,7 +909,7 @@ void lmfs_rw_scattered(
 }
 assert(nblocks > 0);
- assert(niovecs > 0);
+ assert(niovecs > 0 && niovecs <= NR_IOREQS);
 pos = (off_t)bufq[0]->lmfs_blocknr * fs_block_size;
 if (rw_flag == READING)
@@ -963,7 +935,6 @@ void lmfs_rw_scattered(
 break;
 }
 if (rw_flag == READING) {
- bp->lmfs_dev = dev; /* validate block */
 lmfs_put_block(bp);
 } else {
 MARKCLEAN(bp);
@@ -979,7 +950,9 @@ void lmfs_rw_scattered(
 * give at this time. Don't forget to release those extras.
 */
 while (bufqsize > 0) {
- lmfs_put_block(*bufq++);
+ bp = *bufq++;
+ bp->lmfs_dev = NO_DEV; /* invalidate block */
+ lmfs_put_block(bp);
 bufqsize--;
 }
 }
@@ -1001,6 +974,190 @@ void lmfs_rw_scattered(
 }
 }
+/*===========================================================================*
+ * lmfs_readahead *
+ *===========================================================================*/
+void lmfs_readahead(dev_t dev, block64_t base_block, unsigned int nblocks,
+ size_t last_size)
+{
+/* Read ahead 'nblocks' blocks starting from the block 'base_block' on device
+ * 'dev'. The number of blocks must be between 1 and LMFS_MAX_PREFETCH,
+ * inclusive. All blocks have the file system's block size, possibly except the
+ * last block in the range, which is of size 'last_size'. The caller must
+ * ensure that none of the blocks in the range are already in the cache.
+ * However, the caller must also not rely on all or even any of the blocks to
+ * be present in the cache afterwards--failures are (deliberately!) ignored.
+ */
+ static struct buf *bufq[LMFS_MAX_PREFETCH]; /* static because of size only */
+ struct buf *bp;
+ unsigned int count;
+ int r;
+
+ assert(nblocks >= 1 && nblocks <= LMFS_MAX_PREFETCH);
+
+ for (count = 0; count < nblocks; count++) {
+ if (count == nblocks - 1)
+ r = lmfs_get_partial_block(&bp, dev, base_block + count,
+ NO_READ, last_size);
+ else
+ r = lmfs_get_block(&bp, dev, base_block + count, NO_READ);
+
+ if (r != OK)
+ break;
+
+ /* We could add a flag that makes the get_block() calls fail if the
+ * block is already in the cache, but it is not a major concern if it
+ * is: we just perform a useless read in that case. However, if the
+ * block is cached *and* dirty, we are about to lose its new contents.
+ */
+ assert(lmfs_isclean(bp));
+
+ bufq[count] = bp;
+ }
+
+ rw_scattered(dev, bufq, count, READING);
+}
+
+/*===========================================================================*
+ * lmfs_readahead_limit *
+ *===========================================================================*/
+unsigned int lmfs_readahead_limit(void)
+{
+/* Return the maximum number of blocks that should be read ahead at once. The
+ * return value is guaranteed to be between 1 and LMFS_MAX_PREFETCH, inclusive. 
+ */ + unsigned int max_transfer, max_bufs; + + /* The returned value is the minimum of two factors: the maximum number of + * blocks that can be transferred in a single I/O gather request (see how + * rw_scattered() generates I/O requests), and a policy limit on the number + * of buffers that any read-ahead operation may use (that is, thrash). + */ + max_transfer = NR_IOREQS / MAX(fs_block_size / PAGE_SIZE, 1); + + /* The constants have been imported from MFS as is, and may need tuning. */ + if (nr_bufs < 50) + max_bufs = 18; + else + max_bufs = nr_bufs - 4; + + return MIN(max_transfer, max_bufs); +} + +/*===========================================================================* + * lmfs_prefetch * + *===========================================================================*/ +void lmfs_prefetch(dev_t dev, const block64_t *blockset, unsigned int nblocks) +{ +/* The given set of blocks is expected to be needed soon, so prefetch a + * convenient subset. The blocks are expected to be sorted by likelihood of + * being accessed soon, making the first block of the set the most important + * block to prefetch right now. The caller must have made sure that the blocks + * are not in the cache already. The array may have duplicate block numbers. + */ + bitchunk_t blocks_before[BITMAP_CHUNKS(LMFS_MAX_PREFETCH)]; + bitchunk_t blocks_after[BITMAP_CHUNKS(LMFS_MAX_PREFETCH)]; + block64_t block, base_block; + unsigned int i, bit, nr_before, nr_after, span, limit, nr_blocks; + + if (nblocks == 0) + return; + + /* Here is the deal. We are going to prefetch one range only, because seeking + * is too expensive for just prefetching. The range we select should at least + * include the first ("base") block of the given set, since that is the block + * the caller is primarily interested in. Thus, the rest of the range is + * going to have to be directly around this base block. We first check which + * blocks from the set fall just before and after the base block, which then + * allows us to construct a contiguous range of desired blocks directly + * around the base block, in O(n) time. As a natural part of this, we ignore + * duplicate blocks in the given set. We then read from the beginning of this + * range, in order to maximize the chance that a next prefetch request will + * continue from the last disk position without requiring a seek. However, we + * do correct for the maximum number of blocks we can (or should) read in at + * once, such that we will still end up reading the base block. + */ + base_block = blockset[0]; + + memset(blocks_before, 0, sizeof(blocks_before)); + memset(blocks_after, 0, sizeof(blocks_after)); + + for (i = 1; i < nblocks; i++) { + block = blockset[i]; + + if (block < base_block && block + LMFS_MAX_PREFETCH >= base_block) { + bit = base_block - block - 1; + assert(bit < LMFS_MAX_PREFETCH); + SET_BIT(blocks_before, bit); + } else if (block > base_block && + block - LMFS_MAX_PREFETCH <= base_block) { + bit = block - base_block - 1; + assert(bit < LMFS_MAX_PREFETCH); + SET_BIT(blocks_after, bit); + } + } + + for (nr_before = 0; nr_before < LMFS_MAX_PREFETCH; nr_before++) + if (!GET_BIT(blocks_before, nr_before)) + break; + + for (nr_after = 0; nr_after < LMFS_MAX_PREFETCH; nr_after++) + if (!GET_BIT(blocks_after, nr_after)) + break; + + /* The number of blocks to prefetch is the minimum of two factors: the number + * of blocks in the range around the base block, and the maximum number of + * blocks that should be read ahead at once at all. 
+ */ + span = nr_before + 1 + nr_after; + limit = lmfs_readahead_limit(); + + nr_blocks = MIN(span, limit); + assert(nr_blocks >= 1 && nr_blocks <= LMFS_MAX_PREFETCH); + + /* Start prefetching from the lowest block within the contiguous range, but + * make sure that we read at least the original base block itself, too. + */ + base_block -= MIN(nr_before, nr_blocks - 1); + + lmfs_readahead(dev, base_block, nr_blocks, fs_block_size); +} + +/*===========================================================================* + * lmfs_flushdev * + *===========================================================================*/ +void lmfs_flushdev(dev_t dev) +{ +/* Flush all dirty blocks for one device. */ + + register struct buf *bp; + static struct buf **dirty; + static unsigned int dirtylistsize = 0; + unsigned int ndirty; + + if(dirtylistsize != nr_bufs) { + if(dirtylistsize > 0) { + assert(dirty != NULL); + free(dirty); + } + if(!(dirty = malloc(sizeof(dirty[0])*nr_bufs))) + panic("couldn't allocate dirty buf list"); + dirtylistsize = nr_bufs; + } + + for (bp = &buf[0], ndirty = 0; bp < &buf[nr_bufs]; bp++) { + /* Do not flush dirty blocks that are in use (lmfs_count>0): the file + * system may mark the block as dirty before changing its contents, in + * which case the new contents could end up being lost. + */ + if (!lmfs_isclean(bp) && bp->lmfs_dev == dev && bp->lmfs_count == 0) { + dirty[ndirty++] = bp; + } + } + + rw_scattered(dev, dirty, ndirty, WRITING); +} + /*===========================================================================* * rm_lru * *===========================================================================*/ @@ -1128,16 +1285,6 @@ void lmfs_buf_pool(int new_nr_bufs) buf_hash[0] = front; } -int lmfs_bufs_in_use(void) -{ - return bufs_in_use; -} - -int lmfs_nr_bufs(void) -{ - return nr_bufs; -} - void lmfs_flushall(void) { struct buf *bp; diff --git a/minix/lib/libminixfs/inc.h b/minix/lib/libminixfs/inc.h index 0f73e26e7..ea01e99d0 100644 --- a/minix/lib/libminixfs/inc.h +++ b/minix/lib/libminixfs/inc.h @@ -3,5 +3,8 @@ int lmfs_get_partial_block(struct buf **bpp, dev_t dev, block64_t block, int how, size_t block_size); +void lmfs_readahead(dev_t dev, block64_t base_block, unsigned int nblocks, + size_t last_size); +unsigned int lmfs_readahead_limit(void); #endif /* !_LIBMINIXFS_INC_H */
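As an aside to this patch: the range selection performed by lmfs_prefetch() is
compact but subtle, so below is a self-contained toy model of the same policy
that can be compiled and run on any host. It is a sketch only: MAX_PREFETCH
stands in for LMFS_MAX_PREFETCH, the 'limit' parameter stands in for
lmfs_readahead_limit(), plain bool arrays replace the bitchunk_t bitmaps, and
select_range() is a hypothetical helper rather than part of the libminixfs API.

/* Toy model of the range selection in lmfs_prefetch(); illustration only. */
#include <stdio.h>
#include <stdbool.h>

#define MAX_PREFETCH 64			/* stand-in for LMFS_MAX_PREFETCH */

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Compute the contiguous range that would be read for the given block set.
 * set[0] is the base block; 'limit' must be between 1 and MAX_PREFETCH.
 * Returns the number of blocks and stores the first block in *firstp.
 */
static unsigned int select_range(const unsigned long long *set,
	unsigned int nblocks, unsigned int limit, unsigned long long *firstp)
{
	bool before[MAX_PREFETCH] = { false }, after[MAX_PREFETCH] = { false };
	unsigned long long base = set[0], block;
	unsigned int i, nr_before, nr_after, span, nr_blocks;

	/* Mark which set members fall just before or after the base block;
	 * duplicates in the set merely set the same flag twice.
	 */
	for (i = 1; i < nblocks; i++) {
		block = set[i];
		if (block < base && block + MAX_PREFETCH >= base)
			before[base - block - 1] = true;
		else if (block > base && block - MAX_PREFETCH <= base)
			after[block - base - 1] = true;
	}

	/* Extend the range in both directions while blocks remain wanted. */
	for (nr_before = 0; nr_before < MAX_PREFETCH; nr_before++)
		if (!before[nr_before]) break;
	for (nr_after = 0; nr_after < MAX_PREFETCH; nr_after++)
		if (!after[nr_after]) break;

	span = nr_before + 1 + nr_after;
	nr_blocks = MIN(span, limit);

	/* Read from the lowest wanted block, but never exclude the base. */
	*firstp = base - MIN(nr_before, nr_blocks - 1);
	return nr_blocks;
}

int main(void)
{
	/* Base block 100; 98 and 99 wanted before it, 101 and 103 after. */
	unsigned long long set[] = { 100, 98, 103, 99, 101 }, first;
	unsigned int count = select_range(set, 5, 32, &first);

	printf("read %u blocks starting at %llu\n", count, first); /* 4, 98 */
	return 0;
}

For the example set {100, 98, 103, 99, 101}, the model reads 4 blocks starting
at block 98: blocks 98 through 101 form the contiguous stretch around base
block 100, and block 103 is excluded because block 102 was never requested.
Starting the read at the low end of the range, rather than at the base block,
matches the rationale in the lmfs_prefetch() comment above: a subsequent
prefetch request is then more likely to continue from the last disk position
without requiring a seek.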