From: David van Moolenbroek Date: Sun, 29 Mar 2015 16:57:53 +0000 (+0000) Subject: libminixfs: better support for read errors and EOF X-Git-Url: http://zhaoyanbai.com/repos/%22http:/www.isc.org/icons/zpipe.c?a=commitdiff_plain;h=6c46a77d9509a6aefda1522eee2103e194ced4b3;p=minix.git libminixfs: better support for read errors and EOF - The lmfs_get_block*(3) API calls may now return an error. The idea is to encourage a next generation of file system services to do a better job at dealing with block read errors than the MFS-derived implementations do. These existing file systems have been changed to panic immediately upon getting a block read error, in order to prevent unchecked errors from causing corruption. Note that libbdev already retries failing I/O operations a few times first. - The libminixfs block device I/O module (bio.c) now deals properly with end-of-file conditions on block devices. Since a device or partition size may not be a multiple of the root file system's block size, support for partial block retrieval has been added, with a new internal lmfs_get_partial_block(3) call. A new test program, test85, tests the new handling of EOF conditions when reading, writing, and memory-mapping a block device. 
Change-Id: I05e35b6b8851488328a2679da635ebba0c6d08ce --- diff --git a/distrib/sets/lists/minix/mi b/distrib/sets/lists/minix/mi index 3a47f67f1..ff0024099 100644 --- a/distrib/sets/lists/minix/mi +++ b/distrib/sets/lists/minix/mi @@ -6306,6 +6306,7 @@ ./usr/tests/minix-posix/test82 minix-sys ./usr/tests/minix-posix/test83 minix-sys ./usr/tests/minix-posix/test84 minix-sys +./usr/tests/minix-posix/test85 minix-sys ./usr/tests/minix-posix/test9 minix-sys ./usr/tests/minix-posix/testinterp minix-sys ./usr/tests/minix-posix/testisofs minix-sys diff --git a/minix/fs/ext2/fs.h b/minix/fs/ext2/fs.h index a305c798a..a97b19285 100644 --- a/minix/fs/ext2/fs.h +++ b/minix/fs/ext2/fs.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/minix/fs/ext2/proto.h b/minix/fs/ext2/proto.h index 9573c49c8..042d3da55 100644 --- a/minix/fs/ext2/proto.h +++ b/minix/fs/ext2/proto.h @@ -1,7 +1,6 @@ #ifndef EXT2_PROTO_H #define EXT2_PROTO_H -#define get_block(d, n, t) lmfs_get_block(d, n, t) #define put_block(n) lmfs_put_block(n) /* Function prototypes. */ @@ -95,6 +94,7 @@ struct group_desc* get_group_desc(unsigned int bnum); int fs_utime(ino_t ino, struct timespec *atime, struct timespec *mtime); /* utility.c */ +struct buf *get_block(dev_t dev, block_t block, int how); unsigned conv2(int norm, int w); long conv4(int norm, long x); int ansi_strcmp(register const char* ansi_s, register const char *s2, diff --git a/minix/fs/ext2/read.c b/minix/fs/ext2/read.c index 28d1f73c2..4a38ff2fe 100644 --- a/minix/fs/ext2/read.c +++ b/minix/fs/ext2/read.c @@ -48,8 +48,6 @@ ssize_t fs_readwrite(ino_t ino_nr, struct fsdriver_data *data, size_t nrbytes, f_size = rip->i_size; if (f_size < 0) f_size = MAX_FILE_POS; - lmfs_reset_rdwt_err(); - if (call == FSC_WRITE) { /* Check in advance to see if file will grow too big. 
*/ if (position > (off_t) (rip->i_sp->s_max_size - nrbytes)) @@ -74,8 +72,7 @@ ssize_t fs_readwrite(ino_t ino_nr, struct fsdriver_data *data, size_t nrbytes, r = rw_chunk(rip, ((u64_t)((unsigned long)position)), off, chunk, nrbytes, call, data, cum_io, block_size, &completed); - if (r != OK) break; /* EOF reached */ - if (lmfs_rdwt_err() < 0) break; + if (r != OK) break; /* Update counters and pointers. */ nrbytes -= chunk; /* bytes yet to be read */ @@ -92,9 +89,6 @@ ssize_t fs_readwrite(ino_t ino_nr, struct fsdriver_data *data, size_t nrbytes, rip->i_seek = NO_SEEK; - if (lmfs_rdwt_err() != OK) r = lmfs_rdwt_err(); /* check for disk error */ - if (lmfs_rdwt_err() == END_OF_FILE) r = OK; - if (r != OK) return r; @@ -124,7 +118,7 @@ int *completed; /* number of bytes copied */ { /* Read or write (part of) a block. */ - register struct buf *bp = NULL; + struct buf *bp = NULL; register int r = OK; int n; block_t b; @@ -173,7 +167,8 @@ int *completed; /* number of bytes copied */ n = NO_READ; assert(ino != VMC_NO_INODE); assert(!(ino_off % block_size)); - bp = lmfs_get_block_ino(dev, b, n, ino, ino_off); + if ((r = lmfs_get_block_ino(&bp, dev, b, n, ino, ino_off)) != OK) + panic("ext2: error getting block (%llu,%u): %d", dev, b, r); } /* In all cases, bp now points to a valid buffer. 
*/ @@ -291,13 +286,19 @@ int opportunistic; struct buf *get_block_map(register struct inode *rip, u64_t position) { + struct buf *bp; + int r, block_size; block_t b = read_map(rip, position, 0); /* get block number */ - int block_size = get_block_size(rip->i_dev); if(b == NO_BLOCK) return NULL; + block_size = get_block_size(rip->i_dev); position = rounddown(position, block_size); assert(rip->i_num != VMC_NO_INODE); - return lmfs_get_block_ino(rip->i_dev, b, NORMAL, rip->i_num, position); + if ((r = lmfs_get_block_ino(&bp, rip->i_dev, b, NORMAL, rip->i_num, + position)) != OK) + panic("ext2: error getting block (%llu,%u): %d", + rip->i_dev, b, r); + return bp; } /*===========================================================================* @@ -333,7 +334,7 @@ unsigned bytes_ahead; /* bytes beyond position for immediate use */ /* Minimum number of blocks to prefetch. */ # define BLOCKS_MINIMUM (nr_bufs < 50 ? 18 : 32) int nr_bufs = lmfs_nr_bufs(); - int read_q_size; + int r, read_q_size; unsigned int blocks_ahead, fragment, block_size; block_t block, blocks_left; off_t ind1_pos; @@ -371,7 +372,9 @@ unsigned bytes_ahead; /* bytes beyond position for immediate use */ bytes_ahead += fragment; blocks_ahead = (bytes_ahead + block_size - 1) / block_size; - bp = lmfs_get_block_ino(dev, block, PREFETCH, rip->i_num, position); + r = lmfs_get_block_ino(&bp, dev, block, PREFETCH, rip->i_num, position); + if (r != OK) + panic("ext2: error getting block (%llu,%u): %d", dev, block, r); assert(bp != NULL); if (lmfs_dev(bp) != NO_DEV) return(bp); @@ -432,8 +435,11 @@ unsigned bytes_ahead; /* bytes beyond position for immediate use */ thisblock = read_map(rip, (off_t) ex64lo(position_running), 1); if (thisblock != NO_BLOCK) { - bp = lmfs_get_block_ino(dev, thisblock, PREFETCH, rip->i_num, - position_running); + r = lmfs_get_block_ino(&bp, dev, thisblock, PREFETCH, + rip->i_num, position_running); + if (r != OK) + panic("ext2: error getting block (%llu,%u): %d", + dev, thisblock, r); 
} else { bp = get_block(dev, block, PREFETCH); } @@ -445,7 +451,10 @@ unsigned bytes_ahead; /* bytes beyond position for immediate use */ } lmfs_rw_scattered(dev, read_q, read_q_size, READING); - return(lmfs_get_block_ino(dev, baseblock, NORMAL, rip->i_num, position)); + r = lmfs_get_block_ino(&bp, dev, baseblock, NORMAL, rip->i_num, position); + if (r != OK) + panic("ext2: error getting block (%llu,%u): %d", dev, baseblock, r); + return bp; } diff --git a/minix/fs/ext2/utility.c b/minix/fs/ext2/utility.c index f2c37e72c..7fc444721 100644 --- a/minix/fs/ext2/utility.c +++ b/minix/fs/ext2/utility.c @@ -8,6 +8,28 @@ #include "super.h" +/*===========================================================================* + * get_block * + *===========================================================================*/ +struct buf *get_block(dev_t dev, block_t block, int how) +{ +/* Wrapper routine for lmfs_get_block(). This ext2 implementation does not deal + * well with block read errors pretty much anywhere. To prevent corruption due + * to unchecked error conditions, we panic upon an I/O failure here. + */ + struct buf *bp; + int r; + + if ((r = lmfs_get_block(&bp, dev, block, how)) != OK && r != ENOENT) + panic("ext2: error getting block (%llu,%u): %d", dev, block, r); + + assert(r == OK || how == PEEK); + + return (r == OK) ? bp : NULL; +} + + + /*===========================================================================* * conv2 * *===========================================================================*/ diff --git a/minix/fs/ext2/write.c b/minix/fs/ext2/write.c index b0d007105..edcbf2cd0 100644 --- a/minix/fs/ext2/write.c +++ b/minix/fs/ext2/write.c @@ -307,7 +307,7 @@ register struct inode *rip; /* pointer to inode */ off_t position; /* file pointer */ { /* Acquire a new block and return a pointer to it. 
*/ - register struct buf *bp; + struct buf *bp; int r; block_t b; @@ -355,8 +355,10 @@ off_t position; /* file pointer */ } } - bp = lmfs_get_block_ino(rip->i_dev, b, NO_READ, rip->i_num, - rounddown(position, rip->i_sp->s_block_size)); + r = lmfs_get_block_ino(&bp, rip->i_dev, b, NO_READ, rip->i_num, + rounddown(position, rip->i_sp->s_block_size)); + if (r != OK) + panic("ext2: error getting block (%llu,%u): %d", rip->i_dev, b, r); zero_block(bp); return(bp); } diff --git a/minix/fs/isofs/read.c b/minix/fs/isofs/read.c index 94ac59d00..e5f6dd45d 100644 --- a/minix/fs/isofs/read.c +++ b/minix/fs/isofs/read.c @@ -26,7 +26,6 @@ ssize_t fs_read(ino_t ino_nr, struct fsdriver_data *data, size_t bytes, block_size = v_pri.logical_block_size_l; cum_io = 0; - lmfs_reset_rdwt_err(); r = OK; /* Split the transfer into chunks that don't span two blocks. */ @@ -47,8 +46,6 @@ ssize_t fs_read(ino_t ino_nr, struct fsdriver_data *data, size_t bytes, lmfs_put_block(bp); if (r != OK) - break; /* EOF reached. */ - if (lmfs_rdwt_err() < 0) break; /* Update counters and pointers. */ @@ -57,11 +54,6 @@ ssize_t fs_read(ino_t ino_nr, struct fsdriver_data *data, size_t bytes, pos += chunk; /* Position within the file. */ } - if (lmfs_rdwt_err() != OK) - r = lmfs_rdwt_err(); /* Check for disk error. */ - if (lmfs_rdwt_err() == END_OF_FILE) - r = OK; - return (r == OK) ? 
cum_io : r; } diff --git a/minix/fs/isofs/susp.c b/minix/fs/isofs/susp.c index a5fde0e27..80436e991 100644 --- a/minix/fs/isofs/susp.c +++ b/minix/fs/isofs/susp.c @@ -12,11 +12,11 @@ int parse_susp(struct rrii_dir_record *dir, char *buffer) char susp_signature[2]; u8_t susp_length; u8_t susp_version; - u32_t ca_block_nr; u32_t ca_offset; u32_t ca_length; struct buf *ca_bp; + int r; susp_signature[0] = buffer[0]; susp_signature[1] = buffer[1]; @@ -44,10 +44,9 @@ int parse_susp(struct rrii_dir_record *dir, char *buffer) ca_length = v_pri.logical_block_size_l - ca_offset; } - ca_bp = lmfs_get_block(fs_dev, ca_block_nr, NORMAL); - if (ca_bp == NULL) { - return EINVAL; - } + r = lmfs_get_block(&ca_bp, fs_dev, ca_block_nr, NORMAL); + if (r != OK) + return r; parse_susp_buffer(dir, b_data(ca_bp) + ca_offset, ca_length); lmfs_put_block(ca_bp); diff --git a/minix/fs/isofs/utility.c b/minix/fs/isofs/utility.c index e1729170c..bea356d10 100644 --- a/minix/fs/isofs/utility.c +++ b/minix/fs/isofs/utility.c @@ -36,12 +36,21 @@ void free_extent(struct dir_extent *e) struct buf* read_extent_block(struct dir_extent *e, size_t block) { - size_t block_id = get_extent_absolute_block_id(e, block); + struct buf *bp; + size_t block_id; + int r; + + block_id = get_extent_absolute_block_id(e, block); if (block_id == 0 || block_id >= v_pri.volume_space_size_l) return NULL; - return lmfs_get_block(fs_dev, block_id, NORMAL); + /* Not all callers deal well with failure, so panic on I/O error. */ + if ((r = lmfs_get_block(&bp, fs_dev, block_id, NORMAL)) != OK) + panic("ISOFS: error getting block (%llu,%zu): %d", + fs_dev, block_id, r); + + return bp; } size_t get_extent_absolute_block_id(struct dir_extent *e, size_t block) diff --git a/minix/fs/mfs/cache.c b/minix/fs/mfs/cache.c index ed8300851..bb3a4a4ac 100644 --- a/minix/fs/mfs/cache.c +++ b/minix/fs/mfs/cache.c @@ -1,17 +1,7 @@ /* The file system maintains a buffer cache to reduce the number of disk * accesses needed. 
Whenever a read or write to the disk is done, a check is - * first made to see if the block is in the cache. This file manages the - * cache. - * - * The entry points into this file are: - * get_block: request to fetch a block for reading or writing from cache - * put_block: return a block previously requested with get_block - * alloc_zone: allocate a new zone (to increase the length of a file) - * free_zone: release a zone (when a file is removed) - * invalidate: remove all the cache blocks on some device - * - * Private functions: - * read_block: read or write a block from the disk itself + * first made to see if the block is in the cache. This file contains some + * related routines, but the cache is now in libminixfs. */ #include "fs.h" @@ -26,6 +16,26 @@ #include "super.h" #include "inode.h" +/*===========================================================================* + * get_block * + *===========================================================================*/ +struct buf *get_block(dev_t dev, block_t block, int how) +{ +/* Wrapper routine for lmfs_get_block(). This MFS implementation does not deal + * well with block read errors pretty much anywhere. To prevent corruption due + * to unchecked error conditions, we panic upon an I/O failure here. + */ + struct buf *bp; + int r; + + if ((r = lmfs_get_block(&bp, dev, block, how)) != OK && r != ENOENT) + panic("MFS: error getting block (%llu,%u): %d", dev, block, r); + + assert(r == OK || how == PEEK); + + return (r == OK) ? 
bp : NULL; +} + /*===========================================================================* * alloc_zone * *===========================================================================*/ diff --git a/minix/fs/mfs/proto.h b/minix/fs/mfs/proto.h index 92a4ad638..a519cf203 100644 --- a/minix/fs/mfs/proto.h +++ b/minix/fs/mfs/proto.h @@ -2,7 +2,6 @@ #define __MFS_PROTO_H__ /* Some shortcuts to functions in -lminixfs */ -#define get_block(d, b, t) lmfs_get_block(d, b, t) #define put_block(b) lmfs_put_block(b) /* Function prototypes. */ @@ -16,6 +15,7 @@ struct super_block; /* cache.c */ zone_t alloc_zone(dev_t dev, zone_t z); void free_zone(dev_t dev, zone_t numb); +struct buf *get_block(dev_t dev, block_t block, int how); /* inode.c */ struct inode *alloc_inode(dev_t dev, mode_t bits, uid_t uid, gid_t gid); diff --git a/minix/fs/mfs/read.c b/minix/fs/mfs/read.c index f5bdf149d..af4b76064 100644 --- a/minix/fs/mfs/read.c +++ b/minix/fs/mfs/read.c @@ -44,8 +44,6 @@ ssize_t fs_readwrite(ino_t ino_nr, struct fsdriver_data *data, size_t nrbytes, block_size = rip->i_sp->s_block_size; f_size = rip->i_size; - lmfs_reset_rdwt_err(); - /* If this is file i/o, check we can write */ if (call == FSC_WRITE) { if(rip->i_sp->s_rd_only) @@ -80,8 +78,7 @@ ssize_t fs_readwrite(ino_t ino_nr, struct fsdriver_data *data, size_t nrbytes, r = rw_chunk(rip, ((u64_t)((unsigned long)position)), off, chunk, nrbytes, call, data, cum_io, block_size, &completed); - if (r != OK) break; /* EOF reached */ - if (lmfs_rdwt_err() < 0) break; + if (r != OK) break; /* Update counters and pointers. 
*/ nrbytes -= chunk; /* bytes yet to be read */ @@ -98,9 +95,6 @@ ssize_t fs_readwrite(ino_t ino_nr, struct fsdriver_data *data, size_t nrbytes, rip->i_seek = NO_SEEK; - if (lmfs_rdwt_err() != OK) r = lmfs_rdwt_err(); /* check for disk error */ - if (lmfs_rdwt_err() == END_OF_FILE) r = OK; - if (r != OK) return r; @@ -134,8 +128,7 @@ unsigned int block_size; /* block size of FS operating on */ int *completed; /* number of bytes copied */ { /* Read or write (part of) a block. */ - - register struct buf *bp = NULL; + struct buf *bp = NULL; register int r = OK; int n; block_t b; @@ -184,7 +177,8 @@ int *completed; /* number of bytes copied */ n = NO_READ; assert(ino != VMC_NO_INODE); assert(!(ino_off % block_size)); - bp = lmfs_get_block_ino(dev, b, n, ino, ino_off); + if ((r = lmfs_get_block_ino(&bp, dev, b, n, ino, ino_off)) != OK) + panic("MFS: error getting block (%llu,%u): %d", dev, b, r); } /* In all cases, bp now points to a valid buffer. */ @@ -288,13 +282,19 @@ int opportunistic; /* if nonzero, only use cache for metadata */ struct buf *get_block_map(register struct inode *rip, u64_t position) { + struct buf *bp; + int r, block_size; block_t b = read_map(rip, position, 0); /* get block number */ - int block_size = get_block_size(rip->i_dev); if(b == NO_BLOCK) return NULL; + block_size = get_block_size(rip->i_dev); position = rounddown(position, block_size); assert(rip->i_num != VMC_NO_INODE); - return lmfs_get_block_ino(rip->i_dev, b, NORMAL, rip->i_num, position); + if ((r = lmfs_get_block_ino(&bp, rip->i_dev, b, NORMAL, rip->i_num, + position)) != OK) + panic("MFS: error getting block (%llu,%u): %d", + rip->i_dev, b, r); + return bp; } /*===========================================================================* @@ -345,7 +345,7 @@ unsigned bytes_ahead; /* bytes beyond position for immediate use */ /* Minimum number of blocks to prefetch. */ int nr_bufs = lmfs_nr_bufs(); # define BLOCKS_MINIMUM (nr_bufs < 50 ? 
18 : 32) - int scale, read_q_size; + int r, scale, read_q_size; unsigned int blocks_ahead, fragment, block_size; block_t block, blocks_left; off_t ind1_pos; @@ -379,7 +379,9 @@ unsigned bytes_ahead; /* bytes beyond position for immediate use */ bytes_ahead += fragment; blocks_ahead = (bytes_ahead + block_size - 1) / block_size; - bp = lmfs_get_block_ino(dev, block, PREFETCH, rip->i_num, position); + r = lmfs_get_block_ino(&bp, dev, block, PREFETCH, rip->i_num, position); + if (r != OK) + panic("MFS: error getting block (%llu,%u): %d", dev, block, r); assert(bp != NULL); assert(bp->lmfs_count > 0); if (lmfs_dev(bp) != NO_DEV) return(bp); @@ -443,8 +445,11 @@ unsigned bytes_ahead; /* bytes beyond position for immediate use */ thisblock = read_map(rip, (off_t) ex64lo(position_running), 1); if (thisblock != NO_BLOCK) { - bp = lmfs_get_block_ino(dev, thisblock, PREFETCH, rip->i_num, - position_running); + r = lmfs_get_block_ino(&bp, dev, thisblock, PREFETCH, + rip->i_num, position_running); + if (r != OK) + panic("MFS: error getting block (%llu,%u): %d", + dev, thisblock, r); } else { bp = get_block(dev, block, PREFETCH); } @@ -460,7 +465,10 @@ unsigned bytes_ahead; /* bytes beyond position for immediate use */ assert(inuse_before == lmfs_bufs_in_use()); - return(lmfs_get_block_ino(dev, baseblock, NORMAL, rip->i_num, position)); + r = lmfs_get_block_ino(&bp, dev, baseblock, NORMAL, rip->i_num, position); + if (r != OK) + panic("MFS: error getting block (%llu,%u): %d", dev, baseblock, r); + return bp; } diff --git a/minix/fs/mfs/write.c b/minix/fs/mfs/write.c index 0d27266e6..51a4a1db3 100644 --- a/minix/fs/mfs/write.c +++ b/minix/fs/mfs/write.c @@ -259,8 +259,7 @@ off_t position; /* file pointer */ * allocating a complete zone, and then returning the initial block. * On the other hand, the current zone may still have some unused blocks. 
*/ - - register struct buf *bp; + struct buf *bp; block_t b, base_block; zone_t z; zone_t zone_size; @@ -297,8 +296,10 @@ off_t position; /* file pointer */ b = base_block + (block_t)((position % zone_size)/rip->i_sp->s_block_size); } - bp = lmfs_get_block_ino(rip->i_dev, b, NO_READ, rip->i_num, + r = lmfs_get_block_ino(&bp, rip->i_dev, b, NO_READ, rip->i_num, rounddown(position, rip->i_sp->s_block_size)); + if (r != OK) + panic("MFS: error getting block (%llu,%u): %d", rip->i_dev, b, r); zero_block(bp); return(bp); } diff --git a/minix/include/minix/libminixfs.h b/minix/include/minix/libminixfs.h index 44aa80e96..05d1aee45 100644 --- a/minix/include/minix/libminixfs.h +++ b/minix/include/minix/libminixfs.h @@ -17,7 +17,7 @@ struct buf { block64_t lmfs_blocknr; /* block number of its (minor) device */ char lmfs_count; /* number of users of this buffer */ char lmfs_needsetcache; /* to be identified to VM */ - unsigned int lmfs_bytes; /* Number of bytes allocated in bp */ + size_t lmfs_bytes; /* size of this block (allocated and used) */ u32_t lmfs_flags; /* Flags shared between VM and FS */ /* If any, which inode & offset does this block correspond to? 
@@ -38,12 +38,10 @@ void lmfs_flushdev(dev_t dev); size_t lmfs_fs_block_size(void); void lmfs_may_use_vmcache(int); void lmfs_set_blocksize(size_t blocksize); -void lmfs_reset_rdwt_err(void); -int lmfs_rdwt_err(void); void lmfs_buf_pool(int new_nr_bufs); -struct buf *lmfs_get_block(dev_t dev, block64_t block, int how); -struct buf *lmfs_get_block_ino(dev_t dev, block64_t block, int how, ino_t ino, - u64_t off); +int lmfs_get_block(struct buf **bpp, dev_t dev, block64_t block, int how); +int lmfs_get_block_ino(struct buf **bpp, dev_t dev, block64_t block, int how, + ino_t ino, u64_t off); void lmfs_put_block(struct buf *bp); void lmfs_free_block(dev_t dev, block64_t block); void lmfs_zero_block_ino(dev_t dev, ino_t ino, u64_t off); @@ -57,9 +55,7 @@ void lmfs_change_blockusage(int delta); #define NORMAL 0 /* forces get_block to do disk read */ #define NO_READ 1 /* prevents get_block from doing disk read */ #define PREFETCH 2 /* tells get_block not to read or mark dev */ -#define PEEK 3 /* returns NULL if not in cache or VM cache */ - -#define END_OF_FILE (-104) /* eof detected */ +#define PEEK 3 /* returns ENOENT if not in cache */ /* Block I/O helper functions. */ void lmfs_driver(dev_t dev, char *label); diff --git a/minix/lib/libminixfs/bio.c b/minix/lib/libminixfs/bio.c index b9b892d8f..0cbaf10ef 100644 --- a/minix/lib/libminixfs/bio.c +++ b/minix/lib/libminixfs/bio.c @@ -32,8 +32,12 @@ #include #include #include +#include +#include #include +#include "inc.h" + /* * Set the driver label of the device identified by 'dev' to 'label'. While * 'dev' is a full device number, only its major device number is to be used. @@ -49,6 +53,7 @@ lmfs_driver(dev_t dev, char *label) /* * Prefetch up to "nblocks" blocks on "dev" starting from block number "block". + * The size to be used for the last block in the range is given as "last_size". * Stop early when either the I/O request fills up or when a block is already * found to be in the cache. 
The latter is likely to happen often, since this * function is called before getting each block for reading. Prefetching is a @@ -56,14 +61,28 @@ lmfs_driver(dev_t dev, char *label) * TODO: limit according to the number of available buffers. */ static void -block_prefetch(dev_t dev, block64_t block, unsigned int nblocks) +block_prefetch(dev_t dev, block64_t block, unsigned int nblocks, + size_t block_size, size_t last_size) { struct buf *bp, *bufs[NR_IOREQS]; unsigned int count; + int r; + + if (nblocks > NR_IOREQS) { + nblocks = NR_IOREQS; + + last_size = block_size; + } for (count = 0; count < nblocks; count++) { - bp = lmfs_get_block(dev, block + count, PREFETCH); - assert(bp != NULL); + if (count == nblocks - 1 && last_size < block_size) + r = lmfs_get_partial_block(&bp, dev, block + count, + PREFETCH, last_size); + else + r = lmfs_get_block(&bp, dev, block + count, PREFETCH); + + if (r != OK) + panic("libminixfs: get_block PREFETCH error: %d\n", r); if (lmfs_dev(bp) != NO_DEV) { lmfs_put_block(bp); @@ -90,21 +109,16 @@ block_prefetch(dev_t dev, block64_t block, unsigned int nblocks) * flushed immediately, and thus, a successful write only indicates that the * data have been taken in by the cache (for immediate I/O, a character device * would have to be used, but MINIX3 no longer supports this), which may be - * follwed later by silent failures, including undetected end-of-file cases. - * In particular, write requests may or may not return 0 (EOF) immediately when - * writing at or beyond the block device's size. i Since block I/O takes place - * at block granularity, block-unaligned writes have to read a block from disk - * before updating it, and that is the only possible source of actual I/O - * errors for write calls. - * TODO: reconsider the buffering-only approach, or see if we can at least - * somehow throw accurate EOF errors without reading in each block first. + * follwed later by silent failures. 
End-of-file conditions are always + * reported immediately, though. */ ssize_t lmfs_bio(dev_t dev, struct fsdriver_data * data, size_t bytes, off_t pos, int call) { block64_t block; - size_t block_size, off, block_off, chunk; + struct part_geom part; + size_t block_size, off, block_off, last_size, size, chunk; unsigned int blocks_left; struct buf *bp; int r, write, how; @@ -123,37 +137,74 @@ lmfs_bio(dev_t dev, struct fsdriver_data * data, size_t bytes, off_t pos, if (pos < 0 || bytes > SSIZE_MAX || pos > INT64_MAX - bytes + 1) return EINVAL; + /* + * Get the partition size, so that we can handle EOF ourselves. + * Unfortunately, we cannot cache the results between calls, since we + * do not get to see DIOCSETP ioctls--see also repartition(8). + */ + if ((r = bdev_ioctl(dev, DIOCGETP, &part, NONE /*user_endpt*/)) != OK) + return r; + + if ((uint64_t)pos >= part.size) + return 0; /* EOF */ + + if ((uint64_t)pos > part.size - bytes) + bytes = part.size - pos; + off = 0; block = pos / block_size; block_off = (size_t)(pos % block_size); blocks_left = howmany(block_off + bytes, block_size); - lmfs_reset_rdwt_err(); + assert(blocks_left > 0); + + /* + * If the last block we need is also the last block of the device, + * see how many bytes we should actually transfer for that block. + */ + if (block + blocks_left - 1 == part.size / block_size) + last_size = part.size % block_size; + else + last_size = block_size; + r = OK; - for (off = 0; off < bytes; off += chunk) { - chunk = block_size - block_off; + for (off = 0; off < bytes && blocks_left > 0; off += chunk) { + size = (blocks_left == 1) ? last_size : block_size; + + chunk = size - block_off; if (chunk > bytes - off) chunk = bytes - off; + assert(chunk > 0 && chunk <= size); + /* * For read requests, help the block driver form larger I/O * requests. 
*/ if (!write) - block_prefetch(dev, block, blocks_left); + block_prefetch(dev, block, blocks_left, block_size, + last_size); /* * Do not read the block from disk if we will end up * overwriting all of its contents. */ - how = (write && chunk == block_size) ? NO_READ : NORMAL; + how = (write && chunk == size) ? NO_READ : NORMAL; + + if (size < block_size) + r = lmfs_get_partial_block(&bp, dev, block, how, size); + else + r = lmfs_get_block(&bp, dev, block, how); - bp = lmfs_get_block(dev, block, how); - assert(bp); + if (r != OK) { + printf("libminixfs: error getting block <%"PRIx64"," + "%"PRIu64"> for device I/O (%d)\n", dev, block, r); - r = lmfs_rdwt_err(); + break; + } + /* Perform the actual copy. */ if (r == OK && data != NULL) { assert(lmfs_dev(bp) != NO_DEV); @@ -186,12 +237,11 @@ lmfs_bio(dev_t dev, struct fsdriver_data * data, size_t bytes, off_t pos, } /* - * If we were not able to do any I/O, return the error (or EOF, even - * for writes). Otherwise, return how many bytes we did manage to - * transfer. + * If we were not able to do any I/O, return the error. Otherwise, + * return how many bytes we did manage to transfer. */ if (r != OK && off == 0) - return (r == END_OF_FILE) ? 0 : r; + return r; return off; } diff --git a/minix/lib/libminixfs/cache.c b/minix/lib/libminixfs/cache.c index e7a9fa98c..2dc377c08 100644 --- a/minix/lib/libminixfs/cache.c +++ b/minix/lib/libminixfs/cache.c @@ -2,6 +2,7 @@ #define _SYSTEM #include +#include #include #include #include @@ -18,6 +19,8 @@ #include #include +#include "inc.h" + /* Buffer (block) cache. To acquire a block, a routine calls lmfs_get_block(), * telling which block it wants. The block is then regarded as "in use" and * has its reference count incremented. 
All the blocks that are not in use are @@ -44,7 +47,7 @@ static struct buf *rear; /* points to most recently used free block */ static unsigned int bufs_in_use;/* # bufs currently in use (not on free list)*/ static void rm_lru(struct buf *bp); -static void read_block(struct buf *); +static int read_block(struct buf *bp, size_t size); static void freeblock(struct buf *bp); static void cache_heuristic_check(void); static void put_block(struct buf *bp, int put_flags); @@ -60,8 +63,6 @@ static size_t fs_block_size = PAGE_SIZE; /* raw i/o block size */ static fsblkcnt_t fs_btotal = 0, fs_bused = 0; -static int rdwt_err; - static int quiet = 0; void lmfs_setquiet(int q) { quiet = q; } @@ -193,33 +194,33 @@ static void free_unused_blocks(void) printf("libminixfs: freeing; %d blocks, %d bytes\n", freed, bytes); } -static void lmfs_alloc_block(struct buf *bp) +static void lmfs_alloc_block(struct buf *bp, size_t block_size) { int len; ASSERT(!bp->data); ASSERT(bp->lmfs_bytes == 0); - len = roundup(fs_block_size, PAGE_SIZE); + len = roundup(block_size, PAGE_SIZE); - if((bp->data = mmap(0, fs_block_size, - PROT_READ|PROT_WRITE, MAP_PREALLOC|MAP_ANON, -1, 0)) == MAP_FAILED) { + if((bp->data = mmap(0, block_size, PROT_READ|PROT_WRITE, + MAP_PREALLOC|MAP_ANON, -1, 0)) == MAP_FAILED) { free_unused_blocks(); - if((bp->data = mmap(0, fs_block_size, PROT_READ|PROT_WRITE, + if((bp->data = mmap(0, block_size, PROT_READ|PROT_WRITE, MAP_PREALLOC|MAP_ANON, -1, 0)) == MAP_FAILED) { panic("libminixfs: could not allocate block"); } } assert(bp->data); - bp->lmfs_bytes = fs_block_size; + bp->lmfs_bytes = block_size; bp->lmfs_needsetcache = 1; } /*===========================================================================* * lmfs_get_block * *===========================================================================*/ -struct buf *lmfs_get_block(dev_t dev, block64_t block, int how) +int lmfs_get_block(struct buf **bpp, dev_t dev, block64_t block, int how) { - return lmfs_get_block_ino(dev, 
block, how, VMC_NO_INODE, 0); + return lmfs_get_block_ino(bpp, dev, block, how, VMC_NO_INODE, 0); } static void munmap_t(void *a, int len) @@ -264,7 +265,7 @@ static void freeblock(struct buf *bp) */ if (bp->lmfs_dev != NO_DEV) { if (!lmfs_isclean(bp)) lmfs_flushdev(bp->lmfs_dev); - assert(bp->lmfs_bytes == fs_block_size); + assert(bp->lmfs_bytes > 0); bp->lmfs_dev = NO_DEV; } @@ -300,27 +301,32 @@ static struct buf *find_block(dev_t dev, block64_t block) } /*===========================================================================* - * lmfs_get_block_ino * + * get_block_ino * *===========================================================================*/ -struct buf *lmfs_get_block_ino(dev_t dev, block64_t block, int how, ino_t ino, - u64_t ino_off) +static int get_block_ino(struct buf **bpp, dev_t dev, block64_t block, int how, + ino_t ino, u64_t ino_off, size_t block_size) { -/* Check to see if the requested block is in the block cache. If so, return - * a pointer to it. If not, evict some other block and fetch it (unless - * 'how' is NO_READ). All the blocks in the cache that are not in use are - * linked together in a chain, with 'front' pointing to the least recently used - * block and 'rear' to the most recently used block. If 'how' is NO_READ, the - * block being requested will be overwritten in its entirety, so it is only - * necessary to see if it is in the cache; if it is not, any free buffer will - * do. It is not necessary to actually read the block in from disk. If 'how' +/* Check to see if the requested block is in the block cache. The requested + * block is identified by the block number in 'block' on device 'dev', counted + * in the file system block size. The amount of data requested for this block + * is given in 'block_size', which may be less than the file system block size + * iff the requested block is the last (partial) block on a device. Note that + * the given block size does *not* affect the conversion of 'block' to a byte + * offset! 
Either way, if the block could be obtained, either from the cache + * or by reading from the device, return OK, with a pointer to the buffer + * structure stored in 'bpp'. If not, return a negative error code (and no + * buffer). If necessary, evict some other block and fetch the contents from + * disk (if 'how' is NORMAL). If 'how' is NO_READ, the caller intends to + * overwrite the requested block in its entirety, so it is only necessary to + * see if it is in the cache; if it is not, any free buffer will do. If 'how' * is PREFETCH, the block need not be read from the disk, and the device is not * to be marked on the block (i.e., set to NO_DEV), so callers can tell if the * block returned is valid. If 'how' is PEEK, the function returns the block - * if it is in the cache or could be obtained from VM, and NULL otherwise. + * if it is in the cache or the VM cache, and an ENOENT error code otherwise. * In addition to the LRU chain, there is also a hash chain to link together * blocks whose block numbers end with the same bit strings, for fast lookup. */ - int b; + int b, r; static struct buf *bp; uint64_t dev_off; struct buf *prev_ptr; @@ -347,6 +353,13 @@ struct buf *lmfs_get_block_ino(dev_t dev, block64_t block, int how, ino_t ino, /* See if the block is in the cache. If so, we can return it right away. */ bp = find_block(dev, block); if (bp != NULL && !(bp->lmfs_flags & VMMC_EVICTED)) { + ASSERT(bp->lmfs_dev == dev); + ASSERT(bp->lmfs_dev != NO_DEV); + + /* The block must have exactly the requested number of bytes. */ + if (bp->lmfs_bytes != block_size) + return EIO; + /* Block needed has been found. 
*/ if (bp->lmfs_count == 0) { rm_lru(bp); @@ -356,9 +369,6 @@ struct buf *lmfs_get_block_ino(dev_t dev, block64_t block, int how, ino_t ino, bp->lmfs_flags |= VMMC_BLOCK_LOCKED; } raisecount(bp); - ASSERT(bp->lmfs_bytes == fs_block_size); - ASSERT(bp->lmfs_dev == dev); - ASSERT(bp->lmfs_dev != NO_DEV); ASSERT(bp->lmfs_flags & VMMC_BLOCK_LOCKED); ASSERT(bp->data); @@ -372,7 +382,8 @@ struct buf *lmfs_get_block_ino(dev_t dev, block64_t block, int how, ino_t ino, } } - return(bp); + *bpp = bp; + return OK; } /* We had the block in the cache but VM evicted it; invalidate it. */ @@ -437,10 +448,11 @@ struct buf *lmfs_get_block_ino(dev_t dev, block64_t block, int how, ino_t ino, assert(!bp->lmfs_bytes); if(vmcache) { if((bp->data = vm_map_cacheblock(dev, dev_off, ino, ino_off, - &bp->lmfs_flags, fs_block_size)) != MAP_FAILED) { - bp->lmfs_bytes = fs_block_size; + &bp->lmfs_flags, roundup(block_size, PAGE_SIZE))) != MAP_FAILED) { + bp->lmfs_bytes = block_size; ASSERT(!bp->lmfs_needsetcache); - return bp; + *bpp = bp; + return OK; } } bp->data = NULL; @@ -455,12 +467,12 @@ struct buf *lmfs_get_block_ino(dev_t dev, block64_t block, int how, ino_t ino, put_block(bp, ONE_SHOT); - return NULL; + return ENOENT; } /* Not in the cache; reserve memory for its contents. */ - lmfs_alloc_block(bp); + lmfs_alloc_block(bp, block_size); assert(bp->data); @@ -468,7 +480,12 @@ struct buf *lmfs_get_block_ino(dev_t dev, block64_t block, int how, ino_t ino, /* PREFETCH: don't do i/o. */ bp->lmfs_dev = NO_DEV; } else if (how == NORMAL) { - read_block(bp); + /* Try to read the block. Return an error code on failure. */ + if ((r = read_block(bp, block_size)) != OK) { + put_block(bp, 0); + + return r; + } } else if(how == NO_READ) { /* This block will be overwritten by new contents. 
*/ } else @@ -476,7 +493,26 @@ struct buf *lmfs_get_block_ino(dev_t dev, block64_t block, int how, ino_t ino, assert(bp->data); - return(bp); /* return the newly acquired block */ + *bpp = bp; /* return the newly acquired block */ + return OK; +} + +/*===========================================================================* + * lmfs_get_block_ino * + *===========================================================================*/ +int lmfs_get_block_ino(struct buf **bpp, dev_t dev, block64_t block, int how, + ino_t ino, u64_t ino_off) +{ + return get_block_ino(bpp, dev, block, how, ino, ino_off, fs_block_size); +} + +/*===========================================================================* + * lmfs_get_partial_block * + *===========================================================================*/ +int lmfs_get_partial_block(struct buf **bpp, dev_t dev, block64_t block, + int how, size_t block_size) +{ + return get_block_ino(bpp, dev, block, how, VMC_NO_INODE, 0, block_size); } /*===========================================================================* @@ -536,12 +572,21 @@ static void put_block(struct buf *bp, int put_flags) assert(bp->data); setflags = (put_flags & ONE_SHOT) ? VMSF_ONCE : 0; + if ((r = vm_set_cacheblock(bp->data, dev, dev_off, bp->lmfs_inode, - bp->lmfs_inode_offset, &bp->lmfs_flags, fs_block_size, - setflags)) != OK) { + bp->lmfs_inode_offset, &bp->lmfs_flags, + roundup(bp->lmfs_bytes, PAGE_SIZE), setflags)) != OK) { if(r == ENOSYS) { printf("libminixfs: ENOSYS, disabling VM calls\n"); vmcache = 0; + } else if (r == ENOMEM) { + /* Do not panic in this case. Running out of memory is + * bad, especially since it may lead to applications + * crashing when trying to access memory-mapped pages + * we haven't been able to pass off to the VM cache, + * but the entire file system crashing is always worse. 
+ */ + printf("libminixfs: no memory for cache block!\n"); } else { panic("libminixfs: setblock of %p dev 0x%llx off " "0x%llx failed\n", bp->data, dev, dev_off); @@ -634,6 +679,7 @@ void lmfs_zero_block_ino(dev_t dev, ino_t ino, u64_t ino_off) */ struct buf *bp; static block64_t fake_block = 0; + int r; if (!vmcache) return; @@ -650,7 +696,9 @@ void lmfs_zero_block_ino(dev_t dev, ino_t ino, u64_t ino_off) fake_block = ((uint64_t)INT64_MAX + 1) / fs_block_size; /* Obtain a block. */ - bp = lmfs_get_block_ino(dev, fake_block, NO_READ, ino, ino_off); + if ((r = lmfs_get_block_ino(&bp, dev, fake_block, NO_READ, ino, + ino_off)) != OK) + panic("libminixfs: getting a NO_READ block failed: %d", r); assert(bp != NULL); assert(bp->lmfs_dev != NO_DEV); @@ -684,33 +732,28 @@ void lmfs_set_blockusage(fsblkcnt_t btotal, fsblkcnt_t bused) /*===========================================================================* * read_block * *===========================================================================*/ -static void read_block( - struct buf *bp /* buffer pointer */ -) +static int read_block(struct buf *bp, size_t block_size) { -/* Read or write a disk block. This is the only routine in which actual disk - * I/O is invoked. If an error occurs, a message is printed here, but the error - * is not reported to the caller. If the error occurred while purging a block - * from the cache, it is not clear what the caller could do about it anyway. +/* Read a disk block of 'size' bytes. The given size is always the FS block + * size, except for the last block of a device. If an I/O error occurs, + * invalidate the block and return an error code. 
*/ - int r, op_failed; + ssize_t r; off_t pos; dev_t dev = bp->lmfs_dev; - op_failed = 0; - assert(dev != NO_DEV); - ASSERT(bp->lmfs_bytes == fs_block_size); + ASSERT(bp->lmfs_bytes == block_size); ASSERT(fs_block_size > 0); pos = (off_t)bp->lmfs_blocknr * fs_block_size; - if(fs_block_size > PAGE_SIZE) { + if (block_size > PAGE_SIZE) { #define MAXPAGES 20 vir_bytes blockrem, vaddr = (vir_bytes) bp->data; int p = 0; static iovec_t iovec[MAXPAGES]; - blockrem = fs_block_size; + blockrem = block_size; while(blockrem > 0) { vir_bytes chunk = blockrem >= PAGE_SIZE ? PAGE_SIZE : blockrem; iovec[p].iov_addr = vaddr; @@ -721,25 +764,20 @@ static void read_block( } r = bdev_gather(dev, pos, iovec, p, BDEV_NOFLAGS); } else { - r = bdev_read(dev, pos, bp->data, fs_block_size, - BDEV_NOFLAGS); - } - if (r < 0) { - printf("fs cache: I/O error on device %d/%d, block %"PRIu64"\n", - major(dev), minor(dev), bp->lmfs_blocknr); - op_failed = 1; - } else if (r != (ssize_t) fs_block_size) { - r = END_OF_FILE; - op_failed = 1; + r = bdev_read(dev, pos, bp->data, block_size, BDEV_NOFLAGS); } + if (r != (ssize_t)block_size) { + printf("fs cache: I/O error on device %d/%d, block %"PRIu64" (%zd)\n", + major(dev), minor(dev), bp->lmfs_blocknr, r); + if (r >= 0) + r = EIO; /* TODO: retry retrieving (just) the remaining part */ - if (op_failed) { - bp->lmfs_dev = NO_DEV; /* invalidate block */ + bp->lmfs_dev = NO_DEV; /* invalidate block */ - /* Report read errors to interested parties. 
*/ - rdwt_err = r; + return r; } + return OK; } /*===========================================================================* @@ -842,13 +880,12 @@ void lmfs_rw_scattered( } /* therefore they are all 'in use' and must be at least this many */ - assert(start_in_use >= start_bufqsize); + assert(start_in_use >= start_bufqsize); } assert(dev != NO_DEV); assert(fs_block_size > 0); - iov_per_block = roundup(fs_block_size, PAGE_SIZE) / PAGE_SIZE; - assert(iov_per_block < NR_IOREQS); + assert(howmany(fs_block_size, PAGE_SIZE) <= NR_IOREQS); /* (Shell) sort buffers on lmfs_blocknr. */ gap = 1; @@ -881,11 +918,13 @@ void lmfs_rw_scattered( bp = bufq[nblocks]; if (bp->lmfs_blocknr != bufq[0]->lmfs_blocknr + nblocks) break; + blockrem = bp->lmfs_bytes; + iov_per_block = howmany(blockrem, PAGE_SIZE); if(niovecs >= NR_IOREQS-iov_per_block) break; vdata = (vir_bytes) bp->data; - blockrem = fs_block_size; for(p = 0; p < iov_per_block; p++) { - vir_bytes chunk = blockrem < PAGE_SIZE ? blockrem : PAGE_SIZE; + vir_bytes chunk = + blockrem < PAGE_SIZE ? blockrem : PAGE_SIZE; iop->iov_addr = vdata; iop->iov_size = chunk; vdata += PAGE_SIZE; @@ -916,7 +955,7 @@ void lmfs_rw_scattered( } for (i = 0; i < nblocks; i++) { bp = bufq[i]; - if (r < (ssize_t) fs_block_size) { + if (r < (ssize_t)bp->lmfs_bytes) { /* Transfer failed. 
*/ if (i == 0) { bp->lmfs_dev = NO_DEV; /* Invalidate block */ @@ -929,7 +968,7 @@ void lmfs_rw_scattered( } else { MARKCLEAN(bp); } - r -= fs_block_size; + r -= bp->lmfs_bytes; } bufq += i; @@ -1126,13 +1165,3 @@ void lmfs_may_use_vmcache(int ok) { may_use_vmcache = ok; } - -void lmfs_reset_rdwt_err(void) -{ - rdwt_err = OK; -} - -int lmfs_rdwt_err(void) -{ - return rdwt_err; -} diff --git a/minix/lib/libminixfs/inc.h b/minix/lib/libminixfs/inc.h new file mode 100644 index 000000000..0f73e26e7 --- /dev/null +++ b/minix/lib/libminixfs/inc.h @@ -0,0 +1,7 @@ +#ifndef _LIBMINIXFS_INC_H +#define _LIBMINIXFS_INC_H + +int lmfs_get_partial_block(struct buf **bpp, dev_t dev, block64_t block, + int how, size_t block_size); + +#endif /* !_LIBMINIXFS_INC_H */ diff --git a/minix/servers/vm/mem_cache.c b/minix/servers/vm/mem_cache.c index 1f6942da5..93421a259 100644 --- a/minix/servers/vm/mem_cache.c +++ b/minix/servers/vm/mem_cache.c @@ -122,7 +122,8 @@ do_mapcache(message *msg) assert(offset < vr->length); if(!(hb = find_cached_page_bydev(dev, dev_off + offset, - msg->m_vmmcp.ino, ino_off + offset, 1))) { + msg->m_vmmcp.ino, ino_off + offset, 1)) || + (hb->flags & VMSF_ONCE)) { map_unmap_region(caller, vr, 0, bytes); return ENOENT; } diff --git a/minix/servers/vm/mem_file.c b/minix/servers/vm/mem_file.c index d73c062d4..dba3f514a 100644 --- a/minix/servers/vm/mem_file.c +++ b/minix/servers/vm/mem_file.c @@ -140,8 +140,8 @@ static int mappedfile_pagefault(struct vmproc *vmp, struct vir_region *region, if(!cb) { #if 0 printf("VM: mem_file: no callback, returning EFAULT\n"); -#endif sys_diagctl_stacktrace(vmp->vm_endpoint); +#endif return EFAULT; } diff --git a/minix/tests/Makefile b/minix/tests/Makefile index fde0607cb..db91e52d0 100644 --- a/minix/tests/Makefile +++ b/minix/tests/Makefile @@ -59,7 +59,7 @@ MINIX_TESTS= \ 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 \ 41 42 43 44 45 46 48 49 50 52 53 54 55 56 58 59 60 \ 61 64 65 66 67 68 69 70 71 72 73 74 75 76 
77 78 79 80 \ -81 82 83 84 +81 82 83 84 85 FILES += t84_h_nonexec.sh diff --git a/minix/tests/run b/minix/tests/run index bc0b807de..d442d2591 100755 --- a/minix/tests/run +++ b/minix/tests/run @@ -22,7 +22,7 @@ export USENETWORK # set to "yes" for test48+82 to use the network # Programs that require setuid setuids="test11 test33 test43 test44 test46 test56 test60 test61 test65 \ - test69 test73 test74 test78 test83" + test69 test73 test74 test78 test83 test85" # Scripts that require to be run as root rootscripts="testisofs testvnd testrelpol" @@ -30,7 +30,7 @@ alltests="1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 \ 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 \ 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 \ 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 \ - 81 82 83 84 sh1 sh2 interp mfs isofs vnd" + 81 82 83 84 85 sh1 sh2 interp mfs isofs vnd" tests_no=`expr 0` # If root, make sure the setuid tests have the correct permissions diff --git a/minix/tests/test72.c b/minix/tests/test72.c index 6c709692b..8e6cc1103 100644 --- a/minix/tests/test72.c +++ b/minix/tests/test72.c @@ -41,10 +41,11 @@ int dowriteblock(int b, int blocksize, u32_t seed, char *data) { struct buf *bp; + int r; assert(blocksize == curblocksize); - if(!(bp = lmfs_get_block(MYDEV, b, NORMAL))) { + if ((r = lmfs_get_block(&bp, MYDEV, b, NORMAL)) != 0) { e(30); return 0; } @@ -62,10 +63,11 @@ int readblock(int b, int blocksize, u32_t seed, char *data) { struct buf *bp; + int r; assert(blocksize == curblocksize); - if(!(bp = lmfs_get_block(MYDEV, b, NORMAL))) { + if ((r = lmfs_get_block(&bp, MYDEV, b, NORMAL)) != 0) { e(30); return 0; } diff --git a/minix/tests/test85.c b/minix/tests/test85.c new file mode 100644 index 000000000..b4bb4207d --- /dev/null +++ b/minix/tests/test85.c @@ -0,0 +1,540 @@ +/* Test for end-of-file during block device I/O - by D.C. van Moolenbroek */ +/* This test needs to be run as root; it sets up and uses a VND instance. 
*/ +/* + * The test should work with all root file system block sizes, but only tests + * certain corner cases if the root FS block size is twice the page size. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define VNCONFIG "/usr/sbin/vnconfig" + +#define SECTOR_SIZE 512 /* this should be the sector size of VND */ + +#define ITERATIONS 3 + +enum { + BEFORE_EOF, + UPTO_EOF, + ACROSS_EOF, + ONEPAST_EOF, + FROM_EOF, + BEYOND_EOF +}; + +#include "common.h" + +static int need_cleanup = 0; + +static int dev_fd; +static size_t dev_size; +static char *dev_buf; +static char *dev_ref; + +static size_t block_size; +static size_t page_size; +static int test_peek; + +static char *mmap_ptr = NULL; +static size_t mmap_size; + +static int pipe_fd[2]; + +/* + * Fill the given buffer with random contents. + */ +static void +fill_buf(char * buf, size_t size) +{ + + while (size--) + *buf++ = lrand48() & 0xff; +} + +/* + * Place the elements of the source array in the destination array in random + * order. There are probably better ways to do this, but it is morning, and I + * haven't had coffee yet, so go away. + */ +static void +scramble(int * dst, const int * src, int count) +{ + int i, j, k; + + for (i = 0; i < count; i++) + dst[i] = i; + + for (i = count - 1; i >= 0; i--) { + j = lrand48() % (i + 1); + + k = dst[j]; + dst[j] = dst[i]; + dst[i] = src[k]; + } +} + +/* + * Perform I/O using read(2) and check the returned results against the + * expected result and the image reference data. 
+ */ +static void +io_read(size_t pos, size_t len, size_t expected) +{ + ssize_t bytes; + + assert(len > 0 && len <= dev_size); + assert(expected <= len); + + if (lseek(dev_fd, (off_t)pos, SEEK_SET) != pos) e(0); + + memset(dev_buf, 0, len); + + if ((bytes = read(dev_fd, dev_buf, len)) < 0) e(0); + + if (bytes != expected) e(0); + + if (memcmp(&dev_ref[pos], dev_buf, bytes)) e(0); +} + +/* + * Perform I/O using write(2) and check the returned result against the + * expected result. Update the image reference data as appropriate. + */ +static void +io_write(size_t pos, size_t len, size_t expected) +{ + ssize_t bytes; + + assert(len > 0 && len <= dev_size); + assert(expected <= len); + + if (lseek(dev_fd, (off_t)pos, SEEK_SET) != pos) e(0); + + fill_buf(dev_buf, len); + + if ((bytes = write(dev_fd, dev_buf, len)) < 0) e(0); + + if (bytes != expected) e(0); + + if (bytes > 0) { + assert(pos + bytes <= dev_size); + + memcpy(&dev_ref[pos], dev_buf, bytes); + } +} + +/* + * Test if reading from the given pointer succeeds or not, and return the + * result. + */ +static int +is_readable(char * ptr) +{ + ssize_t r; + char byte; + + /* + * If we access the pointer directly, we will get a fatal signal. + * Thus, for that to work we would need a child process, making the + * whole test slow and noisy. Let a service try the operation instead. + */ + r = write(pipe_fd[1], ptr, 1); + + if (r == 1) { + /* Don't fill up the pipe. */ + if (read(pipe_fd[0], &byte, 1) != 1) e(0); + + return 1; + } else if (r != -1 || errno != EFAULT) + e(0); + + return 0; +} + +/* + * Perform I/O using mmap(2) and check the returned results against the + * expected result and the image reference data. Ensure that bytes beyond the + * device end are either zero (on the remainder of the last page) or + * inaccessible on pages entirely beyond the device end. 
+ */ +static void +io_peek(size_t pos, size_t len, size_t expected) +{ + size_t n, delta, mapped_size; + char *ptr; + + assert(test_peek); + + delta = pos % page_size; + + pos -= delta; + len += delta; + + len = roundup(len, page_size); + + /* Don't bother with the given expected value. Recompute it. */ + if (pos < dev_size) + expected = MIN(dev_size - pos, len); + else + expected = 0; + + mapped_size = roundup(dev_size, page_size); + + assert(!(len % page_size)); + + ptr = mmap(NULL, len, PROT_READ, MAP_PRIVATE | MAP_FILE, dev_fd, + (off_t)pos); + + /* + * As of writing, VM allows memory mapping at any offset and for any + * length. At least for block devices, VM should probably be changed + * to throw ENXIO for any pages beyond the file end, which in turn + * renders all the SIGBUS tests below obsolete. + */ + if (ptr == MAP_FAILED) { + if (pos + len <= mapped_size) e(0); + if (errno != ENXIO) e(0); + + return; + } + + mmap_ptr = ptr; + mmap_size = len; + + /* + * Any page that contains any valid part of the mapped device should be + * readable and have correct contents for that part. If the last valid + * page extends beyond the mapped device, its remainder should be zero. + */ + if (pos < dev_size) { + /* The valid part should have the expected device contents. */ + if (memcmp(&dev_ref[pos], ptr, expected)) e(0); + + /* The remainder, if any, should be zero. */ + for (n = expected; n % page_size; n++) + if (ptr[n] != 0) e(0); + } + + /* + * Any page entirely beyond EOF should not be mapped in. In order to + * ensure that is_readable() works, also test pages that are mapped in. + */ + for (n = pos; n < pos + len; n += page_size) + if (is_readable(&ptr[n - pos]) != (n < mapped_size)) e(0); + + munmap(ptr, len); + + mmap_ptr = NULL; +} + +/* + * Perform one of the supported end-of-file access attempts using one I/O + * operation. 
+ */ +static void +do_one_io(int where, void (* io_proc)(size_t, size_t, size_t)) +{ + size_t start, bytes; + + switch (where) { + case BEFORE_EOF: + bytes = lrand48() % (dev_size - 1) + 1; + + io_proc(dev_size - bytes - 1, bytes, bytes); + + break; + + case UPTO_EOF: + bytes = lrand48() % dev_size + 1; + + io_proc(dev_size - bytes, bytes, bytes); + + break; + + case ACROSS_EOF: + start = lrand48() % (dev_size - 1) + 1; + bytes = dev_size - start + 1; + assert(start < dev_size && start + bytes > dev_size); + bytes += lrand48() % (dev_size - bytes + 1); + + io_proc(start, bytes, dev_size - start); + + break; + + case ONEPAST_EOF: + bytes = lrand48() % (dev_size - 1) + 1; + + io_proc(dev_size - bytes + 1, bytes, bytes - 1); + + break; + + case FROM_EOF: + bytes = lrand48() % dev_size + 1; + + io_proc(dev_size, bytes, 0); + + break; + + case BEYOND_EOF: + start = dev_size + lrand48() % dev_size + 1; + bytes = lrand48() % dev_size + 1; + + io_proc(start, bytes, 0); + + break; + + default: + assert(0); + } +} + +/* + * Perform I/O operations, testing all the supported end-of-file access + * attempts in a random order so as to detect possible problems with caching. + */ +static void +do_io(void (* io_proc)(size_t, size_t, size_t)) +{ + static const int list[] = { BEFORE_EOF, UPTO_EOF, ACROSS_EOF, + ONEPAST_EOF, FROM_EOF, BEYOND_EOF }; + static const int count = sizeof(list) / sizeof(list[0]); + int i, where[count]; + + scramble(where, list, count); + + for (i = 0; i < count; i++) + do_one_io(where[i], io_proc); +} + +/* + * Set up an image file of the given size, assign it to a VND, and open the + * resulting block device. The size is size_t because we keep a reference copy + * of its entire contents in memory. 
+ */ +static void +setup_image(size_t size) +{ + struct part_geom part; + size_t off; + ssize_t bytes; + int fd, status; + + dev_size = size; + if ((dev_buf = malloc(dev_size)) == NULL) e(0); + if ((dev_ref = malloc(dev_size)) == NULL) e(0); + + if ((fd = open("image", O_CREAT | O_TRUNC | O_RDWR, 0644)) < 0) e(0); + + fill_buf(dev_ref, dev_size); + + for (off = 0; off < dev_size; off += bytes) { + bytes = write(fd, &dev_ref[off], dev_size - off); + + if (bytes <= 0) e(0); + } + + close(fd); + + status = system(VNCONFIG " vnd0 image 2>/dev/null"); + if (!WIFEXITED(status)) e(0); + if (WEXITSTATUS(status) != 0) { + printf("skipped\n"); /* most likely cause: vnd0 is in use */ + cleanup(); + exit(0); + } + + need_cleanup = 1; + + if ((dev_fd = open("/dev/vnd0", O_RDWR)) < 0) e(0); + + if (ioctl(dev_fd, DIOCGETP, &part) < 0) e(0); + + if (part.size != dev_size) e(0); +} + +/* + * Clean up the VND we set up previously. This function is also called in case + * of an unexpected exit. + */ +static void +cleanup_device(void) +{ + int status; + + if (!need_cleanup) + return; + + if (mmap_ptr != NULL) { + munmap(mmap_ptr, mmap_size); + + mmap_ptr = NULL; + } + + if (dev_fd >= 0) + close(dev_fd); + + status = system(VNCONFIG " -u vnd0 2>/dev/null"); + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) e(0); + + need_cleanup = 0; +} + +/* + * Signal handler for exceptions. + */ +static void +got_signal(int __unused sig) +{ + + cleanup_device(); + + exit(1); +} + +/* + * Clean up the VND and image file we set up previously. + */ +static void +cleanup_image(void) +{ + size_t off; + ssize_t bytes; + int fd; + + cleanup_device(); + + if ((fd = open("image", O_RDONLY, 0644)) < 0) e(0); + + for (off = 0; off < dev_size; off += bytes) { + bytes = read(fd, &dev_buf[off], dev_size - off); + + if (bytes <= 0) e(0); + } + + close(fd); + + /* Have all changes written back to the device? 
*/ + if (memcmp(dev_buf, dev_ref, dev_size)) e(0); + + unlink("image"); + + free(dev_buf); + free(dev_ref); +} + +/* + * Run the full test for a block device with the given size. + */ +static void +do_test(size_t size) +{ + int i; + + /* + * Using the three I/O primitives (read, write, peek), we run four + * sequences, mainly to test the effects of blocks being cached or not. + * We set up a new image for each sequence, because -if everything goes + * right- closing the device file also clears all cached blocks for it, + * in both the root file system's cache and the VM cache. Note that we + * currently do not even attempt to push the blocks out of the root FS' + * cache in order to test retrieval from the VM cache, since this would + * involve doing a LOT of extra I/O. + */ + for (i = 0; i < 4; i++) { + setup_image(size); + + switch (i) { + case 0: + do_io(io_read); + + /* FALLTHROUGH */ + case 1: + do_io(io_write); + + do_io(io_read); + + break; + + case 2: + do_io(io_peek); + + /* FALLTHROUGH */ + + case 3: + do_io(io_write); + + do_io(io_peek); + + break; + } + + cleanup_image(); + } +} + +/* + * Test program for end-of-file conditions during block device I/O. + */ +int +main(void) +{ + static const unsigned int blocks[] = { 1, 4, 3, 5, 2 }; + struct statvfs buf; + int i, j; + + start(85); + + signal(SIGINT, got_signal); + signal(SIGABRT, got_signal); + signal(SIGSEGV, got_signal); + signal(SIGBUS, got_signal); + atexit(cleanup_device); + + srand48(time(NULL)); + + if (pipe(pipe_fd) != 0) e(0); + + /* + * Get the system page size, and align all memory mapping offsets and + * sizes accordingly. + */ + page_size = sysconf(_SC_PAGESIZE); + + /* + * Get the root file system block size. In the current MINIX3 system + * architecture, the root file system's block size determines the + * transfer granularity for I/O on unmounted block devices. If this + * block size is not a multiple of the page size, we are (currently!) 
+ * not expecting memory-mapped block devices to work. + */ + if (statvfs("/", &buf) < 0) e(0); + + block_size = buf.f_bsize; + + test_peek = !(block_size % page_size); + + for (i = 0; i < ITERATIONS; i++) { + /* + * The 'blocks' array is scrambled so as to detect any blocks + * left in the VM cache (or not) across runs, just in case. + */ + for (j = 0; j < sizeof(blocks) / sizeof(blocks[0]); j++) { + do_test(blocks[j] * block_size + SECTOR_SIZE); + + do_test(blocks[j] * block_size); + + do_test(blocks[j] * block_size - SECTOR_SIZE); + } + } + + quit(); +}