From: David van Moolenbroek Date: Wed, 30 Dec 2015 13:17:42 +0000 (+0000) Subject: Fix soft faults in FSes resulting in partial I/O X-Git-Url: http://zhaoyanbai.com/repos/%22http:/www.isc.org/icons/zlib_tech.html?a=commitdiff_plain;h=10b7016b5abe43d40ea8e1426bbd228596993e98;p=minix.git Fix soft faults in FSes resulting in partial I/O In order to resolve page faults on file-mapped pages, VM may need to communicate (through VFS) with a file system. The file system must therefore not be the one to cause, and thus end up being blocked on, such page faults. To resolve this potential deadlock, the safecopy system was previously extended with the CPF_TRY flag, which causes the kernel to return EFAULT to the caller of a safecopy function upon getting a pagefault, bypassing VM and thus avoiding the loop. VFS was extended to repeat relevant file system calls that returned EFAULT, after resolving the page fault, to keep these soft faults from being exposed to applications. However, general UNIX I/O semantics dictate that if an I/O transfer partially succeeded before running into a failure, the partial result is to be returned. Proper file system implementations may therefore end up returning partial success rather than the EFAULT code resulting from a soft fault. Since VFS does not get the EFAULT code in this case, it does not know that a soft fault occurred, and thus does not repeat the call either. The end result is that an application may get partial I/O results (e.g., a short read(2)) even on regular files. Applications cannot reasonably be expected to deal with this. Due to the fact that most of the current file system implementations do not implement proper partial-failure semantics, this problem is not yet widespread. In fact, it has only occurred on direct block device I/O so far. However, the next generation of file system services will be implementing proper I/O semantics, thus exacerbating the problem. To remedy this situation, this patch changes the CPF_TRY semantics: whenever the kernel experiences a soft fault during a safecopy call, in addition to returning FAULT, the kernel also stores a mark in the grant created with CPF_TRY. Instead of testing on EFAULT, VFS checks whether the grant was marked, as part of revoking the grant. If the grant was indeed marked by the kernel, VFS repeats the file system operation, regardless of its initial return value. Thus, the EFAULT code now only serves to make the file system fail the call faster. The approach is currently supported for both direct and magic grants, but is used only with magic grants - arguably the only case where it makes sense. Indirect grants should not have CPF_TRY set; in a chain of indirect grants, the original grant is marked, as it should be. In order to avoid potential SMP issues, the mark stored in the grant is its grant identifier, so as to discard outdated kernel writes. Whether this is necessary or effective remains to be evaluated. This patch also cleans up the grant structure a bit, removing reserved space and thus making the structure slightly smaller. The structure is used internally between system services only, so there is no need for binary compatibility. Change-Id: I6bb3990dce67a80146d954546075ceda4d6567f8 --- diff --git a/minix/include/minix/safecopies.h b/minix/include/minix/safecopies.h index 53580ff3d..c1214a285 100644 --- a/minix/include/minix/safecopies.h +++ b/minix/include/minix/safecopies.h @@ -7,21 +7,20 @@ #include typedef struct { - int cp_flags; /* CPF_* below */ - union ixfer_cp_u{ + int cp_flags; /* CPF_* below */ + int cp_seq; /* sequence number */ + union ixfer_cp_u { struct { /* CPF_DIRECT */ endpoint_t cp_who_to; /* grantee */ vir_bytes cp_start; /* memory */ size_t cp_len; /* size in bytes */ - char cp_reserved[8]; /* future use */ } cp_direct; struct { /* CPF_INDIRECT */ endpoint_t cp_who_to; /* grantee */ endpoint_t cp_who_from; /* previous granter */ cp_grant_id_t cp_grant; /* previous grant */ - char cp_reserved[8];/* future use */ } cp_indirect; struct { /* CPF_MAGIC */ @@ -29,15 +28,13 @@ typedef struct { endpoint_t cp_who_to; /* grantee */ vir_bytes cp_start; /* memory */ size_t cp_len; /* size in bytes */ - char cp_reserved[8]; /* future use */ } cp_magic; struct { /* (free slot) */ int cp_next; /* next free or -1 */ } cp_free; } cp_u; - int cp_seq; /* sequence number */ - char cp_reserved[4]; /* future use */ + cp_grant_id_t cp_faulted; /* soft fault marker (CPF_TRY only) */ } cp_grant_t; /* Vectored safecopy. */ @@ -45,7 +42,6 @@ struct vscp_vec { /* Exactly one of the following must be SELF. */ endpoint_t v_from; /* source */ endpoint_t v_to; /* destination */ - cp_grant_id_t v_gid; /* grant id of other process */ size_t v_offset; /* offset in other grant */ vir_bytes v_addr; /* address in copier's space */ @@ -78,6 +74,9 @@ struct vscp_vec { #define CPF_MAGIC 0x000800 /* Grant from any to any. */ #define CPF_VALID 0x001000 /* Grant slot contains valid grant. */ +/* Special cpf_revoke() return values. */ +#define GRANT_FAULTED 1 /* CPF_TRY: a soft fault occurred */ + /* Prototypes for functions in libsys. */ cp_grant_id_t cpf_grant_direct(endpoint_t, vir_bytes, size_t, int); cp_grant_id_t cpf_grant_indirect(endpoint_t, endpoint_t, cp_grant_id_t); diff --git a/minix/kernel/proto.h b/minix/kernel/proto.h index 209fae7f9..44c99e8b4 100644 --- a/minix/kernel/proto.h +++ b/minix/kernel/proto.h @@ -156,8 +156,9 @@ void hook_ipc_clear(struct proc *proc); #endif /* system/do_safecopy.c */ +struct cp_sfinfo; /* external callers may only provide NULL */ int verify_grant(endpoint_t, endpoint_t, cp_grant_id_t, vir_bytes, int, - vir_bytes, vir_bytes *, endpoint_t *, u32_t *); + vir_bytes, vir_bytes *, endpoint_t *, struct cp_sfinfo *); /* system/do_diagctl.c */ int do_diagctl(struct proc * caller, message *m); diff --git a/minix/kernel/system/do_safecopy.c b/minix/kernel/system/do_safecopy.c index 5284741a5..6002a9011 100644 --- a/minix/kernel/system/do_safecopy.c +++ b/minix/kernel/system/do_safecopy.c @@ -29,6 +29,13 @@ static int safecopy(struct proc *, endpoint_t, endpoint_t, #define HASGRANTTABLE(gr) \ (priv(gr) && priv(gr)->s_grant_table) +struct cp_sfinfo { /* information for handling soft faults */ + int try; /* if nonzero, try copy only, stop on fault */ + endpoint_t endpt; /* endpoint owning grant with CPF_TRY flag */ + vir_bytes addr; /* address to write mark upon soft fault */ + cp_grant_id_t value; /* grant ID to use as mark value to write */ +}; + /*===========================================================================* * verify_grant * *===========================================================================*/ @@ -41,7 +48,7 @@ int verify_grant( vir_bytes offset_in, /* copy offset within grant */ vir_bytes *offset_result, /* copy offset within virtual address space */ endpoint_t *e_granter, /* new granter (magic grants) */ - u32_t *flags /* CPF_* */ + struct cp_sfinfo *sfinfo /* storage for soft fault information */ ) { cp_grant_t g; @@ -120,8 +127,6 @@ int verify_grant( return EPERM; } - if(flags) *flags = g.cp_flags; - /* Check validity: flags and sequence number. */ if((g.cp_flags & (CPF_USED | CPF_VALID)) != (CPF_USED | CPF_VALID)) { @@ -250,6 +255,14 @@ int verify_grant( return EPERM; } + /* If requested, store information regarding soft faults. */ + if (sfinfo != NULL && (sfinfo->try = !!(g.cp_flags & CPF_TRY))) { + sfinfo->endpt = granter; + sfinfo->addr = priv(granter_proc)->s_grant_table + + sizeof(g) * grant_idx + offsetof(cp_grant_t, cp_faulted); + sfinfo->value = grant; + } + return OK; } @@ -274,7 +287,7 @@ static int safecopy( endpoint_t new_granter, *src, *dst; struct proc *granter_p; int r; - u32_t flags; + struct cp_sfinfo sfinfo; #if PERF_USE_COW_SAFECOPY vir_bytes size; #endif @@ -295,7 +308,7 @@ static int safecopy( /* Verify permission exists. */ if((r=verify_grant(granter, grantee, grantid, bytes, access, - g_offset, &v_offset, &new_granter, &flags)) != OK) { + g_offset, &v_offset, &new_granter, &sfinfo)) != OK) { if(r == ENOTREADY) return r; printf( "grant %d verify to copy %d->%d by %d failed: err %d\n", @@ -325,11 +338,39 @@ static int safecopy( } /* Do the regular copy. */ - if(flags & CPF_TRY) { - int r; - /* Try copy without transparently faulting in pages. */ + if (sfinfo.try) { + /* + * Try copying without transparently faulting in pages. + * TODO: while CPF_TRY is meant to protect against deadlocks on + * memory-mapped files in file systems, it seems that this case + * triggers faults a whole lot more often, resulting in extra + * overhead due to retried file system operations. It might be + * a good idea to go through VM even in this case, and have VM + * fail (only) if the affected page belongs to a file mapping. + */ r = virtual_copy(&v_src, &v_dst, bytes); - if(r == EFAULT_SRC || r == EFAULT_DST) return EFAULT; + if (r == EFAULT_SRC || r == EFAULT_DST) { + /* + * Mark the magic grant as having experienced a soft + * fault during its lifetime. The exact value does not + * matter, but we use the grant ID (including its + * sequence number) as a form of protection in the + * light of CPU concurrency. + */ + r = data_copy(KERNEL, (vir_bytes)&sfinfo.value, + sfinfo.endpt, sfinfo.addr, sizeof(sfinfo.value)); + /* + * Failure means the creator of the magic grant messed + * up, which can only be unintentional, so report.. + */ + if (r != OK) + printf("Kernel: writing soft fault marker %d " + "into %d at 0x%lx failed (%d)\n", + sfinfo.value, sfinfo.endpt, sfinfo.addr, + r); + + return EFAULT; + } return r; } return virtual_copy_vmcheck(caller, &v_src, &v_dst, bytes); diff --git a/minix/lib/libsys/safecopies.c b/minix/lib/libsys/safecopies.c index cec339f6a..e22620ff1 100644 --- a/minix/lib/libsys/safecopies.c +++ b/minix/lib/libsys/safecopies.c @@ -152,6 +152,7 @@ cpf_grant_direct(endpoint_t who_to, vir_bytes addr, size_t bytes, int access) grants[g].cp_u.cp_direct.cp_who_to = who_to; grants[g].cp_u.cp_direct.cp_start = addr; grants[g].cp_u.cp_direct.cp_len = bytes; + grants[g].cp_faulted = GRANT_INVALID; __insn_barrier(); grants[g].cp_flags = access | CPF_DIRECT | CPF_USED | CPF_VALID; @@ -174,6 +175,7 @@ cpf_grant_indirect(endpoint_t who_to, endpoint_t who_from, cp_grant_id_t gr) grants[g].cp_u.cp_indirect.cp_who_to = who_to; grants[g].cp_u.cp_indirect.cp_who_from = who_from; grants[g].cp_u.cp_indirect.cp_grant = gr; + grants[g].cp_faulted = GRANT_INVALID; __insn_barrier(); grants[g].cp_flags = CPF_USED | CPF_INDIRECT | CPF_VALID; @@ -198,23 +200,39 @@ cpf_grant_magic(endpoint_t who_to, endpoint_t who_from, grants[g].cp_u.cp_magic.cp_who_from = who_from; grants[g].cp_u.cp_magic.cp_start = addr; grants[g].cp_u.cp_magic.cp_len = bytes; + grants[g].cp_faulted = GRANT_INVALID; __insn_barrier(); grants[g].cp_flags = CPF_USED | CPF_MAGIC | CPF_VALID | access; return GRANT_ID(g, grants[g].cp_seq); } +/* + * Revoke previously granted access, identified by grant ID. Return -1 on + * error, with errno set as appropriate. Return 0 on success, with one + * exception: return GRANT_FAULTED (1) if a grant was created with CPF_TRY and + * during its lifetime, a copy from or to the grant experienced a soft fault. + */ int cpf_revoke(cp_grant_id_t grant) { -/* Revoke previously granted access, identified by grant id. */ - int g; + int r, g; GID_CHECK_USED(grant); g = GRANT_IDX(grant); - /* Make grant invalid by setting flags to 0, clearing CPF_USED. + /* + * If a safecopy action on a (direct or magic) grant with the CPF_TRY + * flag failed on a soft fault, the kernel will have set the cp_faulted + * field to the grant identifier. Here, we test this and return + * GRANT_FAULTED (1) on a match. + */ + r = ((grants[g].cp_flags & CPF_TRY) && + grants[g].cp_faulted == grant) ? GRANT_FAULTED : 0; + + /* + * Make grant invalid by setting flags to 0, clearing CPF_USED. * This invalidates the grant. */ grants[g].cp_flags = 0; @@ -239,7 +257,7 @@ cpf_revoke(cp_grant_id_t grant) grants[g].cp_u.cp_free.cp_next = freelist; freelist = g; - return 0; + return r; } /* diff --git a/minix/servers/vfs/comm.c b/minix/servers/vfs/comm.c index 9381cc323..e21e62159 100644 --- a/minix/servers/vfs/comm.c +++ b/minix/servers/vfs/comm.c @@ -161,7 +161,10 @@ int fs_sendrec(endpoint_t fs_e, message *reqmp) assert(self->w_sendrec == NULL); - return(reqmp->m_type); + r = reqmp->m_type; + if (r == ERESTART) /* ERESTART is used internally, so make sure it is.. */ + r = EIO; /* ..not delivered as a result from a file system. */ + return(r); } /*===========================================================================* diff --git a/minix/servers/vfs/pipe.c b/minix/servers/vfs/pipe.c index 3cba587e2..2aee4ac5c 100644 --- a/minix/servers/vfs/pipe.c +++ b/minix/servers/vfs/pipe.c @@ -483,7 +483,7 @@ void revive(endpoint_t proc_e, int returned) * it again now that I/O is done. */ if (GRANT_VALID(rfp->fp_grant)) { - if(cpf_revoke(rfp->fp_grant)) { + if(cpf_revoke(rfp->fp_grant) == -1) { panic("VFS: revoke failed for grant: %d", rfp->fp_grant); } diff --git a/minix/servers/vfs/request.c b/minix/servers/vfs/request.c index ec019fbcb..e34874d35 100644 --- a/minix/servers/vfs/request.c +++ b/minix/servers/vfs/request.c @@ -49,7 +49,9 @@ static int req_breadwrite_actual(endpoint_t fs_e, endpoint_t user_e, dev_t dev, /* Send/rec request */ r = fs_sendrec(fs_e, &m); - cpf_revoke(grant_id); + + if (cpf_revoke(grant_id) == GRANT_FAULTED) return(ERESTART); + if (r != OK) return(r); /* Fill in response structure */ @@ -68,7 +70,7 @@ int req_breadwrite(endpoint_t fs_e, endpoint_t user_e, dev_t dev, off_t pos, r = req_breadwrite_actual(fs_e, user_e, dev, pos, num_of_bytes, user_addr, rw_flag, new_pos, cum_iop, CPF_TRY); - if(r == EFAULT) { + if (r == ERESTART) { if((r=vm_vfs_procctl_handlemem(user_e, user_addr, num_of_bytes, rw_flag == READING)) != OK) { return r; @@ -324,7 +326,8 @@ static int req_getdents_actual( } r = fs_sendrec(fs_e, &m); - cpf_revoke(grant_id); + + if (cpf_revoke(grant_id) == GRANT_FAULTED) return(ERESTART); if (r == OK) { *new_pos = m.m_fs_vfs_getdents.seek_pos; @@ -351,7 +354,9 @@ int req_getdents( r = req_getdents_actual(fs_e, inode_nr, pos, buf, size, new_pos, direct, CPF_TRY); - if(r == EFAULT && !direct) { + if (r == ERESTART) { + assert(!direct); + if((r=vm_vfs_procctl_handlemem(who_e, buf, size, 1)) != OK) { return r; } @@ -736,7 +741,8 @@ static int req_rdlink_actual(endpoint_t fs_e, ino_t inode_nr, /* Send/rec request */ r = fs_sendrec(fs_e, &m); - cpf_revoke(grant_id); + + if (cpf_revoke(grant_id) == GRANT_FAULTED) return(ERESTART); if (r == OK) r = m.m_fs_vfs_rdlink.nbytes; @@ -756,7 +762,9 @@ int req_rdlink(endpoint_t fs_e, ino_t inode_nr, endpoint_t proc_e, r = req_rdlink_actual(fs_e, inode_nr, proc_e, buf, len, direct, CPF_TRY); - if(r == EFAULT && !direct) { + if (r == ERESTART) { + assert(!direct); + if((r=vm_vfs_procctl_handlemem(proc_e, buf, len, 1)) != OK) { return r; } @@ -854,7 +862,8 @@ static int req_readwrite_actual(endpoint_t fs_e, ino_t inode_nr, off_t pos, /* Send/rec request */ r = fs_sendrec(fs_e, &m); - cpf_revoke(grant_id); + + if (cpf_revoke(grant_id) == GRANT_FAULTED) return(ERESTART); if (r == OK) { /* Fill in response structure */ @@ -877,9 +886,9 @@ int req_readwrite(endpoint_t fs_e, ino_t inode_nr, off_t pos, r = req_readwrite_actual(fs_e, inode_nr, pos, rw_flag, user_e, user_addr, num_of_bytes, new_posp, cum_iop, CPF_TRY); - if(r == EFAULT) { - if((r=vm_vfs_procctl_handlemem(user_e, (vir_bytes) user_addr, num_of_bytes, - rw_flag == READING)) != OK) { + if (r == ERESTART) { + if ((r=vm_vfs_procctl_handlemem(user_e, (vir_bytes) user_addr, + num_of_bytes, rw_flag == READING)) != OK) { return r; } @@ -1034,8 +1043,9 @@ static int req_slink_actual( /* Send/rec request */ r = fs_sendrec(fs_e, &m); + cpf_revoke(gid_name); - cpf_revoke(gid_buf); + if (cpf_revoke(gid_buf) == GRANT_FAULTED) return(ERESTART); return(r); } @@ -1059,7 +1069,7 @@ int req_slink( r = req_slink_actual(fs_e, inode_nr, lastc, proc_e, path_addr, path_length, uid, gid, CPF_TRY); - if(r == EFAULT) { + if (r == ERESTART) { if((r=vm_vfs_procctl_handlemem(proc_e, (vir_bytes) path_addr, path_length, 0)) != OK) { return r; @@ -1096,7 +1106,8 @@ int req_stat_actual(endpoint_t fs_e, ino_t inode_nr, endpoint_t proc_e, /* Send/rec request */ r = fs_sendrec(fs_e, &m); - cpf_revoke(grant_id); + + if (cpf_revoke(grant_id) == GRANT_FAULTED) return(ERESTART); return(r); } @@ -1112,7 +1123,7 @@ int req_stat(endpoint_t fs_e, ino_t inode_nr, endpoint_t proc_e, r = req_stat_actual(fs_e, inode_nr, proc_e, buf, CPF_TRY); - if(r == EFAULT) { + if (r == ERESTART) { if((r=vm_vfs_procctl_handlemem(proc_e, (vir_bytes) buf, sizeof(struct stat), 1)) != OK) { return r; diff --git a/minix/tests/blocktest/blocktest.c b/minix/tests/blocktest/blocktest.c index 136bbefa6..e7e235ade 100644 --- a/minix/tests/blocktest/blocktest.c +++ b/minix/tests/blocktest/blocktest.c @@ -318,7 +318,7 @@ static void raw_xfer(dev_t minor, u64_t pos, iovec_s_t *iovec, int nr_req, r = sendrec_driver(&m, exp, res); - if (cpf_revoke(grant) != OK) + if (cpf_revoke(grant) == -1) panic("unable to revoke grant"); if (r != RESULT_OK) @@ -359,7 +359,7 @@ static void vir_xfer(dev_t minor, u64_t pos, iovec_t *iovec, int nr_req, for (i = 0; i < nr_req; i++) { iovec[i].iov_size = iov_s[i].iov_size; - if (cpf_revoke(iov_s[i].iov_grant) != OK) + if (cpf_revoke(iov_s[i].iov_grant) == -1) panic("unable to revoke grant"); } } @@ -1181,7 +1181,7 @@ static int vir_ioctl(dev_t minor, unsigned long req, void *ptr, ssize_t exp, r = sendrec_driver(&m, exp, res); - if (cpf_revoke(grant) != OK) + if (cpf_revoke(grant) == -1) panic("unable to revoke grant"); return r; diff --git a/minix/tests/test74.c b/minix/tests/test74.c index 5c5119e81..8478e037b 100644 --- a/minix/tests/test74.c +++ b/minix/tests/test74.c @@ -791,6 +791,90 @@ hole_regression(void) close(fd); } +/* + * Test that soft faults during file system I/O do not cause functions to + * return partial I/O results. + * + * We refer to the faults that are caused internally within the operating + * system as a result of the deadlock mitigation described at the top of this + * file, as a particular class of "soft faults". Such soft faults may occur in + * the middle of an I/O operation, and general I/O semantics dictate that upon + * partial success, the partial success is returned (and *not* an error). As a + * result, these soft faults, if not handled as special cases, may cause even + * common file system operations such as read(2) on a regular file to return + * fewer bytes than requested. Such unexpected short reads are typically not + * handled well by userland, and the OS must prevent them from occurring if it + * can. Note that read(2) is the most problematic, but certainly not the only, + * case where this problem can occur. + * + * Unfortunately, several file system services are not following the proper + * general I/O semantics - and this includes MFS. Therefore, for now, we have + * to test this case using block device I/O, which does do the right thing. + * In this test we hope that the root file system is mounted on a block device + * usable for (read-only!) testing purposes. + */ +static void +softfault_partial(void) +{ + struct statvfs stf; + struct stat st; + char *buf, *buf2; + ssize_t size; + int fd; + + if (statvfs("/", &stf) != 0) e(0); + + /* + * If the root file system is not mounted off a block device, or if we + * cannot open that device ourselves, simply skip this subtest. + */ + if (stat(stf.f_mntfromname, &st) != 0 || !S_ISBLK(st.st_mode)) + return; /* skip subtest */ + + if ((fd = open(stf.f_mntfromname, O_RDONLY)) == -1) + return; /* skip subtest */ + + /* + * See if we can read in the first two full blocks, or two pages worth + * of data, whichever is larger. If that fails, there is no point in + * continuing the test. + */ + size = MAX(stf.f_bsize, PAGE_SIZE) * 2; + + if ((buf = mmap(NULL, size, PROT_READ | PROT_READ, + MAP_ANON | MAP_PRIVATE | MAP_PREALLOC, -1, 0)) == MAP_FAILED) e(0); + + if (read(fd, buf, size) != size) { + munmap(buf, size); + close(fd); + return; /* skip subtest */ + } + + lseek(fd, 0, SEEK_SET); + + /* + * Now attempt a read to a partially faulted-in buffer. The first time + * around, the I/O transfer will generate a fault and return partial + * success. In that case, the entire I/O transfer should be retried + * after faulting in the missing page(s), thus resulting in the read + * succeeding in full. + */ + if ((buf2 = mmap(NULL, size, PROT_READ | PROT_READ, + MAP_ANON | MAP_PRIVATE, -1, 0)) == MAP_FAILED) e(0); + buf2[0] = '\0'; /* fault in the first page */ + + if (read(fd, buf2, size) != size) e(0); + + /* The result should be correct, too. */ + if (memcmp(buf, buf2, size)) e(0); + + /* Clean up. */ + munmap(buf2, size); + munmap(buf, size); + + close(fd); +} + int main(int argc, char *argv[]) { @@ -814,6 +898,8 @@ main(int argc, char *argv[]) test_memory_types_vs_operations(); + softfault_partial(); + makefiles(MAXFILES); cachequiet(!bigflag);