#include <stdint.h>
typedef struct {
- int cp_flags; /* CPF_* below */
- union ixfer_cp_u{
+ int cp_flags; /* CPF_* below */
+ int cp_seq; /* sequence number, embedded in grant IDs (see GRANT_ID) */
+ union ixfer_cp_u {
struct {
/* CPF_DIRECT */
endpoint_t cp_who_to; /* grantee */
vir_bytes cp_start; /* memory */
size_t cp_len; /* size in bytes */
- char cp_reserved[8]; /* future use */
} cp_direct;
struct {
/* CPF_INDIRECT */
endpoint_t cp_who_to; /* grantee */
endpoint_t cp_who_from; /* previous granter */
cp_grant_id_t cp_grant; /* previous grant */
- char cp_reserved[8];/* future use */
} cp_indirect;
struct {
/* CPF_MAGIC */
endpoint_t cp_who_to; /* grantee */
vir_bytes cp_start; /* memory */
size_t cp_len; /* size in bytes */
- char cp_reserved[8]; /* future use */
} cp_magic;
struct {
/* (free slot) */
int cp_next; /* next free or -1 */
} cp_free;
} cp_u;
- int cp_seq; /* sequence number */
- char cp_reserved[4]; /* future use */
+ cp_grant_id_t cp_faulted; /* ID of grant on which a soft fault occurred (CPF_TRY grants only); GRANT_INVALID otherwise */
} cp_grant_t;
/* Vectored safecopy. */
/* Exactly one of the following must be SELF. */
endpoint_t v_from; /* source */
endpoint_t v_to; /* destination */
-
cp_grant_id_t v_gid; /* grant id of other process */
size_t v_offset; /* offset in other grant */
vir_bytes v_addr; /* address in copier's space */
#define CPF_MAGIC 0x000800 /* Grant from any to any. */
#define CPF_VALID 0x001000 /* Grant slot contains valid grant. */
+/* Special cpf_revoke() return values. */
+#define GRANT_FAULTED 1 /* CPF_TRY: a soft fault occurred */
+
/* Prototypes for functions in libsys. */
cp_grant_id_t cpf_grant_direct(endpoint_t, vir_bytes, size_t, int);
cp_grant_id_t cpf_grant_indirect(endpoint_t, endpoint_t, cp_grant_id_t);
#endif
/* system/do_safecopy.c */
+struct cp_sfinfo; /* external callers may only provide NULL */
int verify_grant(endpoint_t, endpoint_t, cp_grant_id_t, vir_bytes, int,
- vir_bytes, vir_bytes *, endpoint_t *, u32_t *);
+ vir_bytes, vir_bytes *, endpoint_t *, struct cp_sfinfo *);
/* system/do_diagctl.c */
int do_diagctl(struct proc * caller, message *m);
#define HASGRANTTABLE(gr) \
(priv(gr) && priv(gr)->s_grant_table)
+struct cp_sfinfo { /* information for handling soft faults */
+ int try; /* nonzero if the grant has CPF_TRY set: try copy only, stop on fault */
+ endpoint_t endpt; /* endpoint owning the grant with the CPF_TRY flag */
+ vir_bytes addr; /* address of the grant's cp_faulted field, where the mark is written upon soft fault */
+ cp_grant_id_t value; /* grant ID (incl. sequence number) to use as the mark value to write */
+};
+
/*===========================================================================*
* verify_grant *
*===========================================================================*/
vir_bytes offset_in, /* copy offset within grant */
vir_bytes *offset_result, /* copy offset within virtual address space */
endpoint_t *e_granter, /* new granter (magic grants) */
- u32_t *flags /* CPF_* */
+ struct cp_sfinfo *sfinfo /* storage for soft fault information */
)
{
cp_grant_t g;
return EPERM;
}
- if(flags) *flags = g.cp_flags;
-
/* Check validity: flags and sequence number. */
if((g.cp_flags & (CPF_USED | CPF_VALID)) !=
(CPF_USED | CPF_VALID)) {
return EPERM;
}
+ /* If requested, store information regarding soft faults. */
+ if (sfinfo != NULL && (sfinfo->try = !!(g.cp_flags & CPF_TRY))) {
+ sfinfo->endpt = granter;
+ sfinfo->addr = priv(granter_proc)->s_grant_table +
+ sizeof(g) * grant_idx + offsetof(cp_grant_t, cp_faulted);
+ sfinfo->value = grant;
+ }
+
return OK;
}
endpoint_t new_granter, *src, *dst;
struct proc *granter_p;
int r;
- u32_t flags;
+ struct cp_sfinfo sfinfo;
#if PERF_USE_COW_SAFECOPY
vir_bytes size;
#endif
/* Verify permission exists. */
if((r=verify_grant(granter, grantee, grantid, bytes, access,
- g_offset, &v_offset, &new_granter, &flags)) != OK) {
+ g_offset, &v_offset, &new_granter, &sfinfo)) != OK) {
if(r == ENOTREADY) return r;
printf(
"grant %d verify to copy %d->%d by %d failed: err %d\n",
}
/* Do the regular copy. */
- if(flags & CPF_TRY) {
- int r;
- /* Try copy without transparently faulting in pages. */
+ if (sfinfo.try) {
+ /*
+ * Try copying without transparently faulting in pages.
+ * TODO: while CPF_TRY is meant to protect against deadlocks on
+ * memory-mapped files in file systems, it seems that this case
+ * triggers faults a whole lot more often, resulting in extra
+ * overhead due to retried file system operations. It might be
+ * a good idea to go through VM even in this case, and have VM
+ * fail (only) if the affected page belongs to a file mapping.
+ */
r = virtual_copy(&v_src, &v_dst, bytes);
- if(r == EFAULT_SRC || r == EFAULT_DST) return EFAULT;
+ if (r == EFAULT_SRC || r == EFAULT_DST) {
+ /*
+ * Mark the magic grant as having experienced a soft
+ * fault during its lifetime. The exact value does not
+ * matter, but we use the grant ID (including its
+ * sequence number) as a form of protection in the
+ * light of CPU concurrency.
+ */
+ r = data_copy(KERNEL, (vir_bytes)&sfinfo.value,
+ sfinfo.endpt, sfinfo.addr, sizeof(sfinfo.value));
+ /*
+ * Failure means the creator of the magic grant messed
+ * up, which can only be unintentional, so report..
+ */
+ if (r != OK)
+ printf("Kernel: writing soft fault marker %d "
+ "into %d at 0x%lx failed (%d)\n",
+ sfinfo.value, sfinfo.endpt, sfinfo.addr,
+ r);
+
+ return EFAULT;
+ }
return r;
}
return virtual_copy_vmcheck(caller, &v_src, &v_dst, bytes);
grants[g].cp_u.cp_direct.cp_who_to = who_to;
grants[g].cp_u.cp_direct.cp_start = addr;
grants[g].cp_u.cp_direct.cp_len = bytes;
+ grants[g].cp_faulted = GRANT_INVALID;
__insn_barrier();
grants[g].cp_flags = access | CPF_DIRECT | CPF_USED | CPF_VALID;
grants[g].cp_u.cp_indirect.cp_who_to = who_to;
grants[g].cp_u.cp_indirect.cp_who_from = who_from;
grants[g].cp_u.cp_indirect.cp_grant = gr;
+ grants[g].cp_faulted = GRANT_INVALID;
__insn_barrier();
grants[g].cp_flags = CPF_USED | CPF_INDIRECT | CPF_VALID;
grants[g].cp_u.cp_magic.cp_who_from = who_from;
grants[g].cp_u.cp_magic.cp_start = addr;
grants[g].cp_u.cp_magic.cp_len = bytes;
+ grants[g].cp_faulted = GRANT_INVALID;
__insn_barrier();
grants[g].cp_flags = CPF_USED | CPF_MAGIC | CPF_VALID | access;
return GRANT_ID(g, grants[g].cp_seq);
}
+/*
+ * Revoke previously granted access, identified by grant ID. Return -1 on
+ * error, with errno set as appropriate. Return 0 on success, with one
+ * exception: return GRANT_FAULTED (1) if a grant was created with CPF_TRY and
+ * during its lifetime, a copy from or to the grant experienced a soft fault.
+ */
int
cpf_revoke(cp_grant_id_t grant)
{
-/* Revoke previously granted access, identified by grant id. */
- int g;
+ int r, g;
GID_CHECK_USED(grant);
g = GRANT_IDX(grant);
- /* Make grant invalid by setting flags to 0, clearing CPF_USED.
+ /*
+ * If a safecopy action on a (direct or magic) grant with the CPF_TRY
+ * flag failed on a soft fault, the kernel will have set the cp_faulted
+ * field to the grant identifier. Here, we test this and return
+ * GRANT_FAULTED (1) on a match. Comparing the full grant ID (which
+ * includes the sequence number) guards against stale marks left by a
+ * previous occupant of this slot.
+ */
+ r = ((grants[g].cp_flags & CPF_TRY) &&
+ grants[g].cp_faulted == grant) ? GRANT_FAULTED : 0;
+
+ /*
+ * Make grant invalid by setting flags to 0, clearing CPF_USED.
* This invalidates the grant.
*/
grants[g].cp_flags = 0;
grants[g].cp_u.cp_free.cp_next = freelist;
freelist = g;
- return r;
}
/*
assert(self->w_sendrec == NULL);
- return(reqmp->m_type);
+ r = reqmp->m_type;
+ if (r == ERESTART) /* ERESTART is used internally, so make sure it is.. */
+ r = EIO; /* ..not delivered as a result from a file system. */
+ return(r);
}
/*===========================================================================*
* it again now that I/O is done.
*/
if (GRANT_VALID(rfp->fp_grant)) {
- if(cpf_revoke(rfp->fp_grant)) {
+ if(cpf_revoke(rfp->fp_grant) == -1) {
panic("VFS: revoke failed for grant: %d",
rfp->fp_grant);
}
/* Send/rec request */
r = fs_sendrec(fs_e, &m);
- cpf_revoke(grant_id);
+
+ if (cpf_revoke(grant_id) == GRANT_FAULTED) return(ERESTART);
+
if (r != OK) return(r);
/* Fill in response structure */
r = req_breadwrite_actual(fs_e, user_e, dev, pos, num_of_bytes,
user_addr, rw_flag, new_pos, cum_iop, CPF_TRY);
- if(r == EFAULT) {
+ if (r == ERESTART) {
if((r=vm_vfs_procctl_handlemem(user_e, user_addr, num_of_bytes,
rw_flag == READING)) != OK) {
return r;
}
r = fs_sendrec(fs_e, &m);
- cpf_revoke(grant_id);
+
+ if (cpf_revoke(grant_id) == GRANT_FAULTED) return(ERESTART);
if (r == OK) {
*new_pos = m.m_fs_vfs_getdents.seek_pos;
r = req_getdents_actual(fs_e, inode_nr, pos, buf, size, new_pos,
direct, CPF_TRY);
- if(r == EFAULT && !direct) {
+ if (r == ERESTART) {
+ assert(!direct);
+
if((r=vm_vfs_procctl_handlemem(who_e, buf, size, 1)) != OK) {
return r;
}
/* Send/rec request */
r = fs_sendrec(fs_e, &m);
- cpf_revoke(grant_id);
+
+ if (cpf_revoke(grant_id) == GRANT_FAULTED) return(ERESTART);
if (r == OK) r = m.m_fs_vfs_rdlink.nbytes;
r = req_rdlink_actual(fs_e, inode_nr, proc_e, buf, len, direct,
CPF_TRY);
- if(r == EFAULT && !direct) {
+ if (r == ERESTART) {
+ assert(!direct);
+
if((r=vm_vfs_procctl_handlemem(proc_e, buf, len, 1)) != OK) {
return r;
}
/* Send/rec request */
r = fs_sendrec(fs_e, &m);
- cpf_revoke(grant_id);
+
+ if (cpf_revoke(grant_id) == GRANT_FAULTED) return(ERESTART);
if (r == OK) {
/* Fill in response structure */
r = req_readwrite_actual(fs_e, inode_nr, pos, rw_flag, user_e,
user_addr, num_of_bytes, new_posp, cum_iop, CPF_TRY);
- if(r == EFAULT) {
- if((r=vm_vfs_procctl_handlemem(user_e, (vir_bytes) user_addr, num_of_bytes,
- rw_flag == READING)) != OK) {
+ if (r == ERESTART) {
+ if ((r=vm_vfs_procctl_handlemem(user_e, (vir_bytes) user_addr,
+ num_of_bytes, rw_flag == READING)) != OK) {
return r;
}
/* Send/rec request */
r = fs_sendrec(fs_e, &m);
+
cpf_revoke(gid_name);
- cpf_revoke(gid_buf);
+ if (cpf_revoke(gid_buf) == GRANT_FAULTED) return(ERESTART);
return(r);
}
r = req_slink_actual(fs_e, inode_nr, lastc, proc_e, path_addr,
path_length, uid, gid, CPF_TRY);
- if(r == EFAULT) {
+ if (r == ERESTART) {
if((r=vm_vfs_procctl_handlemem(proc_e, (vir_bytes) path_addr,
path_length, 0)) != OK) {
return r;
/* Send/rec request */
r = fs_sendrec(fs_e, &m);
- cpf_revoke(grant_id);
+
+ if (cpf_revoke(grant_id) == GRANT_FAULTED) return(ERESTART);
return(r);
}
r = req_stat_actual(fs_e, inode_nr, proc_e, buf, CPF_TRY);
- if(r == EFAULT) {
+ if (r == ERESTART) {
if((r=vm_vfs_procctl_handlemem(proc_e, (vir_bytes) buf,
sizeof(struct stat), 1)) != OK) {
return r;
r = sendrec_driver(&m, exp, res);
- if (cpf_revoke(grant) != OK)
+ if (cpf_revoke(grant) == -1)
panic("unable to revoke grant");
if (r != RESULT_OK)
for (i = 0; i < nr_req; i++) {
iovec[i].iov_size = iov_s[i].iov_size;
- if (cpf_revoke(iov_s[i].iov_grant) != OK)
+ if (cpf_revoke(iov_s[i].iov_grant) == -1)
panic("unable to revoke grant");
}
}
r = sendrec_driver(&m, exp, res);
- if (cpf_revoke(grant) != OK)
+ if (cpf_revoke(grant) == -1)
panic("unable to revoke grant");
return r;
close(fd);
}
+/*
+ * Test that soft faults during file system I/O do not cause functions to
+ * return partial I/O results.
+ *
+ * We refer to the faults that are caused internally within the operating
+ * system as a result of the deadlock mitigation described at the top of this
+ * file, as a particular class of "soft faults". Such soft faults may occur in
+ * the middle of an I/O operation, and general I/O semantics dictate that upon
+ * partial success, the partial success is returned (and *not* an error). As a
+ * result, these soft faults, if not handled as special cases, may cause even
+ * common file system operations such as read(2) on a regular file to return
+ * fewer bytes than requested. Such unexpected short reads are typically not
+ * handled well by userland, and the OS must prevent them from occurring if it
+ * can. Note that read(2) is the most problematic, but certainly not the only,
+ * case where this problem can occur.
+ *
+ * Unfortunately, several file system services are not following the proper
+ * general I/O semantics - and this includes MFS. Therefore, for now, we have
+ * to test this case using block device I/O, which does do the right thing.
+ * In this test we hope that the root file system is mounted on a block device
+ * usable for (read-only!) testing purposes.
+ */
+static void
+softfault_partial(void)
+{
+ struct statvfs stf;
+ struct stat st;
+ char *buf, *buf2;
+ ssize_t size;
+ int fd;
+
+ if (statvfs("/", &stf) != 0) e(0);
+
+ /*
+ * If the root file system is not mounted off a block device, or if we
+ * cannot open that device ourselves, simply skip this subtest.
+ */
+ if (stat(stf.f_mntfromname, &st) != 0 || !S_ISBLK(st.st_mode))
+ return; /* skip subtest */
+
+ if ((fd = open(stf.f_mntfromname, O_RDONLY)) == -1)
+ return; /* skip subtest */
+
+ /*
+ * See if we can read in the first two full blocks, or two pages worth
+ * of data, whichever is larger. If that fails, there is no point in
+ * continuing the test.
+ */
+ size = MAX(stf.f_bsize, PAGE_SIZE) * 2;
+
+ /*
+ * The buffers must be mapped writable (PROT_WRITE): read(2) stores
+ * into them, and we also store into buf2 directly below.
+ */
+ if ((buf = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE | MAP_PREALLOC, -1, 0)) == MAP_FAILED) e(0);
+
+ if (read(fd, buf, size) != size) {
+ munmap(buf, size);
+ close(fd);
+ return; /* skip subtest */
+ }
+
+ lseek(fd, 0, SEEK_SET);
+
+ /*
+ * Now attempt a read to a partially faulted-in buffer. The first time
+ * around, the I/O transfer will generate a fault and return partial
+ * success. In that case, the entire I/O transfer should be retried
+ * after faulting in the missing page(s), thus resulting in the read
+ * succeeding in full.
+ */
+ if ((buf2 = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE, -1, 0)) == MAP_FAILED) e(0);
+ buf2[0] = '\0'; /* fault in the first page */
+
+ if (read(fd, buf2, size) != size) e(0);
+
+ /* The result should be correct, too. */
+ if (memcmp(buf, buf2, size)) e(0);
+
+ /* Clean up. */
+ munmap(buf2, size);
+ munmap(buf, size);
+
+ close(fd);
+}
+
int
main(int argc, char *argv[])
{
test_memory_types_vs_operations();
+ softfault_partial();
+
makefiles(MAXFILES);
cachequiet(!bigflag);