-#ifndef __MINIX_BDEV_H
-#define __MINIX_BDEV_H
+#ifndef _MINIX_BDEV_H
+#define _MINIX_BDEV_H
+/* Common API. */
extern void bdev_driver(dev_t dev, char *label);
+/* Synchronous API. */
extern int bdev_open(dev_t dev, int access);
extern int bdev_close(dev_t dev);
int flags);
extern int bdev_ioctl(dev_t dev, int request, void *buf);
-#endif /* __MINIX_BDEV_H */
+/* Asynchronous API.
+ * The request functions below return a bdev_id_t identifying the submitted
+ * call, or an error code (e.g. ENOMEM) if the request could not be issued.
+ * When a call completes, its callback is invoked with the device, the call ID,
+ * the caller-supplied parameter, and the result of the request.
+ * NOTE(review): callers presumably tell IDs and error codes apart by sign;
+ * confirm that error codes are negative in this environment.
+ */
+typedef int bdev_id_t;
+typedef void *bdev_param_t;
+
+typedef void (*bdev_callback_t)(dev_t dev, bdev_id_t id, bdev_param_t param,
+ int result);
+
+extern void bdev_flush_asyn(dev_t dev);
+
+extern bdev_id_t bdev_read_asyn(dev_t dev, u64_t pos, char *buf, size_t count,
+ int flags, bdev_callback_t callback, bdev_param_t param);
+extern bdev_id_t bdev_write_asyn(dev_t dev, u64_t pos, char *buf, size_t count,
+ int flags, bdev_callback_t callback, bdev_param_t param);
+extern bdev_id_t bdev_gather_asyn(dev_t dev, u64_t pos, iovec_t *vec,
+ int count, int flags, bdev_callback_t callback, bdev_param_t param);
+extern bdev_id_t bdev_scatter_asyn(dev_t dev, u64_t pos, iovec_t *vec,
+ int count, int flags, bdev_callback_t callback, bdev_param_t param);
+extern bdev_id_t bdev_ioctl_asyn(dev_t dev, int request, void *buf,
+ bdev_callback_t callback, bdev_param_t param);
+
+extern int bdev_wait_asyn(bdev_id_t id);
+
+extern void bdev_reply_asyn(message *m);
+
+#endif /* _MINIX_BDEV_H */
LIB= bdev
-SRCS= bdev.c ipc.c driver.c
+SRCS= bdev.c driver.c call.c ipc.c minor.c
.include <bsd.lib.mk>
--- /dev/null
+Development notes regarding libbdev, by David van Moolenbroek.
+
+
+GENERAL MODEL
+
+This library is designed mainly for use by file servers. It essentially covers
+two use cases: 1) use of the block device that contains the file system itself,
+and 2) use of any block device for raw block I/O (on unmounted file systems)
+performed by the root file server. In the first case, the file server is
+responsible for opening and closing the block device, and recovery from a
+driver restart involves reopening those minor devices. Regular file systems
+should have one or at most two (for a separate journal) block devices open at
+the same time, which is why NR_OPEN_DEVS is set to a value that is quite low.
+In the second case, VFS is responsible for opening and closing the block device
+(and performing IOCTLs), as well as reopening the block device on a driver
+restart -- the root file server only gets raw I/O (and flush) requests.
+
+At this time, libbdev considers only clean crashes (a crash-only model), and
+does not support recovery from behavioral errors. Protocol errors are passed to
+the user process, and generally do not have an effect on the overall state of
+the library.
+
+
+RETRY MODEL
+
+The philosophy for recovering from driver restarts in libbdev can be formulated
+as follows: we want to tolerate an unlimited number of driver restarts over a
+long time, but we do not want to keep retrying individual requests across
+driver restarts. As such, we do not keep track of driver restarts on a per-
+driver basis, because that would mean we put a hard limit on the number of
+restarts for that driver in total. Instead, there are two limits: a driver
+restart limit that is kept on a per-request basis, failing only that request
+when the limit is reached, and a driver restart limit that is kept during
+recovery, limiting the number of restarts and eventually giving up on the
+entire driver when even the recovery keeps failing (as no progress is made in
+that case).
+
+Each transfer request also has a transfer retry count. The assumption here is
+that when a transfer request returns EIO, it can be retried and possibly
+succeed upon repetition. The driver restart and transfer retry counts are
+tracked independently and thus the first to hit the limit will fail the
+request. The behavior should be the same for synchronous and asynchronous
+requests in this respect.
+
+It could happen that a new driver gets loaded after we have decided that the
+current driver is unusable. This could be due to a race condition (VFS sends a
+new-driver request after we've given up) or due to user interaction (the user
+loads a replacement driver). The latter case may occur legitimately with raw
+I/O on the root file server, so we must not mark the driver as unusable
+forever. On the other hand, in the former case, we must not continue to send
+I/O without first reopening the minor devices. For this reason, we do not clean
+up the record of the minor devices when we mark a driver as unusable.
/* libbdev - block device interfacing library, by D.C. van Moolenbroek */
-/* This is a preliminary, bare-essentials-only version of this library. */
-
#include <minix/drivers.h>
#include <minix/bdev.h>
#include <minix/ioctl.h>
#include <assert.h>
+#include "const.h"
+#include "type.h"
#include "proto.h"
void bdev_driver(dev_t dev, char *label)
bdev_update(dev, label);
}
+static int bdev_retry(int *driver_tries, int *transfer_tries, int *result)
+{
+/* Return TRUE iff the call result implies that we should retry the operation.
+ * Both counters are owned by the caller and are incremented here as a side
+ * effect. 'driver_tries' must be non-NULL; 'transfer_tries' may be NULL for
+ * requests that are not transfers. An ERESTART result that exhausts the
+ * driver restart limit is replaced by EDEADSRCDST in '*result'; any other
+ * result is left unchanged.
+ */
+
+ switch (*result) {
+ case ERESTART:
+ /* We get this error internally if the driver has restarted and the
+ * current operation may now go through. Check the retry count for
+ * driver restarts first, as we don't want to keep trying forever.
+ */
+ if (++*driver_tries < DRIVER_TRIES)
+ return TRUE;
+
+ *result = EDEADSRCDST;
+
+ break;
+
+ case EIO:
+ /* The 'transfer_tries' pointer is non-NULL if this was a transfer
+ * request. If we get back an I/O failure, keep retrying the request
+ * until we hit the transfer retry limit. Once the limit is hit, the
+ * EIO result is passed on to the caller as is.
+ */
+ if (transfer_tries != NULL && ++*transfer_tries < TRANSFER_TRIES)
+ return TRUE;
+
+ break;
+ }
+
+ return FALSE;
+}
+
static int bdev_opcl(int req, dev_t dev, int access)
{
/* Open or close the given minor device.
+ * The request message is rebuilt and resent for as long as bdev_retry()
+ * indicates that a retry is warranted. No transfer retry counter is passed,
+ * so only a driver restart (ERESTART) leads to a retry here. Returns the
+ * final result of the request.
*/
message m;
+ int r, driver_tries = 0;
- memset(&m, 0, sizeof(m));
- m.m_type = req;
- m.BDEV_MINOR = minor(dev);
- m.BDEV_ACCESS = access;
+ do {
+ memset(&m, 0, sizeof(m));
+ m.m_type = req;
+ m.BDEV_MINOR = minor(dev);
+ m.BDEV_ACCESS = access;
- return bdev_sendrec(dev, &m);
+ r = bdev_sendrec(dev, &m);
+ } while (bdev_retry(&driver_tries, NULL, &r));
+
+ return r;
}
int bdev_open(dev_t dev, int access)
/* Open the given minor device.
* File system usage note: typically called from mount, after bdev_driver.
*/
+ int r;
+
+ r = bdev_opcl(BDEV_OPEN, dev, access);
- return bdev_opcl(BDEV_OPEN, dev, access);
+ if (r == OK)
+ bdev_minor_add(dev, access);
+
+ return r;
}
int bdev_close(dev_t dev)
/* Close the given minor device.
* File system usage note: typically called from unmount.
*/
+ int r;
+
+ bdev_flush_asyn(dev);
+
+ r = bdev_opcl(BDEV_CLOSE, dev, 0);
- return bdev_opcl(BDEV_CLOSE, dev, 0);
+ if (r == OK)
+ bdev_minor_del(dev);
+
+ return r;
}
static int bdev_rdwt_setup(int req, dev_t dev, u64_t pos, char *buf,
return OK;
}
-static void bdev_rdwt_cleanup(message *m)
+static void bdev_rdwt_cleanup(const message *m)
{
/* Clean up a single-buffer read/write request.
*/
static ssize_t bdev_rdwt(int req, dev_t dev, u64_t pos, char *buf,
size_t count, int flags)
{
-/* Perform a read or write call using a single buffer.
+/* Perform a synchronous read or write call using a single buffer.
+ * Setup, send and cleanup are repeated as a unit for every retry that
+ * bdev_retry() allows, with separate counters for driver restarts and for
+ * transfer (EIO) failures. If setup fails, the loop is left right away and
+ * the setup error is returned without sending anything.
*/
message m;
- int r;
+ int r, driver_tries = 0, transfer_tries = 0;
- if ((r = bdev_rdwt_setup(req, dev, pos, buf, count, flags, &m)) != OK)
- return r;
+ do {
+ if ((r = bdev_rdwt_setup(req, dev, pos, buf, count, flags, &m)) != OK)
+ break;
- r = bdev_sendrec(dev, &m);
+ r = bdev_sendrec(dev, &m);
- bdev_rdwt_cleanup(&m);
+ bdev_rdwt_cleanup(&m);
+ } while (bdev_retry(&driver_tries, &transfer_tries, &r));
return r;
}
return OK;
}
-static void bdev_vrdwt_cleanup(message *m, iovec_s_t *gvec)
+static void bdev_vrdwt_cleanup(const message *m, iovec_s_t *gvec)
{
/* Clean up a vectored read/write request.
*/
static ssize_t bdev_vrdwt(int req, dev_t dev, u64_t pos, iovec_t *vec,
int count, int flags)
{
-/* Perform a read or write call using a vector of buffers.
+/* Perform a synchronous read or write call using a vector of buffers.
+ * Setup, send and cleanup are repeated as a unit for every retry that
+ * bdev_retry() allows, with separate counters for driver restarts and for
+ * transfer (EIO) failures. If setup fails, the loop is left right away and
+ * the setup error is returned without sending anything. The grant vector
+ * lives on the stack and holds at most NR_IOREQS elements.
*/
iovec_s_t gvec[NR_IOREQS];
message m;
- int r;
+ int r, driver_tries = 0, transfer_tries = 0;
- if ((r = bdev_vrdwt_setup(req, dev, pos, vec, count, flags, &m, gvec)) != OK)
- return r;
+ do {
+ if ((r = bdev_vrdwt_setup(req, dev, pos, vec, count, flags, &m,
+ gvec)) != OK)
+ break;
- r = bdev_sendrec(dev, &m);
+ r = bdev_sendrec(dev, &m);
- bdev_vrdwt_cleanup(&m, gvec);
+ bdev_vrdwt_cleanup(&m, gvec);
+ } while (bdev_retry(&driver_tries, &transfer_tries, &r));
return r;
}
ssize_t bdev_read(dev_t dev, u64_t pos, char *buf, size_t count, int flags)
{
-/* Perform a read call into a single buffer.
+/* Perform a synchronous read call into a single buffer.
*/
return bdev_rdwt(BDEV_READ, dev, pos, buf, count, flags);
ssize_t bdev_write(dev_t dev, u64_t pos, char *buf, size_t count, int flags)
{
-/* Perform a write call from a single buffer.
+/* Perform a synchronous write call from a single buffer.
*/
return bdev_rdwt(BDEV_WRITE, dev, pos, buf, count, flags);
ssize_t bdev_gather(dev_t dev, u64_t pos, iovec_t *vec, int count, int flags)
{
-/* Perform a read call into a vector of buffers.
+/* Perform a synchronous read call into a vector of buffers.
*/
return bdev_vrdwt(BDEV_GATHER, dev, pos, vec, count, flags);
ssize_t bdev_scatter(dev_t dev, u64_t pos, iovec_t *vec, int count, int flags)
{
-/* Perform a write call from a vector of buffers.
+/* Perform a synchronous write call from a vector of buffers.
*/
return bdev_vrdwt(BDEV_SCATTER, dev, pos, vec, count, flags);
return OK;
}
-static void bdev_ioctl_cleanup(message *m)
+static void bdev_ioctl_cleanup(const message *m)
{
/* Clean up an I/O control request.
*/
int bdev_ioctl(dev_t dev, int request, void *buf)
{
-/* Perform an I/O control request.
+/* Perform a synchronous I/O control request.
+ * Setup, send and cleanup are repeated as a unit for every retry that
+ * bdev_retry() allows. No transfer retry counter is passed, so an EIO result
+ * is returned to the caller without retrying. If setup fails, the setup error
+ * is returned without sending anything.
*/
message m;
+ int r, driver_tries = 0;
+
+ do {
+ if ((r = bdev_ioctl_setup(dev, request, buf, &m)) != OK)
+ break;
+
+ r = bdev_sendrec(dev, &m);
+
+ bdev_ioctl_cleanup(&m);
+ } while (bdev_retry(&driver_tries, NULL, &r));
+
+ return r;
+}
+
+void bdev_flush_asyn(dev_t dev)
+{
+/* Flush all ongoing asynchronous requests to the given minor device. This
+ * involves blocking until all I/O for it has completed.
+ * File system usage note: typically called from flush.
+ * Each iteration waits for the first call still registered for the device;
+ * the result of the wait is deliberately ignored, as the outcome of each call
+ * is reported through its callback.
+ * NOTE(review): if bdev_wait_asyn() returns an error without the call having
+ * been freed, the same call is found again on the next iteration; confirm
+ * that this cannot turn into a busy loop on a persistent receive failure.
+ */
+ bdev_call_t *call;
+
+ while ((call = bdev_call_find(dev)) != NULL)
+ (void) bdev_wait_asyn(call->id);
+}
+
+static bdev_id_t bdev_rdwt_asyn(int req, dev_t dev, u64_t pos, char *buf,
+ size_t count, int flags, bdev_callback_t callback, bdev_param_t param)
+{
+/* Perform an asynchronous read or write call using a single buffer.
+ * Returns the ID of the new call, or an error code if no call structure could
+ * be allocated (ENOMEM) or if setting up or sending the request failed. In
+ * the failure cases the call structure is released again and the callback is
+ * never invoked. The buffer address and size are saved in the call structure;
+ * bdev_restart_asyn() uses the saved address to recreate the request after a
+ * driver restart.
+ */
+ bdev_call_t *call;
int r;
- if ((r = bdev_ioctl_setup(dev, request, buf, &m)) != OK)
+ if ((call = bdev_call_alloc(1)) == NULL)
+ return ENOMEM;
+
+ if ((r = bdev_rdwt_setup(req, dev, pos, buf, count, flags, &call->msg)) !=
+ OK) {
+ bdev_call_free(call);
+
return r;
+ }
- r = bdev_sendrec(dev, &m);
+ if ((r = bdev_senda(dev, &call->msg, call->id)) != OK) {
+ bdev_rdwt_cleanup(&call->msg);
- bdev_ioctl_cleanup(&m);
+ bdev_call_free(call);
- return r;
+ return r;
+ }
+
+ call->dev = dev;
+ call->callback = callback;
+ call->param = param;
+ call->driver_tries = 0;
+ call->transfer_tries = 0;
+ call->vec[0].iov_addr = (vir_bytes) buf;
+ call->vec[0].iov_size = count;
+
+ return call->id;
+}
+
+static bdev_id_t bdev_vrdwt_asyn(int req, dev_t dev, u64_t pos, iovec_t *vec,
+ int count, int flags, bdev_callback_t callback, bdev_param_t param)
+{
+/* Perform an asynchronous read or write call using a vector of buffers.
+ * Returns the ID of the new call, or an error code if no call structure could
+ * be allocated (ENOMEM) or if setting up or sending the request failed. In
+ * the failure cases the call structure is released again and the callback is
+ * never invoked. A copy of the caller's vector is kept in the call structure
+ * so that bdev_restart_asyn() can recreate the request after a driver
+ * restart; the grant vector lives in the call structure as well.
+ */
+ bdev_call_t *call;
+ int r;
+
+ if ((call = bdev_call_alloc(count)) == NULL)
+ return ENOMEM;
+
+ if ((r = bdev_vrdwt_setup(req, dev, pos, vec, count, flags, &call->msg,
+ call->gvec)) != OK) {
+ bdev_call_free(call);
+
+ return r;
+ }
+
+ if ((r = bdev_senda(dev, &call->msg, call->id)) != OK) {
+ bdev_vrdwt_cleanup(&call->msg, call->gvec);
+
+ bdev_call_free(call);
+
+ return r;
+ }
+
+ call->dev = dev;
+ call->callback = callback;
+ call->param = param;
+ call->driver_tries = 0;
+ call->transfer_tries = 0;
+ memcpy(call->vec, vec, sizeof(vec[0]) * count);
+
+ return call->id;
+}
+
+bdev_id_t bdev_read_asyn(dev_t dev, u64_t pos, char *buf, size_t count,
+ int flags, bdev_callback_t callback, bdev_param_t param)
+{
+/* Perform an asynchronous read call into a single buffer.
+ * Returns the ID of the submitted call, or an error code if the request could
+ * not be issued. The callback is invoked once the call has completed.
+ */
+
+ return bdev_rdwt_asyn(BDEV_READ, dev, pos, buf, count, flags, callback,
+ param);
+}
+
+bdev_id_t bdev_write_asyn(dev_t dev, u64_t pos, char *buf, size_t count,
+ int flags, bdev_callback_t callback, bdev_param_t param)
+{
+/* Perform an asynchronous write call from a single buffer.
+ * Returns the ID of the submitted call, or an error code if the request could
+ * not be issued. The callback is invoked once the call has completed.
+ */
+
+ return bdev_rdwt_asyn(BDEV_WRITE, dev, pos, buf, count, flags, callback,
+ param);
+}
+
+bdev_id_t bdev_gather_asyn(dev_t dev, u64_t pos, iovec_t *vec, int count,
+ int flags, bdev_callback_t callback, bdev_param_t param)
+{
+/* Perform an asynchronous read call into a vector of buffers.
+ * 'count' is the number of elements in 'vec'. Returns the ID of the submitted
+ * call, or an error code if the request could not be issued. The callback is
+ * invoked once the call has completed.
+ */
+
+ return bdev_vrdwt_asyn(BDEV_GATHER, dev, pos, vec, count, flags, callback,
+ param);
+}
+
+bdev_id_t bdev_scatter_asyn(dev_t dev, u64_t pos, iovec_t *vec, int count,
+ int flags, bdev_callback_t callback, bdev_param_t param)
+{
+/* Perform an asynchronous write call from a vector of buffers.
+ * 'count' is the number of elements in 'vec'. Returns the ID of the submitted
+ * call, or an error code if the request could not be issued. The callback is
+ * invoked once the call has completed.
+ */
+
+ return bdev_vrdwt_asyn(BDEV_SCATTER, dev, pos, vec, count, flags, callback,
+ param);
+}
+
+bdev_id_t bdev_ioctl_asyn(dev_t dev, int request, void *buf,
+ bdev_callback_t callback, bdev_param_t param)
+{
+/* Perform an asynchronous I/O control request.
+ * Returns the ID of the new call, or an error code if no call structure could
+ * be allocated (ENOMEM) or if setting up or sending the request failed. In
+ * the failure cases the call structure is released again and the callback is
+ * never invoked. The buffer address is saved in vec[0] so that
+ * bdev_restart_asyn() can recreate the request after a driver restart. The
+ * 'transfer_tries' field and vec[0].iov_size are left unset here;
+ * bdev_callback_asyn() consults the transfer retry count for transfer
+ * requests only.
+ */
+ bdev_call_t *call;
+ int r;
+
+ if ((call = bdev_call_alloc(1)) == NULL)
+ return ENOMEM;
+
+ if ((r = bdev_ioctl_setup(dev, request, buf, &call->msg)) != OK) {
+ bdev_call_free(call);
+
+ return r;
+ }
+
+ if ((r = bdev_senda(dev, &call->msg, call->id)) != OK) {
+ bdev_ioctl_cleanup(&call->msg);
+
+ bdev_call_free(call);
+
+ return r;
+ }
+
+ call->dev = dev;
+ call->callback = callback;
+ call->param = param;
+ call->driver_tries = 0;
+ call->vec[0].iov_addr = (vir_bytes) buf;
+
+ return call->id;
+}
+
+void bdev_callback_asyn(bdev_call_t *call, int result)
+{
+/* Perform the callback for an asynchronous request, with the given result.
+ * Clean up the call structure afterwards.
+ * The one exception is a transfer request that failed with EIO and has not
+ * yet hit its transfer retry limit: such a request is resent, and if the
+ * resend succeeds this function returns without calling back or freeing
+ * anything. After this function has invoked the callback, 'call' has been
+ * freed and must no longer be used by the caller.
+ */
+
+ /* If this was a transfer request and the result is EIO, we may want to retry
+ * the request first. If resending fails, the error from the resend attempt
+ * replaces EIO as the result reported to the callback. Requests of other
+ * types skip this step.
+ */
+ switch (call->msg.m_type) {
+ case BDEV_READ:
+ case BDEV_WRITE:
+ case BDEV_GATHER:
+ case BDEV_SCATTER:
+ if (result == EIO && ++call->transfer_tries < TRANSFER_TRIES) {
+ result = bdev_senda(call->dev, &call->msg, call->id);
+
+ if (result == OK)
+ return;
+ }
+ }
+
+ /* Clean up. The cleanup routine is selected by the request type stored in
+ * the saved request message.
+ */
+ switch (call->msg.m_type) {
+ case BDEV_READ:
+ case BDEV_WRITE:
+ bdev_rdwt_cleanup(&call->msg);
+
+ break;
+
+ case BDEV_GATHER:
+ case BDEV_SCATTER:
+ bdev_vrdwt_cleanup(&call->msg, call->gvec);
+
+ break;
+
+ case BDEV_IOCTL:
+ bdev_ioctl_cleanup(&call->msg);
+
+ break;
+
+ default:
+ assert(0);
+ }
+
+ /* Call the callback function. */
+ /* FIXME: we assume all reasonable ssize_t values can be stored in an int. */
+ call->callback(call->dev, call->id, call->param, result);
+
+ /* Free up the call structure. */
+ bdev_call_free(call);
+}
+
+int bdev_restart_asyn(bdev_call_t *call)
+{
+/* The driver for the given call has restarted, and may now have a new
+ * endpoint. Recreate and resend the request for the given call.
+ * Returns OK if the request was resent, EDEADSRCDST if the call has hit its
+ * driver restart limit, or the error from recreating or resending the
+ * request. The call structure is never freed here; on failure the caller is
+ * expected to finish the call, e.g. through bdev_callback_asyn().
+ */
+ int type, r = OK;
+
+ /* Update and check the retry limit for driver restarts first. */
+ if (++call->driver_tries >= DRIVER_TRIES)
+ return EDEADSRCDST;
+
+ /* Recreate all grants for the new endpoint. The parameters of the original
+ * request are taken from the saved request message and from the saved
+ * buffer address or vector, after which the message is set up anew.
+ */
+ type = call->msg.m_type;
+
+ switch (type) {
+ case BDEV_READ:
+ case BDEV_WRITE:
+ bdev_rdwt_cleanup(&call->msg);
+
+ r = bdev_rdwt_setup(type, call->dev,
+ make64(call->msg.BDEV_POS_LO, call->msg.BDEV_POS_HI),
+ (char *) call->vec[0].iov_addr, call->msg.BDEV_COUNT,
+ call->msg.BDEV_FLAGS, &call->msg);
+
+ break;
+
+ case BDEV_GATHER:
+ case BDEV_SCATTER:
+ bdev_vrdwt_cleanup(&call->msg, call->gvec);
+
+ r = bdev_vrdwt_setup(type, call->dev,
+ make64(call->msg.BDEV_POS_LO, call->msg.BDEV_POS_HI),
+ call->vec, call->msg.BDEV_COUNT, call->msg.BDEV_FLAGS,
+ &call->msg, call->gvec);
+
+ break;
+
+ case BDEV_IOCTL:
+ bdev_ioctl_cleanup(&call->msg);
+
+ r = bdev_ioctl_setup(call->dev, call->msg.BDEV_REQUEST,
+ (char *) call->vec[0].iov_addr, &call->msg);
+
+ break;
+
+ default:
+ assert(0);
+ }
+
+ if (r != OK)
+ return r;
+
+ /* Try to resend the request. */
+ return bdev_senda(call->dev, &call->msg, call->id);
}
--- /dev/null
+/* libbdev - asynchronous call structure management */
+
+#include <minix/drivers.h>
+#include <minix/bdev.h>
+#include <assert.h>
+
+#include "const.h"
+#include "type.h"
+#include "proto.h"
+
+static bdev_call_t *calls[NR_CALLS]; /* call table indexed by call ID; NULL marks a free slot */
+
+bdev_call_t *bdev_call_alloc(int count)
+{
+/* Allocate a call structure with room for 'count' grant vector elements and
+ * 'count' original vector elements, and register it under the first free call
+ * ID. Return NULL if 'count' is not positive or too large to allocate, if all
+ * NR_CALLS slots are in use, or if memory allocation fails. Apart from the
+ * 'id' and 'vec' fields, the structure is returned uninitialized.
+ */
+  bdev_call_t *call;
+  bdev_id_t id;
+  size_t per_elem, max_count;
+
+  /* The size computation below uses 'count - 1' and 'count' as unsigned
+   * multipliers. A zero or negative count would yield an undersized or
+   * wrapped allocation, and a huge count could overflow the sum, so reject
+   * both before doing anything else.
+   */
+  if (count < 1)
+    return NULL;
+
+  per_elem = sizeof(call->gvec[0]) + sizeof(call->vec[0]);
+  max_count = ((size_t) -1 - sizeof(bdev_call_t)) / per_elem;
+
+  if ((size_t) count > max_count)
+    return NULL;
+
+  /* Find the first free slot; its index becomes the call ID. */
+  for (id = 0; id < NR_CALLS; id++)
+    if (calls[id] == NULL)
+      break;
+
+  if (id == NR_CALLS)
+    return NULL;
+
+  /* The structure already contains one grant vector element, hence the
+   * 'count - 1'. The original vector is stored right behind the grant vector
+   * in the same allocation.
+   */
+  call = malloc(sizeof(bdev_call_t) +
+    sizeof(call->gvec[0]) * (count - 1) +
+    sizeof(call->vec[0]) * count);
+
+  if (call == NULL)
+    return NULL;
+
+  call->id = id;
+  call->vec = (iovec_t *) &call->gvec[count];
+
+  calls[id] = call;
+
+  return call;
+}
+
+void bdev_call_free(bdev_call_t *call)
+{
+/* Free a call structure.
+ * The structure must currently be registered in the call table under its own
+ * ID; its slot is released so that the ID can be reused, and the memory is
+ * returned to the allocator. 'call' must not be used afterwards.
+ */
+
+ assert(calls[call->id] == call);
+
+ calls[call->id] = NULL;
+
+ free(call);
+}
+
+bdev_call_t *bdev_call_get(bdev_id_t id)
+{
+/* Retrieve a call structure by request number.
+ * Returns NULL if the ID is outside the range of the call table, or if no
+ * call is currently registered under it.
+ */
+
+ if (id < 0 || id >= NR_CALLS)
+ return NULL;
+
+ return calls[id];
+}
+
+bdev_call_t *bdev_call_find(dev_t dev)
+{
+/* Find the first asynchronous request for the given device, if any.
+ * The match is on the full device number, and the call table is searched in
+ * order of increasing call ID. Returns NULL if there is no such request.
+ */
+ bdev_id_t id;
+
+ for (id = 0; id < NR_CALLS; id++)
+ if (calls[id] != NULL && calls[id]->dev == dev)
+ return calls[id];
+
+ return NULL;
+}
+
+bdev_call_t *bdev_call_iter_maj(dev_t dev, bdev_call_t *call,
+ bdev_call_t **next)
+{
+/* Iterate over all asynchronous requests for a major device. This function
+ * must be safe even if the returned call structure is freed.
+ * Usage: pass NULL as 'call' on the first invocation, and the previously
+ * returned call on each later invocation, with the same 'next' pointer every
+ * time. Returns the current call, or NULL when the iteration is done. Before
+ * returning, the successor of the current call is stored in '*next', so the
+ * current call is not needed to continue. On later invocations 'call' is
+ * only compared against NULL and never dereferenced.
+ */
+ bdev_id_t id;
+ int major;
+
+ major = major(dev);
+
+ /* If this is the first invocation, find the first match. Otherwise, take the
+ * call we found to be next in the last invocation, which may be NULL.
+ */
+ if (call == NULL) {
+ for (id = 0; id < NR_CALLS; id++)
+ if (calls[id] != NULL && major(calls[id]->dev) == major)
+ break;
+
+ if (id == NR_CALLS)
+ return NULL;
+
+ call = calls[id];
+ } else {
+ if ((call = *next) == NULL)
+ return NULL;
+ }
+
+ /* Look for the next match, if any. */
+ *next = NULL;
+
+ for (id = call->id + 1; id < NR_CALLS; id++) {
+ if (calls[id] != NULL && major(calls[id]->dev) == major) {
+ *next = calls[id];
+
+ break;
+ }
+ }
+
+ return call;
+}
#ifndef _BDEV_CONST_H
#define _BDEV_CONST_H
+#define NR_CALLS 256 /* maximum number of concurrent asynchronous calls */
+
+#define NO_ID (-1) /* request ID used for synchronous requests */
+
#define DS_NR_TRIES 100 /* number of times to check endpoint in DS */
#define DS_DELAY 50000 /* delay time (us) between DS checks */
+#define DRIVER_TRIES 10 /* per request: max tries across driver restarts */
+#define RECOVER_TRIES 2 /* nr of recovery attempts before giving up on a driver */
+#define TRANSFER_TRIES 5 /* per request: max tries for a transfer that yields EIO */
+
+#define NR_OPEN_DEVS 4 /* max number of distinct minor devices open at once */
+
#endif /* _BDEV_CONST_H */
#include <assert.h>
#include "const.h"
+#include "type.h"
#include "proto.h"
static struct {
#include <minix/bdev.h>
#include <assert.h>
+#include "const.h"
+#include "type.h"
#include "proto.h"
static void bdev_cancel(dev_t dev)
/* Recovering the driver for the given device has failed repeatedly. Mark it as
* permanently unusable, and clean up any associated calls and resources.
*/
+ bdev_call_t *call, *next;
- printf("bdev: driver for major %d (endpoint %d) crashed\n",
- major(dev), bdev_driver_get(dev));
+ printf("bdev: giving up on major %d\n", major(dev));
+
+ /* Cancel all pending asynchronous requests. */
+ call = NULL;
+
+ while ((call = bdev_call_iter_maj(dev, call, &next)) != NULL)
+ bdev_callback_asyn(call, EDEADSRCDST);
/* Mark the driver as unusable. */
bdev_driver_clear(dev);
}
+static int bdev_recover(dev_t dev, int update_endpt)
+{
+/* The IPC subsystem has signaled an error communicating to the driver
+ * associated with the given device. Try to recover. If 'update_endpt' is set,
+ * we need to find the new endpoint of the driver first. Return TRUE iff
+ * recovery has been successful.
+ * Recovery is attempted at most RECOVER_TRIES times. It is abandoned early if
+ * there is no usable driver endpoint, or if reopening the minor devices fails
+ * with anything other than EDEADSRCDST. When recovery fails, all pending
+ * asynchronous requests for the major device are canceled and the driver is
+ * marked as unusable, through bdev_cancel().
+ */
+ bdev_call_t *call, *next;
+ endpoint_t endpt;
+ int r, nr_tries;
+
+ printf("bdev: recovering from a driver crash on major %d\n", major(dev));
+
+ for (nr_tries = 0; nr_tries < RECOVER_TRIES; nr_tries++) {
+ /* First update the endpoint, if necessary. */
+ if (update_endpt)
+ (void) bdev_driver_update(dev);
+
+ if ((endpt = bdev_driver_get(dev)) == NONE)
+ break;
+
+ /* If anything goes wrong, update the endpoint again next time. */
+ update_endpt = TRUE;
+
+ /* Reopen all minor devices on the new driver. */
+ if ((r = bdev_minor_reopen(dev)) != OK) {
+ /* If the driver died again, we may give it another try. */
+ if (r == EDEADSRCDST)
+ continue;
+
+ /* If another error occurred, we cannot continue using the
+ * driver as is, but we also cannot force it to restart.
+ */
+ break;
+ }
+
+ /* Resend all asynchronous requests. */
+ call = NULL;
+
+ while ((call = bdev_call_iter_maj(dev, call, &next)) != NULL) {
+ /* It is not strictly necessary that we manage to reissue all
+ * asynchronous requests successfully. We can fail them on an
+ * individual basis here, without affecting the overall
+ * recovery. Note that we will never get new IPC failures here.
+ * Failing a request frees its call structure, which the
+ * iterator is designed to tolerate.
+ */
+ if ((r = bdev_restart_asyn(call)) != OK)
+ bdev_callback_asyn(call, r);
+ }
+
+ /* Recovery seems successful. We can now reissue the current
+ * synchronous request (if any), and continue normal operation.
+ */
+ printf("bdev: recovery successful, new driver is at %d\n", endpt);
+
+ return TRUE;
+ }
+
+ /* Recovery failed repeatedly. Give up on this driver. */
+ bdev_cancel(dev);
+
+ return FALSE;
+}
+
void bdev_update(dev_t dev, char *label)
{
/* Set the endpoint for a driver. Perform recovery if necessary.
/* If updating the driver causes an endpoint change, we need to perform
* recovery, but not update the endpoint yet again.
*/
+ if (old_endpt != NONE && old_endpt != endpt)
+ bdev_recover(dev, FALSE /*update_endpt*/);
+}
+
+int bdev_senda(dev_t dev, const message *m_orig, bdev_id_t id)
+{
+/* Send an asynchronous request for the given device. This function will never
+ * get any new IPC errors sending to the driver. If sending an asynchronous
+ * request fails, we will find out through other ways later.
+ * The caller's message is left untouched: a local copy is stamped with the
+ * given call ID and sent. Returns EDEADSRCDST if there is no usable driver
+ * endpoint for the device, and the result of asynsend() otherwise.
+ */
+ endpoint_t endpt;
+ message m;
+ int r;
+
+ /* If we have no usable driver endpoint, fail instantly. */
+ if ((endpt = bdev_driver_get(dev)) == NONE)
+ return EDEADSRCDST;
+
+ m = *m_orig;
+ m.BDEV_ID = id;
+
+ r = asynsend(endpt, &m);
+
+ if (r != OK)
+ printf("bdev: asynsend to driver (%d) failed (%d)\n", endpt, r);
+
+ return r;
}
int bdev_sendrec(dev_t dev, const message *m_orig)
{
-/* Send a request to the given device, and wait for the reply.
+/* Send a synchronous request for the given device, and wait for the reply.
+ * Return ERESTART if the caller should try to reissue the request.
*/
- static long id = 0;
endpoint_t endpt;
message m;
int r;
/* Send the request and block until we receive a reply. */
m = *m_orig;
- m.BDEV_ID = ++id;
+ m.BDEV_ID = NO_ID;
r = sendrec(endpt, &m);
- /* This version of libbdev does not support recovery. Forget the driver. */
+ /* If communication failed, the driver has died. We assume it will be
+ * restarted soon after, so we attempt recovery. Upon success, we let the
+ * caller reissue the synchronous request.
+ */
if (r == EDEADSRCDST) {
- bdev_cancel(dev);
+ if (!bdev_recover(dev, TRUE /*update_endpt*/))
+ return EDEADSRCDST;
- return EDEADSRCDST;
+ return ERESTART;
}
if (r != OK) {
if (m.m_type != BDEV_REPLY) {
printf("bdev: driver (%d) sent weird response (%d)\n",
endpt, m.m_type);
- return EIO;
+ return EINVAL;
+ }
+
+ /* The protocol contract states that no asynchronous reply can satisfy a
+ * synchronous SENDREC call, so we can never get an asynchronous reply here.
+ */
+ if (m.BDEV_ID != NO_ID) {
+ printf("bdev: driver (%d) sent invalid ID (%ld)\n", endpt, m.BDEV_ID);
+ return EINVAL;
}
- /* ERESTART signifies a driver restart. Again, we do not support this yet. */
+ /* Unless the caller is misusing libbdev, we will only get ERESTART if we
+ * have managed to resend a raw block I/O request to the driver after a
+ * restart, but before VFS has had a chance to reopen the associated device
+ * first. This is highly exceptional, and hard to deal with correctly. We
+ * take the easiest route: sleep for a while so that VFS can reopen the
+ * device, and then resend the request. If the call keeps failing, the caller
+ * will eventually give up.
+ */
if (m.BDEV_STATUS == ERESTART) {
- bdev_cancel(dev);
+ printf("bdev: got ERESTART from driver (%d), sleeping for reopen\n",
+ endpt);
- return EDEADSRCDST;
- }
+ micro_delay(1000);
- if (m.BDEV_ID != id) {
- printf("bdev: driver (%d) sent invalid response (%ld)\n",
- endpt, m.BDEV_ID);
- return EIO;
+ return ERESTART;
}
/* Return the result of our request. */
return m.BDEV_STATUS;
}
+
+static int bdev_receive(dev_t dev, message *m)
+{
+/* Receive one valid message.
+ * Blocks receiving from the driver of the given device. Returns OK once a
+ * message of type BDEV_REPLY has been stored in 'm'. Returns EDEADSRCDST if
+ * there is no usable driver endpoint or if recovery from a driver crash
+ * failed, EINVAL if the driver sent a message that is not a reply, ERESTART
+ * if receiving failed due to a dead driver DRIVER_TRIES times in a row, and
+ * any other IPC error as is. No recovery is attempted after the last of
+ * those DRIVER_TRIES failures.
+ */
+ endpoint_t endpt;
+ int r, nr_tries = 0;
+
+ for (;;) {
+ /* Retrieve and check the driver endpoint on every try, as it will
+ * change with each driver restart.
+ */
+ if ((endpt = bdev_driver_get(dev)) == NONE)
+ return EDEADSRCDST;
+
+ r = sef_receive(endpt, m);
+
+ if (r == EDEADSRCDST) {
+ /* If we reached the maximum number of retries, give up. */
+ if (++nr_tries == DRIVER_TRIES)
+ break;
+
+ /* Attempt recovery. If successful, all asynchronous requests
+ * will have been resent, and we can retry receiving a reply.
+ */
+ if (!bdev_recover(dev, TRUE /*update_endpt*/))
+ return EDEADSRCDST;
+
+ continue;
+ }
+
+ if (r != OK) {
+ printf("bdev: IPC to driver (%d) failed (%d)\n", endpt, r);
+
+ return r;
+ }
+
+ if (m->m_type != BDEV_REPLY) {
+ printf("bdev: driver (%d) sent weird response (%d)\n",
+ endpt, m->m_type);
+ return EINVAL;
+ }
+
+ /* The caller is responsible for checking the ID and status. */
+ return OK;
+ }
+
+ /* All tries failed, even though all recovery attempts succeeded. In this
+ * case, we let the caller recheck whether it wants to keep calling us,
+ * returning ERESTART to indicate we can be called again but did not actually
+ * receive a message.
+ */
+ return ERESTART;
+}
+
+void bdev_reply_asyn(message *m)
+{
+/* A reply has come in from a disk driver.
+ * The message must be of type BDEV_REPLY. The reply is matched to a pending
+ * asynchronous call by the ID it carries. Replies with an unknown ID, and
+ * replies that do not come from the current driver endpoint of the call's
+ * device, are dropped. Otherwise the call is completed with the status from
+ * the reply, which invokes its callback, unless the status is ERESTART, in
+ * which case the request is resent first.
+ */
+ bdev_call_t *call;
+ endpoint_t endpt;
+ bdev_id_t id;
+ int r;
+
+ /* This is a requirement for the caller. */
+ assert(m->m_type == BDEV_REPLY);
+
+ /* Get the corresponding asynchronous call structure. */
+ id = m->BDEV_ID;
+
+ if ((call = bdev_call_get(id)) == NULL) {
+ printf("bdev: driver (%d) replied to unknown request (%ld)\n",
+ m->m_source, m->BDEV_ID);
+ return;
+ }
+
+ /* Make sure the reply was sent from the right endpoint. */
+ endpt = bdev_driver_get(call->dev);
+
+ if (m->m_source != endpt) {
+ /* If the endpoint is NONE, this may be a stray reply. */
+ if (endpt != NONE)
+ printf("bdev: driver (%d) replied to request not sent to it\n",
+ m->m_source);
+ return;
+ }
+
+ /* See the ERESTART comment in bdev_sendrec(). If the request cannot be
+ * resent, the call is completed with the error from the resend attempt.
+ */
+ if (m->BDEV_STATUS == ERESTART) {
+ printf("bdev: got ERESTART from driver (%d), sleeping for reopen\n",
+ endpt);
+
+ micro_delay(1000);
+
+ if ((r = bdev_restart_asyn(call)) != OK)
+ bdev_callback_asyn(call, r);
+
+ return;
+ }
+
+ bdev_callback_asyn(call, m->BDEV_STATUS);
+}
+
+int bdev_wait_asyn(bdev_id_t id)
+{
+/* Wait for an asynchronous request to complete.
+ * Returns ENOENT if no call is registered under the given ID. Otherwise,
+ * replies from the driver of the call's device are received and processed
+ * until the call is no longer registered, at which point OK is returned; the
+ * actual result of the request is delivered through its callback. Replies
+ * that belong to other calls are processed along the way, since each reply
+ * is matched by the ID it carries. If receiving fails with an error other
+ * than ERESTART, that error is returned and the call may still be pending.
+ */
+ bdev_call_t *call;
+ dev_t dev;
+ message m;
+ int r;
+
+ if ((call = bdev_call_get(id)) == NULL)
+ return ENOENT;
+
+ dev = call->dev;
+
+ do {
+ if ((r = bdev_receive(dev, &m)) != OK && r != ERESTART)
+ return r;
+
+ /* Processing the reply will free up the call structure as a side
+ * effect. If we repeatedly get ERESTART, we will repeatedly resend the
+ * asynchronous request, which will then eventually hit the retry limit
+ * and we will break out of the loop.
+ */
+ if (r == OK)
+ bdev_reply_asyn(&m);
+
+ } while (bdev_call_get(id) != NULL);
+
+ return OK;
+}
--- /dev/null
+/* libbdev - tracking and reopening of opened minor devices */
+
+#include <minix/drivers.h>
+#include <minix/bdev.h>
+#include <assert.h>
+
+#include "const.h"
+#include "type.h"
+#include "proto.h"
+
+static struct { /* table of minor devices currently opened through this library */
+ dev_t dev; /* device number, or NO_DEV if this slot is free */
+ int count; /* number of times the device is currently open */
+ int access; /* union of the access bits of all opens */
+} open_dev[NR_OPEN_DEVS] = { { NO_DEV, 0, 0 } }; /* NOTE(review): only element 0 is set to NO_DEV explicitly, the rest are zeroed; presumably NO_DEV is zero - confirm */
+
+int bdev_minor_reopen(dev_t dev)
+{
+/* Reopen all minor devices on a major device. This function duplicates some
+ * code from elsewhere, because in this case we must avoid performing recovery.
+ * FIXME: if reopening fails with a non-IPC error, we should attempt to close
+ * all minors that we did manage to reopen so far, or they might stay open
+ * forever.
+ * A usable driver endpoint must be known for the device. Returns OK if all
+ * open requests succeeded, and otherwise the first error encountered: the
+ * IPC error from sendrec(), EINVAL for a malformed reply, or the failure
+ * status reported by the driver.
+ */
+ endpoint_t endpt;
+ message m;
+ int i, j, r, major;
+
+ major = major(dev);
+ endpt = bdev_driver_get(dev);
+
+ assert(endpt != NONE);
+
+ for (i = 0; i < NR_OPEN_DEVS; i++) {
+ if (major(open_dev[i].dev) != major)
+ continue;
+
+ /* Each minor device may have been opened multiple times. Send an open
+ * request for each time that it was opened before. We could reopen it
+ * just once, but then we'd have to keep a shadow open count as well.
+ */
+ for (j = 0; j < open_dev[i].count; j++) {
+ memset(&m, 0, sizeof(m));
+ m.m_type = BDEV_OPEN;
+ m.BDEV_MINOR = minor(open_dev[i].dev);
+ m.BDEV_ACCESS = open_dev[i].access;
+ m.BDEV_ID = NO_ID;
+
+ if ((r = sendrec(endpt, &m)) != OK) {
+ printf("bdev: IPC to driver (%d) failed (%d)\n",
+ endpt, r);
+ return r;
+ }
+
+ if (m.m_type != BDEV_REPLY) {
+ printf("bdev: driver (%d) sent weird response (%d)\n",
+ endpt, m.m_type);
+ return EINVAL;
+ }
+
+ if (m.BDEV_ID != NO_ID) {
+ printf("bdev: driver (%d) sent invalid ID (%ld)\n",
+ endpt, m.BDEV_ID);
+ return EINVAL;
+ }
+
+ if ((r = m.BDEV_STATUS) != OK) {
+ printf("bdev: driver (%d) failed device reopen (%d)\n",
+ endpt, r);
+ return r;
+ }
+ }
+ }
+
+ return OK;
+}
+
+void bdev_minor_add(dev_t dev, int access)
+{
+/* Increase the reference count of the given minor device.
+ * If the device is already in the table, its open count is incremented and
+ * the new access bits are merged into the recorded ones. Otherwise the
+ * device is entered into the first free slot. If the table is full, a
+ * message is printed and the device is not recorded, so it will not be
+ * reopened after a driver restart.
+ */
+ int i, free = -1;
+
+ for (i = 0; i < NR_OPEN_DEVS; i++) {
+ if (open_dev[i].dev == dev) {
+ open_dev[i].count++;
+ open_dev[i].access |= access;
+
+ return;
+ }
+
+ if (free < 0 && open_dev[i].dev == NO_DEV)
+ free = i;
+ }
+
+ if (free < 0) {
+ printf("bdev: too many open devices, increase NR_OPEN_DEVS\n");
+ return;
+ }
+
+ open_dev[free].dev = dev;
+ open_dev[free].count = 1;
+ open_dev[free].access = access;
+}
+
+void bdev_minor_del(dev_t dev)
+{
+/* Decrease the reference count of the given minor device, if present.
+ * When the count drops to zero, the slot is marked free again. The recorded
+ * access bits are not recomputed when one of several opens goes away. A
+ * device that is not in the table is silently ignored.
+ */
+ int i;
+
+ for (i = 0; i < NR_OPEN_DEVS; i++) {
+ if (open_dev[i].dev == dev) {
+ if (!--open_dev[i].count)
+ open_dev[i].dev = NO_DEV;
+
+ break;
+ }
+ }
+}
#ifndef _BDEV_PROTO_H
#define _BDEV_PROTO_H
+/* bdev.c */
+extern void bdev_callback_asyn(bdev_call_t *call, int result);
+extern int bdev_restart_asyn(bdev_call_t *call);
+
/* driver.c */
extern void bdev_driver_init(void);
extern void bdev_driver_clear(dev_t dev);
extern endpoint_t bdev_driver_get(dev_t dev);
extern endpoint_t bdev_driver_update(dev_t dev);
+/* call.c */
+extern bdev_call_t *bdev_call_alloc(int count);
+extern void bdev_call_free(bdev_call_t *call);
+extern bdev_call_t *bdev_call_get(bdev_id_t id);
+extern bdev_call_t *bdev_call_find(dev_t dev);
+extern bdev_call_t *bdev_call_iter_maj(dev_t dev, bdev_call_t *last,
+ bdev_call_t **next);
+
/* ipc.c */
extern void bdev_update(dev_t dev, char *label);
+extern int bdev_senda(dev_t dev, const message *m_orig, bdev_id_t num);
extern int bdev_sendrec(dev_t dev, const message *m_orig);
+/* minor.c */
+extern int bdev_minor_reopen(dev_t dev);
+extern void bdev_minor_add(dev_t dev, int access);
+extern void bdev_minor_del(dev_t dev);
+
#endif /* _BDEV_PROTO_H */
--- /dev/null
+#ifndef _BDEV_TYPE_H
+#define _BDEV_TYPE_H
+
+typedef struct { /* state of one pending asynchronous call */
+ bdev_id_t id; /* call ID; index into the call table */
+ dev_t dev; /* target device number */
+ message msg; /* request message, kept for resending */
+ bdev_callback_t callback; /* callback function */
+ bdev_param_t param; /* callback parameter */
+ int driver_tries; /* times retried on driver restarts */
+ int transfer_tries; /* times retried on transfer errors */
+ iovec_t *vec; /* original vector, or buffer in vec[0]; stored behind gvec */
+ iovec_s_t gvec[1]; /* grant vector; allocated with room for more elements */
+} bdev_call_t;
+
+#endif /* _BDEV_TYPE_H */