From: David van Moolenbroek
Date: Thu, 21 Jul 2016 20:59:26 +0000 (+0000)
Subject: Add libsockevent: a socket event dispatching library

Add libsockevent: a socket event dispatching library

This library provides an event-based abstraction model and dispatching
facility for socket drivers. Its main goal is to eliminate any and all
need for socket drivers to keep track of pending socket calls.
Additionally, this library takes over responsibility for a number of
other tasks that would otherwise be duplicated between socket drivers,
but in such a way that individual socket drivers retain a large degree
of freedom in terms of API behavior.

The library's main features are:

- suspension, resumption, and cancellation of socket calls;
- an abstraction layer for select(2);
- state tracking of shutdown(2);
- pending (asynchronous) errors and the SO_ERROR socket option;
- listening-socket tracking and the SO_ACCEPTCONN socket option;
- generation of SIGPIPE signals; SO_NOSIGPIPE, MSG_NOSIGNAL;
- send and receive low-watermark tracking, SO_SNDLOWAT, SO_RCVLOWAT;
- send and receive timeout support and SO_SNDTIMEO, SO_RCVTIMEO;
- an abstraction layer for the SO_LINGER socket option;
- tracking of various on/off socket options as well as SO_TYPE;
- a range of pre-checks on socket calls that are required by POSIX.

In order to track per-socket state, the library manages an opaque
"sock" object for each socket. The allocation of such objects is left
entirely to the socket driver. Each sock object has an associated
callback table for calls from libsockevent to the socket driver. The
socket driver can raise events on the sock object in order to flag that
any previously suspended operations of a particular type should be
resumed. The library may defer processing such raised events if
immediate processing could interfere with internal consistency.

The sockevent library is layered on top of libsockdriver, and should be
used by all socket driver implementations if at all possible.
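To make the model concrete, the sketch below shows roughly what a driver built on this library could look like. It is illustrative only: the "mysock" names, the MYSOCK_MAX pool size, and the choice to treat a cleared sock_ops field as the "free" marker are hypothetical; only the sockevent_*() calls and the callback signatures come from this patch and the existing MINIX service APIs.

/*
 * Illustrative sketch only -- not part of this commit.  A minimal socket
 * driver built on libsockevent.  Error codes are negative values here,
 * since system services are built with -D_MINIX_SYSTEM.
 */
#include <minix/drivers.h>
#include <minix/sockdriver.h>
#include <minix/sockevent.h>

struct mysock {
	struct sock msock_sock;	/* libsockevent state; must be first */
	/* ..driver-specific fields (buffers, peer state) go here.. */
};

#define MYSOCK_MAX	64	/* hypothetical fixed-size socket pool */

static struct mysock mysock_array[MYSOCK_MAX];

static void
mysock_free(struct sock * sock __unused)
{
	/*
	 * libsockevent has already cleared the socket's operations table
	 * pointer, which is what mysock_socket() below treats as "free",
	 * so no additional bookkeeping is needed in this sketch.
	 */
}

static const struct sockevent_ops mysock_ops = {
	.sop_free	= mysock_free,
	/* ..sop_bind, sop_send, sop_recv, etc., as far as supported.. */
};

/*
 * Socket creation callback: hand out a sock object from the pool, along
 * with a unique identifier and an operations table for it.
 */
static sockid_t
mysock_socket(int domain __unused, int type, int protocol,
	endpoint_t user_endpt __unused, struct sock ** sockp,
	const struct sockevent_ops ** opsp)
{
	sockid_t id;

	if (type != SOCK_DGRAM || protocol != 0)
		return EPROTONOSUPPORT;

	for (id = 0; id < MYSOCK_MAX; id++) {
		if (mysock_array[id].msock_sock.sock_ops == NULL) {
			*sockp = &mysock_array[id].msock_sock;
			*opsp = &mysock_ops;
			return id;
		}
	}

	return ENOBUFS;		/* out of sockets */
}

/* Input path: data has arrived for 'ms'; resume receives and selects. */
static void
mysock_arrived(struct mysock * ms)
{

	sockevent_raise(&ms->msock_sock, SEV_RECV);
}

int
main(void)
{
	message m;
	int r, ipc_status;

	sef_startup();		/* SEF init; callback setup elided */

	sockevent_init(mysock_socket);

	for (;;) {
		if ((r = sef_receive_status(ANY, &m, &ipc_status)) != OK)
			panic("sef_receive_status failed: %d", r);

		sockevent_process(&m, ipc_status);
	}
}

Note that sockevent_raise() may be called both from the driver's own input path, as in mysock_arrived() above, and from within libsockevent callbacks; in the latter case the library defers the event until the current call has been processed.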
Change-Id: I3eb2c80602a63ef13035f646473360293607ab76 --- diff --git a/distrib/sets/lists/minix-comp/mi b/distrib/sets/lists/minix-comp/mi index 1bec67497..f17be8a51 100644 --- a/distrib/sets/lists/minix-comp/mi +++ b/distrib/sets/lists/minix-comp/mi @@ -1239,6 +1239,7 @@ ./usr/include/minix/sef.h minix-comp ./usr/include/minix/sffs.h minix-comp ./usr/include/minix/sockdriver.h minix-comp +./usr/include/minix/sockevent.h minix-comp ./usr/include/minix/sound.h minix-comp ./usr/include/minix/spin.h minix-comp ./usr/include/minix/sys_config.h minix-comp @@ -1931,6 +1932,7 @@ ./usr/lib/bc/libsaslc.a minix-comp bitcode ./usr/lib/bc/libsffs.a minix-comp bitcode ./usr/lib/bc/libsockdriver.a minix-comp bitcode +./usr/lib/bc/libsockevent.a minix-comp bitcode ./usr/lib/bc/libsqlite3.a minix-comp bitcode ./usr/lib/bc/libssl.a minix-comp bitcode ./usr/lib/bc/libsys.a minix-comp bitcode @@ -2092,6 +2094,8 @@ ./usr/lib/libsffs_pic.a minix-comp ./usr/lib/libsockdriver.a minix-comp ./usr/lib/libsockdriver_pic.a minix-comp +./usr/lib/libsockevent.a minix-comp +./usr/lib/libsockevent_pic.a minix-comp ./usr/lib/libsqlite3.a minix-comp ./usr/lib/libsqlite3_pic.a minix-comp ./usr/lib/libstdc++.a minix-comp libstdcxx diff --git a/distrib/sets/lists/minix-debug/mi b/distrib/sets/lists/minix-debug/mi index 98d520e13..01d41d478 100644 --- a/distrib/sets/lists/minix-debug/mi +++ b/distrib/sets/lists/minix-debug/mi @@ -99,6 +99,7 @@ ./usr/lib/libsaslc_g.a minix-debug debuglib ./usr/lib/libsffs_g.a minix-debug debuglib ./usr/lib/libsockdriver_g.a minix-debug debuglib +./usr/lib/libsockevent_g.a minix-debug debuglib ./usr/lib/libsqlite3_g.a minix-debug debuglib ./usr/lib/libssl_g.a minix-debug debuglib ./usr/lib/libstdc++_g.a minix-debug libstdcxx,debuglib diff --git a/lib/Makefile b/lib/Makefile index cc022eed4..434b969b6 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -37,6 +37,7 @@ SUBDIR+= ../minix/lib/libasyn \ ../minix/lib/libnetdriver \ ../minix/lib/libsffs \ ../minix/lib/libsockdriver \ + ../minix/lib/libsockevent \ ../minix/lib/libtimers \ ../minix/lib/libusb \ ../minix/lib/libvtreefs diff --git a/minix/include/minix/Makefile b/minix/include/minix/Makefile index fb3d553b0..f3edeb197 100644 --- a/minix/include/minix/Makefile +++ b/minix/include/minix/Makefile @@ -17,7 +17,8 @@ INCS+= acpi.h audio_fw.h bitmap.h \ netdriver.h optset.h padconf.h partition.h portio.h \ priv.h procfs.h profile.h queryparam.h \ rmib.h rs.h safecopies.h sched.h sef.h sffs.h \ - sockdriver.h sound.h spin.h sys_config.h sysctl.h sysinfo.h \ + sockdriver.h sockevent.h sound.h spin.h \ + sys_config.h sysctl.h sysinfo.h \ syslib.h sysutil.h timers.h type.h \ u64.h usb.h usb_ch9.h vbox.h \ vboxfs.h vboxif.h vboxtype.h vm.h \ diff --git a/minix/include/minix/sockevent.h b/minix/include/minix/sockevent.h new file mode 100644 index 000000000..345185648 --- /dev/null +++ b/minix/include/minix/sockevent.h @@ -0,0 +1,120 @@ +#ifndef _MINIX_SOCKEVENT_H +#define _MINIX_SOCKEVENT_H + +#include <minix/sockdriver.h> + +/* Socket events. */ +#define SEV_BIND 0x01 /* a pending bind operation has ended */ +#define SEV_CONNECT 0x02 /* a pending connect operation has ended */ +#define SEV_ACCEPT 0x04 /* pending accept operations may be resumed */ +#define SEV_SEND 0x08 /* pending send operations may be resumed */ +#define SEV_RECV 0x10 /* pending receive operations may be resumed */ +#define SEV_CLOSE 0x20 /* a pending close operation has ended */ + +/* Socket flags. 
*/ +#define SFL_SHUT_RD 0x01 /* socket has been shut down for reading */ +#define SFL_SHUT_WR 0x02 /* socket has been shut down for writing */ +#define SFL_CLOSING 0x04 /* socket close operation in progress */ +#define SFL_CLONED 0x08 /* socket has been cloned but not accepted */ +#define SFL_TIMER 0x10 /* socket is on list of timers */ + +/* + * Special return value from sop_recv callback functions. This pseudo-value + * is used to differentiate between zero-sized packets and actual EOF. + */ +#define SOCKEVENT_EOF 1 + +struct sockevent_ops; +struct sockevent_proc; + +/* Socket structure. None of its fields must ever be accessed directly. */ +struct sock { + sockid_t sock_id; /* socket identifier */ + unsigned char sock_events; /* pending events (SEV_) */ + unsigned char sock_flags; /* internal flags (SFL_) */ + unsigned char sock_domain; /* domain, address family (PF_, AF_) */ + int sock_type; /* type: stream, datagram.. (SOCK_) */ + int sock_err; /* pending error code < 0, 0 if none */ + unsigned int sock_opt; /* generic option flags (SO_) */ + clock_t sock_linger; /* SO_LINGER value, in ticks or time */ + clock_t sock_stimeo; /* SO_SNDTIMEO value, in clock ticks */ + clock_t sock_rtimeo; /* SO_RCVTIMEO value, in clock ticks */ + size_t sock_slowat; /* SO_SNDLOWAT value, in bytes */ + size_t sock_rlowat; /* SO_RCVLOWAT value, in bytes */ + const struct sockevent_ops *sock_ops; /* socket operations table */ + SIMPLEQ_ENTRY(sock) sock_next; /* list for pending events */ + SLIST_ENTRY(sock) sock_hash; /* list for hash table */ + SLIST_ENTRY(sock) sock_timer; /* list of socks with timers */ + struct sockevent_proc *sock_proc; /* list of suspended calls */ + struct sockdriver_select sock_select; /* pending select query */ + unsigned int sock_selops; /* pending select operations, or 0 */ +}; + +/* Socket operations table. 
*/ +struct sockevent_ops { + int (* sop_pair)(struct sock * sock1, struct sock * sock2, + endpoint_t user_endpt); + int (* sop_bind)(struct sock * sock, const struct sockaddr * addr, + socklen_t addr_len, endpoint_t user_endpt); + int (* sop_connect)(struct sock * sock, const struct sockaddr * addr, + socklen_t addr_len, endpoint_t user_endpt); + int (* sop_listen)(struct sock * sock, int backlog); + sockid_t (* sop_accept)(struct sock * sock, struct sockaddr * addr, + socklen_t * addr_len, endpoint_t user_endpt, + struct sock ** newsockp); + int (* sop_test_accept)(struct sock * sock); + int (* sop_pre_send)(struct sock * sock, size_t len, socklen_t ctl_len, + const struct sockaddr * addr, socklen_t addr_len, + endpoint_t user_endpt, int flags); + int (* sop_send)(struct sock * sock, + const struct sockdriver_data * data, size_t len, size_t * off, + const struct sockdriver_data * ctl, socklen_t ctl_len, + socklen_t * ctl_off, const struct sockaddr * addr, + socklen_t addr_len, endpoint_t user_endpt, int flags, size_t min); + int (* sop_test_send)(struct sock * sock, size_t min); + int (* sop_pre_recv)(struct sock * sock, endpoint_t user_endpt, + int flags); + int (* sop_recv)(struct sock * sock, + const struct sockdriver_data * data, size_t len, size_t * off, + const struct sockdriver_data * ctl, socklen_t ctl_len, + socklen_t * ctl_off, struct sockaddr * addr, socklen_t * addr_len, + endpoint_t user_endpt, int flags, size_t min, int * rflags); + int (* sop_test_recv)(struct sock * sock, size_t min, size_t * size); + int (* sop_ioctl)(struct sock * sock, unsigned long request, + const struct sockdriver_data * data, endpoint_t user_endpt); + void (* sop_setsockmask)(struct sock * sock, unsigned int mask); + int (* sop_setsockopt)(struct sock * sock, int level, int name, + const struct sockdriver_data * data, socklen_t len); + int (* sop_getsockopt)(struct sock * sock, int level, int name, + const struct sockdriver_data * data, socklen_t * len); + int (* sop_getsockname)(struct sock * sock, struct sockaddr * addr, + socklen_t * addr_len); + int (* sop_getpeername)(struct sock * sock, struct sockaddr * addr, + socklen_t * addr_len); + int (* sop_shutdown)(struct sock * sock, unsigned int flags); + int (* sop_close)(struct sock * sock, int force); + void (* sop_free)(struct sock * sock); +}; + +typedef sockid_t (* sockevent_socket_cb_t)(int domain, int type, int protocol, + endpoint_t user_endpt, struct sock ** sock, + const struct sockevent_ops ** ops); + +void sockevent_init(sockevent_socket_cb_t socket_cb); +void sockevent_process(const message * m_ptr, int ipc_status); + +void sockevent_clone(struct sock * sock, struct sock * newsock, + sockid_t newid); + +void sockevent_raise(struct sock * sock, unsigned int mask); +void sockevent_set_error(struct sock * sock, int err); +void sockevent_set_shutdown(struct sock * sock, unsigned int flags); + +#define sockevent_get_domain(sock) ((int)((sock)->sock_domain)) +#define sockevent_get_type(sock) ((sock)->sock_type) +#define sockevent_get_opt(sock) ((sock)->sock_opt) +#define sockevent_is_listening(sock) (!!((sock)->sock_opt & SO_ACCEPTCONN)) +#define sockevent_is_shutdown(sock, mask) ((sock)->sock_flags & (mask)) +#define sockevent_is_closing(sock) (!!((sock)->sock_flags & SFL_CLOSING)) + +#endif /* !_MINIX_SOCKEVENT_H */ diff --git a/minix/lib/Makefile b/minix/lib/Makefile index d15baacd6..d120a8172 100644 --- a/minix/lib/Makefile +++ b/minix/lib/Makefile @@ -19,6 +19,7 @@ SUBDIR+= libminixfs SUBDIR+= libnetdriver SUBDIR+= libsffs SUBDIR+= 
libsockdriver +SUBDIR+= libsockevent SUBDIR+= libtimers SUBDIR+= libusb SUBDIR+= libvtreefs diff --git a/minix/lib/libsockevent/Makefile b/minix/lib/libsockevent/Makefile new file mode 100644 index 000000000..713cd13ca --- /dev/null +++ b/minix/lib/libsockevent/Makefile @@ -0,0 +1,9 @@ +# Makefile for libsockevent + +CPPFLAGS+= -D_MINIX_SYSTEM + +LIB= sockevent + +SRCS= sockevent.c sockevent_proc.c + +.include <bsd.lib.mk> diff --git a/minix/lib/libsockevent/sockevent.c b/minix/lib/libsockevent/sockevent.c new file mode 100644 index 000000000..bb84cee51 --- /dev/null +++ b/minix/lib/libsockevent/sockevent.c @@ -0,0 +1,2590 @@ +/* Socket event dispatching library - by D.C. van Moolenbroek */ + +#include <minix/drivers.h> +#include <minix/sockdriver.h> +#include <minix/sockevent.h> +#include <sys/ioctl.h> + +#include "sockevent_proc.h" + +#define US 1000000UL /* microseconds per second */ + +#define SOCKHASH_SLOTS 256 /* # slots in ID-to-sock hash table */ + +static SLIST_HEAD(, sock) sockhash[SOCKHASH_SLOTS]; + +static SLIST_HEAD(, sock) socktimer; + +static minix_timer_t sockevent_timer; + +static SIMPLEQ_HEAD(, sock) sockevent_pending; + +static sockevent_socket_cb_t sockevent_socket_cb = NULL; + +static int sockevent_working; + +static void socktimer_del(struct sock * sock); +static void sockevent_cancel_send(struct sock * sock, + struct sockevent_proc * spr, int err); +static void sockevent_cancel_recv(struct sock * sock, + struct sockevent_proc * spr, int err); + +/* + * Initialize the hash table of sock objects. + */ +static void +sockhash_init(void) +{ + unsigned int slot; + + for (slot = 0; slot < __arraycount(sockhash); slot++) + SLIST_INIT(&sockhash[slot]); +} + +/* + * Given a socket identifier, return a hash table slot number. + */ +static unsigned int +sockhash_slot(sockid_t id) +{ + + /* + * The idea of the shift is that a socket driver may offer multiple + * classes of sockets, and put the class in the higher bits. The shift + * aims to prevent that all classes' first sockets end up in the same + * hash slot. + */ + return (id + (id >> 16)) % SOCKHASH_SLOTS; +} + +/* + * Obtain a sock object from the hash table using its unique identifier. + * Return a pointer to the object if found, or NULL otherwise. + */ +static struct sock * +sockhash_get(sockid_t id) +{ + struct sock *sock; + unsigned int slot; + + slot = sockhash_slot(id); + + SLIST_FOREACH(sock, &sockhash[slot], sock_hash) { + if (sock->sock_id == id) + return sock; + } + + return NULL; +} + +/* + * Add a sock object to the hash table. The sock object must have a valid ID + * in its 'sock_id' field, and must not be in the hash table already. + */ +static void +sockhash_add(struct sock * sock) +{ + unsigned int slot; + + slot = sockhash_slot(sock->sock_id); + + SLIST_INSERT_HEAD(&sockhash[slot], sock, sock_hash); +} + +/* + * Remove a sock object from the hash table. The sock object must be in the + * hash table. + */ +static void +sockhash_del(struct sock * sock) +{ + unsigned int slot; + + slot = sockhash_slot(sock->sock_id); + + /* This macro is O(n). */ + SLIST_REMOVE(&sockhash[slot], sock, sock, sock_hash); +} + +/* + * Reset a socket object to a proper initial state, with a particular socket + * identifier, a SOCK_ type, and a socket operations table. The socket is + * added to the ID-to-object hash table. This function always succeeds. 
+ */ +static void +sockevent_reset(struct sock * sock, sockid_t id, int domain, int type, + const struct sockevent_ops * ops) +{ + + assert(sock != NULL); + + memset(sock, 0, sizeof(*sock)); + + sock->sock_id = id; + sock->sock_domain = domain; + sock->sock_type = type; + + sock->sock_slowat = 1; + sock->sock_rlowat = 1; + + sock->sock_ops = ops; + sock->sock_proc = NULL; + sock->sock_select.ss_endpt = NONE; + + sockhash_add(sock); +} + +/* + * Initialize a new socket that will serve as an accepted socket on the given + * listening socket 'sock'. The new socket is given as 'newsock', and its new + * socket identifier is given as 'newid'. This function always succeeds. + */ +void +sockevent_clone(struct sock * sock, struct sock * newsock, sockid_t newid) +{ + + sockevent_reset(newsock, newid, (int)sock->sock_domain, + sock->sock_type, sock->sock_ops); + + /* These are the settings that are currently inherited. */ + newsock->sock_opt = sock->sock_opt & ~SO_ACCEPTCONN; + newsock->sock_linger = sock->sock_linger; + newsock->sock_stimeo = sock->sock_stimeo; + newsock->sock_rtimeo = sock->sock_rtimeo; + newsock->sock_slowat = sock->sock_slowat; + newsock->sock_rlowat = sock->sock_rlowat; + + newsock->sock_flags |= SFL_CLONED; +} + +/* + * A new socket has just been accepted. The corresponding listening socket is + * given as 'sock'. The new socket has ID 'newid', and if it had not already + * been added to the hash table through sockevent_clone() before, 'newsock' is + * a non-NULL pointer which identifies the socket object to clone into. + */ +static void +sockevent_accepted(struct sock * sock, struct sock * newsock, sockid_t newid) +{ + + if (newsock == NULL) { + if ((newsock = sockhash_get(newid)) == NULL) + panic("libsockevent: socket driver returned unknown " + "ID %d from accept callback", newid); + } else + sockevent_clone(sock, newsock, newid); + + assert(newsock->sock_flags & SFL_CLONED); + newsock->sock_flags &= ~SFL_CLONED; +} + +/* + * Allocate a sock object by asking the socket driver for one. On success, + * return OK, with a pointer to the new object stored in 'sockp'. This new + * object has all its fields set to initial values, in part based on the given + * parameters. On failure, return an error code. Failure has two typical + * causes: either the given domain, type, protocol combination is not + * supported, or the socket driver is out of sockets (globally or for this + * combination). + */ +static int +sockevent_alloc(int domain, int type, int protocol, endpoint_t user_endpt, + struct sock ** sockp) +{ + struct sock *sock; + const struct sockevent_ops *ops; + sockid_t r; + + /* + * Verify that the given domain is sane. Unlike the type and protocol, + * the domain is already verified by VFS, so we do not limit ourselves + * here. The result is that we can store the domain in just a byte. + */ + if (domain < 0 || domain > UINT8_MAX) + return EAFNOSUPPORT; + + /* Make sure that the library has actually been initialized. */ + if (sockevent_socket_cb == NULL) + panic("libsockevent: not initialized"); + + sock = NULL; + ops = NULL; + + /* + * Ask the socket driver to create a socket for the given combination + * of domain, type, and protocol. If so, let it return a new sock + * object, a unique socket identifier for that object, and an + * operations table for it. 
+ */ + if ((r = sockevent_socket_cb(domain, type, protocol, user_endpt, &sock, + &ops)) < 0) + return r; + + assert(sock != NULL); + assert(ops != NULL); + + sockevent_reset(sock, r, domain, type, ops); + + *sockp = sock; + return OK; +} + +/* + * Free a previously allocated sock object. + */ +static void +sockevent_free(struct sock * sock) +{ + const struct sockevent_ops *ops; + + assert(sock->sock_proc == NULL); + + socktimer_del(sock); + + sockhash_del(sock); + + /* + * Invalidate the operations table on the socket, before freeing the + * socket. This allows us to detect cases where sockevent functions + * are called on sockets that have already been freed. + */ + ops = sock->sock_ops; + sock->sock_ops = NULL; + + assert(ops != NULL); + assert(ops->sop_free != NULL); + + ops->sop_free(sock); +} + +/* + * Create a new socket. + */ +static sockid_t +sockevent_socket(int domain, int type, int protocol, endpoint_t user_endpt) +{ + struct sock *sock; + int r; + + if ((r = sockevent_alloc(domain, type, protocol, user_endpt, + &sock)) != OK) + return r; + + return sock->sock_id; +} + +/* + * Create a pair of connected sockets. + */ +static int +sockevent_socketpair(int domain, int type, int protocol, endpoint_t user_endpt, + sockid_t id[2]) +{ + struct sock *sock1, *sock2; + int r; + + if ((r = sockevent_alloc(domain, type, protocol, user_endpt, + &sock1)) != OK) + return r; + + /* Creating socket pairs is not always supported. */ + if (sock1->sock_ops->sop_pair == NULL) { + sockevent_free(sock1); + + return EOPNOTSUPP; + } + + if ((r = sockevent_alloc(domain, type, protocol, user_endpt, + &sock2)) != OK) { + sockevent_free(sock1); + + return r; + } + + assert(sock1->sock_ops == sock2->sock_ops); + + r = sock1->sock_ops->sop_pair(sock1, sock2, user_endpt); + + if (r != OK) { + sockevent_free(sock2); + sockevent_free(sock1); + + return r; + } + + id[0] = sock1->sock_id; + id[1] = sock2->sock_id; + return OK; +} + +/* + * A send request returned EPIPE. If desired, send a SIGPIPE signal to the + * user process that issued the request. + */ +static void +sockevent_sigpipe(struct sock * sock, endpoint_t user_endpt, int flags) +{ + + /* + * POSIX says that pipe signals should be generated for SOCK_STREAM + * sockets. Linux does just this, NetBSD raises signals for all socket + * types. + */ + if (sock->sock_type != SOCK_STREAM) + return; + + /* + * Why would there be fewer than four ways to do the same thing? + * O_NOSIGPIPE, MSG_NOSIGNAL, SO_NOSIGPIPE, and of course blocking + * SIGPIPE. VFS already sets MSG_NOSIGNAL for calls on sockets with + * O_NOSIGPIPE. The fact that SO_NOSIGPIPE is a thing, is also the + * reason why we cannot let VFS handle signal generation altogether. + */ + if (flags & MSG_NOSIGNAL) + return; + if (sock->sock_opt & SO_NOSIGPIPE) + return; + + /* + * Send a SIGPIPE signal to the user process. Unfortunately we cannot + * guarantee that the SIGPIPE reaches the user process before the send + * call returns. Usually, the scheduling priorities of system services + * are such that the signal is likely to arrive first anyway, but if + * timely arrival of the signal is required, a more fundamental change + * to the system would be needed. + */ + sys_kill(user_endpt, SIGPIPE); +} + +/* + * Suspend a request without data, that is, a bind, connect, accept, or close + * request. 
+ */ +static void +sockevent_suspend(struct sock * sock, unsigned int event, + const struct sockdriver_call * __restrict call, endpoint_t user_endpt) +{ + struct sockevent_proc *spr, **sprp; + + /* There is one slot for each process, so this should never fail. */ + if ((spr = sockevent_proc_alloc()) == NULL) + panic("libsockevent: too many suspended processes"); + + spr->spr_next = NULL; + spr->spr_event = event; + spr->spr_timer = FALSE; + spr->spr_call = *call; + spr->spr_endpt = user_endpt; + + /* + * Add the request to the tail of the queue. This operation is O(n), + * but the number of suspended requests per socket is expected to be + * low at all times. + */ + for (sprp = &sock->sock_proc; *sprp != NULL; + sprp = &(*sprp)->spr_next); + *sprp = spr; +} + +/* + * Suspend a request with data, that is, a send or receive request. + */ +static void +sockevent_suspend_data(struct sock * sock, unsigned int event, int timer, + const struct sockdriver_call * __restrict call, endpoint_t user_endpt, + const struct sockdriver_data * __restrict data, size_t len, size_t off, + const struct sockdriver_data * __restrict ctl, socklen_t ctl_len, + socklen_t ctl_off, int flags, int rflags, clock_t time) +{ + struct sockevent_proc *spr, **sprp; + + /* There is one slot for each process, so this should never fail. */ + if ((spr = sockevent_proc_alloc()) == NULL) + panic("libsockevent: too many suspended processes"); + + spr->spr_next = NULL; + spr->spr_event = event; + spr->spr_timer = timer; + spr->spr_call = *call; + spr->spr_endpt = user_endpt; + sockdriver_pack_data(&spr->spr_data, call, data, len); + spr->spr_datalen = len; + spr->spr_dataoff = off; + sockdriver_pack_data(&spr->spr_ctl, call, ctl, ctl_len); + spr->spr_ctllen = ctl_len; + spr->spr_ctloff = ctl_off; + spr->spr_flags = flags; + spr->spr_rflags = rflags; + spr->spr_time = time; + + /* + * Add the request to the tail of the queue. This operation is O(n), + * but the number of suspended requests per socket is expected to be + * low at all times. + */ + for (sprp = &sock->sock_proc; *sprp != NULL; + sprp = &(*sprp)->spr_next); + *sprp = spr; +} + +/* + * Return TRUE if there are any suspended requests on the given socket's queue + * that match any of the events in the given event mask, or FALSE otherwise. + */ +static int +sockevent_has_suspended(struct sock * sock, unsigned int mask) +{ + struct sockevent_proc *spr; + + for (spr = sock->sock_proc; spr != NULL; spr = spr->spr_next) + if (spr->spr_event & mask) + return TRUE; + + return FALSE; +} + +/* + * Check whether the given call is on the given socket's queue of suspended + * requests. If so, remove it from the queue and return a pointer to the + * suspension data structure. The caller is then responsible for freeing that + * data structure using sockevent_proc_free(). If the call was not found, the + * function returns NULL. + */ +static struct sockevent_proc * +sockevent_unsuspend(struct sock * sock, const struct sockdriver_call * call) +{ + struct sockevent_proc *spr, **sprp; + + /* Find the suspended request being canceled. */ + for (sprp = &sock->sock_proc; (spr = *sprp) != NULL; + sprp = &spr->spr_next) { + if (spr->spr_call.sc_endpt == call->sc_endpt && + spr->spr_call.sc_req == call->sc_req) { + /* Found; remove and return it. */ + *sprp = spr->spr_next; + + return spr; + } + } + + return NULL; +} + +/* + * Attempt to resume the given suspended request for the given socket object. 
+ * Return TRUE if the suspended request has been fully resumed and can be + * removed from the queue of suspended requests, or FALSE if it has not been + * fully resumed and should stay on the queue. In the latter case, no + * resumption will be attempted for other suspended requests of the same type. + */ +static int +sockevent_resume(struct sock * sock, struct sockevent_proc * spr) +{ + struct sock *newsock; + struct sockdriver_data data, ctl; + char addr[SOCKADDR_MAX]; + socklen_t addr_len; + size_t len, min; + sockid_t r; + + switch (spr->spr_event) { + case SEV_CONNECT: + /* + * If the connect call was suspended for the purpose of + * intercepting resumption, simply remove it from the queue. + */ + if (spr->spr_call.sc_endpt == NONE) + return TRUE; + + /* FALLTHROUGH */ + case SEV_BIND: + if ((r = sock->sock_err) != OK) + sock->sock_err = OK; + + sockdriver_reply_generic(&spr->spr_call, r); + + return TRUE; + + case SEV_ACCEPT: + /* + * A previous accept call cannot have blocked on a socket that + * was not in listening mode. + */ + assert(sock->sock_opt & SO_ACCEPTCONN); + + addr_len = 0; + newsock = NULL; + + /* + * This call is suspended, which implies that the call table + * pointer has already been tested to be non-NULL. + */ + if ((r = sock->sock_ops->sop_accept(sock, + (struct sockaddr *)&addr, &addr_len, spr->spr_endpt, + &newsock)) == SUSPEND) + return FALSE; + + if (r >= 0) { + assert(addr_len <= sizeof(addr)); + + sockevent_accepted(sock, newsock, r); + } + + sockdriver_reply_accept(&spr->spr_call, r, + (struct sockaddr *)&addr, addr_len); + + return TRUE; + + case SEV_SEND: + if (sock->sock_err != OK || (sock->sock_flags & SFL_SHUT_WR)) { + if (spr->spr_dataoff > 0 || spr->spr_ctloff > 0) + r = (int)spr->spr_dataoff; + else if ((r = sock->sock_err) != OK) + sock->sock_err = OK; + else + r = EPIPE; + } else { + sockdriver_unpack_data(&data, &spr->spr_call, + &spr->spr_data, spr->spr_datalen); + sockdriver_unpack_data(&ctl, &spr->spr_call, + &spr->spr_ctl, spr->spr_ctllen); + + len = spr->spr_datalen - spr->spr_dataoff; + + min = sock->sock_slowat; + if (min > len) + min = len; + + /* + * As mentioned elsewhere, we do not save the address + * upon suspension so we cannot supply it anymore here. + */ + r = sock->sock_ops->sop_send(sock, &data, len, + &spr->spr_dataoff, &ctl, + spr->spr_ctllen - spr->spr_ctloff, + &spr->spr_ctloff, NULL, 0, spr->spr_endpt, + spr->spr_flags, min); + + assert(r <= 0); + + if (r == SUSPEND) + return FALSE; + + /* + * If an error occurred but some data were already + * sent, return the progress rather than the error. + * Note that if the socket driver detects an + * asynchronous error during the send, it itself must + * perform this check and call sockevent_set_error() as + * needed, to make sure the error does not get lost. 
+ */ + if (spr->spr_dataoff > 0 || spr->spr_ctloff > 0) + r = spr->spr_dataoff; + } + + if (r == EPIPE) + sockevent_sigpipe(sock, spr->spr_endpt, + spr->spr_flags); + + sockdriver_reply_generic(&spr->spr_call, r); + + return TRUE; + + case SEV_RECV: + addr_len = 0; + + if (sock->sock_flags & SFL_SHUT_RD) + r = SOCKEVENT_EOF; + else { + len = spr->spr_datalen - spr->spr_dataoff; + + if (sock->sock_err == OK) { + min = sock->sock_rlowat; + if (min > len) + min = len; + } else + min = 0; + + sockdriver_unpack_data(&data, &spr->spr_call, + &spr->spr_data, spr->spr_datalen); + sockdriver_unpack_data(&ctl, &spr->spr_call, + &spr->spr_ctl, spr->spr_ctllen); + + r = sock->sock_ops->sop_recv(sock, &data, len, + &spr->spr_dataoff, &ctl, + spr->spr_ctllen - spr->spr_ctloff, + &spr->spr_ctloff, (struct sockaddr *)&addr, + &addr_len, spr->spr_endpt, spr->spr_flags, min, + &spr->spr_rflags); + + /* + * If the call remains suspended but a socket error is + * pending, return the pending socket error instead. + */ + if (r == SUSPEND) { + if (sock->sock_err == OK) + return FALSE; + + r = SOCKEVENT_EOF; + } + + assert(addr_len <= sizeof(addr)); + } + + /* + * If the receive call reported success, or if some data were + * already received, return the (partial) result. Otherwise, + * return a pending error if any, or otherwise a regular error + * or 0 for EOF. + */ + if (r == OK || spr->spr_dataoff > 0 || spr->spr_ctloff > 0) + r = (int)spr->spr_dataoff; + else if (sock->sock_err != OK) { + r = sock->sock_err; + + sock->sock_err = OK; + } else if (r == SOCKEVENT_EOF) + r = 0; /* EOF */ + + sockdriver_reply_recv(&spr->spr_call, r, spr->spr_ctloff, + (struct sockaddr *)&addr, addr_len, spr->spr_rflags); + + return TRUE; + + case SEV_CLOSE: + sockdriver_reply_generic(&spr->spr_call, OK); + + return TRUE; + + default: + panic("libsockevent: process suspended on unknown event 0x%x", + spr->spr_event); + } +} + +/* + * Return TRUE if the given socket is ready for reading for a select call, or + * FALSE otherwise. + */ +static int +sockevent_test_readable(struct sock * sock) +{ + int r; + + /* + * The meaning of "ready-to-read" depends on whether the socket is a + * listening socket or not. For the former, it is a test on whether + * there are any new sockets to accept. However, shutdown flags take + * precedence in both cases. + */ + if (sock->sock_flags & SFL_SHUT_RD) + return TRUE; + + if (sock->sock_err != OK) + return TRUE; + + /* + * Depending on whether this is a listening-mode socket, test whether + * either accepts or receives would block. + */ + if (sock->sock_opt & SO_ACCEPTCONN) { + if (sock->sock_ops->sop_test_accept == NULL) + return TRUE; + + r = sock->sock_ops->sop_test_accept(sock); + } else { + if (sock->sock_ops->sop_test_recv == NULL) + return TRUE; + + r = sock->sock_ops->sop_test_recv(sock, sock->sock_rlowat, + NULL); + } + + return (r != SUSPEND); +} + +/* + * Return TRUE if the given socket is ready for writing for a select call, or + * FALSE otherwise. + */ +static int +sockevent_test_writable(struct sock * sock) +{ + int r; + + if (sock->sock_err != OK) + return TRUE; + + if (sock->sock_flags & SFL_SHUT_WR) + return TRUE; + + if (sock->sock_ops->sop_test_send == NULL) + return TRUE; + + /* + * Test whether sends would block. The low send watermark is relevant + * for stream-type sockets only. + */ + r = sock->sock_ops->sop_test_send(sock, sock->sock_slowat); + + return (r != SUSPEND); +} + +/* + * Test whether any of the given select operations are ready on the given + * socket. 
Return the subset of ready operations; zero if none. + */ +static unsigned int +sockevent_test_select(struct sock * sock, unsigned int ops) +{ + unsigned int ready_ops; + + assert(!(ops & ~(SDEV_OP_RD | SDEV_OP_WR | SDEV_OP_ERR))); + + /* + * We do not support the "bind in progress" case here. If a blocking + * bind call is in progress, the file descriptor should not be ready + * for either reading or writing. Currently, socket drivers will have + * to cover this case themselves. Otherwise we would have to check the + * queue of suspended calls, or create a custom flag for this. + */ + + ready_ops = 0; + + if ((ops & SDEV_OP_RD) && sockevent_test_readable(sock)) + ready_ops |= SDEV_OP_RD; + + if ((ops & SDEV_OP_WR) && sockevent_test_writable(sock)) + ready_ops |= SDEV_OP_WR; + + /* TODO: OOB receive support. */ + + return ready_ops; +} + +/* + * Fire the given mask of events on the given socket object now. + */ +static void +sockevent_fire(struct sock * sock, unsigned int mask) +{ + struct sockevent_proc *spr, **sprp; + unsigned int r, flag, ops; + + /* + * A completed connection attempt (successful or not) also always + * implies that the socket becomes writable. For convenience we + * enforce this rule here, because it is easy to forget. Note that in + * any case, a suspended connect request should be the first in the + * list, so we do not risk returning 0 from a connect call as a result + * of sock_err getting eaten by another resumed call. + */ + if (mask & SEV_CONNECT) + mask |= SEV_SEND; + + /* + * First try resuming regular system calls. + */ + for (sprp = &sock->sock_proc; (spr = *sprp) != NULL; ) { + flag = spr->spr_event; + + if ((mask & flag) && sockevent_resume(sock, spr)) { + *sprp = spr->spr_next; + + sockevent_proc_free(spr); + } else { + mask &= ~flag; + + sprp = &spr->spr_next; + } + } + + /* + * Then see if we can satisfy pending select queries. + */ + if ((mask & (SEV_ACCEPT | SEV_SEND | SEV_RECV)) && + sock->sock_select.ss_endpt != NONE) { + assert(sock->sock_selops != 0); + + /* + * Only retest select operations that, based on the given event + * mask, could possibly be satisfied now. + */ + ops = sock->sock_selops; + if (!(mask & (SEV_ACCEPT | SEV_RECV))) + ops &= ~SDEV_OP_RD; + if (!(mask & SEV_SEND)) + ops &= ~SDEV_OP_WR; + if (!(0)) /* TODO: OOB receive support */ + ops &= ~SDEV_OP_ERR; + + /* Are there any operations to test? */ + if (ops != 0) { + /* Test those operations. */ + r = sockevent_test_select(sock, ops); + + /* Were any satisfied? */ + if (r != 0) { + /* Let the caller know. */ + sockdriver_reply_select(&sock->sock_select, + sock->sock_id, r); + + sock->sock_selops &= ~r; + + /* Are there any saved operations left now? */ + if (sock->sock_selops == 0) + sock->sock_select.ss_endpt = NONE; + } + } + } + + /* + * Finally, a SEV_CLOSE event unconditionally frees the sock object. + * This event should be fired only for sockets that are either not yet, + * or not anymore, in use by userland. + */ + if (mask & SEV_CLOSE) { + assert(sock->sock_flags & (SFL_CLONED | SFL_CLOSING)); + + sockevent_free(sock); + } +} + +/* + * Process all pending events. Events must still be blocked, so that if + * handling one event generates a new event, that event is handled from here + * rather than immediately. 
+ */ +static void +sockevent_pump(void) +{ + struct sock *sock; + unsigned int mask; + + assert(sockevent_working); + + while (!SIMPLEQ_EMPTY(&sockevent_pending)) { + sock = SIMPLEQ_FIRST(&sockevent_pending); + SIMPLEQ_REMOVE_HEAD(&sockevent_pending, sock_next); + + mask = sock->sock_events; + assert(mask != 0); + sock->sock_events = 0; + + sockevent_fire(sock, mask); + /* + * At this point, the sock object may already have been readded + * to the event list, or even be deallocated altogether. + */ + } +} + +/* + * Return TRUE if any events are pending on any sockets, or FALSE otherwise. + */ +static int +sockevent_has_events(void) +{ + + return (!SIMPLEQ_EMPTY(&sockevent_pending)); +} + +/* + * Raise the given bitwise-OR'ed set of events on the given socket object. + * Depending on the context of the call, the events may or may not be + * processed immediately. + */ +void +sockevent_raise(struct sock * sock, unsigned int mask) +{ + + assert(sock->sock_ops != NULL); + + /* + * Handle SEV_CLOSE first. This event must not be deferred, so as to + * let socket drivers recycle sock objects as they are needed. For + * example, a user-closed TCP socket may stay open to transmit the + * remainder of its send buffer, until the TCP driver runs out of + * sockets, in which case the connection is aborted. The driver would + * then raise SEV_CLOSE on the sock object so as to clean it up, and + * immediately reuse it afterward. If the close event were to be + * deferred, this immediate reuse would not be possible. + * + * The sop_free() callback routine may not raise new events, and thus, + * the state of 'sockevent_working' need not be checked or set here. + */ + if (mask & SEV_CLOSE) { + assert(mask == SEV_CLOSE); + + sockevent_fire(sock, mask); + + return; + } + + /* + * If we are currently processing a socket message, store the event for + * later. If not, this call is not coming from inside libsockevent, + * and we must handle the event immediately. + */ + if (sockevent_working) { + assert(mask != 0); + assert(mask <= UCHAR_MAX); /* sock_events field size check */ + + if (sock->sock_events == 0) + SIMPLEQ_INSERT_TAIL(&sockevent_pending, sock, + sock_next); + + sock->sock_events |= mask; + } else { + sockevent_working = TRUE; + + sockevent_fire(sock, mask); + + if (sockevent_has_events()) + sockevent_pump(); + + sockevent_working = FALSE; + } +} + +/* + * Set a pending error on the socket object, and wake up any suspended + * operations that are affected by this. + */ +void +sockevent_set_error(struct sock * sock, int err) +{ + + assert(err < 0); + assert(sock->sock_ops != NULL); + + /* If an error was set already, it will be overridden. */ + sock->sock_err = err; + + sockevent_raise(sock, SEV_BIND | SEV_CONNECT | SEV_SEND | SEV_RECV); +} + +/* + * Initialize timer-related data structures. + */ +static void +socktimer_init(void) +{ + + SLIST_INIT(&socktimer); + + init_timer(&sockevent_timer); +} + +/* + * Check whether the given socket object has any suspended requests that have + * now expired. If so, cancel them. Also, if the socket object has any + * suspended requests with a timeout that has not yet expired, return the + * earliest (relative) timeout of all of them, or TMR_NEVER if no such requests + * are present. + */ +static clock_t +sockevent_expire(struct sock * sock, clock_t now) +{ + struct sockevent_proc *spr, **sprp; + clock_t lowest, left; + int r; + + /* + * First handle the case that the socket is closed. 
In this case, + * there may be a linger timer, although the socket may also simply + * still be on the timer list because of a request that did not time + * out right before the socket was closed. + */ + if (sock->sock_flags & SFL_CLOSING) { + /* Was there a linger timer and has it expired? */ + if ((sock->sock_opt & SO_LINGER) && + tmr_is_first(sock->sock_linger, now)) { + assert(sock->sock_ops->sop_close != NULL); + + /* + * Whatever happens next, we must now resume the + * pending close operation, if it was not canceled + * earlier. As before, we return OK rather than the + * standardized EWOULDBLOCK, to ensure that the user + * process knows the file descriptor has been closed. + */ + if ((spr = sock->sock_proc) != NULL) { + assert(spr->spr_event == SEV_CLOSE); + assert(spr->spr_next == NULL); + + sock->sock_proc = NULL; + + sockdriver_reply_generic(&spr->spr_call, OK); + + sockevent_proc_free(spr); + } + + /* + * Tell the socket driver that closing the socket is + * now a bit more desired than the last time we asked. + */ + r = sock->sock_ops->sop_close(sock, TRUE /*force*/); + + assert(r == OK || r == SUSPEND); + + /* + * The linger timer fires once. After that, the socket + * driver is free to decide that it still will not + * close the socket. If it does, do not fire the + * linger timer again. + */ + if (r == SUSPEND) + sock->sock_opt &= ~SO_LINGER; + else + sockevent_free(sock); + } + + return TMR_NEVER; + } + + /* + * Then see if any send and/or receive requests have expired. Also see + * if there are any send and/or receive requests left that have not yet + * expired but do have a timeout, so that we can return the lowest of + * those timeouts. + */ + lowest = TMR_NEVER; + + for (sprp = &sock->sock_proc; (spr = *sprp) != NULL; ) { + /* Skip requests without a timeout. */ + if (spr->spr_timer == 0) { + sprp = &spr->spr_next; + + continue; + } + + assert(spr->spr_event == SEV_SEND || + spr->spr_event == SEV_RECV); + + /* + * If the request has expired, cancel it and remove it from the + * list. Otherwise, see if the request has the lowest number + * of ticks until its timeout so far. + */ + if (tmr_is_first(spr->spr_time, now)) { + *sprp = spr->spr_next; + + if (spr->spr_event == SEV_SEND) + sockevent_cancel_send(sock, spr, EWOULDBLOCK); + else + sockevent_cancel_recv(sock, spr, EWOULDBLOCK); + + sockevent_proc_free(spr); + } else { + left = spr->spr_time - now; + + if (lowest == TMR_NEVER || lowest > left) + lowest = left; + + sprp = &spr->spr_next; + } + } + + return lowest; +} + +/* + * The socket event alarm went off. Go through the set of socket objects with + * timers, and see if any of their requests have now expired. Set a new alarm + * as necessary. + */ +static void +socktimer_expire(int arg __unused) +{ + SLIST_HEAD(, sock) oldtimer; + struct sock *sock, *tsock; + clock_t now, lowest, left; + int working; + + /* + * This function may or may not be called from a context where we are + * already deferring events, so we have to cover both cases here. + */ + if ((working = sockevent_working) == FALSE) + sockevent_working = TRUE; + + /* Start a new list. */ + memcpy(&oldtimer, &socktimer, sizeof(oldtimer)); + SLIST_INIT(&socktimer); + + now = getticks(); + lowest = TMR_NEVER; + + /* + * Go through all sockets that have or had a request with a timeout, + * canceling any expired requests and building a new list of sockets + * that still have requests with timeouts as we go. 
+ */ + SLIST_FOREACH_SAFE(sock, &oldtimer, sock_timer, tsock) { + assert(sock->sock_flags & SFL_TIMER); + sock->sock_flags &= ~SFL_TIMER; + + left = sockevent_expire(sock, now); + /* + * The sock object may already have been deallocated now. + * If 'left' is TMR_NEVER, do not touch 'sock' anymore. + */ + + if (left != TMR_NEVER) { + if (lowest == TMR_NEVER || lowest > left) + lowest = left; + + SLIST_INSERT_HEAD(&socktimer, sock, sock_timer); + + sock->sock_flags |= SFL_TIMER; + } + } + + /* If there is a new lowest timeout at all, set a new timer. */ + if (lowest != TMR_NEVER) + set_timer(&sockevent_timer, lowest, socktimer_expire, 0); + + if (!working) { + /* If any new events were raised, process them now. */ + if (sockevent_has_events()) + sockevent_pump(); + + sockevent_working = FALSE; + } +} + +/* + * Set a timer for the given (relative) number of clock ticks, adding the + * associated socket object to the set of socket objects with timers, if it was + * not already in that set. Set a new alarm if necessary, and return the + * absolute timeout for the timer. Since the timers list is maintained lazily, + * the caller need not take the object off the set if the call was canceled + * later; see also socktimer_del(). + */ +static clock_t +socktimer_add(struct sock * sock, clock_t ticks) +{ + clock_t now; + + /* + * Relative time comparisons require that any two times are no more + * than half the comparison space (clock_t, unsigned long) apart. + */ + assert(ticks <= TMRDIFF_MAX); + + /* If the socket was not already on the timers list, put it on. */ + if (!(sock->sock_flags & SFL_TIMER)) { + SLIST_INSERT_HEAD(&socktimer, sock, sock_timer); + + sock->sock_flags |= SFL_TIMER; + } + + /* + * (Re)set the timer if either it was not running at all or this new + * timeout will occur sooner than the currently scheduled alarm. Note + * that setting a timer that was already set is allowed. + */ + now = getticks(); + + if (!tmr_is_set(&sockevent_timer) || + tmr_is_first(now + ticks, tmr_exp_time(&sockevent_timer))) + set_timer(&sockevent_timer, ticks, socktimer_expire, 0); + + /* Return the absolute timeout. */ + return now + ticks; +} + +/* + * Remove a socket object from the set of socket objects with timers. Since + * the timer list is maintained lazily, this needs to be done only right before + * the socket object is freed. + */ +static void +socktimer_del(struct sock * sock) +{ + + if (sock->sock_flags & SFL_TIMER) { + /* This macro is O(n). */ + SLIST_REMOVE(&socktimer, sock, sock, sock_timer); + + sock->sock_flags &= ~SFL_TIMER; + } +} + +/* + * Bind a socket to a local address. + */ +static int +sockevent_bind(sockid_t id, const struct sockaddr * __restrict addr, + socklen_t addr_len, endpoint_t user_endpt, + const struct sockdriver_call * __restrict call) +{ + struct sock *sock; + int r; + + if ((sock = sockhash_get(id)) == NULL) + return EINVAL; + + if (sock->sock_ops->sop_bind == NULL) + return EOPNOTSUPP; + + /* Binding a socket in listening mode is never supported. */ + if (sock->sock_opt & SO_ACCEPTCONN) + return EINVAL; + + r = sock->sock_ops->sop_bind(sock, addr, addr_len, user_endpt); + + if (r == SUSPEND) { + if (call == NULL) + return EINPROGRESS; + + sockevent_suspend(sock, SEV_BIND, call, user_endpt); + } + + return r; +} + +/* + * Connect a socket to a remote address. 
+ */ +static int +sockevent_connect(sockid_t id, const struct sockaddr * __restrict addr, + socklen_t addr_len, endpoint_t user_endpt, + const struct sockdriver_call * call) +{ + struct sockdriver_call fakecall; + struct sockevent_proc *spr; + struct sock *sock; + int r; + + if ((sock = sockhash_get(id)) == NULL) + return EINVAL; + + if (sock->sock_ops->sop_connect == NULL) + return EOPNOTSUPP; + + /* Connecting a socket in listening mode is never supported. */ + if (sock->sock_opt & SO_ACCEPTCONN) + return EOPNOTSUPP; + + /* + * The upcoming connect call may fire an accept event for which the + * handler may in turn fire a connect event on this socket. Since we + * delay event processing until after processing calls, this would + * create the problem that even if the connection is accepted right + * away, non-blocking connect requests would return EINPROGRESS. For + * UDS, this is undesirable behavior. To remedy this, we use a hack: + * we temporarily suspend the connect even if non-blocking, then + * process events, and then cancel the connect request again. If the + * connection was accepted immediately, the cancellation will have no + * effect, since the request has already been replied to. In order not + * to violate libsockdriver rules with this hack, we fabricate a fake + * 'call' object. + */ + r = sock->sock_ops->sop_connect(sock, addr, addr_len, user_endpt); + + if (r == SUSPEND) { + if (call != NULL || sockevent_has_events()) { + if (call == NULL) { + fakecall.sc_endpt = NONE; + + call = &fakecall; + } + + assert(!sockevent_has_suspended(sock, + SEV_SEND | SEV_RECV)); + + sockevent_suspend(sock, SEV_CONNECT, call, user_endpt); + + if (call == &fakecall) { + /* Process any pending events first now. */ + sockevent_pump(); + + /* + * If the connect request has not been resumed + * yet now, we must remove it from the queue + * again, and return EINPROGRESS ourselves. + * Otherwise, return OK or a pending error. + */ + spr = sockevent_unsuspend(sock, call); + if (spr != NULL) { + sockevent_proc_free(spr); + + r = EINPROGRESS; + } else if ((r = sock->sock_err) != OK) + sock->sock_err = OK; + } + } else + r = EINPROGRESS; + } + + if (r == OK) { + /* + * A completed connection attempt also always implies that the + * socket becomes writable. For convenience we enforce this + * rule here, because it is easy to forget. + */ + sockevent_raise(sock, SEV_SEND); + } + + return r; +} + +/* + * Put a socket in listening mode. + */ +static int +sockevent_listen(sockid_t id, int backlog) +{ + struct sock *sock; + int r; + + if ((sock = sockhash_get(id)) == NULL) + return EINVAL; + + if (sock->sock_ops->sop_listen == NULL) + return EOPNOTSUPP; + + /* + * Perform a general adjustment on the backlog value, applying the + * customary BSD "fudge factor" of 1.5x. Keep the value within bounds + * though. POSIX imposes that a negative backlog value is equal to a + * backlog value of zero. A backlog value of zero, in turn, may mean + * anything; we take it to be one. POSIX also imposes that all socket + * drivers accept up to at least SOMAXCONN connections on the queue. + */ + if (backlog < 0) + backlog = 0; + if (backlog < SOMAXCONN) + backlog += 1 + ((unsigned int)backlog >> 1); + if (backlog > SOMAXCONN) + backlog = SOMAXCONN; + + r = sock->sock_ops->sop_listen(sock, backlog); + + /* + * On success, the socket is now in listening mode. As part of that, + * a select(2) ready-to-read condition now indicates that a connection + * may be accepted on the socket, rather than that data may be read. 
+ * Since libsockevent is responsible for this distinction, we keep + * track of the listening mode at this level. Conveniently, there is a + * socket option for this, which we support out of the box as a result. + */ + if (r == OK) { + sock->sock_opt |= SO_ACCEPTCONN; + + /* + * For the extremely unlikely case that right after the socket + * is put into listening mode, it has a connection ready to + * accept, we retest blocked ready-to-read select queries now. + */ + sockevent_raise(sock, SEV_ACCEPT); + } + + return r; +} + +/* + * Accept a connection on a listening socket, creating a new socket. + */ +static sockid_t +sockevent_accept(sockid_t id, struct sockaddr * __restrict addr, + socklen_t * __restrict addr_len, endpoint_t user_endpt, + const struct sockdriver_call * __restrict call) +{ + struct sock *sock, *newsock; + sockid_t r; + + if ((sock = sockhash_get(id)) == NULL) + return EINVAL; + + if (sock->sock_ops->sop_accept == NULL) + return EOPNOTSUPP; + + /* + * Attempt to accept a connection. The socket driver is responsible + * for allocating a sock object (and identifier) on success. It may + * already have done so before, in which case it should leave newsock + * filled with NULL; otherwise, the returned sock object is cloned from + * the listening socket. The socket driver is also responsible for + * failing the call if the socket is not in listening mode, because it + * must specify the error to return: EOPNOTSUPP or EINVAL. + */ + newsock = NULL; + + if ((r = sock->sock_ops->sop_accept(sock, addr, addr_len, user_endpt, + &newsock)) == SUSPEND) { + assert(sock->sock_opt & SO_ACCEPTCONN); + + if (call == NULL) + return EWOULDBLOCK; + + sockevent_suspend(sock, SEV_ACCEPT, call, user_endpt); + + return SUSPEND; + } + + if (r >= 0) + sockevent_accepted(sock, newsock, r); + + return r; +} + +/* + * Send regular and/or control data. + */ +static int +sockevent_send(sockid_t id, const struct sockdriver_data * __restrict data, + size_t len, const struct sockdriver_data * __restrict ctl_data, + socklen_t ctl_len, const struct sockaddr * __restrict addr, + socklen_t addr_len, endpoint_t user_endpt, int flags, + const struct sockdriver_call * __restrict call) +{ + struct sock *sock; + clock_t time; + size_t min, off; + socklen_t ctl_off; + int r, timer; + + if ((sock = sockhash_get(id)) == NULL) + return EINVAL; + + /* + * The order of the following checks is not necessarily fixed, and may + * be changed later. As far as applicable, they should match the order + * of the checks during call resumption, though. + */ + if ((r = sock->sock_err) != OK) { + sock->sock_err = OK; + + return r; + } + + if (sock->sock_flags & SFL_SHUT_WR) { + sockevent_sigpipe(sock, user_endpt, flags); + + return EPIPE; + } + + /* + * Translate the sticky SO_DONTROUTE option to a per-request + * MSG_DONTROUTE flag. This achieves two purposes: socket drivers have + * to check only one flag, and socket drivers that do not support the + * flag will fail send requests in a consistent way. + */ + if (sock->sock_opt & SO_DONTROUTE) + flags |= MSG_DONTROUTE; + + /* + * Check if this is a valid send request as far as the socket driver is + * concerned. We do this separately from sop_send for the reason that + * this send request may immediately be queued behind other pending + * send requests (without a call to sop_send), which means even invalid + * requests would be queued and not return failure until much later. 
+ */ + if (sock->sock_ops->sop_pre_send != NULL && + (r = sock->sock_ops->sop_pre_send(sock, len, ctl_len, addr, + addr_len, user_endpt, + flags & ~(MSG_DONTWAIT | MSG_NOSIGNAL))) != OK) + return r; + + if (sock->sock_ops->sop_send == NULL) + return EOPNOTSUPP; + + off = 0; + ctl_off = 0; + + /* + * Sending out-of-band data is treated differently from regular data: + * + * - sop_send is called immediately, even if a partial non-OOB send + * operation is currently suspended (TODO: it may have to be aborted + * in order to maintain atomicity guarantees - that should be easy); + * - sop_send must not return SUSPEND; instead, if it cannot process + * the OOB data immediately, it must return an appropriate error; + * - the send low watermark is ignored. + * + * Given that none of the current socket drivers support OOB data at + * all, more sophisticated approaches would have no added value now. + */ + if (flags & MSG_OOB) { + r = sock->sock_ops->sop_send(sock, data, len, &off, ctl_data, + ctl_len, &ctl_off, addr, addr_len, user_endpt, flags, 0); + + if (r == SUSPEND) + panic("libsockevent: MSG_OOB send calls may not be " + "suspended"); + + return (r == OK) ? (int)off : r; + } + + /* + * Only call the actual sop_send function now if no other send calls + * are suspended already. + * + * Call sop_send with 'min' set to the minimum of the request size and + * the socket's send low water mark, but only if the call is non- + * blocking. For stream-oriented sockets, this should have the effect + * that non-blocking calls fail with EWOULDBLOCK if not at least that + * much can be sent immediately. For consistency, we choose to apply + * the same threshold to blocking calls. For datagram-oriented + * sockets, the minimum is not a factor to be considered. + */ + if (!sockevent_has_suspended(sock, SEV_SEND)) { + min = sock->sock_slowat; + if (min > len) + min = len; + + r = sock->sock_ops->sop_send(sock, data, len, &off, ctl_data, + ctl_len, &ctl_off, addr, addr_len, user_endpt, flags, min); + } else + r = SUSPEND; + + if (r == SUSPEND) { + /* + * We do not store the target's address on suspension, because + * that would add significantly to the per-process suspension + * state. As a result, we disallow socket drivers from + * suspending send calls with addresses, because we would no + * longer have the address for proper call resumption. + * However, we do not know here whether the socket is in + * connection-oriented mode; if it is, the address is to be + * ignored altogether. Therefore, there is no test on 'addr' + * here. Resumed calls will get a NULL address pointer, and + * the socket driver is expected to do the right thing. + */ + + /* + * For non-blocking socket calls, return an error only if we + * were not able to send anything at all. If only control data + * were sent, the return value is therefore zero. + */ + if (call != NULL) { + if (sock->sock_stimeo != 0) { + timer = TRUE; + time = socktimer_add(sock, sock->sock_stimeo); + } else { + timer = FALSE; + time = 0; + } + + sockevent_suspend_data(sock, SEV_SEND, timer, call, + user_endpt, data, len, off, ctl_data, ctl_len, + ctl_off, flags, 0, time); + } else + r = (off > 0 || ctl_off > 0) ? OK : EWOULDBLOCK; + } else if (r == EPIPE) + sockevent_sigpipe(sock, user_endpt, flags); + + return (r == OK) ? (int)off : r; +} + +/* + * The inner part of the receive request handler. An error returned from here + * may be overridden by an error pending on the socket, although data returned + * from here trumps such pending errors. 
+ */ +static int +sockevent_recv_inner(struct sock * sock, + const struct sockdriver_data * __restrict data, + size_t len, size_t * __restrict off, + const struct sockdriver_data * __restrict ctl_data, + socklen_t ctl_len, socklen_t * __restrict ctl_off, + struct sockaddr * __restrict addr, + socklen_t * __restrict addr_len, endpoint_t user_endpt, + int * __restrict flags, const struct sockdriver_call * __restrict call) +{ + clock_t time; + size_t min; + int r, oob, inflags, timer; + + /* + * Check if this is a valid receive request as far as the socket driver + * is concerned. We do this separately from sop_recv for the reason + * that this receive request may immediately be queued behind other + * pending receive requests (without a call to sop_recv), which means + * even invalid requests would be queued and not return failure until + * much later. + */ + inflags = *flags; + *flags = 0; + + if (sock->sock_ops->sop_pre_recv != NULL && + (r = sock->sock_ops->sop_pre_recv(sock, user_endpt, + inflags & ~(MSG_DONTWAIT | MSG_NOSIGNAL))) != OK) + return r; + + /* + * The order of the following checks is not necessarily fixed, and may + * be changed later. As far as applicable, they should match the order + * of the checks during call resumption, though. + */ + if (sock->sock_flags & SFL_SHUT_RD) + return SOCKEVENT_EOF; + + if (sock->sock_ops->sop_recv == NULL) + return EOPNOTSUPP; + + /* + * Receiving out-of-band data is treated differently from regular data: + * + * - sop_recv is called immediately, even if a partial non-OOB receive + * operation is currently suspended (TODO: it may have to be aborted + * in order to maintain atomicity guarantees - that should be easy); + * - sop_recv must not return SUSPEND; instead, if it cannot return any + * of the OOB data immediately, it must return an appropriate error; + * - the receive low watermark is ignored. + * + * Given that none of the current socket drivers support OOB data at + * all, more sophisticated approaches would have no added value now. + */ + oob = (inflags & MSG_OOB); + + if (oob && (sock->sock_opt & SO_OOBINLINE)) + return EINVAL; + + /* + * Only call the actual sop_recv function now if no other receive + * calls are suspended already. + * + * Call sop_recv with 'min' set to the minimum of the request size and + * the socket's receive low water mark, unless there is a pending + * error. As a result, blocking calls will block, and non-blocking + * calls will yield EWOULDBLOCK, if not at least that much can be + * received, unless another condition (EOF or that pending error) + * prevents more from being received anyway. For datagram-oriented + * sockets, the minimum is not a factor to be considered. + */ + if (oob || !sockevent_has_suspended(sock, SEV_RECV)) { + if (!oob && sock->sock_err == OK) { + min = sock->sock_rlowat; + if (min > len) + min = len; + } else + min = 0; /* receive even no-data segments */ + + r = sock->sock_ops->sop_recv(sock, data, len, off, ctl_data, + ctl_len, ctl_off, addr, addr_len, user_endpt, inflags, min, + flags); + } else + r = SUSPEND; + + assert(r <= 0 || r == SOCKEVENT_EOF); + + if (r == SUSPEND) { + if (oob) + panic("libsockevent: MSG_OOB receive calls may not be " + "suspended"); + + /* + * For non-blocking socket calls, return EWOULDBLOCK only if we + * did not receive anything at all. If only control data were + * received, the return value is therefore zero. Suspension + * implies that there is nothing to read. 
+		 * For the purpose of the calling wrapper function, never
+		 * suspend a call when there is a pending error.
+		 */
+		if (call != NULL && sock->sock_err == OK) {
+			if (sock->sock_rtimeo != 0) {
+				timer = TRUE;
+				time = socktimer_add(sock, sock->sock_rtimeo);
+			} else {
+				timer = FALSE;
+				time = 0;
+			}
+
+			sockevent_suspend_data(sock, SEV_RECV, timer, call,
+			    user_endpt, data, len, *off, ctl_data,
+			    ctl_len, *ctl_off, inflags, *flags, time);
+		} else
+			r = EWOULDBLOCK;
+	}
+
+	return r;
+}
+
+/*
+ * Receive regular and/or control data.
+ */
+static int
+sockevent_recv(sockid_t id, const struct sockdriver_data * __restrict data,
+	size_t len, const struct sockdriver_data * __restrict ctl_data,
+	socklen_t * __restrict ctl_len, struct sockaddr * __restrict addr,
+	socklen_t * __restrict addr_len, endpoint_t user_endpt,
+	int * __restrict flags, const struct sockdriver_call * __restrict call)
+{
+	struct sock *sock;
+	size_t off;
+	socklen_t ctl_inlen;
+	int r;
+
+	if ((sock = sockhash_get(id)) == NULL)
+		return EINVAL;
+
+	/*
+	 * This function is a wrapper around the actual receive functionality.
+	 * The reason for this is that receiving data should take precedence
+	 * over a pending socket error, while a pending socket error should
+	 * take precedence over both regular errors as well as EOF.  In other
+	 * words: even if there is a pending error, we must first try to
+	 * receive anything at all; if receiving does not work, we must fail
+	 * the call with the pending error.  However, until we call the
+	 * receive callback, we have no way of telling whether any data can be
+	 * received.  So we must try that before we can decide whether to
+	 * return a pending error.
+	 */
+	off = 0;
+	ctl_inlen = *ctl_len;
+	*ctl_len = 0;
+
+	/*
+	 * Attempt to perform the actual receive call.
+	 */
+	r = sockevent_recv_inner(sock, data, len, &off, ctl_data, ctl_inlen,
+	    ctl_len, addr, addr_len, user_endpt, flags, call);
+
+	/*
+	 * If the receive request succeeded, or it failed but yielded a
+	 * partial result, then return the (partial) result.  Otherwise, if an
+	 * error is pending, return that error.  Otherwise, return either a
+	 * regular error or 0 for EOF.
+	 */
+	if (r == OK || (r != SUSPEND && (off > 0 || *ctl_len > 0)))
+		r = (int)off;
+	else if (sock->sock_err != OK) {
+		assert(r != SUSPEND);
+
+		r = sock->sock_err;
+
+		sock->sock_err = OK;
+	} else if (r == SOCKEVENT_EOF)
+		r = 0;
+
+	return r;
+}
+
+/*
+ * Process an I/O control call.
+ */
+static int
+sockevent_ioctl(sockid_t id, unsigned long request,
+	const struct sockdriver_data * __restrict data, endpoint_t user_endpt,
+	const struct sockdriver_call * __restrict call __unused)
+{
+	struct sock *sock;
+	size_t size;
+	int r, val;
+
+	if ((sock = sockhash_get(id)) == NULL)
+		return EINVAL;
+
+	/* We handle a very small subset of generic IOCTLs here. */
+	switch (request) {
+	case FIONREAD:
+		size = 0;
+		if (!(sock->sock_flags & SFL_SHUT_RD) &&
+		    sock->sock_ops->sop_test_recv != NULL)
+			(void)sock->sock_ops->sop_test_recv(sock, 0, &size);
+
+		val = (int)size;
+
+		return sockdriver_copyout(data, 0, &val, sizeof(val));
+	}
+
+	if (sock->sock_ops->sop_ioctl == NULL)
+		return ENOTTY;
+
+	r = sock->sock_ops->sop_ioctl(sock, request, data, user_endpt);
+
+	/*
+	 * Suspending IOCTL requests is not currently supported by this
+	 * library, even though the VFS protocol and libsockdriver do support
+	 * it.  The reason is that IOCTLs do not match our process suspension
+	 * model: they could be neither queued nor repeated.
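+	 * For example, a suspended IOCTL would have to be re-issued in its
+	 * entirety upon resumption, and an IOCTL with side effects cannot
+	 * safely be repeated.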
+	 * For now, it seems that this feature is not needed by the socket
+	 * drivers either.  Thus, even though there are possible solutions, we
+	 * defer implementing them until we know what exactly is needed.
+	 */
+	if (r == SUSPEND)
+		panic("libsockevent: socket driver suspended IOCTL 0x%lx",
+		    request);
+
+	return r;
+}
+
+/*
+ * Set socket options.
+ */
+static int
+sockevent_setsockopt(sockid_t id, int level, int name,
+	const struct sockdriver_data * data, socklen_t len)
+{
+	struct sock *sock;
+	struct linger linger;
+	struct timeval tv;
+	clock_t secs, ticks;
+	int r, val;
+
+	if ((sock = sockhash_get(id)) == NULL)
+		return EINVAL;
+
+	if (level == SOL_SOCKET) {
+		/*
+		 * Handle a subset of the socket-level options here.  For most
+		 * of them, this means that the socket driver itself need not
+		 * handle changing or returning the options, but still needs
+		 * to implement the correct behavior based on them where
+		 * needed.  A few of them are handled exclusively in this
+		 * library: SO_ACCEPTCONN, SO_NOSIGPIPE, SO_ERROR, SO_TYPE,
+		 * SO_LINGER, SO_SNDLOWAT, SO_RCVLOWAT, SO_SNDTIMEO, and
+		 * SO_RCVTIMEO.  The SO_USELOOPBACK option is explicitly
+		 * absent, as it is valid for routing sockets only and is set
+		 * by default there.
+		 */
+		switch (name) {
+		case SO_DEBUG:
+		case SO_REUSEADDR:
+		case SO_KEEPALIVE:
+		case SO_DONTROUTE:
+		case SO_BROADCAST:
+		case SO_OOBINLINE:
+		case SO_REUSEPORT:
+		case SO_NOSIGPIPE:
+		case SO_TIMESTAMP:
+			/*
+			 * Simple on-off options.  Changing them does not
+			 * involve the socket driver.
+			 */
+			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
+			    len)) != OK)
+				return r;
+
+			if (val)
+				sock->sock_opt |= (unsigned int)name;
+			else
+				sock->sock_opt &= ~(unsigned int)name;
+
+			/*
+			 * In principle these on-off options are maintained in
+			 * this library, but some socket drivers may need to
+			 * apply the options elsewhere, so we notify them that
+			 * something has changed.  Using the sop_setsockopt
+			 * callback would be inconvenient for this for two
+			 * reasons: multiple value copy-ins and default errors.
+			 */
+			if (sock->sock_ops->sop_setsockmask != NULL)
+				sock->sock_ops->sop_setsockmask(sock,
+				    sock->sock_opt);
+
+			/*
+			 * The inlining of OOB data may make new data available
+			 * through regular receive calls.  Thus, see if we can
+			 * wake up any suspended receive calls now.
+			 */
+			if (name == SO_OOBINLINE && val)
+				sockevent_raise(sock, SEV_RECV);
+
+			return OK;
+
+		case SO_LINGER:
+			/* The only on-off option with an associated value. */
+			if ((r = sockdriver_copyin_opt(data, &linger,
+			    sizeof(linger), len)) != OK)
+				return r;
+
+			if (linger.l_onoff) {
+				if (linger.l_linger < 0)
+					return EINVAL;
+
+				/* EDOM is the closest applicable error. */
+				secs = (clock_t)linger.l_linger;
+				if (secs >= TMRDIFF_MAX / sys_hz())
+					return EDOM;
+
+				sock->sock_opt |= SO_LINGER;
+				sock->sock_linger = secs * sys_hz();
+			} else {
+				sock->sock_opt &= ~SO_LINGER;
+				sock->sock_linger = 0;
+			}
+
+			return OK;
+
+		case SO_SNDLOWAT:
+		case SO_RCVLOWAT:
+			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
+			    len)) != OK)
+				return r;
+
+			if (val <= 0)
+				return EINVAL;
+
+			/*
+			 * Setting these values may allow suspended operations
+			 * (send, recv, select) to be resumed, so recheck.
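+			 * For example, if a receive call is blocked on a
+			 * socket that already has some bytes available,
+			 * lowering SO_RCVLOWAT to or below that number allows
+			 * the blocked call to complete.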
+			 */
+			if (name == SO_SNDLOWAT) {
+				sock->sock_slowat = (size_t)val;
+
+				sockevent_raise(sock, SEV_SEND);
+			} else {
+				sock->sock_rlowat = (size_t)val;
+
+				sockevent_raise(sock, SEV_RECV);
+			}
+
+			return OK;
+
+		case SO_SNDTIMEO:
+		case SO_RCVTIMEO:
+			if ((r = sockdriver_copyin_opt(data, &tv, sizeof(tv),
+			    len)) != OK)
+				return r;
+
+			if (tv.tv_sec < 0 || tv.tv_usec < 0 ||
+			    (unsigned long)tv.tv_usec >= US)
+				return EINVAL;
+			if (tv.tv_sec >= TMRDIFF_MAX / sys_hz())
+				return EDOM;
+
+			ticks = tv.tv_sec * sys_hz() +
+			    (tv.tv_usec * sys_hz() + US - 1) / US;
+
+			if (name == SO_SNDTIMEO)
+				sock->sock_stimeo = ticks;
+			else
+				sock->sock_rtimeo = ticks;
+
+			/*
+			 * The timeouts for any calls already in progress for
+			 * this socket are left as is.
+			 */
+			return OK;
+
+		case SO_ACCEPTCONN:
+		case SO_ERROR:
+		case SO_TYPE:
+			/* These options may be retrieved but not set. */
+			return ENOPROTOOPT;
+
+		default:
+			/*
+			 * The remaining options either cannot be handled in a
+			 * generic way, or are not recognized altogether.  Pass
+			 * them to the socket driver, which should handle what
+			 * it knows and reject the rest.
+			 */
+			break;
+		}
+	}
+
+	if (sock->sock_ops->sop_setsockopt == NULL)
+		return ENOPROTOOPT;
+
+	/*
+	 * The socket driver must return ENOPROTOOPT for all options it does
+	 * not recognize.
+	 */
+	return sock->sock_ops->sop_setsockopt(sock, level, name, data, len);
+}
+
+/*
+ * Retrieve socket options.
+ */
+static int
+sockevent_getsockopt(sockid_t id, int level, int name,
+	const struct sockdriver_data * __restrict data,
+	socklen_t * __restrict len)
+{
+	struct sock *sock;
+	struct linger linger;
+	struct timeval tv;
+	clock_t ticks;
+	int val;
+
+	if ((sock = sockhash_get(id)) == NULL)
+		return EINVAL;
+
+	if (level == SOL_SOCKET) {
+		/*
+		 * As with setting, handle a subset of the socket-level
+		 * options here.  The rest is to be taken care of by the
+		 * socket driver.
+		 */
+		switch (name) {
+		case SO_DEBUG:
+		case SO_ACCEPTCONN:
+		case SO_REUSEADDR:
+		case SO_KEEPALIVE:
+		case SO_DONTROUTE:
+		case SO_BROADCAST:
+		case SO_OOBINLINE:
+		case SO_REUSEPORT:
+		case SO_NOSIGPIPE:
+		case SO_TIMESTAMP:
+			val = !!(sock->sock_opt & (unsigned int)name);
+
+			return sockdriver_copyout_opt(data, &val, sizeof(val),
+			    len);
+
+		case SO_LINGER:
+			linger.l_onoff = !!(sock->sock_opt & SO_LINGER);
+			linger.l_linger = sock->sock_linger / sys_hz();
+
+			return sockdriver_copyout_opt(data, &linger,
+			    sizeof(linger), len);
+
+		case SO_ERROR:
+			if ((val = -sock->sock_err) != OK)
+				sock->sock_err = OK;
+
+			return sockdriver_copyout_opt(data, &val, sizeof(val),
+			    len);
+
+		case SO_TYPE:
+			val = sock->sock_type;
+
+			return sockdriver_copyout_opt(data, &val, sizeof(val),
+			    len);
+
+		case SO_SNDLOWAT:
+			val = (int)sock->sock_slowat;
+
+			return sockdriver_copyout_opt(data, &val, sizeof(val),
+			    len);
+
+		case SO_RCVLOWAT:
+			val = (int)sock->sock_rlowat;
+
+			return sockdriver_copyout_opt(data, &val, sizeof(val),
+			    len);
+
+		case SO_SNDTIMEO:
+		case SO_RCVTIMEO:
+			if (name == SO_SNDTIMEO)
+				ticks = sock->sock_stimeo;
+			else
+				ticks = sock->sock_rtimeo;
+
+			tv.tv_sec = ticks / sys_hz();
+			tv.tv_usec = (ticks % sys_hz()) * US / sys_hz();
+
+			return sockdriver_copyout_opt(data, &tv, sizeof(tv),
+			    len);
+
+		default:
+			break;
+		}
+	}
+
+	if (sock->sock_ops->sop_getsockopt == NULL)
+		return ENOPROTOOPT;
+
+	/*
+	 * The socket driver must return ENOPROTOOPT for all options it does
+	 * not recognize.
+	 */
+	return sock->sock_ops->sop_getsockopt(sock, level, name, data, len);
+}
+
+/*
+ * Retrieve a socket's local address.
+ */
+static int
+sockevent_getsockname(sockid_t id, struct sockaddr * __restrict addr,
+	socklen_t * __restrict addr_len)
+{
+	struct sock *sock;
+
+	if ((sock = sockhash_get(id)) == NULL)
+		return EINVAL;
+
+	if (sock->sock_ops->sop_getsockname == NULL)
+		return EOPNOTSUPP;
+
+	return sock->sock_ops->sop_getsockname(sock, addr, addr_len);
+}
+
+/*
+ * Retrieve a socket's remote address.
+ */
+static int
+sockevent_getpeername(sockid_t id, struct sockaddr * __restrict addr,
+	socklen_t * __restrict addr_len)
+{
+	struct sock *sock;
+
+	if ((sock = sockhash_get(id)) == NULL)
+		return EINVAL;
+
+	/* Listening-mode sockets cannot possibly have a peer address. */
+	if (sock->sock_opt & SO_ACCEPTCONN)
+		return ENOTCONN;
+
+	if (sock->sock_ops->sop_getpeername == NULL)
+		return EOPNOTSUPP;
+
+	return sock->sock_ops->sop_getpeername(sock, addr, addr_len);
+}
+
+/*
+ * Mark the socket object as shut down for sending and/or receiving.  The
+ * flags parameter may be a bitwise-OR'ed combination of SFL_SHUT_RD and
+ * SFL_SHUT_WR.  This function will wake up any suspended requests affected
+ * by this change, but it will not invoke the sop_shutdown() callback
+ * function on the socket.  The function may in fact be called from
+ * sop_shutdown() before completion to mark the socket as shut down as
+ * reflected by sockevent_is_shutdown().
+ */
+void
+sockevent_set_shutdown(struct sock * sock, unsigned int flags)
+{
+	unsigned int mask;
+
+	assert(sock->sock_ops != NULL);
+	assert(!(flags & ~(SFL_SHUT_RD | SFL_SHUT_WR)));
+
+	/* Look at the newly set flags only. */
+	flags &= ~(unsigned int)sock->sock_flags;
+
+	if (flags != 0) {
+		sock->sock_flags |= flags;
+
+		/*
+		 * Wake up any blocked calls that are affected by the
+		 * shutdown.  Shutting down listening sockets causes ongoing
+		 * accept calls to be rechecked.
+		 */
+		mask = 0;
+		if (flags & SFL_SHUT_RD)
+			mask |= SEV_RECV;
+		if (flags & SFL_SHUT_WR)
+			mask |= SEV_SEND;
+		if (sock->sock_opt & SO_ACCEPTCONN)
+			mask |= SEV_ACCEPT;
+
+		assert(mask != 0);
+		sockevent_raise(sock, mask);
+	}
+}
+
+/*
+ * Shut down socket send and receive operations.
+ */
+static int
+sockevent_shutdown(sockid_t id, int how)
+{
+	struct sock *sock;
+	unsigned int flags;
+	int r;
+
+	if ((sock = sockhash_get(id)) == NULL)
+		return EINVAL;
+
+	/* Convert the request to a set of flags. */
+	flags = 0;
+	if (how == SHUT_RD || how == SHUT_RDWR)
+		flags |= SFL_SHUT_RD;
+	if (how == SHUT_WR || how == SHUT_RDWR)
+		flags |= SFL_SHUT_WR;
+
+	if (sock->sock_ops->sop_shutdown != NULL)
+		r = sock->sock_ops->sop_shutdown(sock, flags);
+	else
+		r = OK;
+
+	/* On success, update our internal state as well. */
+	if (r == OK)
+		sockevent_set_shutdown(sock, flags);
+
+	return r;
+}
+
+/*
+ * Close a socket.
+ */
+static int
+sockevent_close(sockid_t id, const struct sockdriver_call * call)
+{
+	struct sock *sock;
+	int r, force;
+
+	if ((sock = sockhash_get(id)) == NULL)
+		return EINVAL;
+
+	assert(sock->sock_proc == NULL);
+	sock->sock_select.ss_endpt = NONE;
+
+	/*
+	 * There are several scenarios when it comes to closing sockets.
+	 * First of all, we never actually force the socket driver to close a
+	 * socket.  The driver may always suspend the close call and take as
+	 * long as it wants.  After a suspension, it signals its completion of
+	 * the close through the SEV_CLOSE socket event.
+	 *
+	 * With that said, we offer two levels of urgency regarding the close
+	 * request: regular and forced.
+	 * The former allows for a graceful close; the latter urges the socket
+	 * driver to close the socket as soon as possible.  A socket that has
+	 * been requested to be closed gracefully can, as long as it is still
+	 * open (i.e., no SEV_CLOSE was fired yet), later be requested to be
+	 * closed forcefully.  This is how SO_LINGER with a nonzero timeout is
+	 * implemented.  If SO_LINGER is set with a zero timeout, the socket
+	 * is force-closed immediately.  Finally, if SO_LINGER is not set, the
+	 * socket will be closed normally and never be forced--akin to
+	 * SO_LINGER with an infinite timeout.
+	 *
+	 * The return value of the caller's close(2) may only ever be either
+	 * OK or EINPROGRESS, to ensure that the caller knows that the file
+	 * descriptor is freed up, as per Austin Group Defect #529.  In fact,
+	 * EINPROGRESS is to be returned only on signal interruption (i.e.,
+	 * cancel).  For that reason, this function only ever returns OK.
+	 */
+	force = ((sock->sock_opt & SO_LINGER) && sock->sock_linger == 0);
+
+	if (sock->sock_ops->sop_close != NULL)
+		r = sock->sock_ops->sop_close(sock, force);
+	else
+		r = OK;
+
+	assert(r == OK || r == SUSPEND);
+
+	if (r == SUSPEND) {
+		sock->sock_flags |= SFL_CLOSING;
+
+		/*
+		 * If we were requested to force-close the socket immediately,
+		 * but the socket driver needs more time anyway, then tell the
+		 * caller that the socket was closed right away.
+		 */
+		if (force)
+			return OK;
+
+		/*
+		 * If we are to force-close the socket only after a specific
+		 * linger timeout, set the timer for that now, even if the
+		 * call is non-blocking.  This also means that we cannot
+		 * associate the linger timeout with the close call.  Instead,
+		 * we convert the sock_linger value from a (relative) duration
+		 * to an (absolute) timeout time, and use the SFL_CLOSING flag
+		 * (along with SFL_TIMER) to tell the difference.  Since the
+		 * socket is otherwise unreachable from userland at this
+		 * point, the conversion is never visible in any way.
+		 *
+		 * The socket may already be in the timers list, so we must
+		 * always check the SO_LINGER flag before checking sock_linger.
+		 *
+		 * If SO_LINGER is not set, we must never suspend the call.
+		 */
+		if (sock->sock_opt & SO_LINGER) {
+			sock->sock_linger =
+			    socktimer_add(sock, sock->sock_linger);
+		} else
+			call = NULL;
+
+		/*
+		 * A non-blocking close is completed asynchronously.  The
+		 * caller is not told about this with EWOULDBLOCK as usual,
+		 * for the reasons mentioned above.
+		 */
+		if (call != NULL)
+			sockevent_suspend(sock, SEV_CLOSE, call, NONE);
+		else
+			r = OK;
+	} else if (r == OK)
+		sockevent_free(sock);
+
+	return r;
+}
+
+/*
+ * Cancel a suspended send request.
+ */
+static void
+sockevent_cancel_send(struct sock * sock, struct sockevent_proc * spr, int err)
+{
+	int r;
+
+	/*
+	 * If any regular or control data were sent, return the number of data
+	 * bytes sent--possibly zero.  Otherwise return the given error code.
+	 */
+	if (spr->spr_dataoff > 0 || spr->spr_ctloff > 0)
+		r = (int)spr->spr_dataoff;
+	else
+		r = err;
+
+	sockdriver_reply_generic(&spr->spr_call, r);
+
+	/*
+	 * In extremely rare circumstances, one send may be queued behind
+	 * another send even though the former can actually be sent on the
+	 * socket right away.  For this reason, we retry sending when
+	 * canceling a send.  We need to do this only when the first send in
+	 * the queue was canceled, but multiple blocked sends on a single
+	 * socket should be rare anyway.
+	 */
+	sockevent_raise(sock, SEV_SEND);
+}
+
+/*
+ * Cancel a suspended receive request.
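+ * As with canceled sends, a canceled receive call that already transferred
+ * some data reports its partial result rather than the error code: for
+ * example, a receive call interrupted by a signal after obtaining 100 bytes
+ * replies with 100 instead of EINTR.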
+ */
+static void
+sockevent_cancel_recv(struct sock * sock, struct sockevent_proc * spr, int err)
+{
+	int r;
+
+	/*
+	 * If any regular or control data were received, return the number of
+	 * data bytes received--possibly zero.  Otherwise return the given
+	 * error code.
+	 */
+	if (spr->spr_dataoff > 0 || spr->spr_ctloff > 0)
+		r = (int)spr->spr_dataoff;
+	else
+		r = err;
+
+	/*
+	 * Also return any flags set for the data received so far, e.g.
+	 * MSG_CTRUNC.  Do not return an address: receive calls on unconnected
+	 * sockets must never block after receiving some data--instead, they
+	 * are supposed to return MSG_TRUNC if not all data were copied out.
+	 */
+	sockdriver_reply_recv(&spr->spr_call, r, spr->spr_ctloff, NULL, 0,
+	    spr->spr_rflags);
+
+	/*
+	 * The same story as for sends (see above) applies to receives,
+	 * although this case should be even more rare in practice.
+	 */
+	sockevent_raise(sock, SEV_RECV);
+}
+
+/*
+ * Cancel a previous request that may currently be suspended.  The cancel
+ * operation itself does not have a reply.  Instead, if the given request was
+ * found to be suspended, that request must be aborted and an appropriate
+ * reply must be sent for the request.  If no matching request was found, no
+ * reply must be sent at all.
+ */
+static void
+sockevent_cancel(sockid_t id, const struct sockdriver_call * call)
+{
+	struct sockevent_proc *spr;
+	struct sock *sock;
+
+	/*
+	 * Due to asynchronous close(2) operations, it is possible that not
+	 * even the sock object can be found.  In this (entirely legitimate)
+	 * case, do not send any reply.
+	 */
+	if ((sock = sockhash_get(id)) == NULL)
+		return;
+
+	/*
+	 * The request may already have completed by the time we receive the
+	 * cancel request, in which case we can not find it.  In this
+	 * (entirely legitimate) case, do not send any reply.
+	 */
+	if ((spr = sockevent_unsuspend(sock, call)) == NULL)
+		return;
+
+	/*
+	 * We found the operation.  Cancel it according to its call type.
+	 * Then, once fully done with it, free the suspension data structure.
+	 *
+	 * Note that we have to use the call structure from the suspension
+	 * data structure rather than the given 'call' pointer: only the
+	 * former includes all the information necessary to resume the
+	 * request!
+	 */
+	switch (spr->spr_event) {
+	case SEV_BIND:
+	case SEV_CONNECT:
+		assert(spr->spr_call.sc_endpt != NONE);
+
+		sockdriver_reply_generic(&spr->spr_call, EINTR);
+
+		break;
+
+	case SEV_ACCEPT:
+		sockdriver_reply_accept(&spr->spr_call, EINTR, NULL, 0);
+
+		break;
+
+	case SEV_SEND:
+		sockevent_cancel_send(sock, spr, EINTR);
+
+		break;
+
+	case SEV_RECV:
+		sockevent_cancel_recv(sock, spr, EINTR);
+
+		break;
+
+	case SEV_CLOSE:
+		/*
+		 * Return EINPROGRESS rather than EINTR, so that the user
+		 * process can tell from the close(2) result that the file
+		 * descriptor has in fact been closed.
+		 */
+		sockdriver_reply_generic(&spr->spr_call, EINPROGRESS);
+
+		/*
+		 * Do not free the sock object here: the socket driver will
+		 * complete the close in the background, and fire SEV_CLOSE
+		 * once it is done.  Only then is the sock object freed.
+		 */
+		break;
+
+	default:
+		panic("libsockevent: process suspended on unknown event 0x%x",
+		    spr->spr_event);
+	}
+
+	sockevent_proc_free(spr);
+}
+
+/*
+ * Process a select request.
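+ * The requested operations are passed in as a bitmask of SDEV_OP_RD,
+ * SDEV_OP_WR, and SDEV_OP_ERR, corresponding to select(2)'s read, write, and
+ * exceptional categories.  For example, a SDEV_OP_RD query on a socket with
+ * data available is satisfied immediately, while on an idle socket it is
+ * saved for a later notification if (and only if) SDEV_NOTIFY is set.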
+ */
+static int
+sockevent_select(sockid_t id, unsigned int ops,
+	const struct sockdriver_select * sel)
+{
+	struct sock *sock;
+	unsigned int r, notify;
+
+	if ((sock = sockhash_get(id)) == NULL)
+		return EINVAL;
+
+	notify = (ops & SDEV_NOTIFY);
+	ops &= (SDEV_OP_RD | SDEV_OP_WR | SDEV_OP_ERR);
+
+	/*
+	 * See if any of the requested select operations can be satisfied
+	 * immediately.
+	 */
+	r = sockevent_test_select(sock, ops);
+
+	/*
+	 * If select operations were pending, the new results must not
+	 * indicate that any of those were satisfied, as that would indicate
+	 * an internal logic error: the socket driver is supposed to update
+	 * its state proactively, and thus, discovering that things have
+	 * changed here is not something that should ever happen.
+	 */
+	assert(!(sock->sock_selops & r));
+
+	/*
+	 * If any select operations are not satisfied immediately, and we are
+	 * asked to notify the caller when they are satisfied later, save them
+	 * for later retesting.
+	 */
+	ops &= ~r;
+
+	if (notify && ops != 0) {
+		/*
+		 * For now, we support only one caller when it comes to select
+		 * queries: VFS.  If we want to support a networked file
+		 * system (or so) directly calling select as well, this
+		 * library will have to be extended accordingly (should not be
+		 * too hard).
+		 */
+		if (sock->sock_select.ss_endpt != NONE) {
+			if (sock->sock_select.ss_endpt != sel->ss_endpt) {
+				printf("libsockevent: no support for multiple "
+				    "select callers yet\n");
+
+				return EIO;
+			}
+
+			/*
+			 * If a select query was already pending for this
+			 * caller, we must simply merge in the new operations.
+			 */
+			sock->sock_selops |= ops;
+		} else {
+			assert(sel->ss_endpt != NONE);
+
+			sock->sock_select = *sel;
+			sock->sock_selops = ops;
+		}
+	}
+
+	return r;
+}
+
+/*
+ * An alarm has triggered.  Expire any timers.  Socket drivers that do not
+ * pass clock notification messages to libsockevent must call expire_timers(3)
+ * themselves instead.
+ */
+static void
+sockevent_alarm(clock_t now)
+{
+
+	expire_timers(now);
+}
+
+static const struct sockdriver sockevent_tab = {
+	.sdr_socket = sockevent_socket,
+	.sdr_socketpair = sockevent_socketpair,
+	.sdr_bind = sockevent_bind,
+	.sdr_connect = sockevent_connect,
+	.sdr_listen = sockevent_listen,
+	.sdr_accept = sockevent_accept,
+	.sdr_send = sockevent_send,
+	.sdr_recv = sockevent_recv,
+	.sdr_ioctl = sockevent_ioctl,
+	.sdr_setsockopt = sockevent_setsockopt,
+	.sdr_getsockopt = sockevent_getsockopt,
+	.sdr_getsockname = sockevent_getsockname,
+	.sdr_getpeername = sockevent_getpeername,
+	.sdr_shutdown = sockevent_shutdown,
+	.sdr_close = sockevent_close,
+	.sdr_cancel = sockevent_cancel,
+	.sdr_select = sockevent_select,
+	.sdr_alarm = sockevent_alarm
+};
+
+/*
+ * Initialize the socket event library.
+ */
+void
+sockevent_init(sockevent_socket_cb_t socket_cb)
+{
+
+	sockhash_init();
+
+	socktimer_init();
+
+	sockevent_proc_init();
+
+	SIMPLEQ_INIT(&sockevent_pending);
+
+	assert(socket_cb != NULL);
+	sockevent_socket_cb = socket_cb;
+
+	/* Announce we are up. */
+	sockdriver_announce();
+
+	sockevent_working = FALSE;
+}
+
+/*
+ * Process a socket driver request message.
+ */
+void
+sockevent_process(const message * m_ptr, int ipc_status)
+{
+
+	/* Block events until after we have processed the request. */
+	assert(!sockevent_working);
+	sockevent_working = TRUE;
+
+	/* Actually process the request. */
+	sockdriver_process(&sockevent_tab, m_ptr, ipc_status);
+
+	/*
+	 * If any events were fired while processing the request, they will
+	 * have been queued for later.
+	 * Go through them now.
+	 */
+	if (sockevent_has_events())
+		sockevent_pump();
+
+	sockevent_working = FALSE;
+}
diff --git a/minix/lib/libsockevent/sockevent_proc.c b/minix/lib/libsockevent/sockevent_proc.c
new file mode 100644
index 000000000..df5e3a91f
--- /dev/null
+++ b/minix/lib/libsockevent/sockevent_proc.c
@@ -0,0 +1,52 @@
+/* libsockevent - sockevent_proc.c - process suspension state management */
+
+#include <minix/drivers.h>
+#include <minix/sockdriver.h>
+
+#include "sockevent_proc.h"
+
+static struct sockevent_proc sockevent_procs[NR_PROCS];
+static struct sockevent_proc *sockevent_freeprocs;
+
+/*
+ * Initialize the process suspension table.
+ */
+void
+sockevent_proc_init(void)
+{
+	unsigned int slot;
+
+	for (slot = 0; slot < __arraycount(sockevent_procs); slot++) {
+		sockevent_procs[slot].spr_next = sockevent_freeprocs;
+		sockevent_freeprocs = &sockevent_procs[slot];
+	}
+}
+
+/*
+ * Allocate and return a new socket process suspension entry.  Return NULL if
+ * no entries are available.
+ */
+struct sockevent_proc *
+sockevent_proc_alloc(void)
+{
+	struct sockevent_proc *spr;
+
+	if ((spr = sockevent_freeprocs) == NULL)
+		return NULL;
+
+	sockevent_freeprocs = spr->spr_next;
+	spr->spr_next = NULL;
+
+	return spr;
+}
+
+/*
+ * Free up a previously allocated socket process suspension entry for reuse.
+ */
+void
+sockevent_proc_free(struct sockevent_proc * spr)
+{
+
+	spr->spr_next = sockevent_freeprocs;
+	sockevent_freeprocs = spr;
+}
diff --git a/minix/lib/libsockevent/sockevent_proc.h b/minix/lib/libsockevent/sockevent_proc.h
new file mode 100644
index 000000000..5bb4593ba
--- /dev/null
+++ b/minix/lib/libsockevent/sockevent_proc.h
@@ -0,0 +1,25 @@
+#ifndef MINIX_SOCKEVENT_PROC_H
+#define MINIX_SOCKEVENT_PROC_H
+
+struct sockevent_proc {
+	struct sockevent_proc *spr_next;	/* next on sock or free list */
+	unsigned char spr_event;		/* event for call (SEV_) */
+	unsigned char spr_timer;		/* suspended call has timer? */
+	struct sockdriver_call spr_call;	/* call structure */
+	endpoint_t spr_endpt;			/* user endpoint */
+	struct sockdriver_packed_data spr_data;	/* regular data, packed */
+	size_t spr_datalen;			/* length of regular data */
+	size_t spr_dataoff;			/* offset into regular data */
+	struct sockdriver_packed_data spr_ctl;	/* control data, packed */
+	socklen_t spr_ctllen;			/* length of control data */
+	socklen_t spr_ctloff;			/* offset into control data */
+	int spr_flags;				/* send/recv flags (MSG_) */
+	int spr_rflags;				/* recv result flags (MSG_) */
+	clock_t spr_time;			/* timeout time for call */
+};
+
+void sockevent_proc_init(void);
+struct sockevent_proc *sockevent_proc_alloc(void);
+void sockevent_proc_free(struct sockevent_proc *);
+
+#endif /* !MINIX_SOCKEVENT_PROC_H */
diff --git a/share/mk/bsd.prog.mk b/share/mk/bsd.prog.mk
index 2e122a60e..87bdf50e7 100644
--- a/share/mk/bsd.prog.mk
+++ b/share/mk/bsd.prog.mk
@@ -238,6 +238,7 @@ LIB${_lib:tu}= ${DESTDIR}/usr/lib/lib${_lib:S/xx/++/:S/atf_c/atf-c/}.a
 	netsock \
 	sffs \
 	sockdriver \
+	sockevent \
 	sys \
 	timers \
 	usb \