./etc/system.conf.d/ipc minix-base
./etc/system.conf.d/lwip minix-base
./etc/system.conf.d/random minix-base
+./etc/system.conf.d/uds minix-base
./etc/system.conf.d/usb_hub minix-base
./etc/system.conf.d/usb_storage minix-base
./etc/termcap minix-base
./usr/man/man2/getgid.2 minix-man
./usr/man/man2/getitimer.2 minix-man
./usr/man/man2/getnucred.2 minix-man obsolete
-./usr/man/man2/getpeereid.2 minix-man
+./usr/man/man2/getpeereid.2 minix-man obsolete
./usr/man/man2/getpeername.2 minix-man
./usr/man/man2/getpid.2 minix-man
./usr/man/man2/getpriority.2 minix-man
./usr/man/man8/syslogd.8 minix-man
./usr/man/man8/tcpd.8 minix-man
./usr/man/man8/traceroute.8 minix-man
-./usr/man/man8/uds.8 minix-man
+./usr/man/man8/uds.8 minix-man obsolete
./usr/man/man8/unix.8 minix-man
./usr/man/man8/unlink.8 minix-man
./usr/man/man8/unstr.8 minix-man
uid 0; # only for copyfd(2)
};
-service uds
-{
- ipc
- SYSTEM vfs rs vm
- ;
- uid 0; # only for checkperms(2) and copyfd(2)
-};
-
service pty
{
system
# pty needs to know the "tty" group ID
up pty -dev /dev/ptmx -args "gid=`stat -f '%g' /dev/ptmx`"
- up uds -dev /dev/uds
+ up uds
up -n ipc
}
retry:
-#ifndef __minix
if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) == -1)
-#else
- if ((fd = socket(AF_UNIX, SOCK_SEQPACKET, 0)) == -1)
-#endif /* !defined(__minix) */
fatal("socket failed");
if (connect(fd, (struct sockaddr *) &sa, SUN_LEN(&sa)) == -1) {
}
unlink(sa.sun_path);
-#ifndef __minix
if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) == -1)
-#else
- if ((fd = socket(AF_UNIX, SOCK_SEQPACKET, 0)) == -1)
-#endif /* !defined(__minix) */
fatal("socket failed");
mask = umask(S_IXUSR|S_IXGRP|S_IRWXO);
char *cause;
/* The first client is special and gets a socketpair; create it. */
-#ifndef __minix
if (socketpair(AF_UNIX, SOCK_STREAM, PF_UNSPEC, pair) != 0)
-#else
- if (socketpair(AF_UNIX, SOCK_SEQPACKET, PF_UNSPEC, pair) != 0)
-#endif /* !defined(__minix) */
fatal("socketpair failed");
switch (fork()) {
#include "reentrant.h"
#include "extern.h"
-#if defined(__minix)
-#include <sys/ioctl.h>
-#endif /* defined(__minix) */
-
#ifdef __weak_alias
__weak_alias(closelog,_closelog)
__weak_alias(openlog,_openlog)
* to give syslogd a chance to empty its socket buffer.
*/
for (tries = 0; tries < MAXTRIES; tries++) {
-#if defined(__minix)
- if (write(data->log_file, tbuf, cnt) != -1)
-#else
if (send(data->log_file, tbuf, cnt, 0) != -1)
-#endif /* defined(__minix) */
break;
if (errno != ENOBUFS) {
disconnectlog_r(data);
/* AF_UNIX address of local logger */
static const struct sockaddr_un sun = {
.sun_family = AF_LOCAL,
-#if !defined(__minix)
.sun_len = sizeof(sun),
-#endif /* !defined(__minix) */
.sun_path = _PATH_LOG,
};
data->log_connected = 0;
}
if (!data->log_connected) {
-#if defined(__minix)
- if(ioctl(data->log_file, NWIOSUDSTADDR, __UNCONST(&sun)) < 0)
-
-#else
if (connect(data->log_file,
(const struct sockaddr *)(const void *)&sun,
(socklen_t)sizeof(sun)) == -1)
-#endif /* defined(__minix) */
{
(void)close(data->log_file);
data->log_file = -1;
# @(#)Makefile.inc 8.2 (Berkeley) 9/5/93
# net sources
-.if defined(__MINIX)
-.PATH: ${NETBSDSRCDIR}/minix/lib/libc/net
-
-CPPFLAGS.getpeereid.c+= -D_MINIX_SYSTEM=1
-CPPFLAGS.getsockopt.c+= -D_MINIX_SYSTEM=1
-CPPFLAGS.setsockopt.c+= -D_MINIX_SYSTEM=1
-.endif
.PATH: ${ARCHDIR}/net ${.CURDIR}/net
SRCS+= base64.c ethers.c gethnamaddr.c getifaddrs.c \
17,0)
des="hello" dev=hello
;;
- 18,0)
- des="UNIX domain socket" dev=uds
- ;;
5[6-9],0|6[0-3],0)
drive=`expr $major - 56`
des="vnode disk $drive" dev=vnd$drive
ttypa ttypb ttypc ttypd ttype ttypf
ttyq0 ttyq1 ttyq2 ttyq3 ttyq4 ttyq5 ttyq6 ttyq7 ttyq8 ttyq9
ttyqa ttyqb ttyqc ttyqd ttyqe ttyqf
- uds
vnd0 vnd0p0 vnd0p0s0 vnd1 vnd1p0 vnd1p0s0
vnd2 vnd3 vnd4 vnd5 vnd6 vnd7
"
klog # Make /dev/klog
ptmx # Make /dev/ptmx
random # Make /dev/random, /dev/urandom
- uds # Make /dev/uds
filter # Make /dev/filter
fbd # Make /dev/fbd
hello # Make /dev/hello
makedev ${dev} c 4 ${minor} ${uname} tty ${permissions}
;;
- uds)
- # Unix domain sockets device
- makedev ${dev} c 18 0 ${uname} ${gname} 666
- ;;
vnd[0-7])
# Whole vnode disk devices.
makedev ${dev} b ${major} 0 ${uname} ${gname} ${permissions}
#define LOG_MAJOR 15 /* 15 = /dev/klog (log driver) */
#define RANDOM_MAJOR 16 /* 16 = /dev/random (random driver) */
#define HELLO_MAJOR 17 /* 17 = /dev/hello (hello driver) */
-#define UDS_MAJOR 18 /* 18 = /dev/uds (pfs) */
-#define FB_MAJOR 19 /* 18 = /dev/fb0 (fb driver) */
+ /* 18 = (unused) */
+#define FB_MAJOR 19 /* 19 = /dev/fb0 (fb driver) */
#define I2C0_MAJOR 20 /* 20 = /dev/i2c-1 (i2c-dev) */
#define I2C1_MAJOR 21 /* 21 = /dev/i2c-2 (i2c-dev) */
#define I2C2_MAJOR 22 /* 22 = /dev/i2c-3 (i2c-dev) */
gid_t getngid(endpoint_t proc_ep);
int getsockcred(endpoint_t proc_ep, struct sockcred * sockcred, gid_t * groups,
int ngroups);
-int socketpath(endpoint_t endpt, char *path, size_t size, int what, dev_t *dev,
- ino_t *ino);
+int socketpath(endpoint_t endpt, const char *path, size_t size, int what,
+ dev_t *dev, ino_t *ino);
#define SPATH_CHECK 0 /* check user permissions on socket path */
#define SPATH_CREATE 1 /* create socket file at given path */
-#define SPATH_CANONIZE 0x8000 /* copy back canonized path (legacy support) */
int copyfd(endpoint_t endpt, int fd, int what);
#define COPYFD_FROM 0 /* copy file descriptor from remote process */
#define COPYFD_TO 1 /* copy file descriptor to remote process */
+++ /dev/null
-#include <errno.h>
-#include <string.h>
-#include <unistd.h>
-#include <sys/socket.h>
-#include <sys/ucred.h>
-
-/*
- * get the effective user ID and effective group ID of a peer
- * connected through a Unix domain socket.
- */
-int getpeereid(int sd, uid_t *euid, gid_t *egid) {
- int rc;
- struct uucred cred;
- socklen_t ucred_length;
-
- /* Initialize Data Structures */
- ucred_length = sizeof(struct uucred);
- memset(&cred, '\0', ucred_length);
-
- /* Validate Input Parameters */
- if (euid == NULL || egid == NULL) {
- errno = EFAULT;
- return -1;
- } /* getsockopt will handle validating 'sd' */
-
- /* Get the credentials of the peer at the other end of 'sd' */
- rc = getsockopt(sd, SOL_SOCKET, SO_PEERCRED, &cred, &ucred_length);
- if (rc == 0) {
- /* Success - return the results */
- *euid = cred.cr_uid;
- *egid = cred.cr_gid;
- return 0;
- } else {
- /* Failure - getsockopt takes care of setting errno */
- return -1;
- }
-}
return 0;
}
+#ifdef SO_PEERCRED
if (level == SOL_SOCKET && option_name == SO_PEERCRED)
{
struct uucred cred;
option_len);
return 0;
}
+#endif
if (level == SOL_SOCKET && option_name == SO_REUSEADDR)
return 0;
}
+#ifdef SO_PASSCRED
if (level == SOL_SOCKET && option_name == SO_PASSCRED)
{
i = 1; /* option is always 'on' */
getsockopt_copy(&i, sizeof(i), option_value, option_len);
return 0;
}
+#endif
#if DEBUG
fprintf(stderr, "_uds_getsocketopt: level %d, name %d\n",
return 0;
}
+#ifdef SO_PASSCRED
if (level == SOL_SOCKET && option_name == SO_PASSCRED)
{
if (option_len != sizeof(i))
}
return 0;
}
+#endif
#if DEBUG
fprintf(stderr, "_uds_setsocketopt: level %d, name %d\n",
#include <minix/safecopies.h>
int
-socketpath(endpoint_t endpt, char * path, size_t size, int what, dev_t * dev,
- ino_t * ino)
+socketpath(endpoint_t endpt, const char * path, size_t size, int what,
+ dev_t * dev, ino_t * ino)
{
cp_grant_id_t grant;
message m;
int r;
if ((grant = cpf_grant_direct(VFS_PROC_NR, (vir_bytes)path, size,
- CPF_READ | CPF_WRITE)) == GRANT_INVALID)
+ CPF_READ)) == GRANT_INVALID)
return ENOMEM;
memset(&m, 0, sizeof(m));
m.m_lsys_vfs_socketpath.endpt = endpt;
m.m_lsys_vfs_socketpath.grant = grant;
m.m_lsys_vfs_socketpath.count = size;
- m.m_lsys_vfs_socketpath.what = what | SPATH_CANONIZE;
+ m.m_lsys_vfs_socketpath.what = what;
r = _taskcall(VFS_PROC_NR, VFS_SOCKETPATH, &m);
MAN= accept.2 access.2 bind.2 brk.2 chdir.2 chmod.2 chown.2 \
chroot.2 close.2 connect.2 creat.2 dup.2 execve.2 exit.2 fcntl.2 \
- fork.2 getgid.2 getitimer.2 getpeereid.2 \
+ fork.2 getgid.2 getitimer.2 \
getpeername.2 getpid.2 getpriority.2 getsockname.2 getsockopt.2 \
gettimeofday.2 getuid.2 intro.2 ioctl.2 kill.2 link.2 listen.2 \
lseek.2 mkdir.2 mknod.2 mount.2 open.2 ptrace.2 \
+++ /dev/null
-.TH GETPEEREID 2
-.SH NAME
-getpeereid \- get the effective user ID and effective group ID of a peer
-connected through a Unix domain socket.
-.SH SYNOPSIS
-.ft B
-#include <sys/socket.h>
-
-.in +5
-.ti -5
-int getpeereid(int \fIsd\fP, uid_t *\fIeuid\fP, gid_t *\fIegid\fP);
-.br
-.ft P
-.SH DESCRIPTION
-getpeereid() is often used to authenticate clients connecting to a
-server through a Unix domain socket. The server can call this function
-with a socket descriptor \fIsd\fP and this function will fill\-in
-\fIeuid\fP and \fIegid\fP with the effective user ID and the effective
-group ID of the client process.
-.SH RETURN VALUES
-On success, this function returns 0, \fIeuid\fP is set to the effective
-user ID of the peer connected through Unix domain socket \fIsd\fP, and
-\fIegid\fP is set to the effective group ID of the peer connected
-through Unix domain socket \fIsd\fP. On error, -1 is returned and
-\fIerrno\fP is set.
-.SH ERRORS
-.TP 15
-[EBADF]
-The argument \fIsd\fP is not a descriptor.
-.TP 15
-[ENOTSOCK]
-The argument \fIsd\fP is a descriptor, but not a socket descriptor.
-.TP 15
-[EFAULT]
-The address pointed to by \fIeuid\fP and/or \fIegid\fP is not in a
-valid part of the process address space.
-.SH SEE ALSO
-.BR socket(2),
-.BR socketpair(2),
-.BR unix(8)
-.SH HISTORY
-This function first appeared in Minix 3.1.8.
# Makefile for the UNIX Domain Sockets driver (UDS)
PROG= uds
-SRCS= uds.c ioc_uds.c
-MAN= uds.8 unix.8
+SRCS= uds.c io.c stat.c
+MAN= unix.8
-DPADD+= ${LIBCHARDRIVER} ${LIBSYS}
-LDADD+= -lchardriver -lsys
+FILES=${PROG}.conf
+FILESNAME=${PROG}
+FILESDIR= /etc/system.conf.d
+
+DPADD+= ${LIBSOCKEVENT} ${LIBSOCKDRIVER} ${LIBSYS} ${LIBTIMERS}
+LDADD+= -lsockevent -lsockdriver -lsys -ltimers
+
+WARNS?= 5
.include <minix.service.mk>
--- /dev/null
+/* UNIX Domain Sockets - io.c - sending and receiving */
+
+#include "uds.h"
+#include <sys/mman.h>
+
+/*
+ * Our UDS sockets do not have a send buffer. They only have a receive buffer.
+ * This receive buffer, when not empty, is split up in segments. Each segment
+ * may contain regular data, ancillary data, both, or (for SOCK_SEQPACKET and
+ * (SOCK_DGRAM) neither. There are two types of ancillary data: in-flight file
+ * descriptors and sender credentials. In addition, for SOCK_DGRAM sockets,
+ * the segment may contain the sender's socket path (if the sender's socket is
+ * bound). Each segment has has a header, containing the full segment size,
+ * the size of the actual data in the segment (if any), and a flags field that
+ * states which ancillary are associated with the segment (if any). For
+ * SOCK_STREAM type sockets, new data may be merged into a previous segment,
+ * but only if it has no ancillary data. For the other two socket types, each
+ * packet has its own header. The resulting behavior should be in line with
+ * the POSIX "Socket Receive Queue" specification.
+ *
+ * More specifically, each segment consists of the following parts:
+ * - always a five-byte header, containing a two-byte segment length (including
+ * the header, so always non-zero), a two-byte regular data length (zero or
+ * more), and a one-byte flags field which is a bitwise combination of
+ * UDS_HAS_{FD,CRED,PATH} flags;
+ * - next, if UDS_HAS_CRED is set in the segment header: a sockcred structure;
+ * since this structure is variable-size, the structure is prepended by a
+ * single byte that contains the length of the structure (excluding the byte
+ * itself, thus ranging from sizeof(struct sockcred) to UDS_MAXCREDLEN);
+ * - next, if UDS_HAS_PATH is set in the segment header:
+ * - next, if the data length is non-zero, the actual regular data.
+ * If the segment is not the last in the receive buffer, it is followed by the
+ * next segment immediately afterward. There is no alignment.
+ *
+ * It is the sender's responsibility to merge new data into the last segment
+ * whenever possible, so that the receiver side never needs to consider more
+ * than one segment at once. In order to allow such merging, each receive
+ * buffer has not only a tail and in-use length (pointing to the head when
+ * combined) but also an offset from the tail to the last header, if any. Note
+ * that the receiver may over time still look at multiple segments for a single
+ * request: this happens when a MSG_WAITALL request empties the buffer and then
+ * blocks - the next piece of arriving data can then obviously not be merged.
+ *
+ * If a segment has the UDS_HAS_FD flag set, then one or more in-flight file
+ * descriptors are associated with the segment. These are stored in a separate
+ * data structure, mainly to simplify cleaning up when the socket is shut down
+ * for reading or closed. That structure also contains the number of file
+ * descriptors associated with the current segment, so this is not stored in
+ * the segment itself. As mentioned later, this may be changed in the future.
+ *
+ * On the sender side, there is a trade-off between fully utilizing the receive
+ * buffer, and not repeatedly performing expensive actions for the same call:
+ * it may be costly to determine exactly how many in-flight file descriptors
+ * there will be (if any) and/or how much space is needed to store credentials.
+ * We currently use the policy that we rather block/reject a send request that
+ * may (just) have fit in the remaining part of the receive buffer, than obtain
+ * the same information multiple times or keep state between callbacks. In
+ * practice this is not expected to make a difference, especially since
+ * transfer of ancillary data should be rare anyway.
+ */
+/*
+ * The current layout of the segment header is as follows.
+ *
+ * The first byte contains the upper eight bits of the total segment length.
+ * The second byte contains the lower eight bits of the total segment length.
+ * The third byte contains the upper eight bits of the data length.
+ * The fourth byte contains the lower eight bits of the data length.
+ * The fifth byte is a bitmask for ancillary data associated with the segment.
+ */
+#define UDS_HDRLEN 5
+
+#define UDS_HAS_FDS 0x01 /* segment has in-flight file descriptors */
+#define UDS_HAS_CRED 0x02 /* segment has sender credentials */
+#define UDS_HAS_PATH 0x04 /* segment has source socket path */
+
+#define UDS_MAXCREDLEN SOCKCREDSIZE(NGROUPS_MAX)
+
+#define uds_get_head(uds) \
+ ((size_t)((uds)->uds_tail + (uds)->uds_len) % UDS_BUF)
+#define uds_get_last(uds) \
+ ((size_t)((uds)->uds_tail + (uds)->uds_last) % UDS_BUF)
+#define uds_advance(pos,add) (((pos) + (add)) % UDS_BUF)
+
+/*
+ * All in-flight file descriptors are (co-)owned by the UDS driver itself, as
+ * local open file descriptors. Like any other process, the UDS driver can not
+ * have more than OPEN_MAX open file descriptors at any time. Thus, this is
+ * also the inherent maximum number of in-flight file descriptors. Therefore,
+ * we maintain a single pool of in-flight FD structures, and we associate these
+ * structures with sockets as needed.
+ */
+static struct uds_fd uds_fds[OPEN_MAX];
+static SIMPLEQ_HEAD(uds_freefds, uds_fd) uds_freefds;
+
+static char uds_ctlbuf[UDS_CTL_MAX];
+static int uds_ctlfds[UDS_CTL_MAX / sizeof(int)];
+
+/*
+ * Initialize the input/output part of the UDS service.
+ */
+void
+uds_io_init(void)
+{
+ unsigned int slot;
+
+ SIMPLEQ_INIT(&uds_freefds);
+
+ for (slot = 0; slot < __arraycount(uds_fds); slot++)
+ SIMPLEQ_INSERT_TAIL(&uds_freefds, &uds_fds[slot], ufd_next);
+}
+
+/*
+ * Set up all input/output state for the given socket, which has just been
+ * allocated. As part of this, allocate memory for the receive buffer of the
+ * socket. Return OK or a negative error code.
+ */
+int
+uds_io_setup(struct udssock * uds)
+{
+
+ /* TODO: decide if we should preallocate the memory. */
+ if ((uds->uds_buf = mmap(NULL, UDS_BUF, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE, -1, 0)) == MAP_FAILED)
+ return ENOMEM;
+
+ uds->uds_tail = 0;
+ uds->uds_len = 0;
+ uds->uds_last = 0;
+
+ SIMPLEQ_INIT(&uds->uds_fds);
+
+ return OK;
+}
+
+/*
+ * Clean up the input/output state for the given socket, which is about to be
+ * freed. As part of this, deallocate memory for the receive buffer and close
+ * any file descriptors still in flight on the socket.
+ */
+void
+uds_io_cleanup(struct udssock * uds)
+{
+
+ /* Close any in-flight file descriptors. */
+ uds_io_reset(uds);
+
+ /* Free the receive buffer memory. */
+ if (munmap(uds->uds_buf, UDS_BUF) != 0)
+ panic("UDS: munmap failed: %d", errno);
+}
+
+/*
+ * The socket is being closed or shut down for reading. If there are still any
+ * in-flight file descriptors, theey will never be received anymore, so close
+ * them now.
+ */
+void
+uds_io_reset(struct udssock * uds)
+{
+ struct uds_fd *ufd;
+
+ /*
+ * The UDS service may have the last and only reference to any of these
+ * file descriptors here. For that reason, we currently disallow
+ * transfer of UDS file descriptors, because the close(2) here could
+ * block on a socket close operation back to us, leading to a deadlock.
+ * Also, we use a non-blocking variant of close(2), to prevent that we
+ * end up hanging on sockets with SO_LINGER turned on.
+ */
+ SIMPLEQ_FOREACH(ufd, &uds->uds_fds, ufd_next) {
+ dprintf(("UDS: closing local fd %d\n", ufd->ufd_fd));
+
+ closenb(ufd->ufd_fd);
+ }
+
+ SIMPLEQ_CONCAT(&uds_freefds, &uds->uds_fds);
+
+ /*
+ * If this reset happens as part of a shutdown, it might be done
+ * again on close, so ensure that it will find a clean state. The
+ * receive buffer should never be looked at again either way, but reset
+ * it too just to be sure.
+ */
+ uds->uds_tail = 0;
+ uds->uds_len = 0;
+ uds->uds_last = 0;
+
+ SIMPLEQ_INIT(&uds->uds_fds);
+}
+
+/*
+ * Return the maximum usable part of the receive buffer, in bytes. The return
+ * value is used for the SO_SNDBUF and SO_RCVBUF socket options.
+ */
+size_t
+uds_io_buflen(void)
+{
+
+ /*
+ * TODO: it would be nicer if at least for SOCK_STREAM-type sockets, we
+ * could use the full receive buffer for data. This would require that
+ * we store up to one header in the socket object rather than in the
+ * receive buffer.
+ */
+ return UDS_BUF - UDS_HDRLEN;
+}
+
+/*
+ * Fetch 'len' bytes starting from absolute position 'pos' into the receive
+ * buffer of socket 'uds', and copy them into the buffer pointed to by 'ptr'.
+ * Return the absolute position of the first byte after the fetched data in the
+ * receive buffer.
+ */
+static size_t
+uds_fetch(struct udssock * uds, size_t off, void * ptr, size_t len)
+{
+ size_t left;
+
+ assert(off < UDS_BUF);
+
+ left = UDS_BUF - off;
+ if (len >= left) {
+ memcpy(ptr, &uds->uds_buf[off], left);
+
+ if ((len -= left) > 0)
+ memcpy((char *)ptr + left, &uds->uds_buf[0], len);
+
+ return len;
+ } else {
+ memcpy(ptr, &uds->uds_buf[off], len);
+
+ return off + len;
+ }
+}
+
+/*
+ * Store 'len' bytes from the buffer pointed to by 'ptr' into the receive
+ * buffer of socket 'uds', starting at absolute position 'pos' into the receive
+ * buffer. Return the absolute position of the first byte after the stored
+ * data in the receive buffer.
+ */
+static size_t
+uds_store(struct udssock * uds, size_t off, const void * ptr, size_t len)
+{
+ size_t left;
+
+ assert(off < UDS_BUF);
+
+ left = UDS_BUF - off;
+ if (len >= left) {
+ memcpy(&uds->uds_buf[off], ptr, left);
+
+ if ((len -= left) > 0)
+ memcpy(&uds->uds_buf[0], (const char *)ptr + left,
+ len);
+
+ return len;
+ } else {
+ memcpy(&uds->uds_buf[off], ptr, len);
+
+ return off + len;
+ }
+}
+
+/*
+ * Fetch a segment header previously stored in the receive buffer of socket
+ * 'uds' at absolute position 'off'. Return the absolute position of the first
+ * byte after the header, as well as the entire segment length in 'seglen', the
+ * length of the data in the segment in 'datalen', and the segment flags in
+ * 'segflags'.
+ */
+static size_t
+uds_fetch_hdr(struct udssock * uds, size_t off, size_t * seglen,
+ size_t * datalen, unsigned int * segflags)
+{
+ unsigned char hdr[UDS_HDRLEN];
+
+ off = uds_fetch(uds, off, hdr, sizeof(hdr));
+
+ *seglen = ((size_t)hdr[0] << 8) | (size_t)hdr[1];
+ *datalen = ((size_t)hdr[2] << 8) | (size_t)hdr[3];
+ *segflags = hdr[4];
+
+ assert(*seglen >= UDS_HDRLEN);
+ assert(*seglen <= uds->uds_len);
+ assert(*datalen <= *seglen - UDS_HDRLEN);
+ assert(*segflags != 0 || *datalen == *seglen - UDS_HDRLEN);
+ assert(!(*segflags & ~(UDS_HAS_FDS | UDS_HAS_CRED | UDS_HAS_PATH)));
+
+ return off;
+}
+
+/*
+ * Store a segment header in the receive buffer of socket 'uds' at absolute
+ * position 'off', with the segment length 'seglen', the segment data length
+ * 'datalen', and the segment flags 'segflags'. Return the absolute receive
+ * buffer position of the first data byte after the stored header.
+ */
+static size_t
+uds_store_hdr(struct udssock * uds, size_t off, size_t seglen, size_t datalen,
+ unsigned int segflags)
+{
+ unsigned char hdr[UDS_HDRLEN];
+
+ assert(seglen <= USHRT_MAX);
+ assert(datalen <= seglen);
+ assert(segflags <= UCHAR_MAX);
+ assert(!(segflags & ~(UDS_HAS_FDS | UDS_HAS_CRED | UDS_HAS_PATH)));
+
+ hdr[0] = (seglen >> 8) & 0xff;
+ hdr[1] = seglen & 0xff;
+ hdr[2] = (datalen >> 8) & 0xff;
+ hdr[3] = datalen & 0xff;
+ hdr[4] = segflags;
+
+ return uds_store(uds, off, hdr, sizeof(hdr));
+}
+
+/*
+ * Perform initial checks on a send request, before it may potentially be
+ * suspended. Return OK if this send request is valid, or a negative error
+ * code if it is not.
+ */
+int
+uds_pre_send(struct sock * sock, size_t len, socklen_t ctl_len __unused,
+ const struct sockaddr * addr, socklen_t addr_len __unused,
+ endpoint_t user_endpt __unused, int flags)
+{
+ struct udssock *uds = (struct udssock *)sock;
+ size_t pathlen;
+
+ /*
+ * Reject calls with unknown flags. Besides the flags handled entirely
+ * by libsockevent (which are not part of 'flags' here), that is all of
+ * them. TODO: ensure that we should really reject all other flags
+ * rather than ignore them.
+ */
+ if (flags != 0)
+ return EOPNOTSUPP;
+
+ /*
+ * Perform very basic address and message size checks on the send call.
+ * For non-stream sockets, we must reject packets that may never fit in
+ * the receive buffer, or otherwise (at least for SOCK_SEQPACKET) the
+ * send call may end up being suspended indefinitely. Therefore, we
+ * assume the worst-case scenario, which is that a full set of
+ * credentials must be associated with the packet. As a result, we may
+ * reject some large packets that could actually just fit. Checking
+ * the peer's LOCAL_CREDS setting here is not safe: even if we know the
+ * peer already at all (for SOCK_DGRAM we do not), the send may still
+ * block and the option toggled before it unblocks.
+ */
+ switch (uds_get_type(uds)) {
+ case SOCK_STREAM:
+ /* Nothing to check for this case. */
+ break;
+
+ case SOCK_SEQPACKET:
+ if (len > UDS_BUF - UDS_HDRLEN - 1 - UDS_MAXCREDLEN)
+ return EMSGSIZE;
+
+ break;
+
+ case SOCK_DGRAM:
+ if (!uds_has_link(uds) && addr == NULL)
+ return EDESTADDRREQ;
+
+ /*
+ * The path is stored without null terminator, but with leading
+ * byte containing the path length--if there is a path at all.
+ */
+ pathlen = (size_t)uds->uds_pathlen;
+ if (pathlen > 0)
+ pathlen++;
+
+ if (len > UDS_BUF - UDS_HDRLEN - pathlen - 1 - UDS_MAXCREDLEN)
+ return EMSGSIZE;
+
+ break;
+
+ default:
+ assert(0);
+ }
+
+ return OK;
+}
+
+/*
+ * Determine whether the (real or pretend) send request should be processed
+ * now, suspended until later, or rejected based on the current socket state.
+ * Return OK if the send request should be processed now. Return SUSPEND if
+ * the send request should be retried later. Return an appropriate negative
+ * error code if the send request should fail.
+ */
+static int
+uds_send_test(struct udssock * uds, size_t len, socklen_t ctl_len, size_t min,
+ int partial)
+{
+ struct udssock *conn;
+ size_t avail, hdrlen, credlen;
+
+ assert(!uds_is_shutdown(uds, SFL_SHUT_WR));
+
+ if (uds_get_type(uds) != SOCK_DGRAM) {
+ if (uds_is_connecting(uds))
+ return SUSPEND;
+ if (!uds_is_connected(uds) && !uds_is_disconnected(uds))
+ return ENOTCONN;
+ if (!uds_has_conn(uds))
+ return EPIPE;
+
+ conn = uds->uds_conn;
+
+ if (uds_is_shutdown(conn, SFL_SHUT_RD))
+ return EPIPE;
+
+ /*
+ * For connection-type sockets, we now have to check if there
+ * is enough room in the receive buffer. For SOCK_STREAM
+ * sockets, we must check if at least 'min' bytes can be moved
+ * into the receive buffer, at least if that is a reasonable
+ * value for ever making any forward progress at all. For
+ * SOCK_SEQPACKET sockets, we must check if the entire packet
+ * of size 'len' can be stored in the receive buffer. In both
+ * cases, we must take into account any metadata to store along
+ * with the data.
+ *
+ * Unlike in uds_pre_send(), we can now check safely whether
+ * the peer is expecting credentials, but we still don't know
+ * the actual size of the credentials, so again we take the
+ * maximum possible size. The same applies to file descriptors
+ * transferred via control data: all we have the control length
+ * right now, which if non-zero we assume to mean there might
+ * be file descriptors.
+ *
+ * In both cases, the reason of overestimating is that actually
+ * getting accurate sizes, by obtaining credentials or copying
+ * in control data, is very costly. We want to do that only
+ * when we are sure we will not suspend the send call after
+ * all. It is no problem to overestimate how much space will
+ * be needed here, but not to underestimate: that could cause
+ * applications that use select(2) and non-blocking sockets to
+ * end up in a busy-wait loop.
+ */
+ if (!partial && (conn->uds_flags & UDSF_PASSCRED))
+ credlen = 1 + UDS_MAXCREDLEN;
+ else
+ credlen = 0;
+
+ avail = UDS_BUF - conn->uds_len;
+
+ if (uds_get_type(uds) == SOCK_STREAM) {
+ /*
+ * Limit the low threshold to the maximum that can ever
+ * be sent at once.
+ */
+ if (min > UDS_BUF - UDS_HDRLEN - credlen)
+ min = UDS_BUF - UDS_HDRLEN - credlen;
+
+ /*
+ * Suspend the call only if not even the low threshold
+ * is met. Otherwise we may make (partial) progress.
+ */
+ if (len > min)
+ len = min;
+
+ /*
+ * If the receive buffer already has at least one
+ * segment, and there are certainly no file descriptors
+ * to transfer now, and we do not have to store
+ * credentials either, then this segment can be merged
+ * with the previous one. In that case, we need no
+ * space for a header. That is certainly the case if
+ * we are resuming an already partially completed send.
+ */
+ hdrlen = (avail == UDS_BUF || ctl_len != 0 ||
+ credlen > 0) ? UDS_HDRLEN : 0;
+ } else
+ hdrlen = UDS_HDRLEN;
+
+ if (avail < hdrlen + credlen + len)
+ return SUSPEND;
+ }
+
+ return OK;
+}
+
+/*
+ * Get the destination peer for a send request. The send test has already been
+ * performed first. On success, return OK, with a pointer to the peer socket
+ * stored in 'peerp'. On failure, return an appropriate error code.
+ */
+static int
+uds_send_peer(struct udssock * uds, const struct sockaddr * addr,
+ socklen_t addr_len, endpoint_t user_endpt, struct udssock ** peerp)
+{
+ struct udssock *peer;
+ int r;
+
+ if (uds_get_type(uds) == SOCK_DGRAM) {
+ if (!uds_has_link(uds)) {
+ /* This was already checked in uds_pre_check(). */
+ assert(addr != NULL);
+
+ /*
+ * Find the socket identified by the given address.
+ * If it exists at all, see if it is a proper match.
+ */
+ if ((r = uds_lookup(uds, addr, addr_len, user_endpt,
+ &peer)) != OK)
+ return r;
+
+ /*
+ * If the peer socket is connected to a target, it
+ * must be this socket. Unfortunately, POSIX does not
+ * specify an error code for this. We borrow Linux's.
+ */
+ if (uds_has_link(peer) && peer->uds_link != uds)
+ return EPERM;
+ } else
+ peer = uds->uds_link;
+
+ /*
+ * If the receiving end will never receive this packet, we
+ * might as well not send it, so drop it immeiately. Indicate
+ * as such to the caller, using NetBSD's chosen error code.
+ */
+ if (uds_is_shutdown(peer, SFL_SHUT_RD))
+ return ENOBUFS;
+ } else {
+ assert(uds_has_conn(uds));
+
+ peer = uds->uds_conn;
+ }
+
+ *peerp = peer;
+ return OK;
+}
+
+/*
+ * Generate a new segment for the current send request, or arrange things such
+ * that new data can be merged with a previous segment. As part of this,
+ * decide whether we can merge data at all. The segment will be merged if, and
+ * only if, all of the following requirements are met:
+ *
+ * 1) the socket is of type SOCK_STREAM;
+ * 2) there is a previous segment in the receive buffer;
+ * 3) there is no ancillary data for the current send request.
+ *
+ * Also copy in regular data (if any), retrieve the sender's credentials (if
+ * needed), and copy over the source path (if applicable). However, do not yet
+ * commit the segment (or the new part to be merged), because the send request
+ * may still fail for other reasons.
+ *
+ * On success, return the length of the new segment (or, when merging, the
+ * length to be added to the last segment), as well as a flag indicating
+ * whether we are merging into the last segment in 'mergep', the length of the
+ * (new) data in the segment in 'datalenp', and the new segment's flags in
+ * 'segflagsp' (always zero when merging). Note that a return value of zero
+ * implies that we are merging zero extra bytes into the last segment, which
+ * means that effectively nothing changes; in that case the send call will be
+ * cut short and return zero to the caller as well. On failure, return a
+ * negative error code.
+ */
+static int
+uds_send_data(struct udssock * uds, struct udssock * peer,
+ const struct sockdriver_data * data, size_t len, size_t off,
+ endpoint_t user_endpt, unsigned int nfds, int * __restrict mergep,
+ size_t * __restrict datalenp, unsigned int * __restrict segflagsp)
+{
+ struct sockcred sockcred;
+ gid_t groups[NGROUPS_MAX];
+ iovec_t iov[2];
+ unsigned int iovcnt, segflags;
+ unsigned char lenbyte;
+ size_t credlen, pathlen, datalen, seglen;
+ size_t avail, pos, left;
+ int r, merge;
+
+ /*
+ * At this point we should add the data to the peer's receive buffer.
+ * In the case of SOCK_STREAM sockets, we should add as much of the
+ * data as possible and suspend the call to send the rest later, if
+ * applicable. In the case of SOCK_DGRAM sockets, we should drop the
+ * packet if it does not fit in the buffer.
+ *
+ * Due to the checks in uds_can_send(), we know for sure that we no
+ * longer have to suspend without making any progress at this point.
+ */
+ segflags = (nfds > 0) ? UDS_HAS_FDS : 0;
+
+ /*
+ * Obtain the credentials now. Doing so allows us to determine how
+ * much space we actually need for them.
+ */
+ if (off == 0 && (peer->uds_flags & UDSF_PASSCRED)) {
+ memset(&sockcred, 0, sizeof(sockcred));
+
+ if ((r = getsockcred(user_endpt, &sockcred, groups,
+ __arraycount(groups))) != OK)
+ return r;
+
+ credlen = 1 + SOCKCREDSIZE(sockcred.sc_ngroups);
+
+ segflags |= UDS_HAS_CRED;
+ } else
+ credlen = 0;
+
+ /* For bound source datagram sockets, include the source path. */
+ if (uds_get_type(uds) == SOCK_DGRAM && uds->uds_pathlen != 0) {
+ pathlen = (size_t)uds->uds_pathlen + 1;
+
+ segflags |= UDS_HAS_PATH;
+ } else
+ pathlen = 0;
+
+ avail = UDS_BUF - peer->uds_len;
+
+ if (uds_get_type(uds) == SOCK_STREAM) {
+ /*
+ * Determine whether we can merge data into the previous
+ * segment. This is a more refined version of the test in
+ * uds_can_send(), as we now know whether there are actually
+ * any FDs to transfer.
+ */
+ merge = (peer->uds_len != 0 && nfds == 0 && credlen == 0);
+
+ /* Determine how much we can send at once. */
+ if (!merge) {
+ assert(avail > UDS_HDRLEN + credlen);
+ datalen = avail - UDS_HDRLEN - credlen;
+ } else
+ datalen = avail;
+
+ if (datalen > len)
+ datalen = len;
+
+ /* If we cannot make progress, we should have suspended.. */
+ assert(datalen != 0 || len == 0);
+ } else {
+ merge = FALSE;
+
+ datalen = len;
+ }
+ assert(datalen <= len);
+ assert(datalen <= UDS_BUF);
+
+ /*
+ * Compute the total amount of space we need for the segment in the
+ * receive buffer. Given that we have done will-it-fit tests in
+ * uds_can_send() for SOCK_STREAM and SOCK_SEQPACKET, there is only one
+ * case left where the result may not fit, and that is for SOCK_DGRAM
+ * packets. In that case, we drop the packet. POSIX says we should
+ * throw an error in that case, and that is also what NetBSD does.
+ */
+ if (!merge)
+ seglen = UDS_HDRLEN + credlen + pathlen + datalen;
+ else
+ seglen = datalen;
+
+ if (seglen > avail) {
+ assert(uds_get_type(uds) == SOCK_DGRAM);
+
+ /* Drop the packet, borrowing NetBSD's chosen error code. */
+ return ENOBUFS;
+ }
+
+ /*
+ * Generate the full segment, but do not yet update the buffer head.
+ * We may still run into an error (copying in file descriptors) or even
+ * decide that nothing gets sent after all (if there are no data or
+ * file descriptors). If we are merging the new data into the previous
+ * segment, do not generate a header.
+ */
+ pos = uds_get_head(peer);
+
+ /* Generate the header, if needed. */
+ if (!merge)
+ pos = uds_store_hdr(peer, pos, seglen, datalen, segflags);
+ else
+ assert(segflags == 0);
+
+ /* Copy in and store the sender's credentials, if desired. */
+ if (credlen > 0) {
+ assert(credlen >= 1 + sizeof(sockcred));
+ assert(credlen <= UCHAR_MAX);
+
+ lenbyte = credlen - 1;
+ pos = uds_store(peer, pos, &lenbyte, 1);
+
+ if (sockcred.sc_ngroups > 0) {
+ pos = uds_store(peer, pos, &sockcred,
+ offsetof(struct sockcred, sc_groups));
+ pos = uds_store(peer, pos, groups,
+ sockcred.sc_ngroups * sizeof(gid_t));
+ } else
+ pos = uds_store(peer, pos, &sockcred,
+ sizeof(sockcred));
+ }
+
+ /* Store the sender's address if any. Datagram sockets only. */
+ if (pathlen > 0) {
+ assert(pathlen > 1);
+ assert(pathlen <= UCHAR_MAX);
+
+ lenbyte = uds->uds_pathlen;
+ pos = uds_store(peer, pos, &lenbyte, 1);
+ pos = uds_store(peer, pos, uds->uds_path, pathlen - 1);
+ }
+
+ /* Lastly, copy in the actual data (if any) from the caller. */
+ if (datalen > 0) {
+ iov[0].iov_addr = (vir_bytes)&peer->uds_buf[pos];
+ left = UDS_BUF - pos;
+
+ if (left < datalen) {
+ assert(left > 0);
+ iov[0].iov_size = left;
+ iov[1].iov_addr = (vir_bytes)&peer->uds_buf[0];
+ iov[1].iov_size = datalen - left;
+ iovcnt = 2;
+ } else {
+ iov[0].iov_size = datalen;
+ iovcnt = 1;
+ }
+
+ if ((r = sockdriver_vcopyin(data, off, iov, iovcnt)) != OK)
+ return r;
+ }
+
+ *mergep = merge;
+ *datalenp = datalen;
+ *segflagsp = segflags;
+ return seglen;
+}
+
+/*
+ * Copy in control data for the current send request, and extract any file
+ * descriptors to be transferred. Do not yet duplicate the file descriptors,
+ * but rather store a list in a temporary buffer: the send request may still
+ * fail in which case we want to avoid having to undo the duplication.
+ *
+ * On success, return the number of (zero or more) file descriptors extracted
+ * from the request and stored in the temporary buffer. On failure, return a
+ * negative error code.
+ */
+static int
+uds_send_ctl(const struct sockdriver_data * ctl, socklen_t ctl_len,
+ endpoint_t user_endpt)
+{
+ struct msghdr msghdr;
+ struct cmsghdr *cmsg;
+ socklen_t left;
+ unsigned int i, n, nfds;
+ int r;
+
+ /*
+ * Copy in the control data. We can spend a lot of effort copying in
+ * the data in small chunks, and change the receiving side to do the
+ * same, but it is really not worth it: applications never send a whole
+ * lot of file descriptors at once, and the buffer size is currently
+ * such that the UDS service itself will exhaust its OPEN_MAX limit
+ * anyway if they do.
+ */
+ if (ctl_len > sizeof(uds_ctlbuf))
+ return ENOBUFS;
+
+ if ((r = sockdriver_copyin(ctl, 0, uds_ctlbuf, ctl_len)) != OK)
+ return r;
+
+ if (ctl_len < sizeof(uds_ctlbuf))
+ memset(&uds_ctlbuf[ctl_len], 0, sizeof(uds_ctlbuf) - ctl_len);
+
+ /*
+ * Look for any file descriptors, and store their remote file
+ * descriptor numbers into a temporary array.
+ */
+ memset(&msghdr, 0, sizeof(msghdr));
+ msghdr.msg_control = uds_ctlbuf;
+ msghdr.msg_controllen = ctl_len;
+
+ nfds = 0;
+ r = OK;
+
+ /*
+ * The sender may provide file descriptors in multiple chunks.
+ * Currently we do not preserve these chunk boundaries, instead
+ * generating one single chunk with all file descriptors for the
+ * segment upon receipt. If needed, we can fairly easily adapt this
+ * later.
+ */
+ for (cmsg = CMSG_FIRSTHDR(&msghdr); cmsg != NULL;
+ cmsg = CMSG_NXTHDR(&msghdr, cmsg)) {
+ /*
+ * Check for bogus lengths. There is no excuse for this;
+ * either the caller does not know what they are doing or we
+ * are looking at a hacking attempt.
+ */
+ assert((socklen_t)((char *)cmsg - uds_ctlbuf) <= ctl_len);
+ left = ctl_len - (socklen_t)((char *)cmsg - uds_ctlbuf);
+ assert(left >= CMSG_LEN(0)); /* guaranteed by CMSG_xxHDR */
+
+ if (cmsg->cmsg_len < CMSG_LEN(0) || cmsg->cmsg_len > left) {
+ printf("UDS: malformed control data from %u\n",
+ user_endpt);
+ r = EINVAL;
+ break;
+ }
+
+ if (cmsg->cmsg_level != SOL_SOCKET ||
+ cmsg->cmsg_type != SCM_RIGHTS)
+ continue;
+
+ n = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
+
+ for (i = 0; i < n; i++) {
+ /*
+ * Copy the file descriptor to the temporary buffer,
+ * whose size is based on the control data buffer, so
+ * it is always large enough to contain all FDs.
+ */
+ assert(nfds < __arraycount(uds_ctlfds));
+
+ memcpy(&uds_ctlfds[nfds],
+ &((int *)CMSG_DATA(cmsg))[i], sizeof(int));
+
+ nfds++;
+ }
+ }
+
+ return nfds;
+}
+
+/*
+ * Actually duplicate any file descriptors that we extracted from the sender's
+ * control data and stored in our temporary buffer. On success, return OK,
+ * with all file descriptors stored in file descriptor objects that are
+ * appended to the socket's list of in-flight FD objects. Thus, on success,
+ * the send request may no longer fail. On failure, return a negative error
+ * code, with any partial duplication undone.
+ */
+static int
+uds_send_fds(struct udssock * peer, unsigned int nfds, endpoint_t user_endpt)
+{
+ SIMPLEQ_HEAD(, uds_fd) fds;
+ struct uds_fd *ufd;
+ unsigned int i;
+ int r;
+
+ SIMPLEQ_INIT(&fds);
+
+ for (i = 0; i < nfds; i++) {
+ if (SIMPLEQ_EMPTY(&uds_freefds)) {
+ /* UDS itself may already have OPEN_MAX FDs. */
+ r = ENFILE;
+ break;
+ }
+
+ /*
+ * The caller may have given an invalid FD, or UDS itself may
+ * unexpectedly have run out of available file descriptors etc.
+ */
+ if ((r = copyfd(user_endpt, uds_ctlfds[i], COPYFD_FROM)) < 0)
+ break;
+
+ ufd = SIMPLEQ_FIRST(&uds_freefds);
+ SIMPLEQ_REMOVE_HEAD(&uds_freefds, ufd_next);
+
+ ufd->ufd_fd = r;
+ ufd->ufd_count = 0;
+
+ SIMPLEQ_INSERT_TAIL(&fds, ufd, ufd_next);
+
+ dprintf(("UDS: copied in fd %d -> %d\n", uds_ctlfds[i], r));
+ }
+
+ /* Did we experience an error while copying in the file descriptors? */
+ if (r < 0) {
+ /* Revert the successful copyfd() calls made so far. */
+ SIMPLEQ_FOREACH(ufd, &fds, ufd_next) {
+ dprintf(("UDS: closing local fd %d\n", ufd->ufd_fd));
+
+ closenb(ufd->ufd_fd);
+ }
+
+ SIMPLEQ_CONCAT(&uds_freefds, &fds);
+
+ return r;
+ }
+
+ /*
+ * Success. If there were any file descriptors at all, add them to the
+ * peer's list of in-flight file descriptors. Assign the number of
+ * file descriptors copied in to the first file descriptor object, so
+ * that we know how many to copy out (or discard) for this segment.
+ * Also set the UDS_HAS_FDS flag on the segment.
+ */
+ ufd = SIMPLEQ_FIRST(&fds);
+ ufd->ufd_count = nfds;
+
+ SIMPLEQ_CONCAT(&peer->uds_fds, &fds);
+
+ return OK;
+}
+
+/*
+ * The current send request is successful or at least has made progress.
+ * Commit the new segment or, if we decided to merge the new data into the last
+ * segment, update the header of the last segment. Also wake up the receiving
+ * side, because there will now be new data to receive.
+ */
+static void
+uds_send_advance(struct udssock * uds, struct udssock * peer, size_t datalen,
+ int merge, size_t seglen, unsigned int segflags)
+{
+ size_t pos, prevseglen, prevdatalen;
+
+ /*
+ * For non-datagram sockets, credentials are sent only once after
+ * setting the LOCAL_CREDS option. After that, the option is unset.
+ */
+ if ((segflags & UDS_HAS_CRED) && uds_get_type(uds) != SOCK_DGRAM)
+ peer->uds_flags &= ~UDSF_PASSCRED;
+
+ if (merge) {
+ assert(segflags == 0);
+
+ pos = uds_get_last(peer);
+
+ (void)uds_fetch_hdr(peer, pos, &prevseglen, &prevdatalen,
+ &segflags);
+
+ peer->uds_len += seglen;
+ assert(peer->uds_len <= UDS_BUF);
+
+ seglen += prevseglen;
+ datalen += prevdatalen;
+ assert(seglen <= UDS_BUF);
+
+ uds_store_hdr(peer, pos, seglen, datalen, segflags);
+ } else {
+ peer->uds_last = peer->uds_len;
+
+ peer->uds_len += seglen;
+ assert(peer->uds_len <= UDS_BUF);
+ }
+
+ /* Now that there are new data, wake up the receiver side. */
+ sockevent_raise(&peer->uds_sock, SEV_RECV);
+}
+
+/*
+ * Process a send request. Return OK if the send request has successfully
+ * completed, SUSPEND if it should be tried again later, or a negative error
+ * code on failure. In all cases, the values of 'off' and 'ctl_off' must be
+ * updated if any progress has been made; if either is non-zero, libsockevent
+ * will return the partial progress rather than an error code.
+ */
+int
+uds_send(struct sock * sock, const struct sockdriver_data * data, size_t len,
+ size_t * off, const struct sockdriver_data * ctl, socklen_t ctl_len,
+ socklen_t * ctl_off, const struct sockaddr * addr, socklen_t addr_len,
+ endpoint_t user_endpt, int flags __unused, size_t min)
+{
+ struct udssock *uds = (struct udssock *)sock;
+ struct udssock *peer;
+ size_t seglen, datalen = 0 /*gcc*/;
+ unsigned int nfds, segflags = 0 /*gcc*/;
+ int r, partial, merge = 0 /*gcc*/;
+
+ dprintf(("UDS: send(%d,%zu,%zu,%u,%u,0x%x)\n",
+ uds_get_id(uds), len, (off != NULL) ? *off : 0, ctl_len,
+ (ctl_off != NULL) ? *ctl_off : 0, flags));
+
+ partial = (off != NULL && *off > 0);
+
+ /*
+ * First see whether we can process this send call at all right now.
+ * Most importantly, for connected sockets, if the peer's receive
+ * buffer is full, we may have to suspend the call until some space has
+ * been freed up.
+ */
+ if ((r = uds_send_test(uds, len, ctl_len, min, partial)) != OK)
+ return r;
+
+ /*
+ * Then get the peer socket. For connected sockets, this is trivial.
+ * For unconnected sockets, it may involve a lookup of the given
+ * address.
+ */
+ if ((r = uds_send_peer(uds, addr, addr_len, user_endpt, &peer)) != OK)
+ return r;
+
+ /*
+ * We now know for sure that we will not suspend this call without
+ * making any progress. However, the call may still fail. Copy in
+ * control data first now, so that we know whether there are any file
+ * descriptors to transfer. This aspect may determine whether or not
+ * we can merge data with a previous segment. Do not actually copy in
+ * the actual file descriptors yet, because that is much harder to undo
+ * in case of a failure later on.
+ */
+ if (ctl_len > 0) {
+ /* We process control data once, in full. */
+ assert(*ctl_off == 0);
+
+ if ((r = uds_send_ctl(ctl, ctl_len, user_endpt)) < 0)
+ return r;
+ nfds = (unsigned int)r;
+ } else
+ nfds = 0;
+
+ /*
+ * Now generate a new segment, or (if possible) merge new data into the
+ * last segment. Since the call may still fail, prepare the segment
+ * but do not update the buffer head yet. Note that the segment
+ * contains not just regular data (in fact it may contain no data at
+ * all) but (also) certain ancillary data.
+ */
+ if ((r = uds_send_data(uds, peer, data, len, *off, user_endpt, nfds,
+ &merge, &datalen, &segflags)) <= 0)
+ return r;
+ seglen = (size_t)r;
+
+ /*
+ * If we extracted any file descriptors from the control data earlier,
+ * copy them over to ourselves now. The resulting in-flight file
+ * descriptors are stored in a separate data structure. This is the
+ * last point where the send call may actually fail.
+ */
+ if (nfds > 0) {
+ if ((r = uds_send_fds(peer, nfds, user_endpt)) != OK)
+ return r;
+ }
+
+ /*
+ * The transmission is now known to be (partially) successful. Commit
+ * the new work by moving the receive buffer head.
+ */
+ uds_send_advance(uds, peer, datalen, merge, seglen, segflags);
+
+ /*
+ * Register the result. For stream-type sockets, the expected behavior
+ * is that all data be sent, and so we may still have to suspend the
+ * call after partial progress. Otherwise, we are now done. Either
+ * way, we are done with the control data, so mark it as consumed.
+ */
+ *off += datalen;
+ *ctl_off += ctl_len;
+ if (uds_get_type(uds) == SOCK_STREAM && datalen < len)
+ return SUSPEND;
+ else
+ return OK;
+}
+
+/*
+ * Test whether a send request would block. The given 'min' parameter contains
+ * the minimum number of bytes that should be possible to send without blocking
+ * (the low send watermark). Return SUSPEND if the send request would block,
+ * or any other error code if it would not.
+ */
+int
+uds_test_send(struct sock * sock, size_t min)
+{
+ struct udssock *uds = (struct udssock *)sock;
+
+ return uds_send_test(uds, min, 0, min, FALSE /*partial*/);
+}
+
+/*
+ * Perform initial checks on a receive request, before it may potentially be
+ * suspended. Return OK if this receive request is valid, or a negative error
+ * code if it is not.
+ */
+int
+uds_pre_recv(struct sock * sock __unused, endpoint_t user_endpt __unused,
+ int flags)
+{
+
+ /*
+ * Reject calls with unknown flags. TODO: ensure that we should really
+ * reject all other flags rather than ignore them.
+ */
+ if ((flags & ~(MSG_PEEK | MSG_WAITALL | MSG_CMSG_CLOEXEC)) != 0)
+ return EOPNOTSUPP;
+
+ return OK;
+}
+
+/*
+ * Determine whether the (real or pretend) receive request should be processed
+ * now, suspended until later, or rejected based on the current socket state.
+ * Return OK if the receive request should be processed now, along with a first
+ * indication whether the call may still be suspended later in 'may_block'.
+ * Return SUSPEND if the receive request should be retried later. Return an
+ * appropriate negative error code if the receive request should fail.
+ */
+static int
+uds_recv_test(struct udssock * uds, size_t len, size_t min, int partial,
+ int * may_block)
+{
+ size_t seglen, datalen;
+ unsigned int segflags;
+ int r;
+
+ /*
+ * If there are any pending data, those should always be received
+ * first. However, if there is nothing to receive, then whether we
+ * should suspend the receive call or fail immediately depends on other
+ * conditions. We first look at these other conditions.
+ */
+ r = OK;
+
+ if (uds_get_type(uds) != SOCK_DGRAM) {
+ if (uds_is_connecting(uds))
+ r = SUSPEND;
+ else if (!uds_is_connected(uds) && !uds_is_disconnected(uds))
+ r = ENOTCONN;
+ else if (!uds_has_conn(uds) ||
+ uds_is_shutdown(uds->uds_conn, SFL_SHUT_WR))
+ r = SOCKEVENT_EOF;
+ }
+
+ if (uds->uds_len == 0) {
+ /*
+ * For stream-type sockets, we use the policy: if no regular
+ * data is requested, then end the call without receiving
+ * anything. For packet-type sockets, the request should block
+ * until there is a packet to discard, though.
+ */
+ if (r != OK || (uds_get_type(uds) == SOCK_STREAM && len == 0))
+ return r;
+
+ return SUSPEND;
+ }
+
+ /*
+ * For stream-type sockets, we should still suspend the call if fewer
+ * than 'min' bytes are available right now, and there is a possibility
+ * that more data may arrive later. More may arrive later iff 'r' is
+ * OK (i.e., no EOF or error will follow) and, in case we already
+ * received some partial results, there is not already a next segment
+ * with ancillary data (i.e, nonzero segment flags), or in any case
+ * there isn't more than one segment in the buffer. Limit 'min' to the
+ * maximum that can ever be received, though. Since that is difficult
+ * in our case, we check whether the buffer is entirely full instead.
+ */
+ if (r == OK && uds_get_type(uds) == SOCK_STREAM && min > 0 &&
+ uds->uds_len < UDS_BUF) {
+ assert(uds->uds_len >= UDS_HDRLEN);
+
+ (void)uds_fetch_hdr(uds, uds->uds_tail, &seglen, &datalen,
+ &segflags);
+
+ if (datalen < min && seglen == uds->uds_len &&
+ (!partial || segflags == 0))
+ return SUSPEND;
+ }
+
+ /*
+ * Also start the decision process as to whether we should suspend the
+ * current call if MSG_WAITALL is given. Unfortunately there is no one
+ * place where we can conveniently do all the required checks.
+ */
+ if (may_block != NULL)
+ *may_block = (r == OK && uds_get_type(uds) == SOCK_STREAM);
+ return OK;
+}
+
+/*
+ * Receive regular data, and possibly the source path, from the tail segment in
+ * the receive buffer. On success, return the positive non-zero length of the
+ * tail segment, with 'addr' and 'addr_len' modified to store the source
+ * address if applicable, the result flags in 'rflags' updated as appropriate,
+ * the tail segment's data length stored in 'datalen', the number of received
+ * regular data bytes stored in 'reslen', the segment flags stored in
+ * 'segflags', and the absolute receive buffer position of the credentials in
+ * the segment stored in 'credpos' if applicable. Since the receive call may
+ * still fail, this function must not yet update the tail or any other aspect
+ * of the receive buffer. Return zero if the current receive call was already
+ * partially successful (due to MSG_WAITALL) and can no longer make progress,
+ * and thus should be ended. Return a negative error code on failure.
+ */
+static int
+uds_recv_data(struct udssock * uds, const struct sockdriver_data * data,
+ size_t len, size_t off, struct sockaddr * addr, socklen_t * addr_len,
+ int * __restrict rflags, size_t * __restrict datalen,
+ size_t * __restrict reslen, unsigned int * __restrict segflags,
+ size_t * __restrict credpos)
+{
+ iovec_t iov[2];
+ unsigned char lenbyte;
+ unsigned int iovcnt;
+ size_t pos, seglen, left;
+ int r;
+
+ pos = uds_fetch_hdr(uds, uds->uds_tail, &seglen, datalen, segflags);
+
+ /*
+ * If a partially completed receive now runs into a segment that cannot
+ * be logically merged with the previous one (because it has at least
+ * one segment flag set, meaning it has ancillary data), then we must
+ * shortcut the receive now.
+ */
+ if (off != 0 && *segflags != 0)
+ return OK;
+
+ /*
+ * As stated, for stream-type sockets, we choose to ignore zero-size
+ * receive calls. This has the consequence that reading a zero-sized
+ * segment (with ancillary data) requires a receive request for at
+ * least one regular data byte. Such a receive call would then return
+ * zero. The problem with handling zero-data receive requests is that
+ * we need to know whether the current segment is terminated (i.e., no
+ * more data can possibly be merged into it later), which is a test
+ * that we rather not perform, not in the least because we do not know
+ * whether there is an error pending on the socket.
+ *
+ * For datagrams, we currently allow a zero-size receive call to
+ * discard the next datagram.
+ *
+ * TODO: compare this against policies on other platforms.
+ */
+ if (len == 0 && uds_get_type(uds) == SOCK_STREAM)
+ return OK;
+
+ /*
+ * We have to skip the credentials for now: these are copied out as
+ * control data, and thus will (well, may) be looked at when dealing
+ * with the control data. For the same reason, we do not even look at
+ * UDS_HAS_FDS here.
+ */
+ if (*segflags & UDS_HAS_CRED) {
+ *credpos = pos;
+
+ pos = uds_fetch(uds, pos, &lenbyte, 1);
+ pos = uds_advance(pos, (size_t)lenbyte);
+ }
+
+ /*
+ * Copy out the source address, but only if the (datagram) socket is
+ * not connected. TODO: even when it is connected, it may still
+ * receive packets sent to it from other sockets *before* being
+ * connected, and the receiver has no way of knowing that those packets
+ * did not come from its new peer. Ideally, the older packets should
+ * be dropped..
+ */
+ if (*segflags & UDS_HAS_PATH) {
+ pos = uds_fetch(uds, pos, &lenbyte, 1);
+
+ if (uds_get_type(uds) == SOCK_DGRAM && !uds_has_link(uds))
+ uds_make_addr((const char *)&uds->uds_buf[pos],
+ (size_t)lenbyte, addr, addr_len);
+
+ pos = uds_advance(pos, (size_t)lenbyte);
+ }
+
+ /*
+ * We can receive no more data than those that are present in the
+ * segment, obviously. For stream-type sockets, any more data that
+ * could have been received along with the current data would have been
+ * merged in the current segment, so we need not search for any next
+ * segments.
+ *
+ * For non-stream sockets, the caller may receive less than a whole
+ * packet if it supplied a small buffer. In that case, the rest of the
+ * packet will be discarded (but not here yet!) and the caller gets
+ * the MSG_TRUNC flag in its result, if it was using sendmsg(2) anyway.
+ */
+ if (len > *datalen)
+ len = *datalen;
+ else if (len < *datalen && uds_get_type(uds) != SOCK_STREAM)
+ *rflags |= MSG_TRUNC;
+
+ /* Copy out the data to the caller. */
+ if (len > 0) {
+ iov[0].iov_addr = (vir_bytes)&uds->uds_buf[pos];
+ left = UDS_BUF - pos;
+
+ if (left < len) {
+ iov[0].iov_size = left;
+ iov[1].iov_addr = (vir_bytes)&uds->uds_buf[0];
+ iov[1].iov_size = len - left;
+ iovcnt = 2;
+ } else {
+ iov[0].iov_size = len;
+ iovcnt = 1;
+ }
+
+ if ((r = sockdriver_vcopyout(data, off, iov, iovcnt)) != OK)
+ return r;
+ }
+
+ *reslen = len;
+ assert(seglen > 0 && seglen <= INT_MAX);
+ return (int)seglen;
+}
+
+/*
+ * The current segment has associated file descriptors. If possible, copy out
+ * all file descriptors to the receiver, and generate and copy out a chunk of
+ * control data that contains their file descriptor numbers. If not all
+ * file descriptors fit in the receiver's buffer, or if any error occurs, no
+ * file descriptors are copied out.
+ */
+static int
+uds_recv_fds(struct udssock * uds, const struct sockdriver_data * ctl,
+ socklen_t ctl_len, socklen_t ctl_off, endpoint_t user_endpt, int flags)
+{
+ struct msghdr msghdr;
+ struct cmsghdr *cmsg;
+ struct uds_fd *ufd;
+ unsigned int i, nfds;
+ socklen_t chunklen, chunkspace;
+ int r, fd, what;
+
+ /* See how many file descriptors should be part of this chunk. */
+ assert(!SIMPLEQ_EMPTY(&uds->uds_fds));
+ ufd = SIMPLEQ_FIRST(&uds->uds_fds);
+ nfds = ufd->ufd_count;
+ assert(nfds > 0);
+
+ /*
+ * We produce and copy out potentially unaligned chunks, using
+ * CMSG_LEN, but return the aligned size at the end, using CMSG_SPACE.
+ * This may leave "gap" bytes unchanged in userland, but that should
+ * not be a problem. By producing unaligned chunks, we eliminate a
+ * potential boundary case where the unaligned chunk passed in (by the
+ * sender) no longer fits in the same buffer after being aligned here.
+ */
+ chunklen = CMSG_LEN(sizeof(int) * nfds);
+ chunkspace = CMSG_SPACE(sizeof(int) * nfds);
+ assert(chunklen <= sizeof(uds_ctlbuf));
+ if (chunklen > ctl_len)
+ return 0; /* chunk would not fit, so produce nothing instead */
+ if (chunkspace > ctl_len)
+ chunkspace = ctl_len;
+
+ memset(&msghdr, 0, sizeof(msghdr));
+ msghdr.msg_control = uds_ctlbuf;
+ msghdr.msg_controllen = sizeof(uds_ctlbuf);
+
+ memset(uds_ctlbuf, 0, chunklen);
+ cmsg = CMSG_FIRSTHDR(&msghdr);
+ cmsg->cmsg_len = chunklen;
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+
+ /*
+ * Copy the group's local file descriptors to the target endpoint, and
+ * store the resulting remote file descriptors in the chunk buffer.
+ */
+ r = OK;
+
+ for (i = 0; i < nfds; i++) {
+ assert(ufd != SIMPLEQ_END(&uds->uds_fds));
+ assert(i == 0 || ufd->ufd_count == 0);
+
+ what = COPYFD_TO;
+ if (flags & MSG_CMSG_CLOEXEC)
+ what |= COPYFD_CLOEXEC;
+
+ /* Failure may happen legitimately here (e.g., EMFILE). */
+ if ((r = copyfd(user_endpt, ufd->ufd_fd, what)) < 0)
+ break; /* we keep our progress so far in 'i' */
+
+ fd = r;
+
+ dprintf(("UDS: copied out fd %d -> %d\n", ufd->ufd_fd, fd));
+
+ memcpy(&((int *)CMSG_DATA(cmsg))[i], &fd, sizeof(int));
+
+ ufd = SIMPLEQ_NEXT(ufd, ufd_next);
+ }
+
+ /* If everything went well so far, copy out the produced chunk. */
+ if (r >= 0)
+ r = sockdriver_copyout(ctl, ctl_off, uds_ctlbuf, chunklen);
+
+ /*
+ * Handle errors. At this point, the 'i' variable contains the number
+ * of file descriptors that have already been successfully copied out.
+ */
+ if (r < 0) {
+ /* Revert the successful copyfd() calls made so far. */
+ while (i-- > 0) {
+ memcpy(&fd, &((int *)CMSG_DATA(cmsg))[i], sizeof(int));
+
+ (void)copyfd(user_endpt, fd, COPYFD_CLOSE);
+ }
+
+ return r;
+ }
+
+ /*
+ * Success. Return the aligned size of the produced chunk, if the
+ * given length permits it. From here on, the receive call may no
+ * longer fail, as that would result in lost file descriptors.
+ */
+ return chunkspace;
+}
+
+/*
+ * Generate and copy out a chunk of control data with the sender's credentials.
+ * Return the aligned chunk size on success, or a negative error code on
+ * failure.
+ */
+static int
+uds_recv_cred(struct udssock * uds, const struct sockdriver_data * ctl,
+ socklen_t ctl_len, socklen_t ctl_off, size_t credpos)
+{
+ struct msghdr msghdr;
+ struct cmsghdr *cmsg;
+ socklen_t chunklen, chunkspace;
+ unsigned char lenbyte;
+ size_t credlen;
+ int r;
+
+ /*
+ * Since the sender side already did the hard work of producing the
+ * (variable-size) sockcred structure as it should be received, there
+ * is relatively little work to be done here.
+ */
+ credpos = uds_fetch(uds, credpos, &lenbyte, 1);
+ credlen = (size_t)lenbyte;
+
+ chunklen = CMSG_LEN(credlen);
+ chunkspace = CMSG_SPACE(credlen);
+ assert(chunklen <= sizeof(uds_ctlbuf));
+ if (chunklen > ctl_len)
+ return 0; /* chunk would not fit, so produce nothing instead */
+ if (chunkspace > ctl_len)
+ chunkspace = ctl_len;
+
+ memset(&msghdr, 0, sizeof(msghdr));
+ msghdr.msg_control = uds_ctlbuf;
+ msghdr.msg_controllen = sizeof(uds_ctlbuf);
+
+ memset(uds_ctlbuf, 0, chunklen);
+ cmsg = CMSG_FIRSTHDR(&msghdr);
+ cmsg->cmsg_len = chunklen;
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_CREDS;
+
+ uds_fetch(uds, credpos, CMSG_DATA(cmsg), credlen);
+
+ if ((r = sockdriver_copyout(ctl, ctl_off, uds_ctlbuf, chunklen)) != OK)
+ return r;
+
+ return chunkspace;
+}
+
+/*
+ * Copy out control data for the ancillary data associated with the current
+ * segment, if any. Return OK on success, at which point the current receive
+ * call may no longer fail. 'rflags' may be updated with additional result
+ * flags. Return a negative error code on failure.
+ */
+static int
+uds_recv_ctl(struct udssock * uds, const struct sockdriver_data * ctl,
+ socklen_t ctl_len, socklen_t * ctl_off, endpoint_t user_endpt,
+ int flags, unsigned int segflags, size_t credpos, int * rflags)
+{
+ int r;
+
+ /*
+ * We first copy out all file descriptors, if any. We put them in one
+ * SCM_RIGHTS chunk, even if the sender put them in separate SCM_RIGHTS
+ * chunks. We believe that this should not cause application-level
+ * issues, but if it does, we can change that later with some effort.
+ * We then copy out credentials, if any.
+ *
+ * We copy out each control chunk independently of the others, and also
+ * perform error recovery on a per-chunk basis. This implies the
+ * following. If producing or copying out the first chunk fails, the
+ * entire recvmsg(2) call will fail with an appropriate error. If
+ * producing or copying out any subsequent chunk fails, the recvmsg(2)
+ * call will still return the previously generated chunks (a "short
+ * control read" if you will) as well as the MSG_CTRUNC flag. This
+ * approach is simple and clean, and it guarantees that we can always
+ * copy out at least as many file descriptors as we copied in for this
+ * segment, even if credentials are present as well. However, the
+ * approach does cause slightly more overhead when there are multiple
+ * chunks per call, as those are copied out separately.
+ *
+ * Since the generated SCM_RIGHTS chunk is never larger than the
+ * originally received SCM_RIGHTS chunk, the temporary "uds_ctlbuf"
+ * buffer is always large enough to contain the chunk in its entirety.
+ * SCM_CREDS chunks should always fit easily as well.
+ *
+ * The MSG_CTRUNC flag will be returned iff not the entire user-given
+ * control buffer was filled and not all control chunks were delivered.
+ * Our current implementation does not deliver partial chunks. NetBSD
+ * does, except for SCM_RIGHTS chunks.
+ *
+ * TODO: get rid of the redundancy in processing return values.
+ */
+ if (segflags & UDS_HAS_FDS) {
+ r = uds_recv_fds(uds, ctl, ctl_len, *ctl_off, user_endpt,
+ flags);
+
+ /*
+ * At this point, 'r' contains one of the following:
+ *
+ * r > 0 a chunk of 'r' bytes was added successfully.
+ * r == 0 not enough space left; the chunk was not added.
+ * r < 0 an error occurred; the chunk was not added.
+ */
+ if (r < 0 && *ctl_off == 0)
+ return r;
+
+ if (r > 0) {
+ ctl_len -= r;
+ *ctl_off += r;
+ } else
+ *rflags |= MSG_CTRUNC;
+ }
+
+ if (segflags & UDS_HAS_CRED) {
+ r = uds_recv_cred(uds, ctl, ctl_len, *ctl_off, credpos);
+
+ /* As above. */
+ if (r < 0 && *ctl_off == 0)
+ return r;
+
+ if (r > 0) {
+ ctl_len -= r;
+ *ctl_off += r;
+ } else
+ *rflags |= MSG_CTRUNC;
+ }
+
+ return OK;
+}
+
+/*
+ * The current receive request is successful or, in the case of MSG_WAITALL,
+ * has made progress. Advance the receive buffer tail, either by discarding
+ * the entire tail segment or by generating a new, smaller tail segment that
+ * contains only the regular data left to be received from the original tail
+ * segment. Also wake up the sending side for connection-oriented sockets if
+ * applicable, because there may now be room for more data to be sent. Update
+ * 'may_block' if we are now sure that the call may not block on MSG_WAITALL
+ * after all.
+ */
+static void
+uds_recv_advance(struct udssock * uds, size_t seglen, size_t datalen,
+ size_t reslen, unsigned int segflags, int * may_block)
+{
+ struct udssock *conn;
+ struct uds_fd *ufd;
+ size_t delta, nseglen, advance;
+ unsigned int nfds;
+
+ /* Note that 'reslen' may be legitimately zero. */
+ assert(reslen <= datalen);
+
+ if (uds_get_type(uds) != SOCK_STREAM && reslen < datalen)
+ reslen = datalen;
+
+ delta = datalen - reslen;
+
+ if (delta == 0) {
+ /*
+ * Fully consume the tail segment. We advance the tail by the
+ * full segment length, thus moving up to either the next
+ * segment in the receive buffer, or an empty receive buffer.
+ */
+ advance = seglen;
+
+ uds->uds_tail = uds_advance(uds->uds_tail, advance);
+ } else {
+ /*
+ * Partially consume the tail segment. We put a new segment
+ * header right in front of the remaining data, which obviously
+ * always fits. Since any ancillary data was consumed along
+ * with the first data byte of the segment, the new segment has
+ * no ancillary data anymore (and thus a zero flags field).
+ */
+ nseglen = UDS_HDRLEN + delta;
+ assert(nseglen < seglen);
+
+ advance = seglen - nseglen;
+
+ uds->uds_tail = uds_advance(uds->uds_tail, advance);
+
+ uds_store_hdr(uds, uds->uds_tail, nseglen, delta, 0);
+ }
+
+ /*
+ * For datagram-oriented sockets, we always consume at least a header.
+ * For stream-type sockets, we either consume a zero-data segment along
+ * with its ancillary data, or we consume at least one byte from a
+ * segment that does have regular data. In all other cases, the
+ * receive call has already been ended by now. Thus, we always advance
+ * the tail of the receive buffer here.
+ */
+ assert(advance > 0);
+
+ /*
+ * The receive buffer's used length (uds_len) and pointer to the
+ * previous segment header (uds_last) are offsets from the tail. Now
+ * that we have moved the tail, we need to adjust these accordingly.
+ * If the buffer is now empty, reset the tail to the buffer start so as
+ * to avoid splitting inter-process copies whenever possible.
+ */
+ assert(uds->uds_len >= advance);
+ uds->uds_len -= advance;
+
+ if (uds->uds_len == 0)
+ uds->uds_tail = 0;
+
+ /*
+ * If uds_last is zero here, it was pointing to the segment we just
+ * (partially) consumed. By leaving it zero, it will still point to
+ * the new or next segment.
+ */
+ if (uds->uds_last > 0) {
+ assert(uds->uds_len > 0);
+ assert(uds->uds_last >= advance);
+ uds->uds_last -= advance;
+ }
+
+ /*
+ * If there were any file descriptors associated with this segment,
+ * close and free them now.
+ */
+ if (segflags & UDS_HAS_FDS) {
+ assert(!SIMPLEQ_EMPTY(&uds->uds_fds));
+ ufd = SIMPLEQ_FIRST(&uds->uds_fds);
+ nfds = ufd->ufd_count;
+ assert(nfds > 0);
+
+ while (nfds-- > 0) {
+ assert(!SIMPLEQ_EMPTY(&uds->uds_fds));
+ ufd = SIMPLEQ_FIRST(&uds->uds_fds);
+ SIMPLEQ_REMOVE_HEAD(&uds->uds_fds, ufd_next);
+
+ dprintf(("UDS: closing local fd %d\n", ufd->ufd_fd));
+
+ closenb(ufd->ufd_fd);
+
+ SIMPLEQ_INSERT_TAIL(&uds_freefds, ufd, ufd_next);
+ }
+ }
+
+ /*
+ * If there is now any data left in the receive buffer, then there has
+ * been a reason that we haven't received it. For stream sockets, that
+ * reason is that the next segment has ancillary data. In any case,
+ * this means we should never block the current receive operation
+ * waiting for more data. Otherwise, we may block on MSG_WAITALL.
+ */
+ if (uds->uds_len > 0)
+ *may_block = FALSE;
+
+ /*
+ * If the (non-datagram) socket has a peer that is not shut down for
+ * writing, see if it can be woken up to send more data. Note that
+ * the event will never be processed immediately.
+ */
+ if (uds_is_connected(uds)) {
+ assert(uds_get_type(uds) != SOCK_DGRAM);
+
+ conn = uds->uds_conn;
+
+ if (!uds_is_shutdown(conn, SFL_SHUT_WR))
+ sockevent_raise(&conn->uds_sock, SEV_SEND);
+ }
+}
+
+/*
+ * Process a receive request. Return OK if the receive request has completed
+ * successfully, SUSPEND if it should be tried again later, SOCKEVENT_EOF if an
+ * end-of-file condition is reached, or a negative error code on failure. In
+ * all cases, the values of 'off' and 'ctl_off' must be updated if any progress
+ * has been made; if either is non-zero, libsockevent will return the partial
+ * progress rather than an error code or EOF.
+ */
+int
+uds_recv(struct sock * sock, const struct sockdriver_data * data, size_t len,
+ size_t * off, const struct sockdriver_data * ctl, socklen_t ctl_len,
+ socklen_t * ctl_off, struct sockaddr * addr, socklen_t * addr_len,
+ endpoint_t user_endpt, int flags, size_t min, int * rflags)
+{
+ struct udssock *uds = (struct udssock *)sock;
+ size_t seglen, datalen, reslen = 0 /*gcc*/, credpos = 0 /*gcc*/;
+ unsigned int segflags;
+ int r, partial, may_block;
+
+ dprintf(("UDS: recv(%d,%zu,%zu,%u,%u,0x%x)\n",
+ uds_get_id(uds), len, (off != NULL) ? *off : 0, ctl_len,
+ (ctl_off != NULL) ? *ctl_off : 0, flags));
+
+ /*
+ * Start by testing whether anything can be received at all, or whether
+ * an error or EOF should be returned instead, or whether the receive
+ * call should be suspended until later otherwise. If no (regular or
+ * control) data can be received, or if this was a test for select,
+ * we bail out right after.
+ */
+ partial = (off != NULL && *off > 0);
+
+ if ((r = uds_recv_test(uds, len, min, partial, &may_block)) != OK)
+ return r;
+
+ /*
+ * Copy out regular data, if any. Do this before copying out control
+ * data, because the latter is harder to undo on failure. This data
+ * copy function returns returns OK (0) if we are to return a result of
+ * zero bytes (which is *not* EOF) to the caller without doing anything
+ * else. The function returns a nonzero positive segment length if we
+ * should carry on with the receive call (as it happens, all its other
+ * returned values may in fact be zero).
+ */
+ if ((r = uds_recv_data(uds, data, len, *off, addr, addr_len, rflags,
+ &datalen, &reslen, &segflags, &credpos)) <= 0)
+ return r;
+ seglen = (size_t)r;
+
+ /*
+ * Copy out control data, if any: transfer and copy out records of file
+ * descriptors, and/or copy out sender credentials. This is the last
+ * part of the call that may fail.
+ */
+ if ((r = uds_recv_ctl(uds, ctl, ctl_len, ctl_off, user_endpt, flags,
+ segflags, credpos, rflags)) != OK)
+ return r;
+
+ /*
+ * Now that the call has succeeded, move the tail of the receive
+ * buffer, unless we were merely peeking.
+ */
+ if (!(flags & MSG_PEEK))
+ uds_recv_advance(uds, seglen, datalen, reslen, segflags,
+ &may_block);
+ else
+ may_block = FALSE;
+
+ /*
+ * If the MSG_WAITALL flag was given, we may still have to suspend the
+ * call after partial success. In particular, the receive call may
+ * suspend after partial success if all of these conditions are met:
+ *
+ * 1) the socket is a stream-type socket;
+ * 2) MSG_WAITALL is set;
+ * 3) MSG_PEEK is not set;
+ * 4) MSG_DONTWAIT is not set (tested upon return);
+ * 5) the socket must not have a pending error (tested upon return);
+ * 6) the socket must not be shut down for reading (tested later);
+ * 7) the socket must still be connected to a peer (no EOF);
+ * 8) the peer must not have been shut down for writing (no EOF);
+ * 9) the next segment, if any, contains no ancillary data.
+ *
+ * Together, these points guarantee that the call could conceivably
+ * receive more after being resumed. Points 4 to 6 are covered by
+ * libsockevent, which will end the call even if we return SUSPEND
+ * here. Due to segment merging, we cover point 9 by checking that
+ * there is currently no next segment at all. Once a new segment
+ * arrives, the ancillary-data test is done then.
+ */
+ *off += reslen;
+ if ((flags & MSG_WAITALL) && reslen < len && may_block)
+ return SUSPEND;
+ else
+ return OK;
+}
+
+/*
+ * Test whether a receive request would block. The given 'min' parameter
+ * contains the minimum number of bytes that should be possible to receive
+ * without blocking (the low receive watermark). Return SUSPEND if the send
+ * request would block. Otherwise, return any other error code (including OK
+ * or SOCKEVENT_EOF), and if 'size' is not a NULL pointer, it should be filled
+ * with the number of bytes available for receipt right now (if not zero).
+ * Note that if 'size' is not NULL, 'min' will always be zero.
+ */
+int
+uds_test_recv(struct sock * sock, size_t min, size_t * size)
+{
+ struct udssock *uds = (struct udssock *)sock;
+ size_t seglen;
+ unsigned int segflags;
+ int r;
+
+ if ((r = uds_recv_test(uds, min, min, FALSE /*partial*/,
+ NULL /*may_block*/)) == SUSPEND)
+ return r;
+
+ if (size != NULL && uds->uds_len > 0)
+ (void)uds_fetch_hdr(uds, uds->uds_tail, &seglen, size,
+ &segflags);
+
+ return r;
+}
+++ /dev/null
-/*
- * Unix Domain Sockets Implementation (PF_UNIX, PF_LOCAL)
- * This code handles ioctl(2) commands to implement the socket API.
- * Some helper functions are also present.
- */
-
-#include "uds.h"
-
-static int
-perform_connection(devminor_t minorx, devminor_t minory,
- struct sockaddr_un *addr)
-{
- /*
- * There are several places were a connection is established, the
- * initiating call being one of accept(2), connect(2), socketpair(2).
- */
- dprintf(("UDS: perform_connection(%d, %d)\n", minorx, minory));
-
- /*
- * Only connection-oriented types are acceptable and only equal
- * types can connect to each other.
- */
- if ((uds_fd_table[minorx].type != SOCK_SEQPACKET &&
- uds_fd_table[minorx].type != SOCK_STREAM) ||
- uds_fd_table[minorx].type != uds_fd_table[minory].type)
- return EINVAL;
-
- /* Connect the pair of sockets. */
- uds_fd_table[minorx].peer = minory;
- uds_fd_table[minory].peer = minorx;
-
- /* Set the address of both sockets */
- memcpy(&uds_fd_table[minorx].addr, addr, sizeof(struct sockaddr_un));
- memcpy(&uds_fd_table[minory].addr, addr, sizeof(struct sockaddr_un));
-
- return OK;
-}
-
-static int
-do_accept(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
- devminor_t minorparent; /* minor number of parent (server) */
- devminor_t minorpeer;
- int rc, i;
- struct sockaddr_un addr;
-
- dprintf(("UDS: do_accept(%d)\n", minor));
-
- /*
- * Somewhat weird logic is used in this function, so here's an
- * overview... The minor number is the server's client socket
- * (the socket to be returned by accept()). The data waiting
- * for us in the IO Grant is the address that the server is
- * listening on. This function uses the address to find the
- * server's descriptor. From there we can perform the
- * connection or suspend and wait for a connect().
- */
-
- /* This IOCTL must be called on a 'fresh' socket. */
- if (uds_fd_table[minor].type != -1)
- return EINVAL;
-
- /* Get the server's address */
- if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &addr,
- sizeof(struct sockaddr_un))) != OK)
- return rc;
-
- /* Locate the server socket. */
- for (i = 0; i < NR_FDS; i++) {
- if (uds_fd_table[i].stale == FALSE &&
- uds_fd_table[i].listening == TRUE &&
- uds_fd_table[i].addr.sun_family == AF_UNIX &&
- !strncmp(addr.sun_path, uds_fd_table[i].addr.sun_path,
- sizeof(uds_fd_table[i].addr.sun_path)))
- break;
- }
-
- if (i == NR_FDS)
- return EINVAL;
-
- minorparent = i; /* parent */
-
- /* We are the parent's child. */
- uds_fd_table[minorparent].child = minor;
-
- /*
- * The peer has the same type as the parent. we need to be that
- * type too.
- */
- uds_fd_table[minor].type = uds_fd_table[minorparent].type;
-
- /* Locate the peer to accept in the parent's backlog. */
- minorpeer = -1;
- for (i = 0; i < uds_fd_table[minorparent].backlog_size; i++) {
- if (uds_fd_table[minorparent].backlog[i] != -1) {
- minorpeer = uds_fd_table[minorparent].backlog[i];
- uds_fd_table[minorparent].backlog[i] = -1;
- break;
- }
- }
-
- if (minorpeer == -1) {
- dprintf(("UDS: do_accept(%d): suspend\n", minor));
-
- /*
- * There are no peers in the backlog, suspend and wait for one
- * to show up.
- */
- uds_fd_table[minor].suspended = UDS_SUSPENDED_ACCEPT;
-
- return EDONTREPLY;
- }
-
- dprintf(("UDS: connecting %d to %d -- parent is %d\n", minor,
- minorpeer, minorparent));
-
- if ((rc = perform_connection(minor, minorpeer, &addr)) != OK) {
- dprintf(("UDS: do_accept(%d): connection failed\n", minor));
-
- return rc;
- }
-
- uds_fd_table[minorparent].child = -1;
-
- /* If the peer is blocked on connect() or write(), revive the peer. */
- if (uds_fd_table[minorpeer].suspended == UDS_SUSPENDED_CONNECT ||
- uds_fd_table[minorpeer].suspended == UDS_SUSPENDED_WRITE) {
- dprintf(("UDS: do_accept(%d): revive %d\n", minor, minorpeer));
- uds_unsuspend(minorpeer);
- }
-
- /* See if we can satisfy an ongoing select. */
- if ((uds_fd_table[minorpeer].sel_ops & CDEV_OP_WR) &&
- uds_fd_table[minorpeer].size < UDS_BUF) {
- /* A write on the peer is possible now. */
- chardriver_reply_select(uds_fd_table[minorpeer].sel_endpt,
- minorpeer, CDEV_OP_WR);
- uds_fd_table[minorpeer].sel_ops &= ~CDEV_OP_WR;
- }
-
- return OK;
-}
-
-static int
-do_connect(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
- int child, peer;
- struct sockaddr_un addr;
- int rc, i, j;
- dev_t dev;
- ino_t ino;
-
- dprintf(("UDS: do_connect(%d)\n", minor));
-
- /* Only connection oriented sockets can connect. */
- if (uds_fd_table[minor].type != SOCK_STREAM &&
- uds_fd_table[minor].type != SOCK_SEQPACKET)
- return EINVAL;
-
- /* The socket must not be connecting or connected already. */
- peer = uds_fd_table[minor].peer;
- if (peer != -1) {
- if (uds_fd_table[peer].peer == -1)
- return EALREADY; /* connecting */
- else
- return EISCONN; /* connected */
- }
-
- if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &addr,
- sizeof(struct sockaddr_un))) != OK)
- return rc;
-
- if ((rc = socketpath(uds_fd_table[minor].owner, addr.sun_path,
- sizeof(addr.sun_path), SPATH_CHECK, &dev, &ino)) != OK)
- return rc;
-
- /*
- * Look for a socket of the same type that is listening on the
- * address we want to connect to.
- */
- for (i = 0; i < NR_FDS; i++) {
- if (uds_fd_table[minor].type != uds_fd_table[i].type)
- continue;
- if (uds_fd_table[i].listening == FALSE)
- continue;
- if (uds_fd_table[i].stale == TRUE)
- continue;
- if (uds_fd_table[i].addr.sun_family != AF_UNIX)
- continue;
- if (strncmp(addr.sun_path, uds_fd_table[i].addr.sun_path,
- sizeof(uds_fd_table[i].addr.sun_path)))
- continue;
-
- /* Found a matching socket. */
- break;
- }
-
- if (i == NR_FDS)
- return ECONNREFUSED;
-
- /* If the server is blocked on an accept, perform the connection. */
- if ((child = uds_fd_table[i].child) != -1) {
- rc = perform_connection(minor, child, &addr);
-
- if (rc != OK)
- return rc;
-
- uds_fd_table[i].child = -1;
-
- dprintf(("UDS: do_connect(%d): revive %d\n", minor, child));
-
- /* Wake up the accepting party. */
- uds_unsuspend(child);
-
- return OK;
- }
-
- dprintf(("UDS: adding %d to %d's backlog\n", minor, i));
-
- /* Look for a free slot in the backlog. */
- rc = -1;
- for (j = 0; j < uds_fd_table[i].backlog_size; j++) {
- if (uds_fd_table[i].backlog[j] == -1) {
- uds_fd_table[i].backlog[j] = minor;
-
- rc = 0;
- break;
- }
- }
-
- if (rc == -1)
- return ECONNREFUSED; /* backlog is full */
-
- /* See if the server is blocked on select(). */
- if (uds_fd_table[i].sel_ops & CDEV_OP_RD) {
- /* Satisfy a read-type select on the server. */
- chardriver_reply_select(uds_fd_table[i].sel_endpt, i,
- CDEV_OP_RD);
-
- uds_fd_table[i].sel_ops &= ~CDEV_OP_RD;
- }
-
- /* We found our server. */
- uds_fd_table[minor].peer = i;
-
- memcpy(&uds_fd_table[minor].addr, &addr, sizeof(struct sockaddr_un));
-
- dprintf(("UDS: do_connect(%d): suspend\n", minor));
-
- /* Suspend until the server side accepts the connection. */
- uds_fd_table[minor].suspended = UDS_SUSPENDED_CONNECT;
-
- return EDONTREPLY;
-}
-
-static int
-do_listen(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
- int rc;
- int backlog_size;
-
- dprintf(("UDS: do_listen(%d)\n", minor));
-
- /* Ensure the socket has a type and is bound. */
- if (uds_fd_table[minor].type == -1 ||
- uds_fd_table[minor].addr.sun_family != AF_UNIX)
- return EINVAL;
-
- /* listen(2) supports only two socket types. */
- if (uds_fd_table[minor].type != SOCK_STREAM &&
- uds_fd_table[minor].type != SOCK_SEQPACKET)
- return EOPNOTSUPP;
-
- /*
- * The POSIX standard doesn't say what to do if listen() has
- * already been called. Well, there isn't an errno. We silently
- * let it happen, but if listen() has already been called, we
- * don't allow the backlog to shrink.
- */
- if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &backlog_size,
- sizeof(backlog_size))) != OK)
- return rc;
-
- if (uds_fd_table[minor].listening == FALSE) {
- /* Set the backlog size to a reasonable value. */
- if (backlog_size <= 0 || backlog_size > UDS_SOMAXCONN)
- backlog_size = UDS_SOMAXCONN;
-
- uds_fd_table[minor].backlog_size = backlog_size;
- } else {
- /* Allow the user to expand the backlog size. */
- if (backlog_size > uds_fd_table[minor].backlog_size &&
- backlog_size < UDS_SOMAXCONN)
- uds_fd_table[minor].backlog_size = backlog_size;
-
- /*
- * Don't let the user shrink the backlog_size, as we might
- * have clients waiting in those slots.
- */
- }
-
- /* This socket is now listening. */
- uds_fd_table[minor].listening = TRUE;
-
- return OK;
-}
-
-static int
-do_socket(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
- int rc, type;
-
- dprintf(("UDS: do_socket(%d)\n", minor));
-
- /* The socket type can only be set once. */
- if (uds_fd_table[minor].type != -1)
- return EINVAL;
-
- /* Get the requested type. */
- if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &type,
- sizeof(type))) != OK)
- return rc;
-
- /* Assign the type if it is valid only. */
- switch (type) {
- case SOCK_STREAM:
- case SOCK_DGRAM:
- case SOCK_SEQPACKET:
- uds_fd_table[minor].type = type;
- return OK;
-
- default:
- return EINVAL;
- }
-}
-
-static int
-do_bind(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
- struct sockaddr_un addr;
- int rc, i;
- dev_t dev;
- ino_t ino;
-
- dprintf(("UDS: do_bind(%d)\n", minor));
-
- /* If the type hasn't been set by do_socket() yet, OR an attempt
- * to re-bind() a non-SOCK_DGRAM socket is made, fail the call.
- */
- if ((uds_fd_table[minor].type == -1) ||
- (uds_fd_table[minor].addr.sun_family == AF_UNIX &&
- uds_fd_table[minor].type != SOCK_DGRAM))
- return EINVAL;
-
- if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &addr,
- sizeof(struct sockaddr_un))) != OK)
- return rc;
-
- /* Do some basic sanity checks on the address. */
- if (addr.sun_family != AF_UNIX)
- return EAFNOSUPPORT;
-
- if (addr.sun_path[0] == '\0')
- return ENOENT;
-
- /* Attempt to create the socket file. */
- if ((rc = socketpath(uds_fd_table[minor].owner, addr.sun_path,
-#if NOT_YET
- sizeof(addr.sun_path), SPATH_CREATE, &dev, &ino)) != OK)
-#else
- sizeof(addr.sun_path), SPATH_CHECK, &dev, &ino)) != OK)
-#endif
- return rc;
-
- /*
- * It is possible that the socket path name was already in use as
- * address by another socket. This means that the socket file was
- * prematurely unlinked. In that case, mark the old socket as stale,
- * so that its path name will not be matched and only the newly bound
- * socket will be found in address-based searches. For now, we leave
- * the old socket marked as stale for as long as it is bound to the
- * same address. A more advanced implementation could establish an
- * order between the sockets so that the most recently bound socket is
- * found at any time, but it is doubtful whether that would be useful.
- */
- for (i = 0; i < NR_FDS; i++) {
- if (uds_fd_table[i].stale == FALSE &&
- uds_fd_table[i].addr.sun_family == AF_UNIX &&
- !strncmp(addr.sun_path, uds_fd_table[i].addr.sun_path,
- sizeof(uds_fd_table[i].addr.sun_path))) {
-#if NOT_YET
- uds_fd_table[i].stale = TRUE;
-#else
- return EADDRINUSE;
-#endif
- }
- }
-
- /* Looks good, perform the bind(). */
- uds_fd_table[minor].stale = FALSE;
- memcpy(&uds_fd_table[minor].addr, &addr, sizeof(struct sockaddr_un));
-
- return OK;
-}
-
-static int
-do_getsockname(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
- dprintf(("UDS: do_getsockname(%d)\n", minor));
-
- /*
- * Unconditionally send the address we have assigned to this socket.
- * The POSIX standard doesn't say what to do if the address hasn't been
- * set. If the address isn't currently set, then the user will get
- * NULL bytes. Note: libc depends on this behavior.
- */
- return sys_safecopyto(endpt, grant, 0,
- (vir_bytes) &uds_fd_table[minor].addr, sizeof(struct sockaddr_un));
-}
-
-static int
-do_getpeername(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
- int peer_minor;
-
- dprintf(("UDS: do_getpeername(%d)\n", minor));
-
- /* Check that the socket is connected with a valid peer. */
- if (uds_fd_table[minor].peer != -1) {
- peer_minor = uds_fd_table[minor].peer;
-
- /* Copy the address from the peer. */
- return sys_safecopyto(endpt, grant, 0,
- (vir_bytes) &uds_fd_table[peer_minor].addr,
- sizeof(struct sockaddr_un));
- } else if (uds_fd_table[minor].err == ECONNRESET) {
- uds_fd_table[minor].err = 0;
-
- return ECONNRESET;
- } else
- return ENOTCONN;
-}
-
-static int
-do_shutdown(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
- int rc, how;
-
- dprintf(("UDS: do_shutdown(%d)\n", minor));
-
- /* The socket must be connection oriented. */
- if (uds_fd_table[minor].type != SOCK_STREAM &&
- uds_fd_table[minor].type != SOCK_SEQPACKET)
- return EINVAL;
-
- if (uds_fd_table[minor].peer == -1) {
- /* shutdown(2) is only valid for connected sockets. */
- if (uds_fd_table[minor].err == ECONNRESET)
- return ECONNRESET;
- else
- return ENOTCONN;
- }
-
- /* Get the 'how' parameter from the caller. */
- if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &how,
- sizeof(how))) != OK)
- return rc;
-
- switch (how) {
- case SHUT_RD: /* Take away read permission. */
- uds_fd_table[minor].mode &= ~UDS_R;
- break;
-
- case SHUT_WR: /* Take away write permission. */
- uds_fd_table[minor].mode &= ~UDS_W;
- break;
-
- case SHUT_RDWR: /* Shut down completely. */
- uds_fd_table[minor].mode = 0;
- break;
-
- default:
- return EINVAL;
- }
-
- return OK;
-}
-
-static int
-do_socketpair(devminor_t minorx, endpoint_t endpt, cp_grant_id_t grant)
-{
- int rc;
- dev_t minorin;
- devminor_t minory;
- struct sockaddr_un addr;
-
- dprintf(("UDS: do_socketpair(%d)\n", minorx));
-
- /* The ioctl argument is the minor number of the second socket. */
- if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &minorin,
- sizeof(minorin))) != OK)
- return rc;
-
- minory = minor(minorin);
-
- dprintf(("UDS: socketpair(%d, %d,)\n", minorx, minory));
-
- /* Security check: both sockets must have the same owner endpoint. */
- if (uds_fd_table[minorx].owner != uds_fd_table[minory].owner)
- return EPERM;
-
- addr.sun_family = AF_UNIX;
- addr.sun_path[0] = 'X';
- addr.sun_path[1] = '\0';
-
- return perform_connection(minorx, minory, &addr);
-}
-
-static int
-do_getsockopt_sotype(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
- dprintf(("UDS: do_getsockopt_sotype(%d)\n", minor));
-
- /* If the type hasn't been set yet, we fail the call. */
- if (uds_fd_table[minor].type == -1)
- return EINVAL;
-
- return sys_safecopyto(endpt, grant, 0,
- (vir_bytes) &uds_fd_table[minor].type, sizeof(int));
-}
-
-static int
-do_getsockopt_peercred(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
- int peer_minor;
- int rc;
- struct uucred cred;
-
- dprintf(("UDS: do_getsockopt_peercred(%d)\n", minor));
-
- if (uds_fd_table[minor].peer == -1) {
- if (uds_fd_table[minor].err == ECONNRESET) {
- uds_fd_table[minor].err = 0;
-
- return ECONNRESET;
- } else
- return ENOTCONN;
- }
-
- peer_minor = uds_fd_table[minor].peer;
-
- /*
- * Obtain the peer's credentials and copy them out. Ignore failures;
- * in that case, the caller will simply get no credentials.
- */
- memset(&cred, 0, sizeof(cred));
- cred.cr_uid = -1;
- cred.cr_gid = -1;
- (void)getepinfo(uds_fd_table[peer_minor].owner, &cred.cr_uid,
- &cred.cr_gid);
-
- return sys_safecopyto(endpt, grant, 0, (vir_bytes) &cred,
- sizeof(struct uucred));
-}
-
-static int
-do_getsockopt_sndbuf(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
- size_t sndbuf = UDS_BUF;
-
- dprintf(("UDS: do_getsockopt_sndbuf(%d)\n", minor));
-
- return sys_safecopyto(endpt, grant, 0, (vir_bytes) &sndbuf,
- sizeof(sndbuf));
-}
-
-static int
-do_setsockopt_sndbuf(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
- int rc;
- size_t sndbuf;
-
- dprintf(("UDS: do_setsockopt_sndbuf(%d)\n", minor));
-
- if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &sndbuf,
- sizeof(sndbuf))) != OK)
- return rc;
-
- /* The send buffer is limited to 32KB at the moment. */
- if (sndbuf > UDS_BUF)
- return ENOSYS;
-
- /* FIXME: actually shrink the buffer. */
- return OK;
-}
-
-static int
-do_getsockopt_rcvbuf(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
- size_t rcvbuf = UDS_BUF;
-
- dprintf(("UDS: do_getsockopt_rcvbuf(%d)\n", minor));
-
- return sys_safecopyto(endpt, grant, 0, (vir_bytes) &rcvbuf,
- sizeof(rcvbuf));
-}
-
-static int
-do_setsockopt_rcvbuf(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
- int rc;
- size_t rcvbuf;
-
- dprintf(("UDS: do_setsockopt_rcvbuf(%d)\n", minor));
-
- if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &rcvbuf,
- sizeof(rcvbuf))) != OK)
- return rc;
-
- /* The receive buffer is limited to 32KB at the moment. */
- if (rcvbuf > UDS_BUF)
- return ENOSYS;
-
- /* FIXME: actually shrink the buffer. */
- return OK;
-}
-
-static int
-do_sendto(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
- int rc;
- struct sockaddr_un addr;
- dev_t dev;
- ino_t ino;
-
- dprintf(("UDS: do_sendto(%d)\n", minor));
-
- /* This IOCTL is only for SOCK_DGRAM sockets. */
- if (uds_fd_table[minor].type != SOCK_DGRAM)
- return EINVAL;
-
- if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &addr,
- sizeof(struct sockaddr_un))) != OK)
- return rc;
-
- /* Do some basic sanity checks on the address. */
- if (addr.sun_family != AF_UNIX || addr.sun_path[0] == '\0')
- return EINVAL;
-
- if ((rc = socketpath(uds_fd_table[minor].owner, addr.sun_path,
- sizeof(addr.sun_path), SPATH_CHECK, &dev, &ino)) != OK)
- return rc;
-
- memcpy(&uds_fd_table[minor].target, &addr, sizeof(struct sockaddr_un));
-
- return OK;
-}
-
-static int
-do_recvfrom(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
- dprintf(("UDS: do_recvfrom(%d)\n", minor));
-
- return sys_safecopyto(endpt, grant, 0,
- (vir_bytes) &uds_fd_table[minor].source,
- sizeof(struct sockaddr_un));
-}
-
-static int
-send_fds(devminor_t minor, struct msg_control *msg_ctrl,
- struct ancillary *data)
-{
- int i, rc, nfds, totalfds;
- endpoint_t from_ep;
- struct msghdr msghdr;
- struct cmsghdr *cmsg = NULL;
-
- dprintf(("UDS: send_fds(%d)\n", minor));
-
- from_ep = uds_fd_table[minor].owner;
-
- /* Obtain this socket's credentials. */
- if ((rc = getepinfo(from_ep, &data->cred.uid, &data->cred.gid)) < 0)
- return rc;
-
- dprintf(("UDS: minor=%d cred={%d,%d}\n", minor,
- data->cred.uid, data->cred.gid));
-
- totalfds = data->nfiledes;
-
- memset(&msghdr, '\0', sizeof(struct msghdr));
- msghdr.msg_control = msg_ctrl->msg_control;
- msghdr.msg_controllen = msg_ctrl->msg_controllen;
-
- for (cmsg = CMSG_FIRSTHDR(&msghdr); cmsg != NULL;
- cmsg = CMSG_NXTHDR(&msghdr, cmsg)) {
- if (cmsg->cmsg_level != SOL_SOCKET ||
- cmsg->cmsg_type != SCM_RIGHTS)
- continue;
-
- nfds = MIN((cmsg->cmsg_len-CMSG_LEN(0))/sizeof(int), OPEN_MAX);
-
- for (i = 0; i < nfds; i++) {
- if (totalfds == OPEN_MAX)
- return EOVERFLOW;
-
- data->fds[totalfds] = ((int *) CMSG_DATA(cmsg))[i];
- dprintf(("UDS: minor=%d fd[%d]=%d\n", minor, totalfds,
- data->fds[totalfds]));
- totalfds++;
- }
- }
-
- for (i = data->nfiledes; i < totalfds; i++) {
- if ((rc = copyfd(from_ep, data->fds[i], COPYFD_FROM)) < 0) {
- printf("UDS: copyfd(COPYFD_FROM) failed: %d\n", rc);
-
- /* Revert the successful copyfd() calls made so far. */
- for (i--; i >= data->nfiledes; i--)
- close(data->fds[i]);
-
- return rc;
- }
-
- dprintf(("UDS: send_fds(): %d -> %d\n", data->fds[i], rc));
-
- data->fds[i] = rc; /* this is now the local FD */
- }
-
- data->nfiledes = totalfds;
-
- return OK;
-}
-
-/*
- * This function calls close() for all of the FDs in flight. This is used
- * when a Unix Domain Socket is closed and there exists references to file
- * descriptors that haven't been received with recvmsg().
- */
-int
-uds_clear_fds(devminor_t minor, struct ancillary *data)
-{
- int i;
-
- dprintf(("UDS: uds_clear_fds(%d)\n", minor));
-
- for (i = 0; i < data->nfiledes; i++) {
- dprintf(("UDS: uds_clear_fds() => %d\n", data->fds[i]));
-
- close(data->fds[i]);
-
- data->fds[i] = -1;
- }
-
- data->nfiledes = 0;
-
- return OK;
-}
-
-static int
-recv_fds(devminor_t minor, struct ancillary *data,
- struct msg_control *msg_ctrl)
-{
- int rc, i, j, fds[OPEN_MAX];
- struct msghdr msghdr;
- struct cmsghdr *cmsg;
- endpoint_t to_ep;
-
- dprintf(("UDS: recv_fds(%d)\n", minor));
-
- msghdr.msg_control = msg_ctrl->msg_control;
- msghdr.msg_controllen = msg_ctrl->msg_controllen;
-
- cmsg = CMSG_FIRSTHDR(&msghdr);
- cmsg->cmsg_len = CMSG_LEN(sizeof(int) * data->nfiledes);
- cmsg->cmsg_level = SOL_SOCKET;
- cmsg->cmsg_type = SCM_RIGHTS;
-
- to_ep = uds_fd_table[minor].owner;
-
- /* Copy to the target endpoint. */
- for (i = 0; i < data->nfiledes; i++) {
- if ((rc = copyfd(to_ep, data->fds[i], COPYFD_TO)) < 0) {
- printf("UDS: copyfd(COPYFD_TO) failed: %d\n", rc);
-
- /* Revert the successful copyfd() calls made so far. */
- for (i--; i >= 0; i--)
- (void) copyfd(to_ep, fds[i], COPYFD_CLOSE);
-
- return rc;
- }
-
- fds[i] = rc; /* this is now the remote FD */
- }
-
- /* Close the local copies only once the entire procedure succeeded. */
- for (i = 0; i < data->nfiledes; i++) {
- dprintf(("UDS: recv_fds(): %d -> %d\n", data->fds[i], fds[i]));
-
- ((int *)CMSG_DATA(cmsg))[i] = fds[i];
-
- close(data->fds[i]);
-
- data->fds[i] = -1;
- }
-
- data->nfiledes = 0;
-
- return OK;
-}
-
-static int
-recv_cred(devminor_t minor, struct ancillary *data,
- struct msg_control *msg_ctrl)
-{
- struct msghdr msghdr;
- struct cmsghdr *cmsg;
- struct uucred *cred;
-
- dprintf(("UDS: recv_cred(%d)\n", minor));
-
- msghdr.msg_control = msg_ctrl->msg_control;
- msghdr.msg_controllen = msg_ctrl->msg_controllen;
-
- cmsg = CMSG_FIRSTHDR(&msghdr);
- if (cmsg->cmsg_len > 0)
- cmsg = CMSG_NXTHDR(&msghdr, cmsg);
-
- cmsg->cmsg_len = CMSG_LEN(sizeof(struct uucred));
- cmsg->cmsg_level = SOL_SOCKET;
- cmsg->cmsg_type = SCM_CREDS;
- cred = (struct uucred *)CMSG_DATA(cmsg);
- memset(cred, 0, sizeof(*cred));
- cred->cr_uid = data->cred.uid;
- cred->cr_gid = data->cred.gid;
-
- return OK;
-}
-
-static int
-do_sendmsg(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
- int peer, rc, i;
- struct msg_control msg_ctrl;
-
- dprintf(("UDS: do_sendmsg(%d)\n", minor));
-
- memset(&msg_ctrl, '\0', sizeof(struct msg_control));
-
- if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &msg_ctrl,
- sizeof(struct msg_control))) != OK)
- return rc;
-
- /* Locate the peer. */
- peer = -1;
- if (uds_fd_table[minor].type == SOCK_DGRAM) {
- if (uds_fd_table[minor].target.sun_path[0] == '\0' ||
- uds_fd_table[minor].target.sun_family != AF_UNIX)
- return EDESTADDRREQ;
-
- for (i = 0; i < NR_FDS; i++) {
- /*
- * Look for a SOCK_DGRAM socket that is bound on the
- * target address.
- */
- if (uds_fd_table[i].type == SOCK_DGRAM &&
- uds_fd_table[i].stale == FALSE &&
- uds_fd_table[i].addr.sun_family == AF_UNIX &&
- !strncmp(uds_fd_table[minor].target.sun_path,
- uds_fd_table[i].addr.sun_path,
- sizeof(uds_fd_table[i].addr.sun_path))) {
- peer = i;
- break;
- }
- }
-
- if (peer == -1)
- return ENOENT;
- } else {
- peer = uds_fd_table[minor].peer;
- if (peer == -1)
- return ENOTCONN;
- }
-
- dprintf(("UDS: sendmsg(%d) -- peer=%d\n", minor, peer));
-
- /*
- * Note: it's possible that there is already some file descriptors in
- * ancillary_data if the peer didn't call recvmsg() yet. That's okay.
- * The receiver will get the current file descriptors plus the new
- * ones.
- */
- return send_fds(minor, &msg_ctrl, &uds_fd_table[peer].ancillary_data);
-}
-
-static int
-do_recvmsg(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
- int rc;
- struct msg_control msg_ctrl;
- socklen_t clen_avail = 0;
- socklen_t clen_needed = 0;
- socklen_t clen_desired = 0;
-
- dprintf(("UDS: do_recvmsg(%d)\n", minor));
- dprintf(("UDS: minor=%d credentials={uid:%d,gid:%d}\n", minor,
- uds_fd_table[minor].ancillary_data.cred.uid,
- uds_fd_table[minor].ancillary_data.cred.gid));
-
- memset(&msg_ctrl, '\0', sizeof(struct msg_control));
-
- /*
- * Get the msg_control from the user. It will include the
- * amount of space the user has allocated for control data.
- */
- if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &msg_ctrl,
- sizeof(struct msg_control))) != OK)
- return rc;
-
- clen_avail = MIN(msg_ctrl.msg_controllen, MSG_CONTROL_MAX);
-
- if (uds_fd_table[minor].ancillary_data.nfiledes > 0) {
- clen_needed = CMSG_SPACE(sizeof(int) *
- uds_fd_table[minor].ancillary_data.nfiledes);
- }
-
- /* if there is room we also include credentials */
- clen_desired = clen_needed + CMSG_SPACE(sizeof(struct uucred));
-
- if (clen_needed > clen_avail)
- return EOVERFLOW;
-
- if (uds_fd_table[minor].ancillary_data.nfiledes > 0) {
- if ((rc = recv_fds(minor, &uds_fd_table[minor].ancillary_data,
- &msg_ctrl)) != OK)
- return rc;
- }
-
- if (clen_desired <= clen_avail) {
- rc = recv_cred(minor, &uds_fd_table[minor].ancillary_data,
- &msg_ctrl);
- if (rc != OK)
- return rc;
- msg_ctrl.msg_controllen = clen_desired;
- } else
- msg_ctrl.msg_controllen = clen_needed;
-
- /* Send the control data to the user. */
- return sys_safecopyto(endpt, grant, 0, (vir_bytes) &msg_ctrl,
- sizeof(struct msg_control));
-}
-
-static int
-do_fionread(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
- int rc;
-
- rc = uds_perform_read(minor, NONE, GRANT_INVALID, UDS_BUF, 1);
-
- /* What should we do on error? Just set to zero for now. */
- if (rc < 0)
- rc = 0;
-
- return sys_safecopyto(endpt, grant, 0, (vir_bytes) &rc, sizeof(rc));
-}
-
-int
-uds_do_ioctl(devminor_t minor, unsigned long request, endpoint_t endpt,
- cp_grant_id_t grant)
-{
- int rc;
-
- switch (request) {
- case NWIOSUDSCONN:
- /* Connect to a listening socket -- connect(). */
- rc = do_connect(minor, endpt, grant);
-
- break;
-
- case NWIOSUDSACCEPT:
- /* Accept an incoming connection -- accept(). */
- rc = do_accept(minor, endpt, grant);
-
- break;
-
- case NWIOSUDSBLOG:
- /*
- * Set the backlog_size and put the socket into the listening
- * state -- listen().
- */
- rc = do_listen(minor, endpt, grant);
-
- break;
-
- case NWIOSUDSTYPE:
- /* Set the SOCK_ type for this socket -- socket(). */
- rc = do_socket(minor, endpt, grant);
-
- break;
-
- case NWIOSUDSADDR:
- /* Set the address for this socket -- bind(). */
- rc = do_bind(minor, endpt, grant);
-
- break;
-
- case NWIOGUDSADDR:
- /* Get the address for this socket -- getsockname(). */
- rc = do_getsockname(minor, endpt, grant);
-
- break;
-
- case NWIOGUDSPADDR:
- /* Get the address for the peer -- getpeername(). */
- rc = do_getpeername(minor, endpt, grant);
-
- break;
-
- case NWIOSUDSSHUT:
- /*
- * Shut down a socket for reading, writing, or both --
- * shutdown().
- */
- rc = do_shutdown(minor, endpt, grant);
-
- break;
-
- case NWIOSUDSPAIR:
- /* Connect two sockets -- socketpair(). */
- rc = do_socketpair(minor, endpt, grant);
-
- break;
-
- case NWIOGUDSSOTYPE:
- /* Get socket type -- getsockopt(SO_TYPE). */
- rc = do_getsockopt_sotype(minor, endpt, grant);
-
- break;
-
- case NWIOGUDSPEERCRED:
- /* Get peer endpoint -- getsockopt(SO_PEERCRED). */
- rc = do_getsockopt_peercred(minor, endpt, grant);
-
- break;
-
- case NWIOSUDSTADDR:
- /* Set target address -- sendto(). */
- rc = do_sendto(minor, endpt, grant);
-
- break;
-
- case NWIOGUDSFADDR:
- /* Get from address -- recvfrom(). */
- rc = do_recvfrom(minor, endpt, grant);
-
- break;
-
- case NWIOGUDSSNDBUF:
- /* Get the send buffer size -- getsockopt(SO_SNDBUF). */
- rc = do_getsockopt_sndbuf(minor, endpt, grant);
-
- break;
-
- case NWIOSUDSSNDBUF:
- /* Set the send buffer size -- setsockopt(SO_SNDBUF). */
- rc = do_setsockopt_sndbuf(minor, endpt, grant);
-
- break;
-
- case NWIOGUDSRCVBUF:
- /* Get the send buffer size -- getsockopt(SO_SNDBUF). */
- rc = do_getsockopt_rcvbuf(minor, endpt, grant);
-
- break;
-
- case NWIOSUDSRCVBUF:
- /* Set the send buffer size -- setsockopt(SO_SNDBUF). */
- rc = do_setsockopt_rcvbuf(minor, endpt, grant);
-
- break;
-
- case NWIOSUDSCTRL:
- /* Set the control data -- sendmsg(). */
- rc = do_sendmsg(minor, endpt, grant);
-
- break;
-
- case NWIOGUDSCTRL:
- /* Set the control data -- recvmsg(). */
- rc = do_recvmsg(minor, endpt, grant);
-
- break;
-
- case FIONREAD:
- /*
- * Get the number of bytes immediately available for reading.
- */
- rc = do_fionread(minor, endpt, grant);
-
- break;
-
- default:
- /*
- * The IOCTL command is not valid for /dev/uds -- this happens
- * a lot and is normal. A lot of libc functions determine the
- * socket type with IOCTLs. Any unrecognized requests simply
- * get an ENOTTY response.
- */
-
- rc = ENOTTY;
- }
-
- return rc;
-}
--- /dev/null
+/* UNIX Domain Sockets - stat.c - network status */
+
+#include "uds.h"
+#include <sys/socketvar.h>
+#include <sys/unpcb.h>
+
+/*
+ * Fill the given 'ki' structure with information about the socket 'uds'.
+ */
+static void
+uds_get_info(struct kinfo_pcb * ki, const struct udssock * uds)
+{
+ struct udssock *peer;
+ socklen_t len;
+ int type;
+
+ type = uds_get_type(uds);
+ peer = uds_get_peer(uds);
+
+ ki->ki_pcbaddr = (uint64_t)(uintptr_t)uds;
+ ki->ki_ppcbaddr = (uint64_t)(uintptr_t)uds;
+ ki->ki_sockaddr = (uint64_t)(uintptr_t)&uds->uds_sock;
+ ki->ki_family = AF_UNIX;
+ ki->ki_type = type;
+ ki->ki_protocol = UDSPROTO_UDS;
+ ki->ki_pflags = 0;
+ if (uds->uds_flags & UDSF_CONNWAIT)
+ ki->ki_pflags |= UNP_CONNWAIT;
+ if (uds->uds_flags & UDSF_PASSCRED)
+ ki->ki_pflags |= UNP_WANTCRED;
+ if (type != SOCK_DGRAM && uds->uds_cred.unp_pid != -1) {
+ if (uds_is_listening(uds))
+ ki->ki_pflags |= UNP_EIDSBIND;
+ else if (uds_is_connecting(uds) || uds_is_connected(uds))
+ ki->ki_pflags |= UNP_EIDSVALID;
+ }
+ /* Not sure about NetBSD connection states. First attempt here. */
+ if (uds_is_connecting(uds))
+ ki->ki_sostate = SS_ISCONNECTING;
+ else if (uds_is_connected(uds))
+ ki->ki_sostate = SS_ISCONNECTED;
+ else if (uds_is_disconnected(uds))
+ ki->ki_sostate = SS_ISDISCONNECTED;
+ ki->ki_rcvq = uds->uds_len;
+ /* We currently mirror the peer's receive queue size when connected. */
+ if (uds_is_connected(uds))
+ ki->ki_sndq = peer->uds_len;
+ /* The source is not set for bound connection-type sockets here. */
+ if (type == SOCK_DGRAM || uds_is_listening(uds))
+ uds_make_addr(uds->uds_path, (size_t)uds->uds_pathlen,
+ &ki->ki_src, &len);
+ if (peer != NULL)
+ uds_make_addr(peer->uds_path, (size_t)peer->uds_pathlen,
+ &ki->ki_dst, &len);
+ /* TODO: we should set ki_inode and ki_vnode, but to what? */
+ ki->ki_conn = (uint64_t)(uintptr_t)peer;
+ if (!TAILQ_EMPTY(&uds->uds_queue))
+ ki->ki_refs =
+ (uint64_t)(uintptr_t)TAILQ_FIRST(&uds->uds_queue);
+ if (uds_has_link(uds))
+ ki->ki_nextref =
+ (uint64_t)(uintptr_t)TAILQ_NEXT(uds, uds_next);
+}
+
+/*
+ * Remote MIB implementation of CTL_NET PF_LOCAL {SOCK_STREAM,SOCK_DGRAM,
+ * SOCK_SEQPACKET} 0. This function handles all queries on the
+ * "net.local.{stream,dgram,seqpacket}.pcblist" sysctl(7) nodes.
+ *
+ * The 0 for "pcblist" is a MINIXism: we use it to keep our arrays small.
+ * NetBSD numbers these nodes dynamically and so they have numbers above
+ * CREATE_BASE. That also means that no userland application can possibly
+ * hardcode their numbers, and must perform lookups by name. In turn, that
+ * means that we can safely change the 0 to another number if NetBSD ever
+ * introduces statically numbered nodes in these subtrees.
+ */
+static ssize_t
+net_local_pcblist(struct rmib_call * call, struct rmib_node * node __unused,
+ struct rmib_oldp * oldp, struct rmib_newp * newp __unused)
+{
+ struct udssock *uds;
+ struct kinfo_pcb ki;
+ ssize_t off;
+ int r, type, size, max;
+
+ if (call->call_namelen != 4)
+ return EINVAL;
+
+ /* The first two added name fields are not used. */
+
+ size = call->call_name[2];
+ if (size < 0 || (size_t)size > sizeof(ki))
+ return EINVAL;
+ if (size == 0)
+ size = sizeof(ki);
+ max = call->call_name[3];
+
+ type = call->call_oname[2];
+
+ off = 0;
+
+ for (uds = uds_enum(NULL, type); uds != NULL;
+ uds = uds_enum(uds, type)) {
+ if (rmib_inrange(oldp, off)) {
+ memset(&ki, 0, sizeof(ki));
+
+ uds_get_info(&ki, uds);
+
+ if ((r = rmib_copyout(oldp, off, &ki, size)) < 0)
+ return r;
+ }
+
+ off += size;
+ if (max > 0 && --max == 0)
+ break;
+ }
+
+ /*
+ * Margin to limit the possible effects of the inherent race condition
+ * between receiving just the data size and receiving the actual data.
+ */
+ if (oldp == NULL)
+ off += PCB_SLOP * size;
+
+ return off;
+}
+
+/* The CTL_NET PF_LOCAL SOCK_STREAM subtree. */
+static struct rmib_node net_local_stream_table[] = {
+ [0] = RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0, net_local_pcblist,
+ "pcblist", "SOCK_STREAM protocol control block list"),
+};
+
+/* The CTL_NET PF_LOCAL SOCK_DGRAM subtree. */
+static struct rmib_node net_local_dgram_table[] = {
+ [0] = RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0, net_local_pcblist,
+ "pcblist", "SOCK_DGRAM protocol control block list"),
+};
+
+/* The CTL_NET PF_LOCAL SOCK_SEQPACKET subtree. */
+static struct rmib_node net_local_seqpacket_table[] = {
+ [0] = RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0, net_local_pcblist,
+ "pcblist", "SOCK_SEQPACKET protocol control block list"),
+};
+
+/* The CTL_NET PF_LOCAL subtree. */
+static struct rmib_node net_local_table[] = {
+/* 1*/ [SOCK_STREAM] = RMIB_NODE(RMIB_RO, net_local_stream_table,
+ "stream", "SOCK_STREAM settings"),
+/* 2*/ [SOCK_DGRAM] = RMIB_NODE(RMIB_RO, net_local_dgram_table,
+ "dgram", "SOCK_DGRAM settings"),
+/* 5*/ [SOCK_SEQPACKET] = RMIB_NODE(RMIB_RO, net_local_seqpacket_table,
+ "seqpacket", "SOCK_SEQPACKET settings"),
+};
+
+static struct rmib_node net_local_node =
+ RMIB_NODE(RMIB_RO, net_local_table, "local", "PF_LOCAL related settings");
+
+/*
+ * Initialize the status module.
+ */
+void
+uds_stat_init(void)
+{
+ const int mib[] = { CTL_NET, PF_LOCAL };
+ int r;
+
+ /*
+ * Register our own "net.local" subtree with the MIB service.
+ *
+ * This call only returns local failures. Remote failures (in the MIB
+ * service) are silently ignored. So, we can safely panic on failure.
+ */
+ if ((r = rmib_register(mib, __arraycount(mib), &net_local_node)) != OK)
+ panic("UDS: unable to register remote MIB tree: %d", r);
+}
+
+/*
+ * Clean up the status module.
+ */
+void
+uds_stat_cleanup(void)
+{
+
+ rmib_deregister(&net_local_node);
+}
+++ /dev/null
-.TH UDS 8
-.SH NAME
-uds \- unix domain sockets device
-.SH DESCRIPTION
-The \fIuds\fP device gives access to the unix domain socket services in
-Minix. It is a virtual device similar to the \fItcp\fP and \fIudp\fP
-Internet Protocol server devices.
-.SH SEE ALSO
-.BR socket(2),
-.BR socketpair(2),
-.BR dev(4),
-.BR ip(4),
-.BR unix(8)
-.SH HISTORY
-This device first appeared in Minix 3.1.8.
-/*
- * Unix Domain Sockets Implementation (PF_UNIX, PF_LOCAL)
- * This code handles requests generated by operations on /dev/uds
- *
- * The interface to UNIX domain sockets is similar to the interface to network
- * sockets. There is a character device (/dev/uds) and this server is a
- * 'driver' for that device.
- */
+/* UNIX Domain Sockets - uds.c - socket management */
#include "uds.h"
-static ssize_t uds_perform_write(devminor_t, endpoint_t, cp_grant_id_t, size_t,
- int);
-
-static int uds_open(devminor_t, int, endpoint_t);
-static int uds_close(devminor_t);
-static ssize_t uds_read(devminor_t, u64_t, endpoint_t, cp_grant_id_t, size_t,
- int, cdev_id_t);
-static ssize_t uds_write(devminor_t, u64_t, endpoint_t, cp_grant_id_t, size_t,
- int, cdev_id_t);
-static int uds_ioctl(devminor_t, unsigned long, endpoint_t, cp_grant_id_t, int,
- endpoint_t, cdev_id_t);
-static int uds_cancel(devminor_t, endpoint_t, cdev_id_t);
-static int uds_select(devminor_t, unsigned int, endpoint_t);
-
-static struct chardriver uds_tab = {
- .cdr_open = uds_open,
- .cdr_close = uds_close,
- .cdr_read = uds_read,
- .cdr_write = uds_write,
- .cdr_ioctl = uds_ioctl,
- .cdr_cancel = uds_cancel,
- .cdr_select = uds_select
-};
+static struct udssock uds_array[NR_UDSSOCK];
+static TAILQ_HEAD(uds_freelist, udssock) uds_freelist;
+static unsigned int uds_in_use;
+static int uds_running;
-/* File Descriptor Table */
-uds_fd_t uds_fd_table[NR_FDS];
+static const struct sockevent_ops uds_ops;
-static unsigned int uds_exit_left;
+static SLIST_HEAD(udshash, udssock) udshash[UDSHASH_SLOTS];
-static int
-uds_open(devminor_t UNUSED(orig_minor), int access,
- endpoint_t user_endpt)
+/*
+ * Initialize file-to-socket hash table.
+ */
+static void
+udshash_init(void)
{
- devminor_t minor;
- char *buf;
- int i;
+ unsigned int slot;
- dprintf(("UDS: uds_open() from %d\n", user_endpt));
+ for (slot = 0; slot < __arraycount(udshash); slot++)
+ SLIST_INIT(&udshash[slot]);
+}
- /*
- * Find a slot in the descriptor table for the new descriptor.
- * The index of the descriptor in the table will be returned.
- * Subsequent calls to read/write/close/ioctl/etc will use this
- * minor number. The minor number must be different from the
- * the /dev/uds device's minor number (0).
- */
- for (minor = 1; minor < NR_FDS; minor++)
- if (uds_fd_table[minor].state == UDS_FREE)
- break;
+/*
+ * Return a hash table slot number for the given <dev,ino> pair.
+ */
+static unsigned int
+udshash_slot(dev_t dev, ino_t ino)
+{
- if (minor == NR_FDS)
- return ENFILE;
+ assert(dev != NO_DEV);
+ assert(ino != 0);
/*
- * Allocate memory for the ringer buffer. In order to save on memory
- * in the common case, the buffer is allocated only when the socket is
- * in use. We use mmap instead of malloc to allow the memory to be
- * actually freed later.
+ * Effectively combining two 64-bit numbers into a single 6-or-so-bit
+ * hash is not too easy. This hash function is probably among the
+ * worst options. Then again it is not all that critical as we are not
+ * expecting that many bound UDS sockets in the system anyway.
*/
- if ((buf = mmap(NULL, UDS_BUF, PROT_READ | PROT_WRITE,
- MAP_ANON | MAP_PRIVATE, -1, 0)) == MAP_FAILED)
- return ENOMEM;
+ return (unsigned int)(dev ^ ino) % UDSHASH_SLOTS;
+}
- /*
- * Allocate the socket, and set its initial parameters.
- */
- uds_fd_table[minor].state = UDS_INUSE;
- uds_fd_table[minor].owner = user_endpt;
- uds_fd_table[minor].sel_endpt = NONE;
- uds_fd_table[minor].sel_ops = 0;
- uds_fd_table[minor].buf = buf;
- uds_fd_table[minor].pos = 0;
- uds_fd_table[minor].size = 0;
- uds_fd_table[minor].mode = UDS_R | UDS_W;
- uds_fd_table[minor].type = -1;
+/*
+ * Look for a socket that is bound to the given <dev,ino> pair. Return a
+ * pointer to the socket if found, or NULL otherwise.
+ */
+static struct udssock *
+udshash_get(dev_t dev, ino_t ino)
+{
+ struct udssock *uds;
+ unsigned int slot;
- for (i = 0; i < UDS_SOMAXCONN; i++)
- uds_fd_table[minor].backlog[i] = -1;
- uds_fd_table[minor].backlog_size = UDS_SOMAXCONN;
+ slot = udshash_slot(dev, ino);
- memset(&uds_fd_table[minor].ancillary_data, '\0',
- sizeof(struct ancillary));
- for (i = 0; i < OPEN_MAX; i++)
- uds_fd_table[minor].ancillary_data.fds[i] = -1;
+ SLIST_FOREACH(uds, &udshash[slot], uds_hash) {
+ if (uds->uds_dev == dev && uds->uds_ino == ino)
+ return uds;
+ }
- uds_fd_table[minor].stale = FALSE;
- uds_fd_table[minor].listening = FALSE;
- uds_fd_table[minor].peer = -1;
- uds_fd_table[minor].child = -1;
+ return NULL;
+}
- memset(&uds_fd_table[minor].addr, '\0', sizeof(struct sockaddr_un));
- memset(&uds_fd_table[minor].source, '\0', sizeof(struct sockaddr_un));
- memset(&uds_fd_table[minor].target, '\0', sizeof(struct sockaddr_un));
+/*
+ * Add a socket to the file-to-socket hash table. The socket must have its
+ * device and inode fields set, and must not be in the hash table already.
+ */
+static void
+udshash_add(struct udssock * uds)
+{
+ unsigned int slot;
- uds_fd_table[minor].suspended = UDS_NOT_SUSPENDED;
+ slot = udshash_slot(uds->uds_dev, uds->uds_ino);
- return CDEV_CLONED | minor;
+ SLIST_INSERT_HEAD(&udshash[slot], uds, uds_hash);
}
+/*
+ * Remove a socket from the file-to-socket hash table. The socket must be in
+ * the hash table.
+ */
static void
-uds_reset(devminor_t minor)
+udshash_del(struct udssock * uds)
{
- /* Disconnect the socket from its peer. */
- uds_fd_table[minor].peer = -1;
+ unsigned int slot;
- /* Set an error to pass to the caller. */
- uds_fd_table[minor].err = ECONNRESET;
+ slot = udshash_slot(uds->uds_dev, uds->uds_ino);
- /* If a process was blocked on I/O, revive it. */
- if (uds_fd_table[minor].suspended != UDS_NOT_SUSPENDED)
- uds_unsuspend(minor);
+ /* This macro is O(n). */
+ SLIST_REMOVE(&udshash[slot], uds, udssock, uds_hash);
+}
- /* All of the peer's calls will fail immediately now. */
- if (uds_fd_table[minor].sel_ops != 0) {
- chardriver_reply_select(uds_fd_table[minor].sel_endpt, minor,
- uds_fd_table[minor].sel_ops);
- uds_fd_table[minor].sel_ops = 0;
- }
+/*
+ * Return the socket identifier for the given UDS socket object.
+ */
+sockid_t
+uds_get_id(struct udssock * uds)
+{
+
+ return (sockid_t)(uds - uds_array);
}
-static int
-uds_close(devminor_t minor)
+/*
+ * Given either NULL or a previously returned socket, return the next in-use
+ * UDS socket of the given socket type, or NULL if there are no more matches.
+ * The sockets are returned in random order, but each matching socket is
+ * returned exactly once (until any socket is allocated or freed).
+ */
+struct udssock *
+uds_enum(struct udssock * prev, int type)
{
- int i, peer;
+ sockid_t id;
- dprintf(("UDS: uds_close(%d)\n", minor));
+ if (prev != NULL)
+ id = uds_get_id(prev) + 1;
+ else
+ id = 0;
- if (minor < 0 || minor >= NR_FDS) return ENXIO;
+ for (; id < NR_UDSSOCK; id++)
+ if ((uds_array[id].uds_flags & UDSF_IN_USE) &&
+ uds_get_type(&uds_array[id]) == type)
+ return &uds_array[id];
- if (uds_fd_table[minor].state != UDS_INUSE)
- return EINVAL;
+ return NULL;
+}
- peer = uds_fd_table[minor].peer;
- if (peer != -1 && uds_fd_table[peer].peer == -1) {
- /* Connecting socket: clear from server's backlog. */
- if (!uds_fd_table[peer].listening)
- panic("connecting socket attached to non-server");
-
- for (i = 0; i < uds_fd_table[peer].backlog_size; i++) {
- if (uds_fd_table[peer].backlog[i] == minor) {
- uds_fd_table[peer].backlog[i] = -1;
- break;
- }
- }
- } else if (peer != -1) {
- /* Connected socket: disconnect it. */
- uds_reset(peer);
- } else if (uds_fd_table[minor].listening) {
- /* Listening socket: disconnect all sockets in the backlog. */
- for (i = 0; i < uds_fd_table[minor].backlog_size; i++)
- if (uds_fd_table[minor].backlog[i] != -1)
- uds_reset(uds_fd_table[minor].backlog[i]);
- }
+/*
+ * Invalidate credentials on the socket.
+ */
+static void
+uds_clear_cred(struct udssock * uds)
+{
- if (uds_fd_table[minor].ancillary_data.nfiledes > 0)
- uds_clear_fds(minor, &uds_fd_table[minor].ancillary_data);
+ uds->uds_cred.unp_pid = -1;
+ uds->uds_cred.unp_euid = -1;
+ uds->uds_cred.unp_egid = -1;
+}
- /* Release the memory for the ring buffer. */
- munmap(uds_fd_table[minor].buf, UDS_BUF);
+/*
+ * Obtain the credentials (process, user, and group ID) of the given user
+ * endpoint and associate them with the socket for later retrieval. It is
+ * important to note that this information is obtained once at connect time,
+ * and never updated later. The party receiving the credentials must take this
+ * into account.
+ */
+static void
+uds_get_cred(struct udssock * uds, endpoint_t user_endpt)
+{
+ int r;
- /* Set the socket back to its original UDS_FREE state. */
- memset(&uds_fd_table[minor], '\0', sizeof(uds_fd_t));
+ if ((uds->uds_cred.unp_pid = r = getepinfo(user_endpt,
+ &uds->uds_cred.unp_euid, &uds->uds_cred.unp_egid)) < 0) {
+ printf("UDS: failed obtaining credentials of %d (%d)\n",
+ user_endpt, r);
- /* If terminating, and this was the last open socket, exit now. */
- if (uds_exit_left > 0) {
- if (--uds_exit_left == 0)
- chardriver_terminate();
+ uds_clear_cred(uds);
}
+}
+/*
+ * Allocate and initialize a UDS socket. On succes, return OK with a pointer
+ * to the new socket in 'udsp'. On failure, return a negative error code.
+ */
+static int
+uds_alloc(struct udssock ** udsp)
+{
+ struct udssock *uds;
+ int r;
+
+ /* Allocate, initialize, and return a UNIX domain socket object. */
+ if (TAILQ_EMPTY(&uds_freelist))
+ return ENOBUFS;
+
+ uds = TAILQ_FIRST(&uds_freelist);
+
+ uds->uds_conn = NULL; /* not connected */
+ uds->uds_link = NULL; /* not connecting or linked */
+ uds->uds_queued = 0;
+ uds->uds_flags = UDSF_IN_USE; /* may be found through enumeration */
+ uds->uds_pathlen = 0; /* not bound: no path */
+ uds->uds_dev = NO_DEV; /* not hashed: no socket file device */
+ uds->uds_ino = 0; /* not hashed: no socket file inode */
+ uds_clear_cred(uds); /* no bind/connect-time credentials */
+ TAILQ_INIT(&uds->uds_queue); /* an empty queue */
+
+ if ((r = uds_io_setup(uds)) != OK)
+ return r;
+
+ TAILQ_REMOVE(&uds_freelist, uds, uds_next);
+
+ assert(uds_in_use < NR_UDSSOCK);
+ uds_in_use++;
+
+ *udsp = uds;
return OK;
}
-static int
-uds_select(devminor_t minor, unsigned int ops, endpoint_t endpt)
+/*
+ * Free a previously allocated socket.
+ */
+static void
+uds_free(struct sock * sock)
{
- unsigned int ready_ops;
- int i, bytes, watch;
+ struct udssock *uds = (struct udssock *)sock;
- dprintf(("UDS: uds_select(%d)\n", minor));
+ uds_io_cleanup(uds);
- if (minor < 0 || minor >= NR_FDS) return ENXIO;
+ uds->uds_flags = 0; /* no longer in use */
- if (uds_fd_table[minor].state != UDS_INUSE)
- return EINVAL;
+ TAILQ_INSERT_HEAD(&uds_freelist, uds, uds_next);
- watch = (ops & CDEV_NOTIFY);
- ops &= (CDEV_OP_RD | CDEV_OP_WR | CDEV_OP_ERR);
-
- ready_ops = 0;
-
- /* Check if there is data available to read. */
- if (ops & CDEV_OP_RD) {
- bytes = uds_perform_read(minor, NONE, GRANT_INVALID, 1, 1);
- if (bytes > 0) {
- ready_ops |= CDEV_OP_RD; /* data available */
- } else if (uds_fd_table[minor].listening == TRUE) {
- /* Check for pending connections. */
- for (i = 0; i < uds_fd_table[minor].backlog_size; i++)
- {
- if (uds_fd_table[minor].backlog[i] != -1) {
- ready_ops |= CDEV_OP_RD;
- break;
- }
- }
- } else if (bytes != EDONTREPLY) {
- ready_ops |= CDEV_OP_RD; /* error */
- }
+ assert(uds_in_use > 0);
+ if (--uds_in_use == 0 && uds_running == FALSE)
+ sef_cancel();
+}
+
+/*
+ * Create a new socket.
+ */
+static sockid_t
+uds_socket(int domain, int type, int protocol, endpoint_t user_endpt __unused,
+ struct sock ** sockp, const struct sockevent_ops ** ops)
+{
+ struct udssock *uds;
+ int r;
+
+ dprintf(("UDS: socket(%d,%d,%d)\n", domain, type, protocol));
+
+ if (domain != PF_UNIX) {
+ /* This means the service was configured incorrectly. */
+ printf("UDS: got request for domain %d\n", domain);
+
+ return EAFNOSUPPORT;
}
- /* Check if we can write without blocking. */
- if (ops & CDEV_OP_WR) {
- bytes = uds_perform_write(minor, NONE, GRANT_INVALID, 1, 1);
- if (bytes != 0 && bytes != EDONTREPLY)
- ready_ops |= CDEV_OP_WR;
+ /* We support the following three socket types. */
+ switch (type) {
+ case SOCK_STREAM:
+ case SOCK_SEQPACKET:
+ case SOCK_DGRAM:
+ break;
+ default:
+ return EPROTOTYPE;
}
/*
- * If not all requested ops were ready, and the caller requests to be
- * notified about changes, we add the remaining ops to the saved set.
+ * The PF_UNIX domain does not support particular protocols, so the
+ * given protocol must be zero (= anything that matches).
*/
- ops &= ~ready_ops;
- if (ops && watch) {
- uds_fd_table[minor].sel_endpt = endpt;
- uds_fd_table[minor].sel_ops |= ops;
- }
+ if (protocol != UDSPROTO_UDS)
+ return EPROTONOSUPPORT;
- return ready_ops;
+ if ((r = uds_alloc(&uds)) != OK)
+ return r;
+
+ dprintf(("UDS: socket returns %d\n", uds_get_id(uds)));
+
+ *sockp = &uds->uds_sock;
+ *ops = &uds_ops;
+ return uds_get_id(uds);
}
-ssize_t
-uds_perform_read(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant,
- size_t size, int pretend)
+/*
+ * Connect a pair of sockets.
+ */
+static int
+uds_pair(struct sock * sock1, struct sock * sock2, endpoint_t user_endpt)
{
- size_t pos, subsize;
- int r, peer;
+ struct udssock *uds1 = (struct udssock *)sock1;
+ struct udssock *uds2 = (struct udssock *)sock2;
- dprintf(("UDS: uds_perform_read(%d)\n", minor));
+ dprintf(("UDS: pair(%d,%d)\n", uds_get_id(uds1), uds_get_id(uds2)));
- peer = uds_fd_table[minor].peer;
+ /* Only connection-oriented types are acceptable. */
+ if (uds_get_type(uds1) == SOCK_DGRAM)
+ return EOPNOTSUPP;
- /* Skip reads of zero bytes. */
- if (size == 0)
- return 0;
+ /* Connect the sockets. */
+ uds1->uds_conn = uds2;
+ uds2->uds_conn = uds1;
+ uds1->uds_flags |= UDSF_CONNECTED;
+ uds2->uds_flags |= UDSF_CONNECTED;
- /* Check if the socket isn't shut down for reads. */
- if (!(uds_fd_table[minor].mode & UDS_R))
- return EPIPE;
+ /* Obtain the (same) credentials for both sides of the connection. */
+ uds_get_cred(uds1, user_endpt);
+ memcpy(&uds2->uds_cred, &uds1->uds_cred, sizeof(uds2->uds_cred));
- if (uds_fd_table[minor].size == 0) {
- if (peer == -1) {
- /*
- * We're not connected. That's only a problem when this
- * socket is connection oriented.
- */
- if (uds_fd_table[minor].type == SOCK_STREAM ||
- uds_fd_table[minor].type == SOCK_SEQPACKET) {
- if (uds_fd_table[minor].err == ECONNRESET) {
- if (!pretend)
- uds_fd_table[minor].err = 0;
- return ECONNRESET;
- } else
- return ENOTCONN;
- }
+ return OK;
+}
+
+/*
+ * Disconnect a UDS socket, notifying or freeing up the other end of the
+ * connection depending on whether the socket was linked, that is, on the
+ * accept queue of a listening socket.
+ */
+static void
+uds_disconnect(struct udssock * uds, int was_linked)
+{
+ struct udssock *conn;
+
+ assert(uds_is_connected(uds));
+ assert(uds_has_conn(uds));
+
+ conn = uds->uds_conn;
+
+ assert(uds_is_connected(conn));
+ assert(uds_has_conn(conn));
+ assert(!uds_has_link(conn));
+ assert(conn->uds_conn == uds);
+
+ /* Disconnect the sockets. */
+ uds->uds_conn = NULL;
+ conn->uds_conn = NULL;
+
+ /*
+ * If the given socket is linked, then it is a connected socket for
+ * which the other end has been created but not yet accepted. In that
+ * case, the other end ('conn') will have to be freed up. Otherwise,
+ * it is a regular user-created socket and we must properly transition
+ * it into disconnected state.
+ */
+ if (!was_linked) {
+ sockevent_raise(&conn->uds_sock, SEV_SEND | SEV_RECV);
+
+ /*
+ * Clear the peer credentials so that they will not be mistaken
+ * for having been obtained at bind time.
+ */
+ uds_clear_cred(conn);
+ } else
+ sockevent_raise(&conn->uds_sock, SEV_CLOSE);
+}
+
+/*
+ * Add the socket 'link' to the queue of the socket 'uds'. This also implies
+ * that 'link's link socket is set to 'uds'.
+ */
+static void
+uds_add_queue(struct udssock * uds, struct udssock * link)
+{
+
+ dprintf(("UDS: add_queue(%d,%d)\n",
+ uds_get_id(uds), uds_get_id(link)));
+
+ TAILQ_INSERT_TAIL(&uds->uds_queue, link, uds_next);
+
+ uds->uds_queued++;
+ assert(uds->uds_queued != 0);
+
+ link->uds_link = uds;
+}
+
+/*
+ * Remove the socket 'link' from the queue of the socket 'uds'. This also
+ * reset 'link's link to NULL.
+ */
+static void
+uds_del_queue(struct udssock * uds, struct udssock * link)
+{
+
+ dprintf(("UDS: del_queue(%d,%d)\n",
+ uds_get_id(uds), uds_get_id(link)));
+
+ assert(link->uds_link == uds);
+
+ TAILQ_REMOVE(&uds->uds_queue, link, uds_next);
+
+ assert(uds->uds_queued > 0);
+ uds->uds_queued--;
+
+ link->uds_link = NULL;
+}
+
+/*
+ * Remove all sockets from the queue of the socket 'uds', with the exception of
+ * 'except' if non-NULL. Raise an ECONNRESET error on all removed sockets that
+ * are not equal to 'uds'.
+ */
+static void
+uds_clear_queue(struct udssock * uds, struct udssock * except)
+{
+ struct udssock *link, *tmp;
+ int found;
+
+ dprintf(("UDS: clear_queue(%d,%d)\n",
+ uds_get_id(uds), (except != NULL) ? uds_get_id(except) : -1));
+
+ found = 0;
+
+ /*
+ * Abort all connecting sockets queued on this socket, except for the
+ * given exception, which may be NULL.
+ */
+ TAILQ_FOREACH_SAFE(link, &uds->uds_queue, uds_next, tmp) {
+ if (link == except) {
+ found++;
+
+ continue;
}
- /* Check if process is reading from a closed pipe. */
- if (peer != -1 && !(uds_fd_table[peer].mode & UDS_W) &&
- uds_fd_table[minor].size == 0)
- return 0;
+ dprintf(("UDS: clear_queue removes %d\n", uds_get_id(link)));
- if (pretend)
- return EDONTREPLY;
+ assert(uds_get_type(link) == SOCK_DGRAM ||
+ uds_is_connecting(link) || uds_is_connected(link));
- if (peer != -1 &&
- uds_fd_table[peer].suspended == UDS_SUSPENDED_WRITE)
- panic("writer blocked on empty socket");
+ uds_del_queue(uds, link);
- dprintf(("UDS: suspending read request on %d\n", minor));
+ /*
+ * Generate an error only if the socket was not linked to
+ * itself (only datagram sockets can be linked to themselves).
+ * The error is not helpful for applications in that case.
+ */
+ if (uds != link)
+ sockevent_set_error(&link->uds_sock, ECONNRESET);
- /* Process is reading from an empty pipe. Suspend it. */
- return EDONTREPLY;
+ /*
+ * If this is a listening socket, disconnect the connecting or
+ * connected end. If a connected peer was already created for
+ * the queued socket, dispose of that peer.
+ *
+ * Clear credentials obtained when starting to connect (in
+ * which case the socket is always a connection-oriented
+ * socket), so that they will not be mistaken for credentials
+ * obtained at bind time.
+ */
+ if (uds_get_type(link) != SOCK_DGRAM) {
+ if (uds_is_connected(link))
+ uds_disconnect(link, TRUE /*was_linked*/);
+ else
+ uds_clear_cred(link);
+ }
}
- /* How much can we get from the ring buffer? */
- if (size > uds_fd_table[minor].size)
- size = uds_fd_table[minor].size;
+ assert(uds->uds_queued == found);
+}
- if (pretend)
- return size;
+/*
+ * Check whether the socket address given in 'addr', with length 'addr_len', is
+ * a valid UNIX domain socket address (including a path to a socket file). On
+ * success, return the (non-zero) length of the socket file's path, minus the
+ * null terminator which may in fact not be present. The caller is responsible
+ * for copying and terminating the path as needed. A pointer to the path as
+ * stored in 'addr' is returned in 'pathp'. On failure, return an error code.
+ */
+static int
+uds_check_addr(const struct sockaddr * addr, socklen_t addr_len,
+ const char ** pathp)
+{
+ const char *p;
+ size_t len;
- /* Get the data from the tail of the ring buffer. */
- pos = uds_fd_table[minor].pos;
+ /*
+ * We could cast to a sockaddr_un structure pointer first, but that
+ * would not provide any benefits here. Instead, we use sa_data as the
+ * generic equivalent of sun_path.
+ */
+ if (addr_len < offsetof(struct sockaddr, sa_data))
+ return EINVAL;
- subsize = UDS_BUF - pos;
- if (subsize > size)
- subsize = size;
+ if (addr->sa_family != AF_UNIX)
+ return EAFNOSUPPORT;
- if ((r = sys_safecopyto(endpt, grant, 0,
- (vir_bytes) &uds_fd_table[minor].buf[pos], subsize)) != OK)
- return r;
+ len = (size_t)addr_len - offsetof(struct sockaddr, sa_data);
+ if (len > 0 && (p = memchr(addr->sa_data, '\0', len)) != NULL)
+ len = (size_t)(p - addr->sa_data);
- if (subsize < size) {
- if ((r = sys_safecopyto(endpt, grant, subsize,
- (vir_bytes) uds_fd_table[minor].buf,
- size - subsize)) != OK)
- return r;
+ /* The given path name must not be an empty string. */
+ if (len == 0)
+ return ENOENT;
+
+ /* This check should be redundant but better safe than sorry. */
+ if (len >= UDS_PATH_MAX)
+ return EINVAL;
+
+ *pathp = (const char *)addr->sa_data;
+ return len;
+}
+
+/*
+ * Given the socket file path given as 'path' with length 'path_len' (not
+ * necessarily null terminated), store a socket address with the path in
+ * 'addr', and return the socket address length in 'addr_len'. The calling
+ * libraries (libsockdriver, libsockevent) and the static assert in uds.h
+ * guarantee that 'addr' is sufficiently large to store any address we generate
+ * here. The libraries may subsequently copy out only a part of it to the user
+ * process. This function always succeeds.
+ */
+void
+uds_make_addr(const char * path, size_t len, struct sockaddr * addr,
+ socklen_t * addr_len)
+{
+
+ /*
+ * Generate the address. The stored length (sa_len/sun_len) does not
+ * include a null terminator. The entire structure does include a null
+ * terminator, but only if the socket is bound.
+ */
+ addr->sa_len = offsetof(struct sockaddr, sa_data) + len;
+ addr->sa_family = AF_UNIX;
+ if (len > 0) {
+ /* This call may (intentionally) overrun the sa_data size. */
+ memcpy((char *)addr->sa_data, path, len);
+ ((char *)addr->sa_data)[len] = '\0';
+
+ /* The socket is bound, so include the null terminator. */
+ len++;
+ assert(len <= UDS_PATH_MAX);
}
- /* Advance the buffer tail. */
- uds_fd_table[minor].pos = (pos + size) % UDS_BUF;
- uds_fd_table[minor].size -= size;
-
- /* Reset position if the buffer is empty (it may save a copy call). */
- if (uds_fd_table[minor].size == 0)
- uds_fd_table[minor].pos = 0;
-
- /* See if we can wake up a blocked writer. */
- if (peer != -1 && uds_fd_table[peer].suspended == UDS_SUSPENDED_WRITE)
- uds_unsuspend(peer);
-
- /* See if we can satisfy an ongoing select. */
- if (peer != -1 && (uds_fd_table[peer].sel_ops & CDEV_OP_WR) &&
- uds_fd_table[minor].size < UDS_BUF) {
- /* A write on the peer is possible now. */
- chardriver_reply_select(uds_fd_table[peer].sel_endpt, peer,
- CDEV_OP_WR);
- uds_fd_table[peer].sel_ops &= ~CDEV_OP_WR;
+ /* Note that this length may be different from sa_len/sun_len now. */
+ *addr_len = offsetof(struct sockaddr, sa_data) + len;
+}
+
+/*
+ * Bind a socket to a local address.
+ */
+static int
+uds_bind(struct sock * sock, const struct sockaddr * addr, socklen_t addr_len,
+ endpoint_t user_endpt)
+{
+ struct udssock *uds = (struct udssock *)sock;
+ struct udssock *uds2;
+ const char *path;
+ size_t len;
+ dev_t dev;
+ ino_t ino;
+ int r;
+
+ dprintf(("UDS: bind(%d)\n", uds_get_id(uds)));
+
+ /* A socket may be bound at any time, but only once. */
+ if (uds_is_bound(uds))
+ return EINVAL;
+
+ /* Verify that the user gave us an acceptable address. */
+ if ((r = uds_check_addr(addr, addr_len, &path)) < 0)
+ return r;
+ len = (size_t)r;
+
+ /* Attempt to create the socket file on the file system. */
+ r = socketpath(user_endpt, path, len, SPATH_CREATE, &dev, &ino);
+ if (r != OK)
+ return r;
+ assert(dev != NO_DEV && ino != 0);
+
+ /*
+ * It is possible that a socket file of a previously bound socket was
+ * unlinked, and due to inode number reuse, a new socket file has now
+ * been created with the same <dev,ino> pair. In that case, we must
+ * unbind the old socket, because it must no longer be found. The old
+ * socket will still have a path (and behave as though it is bound) but
+ * no longer be found through hash lookups.
+ */
+ if ((uds2 = udshash_get(dev, ino)) != NULL) {
+ udshash_del(uds2);
+
+ uds2->uds_dev = NO_DEV;
+ uds2->uds_ino = 0;
}
- return size; /* number of bytes read */
+ /*
+ * Obtain credentials for the socket, unless the socket is already
+ * connecting or connected, in which case we must not replace the
+ * credentials we obtained already. We later clear those credentials
+ * upon a connection failure or disconnect, so that if the socket is
+ * then put in listening mode, we know there are no bind-time
+ * credentials. Not ideal, but we really need two separate sets of
+ * credentials if we want to get this right, which is a waste of memory
+ * as no sane application writer would ever rely on credential passing
+ * after recycling a socket..
+ */
+ if (uds_get_type(uds) != SOCK_DGRAM && !uds_is_connecting(uds) &&
+ !uds_is_connected(uds))
+ uds_get_cred(uds, user_endpt);
+
+ /* Asssign the address to the socket. */
+ uds->uds_pathlen = len;
+ memcpy(&uds->uds_path, path, len);
+ uds->uds_dev = dev;
+ uds->uds_ino = ino;
+
+ udshash_add(uds);
+
+ return OK;
}
-static ssize_t
-uds_perform_write(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant,
- size_t size, int pretend)
+/*
+ * Look up a UDS socket based on a user-given address. If a socket exists for
+ * the address, check if it is type-compatible with the given UDS socket.
+ * On succes, return OK, with 'peerp' set to the socket that was found. On
+ * failure, return a negative error code.
+ */
+int
+uds_lookup(struct udssock * uds, const struct sockaddr * addr,
+ socklen_t addr_len, endpoint_t user_endpt, struct udssock ** peerp)
{
- size_t subsize, pos;
- int i, r, peer;
+ struct udssock *peer;
+ const char *path;
+ size_t len;
+ dev_t dev;
+ ino_t ino;
+ int r;
+
+ /* Verify that the user gave us an acceptable address. */
+ if ((r = uds_check_addr(addr, addr_len, &path)) < 0)
+ return r;
+ len = (size_t)r;
- dprintf(("UDS: uds_perform_write(%d)\n", minor));
+ /* Attempt to look up the socket file on the file system. */
+ r = socketpath(user_endpt, path, len, SPATH_CHECK, &dev, &ino);
+ if (r != OK)
+ return r;
+ assert(dev != NO_DEV && ino != 0);
- /* Skip writes of zero bytes. */
- if (size == 0)
- return 0;
+ if ((peer = udshash_get(dev, ino)) == NULL)
+ return ECONNREFUSED;
+ if (uds_get_type(peer) != uds_get_type(uds))
+ return EPROTOTYPE;
- /* Check if the socket isn't shut down for writes. */
- if (!(uds_fd_table[minor].mode & UDS_W))
- return EPIPE;
+ *peerp = peer;
+ return OK;
+}
- /* Datagram messages must fit in the buffer entirely. */
- if (size > UDS_BUF && uds_fd_table[minor].type != SOCK_STREAM)
- return EMSGSIZE;
+/*
+ * Given the listening socket 'uds', and the socket 'link' that is calling or
+ * has called connect(2) and is or will be linked to the listening socket's
+ * queue, create a new socket and connect it to 'link', putting both sockets in
+ * the connected state. The given link socket may be in unconnected,
+ * connecting, or disconnected state prior to the call. Return OK or an error
+ * code. The link state of the link socket remains unchanged in any case.
+ */
+static int
+uds_attach(struct udssock * uds, struct udssock * link)
+{
+ struct udssock *conn;
+ int r;
+
+ /*
+ * Allocate a new socket to use as peer socket for the connection that
+ * is about to be established. The new socket is not yet known by
+ * libsockevent.
+ */
+ if ((r = uds_alloc(&conn)) != OK)
+ return r;
- if (uds_fd_table[minor].type == SOCK_STREAM ||
- uds_fd_table[minor].type == SOCK_SEQPACKET) {
+ /*
+ * Ask libsockevent to clone the sock object in the new UDS socket from
+ * the listening socket. This adds the sock object to libsockevent's
+ * data structures and ensures that we can safely use the socket
+ * despite the fact that it has not yet been accepted (and thus
+ * returned to libsockevent). From this moment on, we must either
+ * return the socket's ID (but not a pointer to it!) from uds_accept()
+ * or raise SEV_CLOSE on it.
+ */
+ sockevent_clone(&uds->uds_sock, &conn->uds_sock, uds_get_id(conn));
+
+ /* Connect the link socket to the new socket. */
+ link->uds_conn = conn;
+ link->uds_flags |= UDSF_CONNECTED;
+
+ /*
+ * Connect the new socket to the link socket as well. The child
+ * socket should also inherit pretty much all settings from the
+ * listening socket, including the bind path and the listening socket's
+ * bind-time credentials.
+ */
+ conn->uds_conn = link;
+ conn->uds_flags = uds->uds_flags & (UDSF_PASSCRED | UDSF_CONNWAIT);
+ conn->uds_flags |= UDSF_CONNECTED;
+ conn->uds_pathlen = uds->uds_pathlen;
+ memcpy(conn->uds_path, uds->uds_path, (size_t)uds->uds_pathlen);
+ memcpy(&conn->uds_cred, &uds->uds_cred, sizeof(conn->uds_cred));
+
+ return OK;
+}
+
+/*
+ * Connect a socket to a remote address.
+ */
+static int
+uds_connect(struct sock * sock, const struct sockaddr * addr,
+ socklen_t addr_len, endpoint_t user_endpt)
+{
+ struct udssock *uds = (struct udssock *)sock;
+ struct udssock *link;
+ int r;
+
+ dprintf(("UDS: connect(%d)\n", uds_get_id(uds)));
+
+ /* For connection-oriented sockets, several state checks apply. */
+ if (uds_get_type(uds) != SOCK_DGRAM) {
+ if (uds_is_listening(uds))
+ return EOPNOTSUPP;
+ if (uds_is_connecting(uds))
+ return EALREADY;
+ if (uds_is_connected(uds))
+ return EISCONN;
+ /* Disconnected sockets may be reconnected, see below. */
+ } else {
/*
- * If we're writing to a connection-oriented socket, then it
- * needs a peer to write to. For disconnected sockets, writing
- * is an error; for connecting sockets, writes should suspend.
+ * Connectionless sockets may be unconnected by providing an
+ * address with family AF_UNSPEC. Handle this case first here.
*/
- peer = uds_fd_table[minor].peer;
-
- if (peer == -1) {
- if (uds_fd_table[minor].err == ECONNRESET) {
- if (!pretend)
- uds_fd_table[minor].err = 0;
- return ECONNRESET;
- } else
- return ENOTCONN;
- } else if (uds_fd_table[peer].peer == -1) /* connecting */
- return EDONTREPLY;
- } else /* uds_fd_table[minor].type == SOCK_DGRAM */ {
- peer = -1;
-
- /* Locate the "peer" we want to write to. */
- for (i = 0; i < NR_FDS; i++) {
+ if (addr_len >= offsetof(struct sockaddr, sa_data) &&
+ addr->sa_family == AF_UNSPEC) {
/*
- * Look for a SOCK_DGRAM socket that is bound on
- * the target address.
+ * Reset this socket's previous connection to another
+ * socket, if any. Unconnecting has no effect on other
+ * sockets connected to this socket, though.
*/
- if (uds_fd_table[i].type == SOCK_DGRAM &&
- uds_fd_table[i].stale == FALSE &&
- uds_fd_table[i].addr.sun_family == AF_UNIX &&
- !strncmp(uds_fd_table[minor].target.sun_path,
- uds_fd_table[i].addr.sun_path,
- sizeof(uds_fd_table[i].addr.sun_path))) {
- peer = i;
- break;
- }
- }
+ if (uds_has_link(uds))
+ uds_del_queue(uds->uds_link, uds);
- if (peer == -1)
- return ENOENT;
+ return OK;
+ }
}
- /* Check if we write to a closed pipe. */
- if (!(uds_fd_table[peer].mode & UDS_R))
- return EPIPE;
-
/*
- * We have to preserve the boundary for DGRAM. If there's already a
- * packet waiting, discard it silently and pretend it was written.
+ * Find the socket identified by the given address. If it exists at
+ * all, see if it is a proper match.
*/
- if (uds_fd_table[minor].type == SOCK_DGRAM &&
- uds_fd_table[peer].size > 0)
- return size;
+ if ((r = uds_lookup(uds, addr, addr_len, user_endpt, &link)) != OK)
+ return r;
/*
- * Check if the ring buffer is already full, and if the SEQPACKET
- * message wouldn't write to an empty buffer.
+ * Handle connectionless sockets first, in which case a connect links
+ * the socket to a send target and limits receipt to datagrams from
+ * that target. We actually point the socket to the peer socket,
+ * through uds_link. That also means that if the target socket
+ * disappears, we have to reset any sockets connected to it, in which
+ * case we return them to the unconnected state. In order to allow
+ * finding all sockets connected to a particular socket, we put all
+ * those sockets on their target's queue, hence why we use uds_link and
+ * not uds_conn. As mentioned before, we allow reconnecting without
+ * restrictions.
+ * TODO: see if reconnecting should clear a pending ECONNRESET.
+ *
+ * An important note: 'uds' and 'link' may actually be the same socket,
+ * if the caller chooses to connect a socket with itself!
*/
- if (uds_fd_table[peer].size == UDS_BUF ||
- (uds_fd_table[minor].type == SOCK_SEQPACKET &&
- uds_fd_table[peer].size > 0)) {
- if (pretend)
- return EDONTREPLY;
+ if (uds_get_type(uds) == SOCK_DGRAM) {
+ /* Reconnecting to the same socket has no effect. */
+ if (uds_has_link(uds) && uds->uds_link == link)
+ return OK;
- if (uds_fd_table[peer].suspended == UDS_SUSPENDED_READ)
- panic("reader blocked on full socket");
+ /*
+ * If the intended target is linked to another socket, we
+ * refuse linking to it. Sending or receiving would never work
+ * anyway. Do allow a socket to link to itself after being
+ * linked to another socket. The error code is the same as in
+ * the sending code, borrowed from Linux.
+ */
+ if (uds != link && uds_has_link(link) && link->uds_link != uds)
+ return EPERM;
- dprintf(("UDS: suspending write request on %d\n", minor));
+ /*
+ * Reset this socket's previous link to another socket, if any.
+ */
+ if (uds_has_link(uds))
+ uds_del_queue(uds->uds_link, uds);
- /* Process is reading from an empty pipe. Suspend it. */
- return EDONTREPLY;
- }
+ /*
+ * Reset any links to this socket, except for the one by
+ * the intended target. Sending or receiving would no longer
+ * work anyway. If the socket was linked to itself, clear its
+ * self-link without generating an ECONNRESET. If the socket
+ * is relinking to itself, reestablish the link after first
+ * clearing it.
+ */
+ uds_clear_queue(uds, (uds != link) ? link : NULL);
- /* How much can we add to the ring buffer? */
- if (size > UDS_BUF - uds_fd_table[peer].size)
- size = UDS_BUF - uds_fd_table[peer].size;
+ uds_add_queue(link, uds);
- if (pretend)
- return size;
+ return OK;
+ }
- /* Put the data at the head of the ring buffer. */
- pos = (uds_fd_table[peer].pos + uds_fd_table[peer].size) % UDS_BUF;
+ /*
+ * For connection-oriented sockets there is more to do. First, make
+ * sure that the peer is a listening socket, that it has not been shut
+ * down, and that its backlog is not already at the configured maximum.
+ */
+ if (!uds_is_listening(link))
+ return ECONNREFUSED;
- subsize = UDS_BUF - pos;
- if (subsize > size)
- subsize = size;
+ if (uds_is_shutdown(link, SFL_SHUT_RD | SFL_SHUT_WR))
+ return ECONNREFUSED;
- if ((r = sys_safecopyfrom(endpt, grant, 0,
- (vir_bytes) &uds_fd_table[peer].buf[pos], subsize)) != OK)
- return r;
+ if (link->uds_queued >= link->uds_backlog)
+ return ECONNREFUSED;
- if (subsize < size) {
- if ((r = sys_safecopyfrom(endpt, grant, subsize,
- (vir_bytes) uds_fd_table[peer].buf, size - subsize)) != OK)
+ /*
+ * The behavior of connect(2) now depends on whether LOCAL_CONNWAIT is
+ * set on either the connecting or the listening socket. If it is not,
+ * the socket will be connected to a new as-yet invisible socket, which
+ * will be the one returned from accept(2) later. If it was, the
+ * socket will be put in the connecting state.
+ */
+ if (!((uds->uds_flags | link->uds_flags) & UDSF_CONNWAIT)) {
+ if ((r = uds_attach(link, uds)) != OK)
return r;
- }
- /* Advance the buffer head. */
- uds_fd_table[peer].size += size;
-
- /* Fill in the source address to be returned by recvfrom, recvmsg. */
- if (uds_fd_table[minor].type == SOCK_DGRAM)
- memcpy(&uds_fd_table[peer].source, &uds_fd_table[minor].addr,
- sizeof(struct sockaddr_un));
-
- /* See if we can wake up a blocked reader. */
- if (uds_fd_table[peer].suspended == UDS_SUSPENDED_READ)
- uds_unsuspend(peer);
-
- /* See if we can satisfy an ongoing select. */
- if ((uds_fd_table[peer].sel_ops & CDEV_OP_RD) &&
- uds_fd_table[peer].size > 0) {
- /* A read on the peer is possible now. */
- chardriver_reply_select(uds_fd_table[peer].sel_endpt, peer,
- CDEV_OP_RD);
- uds_fd_table[peer].sel_ops &= ~CDEV_OP_RD;
+ assert(uds_is_connected(uds));
+ } else {
+ /*
+ * Disconnected sockets now stop being connected. Any pending
+ * data can still be received, though.
+ */
+ uds->uds_flags &= ~UDSF_CONNECTED;
+
+ r = SUSPEND;
}
- return size; /* number of bytes written */
+ /* Obtain credentials for the socket. */
+ uds_get_cred(uds, user_endpt);
+
+ /* Add the socket at the end of the listening socket's queue. */
+ uds_add_queue(link, uds);
+
+ assert(r != SUSPEND || uds_is_connecting(uds));
+
+ /*
+ * Let an accept call handle the rest, which will in turn resume this
+ * connect call. The sockevent library ensures that this works even if
+ * the call is non-blocking.
+ */
+ sockevent_raise(&link->uds_sock, SEV_ACCEPT);
+
+ return r;
}
-static ssize_t
-uds_read(devminor_t minor, u64_t position, endpoint_t endpt,
- cp_grant_id_t grant, size_t size, int flags, cdev_id_t id)
+/*
+ * Put a socket in listening mode.
+ */
+static int
+uds_listen(struct sock * sock, int backlog)
{
- ssize_t rc;
+ struct udssock *uds = (struct udssock *)sock;
+
+ /* The maximum backlog value must not exceed its field size. */
+ assert(SOMAXCONN <= USHRT_MAX);
- dprintf(("UDS: uds_read(%d)\n", minor));
+ dprintf(("UDS: listen(%d)\n", uds_get_id(uds)));
- if (minor < 0 || minor >= NR_FDS) return ENXIO;
+ /* Only connection-oriented types may be put in listening mode. */
+ if (uds_get_type(uds) == SOCK_DGRAM)
+ return EOPNOTSUPP;
- if (uds_fd_table[minor].state != UDS_INUSE)
+ /* A connecting or connected socket may not listen. */
+ if (uds_is_connecting(uds) || uds_is_connected(uds))
return EINVAL;
- rc = uds_perform_read(minor, endpt, grant, size, 0);
+ /* POSIX says that this is now the appropriate error code here. */
+ if (!uds_is_bound(uds))
+ return EDESTADDRREQ;
+
+ /*
+ * The socket is now entering the listening state. If it was
+ * previously disconnected, clear the connection flag.
+ */
+ uds->uds_flags &= ~UDSF_CONNECTED;
+
+ /*
+ * We do not remove sockets from the backlog if it is now being dropped
+ * below the current number of queued sockets. We only refuse newly
+ * connecting sockets beyond the backlog size.
+ */
+ uds->uds_backlog = backlog;
- /* If the call couldn't complete, suspend the caller. */
- if (rc == EDONTREPLY) {
- uds_fd_table[minor].suspended = UDS_SUSPENDED_READ;
- uds_fd_table[minor].susp_endpt = endpt;
- uds_fd_table[minor].susp_grant = grant;
- uds_fd_table[minor].susp_size = size;
- uds_fd_table[minor].susp_id = id;
+ return OK;
+}
+
+/*
+ * Test whether an accept request would block. Return OK if a socket could be
+ * accepted, an appropriate error code if an accept call would fail instantly,
+ * or SUSPEND if the accept request would block waiting for a connection.
+ */
+static int
+uds_test_accept(struct sock * sock)
+{
+ struct udssock *uds = (struct udssock *)sock;
- /* If the call wasn't supposed to block, cancel immediately. */
- if (flags & CDEV_NONBLOCK) {
- uds_cancel(minor, endpt, id);
+ /*
+ * Ensure that the socket is in listening mode. If not, we must return
+ * the error code that is appropriate for this socket type.
+ */
+ if (uds_get_type(uds) == SOCK_DGRAM)
+ return EOPNOTSUPP;
+ if (!uds_is_listening(uds))
+ return EINVAL;
- rc = EAGAIN;
- }
+ /*
+ * If the socket has been shut down, new connections are no longer
+ * accepted and accept calls no longer block. This is not a POSIX
+ * requirement, but rather an application convenience feature.
+ */
+ if (uds->uds_queued == 0) {
+ if (uds_is_shutdown(uds, SFL_SHUT_RD | SFL_SHUT_WR))
+ return ECONNABORTED;
+
+ return SUSPEND;
}
- return rc;
+ return OK;
}
-static ssize_t
-uds_write(devminor_t minor, u64_t position, endpoint_t endpt,
- cp_grant_id_t grant, size_t size, int flags, cdev_id_t id)
+/*
+ * Accept a connection on a listening socket, creating a new socket. On
+ * success, return the new socket identifier, with the new socket stored in
+ * 'newsockp'. Otherwise, return an error code.
+ */
+static sockid_t
+uds_accept(struct sock * sock, struct sockaddr * addr, socklen_t * addr_len,
+ endpoint_t user_endpt __unused, struct sock ** newsockp)
{
- ssize_t rc;
+ struct udssock *uds = (struct udssock *)sock;
+ struct udssock *link, *conn;
+ sockid_t r;
- dprintf(("UDS: uds_write(%d)\n", minor));
+ dprintf(("UDS: accept(%d)\n", uds_get_id(uds)));
- if (minor < 0 || minor >= NR_FDS) return ENXIO;
+ if ((r = uds_test_accept(sock)) != OK)
+ return r;
- if (uds_fd_table[minor].state != UDS_INUSE)
- return EINVAL;
+ /*
+ * Take the first connecting socket off the listening queue.
+ */
+ assert(!TAILQ_EMPTY(&uds->uds_queue));
- rc = uds_perform_write(minor, endpt, grant, size, 0);
+ link = TAILQ_FIRST(&uds->uds_queue);
- /* If the call couldn't complete, suspend the caller. */
- if (rc == EDONTREPLY) {
- uds_fd_table[minor].suspended = UDS_SUSPENDED_WRITE;
- uds_fd_table[minor].susp_endpt = endpt;
- uds_fd_table[minor].susp_grant = grant;
- uds_fd_table[minor].susp_size = size;
- uds_fd_table[minor].susp_id = id;
+ /*
+ * Depending on the LOCAL_CONNWAIT setting at the time of connect(2),
+ * the socket may be connecting or connected. In the latter case, its
+ * attached socket is the socket we will return now. Otherwise we have
+ * to attach a socket first.
+ */
+ assert(uds_is_connecting(link) || uds_is_connected(link));
- /* If the call wasn't supposed to block, cancel immediately. */
- if (flags & CDEV_NONBLOCK) {
- uds_cancel(minor, endpt, id);
+ if (uds_is_connecting(link)) {
+ /*
+ * Attach a new socket. If this fails, return the error but
+ * leave the connecting socket on the listening queue.
+ */
+ if ((r = uds_attach(uds, link)) != OK)
+ return r;
- rc = EAGAIN;
- }
+ assert(uds_is_connected(link));
+
+ /*
+ * Wake up blocked (connect, send, select) calls on the peer
+ * socket.
+ */
+ sockevent_raise(&link->uds_sock, SEV_CONNECT);
}
- return rc;
+ uds_del_queue(uds, link);
+
+ /* Return the peer socket's address to the caller. */
+ uds_make_addr(link->uds_path, link->uds_pathlen, addr, addr_len);
+
+ conn = link->uds_conn;
+
+ dprintf(("UDS: accept returns %d\n", uds_get_id(conn)));
+
+ /*
+ * We already cloned the sock object, so return its ID but not a
+ * pointer to it. That tells libsockevent not to reinitialize it.
+ */
+ *newsockp = NULL;
+ return uds_get_id(conn);
}
+/*
+ * Set socket options.
+ */
static int
-uds_ioctl(devminor_t minor, unsigned long request, endpoint_t endpt,
- cp_grant_id_t grant, int flags, endpoint_t user_endpt, cdev_id_t id)
+uds_setsockopt(struct sock * sock, int level, int name,
+ const struct sockdriver_data * data, socklen_t len)
{
- int rc, s;
+ struct udssock *uds = (struct udssock *)sock;
+ int r, val;
- dprintf(("UDS: uds_ioctl(%d, %lu)\n", minor, request));
+ dprintf(("UDS: setsockopt(%d,%d,%d)\n", uds_get_id(uds), level, name));
- if (minor < 0 || minor >= NR_FDS) return ENXIO;
+ switch (level) {
+ case SOL_SOCKET:
+ switch (name) {
+ case SO_SNDBUF:
+ case SO_RCVBUF:
+ /*
+ * The send buffer size may not be changed because the
+ * buffer is the same as the other side's receive
+ * buffer, and what the other side is may vary from
+ * send call to send call. Changing the receive buffer
+ * size would disallow us from even accurately guessing
+ * the send buffer size in getsockopt calls. Therefore
+ * both are hardcoded and cannot actually be changed.
+ * In order to support applications that want at least
+ * a certain minimum, we do accept requests to shrink
+ * either buffer, but we ignore the given size.
+ */
+ if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
+ len)) != OK)
+ return r;
- if (uds_fd_table[minor].state != UDS_INUSE)
- return EINVAL;
+ if (val <= 0 || (size_t)val > uds_io_buflen())
+ return EINVAL;
+
+ return OK; /* ignore new value */
+ }
+
+ break;
+
+ case UDSPROTO_UDS:
+ switch (name) {
+ case LOCAL_CREDS:
+ if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
+ len)) != OK)
+ return r;
+
+ if (val)
+ uds->uds_flags |= UDSF_PASSCRED;
+ else
+ uds->uds_flags &= ~UDSF_PASSCRED;
+
+ /*
+ * In incredibly rare cases, disabling this flag may
+ * allow blocked sends to be resumed, because suddenly
+ * no room for the credentials is needed in the receive
+ * buffer anymore.
+ */
+ if (!val)
+ sockevent_raise(&uds->uds_sock, SEV_SEND);
- /* Update the owner endpoint. */
- uds_fd_table[minor].owner = user_endpt;
-
- /* Let the UDS ioctl subsystem handle the actual request. */
- rc = uds_do_ioctl(minor, request, endpt, grant);
-
- /* If the call couldn't complete, suspend the caller. */
- if (rc == EDONTREPLY) {
- /* The suspension type is already set by the IOCTL handler. */
- if ((s = uds_fd_table[minor].suspended) == UDS_NOT_SUSPENDED)
- panic("IOCTL did not actually suspend?");
- uds_fd_table[minor].susp_endpt = endpt;
- uds_fd_table[minor].susp_grant = grant;
- uds_fd_table[minor].susp_size = 0; /* irrelevant */
- uds_fd_table[minor].susp_id = id;
-
- /* If the call wasn't supposed to block, cancel immediately. */
- if (flags & CDEV_NONBLOCK) {
- uds_cancel(minor, endpt, id);
- if (s == UDS_SUSPENDED_CONNECT)
- rc = EINPROGRESS;
+ return OK;
+
+ case LOCAL_CONNWAIT:
+ if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
+ len)) != OK)
+ return r;
+
+ if (val)
+ uds->uds_flags |= UDSF_CONNWAIT;
else
- rc = EAGAIN;
+ uds->uds_flags &= ~UDSF_CONNWAIT;
+
+ /*
+ * Changing the setting does not affect sockets that
+ * are currently pending to be accepted. Therefore,
+ * uds_accept() may have to deal with either case on a
+ * socket-by-socket basis.
+ */
+ return OK;
+
+ case LOCAL_PEEREID:
+ /* This option may be retrieved but not set. */
+ return ENOPROTOOPT;
}
+
+ break;
}
- return rc;
+ return ENOPROTOOPT;
}
-void
-uds_unsuspend(devminor_t minor)
+/*
+ * Retrieve socket options.
+ */
+static int
+uds_getsockopt(struct sock * sock, int level, int name,
+ const struct sockdriver_data * data, socklen_t * len)
{
- int r;
- uds_fd_t *fdp;
+ struct udssock *uds = (struct udssock *)sock;
+ int val;
- fdp = &uds_fd_table[minor];
+ dprintf(("UDS: getsockopt(%d,%d,%d)\n", uds_get_id(uds), level, name));
- switch (fdp->suspended) {
- case UDS_SUSPENDED_READ:
- r = uds_perform_read(minor, fdp->susp_endpt, fdp->susp_grant,
- fdp->susp_size, 0);
+ switch (level) {
+ case SOL_SOCKET:
+ switch (name) {
+ case SO_SNDBUF:
+ case SO_RCVBUF:
+ /* See uds_setsockopt() for why this is static. */
+ val = (int)uds_io_buflen();
- if (r == EDONTREPLY)
- return;
+ return sockdriver_copyout_opt(data, &val, sizeof(val),
+ len);
+ }
break;
- case UDS_SUSPENDED_WRITE:
- r = uds_perform_write(minor, fdp->susp_endpt, fdp->susp_grant,
- fdp->susp_size, 0);
+ case UDSPROTO_UDS:
+ switch (name) {
+ case LOCAL_CREDS:
+ val = !!(uds->uds_flags & UDSF_PASSCRED);
- if (r == EDONTREPLY)
- return;
+ return sockdriver_copyout_opt(data, &val, sizeof(val),
+ len);
- break;
+ case LOCAL_CONNWAIT:
+ val = !!(uds->uds_flags & UDSF_CONNWAIT);
- case UDS_SUSPENDED_CONNECT:
- case UDS_SUSPENDED_ACCEPT:
- /*
- * In both cases, the caller already set up the connection.
- * The only thing to do here is unblock.
- */
- r = fdp->err;
- fdp->err = 0;
+ return sockdriver_copyout_opt(data, &val, sizeof(val),
+ len);
- break;
+ case LOCAL_PEEREID:
+ /* getpeereid(3) documents these error codes. */
+ if (uds_get_type(uds) == SOCK_DGRAM)
+ return EINVAL;
+ if (!uds_is_connected(uds))
+ return ENOTCONN;
- default:
- panic("unknown suspension type %d", fdp->suspended);
+ /*
+ * This is a custom MINIX3 error, indicating that there
+ * are no credentials to return. This could be due to
+ * a failure to obtain them (which *should* not happen)
+ * but also if the socket was bound while connected,
+ * disconnected, and then reused as listening socket.
+ */
+ if (uds->uds_conn->uds_cred.unp_pid == -1)
+ return EINVAL;
+
+ return sockdriver_copyout_opt(data,
+ &uds->uds_conn->uds_cred,
+ sizeof(uds->uds_conn->uds_cred), len);
+ }
+
+ break;
}
- chardriver_reply_task(fdp->susp_endpt, fdp->susp_id, r);
+ return ENOPROTOOPT;
+}
- fdp->suspended = UDS_NOT_SUSPENDED;
+/*
+ * Retrieve a socket's local address.
+ */
+static int
+uds_getsockname(struct sock * sock, struct sockaddr * addr,
+ socklen_t * addr_len)
+{
+ struct udssock *uds = (struct udssock *)sock;
+
+ dprintf(("UDS: getsockname(%d)\n", uds_get_id(uds)));
+
+ uds_make_addr(uds->uds_path, uds->uds_pathlen, addr, addr_len);
+
+ return OK;
}
+/*
+ * Retrieve a socket's remote address.
+ */
static int
-uds_cancel(devminor_t minor, endpoint_t endpt, cdev_id_t id)
+uds_getpeername(struct sock * sock, struct sockaddr * addr,
+ socklen_t * addr_len)
{
- uds_fd_t *fdp;
- int i;
+ struct udssock *uds = (struct udssock *)sock;
+ struct udssock *peer;
- dprintf(("UDS: uds_cancel(%d)\n", minor));
+ dprintf(("UDS: getpeername(%d)\n", uds_get_id(uds)));
- if (minor < 0 || minor >= NR_FDS) return EDONTREPLY;
+ /*
+ * For disconnected sockets, we no longer have a peer socket and thus
+ * also no peer address. Too bad, but NetBSD does the same.
+ *
+ * For connecting sockets we could in fact return a peer address, but
+ * POSIX says (and other platforms agree) that we should deny the call.
+ */
+ peer = uds_get_peer(uds);
- fdp = &uds_fd_table[minor];
+ if (peer == NULL || uds_is_connecting(uds))
+ return ENOTCONN;
- if (fdp->state != UDS_INUSE) {
- printf("UDS: cancel request for closed minor %d\n", minor);
- return EDONTREPLY;
- }
+ uds_make_addr(peer->uds_path, peer->uds_pathlen, addr, addr_len);
- /* Make sure the cancel request is for a request we're hanging on. */
- if (fdp->suspended == UDS_NOT_SUSPENDED || fdp->susp_endpt != endpt ||
- fdp->susp_id != id)
- return EDONTREPLY; /* this happens. */
+ return OK;
+}
+
+/*
+ * Shut down socket send and receive operations. Note that 'flags' is a
+ * bitwise mask with libsockevent's SFL_SHUT_{RD,WR} flags rather than the set
+ * of SHUT_{RD,WR,RDWR} values from userland.
+ */
+static int
+uds_shutdown(struct sock * sock, unsigned int flags)
+{
+ struct udssock *uds = (struct udssock *)sock;
+ struct udssock *conn;
+ unsigned int mask;
+
+ dprintf(("UDS: shutdown(%d,0x%x)\n", uds_get_id(uds), flags));
/*
- * The system call was cancelled, so the socket is not suspended
- * anymore.
+ * If we are shutting down the socket for reading, we can already close
+ * any in-flight file descriptors associated with this socket.
*/
- switch (fdp->suspended) {
- case UDS_SUSPENDED_ACCEPT:
- /* A partial accept() only sets the server's child. */
- for (i = 0; i < NR_FDS; i++)
- if (uds_fd_table[i].child == minor)
- uds_fd_table[i].child = -1;
+ if (flags & SFL_SHUT_RD)
+ uds_io_reset(uds);
- break;
+ /*
+ * A shutdown on this side of a connection may have an effect on
+ * ongoing operations on the other side. Fire appropriate events.
+ */
+ if (uds_is_connected(uds)) {
+ assert(uds_get_type(uds) != SOCK_DGRAM);
- case UDS_SUSPENDED_CONNECT:
- /* Connect requests should continue asynchronously. */
- break;
+ conn = uds->uds_conn;
- case UDS_SUSPENDED_READ:
- case UDS_SUSPENDED_WRITE:
- /* Nothing more to do. */
- break;
+ mask = 0;
+ if (flags & SFL_SHUT_RD)
+ mask |= SEV_SEND;
+ if (flags & SFL_SHUT_WR)
+ mask |= SEV_RECV;
- default:
- panic("unknown suspension type %d", fdp->suspended);
+ sockevent_raise(&conn->uds_sock, mask);
}
- fdp->suspended = UDS_NOT_SUSPENDED;
+ return OK;
+}
+
+/*
+ * Close a socket.
+ *
+ * The 'force' flag is unused because we need never wait for data to be sent,
+ * since we keep all in-flight data on the receiver side.
+ */
+static int
+uds_close(struct sock * sock, int force __unused)
+{
+ struct udssock *uds = (struct udssock *)sock;
+
+ dprintf(("UDS: close(%d)\n", uds_get_id(uds)));
+
+ if (uds_get_type(uds) == SOCK_DGRAM) {
+ /* If this socket is linked to a target, disconnect it. */
+ if (uds_has_link(uds))
+ uds_del_queue(uds->uds_link, uds);
- return EINTR; /* reply to the original request */
+ /* Reset all sockets linked to this socket as a target. */
+ uds_clear_queue(uds, NULL);
+ } else if (uds_is_listening(uds)) {
+ /*
+ * Abort all connecting sockets queued on this socket, and
+ * break all connections for connected sockets queued on this
+ * socket, freeing their peers.
+ */
+ uds_clear_queue(uds, NULL);
+ } else if (uds_has_link(uds)) {
+ /*
+ * This socket is connecting or connected while the other side
+ * has not been accepted yet. Remove the socket from the
+ * listening socket's queue, and if it was connected, get rid
+ * of its peer socket altogether.
+ */
+ assert(uds_is_listening(uds->uds_link));
+
+ uds_del_queue(uds->uds_link, uds);
+
+ if (uds_is_connected(uds))
+ uds_disconnect(uds, TRUE /*was_linked*/);
+ } else if (uds_is_connected(uds)) {
+ /*
+ * Decouple the peer socket from this socket, and possibly wake
+ * up any pending operations on it. The socket remains marked
+ * as connected, but will now be disconnected.
+ */
+ uds_disconnect(uds, FALSE /*was_linked*/);
+ }
+
+ if (uds_is_hashed(uds))
+ udshash_del(uds);
+
+ return OK;
}
+static const struct sockevent_ops uds_ops = {
+ .sop_pair = uds_pair,
+ .sop_bind = uds_bind,
+ .sop_connect = uds_connect,
+ .sop_listen = uds_listen,
+ .sop_accept = uds_accept,
+ .sop_test_accept = uds_test_accept,
+ .sop_pre_send = uds_pre_send,
+ .sop_send = uds_send,
+ .sop_test_send = uds_test_send,
+ .sop_pre_recv = uds_pre_recv,
+ .sop_recv = uds_recv,
+ .sop_test_recv = uds_test_recv,
+ .sop_setsockopt = uds_setsockopt,
+ .sop_getsockopt = uds_getsockopt,
+ .sop_getsockname = uds_getsockname,
+ .sop_getpeername = uds_getpeername,
+ .sop_shutdown = uds_shutdown,
+ .sop_close = uds_close,
+ .sop_free = uds_free
+};
+
/*
- * Initialize the server.
+ * Initialize the service.
*/
static int
-uds_init(int UNUSED(type), sef_init_info_t *UNUSED(info))
+uds_init(int type __unused, sef_init_info_t * info __unused)
{
- /* Setting everything to NULL implicitly sets the state to UDS_FREE. */
- memset(uds_fd_table, '\0', sizeof(uds_fd_t) * NR_FDS);
+ unsigned int i;
+
+ /* Initialize the list of free sockets. */
+ TAILQ_INIT(&uds_freelist);
- uds_exit_left = 0;
+ for (i = 0; i < __arraycount(uds_array); i++) {
+ uds_array[i].uds_flags = 0;
- /* Announce we are up! */
- chardriver_announce();
+ TAILQ_INSERT_TAIL(&uds_freelist, &uds_array[i], uds_next);
+ }
- return(OK);
+ /* Initialize the file-to-socket hash table. */
+ udshash_init();
+
+ /* Initialize the input/output module. */
+ uds_io_init();
+
+ /* Initialize the status module. */
+ uds_stat_init();
+
+ /* Initialize the sockevent library. */
+ sockevent_init(uds_socket);
+
+ uds_in_use = 0;
+ uds_running = TRUE;
+
+ return OK;
+}
+
+/*
+ * Clean up before shutdown.
+ */
+static void
+uds_cleanup(void)
+{
+
+ /* Tell the status module to clean up. */
+ uds_stat_cleanup();
}
+/*
+ * The service has received a signal.
+ */
static void
uds_signal(int signo)
{
- int i;
- /* Only check for termination signal, ignore anything else. */
- if (signo != SIGTERM) return;
+ /* Only check for the termination signal. Ignore anything else. */
+ if (signo != SIGTERM)
+ return;
- /* Only exit once all sockets have been closed. */
- uds_exit_left = 0;
- for (i = 0; i < NR_FDS; i++)
- if (uds_fd_table[i].state == UDS_INUSE)
- uds_exit_left++;
+ /* Exit only once all sockets have been closed. */
+ uds_running = FALSE;
- if (uds_exit_left == 0)
- chardriver_terminate();
+ if (uds_in_use == 0)
+ sef_cancel();
}
+/*
+ * Perform initialization using the System Event Framework (SEF).
+ */
static void
uds_startup(void)
{
- /* Register init callbacks. */
+
+ /* Register initialization callbacks. */
sef_setcb_init_fresh(uds_init);
- /* Register signal callbacks. */
+ /* Register signal callback. */
sef_setcb_signal_handler(uds_signal);
/* Let SEF perform startup. */
}
/*
- * The UNIX domain sockets driver.
+ * The UNIX Domain Sockets driver.
*/
int
main(void)
{
+ message m;
+ int r, ipc_status;
+
+ /* Initialize the service. */
uds_startup();
- chardriver_task(&uds_tab);
+ /* Loop receiving and processing messages until instructed to stop. */
+ while (uds_running || uds_in_use > 0) {
+ if ((r = sef_receive_status(ANY, &m, &ipc_status)) != OK) {
+ if (r == EINTR)
+ continue; /* sef_cancel() was called */
+
+ panic("UDS: sef_receive_status failed: %d", r);
+ }
+
+ /*
+ * Messages from the MIB service are (ultimately) for the
+ * status module. Everything else is assumed to be a socket
+ * request and passed to libsockevent, which will ignore
+ * anything it does not recognize.
+ */
+ if (m.m_source == MIB_PROC_NR)
+ rmib_process(&m, ipc_status);
+ else
+ sockevent_process(&m, ipc_status);
+ }
+
+ /* Clean up before graceful shutdown. */
+ uds_cleanup();
- return(OK);
+ return EXIT_SUCCESS;
}
--- /dev/null
+service uds
+{
+ domain LOCAL;
+ system KILL; # for SIGPIPE
+ uid 0; # for socketpath(2) and copyfd(2)
+ ipc
+ SYSTEM vfs rs vm mib
+ ;
+};
-#ifndef __UDS_UDS_H
-#define __UDS_UDS_H
+#ifndef MINIX_NET_UDS_UDS_H
+#define MINIX_NET_UDS_UDS_H
#include <minix/drivers.h>
-#include <minix/chardriver.h>
-#undef send
-#include <sys/socket.h>
-#include <sys/ioctl.h>
-#include <sys/ucred.h>
+#include <minix/sockevent.h>
+#include <minix/rmib.h>
#include <sys/un.h>
-#include <sys/mman.h>
-/* Maximum number of UNIX domain sockets. */
-#define NR_FDS 256
+/*
+ * Maximum number of UNIX domain sockets. The control structures for all of
+ * these are allocated statically, although each socket's receive buffer is
+ * allocated only when the socket is in use. If this constant is increased
+ * beyond 65535, a few field sizes need to be changed.
+ */
+#define NR_UDSSOCK 256
+
+/* Number of slots in the <dev,ino>-to-udssock hash table. */
+#define UDSHASH_SLOTS 64
+
+/* UDS has no protocols, so we accept only an "any protocol" value. */
+#define UDSPROTO_UDS 0
+
+/*
+ * The size of each socket's receive buffer. This size is currently a global
+ * setting which cannot be changed per socket at run time, and it would be
+ * rather tricky to change that. In order not to waste resources, this size
+ * should be a multiple of the page size. Due to the fact that data and
+ * metadata (such as lengths, source addresses and sender credentials) are
+ * intermixed in the same buffer, the actual amount of data that can be in
+ * transit at once is typically less than this value. If this constant is
+ * increased beyond 65535, several fields and field sizes need to be changed.
+ */
+#define UDS_BUF 32768
-/* Connection backlog size for incoming connections. */
-#define UDS_SOMAXCONN 64
+/* Maximum size of control data that can be sent or received at once. */
+#define UDS_CTL_MAX 4096
-/* Maximum UDS socket buffer size. */
-#define UDS_BUF PIPE_BUF
+/*
+ * We allow longer path names than the size of struct sockaddr_un's sun_path
+ * field. The actual limit is determined by the maximum value of the sun_len
+ * field, which is 255 and includes the first two fields of the structure (one
+ * byte each) but not the null terminator of the path. Thus, the maximum
+ * length of the path minus null terminator is 253; with terminator it is 254.
+ */
+#define UDS_PATH_MAX (UINT8_MAX - sizeof(uint8_t) - sizeof(sa_family_t) + 1)
/* Output debugging information? */
#define DEBUG 0
#endif
/*
- * A light version of the "uucred" credentials structure. We basically do not
- * support passing around groups lists, and by not using struct uucred as
- * storage, we save memory for those groups lists as well. Note that the
- * original Linux uucred structure has a 'cr_pid' field as well, but this is
- * unsupported in NetBSD's version of the structure (and rightly so).
+ * We declare this structure only for the static assert right below it. We
+ * have no need for the structure otherwise, as we use "struct sockaddr"
+ * directly instead.
*/
-struct luucred {
- uid_t uid;
- gid_t gid;
+struct sockaddr_unx {
+ uint8_t sunx_len;
+ sa_family_t sunx_family;
+ char sunx_path[UDS_PATH_MAX];
};
-
-/* ancillary data to be sent */
-struct ancillary {
- int fds[OPEN_MAX];
- int nfiledes;
- struct luucred cred;
-};
-
-#define UDS_R 0x1
-#define UDS_W 0x2
+STATIC_SOCKADDR_MAX_ASSERT(sockaddr_unx);
/*
- * Internal State Information for a socket descriptor.
+ * In-flight file descriptor object. Each in-use object is part of a socket's
+ * file descriptor queue, and the file descriptor is for a file open by this
+ * service. For each set of in-flight file descriptors associated with a
+ * particular segment, the first object's count field contains the number of
+ * file descriptors in that set. For all other objects in that set, the count
+ * field is zero. TODO: the count should be stored in the segment itself.
*/
struct uds_fd {
+ SIMPLEQ_ENTRY(uds_fd) ufd_next; /* next FD object for this socket */
+ int ufd_fd; /* local file descriptor number */
+ unsigned int ufd_count; /* number of FDs for this segment */
+};
-/* Flags */
-
- enum UDS_STATE {
- /* This file descriptor is UDS_FREE and can be allocated. */
- UDS_FREE = 0,
-
- /* OR it is UDS_INUSE and can't be allocated. */
- UDS_INUSE = 1
-
- /* state is set to UDS_INUSE in uds_open(). state is Set to
- * UDS_FREE in uds_init() and uds_close(). state should be
- * checked prior to all operations.
- */
- } state;
-
-/* Owner Info */
-
- /* Socket Owner */
- endpoint_t owner;
-
-/* Pipe Housekeeping */
-
- char *buf; /* ring buffer */
- size_t pos; /* tail position into ring buffer */
- size_t size; /* size of used part of ring buffer */
-
- /* control read/write, set by uds_open() and shutdown(2).
- * Can be set to UDS_R|UDS_W, UDS_R, UDS_W, or 0
- * for read and write, read only, write only, or neither.
- * default is UDS_R|UDS_W.
- */
- int mode;
-
-/* Socket Info */
-
- /* socket type - SOCK_STREAM, SOCK_DGRAM, or SOCK_SEQPACKET
- * Set by uds_ioctl(NWIOSUDSTYPE). It defaults to -1 in
- * uds_open(). Any action on a socket with type -1 besides
- * uds_ioctl(NWIOSUDSTYPE) and uds_close() will result in
- * an error.
- */
- int type;
-
- /* queue of pending connections for server sockets.
- * connect(2) inserts and accept(2) removes from the queue
- */
- int backlog[UDS_SOMAXCONN];
-
- /* requested connection backlog size. Set by listen(2)
- * Bounds (0 <= backlog_size <= UDS_SOMAXCONN)
- * Defaults to UDS_SOMAXCONN which is defined above.
- */
- unsigned char backlog_size;
-
- /* index of peer in uds_fd_table for connected sockets.
- * -1 is used to mean no peer. Assumptions: peer != -1 means
- * connected.
- */
- int peer;
-
- /* index of child (client sd returned by accept(2))
- * -1 is used to mean no child.
- */
- int child;
-
- /* address -- the address the socket is bound to.
- * Assumptions: addr.sun_family == AF_UNIX means its bound.
- */
- struct sockaddr_un addr;
-
- /* target -- where DGRAMs are sent to on the next uds_write(). */
- struct sockaddr_un target;
-
- /* source -- address where DGRAMs are from. used to fill in the
- * from address in recvfrom(2) and recvmsg(2).
- */
- struct sockaddr_un source;
-
- /* Flag (TRUE or FALSE) - address overridden by newer socket.
- * Default to FALSE. Set to TRUE by do_bind() on another socket with
- * the same path but its on-disk socket file removed in the meantime.
- */
- int stale;
-
- /* Flag (TRUE or FALSE) - listening for incoming connections.
- * Default to FALSE. Set to TRUE by do_listen().
- */
- int listening;
-
- /* stores file pointers and credentials being sent between
- * processes with sendmsg(2) and recvmsg(2).
- */
- struct ancillary ancillary_data;
-
- /* Holds an errno. This is set when a connected socket is
- * closed and we need to pass ECONNRESET on to a suspended
- * peer.
- */
- int err;
-
-/* Suspend/Revive Housekeeping */
-
- /* SUSPEND State Flags */
- enum UDS_SUSPENDED {
-
- /* Socket isn't blocked. */
- UDS_NOT_SUSPENDED = 0,
-
- /* Socket is blocked on read(2) waiting for data to read. */
- UDS_SUSPENDED_READ = 1,
-
- /* Socket is blocked on write(2) for space to write data. */
- UDS_SUSPENDED_WRITE = 2,
-
- /* Socket is blocked on connect(2) waiting for the server. */
- UDS_SUSPENDED_CONNECT = 4,
-
- /* Socket is blocked on accept(2) waiting for clients. */
- UDS_SUSPENDED_ACCEPT = 8
- } suspended;
-
- /* source endpoint, saved for later use by suspended procs */
- endpoint_t susp_endpt;
-
- /* i/o grant, saved for later use by suspended procs */
- cp_grant_id_t susp_grant;
-
- /* size of request, saved for later use by suspended procs */
- size_t susp_size;
-
- /* request ID, saved for later use by suspended procs */
- cdev_id_t susp_id;
+/*
+ * Connection-type sockets (SOCK_STREAM, SOCK_SEQPACKET) are always in one of
+ * the following five states, each with unique characteristics:
+ *
+ * - Unconnected: this socket is not in any of the other states, usually
+ * because it either has just been created, or because it has failed a
+ * connection attempt. This socket has no connected peer and does not have
+ * the SO_ACCEPTCONN socket option set.
+ * - Listening: this socket is in listening mode. It has a queue with sockets
+ * that are connecting or connected to it but have not yet been accepted on
+ * it. This socket has no connected peer. It has the SO_ACCEPTCONN socket
+ * option set.
+ * - Connecting: this socket is on a listening socket's queue. While in this
+ * state, the socket has the listening socket as its linked peer, and it has
+ * no connected peer.
+ * - Connected: this socket is connected to another socket, which is its
+ * connected peer socket. It has the UDSF_CONNECTED flag set. A socket may
+ * be connected and still be involved with a listening socket; see below.
+ * - Disconnected: this socket was connected to another socket, but that other
+ * socket has been closed. As a result, this socket has no peer. It does
+ * have the UDSF_CONNECTED flag set.
+ *
+ * The UDS service supports two different type of connect behaviors, depending
+ * on what the LOCAL_CONNWAIT option is set to on either the connecting or the
+ * listening socket. If LOCAL_CONNWAIT is not set on either (the default), the
+ * connecting socket socket (let's call it "A") enters the connected state
+ * right away, even if the connection is not immediately accepted through
+ * accept(2). In that case, a new limbo socket "B" is allocated as its
+ * connection peer. Limbo socket B is also in connected state, and either
+ * returned from accept(2) later, or freed when socket A leaves the connected
+ * state. Socket A can leave the connected state either by being closed or
+ * when the listening socket is closed. If LOCAL_CONNWAIT is set, socket A
+ * stays in the connecting state until it is accepted through accept(2).
+ * Importantly, in both cases, it is socket A, and (in the first case) *not*
+ * socket B, that is on the queue of the listening socket!
+ *
+ * Connected peers (uds_conn) are always symmetric: if one socket is connected
+ * to another socket, that other socket is connected to it. Any socket that is
+ * on the queue of another socket, is said to be "linked" to that other socket
+ * (uds_link). This is an asymmetric, one-to-many relationship: many sockets
+ * may be linked to one other socket, which keeps all those sockets on its
+ * queue. From the above story it should now be clear that for connection-type
+ * sockets, only listening sockets may have sockets on its queue, and while
+ * connecting sockets are always on a listening socket's queue, connected
+ * sockets may or may not be. Sockets in other states never are.
+ *
+ * UNIX domain sockets are generally reusable. This means that the listening
+ * state is the only final state; all other socket states allow the socket to
+ * enter another state, although not necessarily every other state. For
+ * example, a disconnected socket may be reconnected to another target; if that
+ * connection fails, the socket will enter the unconnected state. As a result,
+ * a socket in any state (even the listening state) may still have incoming
+ * data pending from a previous connection. However, EOF is currently produced
+ * only for disconnected sockets. To be sure: connecting and connected sockets
+ * must first enter the unconnected or disconnected state, respectively, before
+ * possibly being reconnected.
+ *
+ * For connectionless (i.e., SOCK_DGRAM) sockets, there are no separate states.
+ * However, a connectionless socket may have been connected to another socket.
+ * We maintain these links not with uds_conn but with uds_link, because such
+ * connections are not symmetric, and there is an interest in keeping track of
+ * which datagram sockets are connected to a particular socket (namely, to
+ * break the connection on close without doing an exhaustive search). For that
+ * reason, when a datagram socket connects to another socket, it is linked to
+ * that other socket, and the other socket has this socket on its queue. As a
+ * strange corner case, a connectionless socket may be connected to itself, in
+ * which case it is its own linked peer and it is also on its own queue. For
+ * datagram sockets, uds_conn is always NULL and UDSF_CONNECTED is never set.
+ *
+ * For the purposes of sending and receiving, we generally refer to the
+ * communication partner of a socket as its "peer". As should now be clear,
+ * for connection-type sockets, the socket's peer is identified with uds_conn;
+ * for connectionless sockets, the socket's peer is identified with uds_link.
+ */
+struct udssock {
+ struct sock uds_sock; /* sock object */
+ struct udssock *uds_conn; /* connected socket, or NULL if none */
+ struct udssock *uds_link; /* linked socket, or NULL if none */
+ unsigned char *uds_buf; /* receive buffer (memory-mapped) */
+ unsigned short uds_tail; /* tail of data in receive buffer */
+ unsigned short uds_len; /* length of data in receive buffer */
+ unsigned short uds_last; /* offset to last header in buffer */
+ unsigned short uds_queued; /* current nr of sockets on queue */
+ unsigned short uds_backlog; /* maximum nr of connecting sockets */
+ unsigned char uds_flags; /* UDS-specific flags (UDSF_) */
+ unsigned char uds_pathlen; /* socket file path length (w/o nul) */
+ char uds_path[UDS_PATH_MAX - 1];/* socket file path (not terminated) */
+ dev_t uds_dev; /* socket file device number */
+ ino_t uds_ino; /* socket file inode number */
+ struct unpcbid uds_cred; /* bind/connect-time credentials */
+ SLIST_ENTRY(udssock) uds_hash; /* next in hash chain */
+ TAILQ_ENTRY(udssock) uds_next; /* next in free list or queue */
+ SIMPLEQ_HEAD(, uds_fd) uds_fds; /* in-flight file descriptors */
+ TAILQ_HEAD(, udssock) uds_queue;/* queue of linked sockets */
+};
-/* select() */
-
- /* when a select is in progress, we notify this endpoint
- * of new data.
- */
- endpoint_t sel_endpt;
+#define UDSF_IN_USE 0x01 /* in use (for enumeration only) */
+#define UDSF_CONNECTED 0x02 /* connected or disconnected */
+#define UDSF_CONNWAIT 0x04 /* leave connecting until accept */
+#define UDSF_PASSCRED 0x08 /* pass credentials when receiving */
- /* Options (CDEV_OP_RD,WR,ERR) that are requested. */
- unsigned int sel_ops;
-};
+/* Macros. */
+#define uds_get_type(uds) sockevent_get_type(&(uds)->uds_sock)
-typedef struct uds_fd uds_fd_t;
+/*
+ * A socket that can be found through hash table lookups always has a non-empty
+ * path as well as a valid <dev,ino> pair identifying the socket file that is,
+ * or once was, identified by that path. However, a socket that is bound, even
+ * though it will still have an associated path, is not necessarily hashed.
+ * The reason for the difference is <dev,ino> pair reuse. This case is
+ * elaborated on in uds_bind().
+ */
+#define uds_is_bound(uds) ((uds)->uds_pathlen != 0)
+#define uds_is_hashed(uds) ((uds)->uds_dev != NO_DEV)
-/* File Descriptor Table -- Defined in uds.c */
-EXTERN uds_fd_t uds_fd_table[NR_FDS];
+/*
+ * These macros may be used on all socket types. However, the uds_is_connected
+ * macro returns TRUE only for connection-oriented sockets. To see if a
+ * datagram socket is connected to a target, use uds_has_link instead.
+ */
+#define uds_has_conn(uds) ((uds)->uds_conn != NULL)
+#define uds_has_link(uds) ((uds)->uds_link != NULL)
+#define uds_get_peer(uds) \
+ ((uds_get_type(uds) != SOCK_DGRAM) ? (uds)->uds_conn : (uds)->uds_link)
+#define uds_is_listening(uds) sockevent_is_listening(&(uds)->uds_sock)
+#define uds_is_connecting(uds) \
+ (uds_has_link(uds) && !((uds)->uds_flags & UDSF_CONNECTED) && \
+ uds_get_type(uds) != SOCK_DGRAM)
+#define uds_is_connected(uds) \
+ (((uds)->uds_flags & UDSF_CONNECTED) && uds_has_conn(uds))
+#define uds_is_disconnected(uds) \
+ (((uds)->uds_flags & UDSF_CONNECTED) && !uds_has_conn(uds))
+
+#define uds_is_shutdown(uds, mask) \
+ sockevent_is_shutdown(&(uds)->uds_sock, (mask))
/* Function prototypes. */
-/* ioc_uds.c */
-int uds_clear_fds(devminor_t minor, struct ancillary *data);
-int uds_do_ioctl(devminor_t minor, unsigned long request, endpoint_t endpt,
- cp_grant_id_t grant);
-
/* uds.c */
-ssize_t uds_perform_read(devminor_t minor, endpoint_t endpt,
- cp_grant_id_t grant, size_t size, int pretend);
-void uds_unsuspend(devminor_t minor);
-
-#endif /* !__UDS_UDS_H */
+sockid_t uds_get_id(struct udssock * uds);
+struct udssock *uds_enum(struct udssock * prev, int type);
+void uds_make_addr(const char * path, size_t len, struct sockaddr * addr,
+ socklen_t * addr_len);
+int uds_lookup(struct udssock * uds, const struct sockaddr * addr,
+ socklen_t addr_len, endpoint_t user_endpt, struct udssock ** peerp);
+
+/* io.c */
+void uds_io_init(void);
+int uds_io_setup(struct udssock * uds);
+void uds_io_cleanup(struct udssock * uds);
+void uds_io_reset(struct udssock * uds);
+size_t uds_io_buflen(void);
+int uds_pre_send(struct sock * sock, size_t len, socklen_t ctl_len,
+ const struct sockaddr * addr, socklen_t addr_len,
+ endpoint_t user_endpt, int flags);
+int uds_send(struct sock * sock, const struct sockdriver_data * data,
+ size_t len, size_t * off, const struct sockdriver_data * ctl,
+ socklen_t ctl_len, socklen_t * ctl_off, const struct sockaddr * addr,
+ socklen_t addr_len, endpoint_t user_endpt, int flags, size_t min);
+int uds_test_send(struct sock * sock, size_t min);
+int uds_pre_recv(struct sock * sock, endpoint_t user_endpt, int flags);
+int uds_recv(struct sock * sock, const struct sockdriver_data * data,
+ size_t len, size_t * off, const struct sockdriver_data * ctl,
+ socklen_t ctl_len, socklen_t * ctl_off, struct sockaddr * addr,
+ socklen_t * addr_len, endpoint_t user_endpt, int flags, size_t min,
+ int * rflags);
+int uds_test_recv(struct sock * sock, size_t min, size_t * size);
+
+/* stat.c */
+void uds_stat_init(void);
+void uds_stat_cleanup(void);
+
+#endif /* !MINIX_NET_UDS_UDS_H */
.in +5
.ti -5
int socket(int \fIdomain\fP, int \fItype\fP, int \fIprotocol\fP);
+.br
.ti -5
int socketpair(int \fIdomain\fP, int \fItype\fP, int \fIprotocol\fP, int \fIsv[2]\fP);
.br
Local sockets, more commonly known as Unix Domain Sockets, provide a
means of interprocess communication using the socket API.
.SH SEE ALSO
-.BR socket(2),
-.BR socketpair(2),
-.BR getpeereid(2),
-.BR uds(8)
+.BR socket(2) ,
+.BR socketpair(2) ,
+.BR getpeereid(3)
.SH HISTORY
-This Unix Domain Sockets first appeared in Minix 3.1.8.
+This Unix Domain Sockets first appeared in MINIX 3.1.8.
{
/* Copy a file descriptor between processes, or close a remote file descriptor.
* This call is used as back-call by device drivers (UDS, VND), and is expected
- * to be used in response to an IOCTL to such device drivers.
+ * to be used in response to either an IOCTL to VND or a SEND or RECV socket
+ * request to UDS.
*/
struct fproc *rfp;
struct filp *rfilp;
rfp = &fproc[slot];
/* FIXME: we should now check that the user process is indeed blocked on an
- * IOCTL call, so that we can safely mess with its file descriptors. We
- * currently do not have the necessary state to verify this, so we assume
- * that the call is always used in the right way.
+ * IOCTL or socket call, so that we can safely mess with its file
+ * descriptors. We currently do not have the necessary state to verify this,
+ * so we assume that the call is always used in the right way.
*/
/* Depending on the operation, get the file descriptor from the caller or the
* passes in the file descriptor to the device node on which it is performing
* the IOCTL. We do not allow manipulation of such device nodes. In
* practice, this only applies to block-special files (and thus VND), because
- * character-special files (as used by UDS) are unlocked during the IOCTL.
+ * socket files (as used by UDS) are unlocked during the socket operation.
*/
if (rfilp->filp_ioctl_fp == rfp)
return(EBADF);
resolve.l_vnode_lock = VNODE_WRITE;
/* Only the super_user may make nodes other than fifos. */
- if (!super_user && (!S_ISFIFO(mode_bits) && !S_ISSOCK(mode_bits))) {
+ if (!super_user && !S_ISFIFO(mode_bits))
return(EPERM);
- }
+
bits = (mode_bits & S_IFMT) | (mode_bits & ACCESSPERMS & fp->fp_umask);
/* Open directory that's going to hold the new node. */
#include <minix/vfsif.h>
#include <sys/param.h>
#include <sys/stat.h>
-#include <sys/un.h>
#include <sys/dirent.h>
#include "vmnt.h"
#include "vnode.h"
struct fproc *rfp;
char path[PATH_MAX];
struct lookup resolve, resolve2;
- struct sockaddr_un sun;
mode_t bits;
/* This should be replaced by an ACL check. */
what = job_m_in.m_lsys_vfs_socketpath.what;
if (isokendpt(ep, &slot) != OK) return(EINVAL);
- if (pathlen < sizeof(sun.sun_path) || pathlen >= PATH_MAX) return(EINVAL);
+ rfp = &fproc[slot];
- rfp = &(fproc[slot]);
+ /* Copy in the path name, which must not be empty. It is typically not null
+ * terminated.
+ */
+ if (pathlen < 1 || pathlen >= sizeof(path)) return(EINVAL);
r = sys_safecopyfrom(who_e, io_gr, (vir_bytes)0, (vir_bytes)path, pathlen);
if (r != OK) return(r);
path[pathlen] = '\0';
- /* If requested, turn path into canonical path to the socket file */
- if (what & SPATH_CANONIZE) {
- if ((r = canonical_path(path, rfp)) != OK) return(r);
- if (strlen(path) >= pathlen) return(ENAMETOOLONG);
-
- /* copy path back to the caller */
- r = sys_safecopyto(who_e, (cp_grant_id_t)io_gr, (vir_bytes)0,
- (vir_bytes)path, pathlen);
- if (r != OK) return(r);
- }
-
/* Now perform the requested action. For the SPATH_CHECK action, a socket
* file is expected to exist already, and we should check whether the given
* user process has access to it. For the SPATH_CREATE action, no file is
* Since the above canonicalization releases all locks once done, we need to
* recheck absolutely everything now. TODO: do not release locks in between.
*/
- switch (what & ~SPATH_CANONIZE) {
+ switch (what) {
case SPATH_CHECK:
lookup_init(&resolve, path, PATH_NOFLAGS, &vmp, &vp);
resolve.l_vmnt_lock = VMNT_READ;
void test_fail_fl(char *msg, char *file, int line)
{
char *timestamp;
+ int e;
+ e = errno;
timestamp = get_timestamp();
if (errct == 0) fprintf(stderr, "\n");
+ errno = e;
fprintf(stderr, "[ERROR][%s] (%s Line %d) %s [pid=%d:errno=%d:%s]\n",
- timestamp, file, line, msg, getpid(),
- errno, strerror(errno));
+ timestamp, file, line, msg, getpid(), errno, strerror(errno));
fflush(stderr);
if (timestamp != NULL) {
free(timestamp);
timestamp = NULL;
}
+ errno = e;
e(7);
}
SOCKET(sd, info->domain, info->type, 0);
errno = 0;
rc = shutdown(sd, how[i]);
- if (!(rc == -1 && errno == ENOTCONN) &&
+ if (rc != 0 && !(rc == -1 && errno == ENOTCONN) &&
!info->bug_shutdown_not_conn &&
!info->bug_shutdown) {
test_fail("shutdown() should have failed");
SOCKET(sd, info->domain, info->type, 0);
errno = 0;
rc = shutdown(sd, -1);
- if (!(rc == -1 && errno == ENOTCONN) &&
+ if (!(rc == -1 && errno == EINVAL) &&
!info->bug_shutdown_not_conn &&
!info->bug_shutdown) {
- test_fail("shutdown(sd, -1) should have failed with ENOTCONN");
+ test_fail("shutdown(sd, -1) should have failed with EINVAL");
}
CLOSE(sd);
CLOSE(sd);
}
-
-
SOCKET(sd, info->domain, info->type, 0);
debug("Test setsockopt() works");
test_fail("[client] getpeername() should have worked");
}
- /* we need to use the full path "/usr/src/test/DIR_56/test.sock"
- * because that is what is returned by getpeername().
- */
info->callback_check_sockaddr((struct sockaddr *) &peer_addr,
peer_addr_len, "getpeername", 1);
if (!info->ignore_write_conn_reset) {
test_fail("write should have failed\n");
}
- } else if (errno != ECONNRESET) {
- test_fail("errno should've been ECONNRESET\n");
+ } else if (errno != EPIPE && errno != ECONNRESET) {
+ test_fail("errno should've been EPIPE/ECONNRESET\n");
}
}
if (abort_type == 1) {
memset(buf, '\0', BUFSIZE);
rc = read(client_sd, buf, BUFSIZE);
- if (rc != -1 && (rc != 0 || !info->ignore_read_conn_reset)) {
+ if (rc != 0 && rc != -1) {
test_fail("read should've failed or returned zero\n");
}
if (rc != 0 && errno != ECONNRESET) {
test_fail("recvmsg");
}
- /* we need to use the full path "/usr/src/test/DIR_56/testb.sock"
- * because that is what is returned by recvmsg().
- */
info->callback_check_sockaddr((struct sockaddr *) &addr,
msg2.msg_namelen, "recvmsg", 2);
if (bind(server_sd, info->serveraddr, info->serveraddrlen) == -1)
test_fail("bind() should have worked");
+ if (info->callback_set_listen_opt != NULL)
+ info->callback_set_listen_opt(server_sd);
+
if (listen(server_sd, 8) == -1)
test_fail("listen() should have worked");
if (bind(server_sd, info->serveraddr, info->serveraddrlen) == -1)
test_fail("bind() should have worked");
+ if (info->callback_set_listen_opt != NULL)
+ info->callback_set_listen_opt(server_sd);
+
if (listen(server_sd, 8) == -1)
test_fail("listen() should have worked");
errct = 0;
close(client_sd);
+ /* Ensure that the parent is blocked on the send(). */
+ sleep(1);
+
check_select(server_sd, 1 /*read*/, 1 /*write*/, 0 /*block*/);
len = sizeof(addr);
if (bind(server_sd, info->serveraddr, info->serveraddrlen) == -1)
test_fail("bind() should have worked");
+ if (info->callback_set_listen_opt != NULL)
+ info->callback_set_listen_opt(server_sd);
+
if (listen(server_sd, 8) == -1)
test_fail("listen() should have worked");
if (bind(server_sd, info->serveraddr, info->serveraddrlen) == -1)
test_fail("bind() should have worked");
+ if (info->callback_set_listen_opt != NULL)
+ info->callback_set_listen_opt(server_sd);
+
if (listen(server_sd, 8) == -1)
test_fail("listen() should have worked");
byte = 0;
if (write(client_sd, &byte, 1) != -1 || errno != ENOTCONN)
- /* Yes, you fucked up the fix for the FIXME below. */
test_fail("write() should have yielded ENOTCONN");
if (connect(client_sd, info->clientaddr, info->clientaddrlen) != -1) {
}
/*
- * FIXME: currently UDS cannot distinguish between sockets that have
- * not yet been connected, and sockets that have been disconnected.
- * Thus, we get the same error for both: ENOTCONN instead of EPIPE.
+ * The error we get on the next write() depends on whether the socket
+ * may be reused after a failed connect: for TCP/IP, it may not, so we
+ * get EPIPE; for UDS, it may be reused, so we get ENOTCONN.
*/
-#if 0
- if (write(client_sd, &byte, 1) != -1 || errno != EPIPE)
- test_fail("write() should have yielded EPIPE");
-#endif
+ if (!info->bug_connect_after_close) {
+ if (write(client_sd, &byte, 1) != -1 ||
+ (errno != EPIPE && errno != ENOTCONN))
+ test_fail("write() should have yielded "
+ "EPIPE/ENOTCONN");
+ }
close(client_sd);
if (bind(server_sd, info->serveraddr, info->serveraddrlen) == -1)
test_fail("bind() should have worked");
+ if (info->callback_set_listen_opt != NULL)
+ info->callback_set_listen_opt(server_sd);
+
if (listen(server_sd, 8) == -1)
test_fail("listen() should have worked");
test_fail("write() should have yielded ECONNRESET");
}
- /*
- * FIXME: currently UDS cannot distinguish between sockets that have
- * not yet been connected, and sockets that have been disconnected.
- * Thus, we get the same error for both: ENOTCONN instead of EPIPE.
- */
-#if 0
- if (write(client_sd, &byte, 1) != -1 || errno != EPIPE)
- test_fail("write() should have yielded EPIPE");
-#endif
-
check_select_cond(client_sd, 1 /*read*/, 1 /*write*/, 0 /*block*/,
!info->ignore_select_delay);
int ignore_accept_delay; /* success from accept after aborted connect */
int ignore_connect_delay; /* nb connect not instant */
int ignore_connect_unaccepted; /* connect succeeds without accept */
- int ignore_read_conn_reset; /* read does not guarantee ECONNRESET */
int ignore_select_delay; /* select delay reflecting other side nb op */
int ignore_send_waiting; /* can send while waiting for nb recv */
int ignore_write_conn_reset; /* write does not guarantee ECONNRESET */
void (* callback_cleanup)(void);
void (* callback_xfer_peercred)(int sd); /* can be NULL */
void (* callback_xfer_prepclient)(void); /* can be NULL */
+ void (* callback_set_listen_opt)(int sd); /* can be NULL */
};
void test_abort_client_server(const struct socket_test_info *info,
/* socket types supported */
static int types[3] = {SOCK_STREAM, SOCK_SEQPACKET, SOCK_DGRAM};
-static char sock_fullpath[PATH_MAX + 1];
-
-/* Convert name to the full path of the socket. Assumes name is in cwd. */
-static char *fullpath(const char *name)
-{
- char cwd[PATH_MAX + 1];
-
- if (realpath(".", cwd) == NULL)
- test_fail("Couldn't retrieve current working dir");
-
- snprintf(sock_fullpath, PATH_MAX, "%s/%s", cwd, name);
-
- return(sock_fullpath);
-}
static void test_header(void)
{
static void test_ucred(void)
{
- struct uucred credentials;
+ struct unpcbid credentials;
socklen_t ucred_length;
uid_t euid = geteuid();
gid_t egid = getegid();
int sv[2];
int rc;
- debug("Test credentials passing");
+ debug("Test peer credentials");
- ucred_length = sizeof(struct uucred);
+ ucred_length = sizeof(credentials);
rc = socketpair(PF_UNIX, SOCK_STREAM, 0, sv);
if (rc == -1) {
}
memset(&credentials, '\0', ucred_length);
- rc = getsockopt(sv[0], SOL_SOCKET, SO_PEERCRED, &credentials,
+ rc = getsockopt(sv[0], 0, LOCAL_PEEREID, &credentials,
&ucred_length);
if (rc == -1) {
- test_fail("getsockopt(SO_PEERCRED) failed");
- } else if (credentials.cr_ngroups != 0 ||
- credentials.cr_uid != geteuid() ||
- credentials.cr_gid != getegid()) {
- /* printf("%d=%d %d=%d %d=%d",credentials.cr_ngroups, 0,
- credentials.cr_uid, geteuid(), credentials.cr_gid, getegid()); */
+ test_fail("getsockopt(LOCAL_PEEREID) failed");
+ } else if (credentials.unp_pid != getpid() ||
+ credentials.unp_euid != geteuid() ||
+ credentials.unp_egid != getegid()) {
+ printf("%d=%d %d=%d %d=%d",credentials.unp_pid, getpid(),
+ credentials.unp_euid, geteuid(),
+ credentials.unp_egid, getegid());
test_fail("Credential passing gave us the wrong cred");
}
rc = getpeereid(sv[0], &euid, &egid);
if (rc == -1) {
test_fail("getpeereid(sv[0], &euid, &egid) failed");
- } else if (credentials.cr_uid != euid || credentials.cr_gid != egid) {
+ } else if (credentials.unp_euid != euid ||
+ credentials.unp_egid != egid) {
test_fail("getpeereid() didn't give the correct euid/egid");
}
if (!(sockaddr_un->sun_family == AF_UNIX &&
strncmp(sockaddr_un->sun_path,
- fullpath(path),
+ path,
sizeof(sockaddr_un->sun_path) - 1) == 0)) {
snprintf(buf, sizeof(buf), "%s() didn't return the right addr",
UNLINK(TEST_SYM_A);
UNLINK(TEST_SYM_B);
- SYMLINK(TEST_SYM_A, TEST_SYM_B);
SYMLINK(TEST_SYM_B, TEST_SYM_A);
SOCKET(sd, PF_UNIX, SOCK_STREAM, 0);
strncpy(addr.sun_path, TEST_SYM_A, sizeof(addr.sun_path) - 1);
errno = 0;
rc = bind(sd, (struct sockaddr *) &addr, sizeof(struct sockaddr_un));
+ if (!((rc == -1) && (errno == EADDRINUSE))) {
+ test_fail("bind() should have failed with EADDRINUSE");
+ }
+ CLOSE(sd);
+
+ SYMLINK(TEST_SYM_A, TEST_SYM_B);
+
+ SOCKET(sd, PF_UNIX, SOCK_STREAM, 0);
+
+ strncpy(addr.sun_path, TEST_SYM_A, sizeof(addr.sun_path) - 1);
+ strlcat(addr.sun_path, "/x", sizeof(addr.sun_path));
+ errno = 0;
+ rc = bind(sd, (struct sockaddr *) &addr, sizeof(struct sockaddr_un));
if (!((rc == -1) && (errno == ELOOP))) {
test_fail("bind() should have failed with ELOOP");
}
}
static void callback_xfer_peercred(int sd) {
- struct uucred credentials;
+ struct unpcbid credentials;
int rc;
socklen_t ucred_length;
- ucred_length = sizeof(struct uucred);
+ ucred_length = sizeof(credentials);
- debug("Test passing the client credentials to the server");
+ debug("Test obtaining the peer credentials");
memset(&credentials, '\0', ucred_length);
- rc = getsockopt(sd, SOL_SOCKET, SO_PEERCRED, &credentials,
- &ucred_length);
+ rc = getsockopt(sd, 0, LOCAL_PEEREID, &credentials, &ucred_length);
if (rc == -1) {
test_fail("[client] getsockopt() failed");
- } else if (credentials.cr_uid != geteuid() ||
- credentials.cr_gid != getegid()) {
- printf("%d=%d=%d %d=%d=%d\n", credentials.cr_uid, getuid(),
- geteuid(), credentials.cr_gid, getgid(), getegid());
+ } else if (credentials.unp_euid != geteuid() ||
+ credentials.unp_egid != getegid()) {
+ printf("%d=* %d=%d %d=%d", credentials.unp_pid,
+ credentials.unp_euid, geteuid(),
+ credentials.unp_egid, getegid());
test_fail("[client] Credential passing gave us a bad UID/GID");
}
}
+static void
+callback_set_listen_opt(int sd)
+{
+ int val;
+
+ /*
+ * Several of the tests assume that a new connection to a server will
+ * not be established (i.e., go from "connecting" to "connected" state)
+ * until the server actually accepts the connection with an accept(2)
+ * call. With the new UDS implementation, this is no longer true: to
+ * match the behavior of other systems, UDS now preemptively connects
+ * the socket in anticipation of the accept(2) call. We can change
+ * back to the old behavior by setting LOCAL_CONNWAIT however, and
+ * since the test effectively tests a larger set of socket transitions
+ * that way, that is what we do for these tests.
+ */
+ val = 1;
+ if (setsockopt(sd, 0, LOCAL_CONNWAIT, &val, sizeof(val)) != 0)
+ test_fail("setsockopt(LOCAL_CONNWAIT)");
+}
+
static void test_vectorio(int type)
{
int sv[2];
int rc;
int src;
int dst;
- struct uucred cred;
+ int one;
+ union {
+ struct sockcred cred;
+ char buf[SOCKCREDSIZE(NGROUPS_MAX)];
+ } cred;
struct cmsghdr *cmsg = NULL;
struct sockaddr_un addr;
struct iovec iov[3];
char buf2[BUFSIZE];
char buf3[BUFSIZE];
char ctrl[BUFSIZE];
- socklen_t addrlen = sizeof(struct sockaddr_un);
+ socklen_t len, addrlen = sizeof(struct sockaddr_un);
debug("test_scm_credentials");
test_fail("bind");
}
+ debug("request credential passing");
+
+ one = 1;
+ rc = setsockopt(dst, 0, LOCAL_CREDS, &one, sizeof(one));
+ if (rc == -1) {
+ test_fail("setsockopt(LOCAL_CREDS)");
+ }
+
+ debug("sending msg1");
+
memset(&buf1, '\0', BUFSIZE);
memset(&buf2, '\0', BUFSIZE);
memset(&buf3, '\0', BUFSIZE);
msg1.msg_controllen = 0;
msg1.msg_flags = 0;
- debug("sending msg1");
-
rc = sendmsg(src, &msg1, 0);
if (rc == -1) {
test_fail("sendmsg");
* because that is what is returned by recvmsg().
*/
if (addr.sun_family != AF_UNIX || strcmp(addr.sun_path,
- fullpath(TEST_SUN_PATHB))) {
+ TEST_SUN_PATHB)) {
test_fail("recvmsg");
}
debug("looking for credentials");
- memset(&cred, '\0', sizeof(struct uucred));
+ len = 0;
+
+ memset(&cred, 'x', sizeof(cred));
for (cmsg = CMSG_FIRSTHDR(&msg2); cmsg != NULL;
cmsg = CMSG_NXTHDR(&msg2, cmsg)) {
if (cmsg->cmsg_level == SOL_SOCKET &&
cmsg->cmsg_type == SCM_CREDS) {
-
- memcpy(&cred, CMSG_DATA(cmsg), sizeof(struct uucred));
+ /* Great, this alignment business! But then at least
+ * give me a macro to compute the actual data length..
+ */
+ len = cmsg->cmsg_len - (socklen_t)
+ ((char *)CMSG_DATA(cmsg) - (char *)cmsg);
+
+ if (len < sizeof(struct sockcred))
+ test_fail("credentials too small");
+ else if (len > sizeof(cred))
+ test_fail("credentials too large");
+ memcpy(cred.buf, CMSG_DATA(cmsg), len);
break;
}
}
- if (cred.cr_ngroups != 0 || cred.cr_uid != geteuid() ||
- cred.cr_gid != getegid()) {
+ if (len == 0)
+ test_fail("no credentials found");
+
+ if (len != SOCKCREDSIZE(cred.cred.sc_ngroups))
+ test_fail("wrong credentials size");
+ /*
+ * TODO: check supplementary groups. This whole test is pretty much
+ * pointless since we're running with very standard credentials anyway.
+ */
+ if (cred.cred.sc_uid != getuid() ||
+ cred.cred.sc_euid != geteuid() ||
+ cred.cred.sc_gid != getgid() ||
+ cred.cred.sc_egid != getegid() ||
+ cred.cred.sc_ngroups < 0 || cred.cred.sc_ngroups > NGROUPS_MAX) {
test_fail("did no receive the proper credentials");
}
* Test various aspects related to the socket files on the file system.
* This subtest is woefully incomplete and currently only attempts to test
* aspects that have recently been affected by code changes. In the future,
- * there should be tests for path canonicalization and the entire range of file
- * system path and access related error codes (TODO).
+ * there should be tests for the entire range of file system path and access
+ * related error codes (TODO).
*/
static void
test_file(void)
{
- struct sockaddr_un addr;
-#if NOT_YET
- struct sockaddr_un saddr, saddr2;
+ struct sockaddr_un addr, saddr, saddr2;
char buf[1];
socklen_t len;
struct stat st;
mode_t omask;
- int, csd, fd;
-#endif
- int sd, sd2;
+ int sd, sd2, csd, fd;
/*
* If the provided socket path exists on the file system, the bind(2)
CLOSE(sd);
-#if NOT_YET
if (bind(sd2, (struct sockaddr *)&addr, sizeof(addr)) != -1)
test_fail("Binding socket unexpectedly succeeded");
if (errno != EADDRINUSE)
if (memcmp(&saddr, &saddr2, sizeof(saddr)))
test_fail("Unexpected old socket address");
- /*
- * Currently, our implementation "hides" the old socket even if the new
- * socket is closed, but since this is not standard behavior and may be
- * changed later, we do not test for it. However, in any case,
- * rebinding the hidden socket should make it "visible" again.
- */
- strlcpy(saddr2.sun_path, TEST_SUN_PATHB, sizeof(saddr2.sun_path));
- if (bind(sd, (struct sockaddr *)&saddr2, sizeof(saddr2)) != 0)
- test_fail("Can't rebind socket");
-
- memset(buf, 'Z', sizeof(buf));
- if (sendto(csd, buf, sizeof(buf), 0, (struct sockaddr *)&saddr2,
- sizeof(saddr2)) != sizeof(buf))
- test_fail("Can't send to socket");
- if (recvfrom(sd, buf, sizeof(buf), 0, NULL, 0) != sizeof(buf))
- test_fail("Can't receive from socket");
- if (buf[0] != 'Z')
- test_fail("Transmission failure");
-
if (unlink(TEST_SUN_PATH) != 0)
test_fail("Can't unlink socket");
- if (unlink(TEST_SUN_PATHB) != 0)
- test_fail("Can't unlink other socket");
CLOSE(sd);
CLOSE(sd2);
UNLINK(TEST_SUN_PATH);
umask(omask);
-#endif
/*
* Only socket(2), socketpair(2), and accept(2) may be used to obtain
.clientaddrsym = (struct sockaddr *) &clientaddrsym,
.clientaddrsymlen = sizeof(clientaddrsym),
.domain = PF_UNIX,
- .expected_rcvbuf = PIPE_BUF,
- .expected_sndbuf = PIPE_BUF,
+ .expected_rcvbuf = 32768 - 5, /* no constants: */
+ .expected_sndbuf = 32768 - 5, /* UDS internals */
.serveraddr = (struct sockaddr *) &clientaddr,
.serveraddrlen = sizeof(clientaddr),
.serveraddr2 = (struct sockaddr *) &clientaddr2,
.callback_cleanup = callback_cleanup,
.callback_xfer_prepclient = callback_xfer_prepclient,
.callback_xfer_peercred = callback_xfer_peercred,
+ .callback_set_listen_opt = callback_set_listen_opt,
};
debug("entering main()");
start(56);
+ /* This test was written before UDS started supporting SIGPIPE. */
+ signal(SIGPIPE, SIG_IGN);
+
test_socket(&info);
test_bind(&info);
test_bind_unix();
.ignore_accept_delay = 1,
.ignore_connect_unaccepted = 1,
.ignore_connect_delay = 1,
- .ignore_read_conn_reset = 1,
.ignore_select_delay = 1,
.ignore_send_waiting = 1,
.ignore_write_conn_reset = 1,
.ignore_accept_delay = 1,
.ignore_connect_unaccepted = 1,
.ignore_connect_delay = 1,
- .ignore_read_conn_reset = 1,
.ignore_select_delay = 1,
.ignore_send_waiting = 1,
.ignore_write_conn_reset = 1,
FLAG(NWUO_DI_IPOPT),
};
+static void
+put_struct_uucred(struct trace_proc * proc, const char * name, int flags,
+ vir_bytes addr)
+{
+ struct uucred cred;
+
+ if (!put_open_struct(proc, name, flags, addr, &cred, sizeof(cred)))
+ return;
+
+ put_value(proc, "cr_uid", "%u", cred.cr_uid);
+ if (verbose > 0) {
+ put_value(proc, "cr_gid", "%u", cred.cr_gid);
+ if (verbose > 1)
+ put_value(proc, "cr_ngroups", "%d", cred.cr_ngroups);
+ put_groups(proc, "cr_groups", PF_LOCADDR,
+ (vir_bytes)&cred.cr_groups, cred.cr_ngroups);
+ }
+
+ put_close_struct(proc, verbose > 0);
+}
+
static void
put_msg_control(struct trace_proc * proc, struct msg_control * ptr)
{
void put_in_addr(struct trace_proc *proc, const char *name, struct in_addr in);
void put_socket_type(struct trace_proc *proc, const char *name, int type);
void put_socket_family(struct trace_proc *proc, const char *name, int family);
-void put_struct_uucred(struct trace_proc *proc, const char *name, int flags,
- vir_bytes addr);
void put_cmsg_type(struct trace_proc *proc, const char *name, int type);
void put_shutdown_how(struct trace_proc *proc, const char *name, int how);
put_close(proc, "]");
}
-void
-put_struct_uucred(struct trace_proc * proc, const char * name, int flags,
- vir_bytes addr)
+static void
+put_struct_sockcred(struct trace_proc * proc, const char * name, int flags,
+ vir_bytes addr, size_t left)
{
- struct uucred cred;
+ struct sockcred sc;
- if (!put_open_struct(proc, name, flags, addr, &cred, sizeof(cred)))
+ if (!put_open_struct(proc, name, flags, addr, &sc, sizeof(sc)))
return;
- put_value(proc, "cr_uid", "%u", cred.cr_uid);
+ put_value(proc, "sc_uid", "%u", sc.sc_uid);
+ if (verbose > 0)
+ put_value(proc, "sc_euid", "%u", sc.sc_euid);
+ put_value(proc, "sc_gid", "%u", sc.sc_gid);
if (verbose > 0) {
- put_value(proc, "cr_gid", "%u", cred.cr_gid);
+ put_value(proc, "sc_egid", "%u", sc.sc_egid);
if (verbose > 1)
- put_value(proc, "cr_ngroups", "%d", cred.cr_ngroups);
- put_groups(proc, "cr_groups", PF_LOCADDR,
- (vir_bytes)&cred.cr_groups, cred.cr_ngroups);
+ put_value(proc, "sc_ngroups", "%d", sc.sc_ngroups);
+ if (left >= sizeof(sc.sc_groups[0]) * (sc.sc_ngroups - 1)) {
+ put_groups(proc, "sc_groups", flags,
+ addr + offsetof(struct sockcred, sc_groups),
+ sc.sc_ngroups);
+ } else
+ put_field(proc, "sc_groups", "..");
}
- put_close_struct(proc, verbose > 0);
+ put_close_struct(proc, verbose > 1);
}
static void
size_t len)
{
struct cmsghdr cmsg;
- char buf[CMSG_SPACE(sizeof(struct uucred))];
+ char buf[CMSG_SPACE(sizeof(struct sockcred))];
size_t off, chunk, datalen;
if (valuesonly > 1 || addr == 0 || len < CMSG_LEN(0)) {
addr + off + chunk, datalen);
} else if (cmsg.cmsg_level == SOL_SOCKET &&
cmsg.cmsg_type == SCM_CREDS &&
- datalen >= sizeof(struct uucred) &&
+ datalen >= sizeof(struct sockcred) &&
chunk >= CMSG_LEN(datalen)) {
- put_struct_uucred(proc, "cmsg_data", PF_LOCADDR,
- (vir_bytes)&buf[CMSG_LEN(0)]);
+ put_struct_sockcred(proc, "cmsg_data", PF_LOCADDR,
+ (vir_bytes)&buf[CMSG_LEN(0)],
+ datalen - sizeof(struct sockcred));
} else if (datalen > 0)
put_field(proc, "cmsg_data", "..");
TEXT(SO_REUSEPORT);
TEXT(SO_NOSIGPIPE);
TEXT(SO_TIMESTAMP);
- TEXT(SO_PASSCRED);
- TEXT(SO_PEERCRED);
TEXT(SO_SNDBUF);
TEXT(SO_RCVBUF);
TEXT(SO_SNDLOWAT);
const char *text;
int i;
struct linger l;
- struct uucred cr;
struct timeval tv;
void *ptr;
size_t size;
case SO_REUSEPORT:
case SO_NOSIGPIPE:
case SO_TIMESTAMP:
- case SO_PASSCRED:
case SO_SNDBUF:
case SO_RCVBUF:
case SO_SNDLOWAT:
ptr = &l;
size = sizeof(l);
break;
- case SO_PEERCRED:
- ptr = &cr;
- size = sizeof(cr);
- break;
case SO_SNDTIMEO:
case SO_RCVTIMEO:
ptr = &tv;
put_value(proc, "l_linger", "%d", l.l_linger);
put_close(proc, "}");
break;
- case SO_PEERCRED:
- put_struct_uucred(proc, name, PF_LOCADDR, (vir_bytes)&cr);
- break;
case SO_ERROR:
put_open(proc, name, 0, "{", ", ");
if (!valuesonly && (text = get_error_name(i)) != NULL)
#define SO_ACCEPTFILTER 0x1000 /* there is an accept filter */
#define SO_TIMESTAMP 0x2000 /* timestamp received dgram traffic */
-#if defined(__minix) && defined(_MINIX_SYSTEM)
-/* Minixism which should go, so hide it from userland. */
-#define SO_PASSCRED 0x100000
-#define SO_PEERCRED 0x200000
-#endif /* defined(__minix) */
-
/*
* Additional options, not kept in so_options.
*/