UDS: full rewrite

author David van Moolenbroek <david@minix3.org>

Sun, 21 Feb 2016 22:59:04 +0000 (22:59 +0000)

committer David van Moolenbroek <david@minix3.org>

Thu, 9 Mar 2017 23:39:56 +0000 (23:39 +0000)
author David van Moolenbroek <david@minix3.org>
Sun, 21 Feb 2016 22:59:04 +0000 (22:59 +0000)
committer David van Moolenbroek <david@minix3.org>
Thu, 9 Mar 2017 23:39:56 +0000 (23:39 +0000)
diff --git a/distrib/sets/lists/minix-base/mi b/distrib/sets/lists/minix-base/mi

index e2ee8526a59b32f1c468f886197c77f4c6577c98..8463f888f838088440d8e1147b54f351b9ad3303 100644 (file)
--- a/distrib/sets/lists/minix-base/mi
+++ b/distrib/sets/lists/minix-base/mi
@@ -179,6 +179,7 @@
  ./etc/system.conf.d/ipc                                 minix-base
  ./etc/system.conf.d/lwip                                minix-base
  ./etc/system.conf.d/random                              minix-base
+./etc/system.conf.d/uds                                 minix-base
  ./etc/system.conf.d/usb_hub                             minix-base
  ./etc/system.conf.d/usb_storage                         minix-base
  ./etc/termcap                                           minix-base
diff --git a/distrib/sets/lists/minix-man/mi b/distrib/sets/lists/minix-man/mi

index 6ce70fc755e4ca5879ee5144346fdfe78f010d9e..895353269e77117dbdfa5ca0f3c2090299c89731 100644 (file)
--- a/distrib/sets/lists/minix-man/mi
+++ b/distrib/sets/lists/minix-man/mi
@@ -477,7 +477,7 @@
  ./usr/man/man2/getgid.2                                 minix-man
  ./usr/man/man2/getitimer.2                              minix-man
  ./usr/man/man2/getnucred.2                              minix-man       obsolete
-./usr/man/man2/getpeereid.2                             minix-man
+./usr/man/man2/getpeereid.2                             minix-man       obsolete
  ./usr/man/man2/getpeername.2                            minix-man
  ./usr/man/man2/getpid.2                                 minix-man
  ./usr/man/man2/getpriority.2                            minix-man
@@ -3463,7 +3463,7 @@
  ./usr/man/man8/syslogd.8                                minix-man
  ./usr/man/man8/tcpd.8                                   minix-man
  ./usr/man/man8/traceroute.8                             minix-man
-./usr/man/man8/uds.8                                    minix-man
+./usr/man/man8/uds.8                                    minix-man       obsolete
  ./usr/man/man8/unix.8                                   minix-man
  ./usr/man/man8/unlink.8                                 minix-man
  ./usr/man/man8/unstr.8                                  minix-man
diff --git a/etc/system.conf b/etc/system.conf

index d89a6d3dfbdb71299386bcfb5baa2be60bc3c922..38c43f24fbcf36c05f60ab6974480598cc9223c8 100644 (file)
--- a/etc/system.conf
+++ b/etc/system.conf
@@ -494,14 +494,6 @@ service vnd
         uid     0;      # only for copyfd(2)
  };
  
-service uds
-{
-       ipc
-               SYSTEM vfs rs vm
-       ;
-       uid     0;      # only for checkperms(2) and copyfd(2)
-};
-
  service pty
  {
         system
diff --git a/etc/usr/rc b/etc/usr/rc

index 69dfb16070d3758704e3cb51fb38668164a4c919..1d5bb3e84633a527790d2281ed4b8f9eb6d0a65a 100644 (file)
--- a/etc/usr/rc
+++ b/etc/usr/rc
@@ -201,7 +201,7 @@ start)
      # pty needs to know the "tty" group ID
      up pty -dev /dev/ptmx -args "gid=`stat -f '%g' /dev/ptmx`"
  
-    up uds -dev /dev/uds
+    up uds
  
      up -n ipc
  
diff --git a/external/bsd/tmux/dist/client.c b/external/bsd/tmux/dist/client.c

index d790ea8e8cb2160d845222c2564fe9e494bc1cd8..ce88e544807b454dc5ae559cc61a2cbf51290378 100644 (file)
--- a/external/bsd/tmux/dist/client.c
+++ b/external/bsd/tmux/dist/client.c
@@ -107,11 +107,7 @@ client_connect(char *path, int start_server)
         }
  
  retry:
-#ifndef __minix
         if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) == -1)
-#else
-       if ((fd = socket(AF_UNIX, SOCK_SEQPACKET, 0)) == -1)
-#endif /* !defined(__minix) */
                 fatal("socket failed");
  
         if (connect(fd, (struct sockaddr *) &sa, SUN_LEN(&sa)) == -1) {
diff --git a/external/bsd/tmux/dist/server.c b/external/bsd/tmux/dist/server.c

index 5682afe666bdc5e9b0e15563b7e79a777e19ef77..33b8576c247171d08fe73780f912986cffd29281 100644 (file)
--- a/external/bsd/tmux/dist/server.c
+++ b/external/bsd/tmux/dist/server.c
@@ -84,11 +84,7 @@ server_create_socket(void)
         }
         unlink(sa.sun_path);
  
-#ifndef __minix
         if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) == -1)
-#else
-       if ((fd = socket(AF_UNIX, SOCK_SEQPACKET, 0)) == -1)
-#endif /* !defined(__minix) */
                 fatal("socket failed");
  
         mask = umask(S_IXUSR|S_IXGRP|S_IRWXO);
@@ -114,11 +110,7 @@ server_start(int lockfd, char *lockfile)
         char            *cause;
  
         /* The first client is special and gets a socketpair; create it. */
-#ifndef __minix
         if (socketpair(AF_UNIX, SOCK_STREAM, PF_UNSPEC, pair) != 0)
-#else
-       if (socketpair(AF_UNIX, SOCK_SEQPACKET, PF_UNSPEC, pair) != 0)
-#endif /* !defined(__minix) */
                 fatal("socketpair failed");
  
         switch (fork()) {
diff --git a/lib/libc/gen/syslog.c b/lib/libc/gen/syslog.c

index f074348116398abdcbee9455eef4d56e505ab7b6..5ece67ddd0170d2d83825121046a2196a8f2a155 100644 (file)
--- a/lib/libc/gen/syslog.c
+++ b/lib/libc/gen/syslog.c
@@ -59,10 +59,6 @@ __RCSID("$NetBSD: syslog.c,v 1.54 2014/09/18 13:58:20 christos Exp $");
  #include "reentrant.h"
  #include "extern.h"
  
-#if defined(__minix)
-#include <sys/ioctl.h>
-#endif /* defined(__minix) */
-
  #ifdef __weak_alias
  __weak_alias(closelog,_closelog)
  __weak_alias(openlog,_openlog)
@@ -452,11 +448,7 @@ vsyslogp_r(int pri, struct syslog_data *data, const char *msgid,
          * to give syslogd a chance to empty its socket buffer.
          */
         for (tries = 0; tries < MAXTRIES; tries++) {
-#if defined(__minix)
-               if (write(data->log_file, tbuf, cnt) != -1)
-#else
                 if (send(data->log_file, tbuf, cnt, 0) != -1)
-#endif /* defined(__minix) */
                         break;
                 if (errno != ENOBUFS) {
                         disconnectlog_r(data);
@@ -513,9 +505,7 @@ connectlog_r(struct syslog_data *data)
         /* AF_UNIX address of local logger */
         static const struct sockaddr_un sun = {
                 .sun_family = AF_LOCAL,
-#if !defined(__minix)
                 .sun_len = sizeof(sun),
-#endif /* !defined(__minix) */
                 .sun_path = _PATH_LOG,
         };
  
@@ -526,14 +516,9 @@ connectlog_r(struct syslog_data *data)
                 data->log_connected = 0;
         }
         if (!data->log_connected) {
-#if defined(__minix)
-               if(ioctl(data->log_file, NWIOSUDSTADDR, __UNCONST(&sun)) < 0)
-
-#else
                 if (connect(data->log_file,
                     (const struct sockaddr *)(const void *)&sun,
                     (socklen_t)sizeof(sun)) == -1)
-#endif /* defined(__minix) */
                 {
                         (void)close(data->log_file);
                         data->log_file = -1;
diff --git a/lib/libc/net/Makefile.inc b/lib/libc/net/Makefile.inc

index bf65717a15a7860a670b7156721bcd5110179af3..fa29213c587b0459067b6205391250e3fa200072 100644 (file)
--- a/lib/libc/net/Makefile.inc
+++ b/lib/libc/net/Makefile.inc
@@ -2,13 +2,6 @@
  #      @(#)Makefile.inc        8.2 (Berkeley) 9/5/93
  
  # net sources
-.if defined(__MINIX)
-.PATH: ${NETBSDSRCDIR}/minix/lib/libc/net
-
-CPPFLAGS.getpeereid.c+= -D_MINIX_SYSTEM=1
-CPPFLAGS.getsockopt.c+= -D_MINIX_SYSTEM=1
-CPPFLAGS.setsockopt.c+= -D_MINIX_SYSTEM=1
-.endif
  .PATH: ${ARCHDIR}/net ${.CURDIR}/net
  
  SRCS+= base64.c ethers.c gethnamaddr.c getifaddrs.c \
diff --git a/minix/commands/DESCRIBE/DESCRIBE.sh b/minix/commands/DESCRIBE/DESCRIBE.sh

index e826052abf56b78f56f55ec3f26759371d5f407e..09f56ecf569415e8c66f1e44068a02e7096d3831 100644 (file)
--- a/minix/commands/DESCRIBE/DESCRIBE.sh
+++ b/minix/commands/DESCRIBE/DESCRIBE.sh
@@ -192,9 +192,6 @@ do
      17,0)
         des="hello" dev=hello
         ;;
-    18,0)
-       des="UNIX domain socket" dev=uds
-       ;;
      5[6-9],0|6[0-3],0)
         drive=`expr $major - 56`
         des="vnode disk $drive" dev=vnd$drive
diff --git a/minix/commands/MAKEDEV/MAKEDEV.sh b/minix/commands/MAKEDEV/MAKEDEV.sh

index 78d647dcb43437e5a420b3b54470b9a4a01dcddb..8c8a98b126caae4d4cc400c492860d3676b8afcc 100755 (executable)
--- a/minix/commands/MAKEDEV/MAKEDEV.sh
+++ b/minix/commands/MAKEDEV/MAKEDEV.sh
@@ -49,7 +49,6 @@ STD_DEVICES="
         ttypa ttypb ttypc ttypd ttype ttypf
         ttyq0 ttyq1 ttyq2 ttyq3 ttyq4 ttyq5 ttyq6 ttyq7 ttyq8 ttyq9
         ttyqa ttyqb ttyqc ttyqd ttyqe ttyqf
-       uds
         vnd0 vnd0p0 vnd0p0s0 vnd1 vnd1p0 vnd1p0s0
         vnd2 vnd3 vnd4 vnd5 vnd6 vnd7
  "
@@ -134,7 +133,6 @@ Where key is one of the following:
    klog                    # Make /dev/klog
    ptmx                    # Make /dev/ptmx
    random                  # Make /dev/random, /dev/urandom
-  uds                     # Make /dev/uds
    filter                  # Make /dev/filter
    fbd                     # Make /dev/fbd
    hello                   # Make /dev/hello
@@ -438,10 +436,6 @@ do
  
                 makedev ${dev} c 4 ${minor} ${uname} tty ${permissions}
                 ;;
-       uds)
-               # Unix domain sockets device
-               makedev ${dev} c 18 0 ${uname} ${gname} 666
-               ;;
         vnd[0-7])
                 # Whole vnode disk devices.
                 makedev ${dev} b ${major} 0 ${uname} ${gname} ${permissions}
diff --git a/minix/include/minix/dmap.h b/minix/include/minix/dmap.h

index 8c6560d3bad7e5cd6756024ab4c27b250e0f5802..c027317476e26b5e8b40735fecf0a98ff678e9f9 100644 (file)
--- a/minix/include/minix/dmap.h
+++ b/minix/include/minix/dmap.h
@@ -36,8 +36,8 @@
  #define LOG_MAJOR                15    /* 15 = /dev/klog   (log driver)      */
  #define RANDOM_MAJOR             16    /* 16 = /dev/random (random driver)   */
  #define HELLO_MAJOR              17    /* 17 = /dev/hello  (hello driver)    */
-#define UDS_MAJOR                18    /* 18 = /dev/uds    (pfs)             */
-#define FB_MAJOR                 19    /* 18 = /dev/fb0    (fb driver)       */
+                                       /* 18 = (unused)                      */
+#define FB_MAJOR                 19    /* 19 = /dev/fb0    (fb driver)       */
  #define I2C0_MAJOR               20    /* 20 = /dev/i2c-1  (i2c-dev)         */
  #define I2C1_MAJOR               21    /* 21 = /dev/i2c-2  (i2c-dev)         */
  #define I2C2_MAJOR               22    /* 22 = /dev/i2c-3  (i2c-dev)         */
diff --git a/minix/include/minix/syslib.h b/minix/include/minix/syslib.h

index 98f58cbd5cac82b5224ab8528a2b4e793f6272f1..1856c41daa8b18700c0951d4a49becd925671713 100644 (file)
--- a/minix/include/minix/syslib.h
+++ b/minix/include/minix/syslib.h
@@ -273,11 +273,10 @@ uid_t getnuid(endpoint_t proc_ep);
  gid_t getngid(endpoint_t proc_ep);
  int getsockcred(endpoint_t proc_ep, struct sockcred * sockcred, gid_t * groups,
         int ngroups);
-int socketpath(endpoint_t endpt, char *path, size_t size, int what, dev_t *dev,
-       ino_t *ino);
+int socketpath(endpoint_t endpt, const char *path, size_t size, int what,
+       dev_t *dev, ino_t *ino);
  #define SPATH_CHECK    0       /* check user permissions on socket path */
  #define SPATH_CREATE   1       /* create socket file at given path */
-#define SPATH_CANONIZE 0x8000  /* copy back canonized path (legacy support) */
  int copyfd(endpoint_t endpt, int fd, int what);
  #define COPYFD_FROM    0       /* copy file descriptor from remote process */
  #define COPYFD_TO      1       /* copy file descriptor to remote process */
diff --git a/minix/lib/libc/net/getpeereid.c b/minix/lib/libc/net/getpeereid.c

deleted file mode 100644 (file)

index 7638c12..0000000
--- a/minix/lib/libc/net/getpeereid.c
+++ /dev/null
@@ -1,37 +0,0 @@
-#include <errno.h>
-#include <string.h>
-#include <unistd.h>
-#include <sys/socket.h>
-#include <sys/ucred.h>
-
-/*
- * get the effective user ID and effective group ID of a peer
- * connected through a Unix domain socket.
- */
-int getpeereid(int sd, uid_t *euid, gid_t *egid) {
-       int rc;
-       struct uucred cred;
-       socklen_t ucred_length;
-
-       /* Initialize Data Structures */
-       ucred_length = sizeof(struct uucred);
-       memset(&cred, '\0', ucred_length);
-
-       /* Validate Input Parameters */
-       if (euid == NULL || egid == NULL) {
-               errno = EFAULT;
-               return -1;
-       } /* getsockopt will handle validating 'sd' */
-
-       /* Get the credentials of the peer at the other end of 'sd' */
-       rc = getsockopt(sd, SOL_SOCKET, SO_PEERCRED, &cred, &ucred_length);
-       if (rc == 0) {
-               /* Success - return the results */
-               *euid = cred.cr_uid;
-               *egid = cred.cr_gid;
-               return 0;
-       } else {
-               /* Failure - getsockopt takes care of setting errno */
-               return -1;
-       }
-}
diff --git a/minix/lib/libc/sys/getsockopt.c b/minix/lib/libc/sys/getsockopt.c

index 7edea530e71cfaf4d5f260856295780c9a070f65..7c000fb05a365cd80ce6a2bf8d3a07c420040640 100644 (file)
--- a/minix/lib/libc/sys/getsockopt.c
+++ b/minix/lib/libc/sys/getsockopt.c
@@ -244,6 +244,7 @@ static int _uds_getsockopt(int sock, int level, int option_name,
                 return 0;
         }
  
+#ifdef SO_PEERCRED
         if (level == SOL_SOCKET && option_name == SO_PEERCRED)
         {
                 struct uucred cred;
@@ -257,6 +258,7 @@ static int _uds_getsockopt(int sock, int level, int option_name,
                                                         option_len);
                 return 0;
         }
+#endif
  
  
         if (level == SOL_SOCKET && option_name == SO_REUSEADDR)
@@ -269,12 +271,14 @@ static int _uds_getsockopt(int sock, int level, int option_name,
                 return 0;
         }
  
+#ifdef SO_PASSCRED
         if (level == SOL_SOCKET && option_name == SO_PASSCRED)
         {
                 i = 1;  /* option is always 'on' */
                 getsockopt_copy(&i, sizeof(i), option_value, option_len);
                 return 0;
         }
+#endif
  
  #if DEBUG
         fprintf(stderr, "_uds_getsocketopt: level %d, name %d\n",
diff --git a/minix/lib/libc/sys/setsockopt.c b/minix/lib/libc/sys/setsockopt.c

index 04c0d311d38766d138145b02fd3b6b5ff60c6a97..cbe29ed03d54c98049ba3366d29a5ca806d5d008 100644 (file)
--- a/minix/lib/libc/sys/setsockopt.c
+++ b/minix/lib/libc/sys/setsockopt.c
@@ -267,6 +267,7 @@ static int _uds_setsockopt(int sock, int level, int option_name,
                 return 0;
         }
  
+#ifdef SO_PASSCRED
         if (level == SOL_SOCKET && option_name == SO_PASSCRED)
         {
                 if (option_len != sizeof(i))
@@ -283,6 +284,7 @@ static int _uds_setsockopt(int sock, int level, int option_name,
                 }
                 return 0;
         }
+#endif
  
  #if DEBUG
         fprintf(stderr, "_uds_setsocketopt: level %d, name %d\n",
diff --git a/minix/lib/libsys/socketpath.c b/minix/lib/libsys/socketpath.c

index 2473a781522d4cef7c448d4bec4d378884954310..e4634bfb0d2d02061abca60e5b70651176554f8b 100644 (file)
--- a/minix/lib/libsys/socketpath.c
+++ b/minix/lib/libsys/socketpath.c
@@ -5,22 +5,22 @@
  #include <minix/safecopies.h>
  
  int
-socketpath(endpoint_t endpt, char * path, size_t size, int what, dev_t * dev,
-       ino_t * ino)
+socketpath(endpoint_t endpt, const char * path, size_t size, int what,
+       dev_t * dev, ino_t * ino)
  {
         cp_grant_id_t grant;
         message m;
         int r;
  
         if ((grant = cpf_grant_direct(VFS_PROC_NR, (vir_bytes)path, size,
-           CPF_READ | CPF_WRITE)) == GRANT_INVALID)
+           CPF_READ)) == GRANT_INVALID)
                 return ENOMEM;
  
         memset(&m, 0, sizeof(m));
         m.m_lsys_vfs_socketpath.endpt = endpt;
         m.m_lsys_vfs_socketpath.grant = grant;
         m.m_lsys_vfs_socketpath.count = size;
-       m.m_lsys_vfs_socketpath.what = what | SPATH_CANONIZE;
+       m.m_lsys_vfs_socketpath.what = what;
  
         r = _taskcall(VFS_PROC_NR, VFS_SOCKETPATH, &m);
  
diff --git a/minix/man/man2/Makefile b/minix/man/man2/Makefile

index 3db88c945f2ba8480d38742e430408862c9ce67e..1342b77ecfe1dace157c5985841ad986c0ea320e 100644 (file)
--- a/minix/man/man2/Makefile
+++ b/minix/man/man2/Makefile
@@ -1,6 +1,6 @@
  MAN=   accept.2 access.2 bind.2 brk.2 chdir.2 chmod.2 chown.2 \
         chroot.2 close.2 connect.2 creat.2 dup.2 execve.2 exit.2 fcntl.2 \
-       fork.2 getgid.2 getitimer.2 getpeereid.2 \
+       fork.2 getgid.2 getitimer.2 \
         getpeername.2 getpid.2 getpriority.2 getsockname.2 getsockopt.2 \
         gettimeofday.2 getuid.2 intro.2 ioctl.2 kill.2 link.2 listen.2 \
         lseek.2 mkdir.2 mknod.2 mount.2 open.2 ptrace.2 \
diff --git a/minix/man/man2/getpeereid.2 b/minix/man/man2/getpeereid.2

deleted file mode 100644 (file)

index 2c0a15f..0000000
--- a/minix/man/man2/getpeereid.2
+++ /dev/null
@@ -1,42 +0,0 @@
-.TH GETPEEREID 2
-.SH NAME
-getpeereid \- get the effective user ID and effective group ID of a peer
-connected through a Unix domain socket.
-.SH SYNOPSIS
-.ft B
-#include <sys/socket.h>
-
-.in +5
-.ti -5
-int getpeereid(int \fIsd\fP, uid_t *\fIeuid\fP, gid_t *\fIegid\fP);
-.br
-.ft P
-.SH DESCRIPTION
-getpeereid() is often used to authenticate clients connecting to a 
-server through a Unix domain socket. The server can call this function 
-with a socket descriptor \fIsd\fP and this function will fill\-in 
-\fIeuid\fP and \fIegid\fP with the effective user ID and the effective 
-group ID of the client process.
-.SH RETURN VALUES
-On success, this function returns 0, \fIeuid\fP is set to the effective 
-user ID of the peer connected through Unix domain socket \fIsd\fP, and 
-\fIegid\fP is set to the effective group ID of the peer connected 
-through Unix domain socket \fIsd\fP. On error, -1 is returned and 
-\fIerrno\fP is set.
-.SH ERRORS
-.TP 15
-[EBADF]
-The argument \fIsd\fP is not a descriptor.
-.TP 15
-[ENOTSOCK]
-The argument \fIsd\fP is a descriptor, but not a socket descriptor.
-.TP 15
-[EFAULT]
-The address pointed to by \fIeuid\fP and/or \fIegid\fP is not in a 
-valid part of the process address space.
-.SH SEE ALSO
-.BR socket(2),
-.BR socketpair(2),
-.BR unix(8)
-.SH HISTORY
-This function first appeared in Minix 3.1.8.
diff --git a/minix/net/uds/Makefile b/minix/net/uds/Makefile

index 8ae35c943d88e7b0e580bd6a26859e8c45a89927..a85c4483e9b5e5d3a4ec19fdf056f796aeea7b21 100644 (file)
--- a/minix/net/uds/Makefile
+++ b/minix/net/uds/Makefile
@@ -1,9 +1,15 @@
  # Makefile for the UNIX Domain Sockets driver (UDS)
  PROG=  uds
-SRCS=  uds.c ioc_uds.c
-MAN=   uds.8 unix.8
+SRCS=  uds.c io.c stat.c
+MAN=   unix.8
  
-DPADD+=        ${LIBCHARDRIVER} ${LIBSYS}
-LDADD+=        -lchardriver -lsys
+FILES=${PROG}.conf
+FILESNAME=${PROG}
+FILESDIR= /etc/system.conf.d
+
+DPADD+=        ${LIBSOCKEVENT} ${LIBSOCKDRIVER} ${LIBSYS} ${LIBTIMERS}
+LDADD+=        -lsockevent -lsockdriver -lsys -ltimers
+
+WARNS?=        5
  
  .include <minix.service.mk>
diff --git a/minix/net/uds/io.c b/minix/net/uds/io.c

new file mode 100644 (file)

index 0000000..1b8de37
--- /dev/null
+++ b/minix/net/uds/io.c
@@ -0,0 +1,1795 @@
+/* UNIX Domain Sockets - io.c - sending and receiving */
+
+#include "uds.h"
+#include <sys/mman.h>
+
+/*
+ * Our UDS sockets do not have a send buffer.  They only have a receive buffer.
+ * This receive buffer, when not empty, is split up in segments.  Each segment
+ * may contain regular data, ancillary data, both, or (for SOCK_SEQPACKET and
+ * SOCK_DGRAM) neither.  There are two types of ancillary data: in-flight file
+ * descriptors and sender credentials.  In addition, for SOCK_DGRAM sockets,
+ * the segment may contain the sender's socket path (if the sender's socket is
+ * bound).  Each segment has a header, containing the full segment size,
+ * the size of the actual data in the segment (if any), and a flags field that
+ * states which ancillary data are associated with the segment (if any).  For
+ * SOCK_STREAM type sockets, new data may be merged into a previous segment,
+ * but only if it has no ancillary data.  For the other two socket types, each
+ * packet has its own header.  The resulting behavior should be in line with
+ * the POSIX "Socket Receive Queue" specification.
+ *
+ * More specifically, each segment consists of the following parts:
+ * - always a five-byte header, containing a two-byte segment length (including
+ *   the header, so always non-zero), a two-byte regular data length (zero or
+ *   more), and a one-byte flags field which is a bitwise combination of
+ *   UDS_HAS_{FDS,CRED,PATH} flags;
+ * - next, if UDS_HAS_CRED is set in the segment header: a sockcred structure;
+ *   since this structure is variable-size, the structure is prepended by a
+ *   single byte that contains the length of the structure (excluding the byte
+ *   itself, thus ranging from sizeof(struct sockcred) to UDS_MAXCREDLEN);
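+ * - next, if UDS_HAS_PATH is set in the segment header: a single byte that
+ *   contains the length of the sender's socket path, followed by the path
+ *   itself, stored without null terminator;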
+ * - next, if the data length is non-zero, the actual regular data.
+ * If the segment is not the last in the receive buffer, it is followed by the
+ * next segment immediately afterward.  There is no alignment.
+ *
+ * It is the sender's responsibility to merge new data into the last segment
+ * whenever possible, so that the receiver side never needs to consider more
+ * than one segment at once.  In order to allow such merging, each receive
+ * buffer has not only a tail and in-use length (pointing to the head when
+ * combined) but also an offset from the tail to the last header, if any.  Note
+ * that the receiver may over time still look at multiple segments for a single
+ * request: this happens when a MSG_WAITALL request empties the buffer and then
+ * blocks - the next piece of arriving data can then obviously not be merged.
+ *
+ * If a segment has the UDS_HAS_FDS flag set, then one or more in-flight file
+ * descriptors are associated with the segment.  These are stored in a separate
+ * data structure, mainly to simplify cleaning up when the socket is shut down
+ * for reading or closed.  That structure also contains the number of file
+ * descriptors associated with the current segment, so this is not stored in
+ * the segment itself.  As mentioned later, this may be changed in the future.
+ *
+ * On the sender side, there is a trade-off between fully utilizing the receive
+ * buffer, and not repeatedly performing expensive actions for the same call:
+ * it may be costly to determine exactly how many in-flight file descriptors
+ * there will be (if any) and/or how much space is needed to store credentials.
+ * We currently use the policy that we rather block/reject a send request that
+ * may (just) have fit in the remaining part of the receive buffer, than obtain
+ * the same information multiple times or keep state between callbacks.  In
+ * practice this is not expected to make a difference, especially since
+ * transfer of ancillary data should be rare anyway.
+ */
+/*
+ * The current layout of the segment header is as follows.
+ *
+ * The first byte contains the upper eight bits of the total segment length.
+ * The second byte contains the lower eight bits of the total segment length.
+ * The third byte contains the upper eight bits of the data length.
+ * The fourth byte contains the lower eight bits of the data length.
+ * The fifth byte is a bitmask for ancillary data associated with the segment.
+ */
+#define UDS_HDRLEN     5
+
+#define UDS_HAS_FDS    0x01    /* segment has in-flight file descriptors */
+#define UDS_HAS_CRED   0x02    /* segment has sender credentials */
+#define UDS_HAS_PATH   0x04    /* segment has source socket path */
+
+#define UDS_MAXCREDLEN SOCKCREDSIZE(NGROUPS_MAX)
+
+#define uds_get_head(uds)      \
+       ((size_t)((uds)->uds_tail + (uds)->uds_len) % UDS_BUF)
+#define uds_get_last(uds)      \
+       ((size_t)((uds)->uds_tail + (uds)->uds_last) % UDS_BUF)
+#define uds_advance(pos,add) (((pos) + (add)) % UDS_BUF)
+
+/*
+ * All in-flight file descriptors are (co-)owned by the UDS driver itself, as
+ * local open file descriptors.  Like any other process, the UDS driver can not
+ * have more than OPEN_MAX open file descriptors at any time.  Thus, this is
+ * also the inherent maximum number of in-flight file descriptors.  Therefore,
+ * we maintain a single pool of in-flight FD structures, and we associate these
+ * structures with sockets as needed.
+ */
+static struct uds_fd uds_fds[OPEN_MAX];
+static SIMPLEQ_HEAD(uds_freefds, uds_fd) uds_freefds;
+
+static char uds_ctlbuf[UDS_CTL_MAX];
+static int uds_ctlfds[UDS_CTL_MAX / sizeof(int)];
+
+/*
+ * Initialize the input/output part of the UDS service.
+ */
+void
+uds_io_init(void)
+{
+       unsigned int slot;
+
+       SIMPLEQ_INIT(&uds_freefds);
+
+       for (slot = 0; slot < __arraycount(uds_fds); slot++)
+               SIMPLEQ_INSERT_TAIL(&uds_freefds, &uds_fds[slot], ufd_next);
+}
+
+/*
+ * Set up all input/output state for the given socket, which has just been
+ * allocated.  As part of this, allocate memory for the receive buffer of the
+ * socket.  Return OK or a negative error code.
+ */
+int
+uds_io_setup(struct udssock * uds)
+{
+
+       /* TODO: decide if we should preallocate the memory. */
+       if ((uds->uds_buf = mmap(NULL, UDS_BUF, PROT_READ | PROT_WRITE,
+           MAP_ANON | MAP_PRIVATE, -1, 0)) == MAP_FAILED)
+               return ENOMEM;
+
+       uds->uds_tail = 0;
+       uds->uds_len = 0;
+       uds->uds_last = 0;
+
+       SIMPLEQ_INIT(&uds->uds_fds);
+
+       return OK;
+}
+
+/*
+ * Clean up the input/output state for the given socket, which is about to be
+ * freed.  As part of this, deallocate memory for the receive buffer and close
+ * any file descriptors still in flight on the socket.
+ */
+void
+uds_io_cleanup(struct udssock * uds)
+{
+
+       /* Close any in-flight file descriptors. */
+       uds_io_reset(uds);
+
+       /* Free the receive buffer memory. */
+       if (munmap(uds->uds_buf, UDS_BUF) != 0)
+               panic("UDS: munmap failed: %d", errno);
+}
+
+/*
+ * The socket is being closed or shut down for reading.  If there are still any
+ * in-flight file descriptors, they will never be received anymore, so close
+ * them now.
+ */
+void
+uds_io_reset(struct udssock * uds)
+{
+       struct uds_fd *ufd;
+
+       /*
+        * The UDS service may have the last and only reference to any of these
+        * file descriptors here.  For that reason, we currently disallow
+        * transfer of UDS file descriptors, because the close(2) here could
+        * block on a socket close operation back to us, leading to a deadlock.
+        * Also, we use a non-blocking variant of close(2), to prevent that we
+        * end up hanging on sockets with SO_LINGER turned on.
+        */
+       SIMPLEQ_FOREACH(ufd, &uds->uds_fds, ufd_next) {
+               dprintf(("UDS: closing local fd %d\n", ufd->ufd_fd));
+
+               closenb(ufd->ufd_fd);
+       }
+
+       SIMPLEQ_CONCAT(&uds_freefds, &uds->uds_fds);
+
+       /*
+        * If this reset happens as part of a shutdown, it might be done
+        * again on close, so ensure that it will find a clean state.  The
+        * receive buffer should never be looked at again either way, but reset
+        * it too just to be sure.
+        */
+       uds->uds_tail = 0;
+       uds->uds_len = 0;
+       uds->uds_last = 0;
+
+       SIMPLEQ_INIT(&uds->uds_fds);
+}
+
+/*
+ * Return the maximum usable part of the receive buffer, in bytes.  The return
+ * value is used for the SO_SNDBUF and SO_RCVBUF socket options.
+ */
+size_t
+uds_io_buflen(void)
+{
+
+       /*
+        * TODO: it would be nicer if at least for SOCK_STREAM-type sockets, we
+        * could use the full receive buffer for data.  This would require that
+        * we store up to one header in the socket object rather than in the
+        * receive buffer.
+        */
+       return UDS_BUF - UDS_HDRLEN;
+}
+
+/*
+ * Fetch 'len' bytes starting from absolute position 'off' into the receive
+ * buffer of socket 'uds', and copy them into the buffer pointed to by 'ptr'.
+ * Return the absolute position of the first byte after the fetched data in the
+ * receive buffer.
+ */
+static size_t
+uds_fetch(struct udssock * uds, size_t off, void * ptr, size_t len)
+{
+       size_t left;
+
+       assert(off < UDS_BUF);
+
+       left = UDS_BUF - off;
+       if (len >= left) {
+               memcpy(ptr, &uds->uds_buf[off], left);
+
+               if ((len -= left) > 0)
+                       memcpy((char *)ptr + left, &uds->uds_buf[0], len);
+
+               return len;
+       } else {
+               memcpy(ptr, &uds->uds_buf[off], len);
+
+               return off + len;
+       }
+}
+
+/*
+ * Store 'len' bytes from the buffer pointed to by 'ptr' into the receive
+ * buffer of socket 'uds', starting at absolute position 'off' into the receive
+ * buffer.  Return the absolute position of the first byte after the stored
+ * data in the receive buffer.
+ */
+static size_t
+uds_store(struct udssock * uds, size_t off, const void * ptr, size_t len)
+{
+       size_t left;
+
+       assert(off < UDS_BUF);
+
+       left = UDS_BUF - off;
+       if (len >= left) {
+               memcpy(&uds->uds_buf[off], ptr, left);
+
+               if ((len -= left) > 0)
+                       memcpy(&uds->uds_buf[0], (const char *)ptr + left,
+                           len);
+
+               return len;
+       } else {
+               memcpy(&uds->uds_buf[off], ptr, len);
+
+               return off + len;
+       }
+}
+
+/*
+ * Fetch a segment header previously stored in the receive buffer of socket
+ * 'uds' at absolute position 'off'.  Return the absolute position of the first
+ * byte after the header, as well as the entire segment length in 'seglen', the
+ * length of the data in the segment in 'datalen', and the segment flags in
+ * 'segflags'.
+ */
+static size_t
+uds_fetch_hdr(struct udssock * uds, size_t off, size_t * seglen,
+       size_t * datalen, unsigned int * segflags)
+{
+       unsigned char hdr[UDS_HDRLEN];
+
+       off = uds_fetch(uds, off, hdr, sizeof(hdr));
+
+       *seglen = ((size_t)hdr[0] << 8) | (size_t)hdr[1];
+       *datalen = ((size_t)hdr[2] << 8) | (size_t)hdr[3];
+       *segflags = hdr[4];
+
+       assert(*seglen >= UDS_HDRLEN);
+       assert(*seglen <= uds->uds_len);
+       assert(*datalen <= *seglen - UDS_HDRLEN);
+       assert(*segflags != 0 || *datalen == *seglen - UDS_HDRLEN);
+       assert(!(*segflags & ~(UDS_HAS_FDS | UDS_HAS_CRED | UDS_HAS_PATH)));
+
+       return off;
+}
+
+/*
+ * Store a segment header in the receive buffer of socket 'uds' at absolute
+ * position 'off', with the segment length 'seglen', the segment data length
+ * 'datalen', and the segment flags 'segflags'.  Return the absolute receive
+ * buffer position of the first data byte after the stored header.
+ */
+static size_t
+uds_store_hdr(struct udssock * uds, size_t off, size_t seglen, size_t datalen,
+       unsigned int segflags)
+{
+       unsigned char hdr[UDS_HDRLEN];
+
+       assert(seglen <= USHRT_MAX);
+       assert(datalen <= seglen);
+       assert(segflags <= UCHAR_MAX);
+       assert(!(segflags & ~(UDS_HAS_FDS | UDS_HAS_CRED | UDS_HAS_PATH)));
+
+       hdr[0] = (seglen >> 8) & 0xff;
+       hdr[1] = seglen & 0xff;
+       hdr[2] = (datalen >> 8) & 0xff;
+       hdr[3] = datalen & 0xff;
+       hdr[4] = segflags;
+
+       return uds_store(uds, off, hdr, sizeof(hdr));
+}
+
+/*
+ * Validate a send request up front, before the request may end up being
+ * suspended.  Return OK if the request is acceptable, or a negative error
+ * code (EOPNOTSUPP, EMSGSIZE, EDESTADDRREQ) if it is not.
+ */
+int
+uds_pre_send(struct sock * sock, size_t len, socklen_t ctl_len __unused,
+       const struct sockaddr * addr, socklen_t addr_len __unused,
+       endpoint_t user_endpt __unused, int flags)
+{
+       struct udssock *uds = (struct udssock *)sock;
+       size_t srclen;
+
+       /*
+        * No send flags are supported here at all: the flags that are handled
+        * entirely by libsockevent are not part of 'flags', and we reject
+        * everything else.  TODO: ensure that we should really reject all
+        * other flags rather than ignore them.
+        */
+       if (flags != 0)
+               return EOPNOTSUPP;
+
+       /*
+        * Apply very basic address and message size checks.  A packet on a
+        * non-stream socket must be rejected if it can never fit in the
+        * receive buffer, as otherwise (at least for SOCK_SEQPACKET) the send
+        * call could remain suspended forever.  We therefore assume the worst
+        * case, namely that a full set of credentials has to be stored along
+        * with the packet.  This means that some large packets that would
+        * actually just fit are rejected as well.  Looking at the peer's
+        * LOCAL_CREDS setting is not safe at this point: even if the peer is
+        * known already (for SOCK_DGRAM it is not), the send may still block
+        * and the option may be toggled before it unblocks.
+        */
+       switch (uds_get_type(uds)) {
+       case SOCK_STREAM:
+               /* Streams impose no limit on the request size. */
+               return OK;
+
+       case SOCK_SEQPACKET:
+               if (len > UDS_BUF - UDS_HDRLEN - 1 - UDS_MAXCREDLEN)
+                       return EMSGSIZE;
+
+               return OK;
+
+       case SOCK_DGRAM:
+               /* An unconnected datagram socket needs a destination. */
+               if (addr == NULL && !uds_has_link(uds))
+                       return EDESTADDRREQ;
+
+               /*
+                * If the sending socket has a path, it is stored without null
+                * terminator but preceded by one byte with the path length.
+                */
+               srclen = (size_t)uds->uds_pathlen;
+               if (srclen != 0)
+                       srclen += 1;
+
+               if (len > UDS_BUF - UDS_HDRLEN - srclen - 1 - UDS_MAXCREDLEN)
+                       return EMSGSIZE;
+
+               return OK;
+
+       default:
+               assert(0);
+       }
+
+       return OK;
+}
+
+/*
+ * Determine whether the (real or pretend) send request should be processed
+ * now, suspended until later, or rejected based on the current socket state.
+ * Return OK if the send request should be processed now.  Return SUSPEND if
+ * the send request should be retried later.  Return an appropriate negative
+ * error code if the send request should fail.
+ *
+ * The 'len' parameter is the data length of the request, 'ctl_len' is the
+ * length of its control data (only tested for being nonzero here), 'min' is
+ * the low threshold used for SOCK_STREAM sockets, and 'partial' is nonzero if
+ * the request is being resumed after having made partial progress already.
+ * For SOCK_DGRAM sockets, none of the checks below apply and the result is
+ * always OK.  The error codes generated here are ENOTCONN and EPIPE.
+ */
+static int
+uds_send_test(struct udssock * uds, size_t len, socklen_t ctl_len, size_t min,
+       int partial)
+{
+       struct udssock *conn;
+       size_t avail, hdrlen, credlen;
+
+       assert(!uds_is_shutdown(uds, SFL_SHUT_WR));
+
+       if (uds_get_type(uds) != SOCK_DGRAM) {
+               if (uds_is_connecting(uds))
+                       return SUSPEND;
+               if (!uds_is_connected(uds) && !uds_is_disconnected(uds))
+                       return ENOTCONN;
+               if (!uds_has_conn(uds))
+                       return EPIPE;
+
+               conn = uds->uds_conn;
+
+               if (uds_is_shutdown(conn, SFL_SHUT_RD))
+                       return EPIPE;
+
+               /*
+                * For connection-type sockets, we now have to check if there
+                * is enough room in the receive buffer.  For SOCK_STREAM
+                * sockets, we must check if at least 'min' bytes can be moved
+                * into the receive buffer, at least if that is a reasonable
+                * value for ever making any forward progress at all.  For
+                * SOCK_SEQPACKET sockets, we must check if the entire packet
+                * of size 'len' can be stored in the receive buffer.  In both
+                * cases, we must take into account any metadata to store along
+                * with the data.
+                *
+                * Unlike in uds_pre_send(), we can now check safely whether
+                * the peer is expecting credentials, but we still don't know
+                * the actual size of the credentials, so again we take the
+                * maximum possible size.  The same applies to file descriptors
+                * transferred via control data: all we have is the control
+                * length right now, which if non-zero we assume to mean there
+                * might be file descriptors.
+                *
+                * In both cases, the reason of overestimating is that actually
+                * getting accurate sizes, by obtaining credentials or copying
+                * in control data, is very costly.  We want to do that only
+                * when we are sure we will not suspend the send call after
+                * all.  It is no problem to overestimate how much space will
+                * be needed here, but not to underestimate: that could cause
+                * applications that use select(2) and non-blocking sockets to
+                * end up in a busy-wait loop.
+                */
+               if (!partial && (conn->uds_flags & UDSF_PASSCRED))
+                       credlen = 1 + UDS_MAXCREDLEN;
+               else
+                       credlen = 0;
+
+               avail = UDS_BUF - conn->uds_len; /* room left in peer buffer */
+
+               if (uds_get_type(uds) == SOCK_STREAM) {
+                       /*
+                        * Limit the low threshold to the maximum that can ever
+                        * be sent at once.
+                        */
+                       if (min > UDS_BUF - UDS_HDRLEN - credlen)
+                               min = UDS_BUF - UDS_HDRLEN - credlen;
+
+                       /*
+                        * Suspend the call only if not even the low threshold
+                        * is met.  Otherwise we may make (partial) progress.
+                        */
+                       if (len > min)
+                               len = min;
+
+                       /*
+                        * If the receive buffer already has at least one
+                        * segment, and there are certainly no file descriptors
+                        * to transfer now, and we do not have to store
+                        * credentials either, then this segment can be merged
+                        * with the previous one.  In that case, we need no
+                        * space for a header.  That is certainly the case if
+                        * we are resuming an already partially completed send.
+                        */
+                       hdrlen = (avail == UDS_BUF || ctl_len != 0 ||
+                           credlen > 0) ? UDS_HDRLEN : 0;
+               } else
+                       hdrlen = UDS_HDRLEN; /* packets are never merged */
+
+               if (avail < hdrlen + credlen + len)
+                       return SUSPEND;
+       }
+
+       return OK;
+}
+
+/*
+ * Resolve the socket that is to receive the current send request, which has
+ * already passed the send test.  On success, store a pointer to the peer
+ * socket in 'peerp' and return OK.  On failure, return an appropriate error
+ * code, leaving 'peerp' untouched.
+ */
+static int
+uds_send_peer(struct udssock * uds, const struct sockaddr * addr,
+       socklen_t addr_len, endpoint_t user_endpt, struct udssock ** peerp)
+{
+       struct udssock *target;
+       int res;
+
+       /* Connection-type sockets simply deliver to their connected peer. */
+       if (uds_get_type(uds) != SOCK_DGRAM) {
+               assert(uds_has_conn(uds));
+
+               *peerp = uds->uds_conn;
+               return OK;
+       }
+
+       if (uds_has_link(uds)) {
+               /* The sending datagram socket has a fixed target. */
+               target = uds->uds_link;
+       } else {
+               /* The address was already checked in uds_pre_send(). */
+               assert(addr != NULL);
+
+               /*
+                * Look up the socket identified by the given address.  If
+                * there is one at all, see if it is a proper match.
+                */
+               res = uds_lookup(uds, addr, addr_len, user_endpt, &target);
+               if (res != OK)
+                       return res;
+
+               /*
+                * A peer socket that is connected to a target must be
+                * connected to this very socket.  POSIX does not specify an
+                * error code for this case, so we borrow the one Linux uses.
+                */
+               if (uds_has_link(target) && target->uds_link != uds)
+                       return EPERM;
+       }
+
+       /*
+        * There is no point in sending a packet that the receiving end will
+        * never receive, so drop it immediately.  Indicate as much to the
+        * caller, using NetBSD's chosen error code.
+        */
+       if (uds_is_shutdown(target, SFL_SHUT_RD))
+               return ENOBUFS;
+
+       *peerp = target;
+       return OK;
+}
+
+/*
+ * Generate a new segment for the current send request, or arrange things such
+ * that new data can be merged with a previous segment.  As part of this,
+ * decide whether we can merge data at all.  The segment will be merged if, and
+ * only if, all of the following requirements are met:
+ *
+ *   1) the socket is of type SOCK_STREAM;
+ *   2) there is a previous segment in the receive buffer;
+ *   3) there is no ancillary data for the current send request, that is,
+ *      neither file descriptors nor credentials.
+ *
+ * Also copy in regular data (if any), retrieve the sender's credentials (if
+ * needed), and copy over the source path (if applicable).  However, do not yet
+ * commit the segment (or the new part to be merged), because the send request
+ * may still fail for other reasons.
+ *
+ * At most 'len' bytes of regular data are copied in from 'data', starting at
+ * offset 'off' into the caller's data; credentials are obtained only if 'off'
+ * is zero.  'nfds' is the number of file descriptors that will accompany the
+ * segment; it only affects the segment flags and the merge decision here.
+ *
+ * On success, return the length of the new segment (or, when merging, the
+ * length to be added to the last segment), as well as a flag indicating
+ * whether we are merging into the last segment in 'mergep', the length of the
+ * (new) data in the segment in 'datalenp', and the new segment's flags in
+ * 'segflagsp' (always zero when merging).  Note that a return value of zero
+ * implies that we are merging zero extra bytes into the last segment, which
+ * means that effectively nothing changes; in that case the send call will be
+ * cut short and return zero to the caller as well.  On failure, return a
+ * negative error code; the output parameters are not written in that case.
+ */
+static int
+uds_send_data(struct udssock * uds, struct udssock * peer,
+       const struct sockdriver_data * data, size_t len, size_t off,
+       endpoint_t user_endpt, unsigned int nfds, int * __restrict mergep,
+       size_t * __restrict datalenp, unsigned int * __restrict segflagsp)
+{
+       struct sockcred sockcred;
+       gid_t groups[NGROUPS_MAX];
+       iovec_t iov[2];
+       unsigned int iovcnt, segflags;
+       unsigned char lenbyte;
+       size_t credlen, pathlen, datalen, seglen;
+       size_t avail, pos, left;
+       int r, merge;
+
+       /*
+        * At this point we should add the data to the peer's receive buffer.
+        * In the case of SOCK_STREAM sockets, we should add as much of the
+        * data as possible and suspend the call to send the rest later, if
+        * applicable.  In the case of SOCK_DGRAM sockets, we should drop the
+        * packet if it does not fit in the buffer.
+        *
+        * Due to the checks in uds_send_test(), we know for sure that we no
+        * longer have to suspend without making any progress at this point.
+        */
+       segflags = (nfds > 0) ? UDS_HAS_FDS : 0;
+
+       /*
+        * Obtain the credentials now.  Doing so allows us to determine how
+        * much space we actually need for them.
+        */
+       if (off == 0 && (peer->uds_flags & UDSF_PASSCRED)) {
+               memset(&sockcred, 0, sizeof(sockcred));
+
+               if ((r = getsockcred(user_endpt, &sockcred, groups,
+                   __arraycount(groups))) != OK)
+                       return r;
+
+               credlen = 1 + SOCKCREDSIZE(sockcred.sc_ngroups);
+
+               segflags |= UDS_HAS_CRED;
+       } else
+               credlen = 0;
+
+       /* For bound source datagram sockets, include the source path. */
+       if (uds_get_type(uds) == SOCK_DGRAM && uds->uds_pathlen != 0) {
+               pathlen = (size_t)uds->uds_pathlen + 1;
+
+               segflags |= UDS_HAS_PATH;
+       } else
+               pathlen = 0;
+
+       avail = UDS_BUF - peer->uds_len;
+
+       if (uds_get_type(uds) == SOCK_STREAM) {
+               /*
+                * Determine whether we can merge data into the previous
+                * segment.  This is a more refined version of the test in
+                * uds_send_test(), as we now know whether there are actually
+                * any FDs to transfer.
+                */
+               merge = (peer->uds_len != 0 && nfds == 0 && credlen == 0);
+
+               /*
+                * Determine how much we can send at once.
+                * NOTE(review): the strict '>' in the assertion below presumes
+                * that at least one data byte fits after the header and the
+                * credentials.  For a zero-length send into exactly that much
+                * free space, it looks like '>=' would be needed; confirm
+                * against the checks made in uds_send_test().
+                */
+               if (!merge) {
+                       assert(avail > UDS_HDRLEN + credlen);
+                       datalen = avail - UDS_HDRLEN - credlen;
+               } else
+                       datalen = avail;
+
+               if (datalen > len)
+                       datalen = len;
+
+               /* If we cannot make progress, we should have suspended.. */
+               assert(datalen != 0 || len == 0);
+       } else {
+               merge = FALSE;
+
+               datalen = len;
+       }
+       assert(datalen <= len);
+       assert(datalen <= UDS_BUF);
+
+       /*
+        * Compute the total amount of space we need for the segment in the
+        * receive buffer.  Given that we have done will-it-fit tests in
+        * uds_send_test() for SOCK_STREAM and SOCK_SEQPACKET, there is only
+        * one case left where the result may not fit, and that is for
+        * SOCK_DGRAM packets.  In that case, we drop the packet.  POSIX says
+        * we should throw an error in that case, and that is also what NetBSD
+        * does.
+        */
+       if (!merge)
+               seglen = UDS_HDRLEN + credlen + pathlen + datalen;
+       else
+               seglen = datalen;
+
+       if (seglen > avail) {
+               assert(uds_get_type(uds) == SOCK_DGRAM);
+
+               /* Drop the packet, borrowing NetBSD's chosen error code. */
+               return ENOBUFS;
+       }
+
+       /*
+        * Generate the full segment, but do not yet update the buffer head.
+        * We may still run into an error (copying in file descriptors) or even
+        * decide that nothing gets sent after all (if there are no data or
+        * file descriptors).  If we are merging the new data into the previous
+        * segment, do not generate a header.
+        */
+       pos = uds_get_head(peer);
+
+       /* Generate the header, if needed. */
+       if (!merge)
+               pos = uds_store_hdr(peer, pos, seglen, datalen, segflags);
+       else
+               assert(segflags == 0);
+
+       /*
+        * Copy in and store the sender's credentials, if desired.  They are
+        * preceded by a single byte that holds their length.
+        */
+       if (credlen > 0) {
+               assert(credlen >= 1 + sizeof(sockcred));
+               assert(credlen <= UCHAR_MAX);
+
+               lenbyte = credlen - 1;
+               pos = uds_store(peer, pos, &lenbyte, 1);
+
+               if (sockcred.sc_ngroups > 0) {
+                       pos = uds_store(peer, pos, &sockcred,
+                           offsetof(struct sockcred, sc_groups));
+                       pos = uds_store(peer, pos, groups,
+                           sockcred.sc_ngroups * sizeof(gid_t));
+               } else
+                       pos = uds_store(peer, pos, &sockcred,
+                           sizeof(sockcred));
+       }
+
+       /*
+        * Store the sender's address if any.  Datagram sockets only.  The path
+        * is preceded by a length byte and stored without null terminator.
+        */
+       if (pathlen > 0) {
+               assert(pathlen > 1);
+               assert(pathlen <= UCHAR_MAX);
+
+               lenbyte = uds->uds_pathlen;
+               pos = uds_store(peer, pos, &lenbyte, 1);
+               pos = uds_store(peer, pos, uds->uds_path, pathlen - 1);
+       }
+
+       /*
+        * Lastly, copy in the actual data (if any) from the caller.  If the
+        * data do not fit between 'pos' and the end of the buffer array, use a
+        * second I/O vector element to continue at the start of the array.
+        */
+       if (datalen > 0) {
+               iov[0].iov_addr = (vir_bytes)&peer->uds_buf[pos];
+               left = UDS_BUF - pos;
+
+               if (left < datalen) {
+                       assert(left > 0);
+                       iov[0].iov_size = left;
+                       iov[1].iov_addr = (vir_bytes)&peer->uds_buf[0];
+                       iov[1].iov_size = datalen - left;
+                       iovcnt = 2;
+               } else {
+                       iov[0].iov_size = datalen;
+                       iovcnt = 1;
+               }
+
+               if ((r = sockdriver_vcopyin(data, off, iov, iovcnt)) != OK)
+                       return r;
+       }
+
+       *mergep = merge;
+       *datalenp = datalen;
+       *segflagsp = segflags;
+       return seglen;
+}
+
+/*
+ * Copy in control data for the current send request, and extract any file
+ * descriptors to be transferred.  Do not yet duplicate the file descriptors,
+ * but rather store a list in a temporary buffer: the send request may still
+ * fail in which case we want to avoid having to undo the duplication.
+ *
+ * On success, return the number of (zero or more) file descriptors extracted
+ * from the request and stored in the temporary buffer.  On failure, return a
+ * negative error code: ENOBUFS if the control data do not fit in the control
+ * buffer, the error from copying in the control data, or EINVAL if any of the
+ * control messages has a bogus length.
+ */
+static int
+uds_send_ctl(const struct sockdriver_data * ctl, socklen_t ctl_len,
+       endpoint_t user_endpt)
+{
+       struct msghdr msghdr;
+       struct cmsghdr *cmsg;
+       socklen_t left;
+       unsigned int i, n, nfds;
+       int r;
+
+       /*
+        * Copy in the control data.  We can spend a lot of effort copying in
+        * the data in small chunks, and change the receiving side to do the
+        * same, but it is really not worth it: applications never send a whole
+        * lot of file descriptors at once, and the buffer size is currently
+        * such that the UDS service itself will exhaust its OPEN_MAX limit
+        * anyway if they do.
+        */
+       if (ctl_len > sizeof(uds_ctlbuf))
+               return ENOBUFS;
+
+       if ((r = sockdriver_copyin(ctl, 0, uds_ctlbuf, ctl_len)) != OK)
+               return r;
+
+       /* Clear the unused remainder of the control buffer. */
+       if (ctl_len < sizeof(uds_ctlbuf))
+               memset(&uds_ctlbuf[ctl_len], 0, sizeof(uds_ctlbuf) - ctl_len);
+
+       /*
+        * Look for any file descriptors, and store their remote file
+        * descriptor numbers into a temporary array.
+        */
+       memset(&msghdr, 0, sizeof(msghdr));
+       msghdr.msg_control = uds_ctlbuf;
+       msghdr.msg_controllen = ctl_len;
+
+       nfds = 0;
+
+       /*
+        * The sender may provide file descriptors in multiple chunks.
+        * Currently we do not preserve these chunk boundaries, instead
+        * generating one single chunk with all file descriptors for the
+        * segment upon receipt.  If needed, we can fairly easily adapt this
+        * later.
+        */
+       for (cmsg = CMSG_FIRSTHDR(&msghdr); cmsg != NULL;
+           cmsg = CMSG_NXTHDR(&msghdr, cmsg)) {
+               /*
+                * Check for bogus lengths.  There is no excuse for this;
+                * either the caller does not know what they are doing or we
+                * are looking at a hacking attempt.
+                */
+               assert((socklen_t)((char *)cmsg - uds_ctlbuf) <= ctl_len);
+               left = ctl_len - (socklen_t)((char *)cmsg - uds_ctlbuf);
+               assert(left >= CMSG_LEN(0)); /* guaranteed by CMSG_xxHDR */
+
+               /*
+                * Fail the entire request right away.  Merely leaving the loop
+                * here would cause the error to be lost, and the request to
+                * proceed with the file descriptors collected so far.
+                */
+               if (cmsg->cmsg_len < CMSG_LEN(0) || cmsg->cmsg_len > left) {
+                       printf("UDS: malformed control data from %u\n",
+                           user_endpt);
+                       return EINVAL;
+               }
+
+               /* Skip anything that does not carry file descriptors. */
+               if (cmsg->cmsg_level != SOL_SOCKET ||
+                   cmsg->cmsg_type != SCM_RIGHTS)
+                       continue;
+
+               n = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
+
+               for (i = 0; i < n; i++) {
+                       /*
+                        * Copy the file descriptor to the temporary buffer,
+                        * whose size is based on the control data buffer, so
+                        * it is always large enough to contain all FDs.
+                        */
+                       assert(nfds < __arraycount(uds_ctlfds));
+
+                       memcpy(&uds_ctlfds[nfds],
+                           &((int *)CMSG_DATA(cmsg))[i], sizeof(int));
+
+                       nfds++;
+               }
+       }
+
+       return nfds;
+}
+
+/*
+ * Actually duplicate any file descriptors that we extracted from the sender's
+ * control data and stored in our temporary buffer.  On success, return OK,
+ * with all file descriptors stored in file descriptor objects that are
+ * appended to the socket's list of in-flight FD objects.  Thus, on success,
+ * the send request may no longer fail.  On failure, return a negative error
+ * code, with any partial duplication undone.
+ *
+ * 'peer' is the receiving socket, 'nfds' is the number of file descriptors in
+ * the temporary buffer, and 'user_endpt' is the endpoint of the process that
+ * the file descriptors are copied from.  If 'nfds' is zero, there is nothing
+ * to do and the call succeeds immediately.
+ */
+static int
+uds_send_fds(struct udssock * peer, unsigned int nfds, endpoint_t user_endpt)
+{
+       SIMPLEQ_HEAD(, uds_fd) fds;
+       struct uds_fd *ufd;
+       unsigned int i;
+       int r;
+
+       /*
+        * Without any file descriptors, the loop below would leave 'r' unset
+        * and the local list empty, so that there would be no first object to
+        * store the count in.
+        */
+       if (nfds == 0)
+               return OK;
+
+       SIMPLEQ_INIT(&fds);
+
+       r = OK;
+
+       for (i = 0; i < nfds; i++) {
+               if (SIMPLEQ_EMPTY(&uds_freefds)) {
+                       /* UDS itself may already have OPEN_MAX FDs. */
+                       r = ENFILE;
+                       break;
+               }
+
+               /*
+                * The caller may have given an invalid FD, or UDS itself may
+                * unexpectedly have run out of available file descriptors etc.
+                */
+               if ((r = copyfd(user_endpt, uds_ctlfds[i], COPYFD_FROM)) < 0)
+                       break;
+
+               /* Take a free object to hold the local file descriptor. */
+               ufd = SIMPLEQ_FIRST(&uds_freefds);
+               SIMPLEQ_REMOVE_HEAD(&uds_freefds, ufd_next);
+
+               ufd->ufd_fd = r;
+               ufd->ufd_count = 0;
+
+               SIMPLEQ_INSERT_TAIL(&fds, ufd, ufd_next);
+
+               dprintf(("UDS: copied in fd %d -> %d\n", uds_ctlfds[i], r));
+       }
+
+       /* Did we experience an error while copying in the file descriptors? */
+       if (r < 0) {
+               /* Revert the successful copyfd() calls made so far. */
+               SIMPLEQ_FOREACH(ufd, &fds, ufd_next) {
+                       dprintf(("UDS: closing local fd %d\n", ufd->ufd_fd));
+
+                       closenb(ufd->ufd_fd);
+               }
+
+               /* Give the objects we took back to the free list. */
+               SIMPLEQ_CONCAT(&uds_freefds, &fds);
+
+               return r;
+       }
+
+       /*
+        * Success.  Add the file descriptors to the peer's list of in-flight
+        * file descriptors.  Assign the number of file descriptors copied in
+        * to the first file descriptor object, so that we know how many to
+        * copy out (or discard) for this segment.  The UDS_HAS_FDS flag for
+        * the segment is not set here; uds_send_data() includes it in the
+        * segment flags based on the same file descriptor count.
+        */
+       ufd = SIMPLEQ_FIRST(&fds);
+       ufd->ufd_count = nfds;
+
+       SIMPLEQ_CONCAT(&peer->uds_fds, &fds);
+
+       return OK;
+}
+
+/*
+ * The current send request is successful or at least has made progress.
+ * Commit the new segment or, if we decided to merge the new data into the last
+ * segment, update the header of the last segment.  Also wake up the receiving
+ * side, because there will now be new data to receive.
+ *
+ * 'uds' is the sending socket and 'peer' the receiving socket.  'datalen',
+ * 'merge', 'seglen' and 'segflags' describe the prepared segment: the length
+ * of its data, whether it is merged into the last segment, the number of bytes
+ * by which the receive buffer grows, and the segment flags, respectively.
+ * When merging, the header of the last segment must be fetched before the
+ * buffer length is increased, and its flags are written back unchanged.
+ */
+static void
+uds_send_advance(struct udssock * uds, struct udssock * peer, size_t datalen,
+       int merge, size_t seglen, unsigned int segflags)
+{
+       size_t pos, prevseglen, prevdatalen;
+
+       /*
+        * For non-datagram sockets, credentials are sent only once after
+        * setting the LOCAL_CREDS option.  After that, the option is unset.
+        */
+       if ((segflags & UDS_HAS_CRED) && uds_get_type(uds) != SOCK_DGRAM)
+               peer->uds_flags &= ~UDSF_PASSCRED;
+
+       if (merge) {
+               assert(segflags == 0);
+
+               pos = uds_get_last(peer); /* position of last segment header */
+
+               (void)uds_fetch_hdr(peer, pos, &prevseglen, &prevdatalen,
+                   &segflags); /* 'segflags' now has the old segment's flags */
+
+               peer->uds_len += seglen;
+               assert(peer->uds_len <= UDS_BUF);
+
+               seglen += prevseglen; /* new total length of last segment */
+               datalen += prevdatalen; /* new data length of last segment */
+               assert(seglen <= UDS_BUF);
+
+               uds_store_hdr(peer, pos, seglen, datalen, segflags);
+       } else {
+               peer->uds_last = peer->uds_len; /* new segment is now last */
+
+               peer->uds_len += seglen;
+               assert(peer->uds_len <= UDS_BUF);
+       }
+
+       /* Now that there are new data, wake up the receiver side. */
+       sockevent_raise(&peer->uds_sock, SEV_RECV);
+}
+
+/*
+ * Process a send request.  Return OK if the send request has successfully
+ * completed, SUSPEND if it should be tried again later, or a negative error
+ * code on failure.  In all cases, the values of 'off' and 'ctl_off' must be
+ * updated if any progress has been made; if either is non-zero, libsockevent
+ * will return the partial progress rather than an error code.
+ *
+ * The request is processed in stages: test whether the call can proceed at
+ * all, find the peer socket, copy in the control data, prepare the segment,
+ * duplicate any file descriptors, and only then commit the result.  Nothing
+ * is committed to the peer's receive buffer before that last stage.
+ */
+int
+uds_send(struct sock * sock, const struct sockdriver_data * data, size_t len,
+       size_t * off, const struct sockdriver_data * ctl, socklen_t ctl_len,
+       socklen_t * ctl_off, const struct sockaddr * addr, socklen_t addr_len,
+       endpoint_t user_endpt, int flags __unused, size_t min)
+{
+       struct udssock *uds = (struct udssock *)sock;
+       struct udssock *peer;
+       size_t seglen, datalen = 0 /*gcc*/;
+       unsigned int nfds, segflags = 0 /*gcc*/;
+       int r, partial, merge = 0 /*gcc*/;
+
+       dprintf(("UDS: send(%d,%zu,%zu,%u,%u,0x%x)\n",
+           uds_get_id(uds), len, (off != NULL) ? *off : 0, ctl_len,
+           (ctl_off != NULL) ? *ctl_off : 0, flags));
+
+       partial = (off != NULL && *off > 0); /* resuming an earlier send? */
+
+       /*
+        * First see whether we can process this send call at all right now.
+        * Most importantly, for connected sockets, if the peer's receive
+        * buffer is full, we may have to suspend the call until some space has
+        * been freed up.
+        */
+       if ((r = uds_send_test(uds, len, ctl_len, min, partial)) != OK)
+               return r;
+
+       /*
+        * Then get the peer socket.  For connected sockets, this is trivial.
+        * For unconnected sockets, it may involve a lookup of the given
+        * address.
+        */
+       if ((r = uds_send_peer(uds, addr, addr_len, user_endpt, &peer)) != OK)
+               return r;
+
+       /*
+        * We now know for sure that we will not suspend this call without
+        * making any progress.  However, the call may still fail.  Copy in
+        * control data first now, so that we know whether there are any file
+        * descriptors to transfer.  This aspect may determine whether or not
+        * we can merge data with a previous segment.  Do not actually copy in
+        * the actual file descriptors yet, because that is much harder to undo
+        * in case of a failure later on.
+        */
+       if (ctl_len > 0) {
+               /* We process control data once, in full. */
+               assert(*ctl_off == 0);
+
+               if ((r = uds_send_ctl(ctl, ctl_len, user_endpt)) < 0)
+                       return r;
+               nfds = (unsigned int)r;
+       } else
+               nfds = 0;
+
+       /*
+        * Now generate a new segment, or (if possible) merge new data into the
+        * last segment.  Since the call may still fail, prepare the segment
+        * but do not update the buffer head yet.  Note that the segment
+        * contains not just regular data (in fact it may contain no data at
+        * all) but (also) certain ancillary data.  A result of zero means that
+        * zero bytes would be merged into the last segment; in that case the
+        * call ends here as well, returning that zero.
+        */
+       if ((r = uds_send_data(uds, peer, data, len, *off, user_endpt, nfds,
+           &merge, &datalen, &segflags)) <= 0)
+               return r;
+       seglen = (size_t)r;
+
+       /*
+        * If we extracted any file descriptors from the control data earlier,
+        * copy them over to ourselves now.  The resulting in-flight file
+        * descriptors are stored in a separate data structure.  This is the
+        * last point where the send call may actually fail.
+        */
+       if (nfds > 0) {
+               if ((r = uds_send_fds(peer, nfds, user_endpt)) != OK)
+                       return r;
+       }
+
+       /*
+        * The transmission is now known to be (partially) successful.  Commit
+        * the new work by moving the receive buffer head.
+        */
+       uds_send_advance(uds, peer, datalen, merge, seglen, segflags);
+
+       /*
+        * Register the result.  For stream-type sockets, the expected behavior
+        * is that all data be sent, and so we may still have to suspend the
+        * call after partial progress.  Otherwise, we are now done.  Either
+        * way, we are done with the control data, so mark it as consumed.
+        *
+        * NOTE(review): 'off' and 'ctl_off' are dereferenced unconditionally
+        * here and in the uds_send_data() call above, while the debug output
+        * and the 'partial' computation allow for them to be NULL.  Presumably
+        * libsockevent always provides both pointers; confirm.
+        */
+       *off += datalen;
+       *ctl_off += ctl_len;
+       if (uds_get_type(uds) == SOCK_STREAM && datalen < len)
+               return SUSPEND;
+       else
+               return OK;
+}
+
+/*
+ * Check whether a send request on the given socket would block right now.
+ * The 'min' parameter is the low send watermark: the minimum number of bytes
+ * that must be possible to send without blocking.  Return SUSPEND if a send
+ * request would block, or any other result code if it would not.
+ */
+int
+uds_test_send(struct sock * sock, size_t min)
+{
+
+       /*
+        * Pretend to send 'min' bytes of data, without any control data, as a
+        * new (rather than resumed) request.
+        */
+       return uds_send_test((struct udssock *)sock, min, 0 /*ctl_len*/, min,
+           FALSE /*partial*/);
+}
+
+/*
+ * Validate a receive request up front, before the request may end up being
+ * suspended.  Return OK if the request is acceptable, or a negative error
+ * code if it is not.
+ */
+int
+uds_pre_recv(struct sock * sock __unused, endpoint_t user_endpt __unused,
+       int flags)
+{
+       const int accepted = MSG_PEEK | MSG_WAITALL | MSG_CMSG_CLOEXEC;
+
+       /*
+        * Any flag outside the accepted set causes the call to be rejected.
+        * TODO: ensure that we should really reject all other flags rather
+        * than ignore them.
+        */
+       return ((flags & ~accepted) != 0) ? EOPNOTSUPP : OK;
+}
+
+/*
+ * Determine whether the (real or pretend) receive request should be processed
+ * now, suspended until later, or rejected based on the current socket state.
+ * Return OK if the receive request should be processed now, along with a first
+ * indication whether the call may still be suspended later in 'may_block'.
+ * Return SUSPEND if the receive request should be retried later.  Return an
+ * appropriate negative error code if the receive request should fail.
+ */
+static int
+uds_recv_test(struct udssock * uds, size_t len, size_t min, int partial,
+       int * may_block)
+{
+       size_t seglen, datalen;
+       unsigned int segflags;
+       int r;
+
+       /*
+        * If there are any pending data, those should always be received
+        * first.  However, if there is nothing to receive, then whether we
+        * should suspend the receive call or fail immediately depends on other
+        * conditions.  We first look at these other conditions.
+        */
+       r = OK;
+
+       if (uds_get_type(uds) != SOCK_DGRAM) {
+               if (uds_is_connecting(uds))
+                       r = SUSPEND;
+               else if (!uds_is_connected(uds) && !uds_is_disconnected(uds))
+                       r = ENOTCONN;
+               else if (!uds_has_conn(uds) ||
+                   uds_is_shutdown(uds->uds_conn, SFL_SHUT_WR))
+                       r = SOCKEVENT_EOF;
+       }
+
+       if (uds->uds_len == 0) {
+               /*
+                * For stream-type sockets, we use the policy: if no regular
+                * data is requested, then end the call without receiving
+                * anything.  For packet-type sockets, the request should block
+                * until there is a packet to discard, though.
+                */
+               if (r != OK || (uds_get_type(uds) == SOCK_STREAM && len == 0))
+                       return r;
+
+               return SUSPEND;
+       }
+
+       /*
+        * For stream-type sockets, we should still suspend the call if fewer
+        * than 'min' bytes are available right now, and there is a possibility
+        * that more data may arrive later.  More may arrive later iff 'r' is
+        * OK (i.e., no EOF or error will follow) and, in case we already
+        * received some partial results, there is not already a next segment
+        * with ancillary data (i.e., nonzero segment flags), or in any case
+        * there isn't more than one segment in the buffer.  Limit 'min' to the
+        * maximum that can ever be received, though.  Since that is difficult
+        * in our case, we check whether the buffer is entirely full instead.
+        */
+       if (r == OK && uds_get_type(uds) == SOCK_STREAM && min > 0 &&
+           uds->uds_len < UDS_BUF) {
+               assert(uds->uds_len >= UDS_HDRLEN);
+
+               (void)uds_fetch_hdr(uds, uds->uds_tail, &seglen, &datalen,
+                   &segflags);
+
+               if (datalen < min && seglen == uds->uds_len &&
+                   (!partial || segflags == 0))
+                       return SUSPEND;
+       }
+
+       /*
+        * Also start the decision process as to whether we should suspend the
+        * current call if MSG_WAITALL is given.  Unfortunately there is no one
+        * place where we can conveniently do all the required checks.
+        */
+       if (may_block != NULL)
+               *may_block = (r == OK && uds_get_type(uds) == SOCK_STREAM);
+       return OK;
+}
+
+/*
+ * Receive regular data, and possibly the source path, from the tail segment in
+ * the receive buffer.  On success, return the positive non-zero length of the
+ * tail segment, with 'addr' and 'addr_len' modified to store the source
+ * address if applicable, the result flags in 'rflags' updated as appropriate,
+ * the tail segment's data length stored in 'datalen', the number of received
+ * regular data bytes stored in 'reslen', the segment flags stored in
+ * 'segflags', and the absolute receive buffer position of the credentials in
+ * the segment stored in 'credpos' if applicable.  Since the receive call may
+ * still fail, this function must not yet update the tail or any other aspect
+ * of the receive buffer.  Return zero if the current receive call was already
+ * partially successful (due to MSG_WAITALL) and can no longer make progress,
+ * and thus should be ended.  Return a negative error code on failure.
+ */
+static int
+uds_recv_data(struct udssock * uds, const struct sockdriver_data * data,
+       size_t len, size_t off, struct sockaddr * addr, socklen_t * addr_len,
+       int * __restrict rflags, size_t * __restrict datalen,
+       size_t * __restrict reslen, unsigned int * __restrict segflags,
+       size_t * __restrict credpos)
+{
+       iovec_t iov[2];
+       unsigned char lenbyte;
+       unsigned int iovcnt;
+       size_t pos, seglen, left;
+       int r;
+
+       pos = uds_fetch_hdr(uds, uds->uds_tail, &seglen, datalen, segflags);
+
+       /*
+        * If a partially completed receive now runs into a segment that cannot
+        * be logically merged with the previous one (because it has at least
+        * one segment flag set, meaning it has ancillary data), then we must
+        * shortcut the receive now.
+        */
+       if (off != 0 && *segflags != 0)
+               return OK;
+
+       /*
+        * As stated, for stream-type sockets, we choose to ignore zero-size
+        * receive calls.  This has the consequence that reading a zero-sized
+        * segment (with ancillary data) requires a receive request for at
+        * least one regular data byte.  Such a receive call would then return
+        * zero.  The problem with handling zero-data receive requests is that
+        * we need to know whether the current segment is terminated (i.e., no
+        * more data can possibly be merged into it later), which is a test
+        * that we would rather not perform, not least because we do not know
+        * whether there is an error pending on the socket.
+        *
+        * For datagrams, we currently allow a zero-size receive call to
+        * discard the next datagram.
+        *
+        * TODO: compare this against policies on other platforms.
+        */
+       if (len == 0 && uds_get_type(uds) == SOCK_STREAM)
+               return OK;
+
+       /*
+        * We have to skip the credentials for now: these are copied out as
+        * control data, and thus will (well, may) be looked at when dealing
+        * with the control data.  For the same reason, we do not even look at
+        * UDS_HAS_FDS here.
+        */
+       if (*segflags & UDS_HAS_CRED) {
+               *credpos = pos;
+
+               pos = uds_fetch(uds, pos, &lenbyte, 1);
+               pos = uds_advance(pos, (size_t)lenbyte);
+       }
+
+       /*
+        * Copy out the source address, but only if the (datagram) socket is
+        * not connected.  TODO: even when it is connected, it may still
+        * receive packets sent to it from other sockets *before* being
+        * connected, and the receiver has no way of knowing that those packets
+        * did not come from its new peer.  Ideally, the older packets should
+        * be dropped.
+        */
+       if (*segflags & UDS_HAS_PATH) {
+               pos = uds_fetch(uds, pos, &lenbyte, 1);
+
+               if (uds_get_type(uds) == SOCK_DGRAM && !uds_has_link(uds))
+                       uds_make_addr((const char *)&uds->uds_buf[pos],
+                           (size_t)lenbyte, addr, addr_len);
+
+               pos = uds_advance(pos, (size_t)lenbyte);
+       }
+
+       /*
+        * We can receive no more data than those that are present in the
+        * segment, obviously.  For stream-type sockets, any more data that
+        * could have been received along with the current data would have been
+        * merged in the current segment, so we need not search for any next
+        * segments.
+        *
+        * For non-stream sockets, the caller may receive less than a whole
+        * packet if it supplied a small buffer.  In that case, the rest of the
+        * packet will be discarded (but not here yet!) and the caller gets
+        * the MSG_TRUNC flag in its result, if it was using recvmsg(2) anyway.
+        */
+       if (len > *datalen)
+               len = *datalen;
+       else if (len < *datalen && uds_get_type(uds) != SOCK_STREAM)
+               *rflags |= MSG_TRUNC;
+
+       /* Copy out the data to the caller. */
+       if (len > 0) {
+               iov[0].iov_addr = (vir_bytes)&uds->uds_buf[pos];
+               left = UDS_BUF - pos;
+
+               if (left < len) {
+                       iov[0].iov_size = left;
+                       iov[1].iov_addr = (vir_bytes)&uds->uds_buf[0];
+                       iov[1].iov_size = len - left;
+                       iovcnt = 2;
+               } else {
+                       iov[0].iov_size = len;
+                       iovcnt = 1;
+               }
+
+               if ((r = sockdriver_vcopyout(data, off, iov, iovcnt)) != OK)
+                       return r;
+       }
+
+       *reslen = len;
+       assert(seglen > 0 && seglen <= INT_MAX);
+       return (int)seglen;
+}
+
+/*
+ * The current segment has associated file descriptors.  If possible, copy out
+ * all file descriptors to the receiver, and generate and copy out a chunk of
+ * control data that contains their file descriptor numbers.  If not all
+ * file descriptors fit in the receiver's buffer, or if any error occurs, no
+ * file descriptors are copied out.
+ */
+static int
+uds_recv_fds(struct udssock * uds, const struct sockdriver_data * ctl,
+       socklen_t ctl_len, socklen_t ctl_off, endpoint_t user_endpt, int flags)
+{
+       struct msghdr msghdr;
+       struct cmsghdr *cmsg;
+       struct uds_fd *ufd;
+       unsigned int i, nfds;
+       socklen_t chunklen, chunkspace;
+       int r, fd, what;
+
+       /* See how many file descriptors should be part of this chunk. */
+       assert(!SIMPLEQ_EMPTY(&uds->uds_fds));
+       ufd = SIMPLEQ_FIRST(&uds->uds_fds);
+       nfds = ufd->ufd_count;
+       assert(nfds > 0);
+
+       /*
+        * We produce and copy out potentially unaligned chunks, using
+        * CMSG_LEN, but return the aligned size at the end, using CMSG_SPACE.
+        * This may leave "gap" bytes unchanged in userland, but that should
+        * not be a problem.  By producing unaligned chunks, we eliminate a
+        * potential boundary case where the unaligned chunk passed in (by the
+        * sender) no longer fits in the same buffer after being aligned here.
+        */
+       chunklen = CMSG_LEN(sizeof(int) * nfds);
+       chunkspace = CMSG_SPACE(sizeof(int) * nfds);
+       assert(chunklen <= sizeof(uds_ctlbuf));
+       if (chunklen > ctl_len)
+               return 0; /* chunk would not fit, so produce nothing instead */
+       if (chunkspace > ctl_len)
+               chunkspace = ctl_len;
+
+       memset(&msghdr, 0, sizeof(msghdr));
+       msghdr.msg_control = uds_ctlbuf;
+       msghdr.msg_controllen = sizeof(uds_ctlbuf);
+
+       memset(uds_ctlbuf, 0, chunklen);
+       cmsg = CMSG_FIRSTHDR(&msghdr);
+       cmsg->cmsg_len = chunklen;
+       cmsg->cmsg_level = SOL_SOCKET;
+       cmsg->cmsg_type = SCM_RIGHTS;
+
+       /*
+        * Copy the group's local file descriptors to the target endpoint, and
+        * store the resulting remote file descriptors in the chunk buffer.
+        */
+       r = OK;
+
+       for (i = 0; i < nfds; i++) {
+               assert(ufd != SIMPLEQ_END(&uds->uds_fds));
+               assert(i == 0 || ufd->ufd_count == 0);
+
+               what = COPYFD_TO;
+               if (flags & MSG_CMSG_CLOEXEC)
+                       what |= COPYFD_CLOEXEC;
+
+               /* Failure may happen legitimately here (e.g., EMFILE). */
+               if ((r = copyfd(user_endpt, ufd->ufd_fd, what)) < 0)
+                       break; /* we keep our progress so far in 'i' */
+
+               fd = r;
+
+               dprintf(("UDS: copied out fd %d -> %d\n", ufd->ufd_fd, fd));
+
+               memcpy(&((int *)CMSG_DATA(cmsg))[i], &fd, sizeof(int));
+
+               ufd = SIMPLEQ_NEXT(ufd, ufd_next);
+       }
+
+       /* If everything went well so far, copy out the produced chunk. */
+       if (r >= 0)
+               r = sockdriver_copyout(ctl, ctl_off, uds_ctlbuf, chunklen);
+
+       /*
+        * Handle errors.  At this point, the 'i' variable contains the number
+        * of file descriptors that have already been successfully copied out.
+        */
+       if (r < 0) {
+               /* Revert the successful copyfd() calls made so far. */
+               while (i-- > 0) {
+                       memcpy(&fd, &((int *)CMSG_DATA(cmsg))[i], sizeof(int));
+
+                       (void)copyfd(user_endpt, fd, COPYFD_CLOSE);
+               }
+
+               return r;
+       }
+
+       /*
+        * Success.  Return the aligned size of the produced chunk, if the
+        * given length permits it.  From here on, the receive call may no
+        * longer fail, as that would result in lost file descriptors.
+        */
+       return chunkspace;
+}
+
+/*
+ * Generate and copy out a chunk of control data with the sender's credentials.
+ * Return the aligned chunk size on success, or a negative error code on
+ * failure.
+ */
+static int
+uds_recv_cred(struct udssock * uds, const struct sockdriver_data * ctl,
+       socklen_t ctl_len, socklen_t ctl_off, size_t credpos)
+{
+       struct msghdr msghdr;
+       struct cmsghdr *cmsg;
+       socklen_t chunklen, chunkspace;
+       unsigned char lenbyte;
+       size_t credlen;
+       int r;
+
+       /*
+        * Since the sender side already did the hard work of producing the
+        * (variable-size) sockcred structure as it should be received, there
+        * is relatively little work to be done here.
+        */
+       credpos = uds_fetch(uds, credpos, &lenbyte, 1);
+       credlen = (size_t)lenbyte;
+
+       chunklen = CMSG_LEN(credlen);
+       chunkspace = CMSG_SPACE(credlen);
+       assert(chunklen <= sizeof(uds_ctlbuf));
+       if (chunklen > ctl_len)
+               return 0; /* chunk would not fit, so produce nothing instead */
+       if (chunkspace > ctl_len)
+               chunkspace = ctl_len;
+
+       memset(&msghdr, 0, sizeof(msghdr));
+       msghdr.msg_control = uds_ctlbuf;
+       msghdr.msg_controllen = sizeof(uds_ctlbuf);
+
+       memset(uds_ctlbuf, 0, chunklen);
+       cmsg = CMSG_FIRSTHDR(&msghdr);
+       cmsg->cmsg_len = chunklen;
+       cmsg->cmsg_level = SOL_SOCKET;
+       cmsg->cmsg_type = SCM_CREDS;
+
+       uds_fetch(uds, credpos, CMSG_DATA(cmsg), credlen);
+
+       if ((r = sockdriver_copyout(ctl, ctl_off, uds_ctlbuf, chunklen)) != OK)
+               return r;
+
+       return chunkspace;
+}
+
+/*
+ * Copy out control data for the ancillary data associated with the current
+ * segment, if any.  Return OK on success, at which point the current receive
+ * call may no longer fail.  'rflags' may be updated with additional result
+ * flags.  Return a negative error code on failure.
+ */
+static int
+uds_recv_ctl(struct udssock * uds, const struct sockdriver_data * ctl,
+       socklen_t ctl_len, socklen_t * ctl_off, endpoint_t user_endpt,
+       int flags, unsigned int segflags, size_t credpos, int * rflags)
+{
+       int r;
+
+       /*
+        * We first copy out all file descriptors, if any.  We put them in one
+        * SCM_RIGHTS chunk, even if the sender put them in separate SCM_RIGHTS
+        * chunks.  We believe that this should not cause application-level
+        * issues, but if it does, we can change that later with some effort.
+        * We then copy out credentials, if any.
+        *
+        * We copy out each control chunk independently of the others, and also
+        * perform error recovery on a per-chunk basis.  This implies the
+        * following.  If producing or copying out the first chunk fails, the
+        * entire recvmsg(2) call will fail with an appropriate error.  If
+        * producing or copying out any subsequent chunk fails, the recvmsg(2)
+        * call will still return the previously generated chunks (a "short
+        * control read" if you will) as well as the MSG_CTRUNC flag.  This
+        * approach is simple and clean, and it guarantees that we can always
+        * copy out at least as many file descriptors as we copied in for this
+        * segment, even if credentials are present as well.  However, the
+        * approach does cause slightly more overhead when there are multiple
+        * chunks per call, as those are copied out separately.
+        *
+        * Since the generated SCM_RIGHTS chunk is never larger than the
+        * originally received SCM_RIGHTS chunk, the temporary "uds_ctlbuf"
+        * buffer is always large enough to contain the chunk in its entirety.
+        * SCM_CREDS chunks should always fit easily as well.
+        *
+        * The MSG_CTRUNC flag will be returned iff not the entire user-given
+        * control buffer was filled and not all control chunks were delivered.
+        * Our current implementation does not deliver partial chunks.  NetBSD
+        * does, except for SCM_RIGHTS chunks.
+        *
+        * TODO: get rid of the redundancy in processing return values.
+        */
+       if (segflags & UDS_HAS_FDS) {
+               r = uds_recv_fds(uds, ctl, ctl_len, *ctl_off, user_endpt,
+                   flags);
+
+               /*
+                * At this point, 'r' contains one of the following:
+                *
+                *   r > 0      a chunk of 'r' bytes was added successfully.
+                *   r == 0     not enough space left; the chunk was not added.
+                *   r < 0      an error occurred; the chunk was not added.
+                */
+               if (r < 0 && *ctl_off == 0)
+                       return r;
+
+               if (r > 0) {
+                       ctl_len -= r;
+                       *ctl_off += r;
+               } else
+                       *rflags |= MSG_CTRUNC;
+       }
+
+       if (segflags & UDS_HAS_CRED) {
+               r = uds_recv_cred(uds, ctl, ctl_len, *ctl_off, credpos);
+
+               /* As above. */
+               if (r < 0 && *ctl_off == 0)
+                       return r;
+
+               if (r > 0) {
+                       ctl_len -= r;
+                       *ctl_off += r;
+               } else
+                       *rflags |= MSG_CTRUNC;
+       }
+
+       return OK;
+}
+
+/*
+ * The current receive request is successful or, in the case of MSG_WAITALL,
+ * has made progress.  Advance the receive buffer tail, either by discarding
+ * the entire tail segment or by generating a new, smaller tail segment that
+ * contains only the regular data left to be received from the original tail
+ * segment.  Also wake up the sending side for connection-oriented sockets if
+ * applicable, because there may now be room for more data to be sent.  Update
+ * 'may_block' if we are now sure that the call may not block on MSG_WAITALL
+ * after all.
+ */
+static void
+uds_recv_advance(struct udssock * uds, size_t seglen, size_t datalen,
+       size_t reslen, unsigned int segflags, int * may_block)
+{
+       struct udssock *conn;
+       struct uds_fd *ufd;
+       size_t delta, nseglen, advance;
+       unsigned int nfds;
+
+       /* Note that 'reslen' may be legitimately zero. */
+       assert(reslen <= datalen);
+
+       if (uds_get_type(uds) != SOCK_STREAM && reslen < datalen)
+               reslen = datalen;
+
+       delta = datalen - reslen;
+
+       if (delta == 0) {
+               /*
+                * Fully consume the tail segment.  We advance the tail by the
+                * full segment length, thus moving up to either the next
+                * segment in the receive buffer, or an empty receive buffer.
+                */
+               advance = seglen;
+
+               uds->uds_tail = uds_advance(uds->uds_tail, advance);
+       } else {
+               /*
+                * Partially consume the tail segment.  We put a new segment
+                * header right in front of the remaining data, which obviously
+                * always fits.  Since any ancillary data was consumed along
+                * with the first data byte of the segment, the new segment has
+                * no ancillary data anymore (and thus a zero flags field).
+                */
+               nseglen = UDS_HDRLEN + delta;
+               assert(nseglen < seglen);
+
+               advance = seglen - nseglen;
+
+               uds->uds_tail = uds_advance(uds->uds_tail, advance);
+
+               uds_store_hdr(uds, uds->uds_tail, nseglen, delta, 0);
+       }
+
+       /*
+        * For datagram-oriented sockets, we always consume at least a header.
+        * For stream-type sockets, we either consume a zero-data segment along
+        * with its ancillary data, or we consume at least one byte from a
+        * segment that does have regular data.  In all other cases, the
+        * receive call has already been ended by now.  Thus, we always advance
+        * the tail of the receive buffer here.
+        */
+       assert(advance > 0);
+
+       /*
+        * The receive buffer's used length (uds_len) and pointer to the
+        * previous segment header (uds_last) are offsets from the tail.  Now
+        * that we have moved the tail, we need to adjust these accordingly.
+        * If the buffer is now empty, reset the tail to the buffer start so as
+        * to avoid splitting inter-process copies whenever possible.
+        */
+       assert(uds->uds_len >= advance);
+       uds->uds_len -= advance;
+
+       if (uds->uds_len == 0)
+               uds->uds_tail = 0;
+
+       /*
+        * If uds_last is zero here, it was pointing to the segment we just
+        * (partially) consumed.  By leaving it zero, it will still point to
+        * the new or next segment.
+        */
+       if (uds->uds_last > 0) {
+               assert(uds->uds_len > 0);
+               assert(uds->uds_last >= advance);
+               uds->uds_last -= advance;
+       }
+
+       /*
+        * If there were any file descriptors associated with this segment,
+        * close and free them now.
+        */
+       if (segflags & UDS_HAS_FDS) {
+               assert(!SIMPLEQ_EMPTY(&uds->uds_fds));
+               ufd = SIMPLEQ_FIRST(&uds->uds_fds);
+               nfds = ufd->ufd_count;
+               assert(nfds > 0);
+
+               while (nfds-- > 0) {
+                       assert(!SIMPLEQ_EMPTY(&uds->uds_fds));
+                       ufd = SIMPLEQ_FIRST(&uds->uds_fds);
+                       SIMPLEQ_REMOVE_HEAD(&uds->uds_fds, ufd_next);
+
+                       dprintf(("UDS: closing local fd %d\n", ufd->ufd_fd));
+
+                       closenb(ufd->ufd_fd);
+
+                       SIMPLEQ_INSERT_TAIL(&uds_freefds, ufd, ufd_next);
+               }
+       }
+
+       /*
+        * If there is now any data left in the receive buffer, then there has
+        * been a reason that we haven't received it.  For stream sockets, that
+        * reason is that the next segment has ancillary data.  In any case,
+        * this means we should never block the current receive operation
+        * waiting for more data.  Otherwise, we may block on MSG_WAITALL.
+        */
+       if (uds->uds_len > 0)
+               *may_block = FALSE;
+
+       /*
+        * If the (non-datagram) socket has a peer that is not shut down for
+        * writing, see if it can be woken up to send more data.  Note that
+        * the event will never be processed immediately.
+        */
+       if (uds_is_connected(uds)) {
+               assert(uds_get_type(uds) != SOCK_DGRAM);
+
+               conn = uds->uds_conn;
+
+               if (!uds_is_shutdown(conn, SFL_SHUT_WR))
+                       sockevent_raise(&conn->uds_sock, SEV_SEND);
+       }
+}
+
+/*
+ * Process a receive request.  Return OK if the receive request has completed
+ * successfully, SUSPEND if it should be tried again later, SOCKEVENT_EOF if an
+ * end-of-file condition is reached, or a negative error code on failure.  In
+ * all cases, the values of 'off' and 'ctl_off' must be updated if any progress
+ * has been made; if either is non-zero, libsockevent will return the partial
+ * progress rather than an error code or EOF.
+ */
+int
+uds_recv(struct sock * sock, const struct sockdriver_data * data, size_t len,
+       size_t * off, const struct sockdriver_data * ctl, socklen_t ctl_len,
+       socklen_t * ctl_off, struct sockaddr * addr, socklen_t * addr_len,
+       endpoint_t user_endpt, int flags, size_t min, int * rflags)
+{
+       struct udssock *uds = (struct udssock *)sock;
+       size_t seglen, datalen, reslen = 0 /*gcc*/, credpos = 0 /*gcc*/;
+       unsigned int segflags;
+       int r, partial, may_block;
+
+       dprintf(("UDS: recv(%d,%zu,%zu,%u,%u,0x%x)\n",
+           uds_get_id(uds), len, (off != NULL) ? *off : 0, ctl_len,
+           (ctl_off != NULL) ? *ctl_off : 0, flags));
+
+       /*
+        * Start by testing whether anything can be received at all, or whether
+        * an error or EOF should be returned instead, or whether the receive
+        * call should be suspended until later otherwise.  If no (regular or
+        * control) data can be received, or if this was a test for select,
+        * we bail out right after.
+        */
+       partial = (off != NULL && *off > 0);
+
+       if ((r = uds_recv_test(uds, len, min, partial, &may_block)) != OK)
+               return r;
+
+       /*
+        * Copy out regular data, if any.  Do this before copying out control
+        * data, because the latter is harder to undo on failure.  This data
+        * copy function returns OK (0) if we are to return a result of
+        * zero bytes (which is *not* EOF) to the caller without doing anything
+        * else.  The function returns a nonzero positive segment length if we
+        * should carry on with the receive call (as it happens, all its other
+        * returned values may in fact be zero).
+        */
+       if ((r = uds_recv_data(uds, data, len, *off, addr, addr_len, rflags,
+           &datalen, &reslen, &segflags, &credpos)) <= 0)
+               return r;
+       seglen = (size_t)r;
+
+       /*
+        * Copy out control data, if any: transfer and copy out records of file
+        * descriptors, and/or copy out sender credentials.  This is the last
+        * part of the call that may fail.
+        */
+       if ((r = uds_recv_ctl(uds, ctl, ctl_len, ctl_off, user_endpt, flags,
+           segflags, credpos, rflags)) != OK)
+               return r;
+
+       /*
+        * Now that the call has succeeded, move the tail of the receive
+        * buffer, unless we were merely peeking.
+        */
+       if (!(flags & MSG_PEEK))
+               uds_recv_advance(uds, seglen, datalen, reslen, segflags,
+                   &may_block);
+       else
+               may_block = FALSE;
+
+       /*
+        * If the MSG_WAITALL flag was given, we may still have to suspend the
+        * call after partial success.  In particular, the receive call may
+        * suspend after partial success if all of these conditions are met:
+        *
+        *   1) the socket is a stream-type socket;
+        *   2) MSG_WAITALL is set;
+        *   3) MSG_PEEK is not set;
+        *   4) MSG_DONTWAIT is not set (tested upon return);
+        *   5) the socket must not have a pending error (tested upon return);
+        *   6) the socket must not be shut down for reading (tested later);
+        *   7) the socket must still be connected to a peer (no EOF);
+        *   8) the peer must not have been shut down for writing (no EOF);
+        *   9) the next segment, if any, contains no ancillary data.
+        *
+        * Together, these points guarantee that the call could conceivably
+        * receive more after being resumed.  Points 4 to 6 are covered by
+        * libsockevent, which will end the call even if we return SUSPEND
+        * here.  Due to segment merging, we cover point 9 by checking that
+        * there is currently no next segment at all.  Once a new segment
+        * arrives, the ancillary-data test is done then.
+        */
+       *off += reslen;
+       if ((flags & MSG_WAITALL) && reslen < len && may_block)
+               return SUSPEND;
+       else
+               return OK;
+}
+
+/*
+ * Test whether a receive request would block.  The given 'min' parameter
+ * contains the minimum number of bytes that should be possible to receive
+ * without blocking (the low receive watermark).  Return SUSPEND if the receive
+ * request would block.  Otherwise, return any other error code (including OK
+ * or SOCKEVENT_EOF), and if 'size' is not a NULL pointer, it should be filled
+ * with the number of bytes available for receipt right now (if not zero).
+ * Note that if 'size' is not NULL, 'min' will always be zero.
+ */
+int
+uds_test_recv(struct sock * sock, size_t min, size_t * size)
+{
+       struct udssock *uds = (struct udssock *)sock;
+       size_t seglen;
+       unsigned int segflags;
+       int r;
+
+       if ((r = uds_recv_test(uds, min, min, FALSE /*partial*/,
+           NULL /*may_block*/)) == SUSPEND)
+               return r;
+
+       if (size != NULL && uds->uds_len > 0)
+               (void)uds_fetch_hdr(uds, uds->uds_tail, &seglen, size,
+                   &segflags);
+
+       return r;
+}
diff --git a/minix/net/uds/ioc_uds.c b/minix/net/uds/ioc_uds.c

deleted file mode 100644 (file)

index 8271f43..0000000
--- a/minix/net/uds/ioc_uds.c
+++ /dev/null
@@ -1,1114 +0,0 @@
-/*
- * Unix Domain Sockets Implementation (PF_UNIX, PF_LOCAL)
- * This code handles ioctl(2) commands to implement the socket API.
- * Some helper functions are also present.
- */
-
-#include "uds.h"
-
-static int
-perform_connection(devminor_t minorx, devminor_t minory,
-       struct sockaddr_un *addr)
-{
-       /*
-        * There are several places were a connection is established, the
-        * initiating call being one of accept(2), connect(2), socketpair(2).
-        */
-       dprintf(("UDS: perform_connection(%d, %d)\n", minorx, minory));
-
-       /*
-        * Only connection-oriented types are acceptable and only equal
-        * types can connect to each other.
-        */
-       if ((uds_fd_table[minorx].type != SOCK_SEQPACKET &&
-           uds_fd_table[minorx].type != SOCK_STREAM) ||
-           uds_fd_table[minorx].type != uds_fd_table[minory].type)
-               return EINVAL;
-
-       /* Connect the pair of sockets. */
-       uds_fd_table[minorx].peer = minory;
-       uds_fd_table[minory].peer = minorx;
-
-       /* Set the address of both sockets */
-       memcpy(&uds_fd_table[minorx].addr, addr, sizeof(struct sockaddr_un));
-       memcpy(&uds_fd_table[minory].addr, addr, sizeof(struct sockaddr_un));
-
-       return OK;
-}
-
-static int
-do_accept(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-       devminor_t minorparent; /* minor number of parent (server) */
-       devminor_t minorpeer;
-       int rc, i;
-       struct sockaddr_un addr;
-
-       dprintf(("UDS: do_accept(%d)\n", minor));
-
-       /*
-        * Somewhat weird logic is used in this function, so here's an
-        * overview... The minor number is the server's client socket
-        * (the socket to be returned by accept()). The data waiting
-        * for us in the IO Grant is the address that the server is
-        * listening on. This function uses the address to find the
-        * server's descriptor. From there we can perform the
-        * connection or suspend and wait for a connect().
-        */
-
-       /* This IOCTL must be called on a 'fresh' socket. */
-       if (uds_fd_table[minor].type != -1)
-               return EINVAL;
-
-       /* Get the server's address */
-       if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &addr,
-           sizeof(struct sockaddr_un))) != OK)
-               return rc;
-
-       /* Locate the server socket. */
-       for (i = 0; i < NR_FDS; i++) {
-               if (uds_fd_table[i].stale == FALSE &&
-                   uds_fd_table[i].listening == TRUE &&
-                   uds_fd_table[i].addr.sun_family == AF_UNIX &&
-                   !strncmp(addr.sun_path, uds_fd_table[i].addr.sun_path,
-                   sizeof(uds_fd_table[i].addr.sun_path)))
-                       break;
-       }
-
-       if (i == NR_FDS)
-               return EINVAL;
-
-       minorparent = i; /* parent */
-
-       /* We are the parent's child. */
-       uds_fd_table[minorparent].child = minor;
-
-       /*
-        * The peer has the same type as the parent. we need to be that
-        * type too.
-        */
-       uds_fd_table[minor].type = uds_fd_table[minorparent].type;
-
-       /* Locate the peer to accept in the parent's backlog. */
-       minorpeer = -1;
-       for (i = 0; i < uds_fd_table[minorparent].backlog_size; i++) {
-               if (uds_fd_table[minorparent].backlog[i] != -1) {
-                       minorpeer = uds_fd_table[minorparent].backlog[i];
-                       uds_fd_table[minorparent].backlog[i] = -1;
-                       break;
-               }
-       }
-
-       if (minorpeer == -1) {
-               dprintf(("UDS: do_accept(%d): suspend\n", minor));
-
-               /*
-                * There are no peers in the backlog, suspend and wait for one
-                * to show up.
-                */
-               uds_fd_table[minor].suspended = UDS_SUSPENDED_ACCEPT;
-
-               return EDONTREPLY;
-       }
-
-       dprintf(("UDS: connecting %d to %d -- parent is %d\n", minor,
-           minorpeer, minorparent));
-
-       if ((rc = perform_connection(minor, minorpeer, &addr)) != OK) {
-               dprintf(("UDS: do_accept(%d): connection failed\n", minor));
-
-               return rc;
-       }
-
-       uds_fd_table[minorparent].child = -1;
-
-       /* If the peer is blocked on connect() or write(), revive the peer. */
-       if (uds_fd_table[minorpeer].suspended == UDS_SUSPENDED_CONNECT ||
-           uds_fd_table[minorpeer].suspended == UDS_SUSPENDED_WRITE) {
-               dprintf(("UDS: do_accept(%d): revive %d\n", minor, minorpeer));
-               uds_unsuspend(minorpeer);
-       }
-
-       /* See if we can satisfy an ongoing select. */
-       if ((uds_fd_table[minorpeer].sel_ops & CDEV_OP_WR) &&
-           uds_fd_table[minorpeer].size < UDS_BUF) {
-               /* A write on the peer is possible now. */
-               chardriver_reply_select(uds_fd_table[minorpeer].sel_endpt,
-                   minorpeer, CDEV_OP_WR);
-               uds_fd_table[minorpeer].sel_ops &= ~CDEV_OP_WR;
-       }
-
-       return OK;
-}
-
-static int
-do_connect(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-       int child, peer;
-       struct sockaddr_un addr;
-       int rc, i, j;
-       dev_t dev;
-       ino_t ino;
-
-       dprintf(("UDS: do_connect(%d)\n", minor));
-
-       /* Only connection oriented sockets can connect. */
-       if (uds_fd_table[minor].type != SOCK_STREAM &&
-           uds_fd_table[minor].type != SOCK_SEQPACKET)
-               return EINVAL;
-
-       /* The socket must not be connecting or connected already. */
-       peer = uds_fd_table[minor].peer;
-       if (peer != -1) {
-               if (uds_fd_table[peer].peer == -1)
-                       return EALREADY;        /* connecting */
-               else
-                       return EISCONN;         /* connected */
-       }
-
-       if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &addr,
-           sizeof(struct sockaddr_un))) != OK)
-               return rc;
-
-       if ((rc = socketpath(uds_fd_table[minor].owner, addr.sun_path,
-           sizeof(addr.sun_path), SPATH_CHECK, &dev, &ino)) != OK)
-               return rc;
-
-       /*
-        * Look for a socket of the same type that is listening on the
-        * address we want to connect to.
-        */
-       for (i = 0; i < NR_FDS; i++) {
-               if (uds_fd_table[minor].type != uds_fd_table[i].type)
-                       continue;
-               if (uds_fd_table[i].listening == FALSE)
-                       continue;
-               if (uds_fd_table[i].stale == TRUE)
-                       continue;
-               if (uds_fd_table[i].addr.sun_family != AF_UNIX)
-                       continue;
-               if (strncmp(addr.sun_path, uds_fd_table[i].addr.sun_path,
-                   sizeof(uds_fd_table[i].addr.sun_path)))
-                       continue;
-
-               /* Found a matching socket. */
-               break;
-       }
-
-       if (i == NR_FDS)
-               return ECONNREFUSED;
-
-       /* If the server is blocked on an accept, perform the connection. */
-       if ((child = uds_fd_table[i].child) != -1) {
-               rc = perform_connection(minor, child, &addr);
-
-               if (rc != OK)
-                       return rc;
-
-               uds_fd_table[i].child = -1;
-
-               dprintf(("UDS: do_connect(%d): revive %d\n", minor, child));
-
-               /* Wake up the accepting party. */
-               uds_unsuspend(child);
-
-               return OK;
-       }
-
-       dprintf(("UDS: adding %d to %d's backlog\n", minor, i));
-
-       /* Look for a free slot in the backlog. */
-       rc = -1;
-       for (j = 0; j < uds_fd_table[i].backlog_size; j++) {
-               if (uds_fd_table[i].backlog[j] == -1) {
-                       uds_fd_table[i].backlog[j] = minor;
-
-                       rc = 0;
-                       break;
-               }
-       }
-
-       if (rc == -1)
-               return ECONNREFUSED;    /* backlog is full */
-
-       /* See if the server is blocked on select(). */
-       if (uds_fd_table[i].sel_ops & CDEV_OP_RD) {
-               /* Satisfy a read-type select on the server. */
-               chardriver_reply_select(uds_fd_table[i].sel_endpt, i,
-                   CDEV_OP_RD);
-
-               uds_fd_table[i].sel_ops &= ~CDEV_OP_RD;
-       }
-
-       /* We found our server. */
-       uds_fd_table[minor].peer = i;
-
-       memcpy(&uds_fd_table[minor].addr, &addr, sizeof(struct sockaddr_un));
-
-       dprintf(("UDS: do_connect(%d): suspend\n", minor));
-
-       /* Suspend until the server side accepts the connection. */
-       uds_fd_table[minor].suspended = UDS_SUSPENDED_CONNECT;
-
-       return EDONTREPLY;
-}
-
-static int
-do_listen(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-       int rc;
-       int backlog_size;
-
-       dprintf(("UDS: do_listen(%d)\n", minor));
-
-       /* Ensure the socket has a type and is bound. */
-       if (uds_fd_table[minor].type == -1 ||
-           uds_fd_table[minor].addr.sun_family != AF_UNIX)
-               return EINVAL;
-
-       /* listen(2) supports only two socket types. */
-       if (uds_fd_table[minor].type != SOCK_STREAM &&
-           uds_fd_table[minor].type != SOCK_SEQPACKET)
-               return EOPNOTSUPP;
-
-       /*
-        * The POSIX standard doesn't say what to do if listen() has
-        * already been called.  Well, there isn't an errno.  We silently
-        * let it happen, but if listen() has already been called, we
-        * don't allow the backlog to shrink.
-        */
-       if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &backlog_size,
-           sizeof(backlog_size))) != OK)
-               return rc;
-
-       if (uds_fd_table[minor].listening == FALSE) {
-               /* Set the backlog size to a reasonable value. */
-               if (backlog_size <= 0 || backlog_size > UDS_SOMAXCONN)
-                       backlog_size = UDS_SOMAXCONN;
-
-               uds_fd_table[minor].backlog_size = backlog_size;
-       } else {
-               /* Allow the user to expand the backlog size. */
-               if (backlog_size > uds_fd_table[minor].backlog_size &&
-                   backlog_size < UDS_SOMAXCONN)
-                       uds_fd_table[minor].backlog_size = backlog_size;
-
-               /*
-                * Don't let the user shrink the backlog_size, as we might
-                * have clients waiting in those slots.
-                */
-       }
-
-       /* This socket is now listening. */
-       uds_fd_table[minor].listening = TRUE;
-
-       return OK;
-}
-
-static int
-do_socket(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-       int rc, type;
-
-       dprintf(("UDS: do_socket(%d)\n", minor));
-
-       /* The socket type can only be set once. */
-       if (uds_fd_table[minor].type != -1)
-               return EINVAL;
-
-       /* Get the requested type. */
-       if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &type,
-           sizeof(type))) != OK)
-               return rc;
-
-       /* Assign the type if it is valid only. */
-       switch (type) {
-       case SOCK_STREAM:
-       case SOCK_DGRAM:
-       case SOCK_SEQPACKET:
-               uds_fd_table[minor].type = type;
-               return OK;
-
-       default:
-               return EINVAL;
-       }
-}
-
-static int
-do_bind(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-       struct sockaddr_un addr;
-       int rc, i;
-       dev_t dev;
-       ino_t ino;
-
-       dprintf(("UDS: do_bind(%d)\n", minor));
-
-       /* If the type hasn't been set by do_socket() yet, OR an attempt
-        * to re-bind() a non-SOCK_DGRAM socket is made, fail the call.
-        */
-       if ((uds_fd_table[minor].type == -1) ||
-           (uds_fd_table[minor].addr.sun_family == AF_UNIX &&
-           uds_fd_table[minor].type != SOCK_DGRAM))
-               return EINVAL;
-
-       if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &addr,
-           sizeof(struct sockaddr_un))) != OK)
-               return rc;
-
-       /* Do some basic sanity checks on the address. */
-       if (addr.sun_family != AF_UNIX)
-               return EAFNOSUPPORT;
-
-       if (addr.sun_path[0] == '\0')
-               return ENOENT;
-
-       /* Attempt to create the socket file. */
-       if ((rc = socketpath(uds_fd_table[minor].owner, addr.sun_path,
-#if NOT_YET
-           sizeof(addr.sun_path), SPATH_CREATE, &dev, &ino)) != OK)
-#else
-           sizeof(addr.sun_path), SPATH_CHECK, &dev, &ino)) != OK)
-#endif
-               return rc;
-
-       /*
-        * It is possible that the socket path name was already in use as
-        * address by another socket.  This means that the socket file was
-        * prematurely unlinked.  In that case, mark the old socket as stale,
-        * so that its path name will not be matched and only the newly bound
-        * socket will be found in address-based searches.  For now, we leave
-        * the old socket marked as stale for as long as it is bound to the
-        * same address.  A more advanced implementation could establish an
-        * order between the sockets so that the most recently bound socket is
-        * found at any time, but it is doubtful whether that would be useful.
-        */
-       for (i = 0; i < NR_FDS; i++) {
-               if (uds_fd_table[i].stale == FALSE &&
-                   uds_fd_table[i].addr.sun_family == AF_UNIX &&
-                   !strncmp(addr.sun_path, uds_fd_table[i].addr.sun_path,
-                   sizeof(uds_fd_table[i].addr.sun_path))) {
-#if NOT_YET
-                       uds_fd_table[i].stale = TRUE;
-#else
-                       return EADDRINUSE;
-#endif
-               }
-       }
-
-       /* Looks good, perform the bind(). */
-       uds_fd_table[minor].stale = FALSE;
-       memcpy(&uds_fd_table[minor].addr, &addr, sizeof(struct sockaddr_un));
-
-       return OK;
-}
-
-static int
-do_getsockname(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-       dprintf(("UDS: do_getsockname(%d)\n", minor));
-
-       /*
-        * Unconditionally send the address we have assigned to this socket.
-        * The POSIX standard doesn't say what to do if the address hasn't been
-        * set.  If the address isn't currently set, then the user will get
-        * NULL bytes.  Note: libc depends on this behavior.
-        */
-       return sys_safecopyto(endpt, grant, 0,
-           (vir_bytes) &uds_fd_table[minor].addr, sizeof(struct sockaddr_un));
-}
-
-static int
-do_getpeername(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-       int peer_minor;
-
-       dprintf(("UDS: do_getpeername(%d)\n", minor));
-
-       /* Check that the socket is connected with a valid peer. */
-       if (uds_fd_table[minor].peer != -1) {
-               peer_minor = uds_fd_table[minor].peer;
-
-               /* Copy the address from the peer. */
-               return sys_safecopyto(endpt, grant, 0,
-                   (vir_bytes) &uds_fd_table[peer_minor].addr,
-                   sizeof(struct sockaddr_un));
-       } else if (uds_fd_table[minor].err == ECONNRESET) {
-               uds_fd_table[minor].err = 0;
-
-               return ECONNRESET;
-       } else
-               return ENOTCONN;
-}
-
-static int
-do_shutdown(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-       int rc, how;
-
-       dprintf(("UDS: do_shutdown(%d)\n", minor));
-
-       /* The socket must be connection oriented. */
-       if (uds_fd_table[minor].type != SOCK_STREAM &&
-           uds_fd_table[minor].type != SOCK_SEQPACKET)
-               return EINVAL;
-
-       if (uds_fd_table[minor].peer == -1) {
-               /* shutdown(2) is only valid for connected sockets. */
-               if (uds_fd_table[minor].err == ECONNRESET)
-                       return ECONNRESET;
-               else
-                       return ENOTCONN;
-       }
-
-       /* Get the 'how' parameter from the caller. */
-       if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &how,
-           sizeof(how))) != OK)
-               return rc;
-
-       switch (how) {
-       case SHUT_RD:           /* Take away read permission. */
-               uds_fd_table[minor].mode &= ~UDS_R;
-               break;
-
-       case SHUT_WR:           /* Take away write permission. */
-               uds_fd_table[minor].mode &= ~UDS_W;
-               break;
-
-       case SHUT_RDWR:         /* Shut down completely. */
-               uds_fd_table[minor].mode = 0;
-               break;
-
-       default:
-               return EINVAL;
-       }
-
-       return OK;
-}
-
-static int
-do_socketpair(devminor_t minorx, endpoint_t endpt, cp_grant_id_t grant)
-{
-       int rc;
-       dev_t minorin;
-       devminor_t minory;
-       struct sockaddr_un addr;
-
-       dprintf(("UDS: do_socketpair(%d)\n", minorx));
-
-       /* The ioctl argument is the minor number of the second socket. */
-       if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &minorin,
-           sizeof(minorin))) != OK)
-               return rc;
-
-       minory = minor(minorin);
-
-       dprintf(("UDS: socketpair(%d, %d,)\n", minorx, minory));
-
-       /* Security check: both sockets must have the same owner endpoint. */
-       if (uds_fd_table[minorx].owner != uds_fd_table[minory].owner)
-               return EPERM;
-
-       addr.sun_family = AF_UNIX;
-       addr.sun_path[0] = 'X';
-       addr.sun_path[1] = '\0';
-
-       return perform_connection(minorx, minory, &addr);
-}
-
-static int
-do_getsockopt_sotype(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-       dprintf(("UDS: do_getsockopt_sotype(%d)\n", minor));
-
-       /* If the type hasn't been set yet, we fail the call. */
-       if (uds_fd_table[minor].type == -1)
-               return EINVAL;
-
-       return sys_safecopyto(endpt, grant, 0,
-           (vir_bytes) &uds_fd_table[minor].type, sizeof(int));
-}
-
-static int
-do_getsockopt_peercred(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-       int peer_minor;
-       int rc;
-       struct uucred cred;
-
-       dprintf(("UDS: do_getsockopt_peercred(%d)\n", minor));
-
-       if (uds_fd_table[minor].peer == -1) {
-               if (uds_fd_table[minor].err == ECONNRESET) {
-                       uds_fd_table[minor].err = 0;
-
-                       return ECONNRESET;
-               } else
-                       return ENOTCONN;
-       }
-
-       peer_minor = uds_fd_table[minor].peer;
-
-       /*
-        * Obtain the peer's credentials and copy them out.  Ignore failures;
-        * in that case, the caller will simply get no credentials.
-        */
-       memset(&cred, 0, sizeof(cred));
-       cred.cr_uid = -1;
-       cred.cr_gid = -1;
-       (void)getepinfo(uds_fd_table[peer_minor].owner, &cred.cr_uid,
-           &cred.cr_gid);
-
-       return sys_safecopyto(endpt, grant, 0, (vir_bytes) &cred,
-           sizeof(struct uucred));
-}
-
-static int
-do_getsockopt_sndbuf(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-       size_t sndbuf = UDS_BUF;
-
-       dprintf(("UDS: do_getsockopt_sndbuf(%d)\n", minor));
-
-       return sys_safecopyto(endpt, grant, 0, (vir_bytes) &sndbuf,
-           sizeof(sndbuf));
-}
-
-static int
-do_setsockopt_sndbuf(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-       int rc;
-       size_t sndbuf;
-
-       dprintf(("UDS: do_setsockopt_sndbuf(%d)\n", minor));
-
-       if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &sndbuf,
-           sizeof(sndbuf))) != OK)
-               return rc;
-
-       /* The send buffer is limited to 32KB at the moment. */
-       if (sndbuf > UDS_BUF)
-               return ENOSYS;
-
-       /* FIXME: actually shrink the buffer. */
-       return OK;
-}
-
-static int
-do_getsockopt_rcvbuf(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-       size_t rcvbuf = UDS_BUF;
-
-       dprintf(("UDS: do_getsockopt_rcvbuf(%d)\n", minor));
-
-       return sys_safecopyto(endpt, grant, 0, (vir_bytes) &rcvbuf,
-           sizeof(rcvbuf));
-}
-
-static int
-do_setsockopt_rcvbuf(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-       int rc;
-       size_t rcvbuf;
-
-       dprintf(("UDS: do_setsockopt_rcvbuf(%d)\n", minor));
-
-       if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &rcvbuf,
-           sizeof(rcvbuf))) != OK)
-               return rc;
-
-       /* The receive buffer is limited to 32KB at the moment. */
-       if (rcvbuf > UDS_BUF)
-               return ENOSYS;
-
-       /* FIXME: actually shrink the buffer. */
-       return OK;
-}
-
-static int
-do_sendto(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-       int rc;
-       struct sockaddr_un addr;
-       dev_t dev;
-       ino_t ino;
-
-       dprintf(("UDS: do_sendto(%d)\n", minor));
-
-       /* This IOCTL is only for SOCK_DGRAM sockets. */
-       if (uds_fd_table[minor].type != SOCK_DGRAM)
-               return EINVAL;
-
-       if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &addr,
-           sizeof(struct sockaddr_un))) != OK)
-               return rc;
-
-       /* Do some basic sanity checks on the address. */
-       if (addr.sun_family != AF_UNIX || addr.sun_path[0] == '\0')
-               return EINVAL;
-
-       if ((rc = socketpath(uds_fd_table[minor].owner, addr.sun_path,
-           sizeof(addr.sun_path), SPATH_CHECK, &dev, &ino)) != OK)
-               return rc;
-
-       memcpy(&uds_fd_table[minor].target, &addr, sizeof(struct sockaddr_un));
-
-       return OK;
-}
-
-static int
-do_recvfrom(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-       dprintf(("UDS: do_recvfrom(%d)\n", minor));
-
-       return sys_safecopyto(endpt, grant, 0,
-           (vir_bytes) &uds_fd_table[minor].source,
-           sizeof(struct sockaddr_un));
-}
-
-static int
-send_fds(devminor_t minor, struct msg_control *msg_ctrl,
-       struct ancillary *data)
-{
-       int i, rc, nfds, totalfds;
-       endpoint_t from_ep;
-       struct msghdr msghdr;
-       struct cmsghdr *cmsg = NULL;
-
-       dprintf(("UDS: send_fds(%d)\n", minor));
-
-       from_ep = uds_fd_table[minor].owner;
-
-       /* Obtain this socket's credentials. */
-       if ((rc = getepinfo(from_ep, &data->cred.uid, &data->cred.gid)) < 0)
-               return rc;
-
-       dprintf(("UDS: minor=%d cred={%d,%d}\n", minor,
-           data->cred.uid, data->cred.gid));
-
-       totalfds = data->nfiledes;
-
-       memset(&msghdr, '\0', sizeof(struct msghdr));
-       msghdr.msg_control = msg_ctrl->msg_control;
-       msghdr.msg_controllen = msg_ctrl->msg_controllen;
-
-       for (cmsg = CMSG_FIRSTHDR(&msghdr); cmsg != NULL;
-           cmsg = CMSG_NXTHDR(&msghdr, cmsg)) {
-               if (cmsg->cmsg_level != SOL_SOCKET ||
-                   cmsg->cmsg_type != SCM_RIGHTS)
-                       continue;
-
-               nfds = MIN((cmsg->cmsg_len-CMSG_LEN(0))/sizeof(int), OPEN_MAX);
-
-               for (i = 0; i < nfds; i++) {
-                       if (totalfds == OPEN_MAX)
-                               return EOVERFLOW;
-
-                       data->fds[totalfds] = ((int *) CMSG_DATA(cmsg))[i];
-                       dprintf(("UDS: minor=%d fd[%d]=%d\n", minor, totalfds,
-                           data->fds[totalfds]));
-                       totalfds++;
-               }
-       }
-
-       for (i = data->nfiledes; i < totalfds; i++) {
-               if ((rc = copyfd(from_ep, data->fds[i], COPYFD_FROM)) < 0) {
-                       printf("UDS: copyfd(COPYFD_FROM) failed: %d\n", rc);
-
-                       /* Revert the successful copyfd() calls made so far. */
-                       for (i--; i >= data->nfiledes; i--)
-                               close(data->fds[i]);
-
-                       return rc;
-               }
-
-               dprintf(("UDS: send_fds(): %d -> %d\n", data->fds[i], rc));
-
-               data->fds[i] = rc;      /* this is now the local FD */
-       }
-
-       data->nfiledes = totalfds;
-
-       return OK;
-}
-
-/*
- * This function calls close() for all of the FDs in flight.  This is used
- * when a Unix Domain Socket is closed and there exists references to file
- * descriptors that haven't been received with recvmsg().
- */
-int
-uds_clear_fds(devminor_t minor, struct ancillary *data)
-{
-       int i;
-
-       dprintf(("UDS: uds_clear_fds(%d)\n", minor));
-
-       for (i = 0; i < data->nfiledes; i++) {
-               dprintf(("UDS: uds_clear_fds() => %d\n", data->fds[i]));
-
-               close(data->fds[i]);
-
-               data->fds[i] = -1;
-       }
-
-       data->nfiledes = 0;
-
-       return OK;
-}
-
-static int
-recv_fds(devminor_t minor, struct ancillary *data,
-       struct msg_control *msg_ctrl)
-{
-       int rc, i, j, fds[OPEN_MAX];
-       struct msghdr msghdr;
-       struct cmsghdr *cmsg;
-       endpoint_t to_ep;
-
-       dprintf(("UDS: recv_fds(%d)\n", minor));
-
-       msghdr.msg_control = msg_ctrl->msg_control;
-       msghdr.msg_controllen = msg_ctrl->msg_controllen;
-
-       cmsg = CMSG_FIRSTHDR(&msghdr);
-       cmsg->cmsg_len = CMSG_LEN(sizeof(int) * data->nfiledes);
-       cmsg->cmsg_level = SOL_SOCKET;
-       cmsg->cmsg_type = SCM_RIGHTS;
-
-       to_ep = uds_fd_table[minor].owner;
-
-       /* Copy to the target endpoint. */
-       for (i = 0; i < data->nfiledes; i++) {
-               if ((rc = copyfd(to_ep, data->fds[i], COPYFD_TO)) < 0) {
-                       printf("UDS: copyfd(COPYFD_TO) failed: %d\n", rc);
-
-                       /* Revert the successful copyfd() calls made so far. */
-                       for (i--; i >= 0; i--)
-                               (void) copyfd(to_ep, fds[i], COPYFD_CLOSE);
-
-                       return rc;
-               }
-
-               fds[i] = rc;            /* this is now the remote FD */
-       }
-
-       /* Close the local copies only once the entire procedure succeeded. */
-       for (i = 0; i < data->nfiledes; i++) {
-               dprintf(("UDS: recv_fds(): %d -> %d\n", data->fds[i], fds[i]));
-
-               ((int *)CMSG_DATA(cmsg))[i] = fds[i];
-
-               close(data->fds[i]);
-
-               data->fds[i] = -1;
-       }
-
-       data->nfiledes = 0;
-
-       return OK;
-}
-
-static int
-recv_cred(devminor_t minor, struct ancillary *data,
-       struct msg_control *msg_ctrl)
-{
-       struct msghdr msghdr;
-       struct cmsghdr *cmsg;
-       struct uucred *cred;
-
-       dprintf(("UDS: recv_cred(%d)\n", minor));
-
-       msghdr.msg_control = msg_ctrl->msg_control;
-       msghdr.msg_controllen = msg_ctrl->msg_controllen;
-
-       cmsg = CMSG_FIRSTHDR(&msghdr);
-       if (cmsg->cmsg_len > 0)
-               cmsg = CMSG_NXTHDR(&msghdr, cmsg);
-
-       cmsg->cmsg_len = CMSG_LEN(sizeof(struct uucred));
-       cmsg->cmsg_level = SOL_SOCKET;
-       cmsg->cmsg_type = SCM_CREDS;
-       cred = (struct uucred *)CMSG_DATA(cmsg);
-       memset(cred, 0, sizeof(*cred));
-       cred->cr_uid = data->cred.uid;
-       cred->cr_gid = data->cred.gid;
-
-       return OK;
-}
-
-static int
-do_sendmsg(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-       int peer, rc, i;
-       struct msg_control msg_ctrl;
-
-       dprintf(("UDS: do_sendmsg(%d)\n", minor));
-
-       memset(&msg_ctrl, '\0', sizeof(struct msg_control));
-
-       if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &msg_ctrl,
-           sizeof(struct msg_control))) != OK)
-               return rc;
-
-       /* Locate the peer. */
-       peer = -1;
-       if (uds_fd_table[minor].type == SOCK_DGRAM) {
-               if (uds_fd_table[minor].target.sun_path[0] == '\0' ||
-                   uds_fd_table[minor].target.sun_family != AF_UNIX)
-                       return EDESTADDRREQ;
-
-               for (i = 0; i < NR_FDS; i++) {
-                       /*
-                        * Look for a SOCK_DGRAM socket that is bound on the
-                        * target address.
-                        */
-                       if (uds_fd_table[i].type == SOCK_DGRAM &&
-                           uds_fd_table[i].stale == FALSE &&
-                           uds_fd_table[i].addr.sun_family == AF_UNIX &&
-                           !strncmp(uds_fd_table[minor].target.sun_path,
-                           uds_fd_table[i].addr.sun_path,
-                           sizeof(uds_fd_table[i].addr.sun_path))) {
-                               peer = i;
-                               break;
-                       }
-               }
-
-               if (peer == -1)
-                       return ENOENT;
-       } else {
-               peer = uds_fd_table[minor].peer;
-               if (peer == -1)
-                       return ENOTCONN;
-       }
-
-       dprintf(("UDS: sendmsg(%d) -- peer=%d\n", minor, peer));
-
-       /*
-        * Note: it's possible that there is already some file descriptors in
-        * ancillary_data if the peer didn't call recvmsg() yet.  That's okay.
-        * The receiver will get the current file descriptors plus the new
-        * ones.
-        */
-       return send_fds(minor, &msg_ctrl, &uds_fd_table[peer].ancillary_data);
-}
-
-static int
-do_recvmsg(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-       int rc;
-       struct msg_control msg_ctrl;
-       socklen_t clen_avail = 0;
-       socklen_t clen_needed = 0;
-       socklen_t clen_desired = 0;
-
-       dprintf(("UDS: do_recvmsg(%d)\n", minor));
-       dprintf(("UDS: minor=%d credentials={uid:%d,gid:%d}\n", minor,
-           uds_fd_table[minor].ancillary_data.cred.uid,
-           uds_fd_table[minor].ancillary_data.cred.gid));
-
-       memset(&msg_ctrl, '\0', sizeof(struct msg_control));
-
-       /*
-        * Get the msg_control from the user.  It will include the
-        * amount of space the user has allocated for control data.
-        */
-       if ((rc = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &msg_ctrl,
-           sizeof(struct msg_control))) != OK)
-               return rc;
-
-       clen_avail = MIN(msg_ctrl.msg_controllen, MSG_CONTROL_MAX);
-
-       if (uds_fd_table[minor].ancillary_data.nfiledes > 0) {
-               clen_needed = CMSG_SPACE(sizeof(int) *
-                   uds_fd_table[minor].ancillary_data.nfiledes);
-       }
-
-       /* if there is room we also include credentials */
-       clen_desired = clen_needed + CMSG_SPACE(sizeof(struct uucred));
-
-       if (clen_needed > clen_avail)
-               return EOVERFLOW;
-
-       if (uds_fd_table[minor].ancillary_data.nfiledes > 0) {
-               if ((rc = recv_fds(minor, &uds_fd_table[minor].ancillary_data,
-                   &msg_ctrl)) != OK)
-                       return rc;
-       }
-
-       if (clen_desired <= clen_avail) {
-               rc = recv_cred(minor, &uds_fd_table[minor].ancillary_data,
-                   &msg_ctrl);
-               if (rc != OK)
-                       return rc;
-               msg_ctrl.msg_controllen = clen_desired;
-       } else
-               msg_ctrl.msg_controllen = clen_needed;
-
-       /* Send the control data to the user. */
-       return sys_safecopyto(endpt, grant, 0, (vir_bytes) &msg_ctrl,
-           sizeof(struct msg_control));
-}
-
-static int
-do_fionread(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant)
-{
-       int rc;
-
-       rc = uds_perform_read(minor, NONE, GRANT_INVALID, UDS_BUF, 1);
-
-       /* What should we do on error?  Just set to zero for now. */
-       if (rc < 0)
-               rc = 0;
-
-       return sys_safecopyto(endpt, grant, 0, (vir_bytes) &rc, sizeof(rc));
-}
-
-int
-uds_do_ioctl(devminor_t minor, unsigned long request, endpoint_t endpt,
-       cp_grant_id_t grant)
-{
-       int rc;
-
-       switch (request) {
-       case NWIOSUDSCONN:
-               /* Connect to a listening socket -- connect(). */
-               rc = do_connect(minor, endpt, grant);
-
-               break;
-
-       case NWIOSUDSACCEPT:
-               /* Accept an incoming connection -- accept(). */
-               rc = do_accept(minor, endpt, grant);
-
-               break;
-
-       case NWIOSUDSBLOG:
-               /*
-                * Set the backlog_size and put the socket into the listening
-                * state -- listen().
-                */
-               rc = do_listen(minor, endpt, grant);
-
-               break;
-
-       case NWIOSUDSTYPE:
-               /* Set the SOCK_ type for this socket -- socket(). */
-               rc = do_socket(minor, endpt, grant);
-
-               break;
-
-       case NWIOSUDSADDR:
-               /* Set the address for this socket -- bind(). */
-               rc = do_bind(minor, endpt, grant);
-
-               break;
-
-       case NWIOGUDSADDR:
-               /* Get the address for this socket -- getsockname(). */
-               rc = do_getsockname(minor, endpt, grant);
-
-               break;
-
-       case NWIOGUDSPADDR:
-               /* Get the address for the peer -- getpeername(). */
-               rc = do_getpeername(minor, endpt, grant);
-
-               break;
-
-       case NWIOSUDSSHUT:
-               /*
-                * Shut down a socket for reading, writing, or both --
-                * shutdown().
-                */
-               rc = do_shutdown(minor, endpt, grant);
-
-               break;
-
-       case NWIOSUDSPAIR:
-               /* Connect two sockets -- socketpair(). */
-               rc = do_socketpair(minor, endpt, grant);
-
-               break;
-
-       case NWIOGUDSSOTYPE:
-               /* Get socket type -- getsockopt(SO_TYPE). */
-               rc = do_getsockopt_sotype(minor, endpt, grant);
-
-               break;
-
-       case NWIOGUDSPEERCRED:
-               /* Get peer endpoint -- getsockopt(SO_PEERCRED). */
-               rc = do_getsockopt_peercred(minor, endpt, grant);
-
-               break;
-
-       case NWIOSUDSTADDR:
-               /* Set target address -- sendto(). */
-               rc = do_sendto(minor, endpt, grant);
-
-               break;
-
-       case NWIOGUDSFADDR:
-               /* Get from address -- recvfrom(). */
-               rc = do_recvfrom(minor, endpt, grant);
-
-               break;
-
-       case NWIOGUDSSNDBUF:
-               /* Get the send buffer size -- getsockopt(SO_SNDBUF). */
-               rc = do_getsockopt_sndbuf(minor, endpt, grant);
-
-               break;
-
-       case NWIOSUDSSNDBUF:
-               /* Set the send buffer size -- setsockopt(SO_SNDBUF). */
-               rc = do_setsockopt_sndbuf(minor, endpt, grant);
-
-               break;
-
-       case NWIOGUDSRCVBUF:
-               /* Get the send buffer size -- getsockopt(SO_SNDBUF). */
-               rc = do_getsockopt_rcvbuf(minor, endpt, grant);
-
-               break;
-
-       case NWIOSUDSRCVBUF:
-               /* Set the send buffer size -- setsockopt(SO_SNDBUF). */
-               rc = do_setsockopt_rcvbuf(minor, endpt, grant);
-
-               break;
-
-       case NWIOSUDSCTRL:
-               /* Set the control data -- sendmsg(). */
-               rc = do_sendmsg(minor, endpt, grant);
-
-               break;
-
-       case NWIOGUDSCTRL:
-               /* Set the control data -- recvmsg(). */
-               rc = do_recvmsg(minor, endpt, grant);
-
-               break;
-
-       case FIONREAD:
-               /*
-                * Get the number of bytes immediately available for reading.
-                */
-               rc = do_fionread(minor, endpt, grant);
-
-               break;
-
-       default:
-               /*
-                * The IOCTL command is not valid for /dev/uds -- this happens
-                * a lot and is normal.  A lot of libc functions determine the
-                * socket type with IOCTLs.  Any unrecognized requests simply
-                * get an ENOTTY response.
-                */
-
-               rc = ENOTTY;
-       }
-
-       return rc;
-}
diff --git a/minix/net/uds/stat.c b/minix/net/uds/stat.c

new file mode 100644 (file)

index 0000000..2759f63
--- /dev/null
+++ b/minix/net/uds/stat.c
@@ -0,0 +1,186 @@
+/* UNIX Domain Sockets - stat.c - network status */
+
+#include "uds.h"
+#include <sys/socketvar.h>
+#include <sys/unpcb.h>
+
+/*
+ * Describe the socket 'uds' in the protocol control block record 'ki'.  Only
+ * the fields that apply to the socket's current state are assigned here, so
+ * the caller should pass in a zeroed structure.
+ */
+static void
+uds_get_info(struct kinfo_pcb * ki, const struct udssock * uds)
+{
+       struct udssock *other;
+       socklen_t addrlen;
+       int socktype;
+
+       socktype = uds_get_type(uds);
+       other = uds_get_peer(uds);
+
+       /* Basic identification of the socket and its protocol. */
+       ki->ki_family = AF_UNIX;
+       ki->ki_type = socktype;
+       ki->ki_protocol = UDSPROTO_UDS;
+       ki->ki_pcbaddr = (uint64_t)(uintptr_t)uds;
+       ki->ki_ppcbaddr = (uint64_t)(uintptr_t)uds;
+       ki->ki_sockaddr = (uint64_t)(uintptr_t)&uds->uds_sock;
+
+       /* Translate our own socket flags into UNP_ protocol flags. */
+       ki->ki_pflags = 0;
+       if (uds->uds_flags & UDSF_CONNWAIT) {
+               ki->ki_pflags |= UNP_CONNWAIT;
+       }
+       if (uds->uds_flags & UDSF_PASSCRED) {
+               ki->ki_pflags |= UNP_WANTCRED;
+       }
+       /* Credentials are flagged only if set, and never for datagrams. */
+       if (socktype != SOCK_DGRAM && uds->uds_cred.unp_pid != -1) {
+               if (uds_is_listening(uds)) {
+                       ki->ki_pflags |= UNP_EIDSBIND;
+               } else if (uds_is_connecting(uds) || uds_is_connected(uds)) {
+                       ki->ki_pflags |= UNP_EIDSVALID;
+               }
+       }
+
+       /* Not sure about NetBSD connection states.  First attempt here. */
+       if (uds_is_connecting(uds)) {
+               ki->ki_sostate = SS_ISCONNECTING;
+       } else if (uds_is_connected(uds)) {
+               ki->ki_sostate = SS_ISCONNECTED;
+       } else if (uds_is_disconnected(uds)) {
+               ki->ki_sostate = SS_ISDISCONNECTED;
+       }
+
+       /* We currently mirror the peer's receive queue size when connected. */
+       ki->ki_rcvq = uds->uds_len;
+       if (uds_is_connected(uds)) {
+               ki->ki_sndq = other->uds_len;
+       }
+
+       /* The source is not set for bound connection-type sockets here. */
+       if (socktype == SOCK_DGRAM || uds_is_listening(uds)) {
+               uds_make_addr(uds->uds_path, (size_t)uds->uds_pathlen,
+                   &ki->ki_src, &addrlen);
+       }
+       if (other != NULL) {
+               uds_make_addr(other->uds_path, (size_t)other->uds_pathlen,
+                   &ki->ki_dst, &addrlen);
+       }
+
+       /* TODO: we should set ki_inode and ki_vnode, but to what? */
+       ki->ki_conn = (uint64_t)(uintptr_t)other;
+       if (!TAILQ_EMPTY(&uds->uds_queue)) {
+               ki->ki_refs =
+                   (uint64_t)(uintptr_t)TAILQ_FIRST(&uds->uds_queue);
+       }
+       if (uds_has_link(uds)) {
+               ki->ki_nextref =
+                   (uint64_t)(uintptr_t)TAILQ_NEXT(uds, uds_next);
+       }
+}
+
+/*
+ * Remote MIB implementation of CTL_NET PF_LOCAL {SOCK_STREAM,SOCK_DGRAM,
+ * SOCK_SEQPACKET} 0.  This one function serves every query on the
+ * "net.local.{stream,dgram,seqpacket}.pcblist" sysctl(7) nodes.  It returns
+ * the number of bytes produced (or needed), or a negative error code.
+ *
+ * Using 0 as the number of "pcblist" is a MINIXism that keeps our arrays
+ * small.  NetBSD assigns these node numbers dynamically, giving them values
+ * above CREATE_BASE.  No userland application can therefore hardcode the
+ * numbers; lookups must be performed by name.  As a result, the 0 may safely
+ * be changed to another number should NetBSD ever introduce statically
+ * numbered nodes in these subtrees.
+ */
+static ssize_t
+net_local_pcblist(struct rmib_call * call, struct rmib_node * node __unused,
+       struct rmib_oldp * oldp, struct rmib_newp * newp __unused)
+{
+       struct kinfo_pcb ki;
+       struct udssock *uds;
+       ssize_t pos;
+       int rc, socktype, reclen, limit;
+
+       if (call->call_namelen != 4)
+               return EINVAL;
+
+       /*
+        * The first two added name fields are not used.  The third is the
+        * size of each returned record, with zero selecting the full
+        * structure size.  The fourth is the maximum number of records to
+        * return; a value of zero or less imposes no limit.
+        */
+       reclen = call->call_name[2];
+       if (reclen < 0 || (size_t)reclen > sizeof(ki))
+               return EINVAL;
+       if (reclen == 0)
+               reclen = sizeof(ki);
+       limit = call->call_name[3];
+
+       socktype = call->call_oname[2];
+
+       pos = 0;
+
+       uds = uds_enum(NULL, socktype);
+       while (uds != NULL) {
+               if (rmib_inrange(oldp, pos)) {
+                       /* Start from zeroes: not all fields get filled in. */
+                       memset(&ki, 0, sizeof(ki));
+                       uds_get_info(&ki, uds);
+
+                       rc = rmib_copyout(oldp, pos, &ki, reclen);
+                       if (rc < 0)
+                               return rc;
+               }
+
+               pos += reclen;
+
+               /* Stop as soon as the requested record count is reached. */
+               if (limit > 0 && --limit == 0)
+                       break;
+
+               uds = uds_enum(uds, socktype);
+       }
+
+       /*
+        * Add a margin to limit the possible effects of the inherent race
+        * condition between a call that obtains just the data size and a
+        * later call that obtains the actual data.
+        */
+       if (oldp == NULL)
+               pos += PCB_SLOP * reclen;
+
+       return pos;
+}
+
+/*
+ * The CTL_NET PF_LOCAL SOCK_STREAM subtree.  Its only node, number 0, is the
+ * read-only "pcblist" node, which is handled by net_local_pcblist().
+ */
+static struct rmib_node net_local_stream_table[] = {
+       [0]     = RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0, net_local_pcblist,
+                   "pcblist", "SOCK_STREAM protocol control block list"),
+};
+
+/*
+ * The CTL_NET PF_LOCAL SOCK_DGRAM subtree.  Its only node, number 0, is the
+ * read-only "pcblist" node, which is handled by net_local_pcblist().
+ */
+static struct rmib_node net_local_dgram_table[] = {
+       [0]     = RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0, net_local_pcblist,
+                   "pcblist", "SOCK_DGRAM protocol control block list"),
+};
+
+/*
+ * The CTL_NET PF_LOCAL SOCK_SEQPACKET subtree.  Its only node, number 0, is
+ * the read-only "pcblist" node, which is handled by net_local_pcblist().
+ */
+static struct rmib_node net_local_seqpacket_table[] = {
+       [0]     = RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0, net_local_pcblist,
+                   "pcblist", "SOCK_SEQPACKET protocol control block list"),
+};
+
+/*
+ * The CTL_NET PF_LOCAL subtree.  The table is indexed by socket type, so that
+ * net_local_pcblist() can take the socket type straight from the name of the
+ * node being queried.  The "local" root node that refers to this table is
+ * defined immediately after it; that root node is the one that is registered
+ * with, and deregistered from, the MIB service.
+ */
+static struct rmib_node net_local_table[] = {
+/* 1*/ [SOCK_STREAM]           = RMIB_NODE(RMIB_RO, net_local_stream_table,
+                                   "stream", "SOCK_STREAM settings"),
+/* 2*/ [SOCK_DGRAM]            = RMIB_NODE(RMIB_RO, net_local_dgram_table,
+                                   "dgram", "SOCK_DGRAM settings"),
+/* 5*/ [SOCK_SEQPACKET]        = RMIB_NODE(RMIB_RO, net_local_seqpacket_table,
+                                   "seqpacket", "SOCK_SEQPACKET settings"),
+};
+
+static struct rmib_node net_local_node =
+    RMIB_NODE(RMIB_RO, net_local_table, "local", "PF_LOCAL related settings");
+
+/*
+ * Set up the status module, by registering the "net.local" subtree with the
+ * MIB service.  Failure to do so is fatal.
+ */
+void
+uds_stat_init(void)
+{
+       const int path[] = { CTL_NET, PF_LOCAL };
+       int rc;
+
+       /*
+        * The registration call reports local failures only; failures on the
+        * remote side (in the MIB service) are silently ignored.  Hence, it is
+        * safe to panic if the call does fail.
+        */
+       rc = rmib_register(path, __arraycount(path), &net_local_node);
+
+       if (rc != OK)
+               panic("UDS: unable to register remote MIB tree: %d", rc);
+}
+
+/*
+ * Clean up the status module.  This deregisters the "net.local" subtree that
+ * uds_stat_init() registered with the MIB service.
+ */
+void
+uds_stat_cleanup(void)
+{
+
+       rmib_deregister(&net_local_node);
+}
diff --git a/minix/net/uds/uds.8 b/minix/net/uds/uds.8

deleted file mode 100644 (file)

index 2484ea7..0000000
--- a/minix/net/uds/uds.8
+++ /dev/null
@@ -1,15 +0,0 @@
-.TH UDS 8
-.SH NAME
-uds \- unix domain sockets device
-.SH DESCRIPTION
-The \fIuds\fP device gives access to the unix domain socket services in 
-Minix. It is a virtual device similar to the \fItcp\fP and \fIudp\fP 
-Internet Protocol server devices.
-.SH SEE ALSO
-.BR socket(2),
-.BR socketpair(2),
-.BR dev(4),
-.BR ip(4),
-.BR unix(8)
-.SH HISTORY
-This device first appeared in Minix 3.1.8.
diff --git a/minix/net/uds/uds.c b/minix/net/uds/uds.c

index baca3c1ed5b697095f2b2cd5fd322c28fd837118..2052a2ac2d5da8f572025a6a1c32f7c1f8978fce 100644 (file)
--- a/minix/net/uds/uds.c
+++ b/minix/net/uds/uds.c
@@ -1,740 +1,1376 @@
-/*
- * Unix Domain Sockets Implementation (PF_UNIX, PF_LOCAL)
- * This code handles requests generated by operations on /dev/uds
- *
- * The interface to UNIX domain sockets is similar to the interface to network
- * sockets. There is a character device (/dev/uds) and this server is a
- * 'driver' for that device.
- */
+/* UNIX Domain Sockets - uds.c - socket management */
  
  #include "uds.h"
  
-static ssize_t uds_perform_write(devminor_t, endpoint_t, cp_grant_id_t, size_t,
-       int);
-
-static int uds_open(devminor_t, int, endpoint_t);
-static int uds_close(devminor_t);
-static ssize_t uds_read(devminor_t, u64_t, endpoint_t, cp_grant_id_t, size_t,
-       int, cdev_id_t);
-static ssize_t uds_write(devminor_t, u64_t, endpoint_t, cp_grant_id_t, size_t,
-       int, cdev_id_t);
-static int uds_ioctl(devminor_t, unsigned long, endpoint_t, cp_grant_id_t, int,
-       endpoint_t, cdev_id_t);
-static int uds_cancel(devminor_t, endpoint_t, cdev_id_t);
-static int uds_select(devminor_t, unsigned int, endpoint_t);
-
-static struct chardriver uds_tab = {
-       .cdr_open       = uds_open,
-       .cdr_close      = uds_close,
-       .cdr_read       = uds_read,
-       .cdr_write      = uds_write,
-       .cdr_ioctl      = uds_ioctl,
-       .cdr_cancel     = uds_cancel,
-       .cdr_select     = uds_select
-};
+static struct udssock uds_array[NR_UDSSOCK];
+static TAILQ_HEAD(uds_freelist, udssock) uds_freelist;
+static unsigned int uds_in_use;
+static int uds_running;
  
-/* File Descriptor Table */
-uds_fd_t uds_fd_table[NR_FDS];
+static const struct sockevent_ops uds_ops;
  
-static unsigned int uds_exit_left;
+static SLIST_HEAD(udshash, udssock) udshash[UDSHASH_SLOTS];
  
-static int
-uds_open(devminor_t UNUSED(orig_minor), int access,
-       endpoint_t user_endpt)
+/*
+ * Initialize file-to-socket hash table.  Each of the slots of the 'udshash'
+ * array is initialized as an empty singly-linked list.
+ */
+static void
+udshash_init(void)
  {
-       devminor_t minor;
-       char *buf;
-       int i;
+       unsigned int slot;
  
-       dprintf(("UDS: uds_open() from %d\n", user_endpt));
+       for (slot = 0; slot < __arraycount(udshash); slot++)
+               SLIST_INIT(&udshash[slot]);
+}
  
-       /*
-        * Find a slot in the descriptor table for the new descriptor.
-        * The index of the descriptor in the table will be returned.
-        * Subsequent calls to read/write/close/ioctl/etc will use this
-        * minor number.  The minor number must be different from the
-        * the /dev/uds device's minor number (0).
-        */
-       for (minor = 1; minor < NR_FDS; minor++)
-               if (uds_fd_table[minor].state == UDS_FREE)
-                       break;
+/*
+ * Return a hash table slot number for the given <dev,ino> pair.  The given
+ * device number must not be NO_DEV and the given inode number must not be
+ * zero; both are asserted.  The exclusive-or of the two numbers is converted
+ * to an unsigned int and then reduced modulo UDSHASH_SLOTS.
+ */
+static unsigned int
+udshash_slot(dev_t dev, ino_t ino)
+{
  
-       if (minor == NR_FDS)
-               return ENFILE;
+       assert(dev != NO_DEV);
+       assert(ino != 0);
  
         /*
-        * Allocate memory for the ringer buffer.  In order to save on memory
-        * in the common case, the buffer is allocated only when the socket is
-        * in use.  We use mmap instead of malloc to allow the memory to be
-        * actually freed later.
+        * Effectively combining two 64-bit numbers into a single 6-or-so-bit
+        * hash is not too easy.  This hash function is probably among the
+        * worst options.  Then again it is not all that critical as we are not
+        * expecting that many bound UDS sockets in the system anyway.
          */
-       if ((buf = mmap(NULL, UDS_BUF, PROT_READ | PROT_WRITE,
-           MAP_ANON | MAP_PRIVATE, -1, 0)) == MAP_FAILED)
-               return ENOMEM;
+       return (unsigned int)(dev ^ ino) % UDSHASH_SLOTS;
+}
  
-       /*
-        * Allocate the socket, and set its initial parameters.
-        */
-       uds_fd_table[minor].state = UDS_INUSE;
-       uds_fd_table[minor].owner = user_endpt;
-       uds_fd_table[minor].sel_endpt = NONE;
-       uds_fd_table[minor].sel_ops = 0;
-       uds_fd_table[minor].buf = buf;
-       uds_fd_table[minor].pos = 0;
-       uds_fd_table[minor].size = 0;
-       uds_fd_table[minor].mode = UDS_R | UDS_W;
-       uds_fd_table[minor].type = -1;
+/*
+ * Look for a socket that is bound to the given <dev,ino> pair.  Return a
+ * pointer to the socket if found, or NULL otherwise.  Only the list of the
+ * slot computed by udshash_slot() is searched.  As that function asserts its
+ * input, 'dev' must not be NO_DEV and 'ino' must not be zero.
+ */
+static struct udssock *
+udshash_get(dev_t dev, ino_t ino)
+{
+       struct udssock *uds;
+       unsigned int slot;
  
-       for (i = 0; i < UDS_SOMAXCONN; i++)
-               uds_fd_table[minor].backlog[i] = -1;
-       uds_fd_table[minor].backlog_size = UDS_SOMAXCONN;
+       slot = udshash_slot(dev, ino);
  
-       memset(&uds_fd_table[minor].ancillary_data, '\0',
-           sizeof(struct ancillary));
-       for (i = 0; i < OPEN_MAX; i++)
-               uds_fd_table[minor].ancillary_data.fds[i] = -1;
+       SLIST_FOREACH(uds, &udshash[slot], uds_hash) {
+               if (uds->uds_dev == dev && uds->uds_ino == ino)
+                       return uds;
+       }
  
-       uds_fd_table[minor].stale = FALSE;
-       uds_fd_table[minor].listening = FALSE;
-       uds_fd_table[minor].peer = -1;
-       uds_fd_table[minor].child = -1;
+       return NULL;
+}
  
-       memset(&uds_fd_table[minor].addr, '\0', sizeof(struct sockaddr_un));
-       memset(&uds_fd_table[minor].source, '\0', sizeof(struct sockaddr_un));
-       memset(&uds_fd_table[minor].target, '\0', sizeof(struct sockaddr_un));
+/*
+ * Add a socket to the file-to-socket hash table.  The socket must have its
+ * device and inode fields set, and must not be in the hash table already.
+ * The socket is inserted at the head of the list of the slot that
+ * udshash_slot() computes from those two fields.
+ */
+static void
+udshash_add(struct udssock * uds)
+{
+       unsigned int slot;
  
-       uds_fd_table[minor].suspended = UDS_NOT_SUSPENDED;
+       slot = udshash_slot(uds->uds_dev, uds->uds_ino);
  
-       return CDEV_CLONED | minor;
+       SLIST_INSERT_HEAD(&udshash[slot], uds, uds_hash);
  }
  
+/*
+ * Remove a socket from the file-to-socket hash table.  The socket must be in
+ * the hash table.  Its device and inode fields are used to recompute the slot
+ * it was added to, so they must still have the values they had at that time.
+ */
  static void
-uds_reset(devminor_t minor)
+udshash_del(struct udssock * uds)
  {
-       /* Disconnect the socket from its peer. */
-       uds_fd_table[minor].peer = -1;
+       unsigned int slot;
  
-       /* Set an error to pass to the caller. */
-       uds_fd_table[minor].err = ECONNRESET;
+       slot = udshash_slot(uds->uds_dev, uds->uds_ino);
  
-       /* If a process was blocked on I/O, revive it. */
-       if (uds_fd_table[minor].suspended != UDS_NOT_SUSPENDED)
-               uds_unsuspend(minor);
+       /* This macro is O(n). */
+       SLIST_REMOVE(&udshash[slot], uds, udssock, uds_hash);
+}
  
-       /* All of the peer's calls will fail immediately now. */
-       if (uds_fd_table[minor].sel_ops != 0) {
-               chardriver_reply_select(uds_fd_table[minor].sel_endpt, minor,
-                   uds_fd_table[minor].sel_ops);
-               uds_fd_table[minor].sel_ops = 0;
-       }
+/*
+ * Return the socket identifier for the given UDS socket object.  The
+ * identifier is the index of the object within the 'uds_array' array, so the
+ * given pointer must point to an element of that array.
+ */
+sockid_t
+uds_get_id(struct udssock * uds)
+{
+
+       return (sockid_t)(uds - uds_array);
  }
  
-static int
-uds_close(devminor_t minor)
+/*
+ * Given either NULL or a previously returned socket, return the next in-use
+ * UDS socket of the given socket type, or NULL if there are no more matches.
+ * Callers should not rely on any particular order (the current implementation
+ * scans the socket array by increasing socket identifier), but each matching
+ * socket is returned exactly once (until any socket is allocated or freed).
+ */
+struct udssock *
+uds_enum(struct udssock * prev, int type)
  {
-       int i, peer;
+       sockid_t id;
  
-       dprintf(("UDS: uds_close(%d)\n", minor));
+       if (prev != NULL)
+               id = uds_get_id(prev) + 1;
+       else
+               id = 0;
  
-       if (minor < 0 || minor >= NR_FDS) return ENXIO;
+       for (; id < NR_UDSSOCK; id++)
+               if ((uds_array[id].uds_flags & UDSF_IN_USE) &&
+                   uds_get_type(&uds_array[id]) == type)
+                       return &uds_array[id];
  
-       if (uds_fd_table[minor].state != UDS_INUSE)
-               return EINVAL;
+       return NULL;
+}
  
-       peer = uds_fd_table[minor].peer;
-       if (peer != -1 && uds_fd_table[peer].peer == -1) {
-               /* Connecting socket: clear from server's backlog. */
-               if (!uds_fd_table[peer].listening)
-                       panic("connecting socket attached to non-server");
-
-               for (i = 0; i < uds_fd_table[peer].backlog_size; i++) {
-                       if (uds_fd_table[peer].backlog[i] == minor) {
-                               uds_fd_table[peer].backlog[i] = -1;
-                               break;
-                       }
-               }
-       } else if (peer != -1) {
-               /* Connected socket: disconnect it. */
-               uds_reset(peer);
-       } else if (uds_fd_table[minor].listening) {
-               /* Listening socket: disconnect all sockets in the backlog. */
-               for (i = 0; i < uds_fd_table[minor].backlog_size; i++)
-                       if (uds_fd_table[minor].backlog[i] != -1)
-                               uds_reset(uds_fd_table[minor].backlog[i]);
-       }
+/*
+ * Invalidate credentials on the socket.  The process, user, and group ID
+ * fields are all set to -1.  Other code tests the process ID field against -1
+ * to see whether credentials are present.
+ */
+static void
+uds_clear_cred(struct udssock * uds)
+{
  
-       if (uds_fd_table[minor].ancillary_data.nfiledes > 0)
-               uds_clear_fds(minor, &uds_fd_table[minor].ancillary_data);
+       uds->uds_cred.unp_pid = -1;
+       uds->uds_cred.unp_euid = -1;
+       uds->uds_cred.unp_egid = -1;
+}
  
-       /* Release the memory for the ring buffer. */
-       munmap(uds_fd_table[minor].buf, UDS_BUF);
+/*
+ * Obtain the credentials (process, user, and group ID) of the given user
+ * endpoint and associate them with the socket for later retrieval.  It is
+ * important to note that this information is obtained once at connect time,
+ * and never updated later.  The party receiving the credentials must take this
+ * into account.  If the credentials cannot be obtained, a message is printed
+ * and the credentials on the socket are invalidated; the failure is not
+ * reported to the caller.
+ */
+static void
+uds_get_cred(struct udssock * uds, endpoint_t user_endpt)
+{
+       int r;
  
-       /* Set the socket back to its original UDS_FREE state. */
-       memset(&uds_fd_table[minor], '\0', sizeof(uds_fd_t));
+       if ((uds->uds_cred.unp_pid = r = getepinfo(user_endpt,
+           &uds->uds_cred.unp_euid, &uds->uds_cred.unp_egid)) < 0) {
+               printf("UDS: failed obtaining credentials of %d (%d)\n",
+                   user_endpt, r);
  
-       /* If terminating, and this was the last open socket, exit now. */
-       if (uds_exit_left > 0) {
-               if (--uds_exit_left == 0)
-                       chardriver_terminate();
+               uds_clear_cred(uds);
         }
+}
  
+/*
+ * Allocate and initialize a UDS socket.  On success, return OK with a pointer
+ * to the new socket in 'udsp'.  On failure, return a negative error code:
+ * ENOBUFS if no free socket objects are left, or the error returned by
+ * uds_io_setup().  In the failure case, 'udsp' is not written to, and the
+ * candidate socket object remains on the free list, marked as not in use.
+ */
+static int
+uds_alloc(struct udssock ** udsp)
+{
+       struct udssock *uds;
+       int r;
+
+       /* Allocate, initialize, and return a UNIX domain socket object. */
+       if (TAILQ_EMPTY(&uds_freelist))
+               return ENOBUFS;
+
+       uds = TAILQ_FIRST(&uds_freelist);
+
+       uds->uds_conn = NULL;           /* not connected */
+       uds->uds_link = NULL;           /* not connecting or linked */
+       uds->uds_queued = 0;
+       uds->uds_flags = UDSF_IN_USE;   /* may be found through enumeration */
+       uds->uds_pathlen = 0;           /* not bound: no path */
+       uds->uds_dev = NO_DEV;          /* not hashed: no socket file device */
+       uds->uds_ino = 0;               /* not hashed: no socket file inode */
+       uds_clear_cred(uds);            /* no bind/connect-time credentials */
+       TAILQ_INIT(&uds->uds_queue);    /* an empty queue */
+
+       if ((r = uds_io_setup(uds)) != OK) {
+               /*
+                * The object stays on the free list, so it must not remain
+                * marked as in use: uds_enum() would otherwise report a
+                * socket that was never handed out.
+                */
+               uds->uds_flags = 0;
+
+               return r;
+       }
+
+       TAILQ_REMOVE(&uds_freelist, uds, uds_next);
+
+       assert(uds_in_use < NR_UDSSOCK);
+       uds_in_use++;
+
+       *udsp = uds;
         return OK;
  }
  
-static int
-uds_select(devminor_t minor, unsigned int ops, endpoint_t endpt)
+/*
+ * Free a previously allocated socket.  The socket is given as a generic
+ * socket object pointer, which is cast back to the UDS socket object.  The
+ * socket's I/O state is cleaned up, its flags are cleared so that it is no
+ * longer found through enumeration, and it is put at the head of the free
+ * list.  If this was the last socket in use while 'uds_running' is FALSE,
+ * sef_cancel() is called.
+ */
+static void
+uds_free(struct sock * sock)
  {
-       unsigned int ready_ops;
-       int i, bytes, watch;
+       struct udssock *uds = (struct udssock *)sock;
  
-       dprintf(("UDS: uds_select(%d)\n", minor));
+       uds_io_cleanup(uds);
  
-       if (minor < 0 || minor >= NR_FDS) return ENXIO;
+       uds->uds_flags = 0;             /* no longer in use */
  
-       if (uds_fd_table[minor].state != UDS_INUSE)
-               return EINVAL;
+       TAILQ_INSERT_HEAD(&uds_freelist, uds, uds_next);
  
-       watch = (ops & CDEV_NOTIFY);
-       ops &= (CDEV_OP_RD | CDEV_OP_WR | CDEV_OP_ERR);
-
-       ready_ops = 0;
-
-       /* Check if there is data available to read. */
-       if (ops & CDEV_OP_RD) {
-               bytes = uds_perform_read(minor, NONE, GRANT_INVALID, 1, 1);
-               if (bytes > 0) {
-                       ready_ops |= CDEV_OP_RD;        /* data available */
-               } else if (uds_fd_table[minor].listening == TRUE) {
-                       /* Check for pending connections. */
-                       for (i = 0; i < uds_fd_table[minor].backlog_size; i++)
-                       {
-                               if (uds_fd_table[minor].backlog[i] != -1) {
-                                       ready_ops |= CDEV_OP_RD;
-                                       break;
-                               }
-                       }
-               } else if (bytes != EDONTREPLY) {
-                       ready_ops |= CDEV_OP_RD;        /* error */
-               }
+       assert(uds_in_use > 0);
+       if (--uds_in_use == 0 && uds_running == FALSE)
+               sef_cancel();
+}
+
+/*
+ * Create a new socket.  The domain must be PF_UNIX, the type must be one of
+ * SOCK_STREAM, SOCK_SEQPACKET, and SOCK_DGRAM, and the protocol must be
+ * UDSPROTO_UDS.  On success, store a pointer to the generic socket object in
+ * 'sockp' and a pointer to the UDS callback table in 'ops', and return the
+ * identifier of the new socket.  On failure, return EAFNOSUPPORT, EPROTOTYPE,
+ * EPROTONOSUPPORT, or the error code returned by uds_alloc().
+ */
+static sockid_t
+uds_socket(int domain, int type, int protocol, endpoint_t user_endpt __unused,
+       struct sock ** sockp, const struct sockevent_ops ** ops)
+{
+       struct udssock *uds;
+       int r;
+
+       dprintf(("UDS: socket(%d,%d,%d)\n", domain, type, protocol));
+
+       if (domain != PF_UNIX) {
+               /* This means the service was configured incorrectly. */
+               printf("UDS: got request for domain %d\n", domain);
+
+               return EAFNOSUPPORT;
         }
  
-       /* Check if we can write without blocking. */
-       if (ops & CDEV_OP_WR) {
-               bytes = uds_perform_write(minor, NONE, GRANT_INVALID, 1, 1);
-               if (bytes != 0 && bytes != EDONTREPLY)
-                       ready_ops |= CDEV_OP_WR;
+       /* We support the following three socket types. */
+       switch (type) {
+       case SOCK_STREAM:
+       case SOCK_SEQPACKET:
+       case SOCK_DGRAM:
+               break;
+       default:
+               return EPROTOTYPE;
         }
  
         /*
-        * If not all requested ops were ready, and the caller requests to be
-        * notified about changes, we add the remaining ops to the saved set.
+        * The PF_UNIX domain does not support particular protocols, so the
+        * given protocol must be zero (= anything that matches).
          */
-       ops &= ~ready_ops;
-       if (ops && watch) {
-               uds_fd_table[minor].sel_endpt = endpt;
-               uds_fd_table[minor].sel_ops |= ops;
-       }
+       if (protocol != UDSPROTO_UDS)
+               return EPROTONOSUPPORT;
  
-       return ready_ops;
+       if ((r = uds_alloc(&uds)) != OK)
+               return r;
+
+       dprintf(("UDS: socket returns %d\n", uds_get_id(uds)));
+
+       *sockp = &uds->uds_sock;
+       *ops = &uds_ops;
+       return uds_get_id(uds);
  }
  
-ssize_t
-uds_perform_read(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant,
-       size_t size, int pretend)
+/*
+ * Connect a pair of sockets.  Return EOPNOTSUPP if the first socket is a
+ * datagram socket; only the type of the first socket is checked here.
+ * Otherwise, make each socket the connection peer of the other, mark both as
+ * connected, associate the credentials of 'user_endpt' with both sockets, and
+ * return OK.
+ */
+static int
+uds_pair(struct sock * sock1, struct sock * sock2, endpoint_t user_endpt)
  {
-       size_t pos, subsize;
-       int r, peer;
+       struct udssock *uds1 = (struct udssock *)sock1;
+       struct udssock *uds2 = (struct udssock *)sock2;
  
-       dprintf(("UDS: uds_perform_read(%d)\n", minor));
+       dprintf(("UDS: pair(%d,%d)\n", uds_get_id(uds1), uds_get_id(uds2)));
  
-       peer = uds_fd_table[minor].peer;
+       /* Only connection-oriented types are acceptable. */
+       if (uds_get_type(uds1) == SOCK_DGRAM)
+               return EOPNOTSUPP;
  
-       /* Skip reads of zero bytes. */
-       if (size == 0)
-               return 0;
+       /* Connect the sockets. */
+       uds1->uds_conn = uds2;
+       uds2->uds_conn = uds1;
+       uds1->uds_flags |= UDSF_CONNECTED;
+       uds2->uds_flags |= UDSF_CONNECTED;
  
-       /* Check if the socket isn't shut down for reads. */
-       if (!(uds_fd_table[minor].mode & UDS_R))
-               return EPIPE;
+       /* Obtain the (same) credentials for both sides of the connection. */
+       uds_get_cred(uds1, user_endpt);
+       memcpy(&uds2->uds_cred, &uds1->uds_cred, sizeof(uds2->uds_cred));
  
-       if (uds_fd_table[minor].size == 0) {
-               if (peer == -1) {
-                       /*
-                        * We're not connected. That's only a problem when this
-                        * socket is connection oriented.
-                        */
-                       if (uds_fd_table[minor].type == SOCK_STREAM ||
-                           uds_fd_table[minor].type == SOCK_SEQPACKET) {
-                               if (uds_fd_table[minor].err == ECONNRESET) {
-                                       if (!pretend)
-                                               uds_fd_table[minor].err = 0;
-                                       return ECONNRESET;
-                               } else
-                                       return ENOTCONN;
-                       }
+       return OK;
+}
+
+/*
+ * Break the connection between the socket 'uds' and its peer.  What happens
+ * to the peer depends on 'was_linked', which tells whether 'uds' was linked,
+ * that is, on the accept queue of a listening socket: if so, the peer is
+ * freed up; if not, the peer is notified of the disconnection.
+ */
+static void
+uds_disconnect(struct udssock * uds, int was_linked)
+{
+       struct udssock *other;
+
+       assert(uds_is_connected(uds));
+       assert(uds_has_conn(uds));
+
+       other = uds->uds_conn;
+
+       assert(uds_is_connected(other));
+       assert(uds_has_conn(other));
+       assert(!uds_has_link(other));
+       assert(other->uds_conn == uds);
+
+       /* Sever the connection in both directions. */
+       other->uds_conn = NULL;
+       uds->uds_conn = NULL;
+
+       if (was_linked) {
+               /*
+                * A linked socket is a connected socket for which the other
+                * end has been created but not yet accepted.  That other end
+                * has to be freed up.
+                */
+               sockevent_raise(&other->uds_sock, SEV_CLOSE);
+
+               return;
+       }
+
+       /*
+        * The other end is a regular user-created socket, which must be
+        * transitioned into disconnected state properly.
+        */
+       sockevent_raise(&other->uds_sock, SEV_SEND | SEV_RECV);
+
+       /*
+        * Invalidate the peer credentials, so that they cannot be mistaken
+        * for credentials that were obtained at bind time.
+        */
+       uds_clear_cred(other);
+}
+
+/*
+ * Append the socket 'link' to the tail of the queue of the socket 'uds', and
+ * make 'uds' the link socket of 'link'.
+ */
+static void
+uds_add_queue(struct udssock * uds, struct udssock * link)
+{
+
+       dprintf(("UDS: add_queue(%d,%d)\n",
+           uds_get_id(uds), uds_get_id(link)));
+
+       link->uds_link = uds;
+
+       TAILQ_INSERT_TAIL(&uds->uds_queue, link, uds_next);
+
+       /* The assertion catches wraparound of the queue length counter. */
+       ++uds->uds_queued;
+       assert(uds->uds_queued != 0);
+}
+
+/*
+ * Take the socket 'link' off the queue of the socket 'uds'.  This also resets
+ * the link socket of 'link' to NULL.  The socket 'link' must currently be
+ * linked to 'uds'.
+ */
+static void
+uds_del_queue(struct udssock * uds, struct udssock * link)
+{
+
+       dprintf(("UDS: del_queue(%d,%d)\n",
+           uds_get_id(uds), uds_get_id(link)));
+
+       assert(link->uds_link == uds);
+
+       TAILQ_REMOVE(&uds->uds_queue, link, uds_next);
+
+       /* The queue length counter must match the queue contents. */
+       assert(uds->uds_queued > 0);
+       --uds->uds_queued;
+
+       link->uds_link = NULL;
+}
+
+/*
+ * Remove all sockets from the queue of the socket 'uds', with the exception of
+ * 'except' if non-NULL.  Raise an ECONNRESET error on all removed sockets that
+ * are not equal to 'uds'.
+ */
+static void
+uds_clear_queue(struct udssock * uds, struct udssock * except)
+{
+       struct udssock *link, *tmp;
+       int found;
+
+       dprintf(("UDS: clear_queue(%d,%d)\n",
+           uds_get_id(uds), (except != NULL) ? uds_get_id(except) : -1));
+
+       found = 0;
+
+       /*
+        * Abort all connecting sockets queued on this socket, except for the
+        * given exception, which may be NULL.
+        */
+       TAILQ_FOREACH_SAFE(link, &uds->uds_queue, uds_next, tmp) {
+               if (link == except) {
+                       found++;
+
+                       continue;
                 }
  
-               /* Check if process is reading from a closed pipe. */
-               if (peer != -1 && !(uds_fd_table[peer].mode & UDS_W) &&
-                   uds_fd_table[minor].size == 0)
-                       return 0;
+               dprintf(("UDS: clear_queue removes %d\n", uds_get_id(link)));
  
-               if (pretend)
-                       return EDONTREPLY;
+               assert(uds_get_type(link) == SOCK_DGRAM ||
+                   uds_is_connecting(link) || uds_is_connected(link));
  
-               if (peer != -1 &&
-                       uds_fd_table[peer].suspended == UDS_SUSPENDED_WRITE)
-                       panic("writer blocked on empty socket");
+               uds_del_queue(uds, link);
  
-               dprintf(("UDS: suspending read request on %d\n", minor));
+               /*
+                * Generate an error only if the socket was not linked to
+                * itself (only datagram sockets can be linked to themselves).
+                * The error is not helpful for applications in that case.
+                */
+               if (uds != link)
+                       sockevent_set_error(&link->uds_sock, ECONNRESET);
  
-               /* Process is reading from an empty pipe.  Suspend it. */
-               return EDONTREPLY;
+               /*
+                * If this is a listening socket, disconnect the connecting or
+                * connected end.  If a connected peer was already created for
+                * the queued socket, dispose of that peer.
+                *
+                * Clear credentials obtained when starting to connect (in
+                * which case the socket is always a connection-oriented
+                * socket), so that they will not be mistaken for credentials
+                * obtained at bind time.
+                */
+               if (uds_get_type(link) != SOCK_DGRAM) {
+                       if (uds_is_connected(link))
+                               uds_disconnect(link, TRUE /*was_linked*/);
+                       else
+                               uds_clear_cred(link);
+               }
         }
  
-       /* How much can we get from the ring buffer? */
-       if (size > uds_fd_table[minor].size)
-               size = uds_fd_table[minor].size;
+       assert(uds->uds_queued == found);
+}
  
-       if (pretend)
-               return size;
+/*
+ * Check whether the socket address given in 'addr', with length 'addr_len', is
+ * a valid UNIX domain socket address (including a path to a socket file).  On
+ * success, return the (non-zero) length of the socket file's path, minus the
+ * null terminator which may in fact not be present.  The caller is responsible
+ * for copying and terminating the path as needed.  A pointer to the path as
+ * stored in 'addr' is returned in 'pathp'.  On failure, return an error code.
+ */
+static int
+uds_check_addr(const struct sockaddr * addr, socklen_t addr_len,
+       const char ** pathp)
+{
+       const char *p;
+       size_t len;
  
-       /* Get the data from the tail of the ring buffer. */
-       pos = uds_fd_table[minor].pos;
+       /*
+        * We could cast to a sockaddr_un structure pointer first, but that
+        * would not provide any benefits here.  Instead, we use sa_data as the
+        * generic equivalent of sun_path.
+        */
+       if (addr_len < offsetof(struct sockaddr, sa_data))
+               return EINVAL;
  
-       subsize = UDS_BUF - pos;
-       if (subsize > size)
-               subsize = size;
+       if (addr->sa_family != AF_UNIX)
+               return EAFNOSUPPORT;
  
-       if ((r = sys_safecopyto(endpt, grant, 0,
-           (vir_bytes) &uds_fd_table[minor].buf[pos], subsize)) != OK)
-               return r;
+       len = (size_t)addr_len - offsetof(struct sockaddr, sa_data);
+       if (len > 0 && (p = memchr(addr->sa_data, '\0', len)) != NULL)
+               len = (size_t)(p - addr->sa_data);
  
-       if (subsize < size) {
-               if ((r = sys_safecopyto(endpt, grant, subsize,
-                   (vir_bytes) uds_fd_table[minor].buf,
-                   size - subsize)) != OK)
-                       return r;
+       /* The given path name must not be an empty string. */
+       if (len == 0)
+               return ENOENT;
+
+       /* This check should be redundant but better safe than sorry. */
+       if (len >= UDS_PATH_MAX)
+               return EINVAL;
+
+       *pathp = (const char *)addr->sa_data;
+       return len;
+}
+
+/*
+ * Given the socket file path in 'path' with length 'len' (not
+ * necessarily null terminated), store a socket address with the path in
+ * 'addr', and return the socket address length in 'addr_len'.  The calling
+ * libraries (libsockdriver, libsockevent) and the static assert in uds.h
+ * guarantee that 'addr' is sufficiently large to store any address we generate
+ * here.  The libraries may subsequently copy out only a part of it to the user
+ * process.  This function always succeeds.
+ */
+void
+uds_make_addr(const char * path, size_t len, struct sockaddr * addr,
+       socklen_t * addr_len)
+{
+
+       /*
+        * Generate the address.  The stored length (sa_len/sun_len) does not
+        * include a null terminator.  The entire structure does include a null
+        * terminator, but only if the socket is bound.
+        */
+       addr->sa_len = offsetof(struct sockaddr, sa_data) + len;
+       addr->sa_family = AF_UNIX;
+       if (len > 0) {
+               /* This call may (intentionally) overrun the sa_data size. */
+               memcpy((char *)addr->sa_data, path, len);
+               ((char *)addr->sa_data)[len] = '\0';
+
+               /* The socket is bound, so include the null terminator. */
+               len++;
+               assert(len <= UDS_PATH_MAX);
         }
  
-       /* Advance the buffer tail. */
-       uds_fd_table[minor].pos = (pos + size) % UDS_BUF;
-       uds_fd_table[minor].size -= size;
-
-       /* Reset position if the buffer is empty (it may save a copy call). */
-       if (uds_fd_table[minor].size == 0)
-               uds_fd_table[minor].pos = 0;
-
-       /* See if we can wake up a blocked writer. */
-       if (peer != -1 && uds_fd_table[peer].suspended == UDS_SUSPENDED_WRITE)
-               uds_unsuspend(peer);
-
-       /* See if we can satisfy an ongoing select. */
-       if (peer != -1 && (uds_fd_table[peer].sel_ops & CDEV_OP_WR) &&
-           uds_fd_table[minor].size < UDS_BUF) {
-               /* A write on the peer is possible now. */
-               chardriver_reply_select(uds_fd_table[peer].sel_endpt, peer,
-                   CDEV_OP_WR);
-               uds_fd_table[peer].sel_ops &= ~CDEV_OP_WR;
+       /* Note that this length may be different from sa_len/sun_len now. */
+       *addr_len = offsetof(struct sockaddr, sa_data) + len;
+}
+
+/*
+ * Bind a socket to a local address.
+ */
+static int
+uds_bind(struct sock * sock, const struct sockaddr * addr, socklen_t addr_len,
+       endpoint_t user_endpt)
+{
+       struct udssock *uds = (struct udssock *)sock;
+       struct udssock *uds2;
+       const char *path;
+       size_t len;
+       dev_t dev;
+       ino_t ino;
+       int r;
+
+       dprintf(("UDS: bind(%d)\n", uds_get_id(uds)));
+
+       /* A socket may be bound at any time, but only once. */
+       if (uds_is_bound(uds))
+               return EINVAL;
+
+       /* Verify that the user gave us an acceptable address. */
+       if ((r = uds_check_addr(addr, addr_len, &path)) < 0)
+               return r;
+       len = (size_t)r;
+
+       /* Attempt to create the socket file on the file system. */
+       r = socketpath(user_endpt, path, len, SPATH_CREATE, &dev, &ino);
+       if (r != OK)
+               return r;
+       assert(dev != NO_DEV && ino != 0);
+
+       /*
+        * It is possible that a socket file of a previously bound socket was
+        * unlinked, and due to inode number reuse, a new socket file has now
+        * been created with the same <dev,ino> pair.  In that case, we must
+        * unbind the old socket, because it must no longer be found.  The old
+        * socket will still have a path (and behave as though it is bound) but
+        * no longer be found through hash lookups.
+        */
+       if ((uds2 = udshash_get(dev, ino)) != NULL) {
+               udshash_del(uds2);
+
+               uds2->uds_dev = NO_DEV;
+               uds2->uds_ino = 0;
         }
  
-       return size; /* number of bytes read */
+       /*
+        * Obtain credentials for the socket, unless the socket is already
+        * connecting or connected, in which case we must not replace the
+        * credentials we obtained already.  We later clear those credentials
+        * upon a connection failure or disconnect, so that if the socket is
+        * then put in listening mode, we know there are no bind-time
+        * credentials.  Not ideal, but we really need two separate sets of
+        * credentials if we want to get this right, which is a waste of memory
+        * as no sane application writer would ever rely on credential passing
+        * after recycling a socket.
+        */
+       if (uds_get_type(uds) != SOCK_DGRAM && !uds_is_connecting(uds) &&
+           !uds_is_connected(uds))
+               uds_get_cred(uds, user_endpt);
+
+       /* Assign the address to the socket. */
+       uds->uds_pathlen = len;
+       memcpy(&uds->uds_path, path, len);
+       uds->uds_dev = dev;
+       uds->uds_ino = ino;
+
+       udshash_add(uds);
+
+       return OK;
  }
  
-static ssize_t
-uds_perform_write(devminor_t minor, endpoint_t endpt, cp_grant_id_t grant,
-       size_t size, int pretend)
+/*
+ * Look up a UDS socket based on a user-given address.  If a socket exists for
+ * the address, check if it is type-compatible with the given UDS socket.
+ * On success, return OK, with 'peerp' set to the socket that was found.  On
+ * failure, return a negative error code.
+ */
+int
+uds_lookup(struct udssock * uds, const struct sockaddr * addr,
+       socklen_t addr_len, endpoint_t user_endpt, struct udssock ** peerp)
  {
-       size_t subsize, pos;
-       int i, r, peer;
+       struct udssock *peer;
+       const char *path;
+       size_t len;
+       dev_t dev;
+       ino_t ino;
+       int r;
+
+       /* Verify that the user gave us an acceptable address. */
+       if ((r = uds_check_addr(addr, addr_len, &path)) < 0)
+               return r;
+       len = (size_t)r;
  
-       dprintf(("UDS: uds_perform_write(%d)\n", minor));
+       /* Attempt to look up the socket file on the file system. */
+       r = socketpath(user_endpt, path, len, SPATH_CHECK, &dev, &ino);
+       if (r != OK)
+               return r;
+       assert(dev != NO_DEV && ino != 0);
  
-       /* Skip writes of zero bytes. */
-       if (size == 0)
-               return 0;
+       if ((peer = udshash_get(dev, ino)) == NULL)
+               return ECONNREFUSED;
+       if (uds_get_type(peer) != uds_get_type(uds))
+               return EPROTOTYPE;
  
-       /* Check if the socket isn't shut down for writes. */
-       if (!(uds_fd_table[minor].mode & UDS_W))
-               return EPIPE;
+       *peerp = peer;
+       return OK;
+}
  
-       /* Datagram messages must fit in the buffer entirely. */
-       if (size > UDS_BUF && uds_fd_table[minor].type != SOCK_STREAM)
-               return EMSGSIZE;
+/*
+ * Given the listening socket 'uds', and the socket 'link' that is calling or
+ * has called connect(2) and is or will be linked to the listening socket's
+ * queue, create a new socket and connect it to 'link', putting both sockets in
+ * the connected state.  The given link socket may be in unconnected,
+ * connecting, or disconnected state prior to the call.  Return OK or an error
+ * code.  The link state of the link socket remains unchanged in any case.
+ */
+static int
+uds_attach(struct udssock * uds, struct udssock * link)
+{
+       struct udssock *conn;
+       int r;
+
+       /*
+        * Allocate a new socket to use as peer socket for the connection that
+        * is about to be established.  The new socket is not yet known by
+        * libsockevent.
+        */
+       if ((r = uds_alloc(&conn)) != OK)
+               return r;
  
-       if (uds_fd_table[minor].type == SOCK_STREAM ||
-           uds_fd_table[minor].type == SOCK_SEQPACKET) {
+       /*
+        * Ask libsockevent to clone the sock object in the new UDS socket from
+        * the listening socket.  This adds the sock object to libsockevent's
+        * data structures and ensures that we can safely use the socket
+        * despite the fact that it has not yet been accepted (and thus
+        * returned to libsockevent).  From this moment on, we must either
+        * return the socket's ID (but not a pointer to it!) from uds_accept()
+        * or raise SEV_CLOSE on it.
+        */
+       sockevent_clone(&uds->uds_sock, &conn->uds_sock, uds_get_id(conn));
+
+       /* Connect the link socket to the new socket. */
+       link->uds_conn = conn;
+       link->uds_flags |= UDSF_CONNECTED;
+
+       /*
+        * Connect the new socket to the link socket as well.  The child
+        * socket should also inherit pretty much all settings from the
+        * listening socket, including the bind path and the listening socket's
+        * bind-time credentials.
+        */
+       conn->uds_conn = link;
+       conn->uds_flags = uds->uds_flags & (UDSF_PASSCRED | UDSF_CONNWAIT);
+       conn->uds_flags |= UDSF_CONNECTED;
+       conn->uds_pathlen = uds->uds_pathlen;
+       memcpy(conn->uds_path, uds->uds_path, (size_t)uds->uds_pathlen);
+       memcpy(&conn->uds_cred, &uds->uds_cred, sizeof(conn->uds_cred));
+
+       return OK;
+}
+
+/*
+ * Connect a socket to a remote address.
+ */
+static int
+uds_connect(struct sock * sock, const struct sockaddr * addr,
+       socklen_t addr_len, endpoint_t user_endpt)
+{
+       struct udssock *uds = (struct udssock *)sock;
+       struct udssock *link;
+       int r;
+
+       dprintf(("UDS: connect(%d)\n", uds_get_id(uds)));
+
+       /* For connection-oriented sockets, several state checks apply. */
+       if (uds_get_type(uds) != SOCK_DGRAM) {
+               if (uds_is_listening(uds))
+                       return EOPNOTSUPP;
+               if (uds_is_connecting(uds))
+                       return EALREADY;
+               if (uds_is_connected(uds))
+                       return EISCONN;
+               /* Disconnected sockets may be reconnected, see below. */
+       } else {
                 /*
-                * If we're writing to a connection-oriented socket, then it
-                * needs a peer to write to.  For disconnected sockets, writing
-                * is an error; for connecting sockets, writes should suspend.
+                * Connectionless sockets may be unconnected by providing an
+                * address with family AF_UNSPEC.  Handle this case first here.
                  */
-               peer = uds_fd_table[minor].peer;
-
-               if (peer == -1) {
-                       if (uds_fd_table[minor].err == ECONNRESET) {
-                               if (!pretend)
-                                       uds_fd_table[minor].err = 0;
-                               return ECONNRESET;
-                       } else
-                               return ENOTCONN;
-               } else if (uds_fd_table[peer].peer == -1) /* connecting */
-                       return EDONTREPLY;
-       } else /* uds_fd_table[minor].type == SOCK_DGRAM */ {
-               peer = -1;
-
-               /* Locate the "peer" we want to write to. */
-               for (i = 0; i < NR_FDS; i++) {
+               if (addr_len >= offsetof(struct sockaddr, sa_data) &&
+                   addr->sa_family == AF_UNSPEC) {
                         /*
-                        * Look for a SOCK_DGRAM socket that is bound on
-                        * the target address.
+                        * Reset this socket's previous connection to another
+                        * socket, if any.  Unconnecting has no effect on other
+                        * sockets connected to this socket, though.
                          */
-                       if (uds_fd_table[i].type == SOCK_DGRAM &&
-                           uds_fd_table[i].stale == FALSE &&
-                           uds_fd_table[i].addr.sun_family == AF_UNIX &&
-                           !strncmp(uds_fd_table[minor].target.sun_path,
-                           uds_fd_table[i].addr.sun_path,
-                           sizeof(uds_fd_table[i].addr.sun_path))) {
-                               peer = i;
-                               break;
-                       }
-               }
+                       if (uds_has_link(uds))
+                               uds_del_queue(uds->uds_link, uds);
  
-               if (peer == -1)
-                       return ENOENT;
+                       return OK;
+               }
         }
  
-       /* Check if we write to a closed pipe. */
-       if (!(uds_fd_table[peer].mode & UDS_R))
-               return EPIPE;
-
         /*
-        * We have to preserve the boundary for DGRAM.  If there's already a
-        * packet waiting, discard it silently and pretend it was written.
+        * Find the socket identified by the given address.  If it exists at
+        * all, see if it is a proper match.
          */
-       if (uds_fd_table[minor].type == SOCK_DGRAM &&
-           uds_fd_table[peer].size > 0)
-               return size;
+       if ((r = uds_lookup(uds, addr, addr_len, user_endpt, &link)) != OK)
+               return r;
  
         /*
-        * Check if the ring buffer is already full, and if the SEQPACKET
-        * message wouldn't write to an empty buffer.
+        * Handle connectionless sockets first, in which case a connect links
+        * the socket to a send target and limits receipt to datagrams from
+        * that target.  We actually point the socket to the peer socket,
+        * through uds_link.  That also means that if the target socket
+        * disappears, we have to reset any sockets connected to it, in which
+        * case we return them to the unconnected state.  In order to allow
+        * finding all sockets connected to a particular socket, we put all
+        * those sockets on their target's queue, hence why we use uds_link and
+        * not uds_conn.  As mentioned before, we allow reconnecting without
+        * restrictions.
+        * TODO: see if reconnecting should clear a pending ECONNRESET.
+        *
+        * An important note: 'uds' and 'link' may actually be the same socket,
+        * if the caller chooses to connect a socket with itself!
          */
-       if (uds_fd_table[peer].size == UDS_BUF ||
-           (uds_fd_table[minor].type == SOCK_SEQPACKET &&
-           uds_fd_table[peer].size > 0)) {
-               if (pretend)
-                       return EDONTREPLY;
+       if (uds_get_type(uds) == SOCK_DGRAM) {
+               /* Reconnecting to the same socket has no effect. */
+               if (uds_has_link(uds) && uds->uds_link == link)
+                       return OK;
  
-               if (uds_fd_table[peer].suspended == UDS_SUSPENDED_READ)
-                       panic("reader blocked on full socket");
+               /*
+                * If the intended target is linked to another socket, we
+                * refuse linking to it.  Sending or receiving would never work
+                * anyway.  Do allow a socket to link to itself after being
+                * linked to another socket.  The error code is the same as in
+                * the sending code, borrowed from Linux.
+                */
+               if (uds != link && uds_has_link(link) && link->uds_link != uds)
+                       return EPERM;
  
-               dprintf(("UDS: suspending write request on %d\n", minor));
+               /*
+                * Reset this socket's previous link to another socket, if any.
+                */
+               if (uds_has_link(uds))
+                       uds_del_queue(uds->uds_link, uds);
  
-               /* Process is reading from an empty pipe.  Suspend it. */
-               return EDONTREPLY;
-       }
+               /*
+                * Reset any links to this socket, except for the one by
+                * the intended target.  Sending or receiving would no longer
+                * work anyway.  If the socket was linked to itself, clear its
+                * self-link without generating an ECONNRESET.  If the socket
+                * is relinking to itself, reestablish the link after first
+                * clearing it.
+                */
+               uds_clear_queue(uds, (uds != link) ? link : NULL);
  
-       /* How much can we add to the ring buffer? */
-       if (size > UDS_BUF - uds_fd_table[peer].size)
-               size = UDS_BUF - uds_fd_table[peer].size;
+               uds_add_queue(link, uds);
  
-       if (pretend)
-               return size;
+               return OK;
+       }
  
-       /* Put the data at the head of the ring buffer. */
-       pos = (uds_fd_table[peer].pos + uds_fd_table[peer].size) % UDS_BUF;
+       /*
+        * For connection-oriented sockets there is more to do.  First, make
+        * sure that the peer is a listening socket, that it has not been shut
+        * down, and that its backlog is not already at the configured maximum.
+        */
+       if (!uds_is_listening(link))
+               return ECONNREFUSED;
  
-       subsize = UDS_BUF - pos;
-       if (subsize > size)
-               subsize = size;
+       if (uds_is_shutdown(link, SFL_SHUT_RD | SFL_SHUT_WR))
+               return ECONNREFUSED;
  
-       if ((r = sys_safecopyfrom(endpt, grant, 0,
-           (vir_bytes) &uds_fd_table[peer].buf[pos], subsize)) != OK)
-               return r;
+       if (link->uds_queued >= link->uds_backlog)
+               return ECONNREFUSED;
  
-       if (subsize < size) {
-               if ((r = sys_safecopyfrom(endpt, grant, subsize,
-                   (vir_bytes) uds_fd_table[peer].buf, size - subsize)) != OK)
+       /*
+        * The behavior of connect(2) now depends on whether LOCAL_CONNWAIT is
+        * set on either the connecting or the listening socket.  If it is not,
+        * the socket will be connected to a new as-yet invisible socket, which
+        * will be the one returned from accept(2) later.  If it is, the
+        * socket will be put in the connecting state.
+        */
+       if (!((uds->uds_flags | link->uds_flags) & UDSF_CONNWAIT)) {
+               if ((r = uds_attach(link, uds)) != OK)
                         return r;
-       }
  
-       /* Advance the buffer head. */
-       uds_fd_table[peer].size += size;
-
-       /* Fill in the source address to be returned by recvfrom, recvmsg. */
-       if (uds_fd_table[minor].type == SOCK_DGRAM)
-               memcpy(&uds_fd_table[peer].source, &uds_fd_table[minor].addr,
-                   sizeof(struct sockaddr_un));
-
-       /* See if we can wake up a blocked reader. */
-       if (uds_fd_table[peer].suspended == UDS_SUSPENDED_READ)
-               uds_unsuspend(peer);
-
-       /* See if we can satisfy an ongoing select. */
-       if ((uds_fd_table[peer].sel_ops & CDEV_OP_RD) &&
-           uds_fd_table[peer].size > 0) {
-               /* A read on the peer is possible now. */
-               chardriver_reply_select(uds_fd_table[peer].sel_endpt, peer,
-                   CDEV_OP_RD);
-               uds_fd_table[peer].sel_ops &= ~CDEV_OP_RD;
+               assert(uds_is_connected(uds));
+       } else {
+               /*
+                * Disconnected sockets now stop being connected.  Any pending
+                * data can still be received, though.
+                */
+               uds->uds_flags &= ~UDSF_CONNECTED;
+
+               r = SUSPEND;
         }
  
-       return size; /* number of bytes written */
+       /* Obtain credentials for the socket. */
+       uds_get_cred(uds, user_endpt);
+
+       /* Add the socket at the end of the listening socket's queue. */
+       uds_add_queue(link, uds);
+
+       assert(r != SUSPEND || uds_is_connecting(uds));
+
+       /*
+        * Let an accept call handle the rest, which will in turn resume this
+        * connect call.  The sockevent library ensures that this works even if
+        * the call is non-blocking.
+        */
+       sockevent_raise(&link->uds_sock, SEV_ACCEPT);
+
+       return r;
  }
  
-static ssize_t
-uds_read(devminor_t minor, u64_t position, endpoint_t endpt,
-       cp_grant_id_t grant, size_t size, int flags, cdev_id_t id)
+/*
+ * Put a socket in listening mode.
+ */
+static int
+uds_listen(struct sock * sock, int backlog)
  {
-       ssize_t rc;
+       struct udssock *uds = (struct udssock *)sock;
+
+       /* The maximum backlog value must not exceed its field size. */
+       assert(SOMAXCONN <= USHRT_MAX);
  
-       dprintf(("UDS: uds_read(%d)\n", minor));
+       dprintf(("UDS: listen(%d)\n", uds_get_id(uds)));
  
-       if (minor < 0 || minor >= NR_FDS) return ENXIO;
+       /* Only connection-oriented types may be put in listening mode. */
+       if (uds_get_type(uds) == SOCK_DGRAM)
+               return EOPNOTSUPP;
  
-       if (uds_fd_table[minor].state != UDS_INUSE)
+       /* A connecting or connected socket may not listen. */
+       if (uds_is_connecting(uds) || uds_is_connected(uds))
                 return EINVAL;
  
-       rc = uds_perform_read(minor, endpt, grant, size, 0);
+       /* POSIX says that this is now the appropriate error code here. */
+       if (!uds_is_bound(uds))
+               return EDESTADDRREQ;
+
+       /*
+        * The socket is now entering the listening state.  If it was
+        * previously disconnected, clear the connection flag.
+        */
+       uds->uds_flags &= ~UDSF_CONNECTED;
+
+       /*
+        * We do not remove sockets from the backlog if it is now being dropped
+        * below the current number of queued sockets.  We only refuse newly
+        * connecting sockets beyond the backlog size.
+        */
+       uds->uds_backlog = backlog;
  
-       /* If the call couldn't complete, suspend the caller. */
-       if (rc == EDONTREPLY) {
-               uds_fd_table[minor].suspended = UDS_SUSPENDED_READ;
-               uds_fd_table[minor].susp_endpt = endpt;
-               uds_fd_table[minor].susp_grant = grant;
-               uds_fd_table[minor].susp_size = size;
-               uds_fd_table[minor].susp_id = id;
+       return OK;
+}
+
+/*
+ * Test whether an accept request would block.  Return OK if a socket could be
+ * accepted, an appropriate error code if an accept call would fail instantly,
+ * or SUSPEND if the accept request would block waiting for a connection.
+ */
+static int
+uds_test_accept(struct sock * sock)
+{
+       struct udssock *uds = (struct udssock *)sock;
  
-               /* If the call wasn't supposed to block, cancel immediately. */
-               if (flags & CDEV_NONBLOCK) {
-                       uds_cancel(minor, endpt, id);
+       /*
+        * Ensure that the socket is in listening mode.  If not, we must return
+        * the error code that is appropriate for this socket type.
+        */
+       if (uds_get_type(uds) == SOCK_DGRAM)
+               return EOPNOTSUPP;
+       if (!uds_is_listening(uds))
+               return EINVAL;
  
-                       rc = EAGAIN;
-               }
+       /*
+        * If the socket has been shut down, new connections are no longer
+        * accepted and accept calls no longer block.  This is not a POSIX
+        * requirement, but rather an application convenience feature.
+        */
+       if (uds->uds_queued == 0) {
+               if (uds_is_shutdown(uds, SFL_SHUT_RD | SFL_SHUT_WR))
+                       return ECONNABORTED;
+
+               return SUSPEND;
         }
  
-       return rc;
+       return OK;
  }
  
-static ssize_t
-uds_write(devminor_t minor, u64_t position, endpoint_t endpt,
-       cp_grant_id_t grant, size_t size, int flags, cdev_id_t id)
+/*
+ * Accept a connection on a listening socket, creating a new socket.  On
+ * success, return the new socket identifier, with 'newsockp' set to NULL
+ * because the new socket's sock object has already been cloned.  Otherwise,
+ * return an error code.
+ */
+static sockid_t
+uds_accept(struct sock * sock, struct sockaddr * addr, socklen_t * addr_len,
+       endpoint_t user_endpt __unused, struct sock ** newsockp)
  {
-       ssize_t rc;
+       struct udssock *uds = (struct udssock *)sock;
+       struct udssock *link, *conn;
+       sockid_t r;
  
-       dprintf(("UDS: uds_write(%d)\n", minor));
+       dprintf(("UDS: accept(%d)\n", uds_get_id(uds)));
  
-       if (minor < 0 || minor >= NR_FDS) return ENXIO;
+       if ((r = uds_test_accept(sock)) != OK)
+               return r;
  
-       if (uds_fd_table[minor].state != UDS_INUSE)
-               return EINVAL;
+       /*
+        * Take the first connecting socket off the listening queue.
+        */
+       assert(!TAILQ_EMPTY(&uds->uds_queue));
  
-       rc = uds_perform_write(minor, endpt, grant, size, 0);
+       link = TAILQ_FIRST(&uds->uds_queue);
  
-       /* If the call couldn't complete, suspend the caller. */
-       if (rc == EDONTREPLY) {
-               uds_fd_table[minor].suspended = UDS_SUSPENDED_WRITE;
-               uds_fd_table[minor].susp_endpt = endpt;
-               uds_fd_table[minor].susp_grant = grant;
-               uds_fd_table[minor].susp_size = size;
-               uds_fd_table[minor].susp_id = id;
+       /*
+        * Depending on the LOCAL_CONNWAIT setting at the time of connect(2),
+        * the socket may be connecting or connected.  In the latter case, its
+        * attached socket is the socket we will return now.  Otherwise we have
+        * to attach a socket first.
+        */
+       assert(uds_is_connecting(link) || uds_is_connected(link));
  
-               /* If the call wasn't supposed to block, cancel immediately. */
-               if (flags & CDEV_NONBLOCK) {
-                       uds_cancel(minor, endpt, id);
+       if (uds_is_connecting(link)) {
+               /*
+                * Attach a new socket.  If this fails, return the error but
+                * leave the connecting socket on the listening queue.
+                */
+               if ((r = uds_attach(uds, link)) != OK)
+                       return r;
  
-                       rc = EAGAIN;
-               }
+               assert(uds_is_connected(link));
+
+               /*
+                * Wake up blocked (connect, send, select) calls on the peer
+                * socket.
+                */
+               sockevent_raise(&link->uds_sock, SEV_CONNECT);
         }
  
-       return rc;
+       uds_del_queue(uds, link);
+
+       /* Return the peer socket's address to the caller. */
+       uds_make_addr(link->uds_path, link->uds_pathlen, addr, addr_len);
+
+       conn = link->uds_conn;
+
+       dprintf(("UDS: accept returns %d\n", uds_get_id(conn)));
+
+       /*
+        * We already cloned the sock object, so return its ID but not a
+        * pointer to it.  That tells libsockevent not to reinitialize it.
+        */
+       *newsockp = NULL;
+       return uds_get_id(conn);
  }
  
+/*
+ * Set socket options.
+ */
  static int
-uds_ioctl(devminor_t minor, unsigned long request, endpoint_t endpt,
-       cp_grant_id_t grant, int flags, endpoint_t user_endpt, cdev_id_t id)
+uds_setsockopt(struct sock * sock, int level, int name,
+       const struct sockdriver_data * data, socklen_t len)
  {
-       int rc, s;
+       struct udssock *uds = (struct udssock *)sock;
+       int r, val;
  
-       dprintf(("UDS: uds_ioctl(%d, %lu)\n", minor, request));
+       dprintf(("UDS: setsockopt(%d,%d,%d)\n", uds_get_id(uds), level, name));
  
-       if (minor < 0 || minor >= NR_FDS) return ENXIO;
+       switch (level) {
+       case SOL_SOCKET:
+               switch (name) {
+               case SO_SNDBUF:
+               case SO_RCVBUF:
+                       /*
+                        * The send buffer size may not be changed because the
+                        * buffer is the same as the other side's receive
+                        * buffer, and what the other side is may vary from
+                        * send call to send call.  Changing the receive buffer
+                        * size would disallow us from even accurately guessing
+                        * the send buffer size in getsockopt calls.  Therefore
+                        * both are hardcoded and cannot actually be changed.
+                        * In order to support applications that want at least
+                        * a certain minimum, we do accept requests to shrink
+                        * either buffer, but we ignore the given size.
+                        */
+                       if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
+                           len)) != OK)
+                               return r;
  
-       if (uds_fd_table[minor].state != UDS_INUSE)
-               return EINVAL;
+                       if (val <= 0 || (size_t)val > uds_io_buflen())
+                               return EINVAL;
+
+                       return OK; /* ignore new value */
+               }
+
+               break;
+
+       case UDSPROTO_UDS:
+               switch (name) {
+               case LOCAL_CREDS:
+                       if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
+                           len)) != OK)
+                               return r;
+
+                       if (val)
+                               uds->uds_flags |= UDSF_PASSCRED;
+                       else
+                               uds->uds_flags &= ~UDSF_PASSCRED;
+
+                       /*
+                        * In incredibly rare cases, disabling this flag may
+                        * allow blocked sends to be resumed, because suddenly
+                        * no room for the credentials is needed in the receive
+                        * buffer anymore.
+                        */
+                       if (!val)
+                               sockevent_raise(&uds->uds_sock, SEV_SEND);
  
-       /* Update the owner endpoint. */
-       uds_fd_table[minor].owner = user_endpt;
-
-       /* Let the UDS ioctl subsystem handle the actual request. */
-       rc = uds_do_ioctl(minor, request, endpt, grant);
-
-       /* If the call couldn't complete, suspend the caller. */
-       if (rc == EDONTREPLY) {
-               /* The suspension type is already set by the IOCTL handler. */
-               if ((s = uds_fd_table[minor].suspended) == UDS_NOT_SUSPENDED)
-                       panic("IOCTL did not actually suspend?");
-               uds_fd_table[minor].susp_endpt = endpt;
-               uds_fd_table[minor].susp_grant = grant;
-               uds_fd_table[minor].susp_size = 0; /* irrelevant */
-               uds_fd_table[minor].susp_id = id;
-
-               /* If the call wasn't supposed to block, cancel immediately. */
-               if (flags & CDEV_NONBLOCK) {
-                       uds_cancel(minor, endpt, id);
-                       if (s == UDS_SUSPENDED_CONNECT)
-                               rc = EINPROGRESS;
+                       return OK;
+
+               case LOCAL_CONNWAIT:
+                       if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
+                           len)) != OK)
+                               return r;
+
+                       if (val)
+                               uds->uds_flags |= UDSF_CONNWAIT;
                         else
-                               rc = EAGAIN;
+                               uds->uds_flags &= ~UDSF_CONNWAIT;
+
+                       /*
+                        * Changing the setting does not affect sockets that
+                        * are currently pending to be accepted.  Therefore,
+                        * uds_accept() may have to deal with either case on a
+                        * socket-by-socket basis.
+                        */
+                       return OK;
+
+               case LOCAL_PEEREID:
+                       /* This option may be retrieved but not set. */
+                       return ENOPROTOOPT;
                 }
+
+               break;
         }
  
-       return rc;
+       return ENOPROTOOPT;
  }
  
-void
-uds_unsuspend(devminor_t minor)
+/*
+ * Retrieve socket options.
+ */
+static int
+uds_getsockopt(struct sock * sock, int level, int name,
+       const struct sockdriver_data * data, socklen_t * len)
  {
-       int r;
-       uds_fd_t *fdp;
+       struct udssock *uds = (struct udssock *)sock;
+       int val;
  
-       fdp = &uds_fd_table[minor];
+       dprintf(("UDS: getsockopt(%d,%d,%d)\n", uds_get_id(uds), level, name));
  
-       switch (fdp->suspended) {
-       case UDS_SUSPENDED_READ:
-               r = uds_perform_read(minor, fdp->susp_endpt, fdp->susp_grant,
-                   fdp->susp_size, 0);
+       switch (level) {
+       case SOL_SOCKET:
+               switch (name) {
+               case SO_SNDBUF:
+               case SO_RCVBUF:
+                       /* See uds_setsockopt() for why this is static. */
+                       val = (int)uds_io_buflen();
  
-               if (r == EDONTREPLY)
-                       return;
+                       return sockdriver_copyout_opt(data, &val, sizeof(val),
+                           len);
+               }
  
                 break;
  
-       case UDS_SUSPENDED_WRITE:
-               r = uds_perform_write(minor, fdp->susp_endpt, fdp->susp_grant,
-                   fdp->susp_size, 0);
+       case UDSPROTO_UDS:
+               switch (name) {
+               case LOCAL_CREDS:
+                       val = !!(uds->uds_flags & UDSF_PASSCRED);
  
-               if (r == EDONTREPLY)
-                       return;
+                       return sockdriver_copyout_opt(data, &val, sizeof(val),
+                           len);
  
-               break;
+               case LOCAL_CONNWAIT:
+                       val = !!(uds->uds_flags & UDSF_CONNWAIT);
  
-       case UDS_SUSPENDED_CONNECT:
-       case UDS_SUSPENDED_ACCEPT:
-               /*
-                * In both cases, the caller already set up the connection.
-                * The only thing to do here is unblock.
-                */
-               r = fdp->err;
-               fdp->err = 0;
+                       return sockdriver_copyout_opt(data, &val, sizeof(val),
+                           len);
  
-               break;
+               case LOCAL_PEEREID:
+                       /* getpeereid(3) documents these error codes. */
+                       if (uds_get_type(uds) == SOCK_DGRAM)
+                               return EINVAL;
+                       if (!uds_is_connected(uds))
+                               return ENOTCONN;
  
-       default:
-               panic("unknown suspension type %d", fdp->suspended);
+                       /*
+                        * This is a custom MINIX3 error, indicating that there
+                        * are no credentials to return.  This could be due to
+                        * a failure to obtain them (which *should* not happen)
+                        * but also if the socket was bound while connected,
+                        * disconnected, and then reused as a listening socket.
+                        */
+                       if (uds->uds_conn->uds_cred.unp_pid == -1)
+                               return EINVAL;
+
+                       return sockdriver_copyout_opt(data,
+                           &uds->uds_conn->uds_cred,
+                           sizeof(uds->uds_conn->uds_cred), len);
+               }
+
+               break;
         }
  
-       chardriver_reply_task(fdp->susp_endpt, fdp->susp_id, r);
+       return ENOPROTOOPT;
+}
  
-       fdp->suspended = UDS_NOT_SUSPENDED;
+/*
+ * Retrieve a socket's local address.
+ */
+static int
+uds_getsockname(struct sock * sock, struct sockaddr * addr,
+       socklen_t * addr_len)
+{
+       struct udssock *uds = (struct udssock *)sock;
+
+       dprintf(("UDS: getsockname(%d)\n", uds_get_id(uds)));
+
+       uds_make_addr(uds->uds_path, uds->uds_pathlen, addr, addr_len);
+
+       return OK;
  }
  
+/*
+ * Retrieve a socket's remote address.
+ */
  static int
-uds_cancel(devminor_t minor, endpoint_t endpt, cdev_id_t id)
+uds_getpeername(struct sock * sock, struct sockaddr * addr,
+       socklen_t * addr_len)
  {
-       uds_fd_t *fdp;
-       int i;
+       struct udssock *uds = (struct udssock *)sock;
+       struct udssock *peer;
  
-       dprintf(("UDS: uds_cancel(%d)\n", minor));
+       dprintf(("UDS: getpeername(%d)\n", uds_get_id(uds)));
  
-       if (minor < 0 || minor >= NR_FDS) return EDONTREPLY;
+       /*
+        * For disconnected sockets, we no longer have a peer socket and thus
+        * also no peer address.  Too bad, but NetBSD does the same.
+        *
+        * For connecting sockets we could in fact return a peer address, but
+        * POSIX says (and other platforms agree) that we should deny the call.
+        */
+       peer = uds_get_peer(uds);
  
-       fdp = &uds_fd_table[minor];
+       if (peer == NULL || uds_is_connecting(uds))
+               return ENOTCONN;
  
-       if (fdp->state != UDS_INUSE) {
-               printf("UDS: cancel request for closed minor %d\n", minor);
-               return EDONTREPLY;
-       }
+       uds_make_addr(peer->uds_path, peer->uds_pathlen, addr, addr_len);
  
-       /* Make sure the cancel request is for a request we're hanging on. */
-       if (fdp->suspended == UDS_NOT_SUSPENDED || fdp->susp_endpt != endpt ||
-           fdp->susp_id != id)
-               return EDONTREPLY;      /* this happens. */
+       return OK;
+}
+
+/*
+ * Shut down socket send and receive operations.  Note that 'flags' is a
+ * bitwise mask with libsockevent's SFL_SHUT_{RD,WR} flags rather than the set
+ * of SHUT_{RD,WR,RDWR} values from userland.
+ */
+static int
+uds_shutdown(struct sock * sock, unsigned int flags)
+{
+       struct udssock *uds = (struct udssock *)sock;
+       struct udssock *conn;
+       unsigned int mask;
+
+       dprintf(("UDS: shutdown(%d,0x%x)\n", uds_get_id(uds), flags));
  
         /*
-        * The system call was cancelled, so the socket is not suspended
-        * anymore.
+        * If we are shutting down the socket for reading, we can already close
+        * any in-flight file descriptors associated with this socket.
          */
-       switch (fdp->suspended) {
-       case UDS_SUSPENDED_ACCEPT:
-               /* A partial accept() only sets the server's child. */
-               for (i = 0; i < NR_FDS; i++)
-                       if (uds_fd_table[i].child == minor)
-                               uds_fd_table[i].child = -1;
+       if (flags & SFL_SHUT_RD)
+               uds_io_reset(uds);
  
-               break;
+       /*
+        * A shutdown on this side of a connection may have an effect on
+        * ongoing operations on the other side.  Fire appropriate events.
+        */
+       if (uds_is_connected(uds)) {
+               assert(uds_get_type(uds) != SOCK_DGRAM);
  
-       case UDS_SUSPENDED_CONNECT:
-               /* Connect requests should continue asynchronously. */
-               break;
+               conn = uds->uds_conn;
  
-       case UDS_SUSPENDED_READ:
-       case UDS_SUSPENDED_WRITE:
-               /* Nothing more to do. */
-               break;
+               mask = 0;
+               if (flags & SFL_SHUT_RD)
+                       mask |= SEV_SEND;
+               if (flags & SFL_SHUT_WR)
+                       mask |= SEV_RECV;
  
-       default:
-               panic("unknown suspension type %d", fdp->suspended);
+               sockevent_raise(&conn->uds_sock, mask);
         }
  
-       fdp->suspended = UDS_NOT_SUSPENDED;
+       return OK;
+}
+
+/*
+ * Close a socket.
+ *
+ * The 'force' flag is unused because we need never wait for data to be sent,
+ * since we keep all in-flight data on the receiver side.
+ */
+static int
+uds_close(struct sock * sock, int force __unused)
+{
+       struct udssock *uds = (struct udssock *)sock;
+
+       dprintf(("UDS: close(%d)\n", uds_get_id(uds)));
+
+       if (uds_get_type(uds) == SOCK_DGRAM) {
+               /* If this socket is linked to a target, disconnect it. */
+               if (uds_has_link(uds))
+                       uds_del_queue(uds->uds_link, uds);
  
-       return EINTR;   /* reply to the original request */
+               /* Reset all sockets linked to this socket as a target. */
+               uds_clear_queue(uds, NULL);
+       } else if (uds_is_listening(uds)) {
+               /*
+                * Abort all connecting sockets queued on this socket, and
+                * break all connections for connected sockets queued on this
+                * socket, freeing their peers.
+                */
+               uds_clear_queue(uds, NULL);
+       } else if (uds_has_link(uds)) {
+               /*
+                * This socket is connecting or connected while the other side
+                * has not been accepted yet.  Remove the socket from the
+                * listening socket's queue, and if it was connected, get rid
+                * of its peer socket altogether.
+                */
+               assert(uds_is_listening(uds->uds_link));
+
+               uds_del_queue(uds->uds_link, uds);
+
+               if (uds_is_connected(uds))
+                       uds_disconnect(uds, TRUE /*was_linked*/);
+       } else if (uds_is_connected(uds)) {
+               /*
+                * Decouple the peer socket from this socket, and possibly wake
+                * up any pending operations on it.  The socket remains marked
+                * as connected, but will now be disconnected.
+                */
+               uds_disconnect(uds, FALSE /*was_linked*/);
+       }
+
+       if (uds_is_hashed(uds))
+               udshash_del(uds);
+
+       return OK;
  }
  
+static const struct sockevent_ops uds_ops = {
+       .sop_pair               = uds_pair,
+       .sop_bind               = uds_bind,
+       .sop_connect            = uds_connect,
+       .sop_listen             = uds_listen,
+       .sop_accept             = uds_accept,
+       .sop_test_accept        = uds_test_accept,
+       .sop_pre_send           = uds_pre_send,
+       .sop_send               = uds_send,
+       .sop_test_send          = uds_test_send,
+       .sop_pre_recv           = uds_pre_recv,
+       .sop_recv               = uds_recv,
+       .sop_test_recv          = uds_test_recv,
+       .sop_setsockopt         = uds_setsockopt,
+       .sop_getsockopt         = uds_getsockopt,
+       .sop_getsockname        = uds_getsockname,
+       .sop_getpeername        = uds_getpeername,
+       .sop_shutdown           = uds_shutdown,
+       .sop_close              = uds_close,
+       .sop_free               = uds_free
+};
+
  /*
- * Initialize the server.
+ * Initialize the service.
   */
  static int
-uds_init(int UNUSED(type), sef_init_info_t *UNUSED(info))
+uds_init(int type __unused, sef_init_info_t * info __unused)
  {
-       /* Setting everything to NULL implicitly sets the state to UDS_FREE. */
-       memset(uds_fd_table, '\0', sizeof(uds_fd_t) * NR_FDS);
+       unsigned int i;
+
+       /* Initialize the list of free sockets. */
+       TAILQ_INIT(&uds_freelist);
  
-       uds_exit_left = 0;
+       for (i = 0; i < __arraycount(uds_array); i++) {
+               uds_array[i].uds_flags = 0;
  
-       /* Announce we are up! */
-       chardriver_announce();
+               TAILQ_INSERT_TAIL(&uds_freelist, &uds_array[i], uds_next);
+       }
  
-       return(OK);
+       /* Initialize the file-to-socket hash table. */
+       udshash_init();
+
+       /* Initialize the input/output module. */
+       uds_io_init();
+
+       /* Initialize the status module. */
+       uds_stat_init();
+
+       /* Initialize the sockevent library. */
+       sockevent_init(uds_socket);
+
+       uds_in_use = 0;
+       uds_running = TRUE;
+
+       return OK;
+}
+
+/*
+ * Clean up before shutdown.
+ */
+static void
+uds_cleanup(void)
+{
+
+       /* Tell the status module to clean up. */
+       uds_stat_cleanup();
  }
  
+/*
+ * The service has received a signal.
+ */
  static void
  uds_signal(int signo)
  {
-       int i;
  
-       /* Only check for termination signal, ignore anything else. */
-       if (signo != SIGTERM) return;
+       /* Only check for the termination signal.  Ignore anything else. */
+       if (signo != SIGTERM)
+               return;
  
-       /* Only exit once all sockets have been closed. */
-       uds_exit_left = 0;
-       for (i = 0; i < NR_FDS; i++)
-               if (uds_fd_table[i].state == UDS_INUSE)
-                       uds_exit_left++;
+       /* Exit only once all sockets have been closed. */
+       uds_running = FALSE;
  
-       if (uds_exit_left == 0)
-               chardriver_terminate();
+       if (uds_in_use == 0)
+               sef_cancel();
  }
  
+/*
+ * Perform initialization using the System Event Framework (SEF).
+ */
  static void
  uds_startup(void)
  {
-       /* Register init callbacks. */
+
+       /* Register initialization callbacks. */
         sef_setcb_init_fresh(uds_init);
  
-       /* Register signal callbacks. */
+       /* Register signal callback. */
         sef_setcb_signal_handler(uds_signal);
  
         /* Let SEF perform startup. */
@@ -742,14 +1378,40 @@ uds_startup(void)
  }
  
  /*
- * The UNIX domain sockets driver.
+ * The UNIX Domain Sockets driver.
   */
  int
  main(void)
  {
+       message m;
+       int r, ipc_status;
+
+       /* Initialize the service. */
         uds_startup();
  
-       chardriver_task(&uds_tab);
+       /* Loop receiving and processing messages until instructed to stop. */
+       while (uds_running || uds_in_use > 0) {
+               if ((r = sef_receive_status(ANY, &m, &ipc_status)) != OK) {
+                       if (r == EINTR)
+                               continue;       /* sef_cancel() was called */
+
+                       panic("UDS: sef_receive_status failed: %d", r);
+               }
+
+               /*
+                * Messages from the MIB service are (ultimately) for the
+                * status module.  Everything else is assumed to be a socket
+                * request and passed to libsockevent, which will ignore
+                * anything it does not recognize.
+                */
+               if (m.m_source == MIB_PROC_NR)
+                       rmib_process(&m, ipc_status);
+               else
+                       sockevent_process(&m, ipc_status);
+       }
+
+       /* Clean up before graceful shutdown. */
+       uds_cleanup();
  
-       return(OK);
+       return EXIT_SUCCESS;
  }
diff --git a/minix/net/uds/uds.conf b/minix/net/uds/uds.conf

new file mode 100644 (file)

index 0000000..481f891
--- /dev/null
+++ b/minix/net/uds/uds.conf
@@ -0,0 +1,9 @@
+service uds
+{
+       domain  LOCAL;
+       system  KILL;   # for SIGPIPE
+       uid     0;      # for socketpath(2) and copyfd(2)
+       ipc
+               SYSTEM vfs rs vm mib
+       ;
+};
diff --git a/minix/net/uds/uds.h b/minix/net/uds/uds.h

index 741b4bd479dd3b46f66dfa32be01c1940db1d3c5..4ccbaf425f4ebfdc546606670772026a46d56435 100644 (file)
--- a/minix/net/uds/uds.h
+++ b/minix/net/uds/uds.h
@@ -1,23 +1,48 @@
-#ifndef __UDS_UDS_H
-#define __UDS_UDS_H
+#ifndef MINIX_NET_UDS_UDS_H
+#define MINIX_NET_UDS_UDS_H
  
  #include <minix/drivers.h>
-#include <minix/chardriver.h>
-#undef send
-#include <sys/socket.h>
-#include <sys/ioctl.h>
-#include <sys/ucred.h>
+#include <minix/sockevent.h>
+#include <minix/rmib.h>
  #include <sys/un.h>
-#include <sys/mman.h>
  
-/* Maximum number of UNIX domain sockets. */
-#define NR_FDS         256
+/*
+ * Maximum number of UNIX domain sockets.  The control structures for all of
+ * these are allocated statically, although each socket's receive buffer is
+ * allocated only when the socket is in use.  If this constant is increased
+ * beyond 65535, a few field sizes need to be changed.
+ */
+#define NR_UDSSOCK     256
+
+/* Number of slots in the <dev,ino>-to-udssock hash table. */
+#define UDSHASH_SLOTS  64
+
+/* UDS has no protocols, so we accept only an "any protocol" value. */
+#define UDSPROTO_UDS   0
+
+/*
+ * The size of each socket's receive buffer.  This size is currently a global
+ * setting which cannot be changed per socket at run time, and it would be
+ * rather tricky to change that.  In order not to waste resources, this size
+ * should be a multiple of the page size.  Due to the fact that data and
+ * metadata (such as lengths, source addresses and sender credentials) are
+ * intermixed in the same buffer, the actual amount of data that can be in
+ * transit at once is typically less than this value.  If this constant is
+ * increased beyond 65535, several fields and field sizes need to be changed.
+ */
+#define UDS_BUF                32768
  
-/* Connection backlog size for incoming connections. */
-#define UDS_SOMAXCONN  64
+/* Maximum size of control data that can be sent or received at once. */
+#define UDS_CTL_MAX    4096
  
-/* Maximum UDS socket buffer size. */
-#define UDS_BUF                PIPE_BUF
+/*
+ * We allow longer path names than the size of struct sockaddr_un's sun_path
+ * field.  The actual limit is determined by the maximum value of the sun_len
+ * field, which is 255 and includes the first two fields of the structure (one
+ * byte each) but not the null terminator of the path.  Thus, the maximum
+ * length of the path minus null terminator is 253; with terminator it is 254.
+ */
+#define UDS_PATH_MAX   (UINT8_MAX - sizeof(uint8_t) - sizeof(sa_family_t) + 1)
  
  /* Output debugging information? */
  #define DEBUG          0
@@ -29,191 +54,201 @@
  #endif
  
  /*
- * A light version of the "uucred" credentials structure.  We basically do not
- * support passing around groups lists, and by not using struct uucred as
- * storage, we save memory for those groups lists as well.  Note that the
- * original Linux uucred structure has a 'cr_pid' field as well, but this is
- * unsupported in NetBSD's version of the structure (and rightly so).
+ * We declare this structure only for the static assert right below it.  We
+ * have no need for the structure otherwise, as we use "struct sockaddr"
+ * directly instead.
   */
-struct luucred {
-       uid_t uid;
-       gid_t gid;
+struct sockaddr_unx {
+       uint8_t sunx_len;
+       sa_family_t sunx_family;
+       char sunx_path[UDS_PATH_MAX];
  };
-
-/* ancillary data to be sent */
-struct ancillary {
-       int fds[OPEN_MAX];
-       int nfiledes;
-       struct luucred cred;
-};
-
-#define UDS_R  0x1
-#define UDS_W  0x2
+STATIC_SOCKADDR_MAX_ASSERT(sockaddr_unx);
  
  /*
- * Internal State Information for a socket descriptor.
+ * In-flight file descriptor object.  Each in-use object is part of a socket's
+ * file descriptor queue, and the file descriptor is for a file open by this
+ * service.  For each set of in-flight file descriptors associated with a
+ * particular segment, the first object's count field contains the number of
+ * file descriptors in that set.  For all other objects in that set, the count
+ * field is zero.  TODO: the count should be stored in the segment itself.
   */
  struct uds_fd {
+       SIMPLEQ_ENTRY(uds_fd) ufd_next; /* next FD object for this socket */
+       int ufd_fd;                     /* local file descriptor number */
+       unsigned int ufd_count;         /* number of FDs for this segment */
+};
  
-/* Flags */
-
-       enum UDS_STATE {
-               /* This file descriptor is UDS_FREE and can be allocated. */
-               UDS_FREE  = 0,
-
-               /* OR it is UDS_INUSE and can't be allocated. */
-               UDS_INUSE = 1
-
-       /* state is set to UDS_INUSE in uds_open(). state is Set to
-        * UDS_FREE in uds_init() and uds_close(). state should be
-        * checked prior to all operations.
-        */
-       } state;
-
-/* Owner Info */
-
-       /* Socket Owner */
-       endpoint_t owner;
-
-/* Pipe Housekeeping */
-
-       char *buf;                      /* ring buffer */
-       size_t pos;                     /* tail position into ring buffer */
-       size_t size;                    /* size of used part of ring buffer */
-
-       /* control read/write, set by uds_open() and shutdown(2).
-        * Can be set to UDS_R|UDS_W, UDS_R, UDS_W, or 0
-        * for read and write, read only, write only, or neither.
-        * default is UDS_R|UDS_W.
-        */
-       int mode;
-
-/* Socket Info */
-
-       /* socket type - SOCK_STREAM, SOCK_DGRAM, or SOCK_SEQPACKET
-        * Set by uds_ioctl(NWIOSUDSTYPE). It defaults to -1 in
-        * uds_open(). Any action on a socket with type -1 besides
-        * uds_ioctl(NWIOSUDSTYPE) and uds_close() will result in
-        * an error.
-        */
-       int type;
-
-       /* queue of pending connections for server sockets.
-        * connect(2) inserts and accept(2) removes from the queue
-        */
-       int backlog[UDS_SOMAXCONN];
-
-       /* requested connection backlog size. Set by listen(2)
-        * Bounds (0 <= backlog_size <= UDS_SOMAXCONN)
-        * Defaults to UDS_SOMAXCONN which is defined above.
-        */
-       unsigned char backlog_size;
-
-       /* index of peer in uds_fd_table for connected sockets.
-        * -1 is used to mean no peer. Assumptions: peer != -1 means
-        * connected.
-        */
-       int peer;
-
-       /* index of child (client sd returned by accept(2))
-        * -1 is used to mean no child.
-        */
-       int child;
-
-       /* address -- the address the socket is bound to.
-        * Assumptions: addr.sun_family == AF_UNIX means its bound.
-        */
-       struct sockaddr_un addr;
-
-       /* target -- where DGRAMs are sent to on the next uds_write(). */
-       struct sockaddr_un target;
-
-       /* source -- address where DGRAMs are from. used to fill in the
-        * from address in recvfrom(2) and recvmsg(2).
-        */
-       struct sockaddr_un source;
-
-       /* Flag (TRUE or FALSE) - address overridden by newer socket.
-        * Default to FALSE.  Set to TRUE by do_bind() on another socket with
-        * the same path but its on-disk socket file removed in the meantime.
-        */
-       int stale;
-
-       /* Flag (TRUE or FALSE) - listening for incoming connections.
-        * Default to FALSE.  Set to TRUE by do_listen().
-        */
-       int listening;
-
-       /* stores file pointers and credentials being sent between
-        * processes with sendmsg(2) and recvmsg(2).
-        */
-       struct ancillary ancillary_data;
-
-       /* Holds an errno. This is set when a connected socket is
-        * closed and we need to pass ECONNRESET on to a suspended
-        * peer.
-        */
-       int err;
-
-/* Suspend/Revive Housekeeping */
-
-       /* SUSPEND State Flags */
-       enum UDS_SUSPENDED {
-
-               /* Socket isn't blocked. */
-               UDS_NOT_SUSPENDED     = 0,
-
-               /* Socket is blocked on read(2) waiting for data to read. */
-               UDS_SUSPENDED_READ    = 1,
-
-               /* Socket is blocked on write(2) for space to write data. */
-               UDS_SUSPENDED_WRITE   = 2,
-
-               /* Socket is blocked on connect(2) waiting for the server. */
-               UDS_SUSPENDED_CONNECT = 4,
-
-               /* Socket is blocked on accept(2) waiting for clients. */
-               UDS_SUSPENDED_ACCEPT  = 8
-       } suspended;
-
-       /* source endpoint, saved for later use by suspended procs */
-       endpoint_t susp_endpt;
-
-       /* i/o grant, saved for later use by suspended procs */
-       cp_grant_id_t susp_grant;
-
-       /* size of request, saved for later use by suspended procs */
-       size_t susp_size;
-
-       /* request ID, saved for later use by suspended procs */
-       cdev_id_t susp_id;
+/*
+ * Connection-type sockets (SOCK_STREAM, SOCK_SEQPACKET) are always in one of
+ * the following five states, each with unique characteristics:
+ *
+ * - Unconnected: this socket is not in any of the other states, usually
+ *   because it either has just been created, or because it has failed a
+ *   connection attempt.  This socket has no connected peer and does not have
+ *   the SO_ACCEPTCONN socket option set.
+ * - Listening: this socket is in listening mode.  It has a queue with sockets
+ *   that are connecting or connected to it but have not yet been accepted on
+ *   it.  This socket has no connected peer.  It has the SO_ACCEPTCONN socket
+ *   option set.
+ * - Connecting: this socket is on a listening socket's queue.  While in this
+ *   state, the socket has the listening socket as its linked peer, and it has
+ *   no connected peer.
+ * - Connected: this socket is connected to another socket, which is its
+ *   connected peer socket.  It has the UDSF_CONNECTED flag set.  A socket may
+ *   be connected and still be involved with a listening socket; see below.
+ * - Disconnected: this socket was connected to another socket, but that other
+ *   socket has been closed.  As a result, this socket has no peer.  It does
+ *   have the UDSF_CONNECTED flag set.
+ *
+ * The UDS service supports two different types of connect behaviors, depending
+ * on what the LOCAL_CONNWAIT option is set to on either the connecting or the
+ * listening socket.  If LOCAL_CONNWAIT is not set on either (the default), the
+ * connecting socket (let's call it "A") enters the connected state
+ * right away, even if the connection is not immediately accepted through
+ * accept(2).  In that case, a new limbo socket "B" is allocated as its
+ * connection peer.  Limbo socket B is also in connected state, and either
+ * returned from accept(2) later, or freed when socket A leaves the connected
+ * state.  Socket A can leave the connected state either by being closed or
+ * when the listening socket is closed.  If LOCAL_CONNWAIT is set, socket A
+ * stays in the connecting state until it is accepted through accept(2).
+ * Importantly, in both cases, it is socket A, and (in the first case) *not*
+ * socket B, that is on the queue of the listening socket!
+ *
+ * Connected peers (uds_conn) are always symmetric: if one socket is connected
+ * to another socket, that other socket is connected to it.  Any socket that is
+ * on the queue of another socket, is said to be "linked" to that other socket
+ * (uds_link). This is an asymmetric, one-to-many relationship: many sockets
+ * may be linked to one other socket, which keeps all those sockets on its
+ * queue. From the above story it should now be clear that for connection-type
+ * sockets, only listening sockets may have sockets on their queue, and while
+ * connecting sockets are always on a listening socket's queue, connected
+ * sockets may or may not be.  Sockets in other states never are.
+ *
+ * UNIX domain sockets are generally reusable.  This means that the listening
+ * state is the only final state; all other socket states allow the socket to
+ * enter another state, although not necessarily every other state.  For
+ * example, a disconnected socket may be reconnected to another target; if that
+ * connection fails, the socket will enter the unconnected state.  As a result,
+ * a socket in any state (even the listening state) may still have incoming
+ * data pending from a previous connection.  However, EOF is currently produced
+ * only for disconnected sockets.  To be sure: connecting and connected sockets
+ * must first enter the unconnected or disconnected state, respectively, before
+ * possibly being reconnected.
+ *
+ * For connectionless (i.e., SOCK_DGRAM) sockets, there are no separate states.
+ * However, a connectionless socket may have been connected to another socket.
+ * We maintain these links not with uds_conn but with uds_link, because such
+ * connections are not symmetric, and there is an interest in keeping track of
+ * which datagram sockets are connected to a particular socket (namely, to
+ * break the connection on close without doing an exhaustive search).  For that
+ * reason, when a datagram socket connects to another socket, it is linked to
+ * that other socket, and the other socket has this socket on its queue.  As a
+ * strange corner case, a connectionless socket may be connected to itself, in
+ * which case it is its own linked peer and it is also on its own queue.  For
+ * datagram sockets, uds_conn is always NULL and UDSF_CONNECTED is never set.
+ *
+ * For the purposes of sending and receiving, we generally refer to the
+ * communication partner of a socket as its "peer".  As should now be clear,
+ * for connection-type sockets, the socket's peer is identified with uds_conn;
+ * for connectionless sockets, the socket's peer is identified with uds_link.
+ */
+struct udssock {
+       struct sock uds_sock;           /* sock object */
+       struct udssock *uds_conn;       /* connected socket, or NULL if none */
+       struct udssock *uds_link;       /* linked socket, or NULL if none */
+       unsigned char *uds_buf;         /* receive buffer (memory-mapped) */
+       unsigned short uds_tail;        /* tail of data in receive buffer */
+       unsigned short uds_len;         /* length of data in receive buffer */
+       unsigned short uds_last;        /* offset to last header in buffer */
+       unsigned short uds_queued;      /* current nr of sockets on queue */
+       unsigned short uds_backlog;     /* maximum nr of connecting sockets */
+       unsigned char uds_flags;        /* UDS-specific flags (UDSF_) */
+       unsigned char uds_pathlen;      /* socket file path length (w/o nul) */
+       char uds_path[UDS_PATH_MAX - 1];/* socket file path (not terminated) */
+       dev_t uds_dev;                  /* socket file device number */
+       ino_t uds_ino;                  /* socket file inode number */
+       struct unpcbid uds_cred;        /* bind/connect-time credentials */
+       SLIST_ENTRY(udssock) uds_hash;  /* next in hash chain */
+       TAILQ_ENTRY(udssock) uds_next;  /* next in free list or queue */
+       SIMPLEQ_HEAD(, uds_fd) uds_fds; /* in-flight file descriptors */
+       TAILQ_HEAD(, udssock) uds_queue;/* queue of linked sockets */
+};
  
-/* select() */
-
-       /* when a select is in progress, we notify this endpoint
-        * of new data.
-        */
-       endpoint_t sel_endpt;
+#define UDSF_IN_USE            0x01    /* in use (for enumeration only) */
+#define UDSF_CONNECTED         0x02    /* connected or disconnected */
+#define UDSF_CONNWAIT          0x04    /* leave connecting until accept */
+#define UDSF_PASSCRED          0x08    /* pass credentials when receiving */
  
-       /* Options (CDEV_OP_RD,WR,ERR) that are requested. */
-       unsigned int sel_ops;
-};
+/* Macros. */
+#define uds_get_type(uds)      sockevent_get_type(&(uds)->uds_sock)
  
-typedef struct uds_fd uds_fd_t;
+/*
+ * A socket that can be found through hash table lookups always has a non-empty
+ * path as well as a valid <dev,ino> pair identifying the socket file that is,
+ * or once was, identified by that path.  However, a socket that is bound, even
+ * though it will still have an associated path, is not necessarily hashed.
+ * The reason for the difference is <dev,ino> pair reuse.  This case is
+ * elaborated on in uds_bind().
+ */
+#define uds_is_bound(uds)      ((uds)->uds_pathlen != 0)
+#define uds_is_hashed(uds)     ((uds)->uds_dev != NO_DEV)
  
-/* File Descriptor Table -- Defined in uds.c */
-EXTERN uds_fd_t uds_fd_table[NR_FDS];
+/*
+ * These macros may be used on all socket types.  However, the uds_is_connected
+ * macro returns TRUE only for connection-oriented sockets.  To see if a
+ * datagram socket is connected to a target, use uds_has_link instead.
+ */
+#define uds_has_conn(uds)      ((uds)->uds_conn != NULL)
+#define uds_has_link(uds)      ((uds)->uds_link != NULL)
+#define uds_get_peer(uds)      \
+       ((uds_get_type(uds) != SOCK_DGRAM) ? (uds)->uds_conn : (uds)->uds_link)
+#define uds_is_listening(uds)  sockevent_is_listening(&(uds)->uds_sock)
+#define uds_is_connecting(uds)                                         \
+       (uds_has_link(uds) && !((uds)->uds_flags & UDSF_CONNECTED) &&   \
+       uds_get_type(uds) != SOCK_DGRAM)
+#define uds_is_connected(uds)  \
+       (((uds)->uds_flags & UDSF_CONNECTED) && uds_has_conn(uds))
+#define uds_is_disconnected(uds)       \
+       (((uds)->uds_flags & UDSF_CONNECTED) && !uds_has_conn(uds))
+
+#define uds_is_shutdown(uds, mask)     \
+       sockevent_is_shutdown(&(uds)->uds_sock, (mask))
  
  /* Function prototypes. */
  
-/* ioc_uds.c */
-int uds_clear_fds(devminor_t minor, struct ancillary *data);
-int uds_do_ioctl(devminor_t minor, unsigned long request, endpoint_t endpt,
-       cp_grant_id_t grant);
-
  /* uds.c */
-ssize_t uds_perform_read(devminor_t minor, endpoint_t endpt,
-       cp_grant_id_t grant, size_t size, int pretend);
-void uds_unsuspend(devminor_t minor);
-
-#endif /* !__UDS_UDS_H */
+sockid_t uds_get_id(struct udssock * uds);
+struct udssock *uds_enum(struct udssock * prev, int type);
+void uds_make_addr(const char * path, size_t len, struct sockaddr * addr,
+       socklen_t * addr_len);
+int uds_lookup(struct udssock * uds, const struct sockaddr * addr,
+       socklen_t addr_len, endpoint_t user_endpt, struct udssock ** peerp);
+
+/* io.c */
+void uds_io_init(void);
+int uds_io_setup(struct udssock * uds);
+void uds_io_cleanup(struct udssock * uds);
+void uds_io_reset(struct udssock * uds);
+size_t uds_io_buflen(void);
+int uds_pre_send(struct sock * sock, size_t len, socklen_t ctl_len,
+       const struct sockaddr * addr, socklen_t addr_len,
+       endpoint_t user_endpt, int flags);
+int uds_send(struct sock * sock, const struct sockdriver_data * data,
+       size_t len, size_t * off, const struct sockdriver_data * ctl,
+       socklen_t ctl_len, socklen_t * ctl_off, const struct sockaddr * addr,
+       socklen_t addr_len, endpoint_t user_endpt, int flags, size_t min);
+int uds_test_send(struct sock * sock, size_t min);
+int uds_pre_recv(struct sock * sock, endpoint_t user_endpt, int flags);
+int uds_recv(struct sock * sock, const struct sockdriver_data * data,
+       size_t len, size_t * off, const struct sockdriver_data * ctl,
+       socklen_t ctl_len, socklen_t * ctl_off, struct sockaddr * addr,
+       socklen_t * addr_len, endpoint_t user_endpt, int flags, size_t min,
+       int * rflags);
+int uds_test_recv(struct sock * sock, size_t min, size_t * size);
+
+/* stat.c */
+void uds_stat_init(void);
+void uds_stat_cleanup(void);
+
+#endif /* !MINIX_NET_UDS_UDS_H */
diff --git a/minix/net/uds/unix.8 b/minix/net/uds/unix.8

index 131753e49f5fac271f858c78edd64a98491ca05d..ffde8edc968c1fa32bfa783aa0a9f4ac6a84ae1a 100644 (file)
--- a/minix/net/uds/unix.8
+++ b/minix/net/uds/unix.8
@@ -10,6 +10,7 @@ unix \- Unix Domain Sockets (PF_UNIX) / Local Sockets (PF_LOCAL)
  .in +5
  .ti -5
  int socket(int \fIdomain\fP, int \fItype\fP, int \fIprotocol\fP);
+.br
  .ti -5
  int socketpair(int \fIdomain\fP, int \fItype\fP, int \fIprotocol\fP, int \fIsv[2]\fP);
  .br
@@ -18,9 +19,8 @@ int socketpair(int \fIdomain\fP, int \fItype\fP, int \fIprotocol\fP, int \fIsv[2
  Local sockets, more commonly known as Unix Domain Sockets, provide a 
  means of interprocess communication using the socket API.
  .SH SEE ALSO
-.BR socket(2),
-.BR socketpair(2),
-.BR getpeereid(2),
-.BR uds(8)
+.BR socket(2) ,
+.BR socketpair(2) ,
+.BR getpeereid(3)
  .SH HISTORY
-This Unix Domain Sockets first appeared in Minix 3.1.8.
+Unix Domain Sockets first appeared in MINIX 3.1.8.
diff --git a/minix/servers/vfs/filedes.c b/minix/servers/vfs/filedes.c

index 7ddb51b42be44f1538ac64f3e7b1dc0e88e5378e..2051221e60177ee1616667432f5cd27fffc158e9 100644 (file)
--- a/minix/servers/vfs/filedes.c
+++ b/minix/servers/vfs/filedes.c
@@ -525,7 +525,8 @@ int do_copyfd(void)
  {
  /* Copy a file descriptor between processes, or close a remote file descriptor.
   * This call is used as back-call by device drivers (UDS, VND), and is expected
- * to be used in response to an IOCTL to such device drivers.
+ * to be used in response to either an IOCTL to VND or a SEND or RECV socket
+ * request to UDS.
   */
    struct fproc *rfp;
    struct filp *rfilp;
@@ -548,9 +549,9 @@ int do_copyfd(void)
    rfp = &fproc[slot];
  
    /* FIXME: we should now check that the user process is indeed blocked on an
-   * IOCTL call, so that we can safely mess with its file descriptors.  We
-   * currently do not have the necessary state to verify this, so we assume
-   * that the call is always used in the right way.
+   * IOCTL or socket call, so that we can safely mess with its file
+   * descriptors.  We currently do not have the necessary state to verify this,
+   * so we assume that the call is always used in the right way.
     */
  
    /* Depending on the operation, get the file descriptor from the caller or the
@@ -566,7 +567,7 @@ int do_copyfd(void)
     * passes in the file descriptor to the device node on which it is performing
     * the IOCTL.  We do not allow manipulation of such device nodes.  In
     * practice, this only applies to block-special files (and thus VND), because
-   * character-special files (as used by UDS) are unlocked during the IOCTL.
+   * socket files (as used by UDS) are unlocked during the socket operation.
     */
    if (rfilp->filp_ioctl_fp == rfp)
         return(EBADF);
diff --git a/minix/servers/vfs/open.c b/minix/servers/vfs/open.c

index fc5b0c227ff81a254d9be4d120c5505faaf7b846..014122fd23006bf1e82c47e37f0d2d8f27989b39 100644 (file)
--- a/minix/servers/vfs/open.c
+++ b/minix/servers/vfs/open.c
@@ -535,9 +535,9 @@ int do_mknod(void)
    resolve.l_vnode_lock = VNODE_WRITE;
  
    /* Only the super_user may make nodes other than fifos. */
-  if (!super_user && (!S_ISFIFO(mode_bits) && !S_ISSOCK(mode_bits))) {
+  if (!super_user && !S_ISFIFO(mode_bits))
         return(EPERM);
-  }
+
    bits = (mode_bits & S_IFMT) | (mode_bits & ACCESSPERMS & fp->fp_umask);
  
    /* Open directory that's going to hold the new node. */
diff --git a/minix/servers/vfs/path.c b/minix/servers/vfs/path.c

index 5f0191a4b71eda9f87ae368392b1897d485c6687..5360d2c1f3f7a5c9760fe128ae17770480c39aab 100644 (file)
--- a/minix/servers/vfs/path.c
+++ b/minix/servers/vfs/path.c
@@ -15,7 +15,6 @@
  #include <minix/vfsif.h>
  #include <sys/param.h>
  #include <sys/stat.h>
-#include <sys/un.h>
  #include <sys/dirent.h>
  #include "vmnt.h"
  #include "vnode.h"
@@ -819,7 +818,6 @@ int do_socketpath(void)
    struct fproc *rfp;
    char path[PATH_MAX];
    struct lookup resolve, resolve2;
-  struct sockaddr_un sun;
    mode_t bits;
  
    /* This should be replaced by an ACL check. */
@@ -831,24 +829,16 @@ int do_socketpath(void)
    what = job_m_in.m_lsys_vfs_socketpath.what;
  
    if (isokendpt(ep, &slot) != OK) return(EINVAL);
-  if (pathlen < sizeof(sun.sun_path) || pathlen >= PATH_MAX) return(EINVAL);
+  rfp = &fproc[slot];
  
-  rfp = &(fproc[slot]);
+  /* Copy in the path name, which must not be empty.  It is typically not null
+   * terminated.
+   */
+  if (pathlen < 1 || pathlen >= sizeof(path)) return(EINVAL);
    r = sys_safecopyfrom(who_e, io_gr, (vir_bytes)0, (vir_bytes)path, pathlen);
    if (r != OK) return(r);
    path[pathlen] = '\0';
  
-  /* If requested, turn path into canonical path to the socket file */
-  if (what & SPATH_CANONIZE) {
-       if ((r = canonical_path(path, rfp)) != OK) return(r);
-       if (strlen(path) >= pathlen) return(ENAMETOOLONG);
-
-       /* copy path back to the caller */
-       r = sys_safecopyto(who_e, (cp_grant_id_t)io_gr, (vir_bytes)0,
-           (vir_bytes)path, pathlen);
-       if (r != OK) return(r);
-  }
-
    /* Now perform the requested action.  For the SPATH_CHECK action, a socket
     * file is expected to exist already, and we should check whether the given
     * user process has access to it.  For the SPATH_CREATE action, no file is
@@ -859,7 +849,7 @@ int do_socketpath(void)
     * Since the above canonicalization releases all locks once done, we need to
     * recheck absolutely everything now.  TODO: do not release locks in between.
     */
-  switch (what & ~SPATH_CANONIZE) {
+  switch (what) {
    case SPATH_CHECK:
         lookup_init(&resolve, path, PATH_NOFLAGS, &vmp, &vp);
         resolve.l_vmnt_lock = VMNT_READ;
diff --git a/minix/tests/common-socket.c b/minix/tests/common-socket.c

index 1517513b4af3b31a4248bb8a8f4df1a4c1ff4940..e84ac351c2155f505447ac53ffeeb1983274b753 100644 (file)
--- a/minix/tests/common-socket.c
+++ b/minix/tests/common-socket.c
@@ -50,16 +50,19 @@ static char *get_timestamp(void)
  void test_fail_fl(char *msg, char *file, int line)
  {
         char *timestamp;
+       int e;
+       e = errno;
         timestamp = get_timestamp();
         if (errct == 0) fprintf(stderr, "\n");
+       errno = e;
         fprintf(stderr, "[ERROR][%s] (%s Line %d) %s [pid=%d:errno=%d:%s]\n",
-                       timestamp, file, line, msg, getpid(),
-                                       errno, strerror(errno));
+           timestamp, file, line, msg, getpid(), errno, strerror(errno));
         fflush(stderr);
         if (timestamp != NULL) {
                 free(timestamp);
                 timestamp = NULL;
         }
+       errno = e;
         e(7);
  }
  
@@ -317,7 +320,7 @@ void test_shutdown(const struct socket_test_info *info)
                 SOCKET(sd, info->domain, info->type, 0);
                 errno = 0;
                 rc = shutdown(sd, how[i]);
-               if (!(rc == -1 && errno == ENOTCONN) &&
+               if (rc != 0 && !(rc == -1 && errno == ENOTCONN) &&
                         !info->bug_shutdown_not_conn &&
                         !info->bug_shutdown) {
                         test_fail("shutdown() should have failed");
@@ -328,10 +331,10 @@ void test_shutdown(const struct socket_test_info *info)
         SOCKET(sd, info->domain, info->type, 0);
         errno = 0;
         rc = shutdown(sd, -1);
-       if (!(rc == -1 && errno == ENOTCONN) &&
+       if (!(rc == -1 && errno == EINVAL) &&
                 !info->bug_shutdown_not_conn &&
                 !info->bug_shutdown) {
-               test_fail("shutdown(sd, -1) should have failed with ENOTCONN");
+               test_fail("shutdown(sd, -1) should have failed with EINVAL");
         }
         CLOSE(sd);
  
@@ -431,8 +434,6 @@ void test_sockopts(const struct socket_test_info *info)
                 CLOSE(sd);
         }
  
-
-
         SOCKET(sd, info->domain, info->type, 0);
  
         debug("Test setsockopt() works");
@@ -901,9 +902,6 @@ static void test_xfer_client(const struct socket_test_info *info)
                 test_fail("[client] getpeername() should have worked");
         }
  
-       /* we need to use the full path "/usr/src/test/DIR_56/test.sock"
-        * because that is what is returned by getpeername().
-        */
  
         info->callback_check_sockaddr((struct sockaddr *) &peer_addr,
                 peer_addr_len, "getpeername", 1);
@@ -1299,8 +1297,8 @@ static void test_abort_client(const struct socket_test_info *info,
                         if (!info->ignore_write_conn_reset) {
                                 test_fail("write should have failed\n");
                         }
-               } else if (errno != ECONNRESET) {
-                       test_fail("errno should've been ECONNRESET\n");
+               } else if (errno != EPIPE && errno != ECONNRESET) {
+                       test_fail("errno should've been EPIPE/ECONNRESET\n");
                 }
         }
  
@@ -1353,7 +1351,7 @@ static void test_abort_server(const struct socket_test_info *info,
         if (abort_type == 1) {
                 memset(buf, '\0', BUFSIZE);
                 rc = read(client_sd, buf, BUFSIZE);
-               if (rc != -1 && (rc != 0 || !info->ignore_read_conn_reset)) {
+               if (rc != 0 && rc != -1) {
                         test_fail("read should've failed or returned zero\n");
                 }
                 if (rc != 0 && errno != ECONNRESET) {
@@ -1518,9 +1516,6 @@ void test_msg_dgram(const struct socket_test_info *info)
                 test_fail("recvmsg");
         }
  
-       /* we need to use the full path "/usr/src/test/DIR_56/testb.sock"
-        * because that is what is returned by recvmsg().
-        */
         info->callback_check_sockaddr((struct sockaddr *) &addr,
                 msg2.msg_namelen, "recvmsg", 2);
  
@@ -1603,6 +1598,9 @@ test_nonblock(const struct socket_test_info *info)
         if (bind(server_sd, info->serveraddr, info->serveraddrlen) == -1)
                 test_fail("bind() should have worked");
  
+       if (info->callback_set_listen_opt != NULL)
+               info->callback_set_listen_opt(server_sd);
+
         if (listen(server_sd, 8) == -1)
                 test_fail("listen() should have worked");
  
@@ -1813,6 +1811,9 @@ test_intr(const struct socket_test_info *info)
         if (bind(server_sd, info->serveraddr, info->serveraddrlen) == -1)
                 test_fail("bind() should have worked");
  
+       if (info->callback_set_listen_opt != NULL)
+               info->callback_set_listen_opt(server_sd);
+
         if (listen(server_sd, 8) == -1)
                 test_fail("listen() should have worked");
  
@@ -1844,6 +1845,9 @@ test_intr(const struct socket_test_info *info)
                 errct = 0;
                 close(client_sd);
  
+               /* Ensure that the parent is blocked on the send(). */
+               sleep(1);
+
                 check_select(server_sd, 1 /*read*/, 1 /*write*/, 0 /*block*/);
  
                 len = sizeof(addr);
@@ -1932,6 +1936,9 @@ test_connect_close(const struct socket_test_info *info)
         if (bind(server_sd, info->serveraddr, info->serveraddrlen) == -1)
                 test_fail("bind() should have worked");
  
+       if (info->callback_set_listen_opt != NULL)
+               info->callback_set_listen_opt(server_sd);
+
         if (listen(server_sd, 8) == -1)
                 test_fail("listen() should have worked");
  
@@ -1989,6 +1996,9 @@ test_listen_close(const struct socket_test_info *info)
         if (bind(server_sd, info->serveraddr, info->serveraddrlen) == -1)
                 test_fail("bind() should have worked");
  
+       if (info->callback_set_listen_opt != NULL)
+               info->callback_set_listen_opt(server_sd);
+
         if (listen(server_sd, 8) == -1)
                 test_fail("listen() should have worked");
  
@@ -2009,7 +2019,6 @@ test_listen_close(const struct socket_test_info *info)
  
         byte = 0;
         if (write(client_sd, &byte, 1) != -1 || errno != ENOTCONN)
-               /* Yes, you fucked up the fix for the FIXME below. */
                 test_fail("write() should have yielded ENOTCONN");
  
         if (connect(client_sd, info->clientaddr, info->clientaddrlen) != -1) {
@@ -2021,14 +2030,16 @@ test_listen_close(const struct socket_test_info *info)
         }
  
         /*
-        * FIXME: currently UDS cannot distinguish between sockets that have
-        * not yet been connected, and sockets that have been disconnected.
-        * Thus, we get the same error for both: ENOTCONN instead of EPIPE.
+        * The error we get on the next write() depends on whether the socket
+        * may be reused after a failed connect: for TCP/IP, it may not, so we
+        * get EPIPE; for UDS, it may be reused, so we get ENOTCONN.
          */
-#if 0
-       if (write(client_sd, &byte, 1) != -1 || errno != EPIPE)
-               test_fail("write() should have yielded EPIPE");
-#endif
+       if (!info->bug_connect_after_close) {
+               if (write(client_sd, &byte, 1) != -1 ||
+                   (errno != EPIPE && errno != ENOTCONN))
+                       test_fail("write() should have yielded "
+                           "EPIPE/ENOTCONN");
+       }
  
         close(client_sd);
  
@@ -2059,6 +2070,9 @@ test_listen_close_nb(const struct socket_test_info *info)
         if (bind(server_sd, info->serveraddr, info->serveraddrlen) == -1)
                 test_fail("bind() should have worked");
  
+       if (info->callback_set_listen_opt != NULL)
+               info->callback_set_listen_opt(server_sd);
+
         if (listen(server_sd, 8) == -1)
                 test_fail("listen() should have worked");
  
@@ -2097,16 +2111,6 @@ test_listen_close_nb(const struct socket_test_info *info)
                 test_fail("write() should have yielded ECONNRESET");
         }
  
-       /*
-        * FIXME: currently UDS cannot distinguish between sockets that have
-        * not yet been connected, and sockets that have been disconnected.
-        * Thus, we get the same error for both: ENOTCONN instead of EPIPE.
-        */
-#if 0
-       if (write(client_sd, &byte, 1) != -1 || errno != EPIPE)
-               test_fail("write() should have yielded EPIPE");
-#endif
-
         check_select_cond(client_sd, 1 /*read*/, 1 /*write*/, 0 /*block*/,
                 !info->ignore_select_delay);
  
diff --git a/minix/tests/common-socket.h b/minix/tests/common-socket.h

index f03015be5c2cb462a5853eed79d0eecdb880372d..03ba77e6cfc75107ceb3d454de983bbee11f2c75 100644 (file)
--- a/minix/tests/common-socket.h
+++ b/minix/tests/common-socket.h
@@ -88,7 +88,6 @@ struct socket_test_info {
         int ignore_accept_delay; /* success from accept after aborted connect */
         int ignore_connect_delay; /* nb connect not instant */
         int ignore_connect_unaccepted; /* connect succeeds without accept */
-       int ignore_read_conn_reset; /* read does not guarantee ECONNRESET */
         int ignore_select_delay; /* select delay reflecting other side nb op */
         int ignore_send_waiting; /* can send while waiting for nb recv */
         int ignore_write_conn_reset; /* write does not guarantee ECONNRESET */
@@ -98,6 +97,7 @@ struct socket_test_info {
         void (* callback_cleanup)(void);
         void (* callback_xfer_peercred)(int sd); /* can be NULL */
         void (* callback_xfer_prepclient)(void); /* can be NULL */
+       void (* callback_set_listen_opt)(int sd); /* can be NULL */
  };
  
  void test_abort_client_server(const struct socket_test_info *info,
diff --git a/minix/tests/test56.c b/minix/tests/test56.c

index 583468dd57ef4945ca8ff564d17b5a3f217eb3d5..d7a211b0099e2c26087ec54cb9dbeea1475379be 100644 (file)
--- a/minix/tests/test56.c
+++ b/minix/tests/test56.c
@@ -78,20 +78,6 @@ int max_error = 4;
  
  /* socket types supported */
  static int types[3] = {SOCK_STREAM, SOCK_SEQPACKET, SOCK_DGRAM};
-static char sock_fullpath[PATH_MAX + 1];
-
-/* Convert name to the full path of the socket. Assumes name is in cwd. */
-static char *fullpath(const char *name)
-{
-       char cwd[PATH_MAX + 1];
-
-       if (realpath(".", cwd) == NULL)
-               test_fail("Couldn't retrieve current working dir");
-
-       snprintf(sock_fullpath, PATH_MAX, "%s/%s", cwd, name);
-
-       return(sock_fullpath);
-}
  
  static void test_header(void)
  {
@@ -187,16 +173,16 @@ static void test_socketpair(void)
  
  static void test_ucred(void)
  {
-       struct uucred credentials;
+       struct unpcbid credentials;
         socklen_t ucred_length;
         uid_t euid = geteuid();
         gid_t egid = getegid();
         int sv[2];
         int rc;
  
-       debug("Test credentials passing");
+       debug("Test peer credentials");
  
-       ucred_length = sizeof(struct uucred);
+       ucred_length = sizeof(credentials);
  
         rc = socketpair(PF_UNIX, SOCK_STREAM, 0, sv);
         if (rc == -1) {
@@ -204,22 +190,24 @@ static void test_ucred(void)
         }
  
         memset(&credentials, '\0', ucred_length);
-       rc = getsockopt(sv[0], SOL_SOCKET, SO_PEERCRED, &credentials, 
+       rc = getsockopt(sv[0], 0, LOCAL_PEEREID, &credentials,
                                                         &ucred_length);
         if (rc == -1) {
-               test_fail("getsockopt(SO_PEERCRED) failed");
-       } else if (credentials.cr_ngroups != 0 ||
-                       credentials.cr_uid != geteuid() ||
-                       credentials.cr_gid != getegid()) {
-               /* printf("%d=%d %d=%d %d=%d",credentials.cr_ngroups, 0,
-                credentials.cr_uid, geteuid(), credentials.cr_gid, getegid()); */
+               test_fail("getsockopt(LOCAL_PEEREID) failed");
+       } else if (credentials.unp_pid != getpid() ||
+                       credentials.unp_euid != geteuid() ||
+                       credentials.unp_egid != getegid()) {
+               printf("%d=%d %d=%d %d=%d",credentials.unp_pid, getpid(),
+                   credentials.unp_euid, geteuid(),
+                   credentials.unp_egid, getegid());
                 test_fail("Credential passing gave us the wrong cred");
         }
  
         rc = getpeereid(sv[0], &euid, &egid);
         if (rc == -1) {
                 test_fail("getpeereid(sv[0], &euid, &egid) failed");
-       } else if (credentials.cr_uid != euid || credentials.cr_gid != egid) {
+       } else if (credentials.unp_euid != euid ||
+           credentials.unp_egid != egid) {
                 test_fail("getpeereid() didn't give the correct euid/egid");
         }
  
@@ -245,7 +233,7 @@ static void callback_check_sockaddr(const struct sockaddr *sockaddr,
  
         if (!(sockaddr_un->sun_family == AF_UNIX &&
                         strncmp(sockaddr_un->sun_path,
-                       fullpath(path),
+                       path,
                         sizeof(sockaddr_un->sun_path) - 1) == 0)) {
  
                 snprintf(buf, sizeof(buf), "%s() didn't return the right addr",
@@ -293,7 +281,6 @@ static void test_bind_unix(void)
         UNLINK(TEST_SYM_A);
         UNLINK(TEST_SYM_B);
  
-       SYMLINK(TEST_SYM_A, TEST_SYM_B);
         SYMLINK(TEST_SYM_B, TEST_SYM_A);
  
         SOCKET(sd, PF_UNIX, SOCK_STREAM, 0);
@@ -301,6 +288,19 @@ static void test_bind_unix(void)
         strncpy(addr.sun_path, TEST_SYM_A, sizeof(addr.sun_path) - 1);
         errno = 0;
         rc = bind(sd, (struct sockaddr *) &addr, sizeof(struct sockaddr_un));
+       if (!((rc == -1) && (errno == EADDRINUSE))) {
+               test_fail("bind() should have failed with EADDRINUSE");
+       }
+       CLOSE(sd);
+
+       SYMLINK(TEST_SYM_A, TEST_SYM_B);
+
+       SOCKET(sd, PF_UNIX, SOCK_STREAM, 0);
+
+       strncpy(addr.sun_path, TEST_SYM_A, sizeof(addr.sun_path) - 1);
+       strlcat(addr.sun_path, "/x", sizeof(addr.sun_path));
+       errno = 0;
+       rc = bind(sd, (struct sockaddr *) &addr, sizeof(struct sockaddr_un));
         if (!((rc == -1) && (errno == ELOOP))) {
                 test_fail("bind() should have failed with ELOOP");
         }
@@ -337,28 +337,49 @@ static void callback_xfer_prepclient(void) {
  }
  
  static void callback_xfer_peercred(int sd) {
-       struct uucred credentials;
+       struct unpcbid credentials;
         int rc;
         socklen_t ucred_length;
  
-       ucred_length = sizeof(struct uucred);
+       ucred_length = sizeof(credentials);
  
-       debug("Test passing the client credentials to the server");
+       debug("Test obtaining the peer credentials");
  
         memset(&credentials, '\0', ucred_length);
-       rc = getsockopt(sd, SOL_SOCKET, SO_PEERCRED, &credentials,
-                                                       &ucred_length);
+       rc = getsockopt(sd, 0, LOCAL_PEEREID, &credentials, &ucred_length);
  
         if (rc == -1) {
                 test_fail("[client] getsockopt() failed");
-       }  else if (credentials.cr_uid != geteuid() ||
-                                       credentials.cr_gid != getegid()) {
-               printf("%d=%d=%d %d=%d=%d\n", credentials.cr_uid, getuid(),
-                       geteuid(), credentials.cr_gid, getgid(), getegid());
+       } else if (credentials.unp_euid != geteuid() ||
+           credentials.unp_egid != getegid()) {
+               printf("%d=* %d=%d %d=%d", credentials.unp_pid,
+                   credentials.unp_euid, geteuid(),
+                   credentials.unp_egid, getegid());
                 test_fail("[client] Credential passing gave us a bad UID/GID");
         }
  }
  
+static void
+callback_set_listen_opt(int sd)
+{
+       int val;
+
+       /*
+        * Several of the tests assume that a new connection to a server will
+        * not be established (i.e., go from "connecting" to "connected" state)
+        * until the server actually accepts the connection with an accept(2)
+        * call.  With the new UDS implementation, this is no longer true: to
+        * match the behavior of other systems, UDS now preemptively connects
+        * the socket in anticipation of the accept(2) call.  We can change
+        * back to the old behavior by setting LOCAL_CONNWAIT however, and
+        * since the test effectively tests a larger set of socket transitions
+        * that way, that is what we do for these tests.
+        */
+       val = 1;
+       if (setsockopt(sd, 0, LOCAL_CONNWAIT, &val, sizeof(val)) != 0)
+               test_fail("setsockopt(LOCAL_CONNWAIT)");
+}
+
  static void test_vectorio(int type)
  {
         int sv[2];
@@ -563,7 +584,11 @@ static void test_scm_credentials(void)
         int rc;
         int src;
         int dst;
-       struct uucred cred;
+       int one;
+       union {
+               struct sockcred cred;
+               char buf[SOCKCREDSIZE(NGROUPS_MAX)];
+       } cred;
         struct cmsghdr *cmsg = NULL;
         struct sockaddr_un addr;
         struct iovec iov[3];
@@ -573,7 +598,7 @@ static void test_scm_credentials(void)
         char buf2[BUFSIZE];
         char buf3[BUFSIZE];
         char ctrl[BUFSIZE];
-       socklen_t addrlen = sizeof(struct sockaddr_un);
+       socklen_t len, addrlen = sizeof(struct sockaddr_un);
  
         debug("test_scm_credentials");
  
@@ -615,6 +640,16 @@ static void test_scm_credentials(void)
                 test_fail("bind");
         }
  
+       debug("request credential passing");
+
+       one = 1;
+       rc = setsockopt(dst, 0, LOCAL_CREDS, &one, sizeof(one));
+       if (rc == -1) {
+               test_fail("setsockopt(LOCAL_CREDS)");
+       }
+
+       debug("sending msg1");
+
         memset(&buf1, '\0', BUFSIZE);
         memset(&buf2, '\0', BUFSIZE);
         memset(&buf3, '\0', BUFSIZE);
@@ -640,8 +675,6 @@ static void test_scm_credentials(void)
         msg1.msg_controllen = 0;
         msg1.msg_flags = 0;
  
-       debug("sending msg1");
-
         rc = sendmsg(src, &msg1, 0);
         if (rc == -1) {
                 test_fail("sendmsg");
@@ -684,27 +717,50 @@ static void test_scm_credentials(void)
          * because that is what is returned by recvmsg().
          */
         if (addr.sun_family != AF_UNIX || strcmp(addr.sun_path,
-                                       fullpath(TEST_SUN_PATHB))) {
+                                       TEST_SUN_PATHB)) {
                 test_fail("recvmsg");
         }
  
         debug("looking for credentials");
  
-       memset(&cred, '\0', sizeof(struct uucred));
+       len = 0;
+
+       memset(&cred, 'x', sizeof(cred));
         for (cmsg = CMSG_FIRSTHDR(&msg2); cmsg != NULL;
                                         cmsg = CMSG_NXTHDR(&msg2, cmsg)) {
  
                 if (cmsg->cmsg_level == SOL_SOCKET &&
                                 cmsg->cmsg_type == SCM_CREDS) {
-
-                       memcpy(&cred, CMSG_DATA(cmsg), sizeof(struct uucred));
+                       /* The data may follow alignment padding, and no macro
+                        * yields the actual data length: compute it by hand.
+                        */
+                       len = cmsg->cmsg_len - (socklen_t)
+                           ((char *)CMSG_DATA(cmsg) - (char *)cmsg);
+
+                       if (len < sizeof(struct sockcred))
+                               test_fail("credentials too small");
+                       else if (len > sizeof(cred))
+                               test_fail("credentials too large");
+                       memcpy(cred.buf, CMSG_DATA(cmsg), len);
                         break;
                 }
         }
  
-       if (cred.cr_ngroups != 0 || cred.cr_uid != geteuid() ||
-                                               cred.cr_gid != getegid()) {
+       if (len == 0)
+               test_fail("no credentials found");
+
+       if (len != SOCKCREDSIZE(cred.cred.sc_ngroups))
+               test_fail("wrong credentials size");
  
+       /*
+        * TODO: check supplementary groups.  This whole test is pretty much
+        * pointless since we're running with very standard credentials anyway.
+        */
+       if (cred.cred.sc_uid != getuid() ||
+           cred.cred.sc_euid != geteuid() ||
+           cred.cred.sc_gid != getgid() ||
+           cred.cred.sc_egid != getegid() ||
+           cred.cred.sc_ngroups < 0 || cred.cred.sc_ngroups > NGROUPS_MAX) {
                 test_fail("did no receive the proper credentials");
         }
  
@@ -1384,22 +1440,18 @@ static void test_fchmod(void)
   * Test various aspects related to the socket files on the file system.
   * This subtest is woefully incomplete and currently only attempts to test
   * aspects that have recently been affected by code changes.  In the future,
- * there should be tests for path canonicalization and the entire range of file
- * system path and access related error codes (TODO).
+ * there should be tests for the entire range of file system path and access
+ * related error codes (TODO).
   */
  static void
  test_file(void)
  {
-       struct sockaddr_un addr;
-#if NOT_YET
-       struct sockaddr_un saddr, saddr2;
+       struct sockaddr_un addr, saddr, saddr2;
         char buf[1];
         socklen_t len;
         struct stat st;
         mode_t omask;
-       int, csd, fd;
-#endif
-       int sd, sd2;
+       int sd, sd2, csd, fd;
  
         /*
          * If the provided socket path exists on the file system, the bind(2)
@@ -1426,7 +1478,6 @@ test_file(void)
  
         CLOSE(sd);
  
-#if NOT_YET
         if (bind(sd2, (struct sockaddr *)&addr, sizeof(addr)) != -1)
                 test_fail("Binding socket unexpectedly succeeded");
         if (errno != EADDRINUSE)
@@ -1497,29 +1548,8 @@ test_file(void)
         if (memcmp(&saddr, &saddr2, sizeof(saddr)))
                 test_fail("Unexpected old socket address");
  
-       /*
-        * Currently, our implementation "hides" the old socket even if the new
-        * socket is closed, but since this is not standard behavior and may be
-        * changed later, we do not test for it.  However, in any case,
-        * rebinding the hidden socket should make it "visible" again.
-        */
-       strlcpy(saddr2.sun_path, TEST_SUN_PATHB, sizeof(saddr2.sun_path));
-       if (bind(sd, (struct sockaddr *)&saddr2, sizeof(saddr2)) != 0)
-               test_fail("Can't rebind socket");
-
-       memset(buf, 'Z', sizeof(buf));
-       if (sendto(csd, buf, sizeof(buf), 0, (struct sockaddr *)&saddr2,
-           sizeof(saddr2)) != sizeof(buf))
-               test_fail("Can't send to socket");
-       if (recvfrom(sd, buf, sizeof(buf), 0, NULL, 0) != sizeof(buf))
-               test_fail("Can't receive from socket");
-       if (buf[0] != 'Z')
-               test_fail("Transmission failure");
-
         if (unlink(TEST_SUN_PATH) != 0)
                 test_fail("Can't unlink socket");
-       if (unlink(TEST_SUN_PATHB) != 0)
-               test_fail("Can't unlink other socket");
  
         CLOSE(sd);
         CLOSE(sd2);
@@ -1580,7 +1610,6 @@ test_file(void)
         UNLINK(TEST_SUN_PATH);
  
         umask(omask);
-#endif
  
         /*
          * Only socket(2), socketpair(2), and accept(2) may be used to obtain
@@ -1631,8 +1660,8 @@ int main(int argc, char *argv[])
                 .clientaddrsym            = (struct sockaddr *) &clientaddrsym,
                 .clientaddrsymlen         = sizeof(clientaddrsym),
                 .domain                   = PF_UNIX,
-               .expected_rcvbuf          = PIPE_BUF,
-               .expected_sndbuf          = PIPE_BUF,
+               .expected_rcvbuf          = 32768 - 5, /* no constants: */
+               .expected_sndbuf          = 32768 - 5, /* UDS internals */
                 .serveraddr               = (struct sockaddr *) &clientaddr,
                 .serveraddrlen            = sizeof(clientaddr),
                 .serveraddr2              = (struct sockaddr *) &clientaddr2,
@@ -1644,12 +1673,16 @@ int main(int argc, char *argv[])
                 .callback_cleanup         = callback_cleanup,
                 .callback_xfer_prepclient = callback_xfer_prepclient,
                 .callback_xfer_peercred   = callback_xfer_peercred,
+               .callback_set_listen_opt  = callback_set_listen_opt,
         };
  
         debug("entering main()");
  
         start(56);
  
+       /* This test was written before UDS started supporting SIGPIPE. */
+       signal(SIGPIPE, SIG_IGN);
+
         test_socket(&info);
         test_bind(&info);
         test_bind_unix();
diff --git a/minix/tests/test80.c b/minix/tests/test80.c

index 0975387aaa1f07b692a7dc54ee7cfbc79f76ff3f..f6bc69ad9897e624062373705a2187fa64ebad6b 100644 (file)
--- a/minix/tests/test80.c
+++ b/minix/tests/test80.c
@@ -96,7 +96,6 @@ int main(int argc, char *argv[])
                 .ignore_accept_delay       = 1,
                 .ignore_connect_unaccepted = 1,
                 .ignore_connect_delay      = 1,
-               .ignore_read_conn_reset    = 1,
                 .ignore_select_delay       = 1,
                 .ignore_send_waiting       = 1,
                 .ignore_write_conn_reset   = 1,
diff --git a/minix/tests/test81.c b/minix/tests/test81.c

index 1a3188cd6c425a991355cbc33db833eb8caccac0..f23c174d3427806dd674bf72fea2c9674a9fc70b 100644 (file)
--- a/minix/tests/test81.c
+++ b/minix/tests/test81.c
@@ -99,7 +99,6 @@ int main(int argc, char *argv[])
                 .ignore_accept_delay       = 1,
                 .ignore_connect_unaccepted = 1,
                 .ignore_connect_delay      = 1,
-               .ignore_read_conn_reset    = 1,
                 .ignore_select_delay       = 1,
                 .ignore_send_waiting       = 1,
                 .ignore_write_conn_reset   = 1,
diff --git a/minix/usr.bin/trace/ioctl/net.c b/minix/usr.bin/trace/ioctl/net.c

index 8d7591c80b9ade9666836ef6b3f2f1299c4dc928..1842d5e6e248ef6e6cb20af4bfd67406761b14f4 100644 (file)
--- a/minix/usr.bin/trace/ioctl/net.c
+++ b/minix/usr.bin/trace/ioctl/net.c
@@ -189,6 +189,27 @@ static const struct flags udpopt_flags[] = {
         FLAG(NWUO_DI_IPOPT),
  };
  
+static void
+put_struct_uucred(struct trace_proc * proc, const char * name, int flags,
+       vir_bytes addr)
+{
+       struct uucred cred;
+
+       if (!put_open_struct(proc, name, flags, addr, &cred, sizeof(cred)))
+               return;
+
+       put_value(proc, "cr_uid", "%u", cred.cr_uid);
+       if (verbose > 0) {
+               put_value(proc, "cr_gid", "%u", cred.cr_gid);
+               if (verbose > 1)
+                       put_value(proc, "cr_ngroups", "%d", cred.cr_ngroups);
+               put_groups(proc, "cr_groups", PF_LOCADDR,
+                   (vir_bytes)&cred.cr_groups, cred.cr_ngroups);
+       }
+
+       put_close_struct(proc, verbose > 0);
+}
+
  static void
  put_msg_control(struct trace_proc * proc, struct msg_control * ptr)
  {
diff --git a/minix/usr.bin/trace/proto.h b/minix/usr.bin/trace/proto.h

index 11591edc170a5a0d80e78c577e6871212a268237..e27637b2c9bc52376d2c750fb3c726dc75f7342f 100644 (file)
--- a/minix/usr.bin/trace/proto.h
+++ b/minix/usr.bin/trace/proto.h
@@ -115,8 +115,6 @@ void put_dev(struct trace_proc *proc, const char *name, dev_t dev);
  void put_in_addr(struct trace_proc *proc, const char *name, struct in_addr in);
  void put_socket_type(struct trace_proc *proc, const char *name, int type);
  void put_socket_family(struct trace_proc *proc, const char *name, int family);
-void put_struct_uucred(struct trace_proc *proc, const char *name, int flags,
-       vir_bytes addr);
  void put_cmsg_type(struct trace_proc *proc, const char *name, int type);
  void put_shutdown_how(struct trace_proc *proc, const char *name, int how);
  
diff --git a/minix/usr.bin/trace/service/vfs.c b/minix/usr.bin/trace/service/vfs.c

index 93c0055fa509b1ddc9fa829adcd27c5933f21db7..8163f4dd7c698c21f33a170842602b2cd1e48619 100644 (file)
--- a/minix/usr.bin/trace/service/vfs.c
+++ b/minix/usr.bin/trace/service/vfs.c
@@ -1802,25 +1802,32 @@ put_struct_iovec(struct trace_proc * proc, const char * name, int flags,
         put_close(proc, "]");
  }
  
-void
-put_struct_uucred(struct trace_proc * proc, const char * name, int flags,
-       vir_bytes addr)
+static void
+put_struct_sockcred(struct trace_proc * proc, const char * name, int flags,
+       vir_bytes addr, size_t left)
  {
-       struct uucred cred;
+       struct sockcred sc;
  
-       if (!put_open_struct(proc, name, flags, addr, &cred, sizeof(cred)))
+       if (!put_open_struct(proc, name, flags, addr, &sc, sizeof(sc)))
                 return;
  
-       put_value(proc, "cr_uid", "%u", cred.cr_uid);
+       put_value(proc, "sc_uid", "%u", sc.sc_uid);
+       if (verbose > 0)
+               put_value(proc, "sc_euid", "%u", sc.sc_euid);
+       put_value(proc, "sc_gid", "%u", sc.sc_gid);
         if (verbose > 0) {
-               put_value(proc, "cr_gid", "%u", cred.cr_gid);
+               put_value(proc, "sc_egid", "%u", sc.sc_egid);
                 if (verbose > 1)
-                       put_value(proc, "cr_ngroups", "%d", cred.cr_ngroups);
-               put_groups(proc, "cr_groups", PF_LOCADDR,
-                   (vir_bytes)&cred.cr_groups, cred.cr_ngroups);
+                       put_value(proc, "sc_ngroups", "%d", sc.sc_ngroups);
+               if (left >= sizeof(sc.sc_groups[0]) * (sc.sc_ngroups - 1)) {
+                       put_groups(proc, "sc_groups", flags,
+                           addr + offsetof(struct sockcred, sc_groups),
+                           sc.sc_ngroups);
+               } else
+                       put_field(proc, "sc_groups", "..");
         }
  
-       put_close_struct(proc, verbose > 0);
+       put_close_struct(proc, verbose > 1);
  }
  
  static void
@@ -1907,7 +1914,7 @@ put_cmsg(struct trace_proc * proc, const char * name, vir_bytes addr,
         size_t len)
  {
         struct cmsghdr cmsg;
-       char buf[CMSG_SPACE(sizeof(struct uucred))];
+       char buf[CMSG_SPACE(sizeof(struct sockcred))];
         size_t off, chunk, datalen;
  
         if (valuesonly > 1 || addr == 0 || len < CMSG_LEN(0)) {
@@ -1960,10 +1967,11 @@ put_cmsg(struct trace_proc * proc, const char * name, vir_bytes addr,
                             addr + off + chunk, datalen);
                 } else if (cmsg.cmsg_level == SOL_SOCKET &&
                     cmsg.cmsg_type == SCM_CREDS &&
-                   datalen >= sizeof(struct uucred) &&
+                   datalen >= sizeof(struct sockcred) &&
                     chunk >= CMSG_LEN(datalen)) {
-                       put_struct_uucred(proc, "cmsg_data", PF_LOCADDR,
-                           (vir_bytes)&buf[CMSG_LEN(0)]);
+                       put_struct_sockcred(proc, "cmsg_data", PF_LOCADDR,
+                           (vir_bytes)&buf[CMSG_LEN(0)],
+                           datalen - sizeof(struct sockcred));
                 } else if (datalen > 0)
                         put_field(proc, "cmsg_data", "..");
  
@@ -2129,8 +2137,6 @@ put_sockopt_name(struct trace_proc * proc, const char * name, int level,
                 TEXT(SO_REUSEPORT);
                 TEXT(SO_NOSIGPIPE);
                 TEXT(SO_TIMESTAMP);
-               TEXT(SO_PASSCRED);
-               TEXT(SO_PEERCRED);
                 TEXT(SO_SNDBUF);
                 TEXT(SO_RCVBUF);
                 TEXT(SO_SNDLOWAT);
@@ -2157,7 +2163,6 @@ put_sockopt_data(struct trace_proc * proc, const char * name, int flags,
         const char *text;
         int i;
         struct linger l;
-       struct uucred cr;
         struct timeval tv;
         void *ptr;
         size_t size;
@@ -2183,7 +2188,6 @@ put_sockopt_data(struct trace_proc * proc, const char * name, int flags,
         case SO_REUSEPORT:
         case SO_NOSIGPIPE:
         case SO_TIMESTAMP:
-       case SO_PASSCRED:
         case SO_SNDBUF:
         case SO_RCVBUF:
         case SO_SNDLOWAT:
@@ -2199,10 +2203,6 @@ put_sockopt_data(struct trace_proc * proc, const char * name, int flags,
                 ptr = &l;
                 size = sizeof(l);
                 break;
-       case SO_PEERCRED:
-               ptr = &cr;
-               size = sizeof(cr);
-               break;
         case SO_SNDTIMEO:
         case SO_RCVTIMEO:
                 ptr = &tv;
@@ -2229,9 +2229,6 @@ put_sockopt_data(struct trace_proc * proc, const char * name, int flags,
                 put_value(proc, "l_linger", "%d", l.l_linger);
                 put_close(proc, "}");
                 break;
-       case SO_PEERCRED:
-               put_struct_uucred(proc, name, PF_LOCADDR, (vir_bytes)&cr);
-               break;
         case SO_ERROR:
                 put_open(proc, name, 0, "{", ", ");
                 if (!valuesonly && (text = get_error_name(i)) != NULL)
diff --git a/sys/sys/socket.h b/sys/sys/socket.h

index 219b90baf6d9249ac5d5ec7daba86ff235bf5da4..d7340af641275e07261dddc0ef8e2fef1ea536f2 100644 (file)
--- a/sys/sys/socket.h
+++ b/sys/sys/socket.h
@@ -133,12 +133,6 @@ typedef    _BSD_SSIZE_T_   ssize_t;
  #define        SO_ACCEPTFILTER 0x1000          /* there is an accept filter */
  #define        SO_TIMESTAMP    0x2000          /* timestamp received dgram traffic */
  
-#if defined(__minix) && defined(_MINIX_SYSTEM)
-/* Minixism which should go, so hide it from userland. */
-#define SO_PASSCRED    0x100000
-#define SO_PEERCRED    0x200000
-#endif /* defined(__minix) */
-
  /*
   * Additional options, not kept in so_options.
   */
author	David van Moolenbroek <david@minix3.org>
	Sun, 21 Feb 2016 22:59:04 +0000 (22:59 +0000)
committer	David van Moolenbroek <david@minix3.org>
	Thu, 9 Mar 2017 23:39:56 +0000 (23:39 +0000)
distrib/sets/lists/minix-base/mi		patch \| blob \| history
distrib/sets/lists/minix-man/mi		patch \| blob \| history
etc/system.conf		patch \| blob \| history
etc/usr/rc		patch \| blob \| history
external/bsd/tmux/dist/client.c		patch \| blob \| history
external/bsd/tmux/dist/server.c		patch \| blob \| history
lib/libc/gen/syslog.c		patch \| blob \| history
lib/libc/net/Makefile.inc		patch \| blob \| history
minix/commands/DESCRIBE/DESCRIBE.sh		patch \| blob \| history
minix/commands/MAKEDEV/MAKEDEV.sh		patch \| blob \| history
minix/include/minix/dmap.h		patch \| blob \| history
minix/include/minix/syslib.h		patch \| blob \| history
minix/lib/libc/net/getpeereid.c	[deleted file]	patch \| blob \| history
minix/lib/libc/sys/getsockopt.c		patch \| blob \| history
minix/lib/libc/sys/setsockopt.c		patch \| blob \| history
minix/lib/libsys/socketpath.c		patch \| blob \| history
minix/man/man2/Makefile		patch \| blob \| history
minix/man/man2/getpeereid.2	[deleted file]	patch \| blob \| history
minix/net/uds/Makefile		patch \| blob \| history
minix/net/uds/io.c	[new file with mode: 0644]	patch \| blob
minix/net/uds/ioc_uds.c	[deleted file]	patch \| blob \| history
minix/net/uds/stat.c	[new file with mode: 0644]	patch \| blob
minix/net/uds/uds.8	[deleted file]	patch \| blob \| history
minix/net/uds/uds.c		patch \| blob \| history
minix/net/uds/uds.conf	[new file with mode: 0644]	patch \| blob
minix/net/uds/uds.h		patch \| blob \| history
minix/net/uds/unix.8		patch \| blob \| history
minix/servers/vfs/filedes.c		patch \| blob \| history
minix/servers/vfs/open.c		patch \| blob \| history
minix/servers/vfs/path.c		patch \| blob \| history
minix/tests/common-socket.c		patch \| blob \| history
minix/tests/common-socket.h		patch \| blob \| history
minix/tests/test56.c		patch \| blob \| history
minix/tests/test80.c		patch \| blob \| history
minix/tests/test81.c		patch \| blob \| history
minix/usr.bin/trace/ioctl/net.c		patch \| blob \| history
minix/usr.bin/trace/proto.h		patch \| blob \| history
minix/usr.bin/trace/service/vfs.c		patch \| blob \| history
sys/sys/socket.h		patch \| blob \| history