From: David van Moolenbroek Date: Sun, 21 Feb 2016 19:28:24 +0000 (+0000) Subject: VFS: add BSD socket API, socket driver support X-Git-Url: http://zhaoyanbai.com/repos/%22http:/www.isc.org/icons/dnssec-keyfromlabel.html?a=commitdiff_plain;h=e3b8d4bb58a799dc7fd563ac39bf3015762af03b;p=minix.git VFS: add BSD socket API, socket driver support This patch adds the implementation of the BSD socket system calls which have been introduced in an earlier patch. At the same time, it adds support for communication with socket drivers, using a new "socket device" (SDEV_) protocol. These two parts, implemented in socket.c and sdev.c respectively, form the upper and lower halves of the new BSD socket support in VFS. New mapping functionality for socket domains and drivers is added as well, implemented in smap.c. The rest of the changes mainly facilitate the separation of character and socket driver calls, and do not make any fundamental alterations. For example, while this patch changes VFS's select.c rather heavily, the new select logic for socket drivers is the exact same as for character drivers; the changes mainly separate the driver type specific parts from the generic select logic further than before. Change-Id: I2f13084dd3c8d3a68bfc69da0621120c8291f707 --- diff --git a/minix/fs/pfs/pfs.c b/minix/fs/pfs/pfs.c index 0a5b0ca08..6248d8e79 100644 --- a/minix/fs/pfs/pfs.c +++ b/minix/fs/pfs/pfs.c @@ -132,7 +132,7 @@ pfs_newnode(mode_t mode, uid_t uid, gid_t gid, dev_t dev, /* Check the file type. Do we support it at all? */ isfifo = S_ISFIFO(mode); - isdev = S_ISBLK(mode) || S_ISCHR(mode); + isdev = S_ISBLK(mode) || S_ISCHR(mode) || S_ISSOCK(mode); if (!isfifo && !isdev) return EINVAL; /* this means VFS is misbehaving.. 
*/ diff --git a/minix/include/minix/com.h b/minix/include/minix/com.h index 35367e9ea..f43d01856 100644 --- a/minix/include/minix/com.h +++ b/minix/include/minix/com.h @@ -30,6 +30,7 @@ * 0x1600 - 0x16FF VirtualBox (VBOX) requests (see vboxif.h) * 0x1700 - 0x17FF PTYFS requests * 0x1800 - 0x18FF Management Information Base (MIB) requests + * 0x1900 - 0x19FF Socket device requests and responses * * Zero and negative values are widely used for OK and error responses. */ @@ -1027,6 +1028,54 @@ #define NR_MIB_CALLS 3 /* highest number from base plus one */ +/*===========================================================================* + * Messages for socket devices * + *===========================================================================*/ + +/* Base type for socket device requests and responses. */ +#define SDEV_RQ_BASE 0x1900 +#define SDEV_RS_BASE 0x1980 + +#define IS_SDEV_RQ(type) (((type) & ~0x7f) == SDEV_RQ_BASE) +#define IS_SDEV_RS(type) (((type) & ~0x7f) == SDEV_RS_BASE) + +/* Message types for socket device requests. 
*/ +#define SDEV_SOCKET (SDEV_RQ_BASE + 0) /* create socket */ +#define SDEV_SOCKETPAIR (SDEV_RQ_BASE + 1) /* make socket pair */ +#define SDEV_BIND (SDEV_RQ_BASE + 2) /* bind to address */ +#define SDEV_CONNECT (SDEV_RQ_BASE + 3) /* start connection */ +#define SDEV_LISTEN (SDEV_RQ_BASE + 4) /* enter listen mode */ +#define SDEV_ACCEPT (SDEV_RQ_BASE + 5) /* accept connection */ +#define SDEV_SEND (SDEV_RQ_BASE + 6) /* send data */ +#define SDEV_RECV (SDEV_RQ_BASE + 7) /* receive data */ +#define SDEV_IOCTL (SDEV_RQ_BASE + 8) /* I/O control */ +#define SDEV_SETSOCKOPT (SDEV_RQ_BASE + 9) /* set socket option */ +#define SDEV_GETSOCKOPT (SDEV_RQ_BASE + 10) /* get socket option */ +#define SDEV_GETSOCKNAME (SDEV_RQ_BASE + 11) /* get socket name */ +#define SDEV_GETPEERNAME (SDEV_RQ_BASE + 12) /* get peer name */ +#define SDEV_SHUTDOWN (SDEV_RQ_BASE + 13) /* shut down I/O */ +#define SDEV_CLOSE (SDEV_RQ_BASE + 14) /* close socket */ +#define SDEV_CANCEL (SDEV_RQ_BASE + 15) /* cancel request */ +#define SDEV_SELECT (SDEV_RQ_BASE + 16) /* select on socket */ + +/* Message types for socket device responses. */ +#define SDEV_REPLY (SDEV_RS_BASE + 0) /* generic reply */ +#define SDEV_SOCKET_REPLY (SDEV_RS_BASE + 1) /* socket reply */ +#define SDEV_ACCEPT_REPLY (SDEV_RS_BASE + 2) /* accept reply */ +#define SDEV_RECV_REPLY (SDEV_RS_BASE + 3) /* receive reply */ +#define SDEV_SELECT1_REPLY (SDEV_RS_BASE + 4) /* select reply 1 */ +#define SDEV_SELECT2_REPLY (SDEV_RS_BASE + 5) /* select reply 2 */ + +/* Bits in the 'sflags' field of socket device transfer requests. */ +# define SDEV_NOFLAGS 0x00 /* no flags are set */ +# define SDEV_NONBLOCK 0x01 /* do not suspend I/O request */ + +/* Bits in the 'ops', 'status' fields of socket device select messages. 
*/ +# define SDEV_OP_RD 0x01 /* selected for read operation */ +# define SDEV_OP_WR 0x02 /* selected for write operation */ +# define SDEV_OP_ERR 0x04 /* selected for error operation */ +# define SDEV_NOTIFY 0x08 /* notification requested */ + /*===========================================================================* * Internal codes used by several services * *===========================================================================*/ diff --git a/minix/include/minix/ipc.h b/minix/include/minix/ipc.h index 13075e10a..7a3507106 100644 --- a/minix/include/minix/ipc.h +++ b/minix/include/minix/ipc.h @@ -999,6 +999,52 @@ typedef struct { } mess_linputdriver_input_event; _ASSERT_MSG_SIZE(mess_linputdriver_input_event); +typedef struct { + int32_t req_id; + int32_t sock_id; + int status; + unsigned int len; + + uint8_t padding[40]; +} mess_lsockdriver_vfs_accept_reply; +_ASSERT_MSG_SIZE(mess_lsockdriver_vfs_accept_reply); + +typedef struct { + int32_t req_id; + int status; + unsigned int ctl_len; + unsigned int addr_len; + int flags; + + uint8_t padding[36]; +} mess_lsockdriver_vfs_recv_reply; +_ASSERT_MSG_SIZE(mess_lsockdriver_vfs_recv_reply); + +typedef struct { + int32_t req_id; + int status; + + uint8_t padding[48]; +} mess_lsockdriver_vfs_reply; +_ASSERT_MSG_SIZE(mess_lsockdriver_vfs_reply); + +typedef struct { + int32_t sock_id; + int status; + + uint8_t padding[48]; +} mess_lsockdriver_vfs_select_reply; +_ASSERT_MSG_SIZE(mess_lsockdriver_vfs_select_reply); + +typedef struct { + int32_t req_id; + int32_t sock_id; + int32_t sock_id2; + + uint8_t padding[44]; +} mess_lsockdriver_vfs_socket_reply; +_ASSERT_MSG_SIZE(mess_lsockdriver_vfs_socket_reply); + typedef struct { cp_grant_id_t gid; size_t size; @@ -2131,6 +2177,86 @@ typedef struct { } mess_vfs_lchardriver_select; _ASSERT_MSG_SIZE(mess_vfs_lchardriver_select); +typedef struct { + int32_t req_id; + int32_t sock_id; + cp_grant_id_t grant; + unsigned int len; + endpoint_t user_endpt; + int sflags; + + 
uint8_t padding[32]; +} mess_vfs_lsockdriver_addr; +_ASSERT_MSG_SIZE(mess_vfs_lsockdriver_addr); + +typedef struct { + int32_t req_id; + int32_t sock_id; + int level; + int name; + cp_grant_id_t grant; + unsigned int len; + + uint8_t padding[32]; +} mess_vfs_lsockdriver_getset; +_ASSERT_MSG_SIZE(mess_vfs_lsockdriver_getset); + +typedef struct { + int32_t req_id; + int32_t sock_id; + unsigned long request; + cp_grant_id_t grant; + endpoint_t user_endpt; + int sflags; + + uint8_t padding[32]; +} mess_vfs_lsockdriver_ioctl; +_ASSERT_MSG_SIZE(mess_vfs_lsockdriver_ioctl); + +typedef struct { + int32_t sock_id; + int ops; + + uint8_t padding[48]; +} mess_vfs_lsockdriver_select; +_ASSERT_MSG_SIZE(mess_vfs_lsockdriver_select); + +typedef struct { + int32_t req_id; + int32_t sock_id; + cp_grant_id_t data_grant; + size_t data_len; + cp_grant_id_t ctl_grant; + unsigned int ctl_len; + cp_grant_id_t addr_grant; + unsigned int addr_len; + endpoint_t user_endpt; + int flags; + + uint8_t padding[16]; +} mess_vfs_lsockdriver_sendrecv; +_ASSERT_MSG_SIZE(mess_vfs_lsockdriver_sendrecv); + +typedef struct { + int32_t req_id; + int32_t sock_id; + int param; + + uint8_t padding[44]; +} mess_vfs_lsockdriver_simple; +_ASSERT_MSG_SIZE(mess_vfs_lsockdriver_simple); + +typedef struct { + int32_t req_id; + int domain; + int type; + int protocol; + endpoint_t user_endpt; + + uint8_t padding[36]; +} mess_vfs_lsockdriver_socket; +_ASSERT_MSG_SIZE(mess_vfs_lsockdriver_socket); + typedef struct { cp_grant_id_t grant; size_t size; @@ -2301,6 +2427,15 @@ typedef struct noxfer_message { mess_li2cdriver_i2c_busc_i2c_exec m_li2cdriver_i2c_busc_i2c_exec; mess_li2cdriver_i2c_busc_i2c_reserve m_li2cdriver_i2c_busc_i2c_reserve; mess_linputdriver_input_event m_linputdriver_input_event; + mess_lsockdriver_vfs_accept_reply + m_lsockdriver_vfs_accept_reply; + mess_lsockdriver_vfs_recv_reply + m_lsockdriver_vfs_recv_reply; + mess_lsockdriver_vfs_reply m_lsockdriver_vfs_reply; + mess_lsockdriver_vfs_select_reply 
+ m_lsockdriver_vfs_select_reply; + mess_lsockdriver_vfs_socket_reply + m_lsockdriver_vfs_socket_reply; mess_lsys_fi_ctl m_lsys_fi_ctl; mess_lsys_fi_reply m_lsys_fi_reply; mess_lsys_getsysinfo m_lsys_getsysinfo; @@ -2423,6 +2558,13 @@ typedef struct noxfer_message { mess_vfs_lchardriver_openclose m_vfs_lchardriver_openclose; mess_vfs_lchardriver_readwrite m_vfs_lchardriver_readwrite; mess_vfs_lchardriver_select m_vfs_lchardriver_select; + mess_vfs_lsockdriver_addr m_vfs_lsockdriver_addr; + mess_vfs_lsockdriver_getset m_vfs_lsockdriver_getset; + mess_vfs_lsockdriver_ioctl m_vfs_lsockdriver_ioctl; + mess_vfs_lsockdriver_select m_vfs_lsockdriver_select; + mess_vfs_lsockdriver_sendrecv m_vfs_lsockdriver_sendrecv; + mess_vfs_lsockdriver_simple m_vfs_lsockdriver_simple; + mess_vfs_lsockdriver_socket m_vfs_lsockdriver_socket; mess_vfs_lsys_gcov m_vfs_lsys_gcov; mess_vfs_utimens m_vfs_utimens; mess_vm_vfs_mmap m_vm_vfs_mmap; diff --git a/minix/servers/is/dmp_fs.c b/minix/servers/is/dmp_fs.c index 0ed191736..53c675e8d 100644 --- a/minix/servers/is/dmp_fs.c +++ b/minix/servers/is/dmp_fs.c @@ -51,6 +51,7 @@ fproc_dmp(void) ); if (fp->fp_blocked_on == FP_BLOCKED_ON_CDEV) printf("%4d\n", fp->fp_cdev.endpt); + /* TODO: for FP_BLOCKED_ON_SDEV we do not have the endpoint.. */ else printf(" nil\n"); } diff --git a/minix/servers/mib/proc.c b/minix/servers/mib/proc.c index 60a901785..4a35536f2 100644 --- a/minix/servers/mib/proc.c +++ b/minix/servers/mib/proc.c @@ -306,10 +306,11 @@ get_lwp_stat(int mslot, uint64_t * wcptr, char * wmptr, size_t wmsz, wmesg = "select"; break; case FP_BLOCKED_ON_CDEV: + case FP_BLOCKED_ON_SDEV: /* - * Add the task (= character driver) endpoint to the - * wchan value, and use the driver's process name, - * without parentheses, as wmesg text. + * Add the task (= character or socket driver) endpoint + * to the wchan value, and use the driver's process + * name, without parentheses, as wmesg text. 
*/ wchan |= (uint64_t)fp->fpl_task << 16; fill_wmesg(wmptr, wmsz, fp->fpl_task, FALSE /*ipc*/); diff --git a/minix/servers/vfs/Makefile b/minix/servers/vfs/Makefile index aee65536f..c1ff239b9 100644 --- a/minix/servers/vfs/Makefile +++ b/minix/servers/vfs/Makefile @@ -8,7 +8,7 @@ SRCS= main.c open.c read.c write.c pipe.c dmap.c \ lock.c misc.c utility.c select.c table.c \ vnode.c vmnt.c request.c \ tll.c comm.c worker.c coredump.c \ - bdev.c cdev.c socket.c + bdev.c cdev.c sdev.c smap.c socket.c .if ${MKCOVERAGE} != "no" SRCS+= gcov.c diff --git a/minix/servers/vfs/README b/minix/servers/vfs/README index 5f28b3c0a..af4e88552 100644 --- a/minix/servers/vfs/README +++ b/minix/servers/vfs/README @@ -47,9 +47,9 @@ spread out over the kernel, VM, PM, and VFS). For example, it maintains state for select(2) calls, file descriptors and file positions. Also, it cooperates with the Process Manager to handle the fork, exec, and exit system calls. Third, VFS keeps track of endpoints that are supposed to be drivers for -character or block special files. File Servers can be regarded as drivers for -block special files, although they are handled entirely different compared -to other drivers. +character or block special files, as well as for socket protocol families. +File Servers can be regarded as drivers for block special files, although they +are handled entirely different compared to other drivers. The following diagram depicts how a read() on a file in /home is being handled: {{{ @@ -88,10 +88,10 @@ fetches a message (internally referred to as a job in some cases), executes the request embedded in the message, returns a reply, and fetches the next job. There are several sources for new jobs: from user processes, from PM, from the kernel, and from suspended jobs inside VFS itself (suspended operations -on pipes, locks, or character special files). File Servers are regarded as -normal user processes in this case, but their abilities are limited. This -is to prevent deadlocks. 
Once a job is received, a worker thread starts -executing it. During the lifetime of a job, the worker thread might need +on pipes, locks, character special files, or sockets). File Servers are +regarded as normal user processes in this case, but their abilities are +limited. This is to prevent deadlocks. Once a job is received, a worker thread +starts executing it. During the lifetime of a job, the worker thread might need to talk to several File Servers. The protocol VFS speaks with File Servers is fully documented on the Wiki at [0]. The protocol fields are defined in <minix/vfs_if.h>. If the job is an operation on a character or block special @@ -122,10 +122,10 @@ Driver replies are processed directly from the main thread. As a consequence, these processing routines may not block their calling thread. In some cases, these routines may resume a thread that is blocked waiting for the reply. This is always the case for block driver replies, and may or may not be the case for -character driver replies. The character driver reply processing routines may -also unblock suspended processes which in turn generate new jobs to be handled -by the main loop (e.g., suspended reads and writes on pipes). So depending -on the reply a new thread may have to be started. +character and socket driver replies. The character and socket driver reply +processing routines may also unblock suspended processes which in turn generate +new jobs to be handled by the main loop (e.g., suspended reads and writes on +pipes). So depending on the reply a new thread may have to be started. Worker threads are strictly tied to a process, and each process can have at most one worker thread running for it. Generally speaking, there are two types @@ -655,9 +655,9 @@ Table 7: VFS-FS requests locking guarantees == Recovery from driver crashes == ## 5 Recovery from driver crashes -VFS can recover from block special file and character special file driver -crashes. 
It can recover to some degree from a crashed File Server (which we -can regard as a driver). +VFS can recover from block, character, and socket driver crashes. It can +recover to some degree from a crashed File Server (which we can regard as a +driver). === Recovery from block drivers crashes === ## 5.1 Recovery from block drivers crashes @@ -672,17 +672,18 @@ files can cause the block driver to crash again. When that happens, VFS will stop the recovery. A driver can return ERESTART to VFS to tell it to retry a request. VFS does this with an arbitrary maximum of 5 attempts. -=== Recovery from character driver crashes === -## 5.2 Recovery from character driver crashes +=== Recovery from character and socket driver crashes === +## 5.2 Recovery from character and socket driver crashes While VFS used to support minimal recovery from character driver crashes, the added complexity has so far proven to outweigh the benefits, especially since such crash recovery can never be fully transparent: it depends entirely on the character device as to whether repeating an I/O request makes sense at all. Currently, all operations except close(2) on a file descriptor that identifies -a device on a crashed character driver, will result in an EIO error. It is up -to the application to reopen the character device and retry whatever it was -doing in the appropriate manner. In the future, automatic reopen and I/O -restart may be reintroduced for a limited subset of character drivers. +a device on a crashed character or socket driver, will result in an EIO error. +It is up to the application to reopen the character device or socket and retry +whatever it was doing in the appropriate manner. In the future, automatic +reopen and I/O restart may be reintroduced for a limited subset of character +drivers. 
=== Recovery from File Server crashes === ## 5.3 Recovery from File Server crashes diff --git a/minix/servers/vfs/bdev.c b/minix/servers/vfs/bdev.c index c18fb3a28..714e24648 100644 --- a/minix/servers/vfs/bdev.c +++ b/minix/servers/vfs/bdev.c @@ -195,7 +195,7 @@ bdev_reply(void) struct worker_thread *wp; struct dmap *dp; - if ((dp = get_dmap(who_e)) == NULL) { + if ((dp = get_dmap_by_endpt(who_e)) == NULL) { printf("VFS: ignoring block dev reply from unknown driver " "%d\n", who_e); return; diff --git a/minix/servers/vfs/cdev.c b/minix/servers/vfs/cdev.c index e4e8b1c2c..8f58a5bd8 100644 --- a/minix/servers/vfs/cdev.c +++ b/minix/servers/vfs/cdev.c @@ -481,7 +481,7 @@ void cdev_reply(void) { - if (get_dmap(who_e) == NULL) { + if (get_dmap_by_endpt(who_e) == NULL) { printf("VFS: ignoring char dev reply from unknown driver %d\n", who_e); return; @@ -492,11 +492,13 @@ cdev_reply(void) cdev_generic_reply(&m_in); break; case CDEV_SEL1_REPLY: - select_reply1(m_in.m_source, m_in.m_lchardriver_vfs_sel1.minor, + select_cdev_reply1(m_in.m_source, + m_in.m_lchardriver_vfs_sel1.minor, m_in.m_lchardriver_vfs_sel1.status); break; case CDEV_SEL2_REPLY: - select_reply2(m_in.m_source, m_in.m_lchardriver_vfs_sel2.minor, + select_cdev_reply2(m_in.m_source, + m_in.m_lchardriver_vfs_sel2.minor, m_in.m_lchardriver_vfs_sel2.status); break; default: diff --git a/minix/servers/vfs/comm.c b/minix/servers/vfs/comm.c index e21e62159..49199f3d6 100644 --- a/minix/servers/vfs/comm.c +++ b/minix/servers/vfs/comm.c @@ -101,7 +101,7 @@ int drv_sendrec(endpoint_t drv_e, message *reqmp) return EIO; } - if ((dp = get_dmap(drv_e)) == NULL) + if ((dp = get_dmap_by_endpt(drv_e)) == NULL) panic("driver endpoint %d invalid", drv_e); lock_dmap(dp); diff --git a/minix/servers/vfs/const.h b/minix/servers/vfs/const.h index 4385350ed..21addf66a 100644 --- a/minix/servers/vfs/const.h +++ b/minix/servers/vfs/const.h @@ -7,6 +7,7 @@ #define NR_MNTS 16 /* # slots in mount table */ #define NR_VNODES 1024 /* # 
slots in vnode table */ #define NR_WTHREADS 9 /* # slots in worker thread table */ +#define NR_SOCKDEVS 8 /* # slots in smap table */ #define NR_NONEDEVS NR_MNTS /* # slots in nonedev bitmap */ @@ -21,6 +22,7 @@ #define FP_BLOCKED_ON_POPEN 3 /* susp'd on pipe open */ #define FP_BLOCKED_ON_SELECT 4 /* susp'd on select */ #define FP_BLOCKED_ON_CDEV 5 /* blocked on character device I/O */ +#define FP_BLOCKED_ON_SDEV 6 /* blocked on socket I/O */ /* test if the process is blocked on something */ #define fp_is_blocked(fp) ((fp)->fp_blocked_on != FP_BLOCKED_ON_NONE) @@ -40,6 +42,11 @@ #define SEL_WR CDEV_OP_WR #define SEL_ERR CDEV_OP_ERR #define SEL_NOTIFY CDEV_NOTIFY /* not a real select operation */ +/* If these constants diverge, VFS must be extended to perform mapping. */ +#if (CDEV_OP_RD != SDEV_OP_RD || CDEV_OP_WR != SDEV_OP_WR || \ + CDEV_OP_ERR != SDEV_OP_ERR || CDEV_NOTIFY != SDEV_NOTIFY) +#error "CDEV and SDEV select constants are different" +#endif /* special driver endpoint for CTTY_MAJOR; must be able to pass isokendpt() */ #define CTTY_ENDPT VFS_PROC_NR diff --git a/minix/servers/vfs/device.c b/minix/servers/vfs/device.c index 0253de722..08251468c 100644 --- a/minix/servers/vfs/device.c +++ b/minix/servers/vfs/device.c @@ -45,6 +45,10 @@ do_ioctl(void) f->filp_flags); break; + case S_IFSOCK: + r = sdev_ioctl(vp->v_sdev, request, arg, f->filp_flags); + break; + default: r = ENOTTY; } diff --git a/minix/servers/vfs/dmap.c b/minix/servers/vfs/dmap.c index bec47a5f0..044a7d5c8 100644 --- a/minix/servers/vfs/dmap.c +++ b/minix/servers/vfs/dmap.c @@ -110,6 +110,7 @@ int do_mapdriver(void) * etc), and its label. This label is registered with DS, and allows us to * retrieve the driver's endpoint. 
*/ + const int *domains; int r, slot, ndomains; devmajor_t major; endpoint_t endpoint; @@ -125,7 +126,7 @@ int do_mapdriver(void) label_len = job_m_in.m_lsys_vfs_mapdriver.labellen; major = job_m_in.m_lsys_vfs_mapdriver.major; ndomains = job_m_in.m_lsys_vfs_mapdriver.ndomains; - /* domains = job_m_in.m_lsys_vfs_mapdriver.domains; */ + domains = job_m_in.m_lsys_vfs_mapdriver.domains; /* Get the label */ if (label_len > sizeof(label)) { /* Can we store this label? */ @@ -164,8 +165,7 @@ int do_mapdriver(void) return r; } if (ndomains != 0) { - r = EINVAL; /* TODO: add support for mapping socket drivers */ - if (r != OK) { + if ((r = smap_map(label, endpoint, domains, ndomains)) != OK) { if (major != NO_DEV) map_driver(NULL, major, NONE); /* undo */ return r; @@ -314,7 +314,7 @@ void dmap_endpt_up(endpoint_t proc_e, int is_blk) /*===========================================================================* * get_dmap * *===========================================================================*/ -struct dmap *get_dmap(endpoint_t proc_e) +struct dmap *get_dmap_by_endpt(endpoint_t proc_e) { /* See if 'proc_e' endpoint belongs to a valid dmap entry. 
If so, return a * pointer */ diff --git a/minix/servers/vfs/file.h b/minix/servers/vfs/file.h index a1a4a51e0..7f184b508 100644 --- a/minix/servers/vfs/file.h +++ b/minix/servers/vfs/file.h @@ -28,8 +28,8 @@ EXTERN struct filp { int filp_select_flags; /* Select flags for the filp */ /* following are for fd-type-specific select() */ - int filp_pipe_select_ops; - dev_t filp_char_select_dev; + int filp_pipe_select_ops; /* used for pipes */ + dev_t filp_select_dev; /* used for character and socket devices */ } filp[NR_FILPS]; #define FILP_CLOSED 0 /* filp_mode: associated device closed/gone */ diff --git a/minix/servers/vfs/filedes.c b/minix/servers/vfs/filedes.c index 1233dd9dd..1bbeeb6ef 100644 --- a/minix/servers/vfs/filedes.c +++ b/minix/servers/vfs/filedes.c @@ -82,6 +82,28 @@ void init_filps(void) } +/*===========================================================================* + * check_fds * + *===========================================================================*/ +int check_fds(struct fproc *rfp, int nfds) +{ +/* Check whether at least 'nfds' file descriptors can be created in the process + * 'rfp'. Return OK on success, or otherwise an appropriate error code. 
+ */ + int i; + + assert(nfds >= 1); + + for (i = 0; i < OPEN_MAX; i++) { + if (rfp->fp_filp[i] == NULL) { + if (--nfds == 0) + return OK; + } + } + + return EMFILE; +} + /*===========================================================================* * get_fd * *===========================================================================*/ @@ -119,7 +141,7 @@ int get_fd(struct fproc *rfp, int start, mode_t bits, int *k, struct filp **fpt) f->filp_selectors = 0; f->filp_select_ops = 0; f->filp_pipe_select_ops = 0; - f->filp_char_select_dev = NO_DEV; + f->filp_select_dev = NO_DEV; f->filp_flags = 0; f->filp_select_flags = 0; f->filp_softlock = NULL; @@ -201,6 +223,27 @@ struct filp *find_filp(struct vnode *vp, mode_t bits) return(NULL); } +/*===========================================================================* + * find_filp_by_sock_dev * + *===========================================================================*/ +struct filp *find_filp_by_sock_dev(dev_t dev) +{ +/* See if there is a file pointer for a socket with the given socket device + * number. + */ + struct filp *f; + + for (f = &filp[0]; f < &filp[NR_FILPS]; f++) { + if (f->filp_count != 0 && f->filp_vno != NULL && + S_ISSOCK(f->filp_vno->v_mode) && f->filp_vno->v_sdev == dev && + f->filp_mode != FILP_CLOSED) { + return f; + } + } + + return NULL; +} + /*===========================================================================* * invalidate_filp * *===========================================================================*/ @@ -228,6 +271,27 @@ void invalidate_filp_by_char_major(devmajor_t major) } } +/*===========================================================================* + * invalidate_filp_by_sock_drv * + *===========================================================================*/ +void invalidate_filp_by_sock_drv(unsigned int num) +{ +/* Invalidate all file pointers for sockets owned by the socket driver with the + * smap number 'num'. 
+ */ + struct filp *f; + struct smap *sp; + + for (f = &filp[0]; f < &filp[NR_FILPS]; f++) { + if (f->filp_count != 0 && f->filp_vno != NULL) { + if (S_ISSOCK(f->filp_vno->v_mode) && + (sp = get_smap_by_dev(f->filp_vno->v_sdev, NULL)) != NULL + && sp->smap_num == num) + invalidate_filp(f); + } + } +} + /*===========================================================================* * invalidate_filp_by_endpt * *===========================================================================*/ @@ -363,7 +427,8 @@ close_filp(struct filp *f) if (f->filp_count - 1 == 0 && f->filp_mode != FILP_CLOSED) { /* Check to see if the file is special. */ - if (S_ISCHR(vp->v_mode) || S_ISBLK(vp->v_mode)) { + if (S_ISCHR(vp->v_mode) || S_ISBLK(vp->v_mode) || + S_ISSOCK(vp->v_mode)) { dev = vp->v_sdev; if (S_ISBLK(vp->v_mode)) { lock_bsf(); @@ -377,8 +442,23 @@ close_filp(struct filp *f) unlock_bsf(); (void) bdev_close(dev); /* Ignore errors */ - } else { + } else if (S_ISCHR(vp->v_mode)) { (void) cdev_close(dev); /* Ignore errors */ + } else { + /* + * TODO: this should be completely redone. Sockets may + * take a while to be closed (SO_LINGER etc), and thus, + * we should be able to issue a suspending close to a + * socket driver. Getting this working for close(2) is + * the easy case, but there's also eg dup2(2), which if + * interrupted by a signal should fail without closing + * the file descriptor. Then there are cases where the + * close should probably never block: close-on-exec, + * exit, and UDS closing in-flight FDs (currently just + * using close(2), but it could set the FD to non- + * blocking) for instance. There is much to do here. 
+ */ + (void) sdev_close(dev); /* Ignore errors */ } f->filp_mode = FILP_CLOSED; diff --git a/minix/servers/vfs/fproc.h b/minix/servers/vfs/fproc.h index 13c0cf61c..9ba946aea 100644 --- a/minix/servers/vfs/fproc.h +++ b/minix/servers/vfs/fproc.h @@ -49,6 +49,15 @@ EXTERN struct fproc { endpoint_t endpt; /* driver endpoint */ cp_grant_id_t grant; /* data grant */ } u_cdev; + struct { /* FP_BLOCKED_ON_SDEV */ + dev_t dev; /* socket number for blocking call */ + int callnr; /* user call: a VFS_ socket call */ + cp_grant_id_t grant[3]; /* data grant(s) */ + union ixfer_u_aux { + int fd; /* listener file descr. (VFS_ACCEPT) */ + vir_bytes buf; /* user buffer address (VFS_RECVMSG) */ + } aux; /* call-specific auxiliary data */ + } u_sdev; } fp_u; uid_t fp_realuid; /* real user id */ @@ -77,6 +86,7 @@ EXTERN struct fproc { #define fp_popen fp_u.u_popen #define fp_flock fp_u.u_flock #define fp_cdev fp_u.u_cdev +#define fp_sdev fp_u.u_sdev /* fp_flags */ #define FP_NOFLAGS 0000 diff --git a/minix/servers/vfs/fs.h b/minix/servers/vfs/fs.h index 500d2ed33..bfe5d0f36 100644 --- a/minix/servers/vfs/fs.h +++ b/minix/servers/vfs/fs.h @@ -18,10 +18,14 @@ #include #include #include +#include #include #include #include +#include +#include +#include #include #include diff --git a/minix/servers/vfs/main.c b/minix/servers/vfs/main.c index 9be33245e..77e939ddf 100644 --- a/minix/servers/vfs/main.c +++ b/minix/servers/vfs/main.c @@ -38,7 +38,6 @@ static void do_reply(struct worker_thread *wp); static void do_work(void); static void do_init_root(void); static void handle_work(void (*func)(void)); -static void reply(message *m_out, endpoint_t whom, int result); static int get_work(void); static void service_pm(void); @@ -130,6 +129,9 @@ int main(void) } else if (IS_CDEV_RS(call_nr)) { /* We've got results for a character device request. */ cdev_reply(); + } else if (IS_SDEV_RS(call_nr)) { + /* We've got results for a socket driver request. */ + sdev_reply(); } else { /* Normal syscall. 
This spawns a new thread. */ handle_work(do_work); @@ -447,6 +449,7 @@ static int sef_cb_init_fresh(int UNUSED(type), sef_init_info_t *info) panic("VFS: couldn't initialize block special file lock"); init_dmap(); /* Initialize device table. */ + init_smap(); /* Initialize socket table. */ /* Map all the services in the boot image. */ if ((s = sys_safecopyfrom(RS_PROC_NR, info->rproctab_gid, 0, @@ -632,7 +635,7 @@ static int get_work(void) /*===========================================================================* * reply * *===========================================================================*/ -static void reply(message *m_out, endpoint_t whom, int result) +void reply(message *m_out, endpoint_t whom, int result) { /* Send a reply to a user process. If the send fails, just ignore it. */ int r; diff --git a/minix/servers/vfs/misc.c b/minix/servers/vfs/misc.c index 54ed7ad0f..4ea85f11c 100644 --- a/minix/servers/vfs/misc.c +++ b/minix/servers/vfs/misc.c @@ -53,6 +53,7 @@ int do_getsysinfo(void) { struct fproc *rfp; struct fproc_light *rfpl; + struct smap *sp; vir_bytes src_addr, dst_addr; size_t len, buf_size; int what; @@ -85,6 +86,9 @@ int do_getsysinfo(void) rfpl->fpl_blocked_on = rfp->fp_blocked_on; if (rfp->fp_blocked_on == FP_BLOCKED_ON_CDEV) rfpl->fpl_task = rfp->fp_cdev.endpt; + else if (rfp->fp_blocked_on == FP_BLOCKED_ON_SDEV && + (sp = get_smap_by_dev(rfp->fp_sdev.dev, NULL)) != NULL) + rfpl->fpl_task = sp->smap_endpt; else rfpl->fpl_task = NONE; } @@ -656,10 +660,11 @@ static void free_proc(int flags) /* Check if any process is SUSPENDed on this driver. * If a driver exits, unmap its entries in the dmap table. * (unmapping has to be done after the first step, because the - * dmap table is used in the first step.) + * dmap/smap tables are used in the first step.) 
*/ unsuspend_by_endpt(fp->fp_endpoint); dmap_unmap_by_endpt(fp->fp_endpoint); + smap_unmap_by_endpt(fp->fp_endpoint); worker_stop_by_endpt(fp->fp_endpoint); /* Unblock waiting threads */ vmnt_unmap_by_endpt(fp->fp_endpoint); /* Invalidate open files if this @@ -939,17 +944,20 @@ ds_event(void) char key[DS_MAX_KEYLEN]; char *blkdrv_prefix = "drv.blk."; char *chrdrv_prefix = "drv.chr."; + char *sckdrv_prefix = "drv.sck."; u32_t value; - int type, r, is_blk; + int type, ftype, r; endpoint_t owner_endpoint; /* Get the event and the owner from DS. */ while ((r = ds_check(key, &type, &owner_endpoint)) == OK) { - /* Only check for block and character driver up events. */ + /* Only check for block, character, socket driver up events. */ if (!strncmp(key, blkdrv_prefix, strlen(blkdrv_prefix))) { - is_blk = TRUE; + ftype = S_IFBLK; } else if (!strncmp(key, chrdrv_prefix, strlen(chrdrv_prefix))) { - is_blk = FALSE; + ftype = S_IFCHR; + } else if (!strncmp(key, sckdrv_prefix, strlen(sckdrv_prefix))) { + ftype = S_IFSOCK; } else { continue; } @@ -961,7 +969,10 @@ ds_event(void) if (value != DS_DRIVER_UP) continue; /* Perform up. */ - dmap_endpt_up(owner_endpoint, is_blk); + if (ftype == S_IFBLK || ftype == S_IFCHR) + dmap_endpt_up(owner_endpoint, (ftype == S_IFBLK)); + else + smap_endpt_up(owner_endpoint); } if (r != ENOENT) printf("VFS: ds_event: ds_check failed: %d\n", r); diff --git a/minix/servers/vfs/pipe.c b/minix/servers/vfs/pipe.c index bd89fedaa..7a441baee 100644 --- a/minix/servers/vfs/pipe.c +++ b/minix/servers/vfs/pipe.c @@ -337,12 +337,17 @@ void unsuspend_by_endpt(endpoint_t proc_e) * return code EIO. 
*/ struct fproc *rp; + struct smap *sp; for (rp = &fproc[0]; rp < &fproc[NR_PROCS]; rp++) { if (rp->fp_pid == PID_FREE) continue; if (rp->fp_blocked_on == FP_BLOCKED_ON_CDEV && rp->fp_cdev.endpt == proc_e) revive(rp->fp_endpoint, EIO); + else if (rp->fp_blocked_on == FP_BLOCKED_ON_SDEV && + (sp = get_smap_by_dev(rp->fp_sdev.dev, NULL)) != NULL && + sp->smap_endpt == proc_e) + sdev_stop(rp); } /* Revive processes waiting in drivers on select()s with EAGAIN too */ @@ -430,8 +435,8 @@ void release(struct vnode * vp, int op, int count) void revive(endpoint_t proc_e, int returned) { /* Revive a previously blocked process. When a process hangs on tty, this - * is the way it is eventually released. For processes blocked on _SELECT and - * _CDEV, this function MUST NOT block its calling thread. + * is the way it is eventually released. For processes blocked on _SELECT, + * _CDEV, or _SDEV, this function MUST NOT block its calling thread. */ struct fproc *rfp; int blocked_on; @@ -454,13 +459,15 @@ void revive(endpoint_t proc_e, int returned) reviving++; /* process was waiting on pipe or lock */ } else { rfp->fp_blocked_on = FP_BLOCKED_ON_NONE; - if (blocked_on == FP_BLOCKED_ON_POPEN) { + switch (blocked_on) { + case FP_BLOCKED_ON_POPEN: /* process blocked in open or create */ replycode(proc_e, rfp->fp_popen.fd); - } else if (blocked_on == FP_BLOCKED_ON_SELECT) { + break; + case FP_BLOCKED_ON_SELECT: replycode(proc_e, returned); - } else { - assert(blocked_on == FP_BLOCKED_ON_CDEV); + break; + case FP_BLOCKED_ON_CDEV: /* If a grant has been issued by FS for this I/O, revoke * it again now that I/O is done. */ @@ -471,6 +478,15 @@ void revive(endpoint_t proc_e, int returned) } } replycode(proc_e, returned);/* unblock the process */ + break; + case FP_BLOCKED_ON_SDEV: + /* + * Cleaning up socket requests is too complex to put here, and + * neither sdev_reply() nor sdev_stop() call revive(). 
+ */ + panic("revive should not be used for socket calls"); + default: + panic("unknown block state %d", blocked_on); } } } @@ -491,8 +507,9 @@ void unpause(void) blocked_on = fp->fp_blocked_on; /* Clear the block status now. The procedure below might make blocking calls - * and it is imperative that while at least cdev_cancel() is executing, other - * parts of VFS do not perceive this process as blocked on something. + * and it is imperative that while at least cdev_cancel() or sdev_cancel() + * are executing, other parts of VFS do not perceive this process as blocked + * on something. */ fp->fp_blocked_on = FP_BLOCKED_ON_NONE; @@ -526,6 +543,11 @@ void unpause(void) fp->fp_cdev.grant); break; + + case FP_BLOCKED_ON_SDEV: /* process blocked on socket I/O */ + sdev_cancel(); + return; /* sdev_cancel() sends its own reply */ + default : panic("VFS: unknown block reason: %d", blocked_on); } diff --git a/minix/servers/vfs/proto.h b/minix/servers/vfs/proto.h index a927b7338..fada70710 100644 --- a/minix/servers/vfs/proto.h +++ b/minix/servers/vfs/proto.h @@ -9,6 +9,7 @@ #include "request.h" #include "threads.h" #include "tll.h" +#include "type.h" /* Structs used in prototypes must be declared as such first. 
*/ struct filp; @@ -58,7 +59,7 @@ int do_mapdriver(void); void init_dmap(void); int dmap_driver_match(endpoint_t proc, devmajor_t major); void dmap_endpt_up(endpoint_t proc_nr, int is_blk); -struct dmap *get_dmap(endpoint_t proc_e); +struct dmap *get_dmap_by_endpt(endpoint_t proc_e); struct dmap *get_dmap_by_major(devmajor_t major); void dmap_unmap_by_endpt(endpoint_t proc_nr); int map_service(struct rprocpub *rpub); @@ -75,6 +76,8 @@ void check_filp_locks(void); void check_filp_locks_by_me(void); void init_filps(void); struct filp *find_filp(struct vnode *vp, mode_t bits); +struct filp *find_filp_by_sock_dev(dev_t dev); +int check_fds(struct fproc *rfp, int nfds); int get_fd(struct fproc *rfp, int start, mode_t bits, int *k, struct filp **fpt); struct filp *get_filp(int fild, tll_access_t locktype); @@ -85,6 +88,7 @@ void unlock_filps(struct filp *filp1, struct filp *filp2); void invalidate_filp(struct filp *); void invalidate_filp_by_endpt(endpoint_t proc_e); void invalidate_filp_by_char_major(devmajor_t major); +void invalidate_filp_by_sock_drv(unsigned int num); void close_filp(struct filp *fp); int do_copyfd(void); @@ -108,6 +112,7 @@ void lock_revive(void); int main(void); void lock_proc(struct fproc *rfp); void unlock_proc(struct fproc *rfp); +void reply(message *m_out, endpoint_t whom, int result); void replycode(endpoint_t whom, int result); void service_pm_postponed(void); void thread_cleanup(void); @@ -254,6 +259,45 @@ int req_utime(endpoint_t fs_e, ino_t inode_nr, struct timespec * actv, struct timespec * modtv); int req_newdriver(endpoint_t fs_e, dev_t dev, char *label); +/* sdev.c */ +int sdev_socket(int domain, int type, int protocol, dev_t *dev, int pair); +int sdev_bind(dev_t dev, vir_bytes addr, unsigned int addr_len, + int filp_flags); +int sdev_connect(dev_t dev, vir_bytes addr, unsigned int addr_len, + int filp_flags); +int sdev_listen(dev_t dev, int backlog); +int sdev_accept(dev_t dev, vir_bytes addr, unsigned int addr_len, + int filp_flags, 
int fd); +int sdev_readwrite(dev_t dev, vir_bytes data_buf, size_t data_len, + vir_bytes ctl_buf, unsigned int ctl_len, vir_bytes addr_buf, + unsigned int addr_len, int flags, int rw_flag, int filp_flags, + vir_bytes user_buf); +int sdev_ioctl(dev_t dev, unsigned long request, vir_bytes buf, + int filp_flags); +int sdev_setsockopt(dev_t dev, int level, int name, vir_bytes addr, + unsigned int len); +int sdev_getsockopt(dev_t dev, int level, int name, vir_bytes addr, + unsigned int *len); +int sdev_getsockname(dev_t dev, vir_bytes addr, unsigned int *addr_len); +int sdev_getpeername(dev_t dev, vir_bytes addr, unsigned int *addr_len); +int sdev_shutdown(dev_t dev, int how); +int sdev_close(dev_t dev); +int sdev_select(dev_t dev, int ops); +void sdev_stop(struct fproc *rfp); +void sdev_cancel(void); +void sdev_reply(void); + +/* smap.c */ +void init_smap(void); +int smap_map(const char *label, endpoint_t endpt, const int *domains, + unsigned int ndomains); +void smap_unmap_by_endpt(endpoint_t endpt); +void smap_endpt_up(endpoint_t endpt); +dev_t make_smap_dev(struct smap *sp, sockid_t sockid); +struct smap *get_smap_by_endpt(endpoint_t endpt); +struct smap *get_smap_by_domain(int domain); +struct smap *get_smap_by_dev(dev_t dev, sockid_t * sockidp); + /* socket.c */ int do_socket(void); int do_socketpair(void); @@ -356,8 +400,10 @@ int do_select(void); void init_select(void); void select_callback(struct filp *, int ops); void select_forget(void); -void select_reply1(endpoint_t driver_e, devminor_t minor, int status); -void select_reply2(endpoint_t driver_e, devminor_t minor, int status); +void select_cdev_reply1(endpoint_t driver_e, devminor_t minor, int status); +void select_cdev_reply2(endpoint_t driver_e, devminor_t minor, int status); +void select_sdev_reply1(dev_t dev, int status); +void select_sdev_reply2(dev_t dev, int status); void select_unsuspend_by_endpt(endpoint_t proc); void select_dump(void); diff --git a/minix/servers/vfs/read.c 
b/minix/servers/vfs/read.c index c28831e6e..cfc1c9ec8 100644 --- a/minix/servers/vfs/read.c +++ b/minix/servers/vfs/read.c @@ -199,6 +199,17 @@ int read_write(struct fproc *rfp, int rw_flag, int fd, struct filp *f, */ position += size; } + } else if (S_ISSOCK(vp->v_mode)) { + if (rw_flag == PEEKING) { + printf("VFS: read_write tries to peek on sock dev\n"); + return EINVAL; + } + + if (vp->v_sdev == NO_DEV) + panic("VFS: read_write tries to access sock dev NO_DEV"); + + r = sdev_readwrite(vp->v_sdev, buf, size, 0, 0, 0, 0, 0, rw_flag, + f->filp_flags, 0); } else if (S_ISBLK(vp->v_mode)) { /* Block special files. */ if (vp->v_sdev == NO_DEV) panic("VFS: read_write tries to access block dev NO_DEV"); diff --git a/minix/servers/vfs/sdev.c b/minix/servers/vfs/sdev.c new file mode 100644 index 000000000..7bc667292 --- /dev/null +++ b/minix/servers/vfs/sdev.c @@ -0,0 +1,1090 @@ +/* + * This file implements the lower socket layer of VFS: communication with + * socket drivers. Socket driver communication evolved out of character driver + * communication, and the two have many similarities between them. Most + * importantly, socket driver communication also has the distinction between + * short-lived and long-lived requests. + * + * Short-lived requests are expected to be replied to by the socket driver + * immediately in all cases. For such requests, VFS keeps the worker thread + * for the calling process alive until the reply arrives. In contrast, + * long-lived requests may block. For such requests, VFS suspends the calling + * process until a reply comes in, or until a signal interrupts the request. + * Both short-lived and long-lived requests may be aborted if VFS finds that + * the corresponding socket driver has died. Even though long-lived requests + * may be marked as nonblocking, nonblocking calls are still handled as + * long-lived in terms of VFS processing. 
+ * + * For an overview of the socket driver requests and replies, message layouts, + * and which requests are long-lived or short-lived (i.e. may suspend or not), + * please refer to the corresponding table in the libsockdriver source code. + * + * For most long-lived socket requests, the main VFS thread processes the reply + * from the socket driver. This typically consists of waking up the user + * process that originally issued the system call on the socket by simply + * relaying the call's result code. Some socket calls require a specific reply + * message and/or additional post-call actions; for those, resume_*() calls are + * made back into the upper socket layer. + * + * If a process is interrupted by a signal, any ongoing long-lived socket + * request must be canceled. This is done by sending a one-way cancel request + * to the socket driver, and waiting for it to reply to the original request. + * In this case, the reply will be processed from the worker thread that is + * handling the cancel operation. Canceling does not imply call failure: the + * cancellation may result in a partial I/O reply, and a successful reply may + * cross the cancel request. + * + * One main exception is the reply to an accept request. Once a connection has + * been accepted, a new socket has to be created for it. This requires actions + * that require the ability to block the current thread, and so, a worker + * thread is spawned for processing successful accept replies, unless the reply + * was received from a worker thread already (as may be the case if the accept + * request was being canceled). + * + * As a current shortcoming, close requests should be long-lived (in order to + * support SO_LINGER) but are modeled as short-lived in VFS. This is the + * result of implementation limitations that have to be resolved first; see the + * comments in sdev_close() and close_filp() for more information. 
+ */ + +#include "fs.h" +#include +#include + +/* + * Send a short-lived request message to the given socket driver, and suspend + * the current worker thread until a reply message has been received. On + * success, the function will return OK, and the reply message will be stored + * in the message structure pointed to by 'm_ptr'. The function may fail if + * the socket driver dies before sending a reply. In that case, the function + * will return a negative error code, and also store the same negative error + * code in the m_type field of the 'm_ptr' message structure. + */ +static int +sdev_sendrec(struct smap * sp, message * m_ptr) +{ + int r; + + /* Send the request to the driver. */ + if ((r = asynsend3(sp->smap_endpt, m_ptr, AMF_NOREPLY)) != OK) + panic("VFS: asynsend in sdev_sendrec failed: %d", r); + + /* Suspend this thread until we have received the response. */ + self->w_task = sp->smap_endpt; + self->w_drv_sendrec = m_ptr; + + worker_wait(); + + self->w_task = NONE; + assert(self->w_drv_sendrec == NULL); + + return (!IS_SDEV_RS(m_ptr->m_type)) ? m_ptr->m_type : OK; +} + +/* + * Suspend the current process for later completion of its system call. + */ +int +sdev_suspend(dev_t dev, cp_grant_id_t grant0, cp_grant_id_t grant1, + cp_grant_id_t grant2, int fd, vir_bytes buf) +{ + + fp->fp_sdev.dev = dev; + fp->fp_sdev.callnr = job_call_nr; + fp->fp_sdev.grant[0] = grant0; + fp->fp_sdev.grant[1] = grant1; + fp->fp_sdev.grant[2] = grant2; + + if (job_call_nr == VFS_ACCEPT) { + assert(fd != -1); + assert(buf == 0); + fp->fp_sdev.aux.fd = fd; + } else if (job_call_nr == VFS_RECVMSG) { + assert(fd == -1); + /* + * TODO: we are not yet consistent enough in dealing with + * mapped NULL pages to have an assert(buf != 0) here.. + */ + fp->fp_sdev.aux.buf = buf; + } else { + assert(fd == -1); + assert(buf == 0); + } + + suspend(FP_BLOCKED_ON_SDEV); + return SUSPEND; +} + +/* + * Create a socket or socket pair. 
Return OK on success, with the new socket + * device identifier(s) stored in the 'dev' array. Return an error code upon + * failure. + */ +int +sdev_socket(int domain, int type, int protocol, dev_t * dev, int pair) +{ + struct smap *sp; + message m; + sockid_t sock_id, sock_id2; + int r; + + /* We could return EAFNOSUPPORT, but the caller should have checked. */ + if ((sp = get_smap_by_domain(domain)) == NULL) + panic("VFS: sdev_socket for unknown domain"); + + /* Prepare the request message. */ + memset(&m, 0, sizeof(m)); + m.m_type = pair ? SDEV_SOCKETPAIR : SDEV_SOCKET; + m.m_vfs_lsockdriver_socket.req_id = (sockid_t)who_e; + m.m_vfs_lsockdriver_socket.domain = domain; + m.m_vfs_lsockdriver_socket.type = type; + m.m_vfs_lsockdriver_socket.protocol = protocol; + m.m_vfs_lsockdriver_socket.user_endpt = who_e; + + /* Send the request, and wait for the reply. */ + if ((r = sdev_sendrec(sp, &m)) != OK) + return r; /* socket driver died */ + + /* Parse the reply message, and check for protocol errors. */ + if (m.m_type != SDEV_SOCKET_REPLY) { + printf("VFS: %d sent bad reply type %d for call %d\n", + sp->smap_endpt, m.m_type, job_call_nr); + return EIO; + } + + sock_id = m.m_lsockdriver_vfs_socket_reply.sock_id; + sock_id2 = m.m_lsockdriver_vfs_socket_reply.sock_id2; + + /* Check for regular errors. Upon success, return the socket(s). */ + if (sock_id < 0) + return sock_id; + + dev[0] = make_smap_dev(sp, sock_id); + + if (pair) { + /* Okay, one more protocol error. */ + if (sock_id2 < 0) { + printf("VFS: %d sent bad SOCKETPAIR socket ID %d\n", + sp->smap_endpt, sock_id2); + (void)sdev_close(dev[0]); + return EIO; + } + + dev[1] = make_smap_dev(sp, sock_id2); + } + + return OK; +} + +/* + * Bind or connect a socket to a particular address. These calls may block, so + * suspend the current process instead of making the thread wait for the reply. 
+ */ +static int +sdev_bindconn(dev_t dev, int type, vir_bytes addr, unsigned int addr_len, + int filp_flags) +{ + struct smap *sp; + sockid_t sock_id; + cp_grant_id_t grant; + message m; + int r; + + if ((sp = get_smap_by_dev(dev, &sock_id)) == NULL) + return EIO; + + /* Allocate resources. */ + grant = cpf_grant_magic(sp->smap_endpt, who_e, addr, addr_len, + CPF_READ); + if (!GRANT_VALID(grant)) + panic("VFS: cpf_grant_magic failed"); + + /* Prepare the request message. */ + memset(&m, 0, sizeof(m)); + m.m_type = type; + m.m_vfs_lsockdriver_addr.req_id = (sockid_t)who_e; + m.m_vfs_lsockdriver_addr.sock_id = sock_id; + m.m_vfs_lsockdriver_addr.grant = grant; + m.m_vfs_lsockdriver_addr.len = addr_len; + m.m_vfs_lsockdriver_addr.user_endpt = who_e; + m.m_vfs_lsockdriver_addr.sflags = + (filp_flags & O_NONBLOCK) ? SDEV_NONBLOCK : 0; + + /* Send the request to the driver. */ + if ((r = asynsend3(sp->smap_endpt, &m, AMF_NOREPLY)) != OK) + panic("VFS: asynsend in sdev_bindconn failed: %d", r); + + /* Suspend the process until the reply arrives. */ + return sdev_suspend(dev, grant, GRANT_INVALID, GRANT_INVALID, -1, 0); +} + +/* + * Bind a socket to a local address. + */ +int +sdev_bind(dev_t dev, vir_bytes addr, unsigned int addr_len, int filp_flags) +{ + + return sdev_bindconn(dev, SDEV_BIND, addr, addr_len, filp_flags); +} + +/* + * Connect a socket to a remote address. + */ +int +sdev_connect(dev_t dev, vir_bytes addr, unsigned int addr_len, int filp_flags) +{ + + return sdev_bindconn(dev, SDEV_CONNECT, addr, addr_len, filp_flags); +} + +/* + * Send and receive a "simple" request: listen, shutdown, or close. Note that + * while cancel requests use the same request format, they require a different + * way of handling their replies. 
+ */ +static int +sdev_simple(dev_t dev, int type, int param) +{ + struct smap *sp; + sockid_t sock_id; + message m; + int r; + + assert(type == SDEV_LISTEN || type == SDEV_SHUTDOWN || + type == SDEV_CLOSE); + + if ((sp = get_smap_by_dev(dev, &sock_id)) == NULL) + return EIO; + + /* Prepare the request message. */ + memset(&m, 0, sizeof(m)); + m.m_type = type; + m.m_vfs_lsockdriver_simple.req_id = (sockid_t)who_e; + m.m_vfs_lsockdriver_simple.sock_id = sock_id; + m.m_vfs_lsockdriver_simple.param = param; + + /* Send the request, and wait for the reply. */ + if ((r = sdev_sendrec(sp, &m)) != OK) + return r; /* socket driver died */ + + /* Parse and return the reply. */ + if (m.m_type != SDEV_REPLY) { + printf("VFS: %d sent bad reply type %d for call %d\n", + sp->smap_endpt, m.m_type, job_call_nr); + return EIO; + } + + return m.m_lsockdriver_vfs_reply.status; +} + +/* + * Put a socket in listening mode. + */ +int +sdev_listen(dev_t dev, int backlog) +{ + + assert(backlog >= 0); + + return sdev_simple(dev, SDEV_LISTEN, backlog); +} + +/* + * Accept a new connection on a socket. + */ +int +sdev_accept(dev_t dev, vir_bytes addr, unsigned int addr_len, int filp_flags, + int listen_fd) +{ + struct smap *sp; + sockid_t sock_id; + cp_grant_id_t grant; + message m; + int r; + + if ((sp = get_smap_by_dev(dev, &sock_id)) == NULL) + return EIO; + + /* Allocate resources. */ + if (addr != 0) { + grant = cpf_grant_magic(sp->smap_endpt, who_e, addr, addr_len, + CPF_WRITE); + if (!GRANT_VALID(grant)) + panic("VFS: cpf_grant_magic failed"); + } else + grant = GRANT_INVALID; + + /* Prepare the request message. */ + memset(&m, 0, sizeof(m)); + m.m_type = SDEV_ACCEPT; + m.m_vfs_lsockdriver_addr.req_id = (sockid_t)who_e; + m.m_vfs_lsockdriver_addr.sock_id = sock_id; + m.m_vfs_lsockdriver_addr.grant = grant; + m.m_vfs_lsockdriver_addr.len = addr_len; + m.m_vfs_lsockdriver_addr.user_endpt = who_e; + m.m_vfs_lsockdriver_addr.sflags = + (filp_flags & O_NONBLOCK) ? 
SDEV_NONBLOCK : 0; + + /* Send the request to the driver. */ + if ((r = asynsend3(sp->smap_endpt, &m, AMF_NOREPLY)) != OK) + panic("VFS: asynsend in sdev_accept failed: %d", r); + + /* Suspend the process until the reply arrives. */ + return sdev_suspend(dev, grant, GRANT_INVALID, GRANT_INVALID, + listen_fd, 0); +} + +/* + * Send or receive a message on a socket. All read (read(2), recvfrom(2), and + * recvmsg(2)) and write (write(2), sendto(2), sendmsg(2)) system calls on + * sockets pass through this function. The function is named sdev_readwrite + * rather than sdev_sendrecv to avoid confusion with sdev_sendrec. + */ +int +sdev_readwrite(dev_t dev, vir_bytes data_buf, size_t data_len, + vir_bytes ctl_buf, unsigned int ctl_len, vir_bytes addr_buf, + unsigned int addr_len, int flags, int rw_flag, int filp_flags, + vir_bytes user_buf) +{ + struct smap *sp; + sockid_t sock_id; + cp_grant_id_t data_grant, ctl_grant, addr_grant; + message m; + int r, bits; + + if ((sp = get_smap_by_dev(dev, &sock_id)) == NULL) + return EIO; + + /* Allocate resources. */ + data_grant = GRANT_INVALID; + ctl_grant = GRANT_INVALID; + addr_grant = GRANT_INVALID; + bits = (rw_flag == WRITING) ? CPF_READ : CPF_WRITE; + + /* + * Supposedly it is allowed to send or receive zero data bytes, even + * though it is a bad idea as the return value will then be zero, which + * may also indicate EOF (as per W. Richard Stevens). 
+ */ + if (data_buf != 0) { + data_grant = cpf_grant_magic(sp->smap_endpt, who_e, data_buf, + data_len, bits); + if (!GRANT_VALID(data_grant)) + panic("VFS: cpf_grant_magic failed"); + } + + if (ctl_buf != 0) { + ctl_grant = cpf_grant_magic(sp->smap_endpt, who_e, ctl_buf, + ctl_len, bits); + if (!GRANT_VALID(ctl_grant)) + panic("VFS: cpf_grant_magic failed"); + } + + if (addr_buf != 0) { + addr_grant = cpf_grant_magic(sp->smap_endpt, who_e, addr_buf, + addr_len, bits); + if (!GRANT_VALID(addr_grant)) + panic("VFS: cpf_grant_magic failed"); + } + + /* Prepare the request message. */ + memset(&m, 0, sizeof(m)); + m.m_type = (rw_flag == WRITING) ? SDEV_SEND : SDEV_RECV; + m.m_vfs_lsockdriver_sendrecv.req_id = (sockid_t)who_e; + m.m_vfs_lsockdriver_sendrecv.sock_id = sock_id; + m.m_vfs_lsockdriver_sendrecv.data_grant = data_grant; + m.m_vfs_lsockdriver_sendrecv.data_len = data_len; + m.m_vfs_lsockdriver_sendrecv.ctl_grant = ctl_grant; + m.m_vfs_lsockdriver_sendrecv.ctl_len = ctl_len; + m.m_vfs_lsockdriver_sendrecv.addr_grant = addr_grant; + m.m_vfs_lsockdriver_sendrecv.addr_len = addr_len; + m.m_vfs_lsockdriver_sendrecv.user_endpt = who_e; + m.m_vfs_lsockdriver_sendrecv.flags = flags; + if (filp_flags & O_NONBLOCK) + m.m_vfs_lsockdriver_sendrecv.flags |= MSG_DONTWAIT; + if (rw_flag == WRITING && (filp_flags & O_NOSIGPIPE)) + m.m_vfs_lsockdriver_sendrecv.flags |= MSG_NOSIGNAL; + + /* Send the request to the driver. */ + if ((r = asynsend3(sp->smap_endpt, &m, AMF_NOREPLY)) != OK) + panic("VFS: asynsend in sdev_readwrite failed: %d", r); + + /* Suspend the process until the reply arrives. */ + return sdev_suspend(dev, data_grant, ctl_grant, addr_grant, -1, + user_buf); +} + +/* + * Perform I/O control. + */ +int +sdev_ioctl(dev_t dev, unsigned long request, vir_bytes buf, int filp_flags) +{ + struct smap *sp; + sockid_t sock_id; + cp_grant_id_t grant; + message m; + int r; + + if ((sp = get_smap_by_dev(dev, &sock_id)) == NULL) + return EIO; + + /* Allocate resources. 
*/ + grant = make_ioctl_grant(sp->smap_endpt, who_e, buf, request); + + /* Prepare the request message. */ + memset(&m, 0, sizeof(m)); + m.m_type = SDEV_IOCTL; + m.m_vfs_lsockdriver_ioctl.req_id = (sockid_t)who_e; + m.m_vfs_lsockdriver_ioctl.sock_id = sock_id; + m.m_vfs_lsockdriver_ioctl.request = request; + m.m_vfs_lsockdriver_ioctl.grant = grant; + m.m_vfs_lsockdriver_ioctl.user_endpt = who_e; + m.m_vfs_lsockdriver_ioctl.sflags = + (filp_flags & O_NONBLOCK) ? SDEV_NONBLOCK : 0; + + /* Send the request to the driver. */ + if ((r = asynsend3(sp->smap_endpt, &m, AMF_NOREPLY)) != OK) + panic("VFS: asynsend in sdev_ioctl failed: %d", r); + + /* Suspend the process until the reply arrives. */ + return sdev_suspend(dev, grant, GRANT_INVALID, GRANT_INVALID, -1, 0); +} + +/* + * Set socket options. + */ +int +sdev_setsockopt(dev_t dev, int level, int name, vir_bytes addr, + unsigned int len) +{ + struct smap *sp; + sockid_t sock_id; + cp_grant_id_t grant; + message m; + int r; + + if ((sp = get_smap_by_dev(dev, &sock_id)) == NULL) + return EIO; + + /* Allocate resources. */ + grant = cpf_grant_magic(sp->smap_endpt, who_e, addr, len, CPF_READ); + if (!GRANT_VALID(grant)) + panic("VFS: cpf_grant_magic failed"); + + /* Prepare the request message. */ + memset(&m, 0, sizeof(m)); + m.m_type = SDEV_SETSOCKOPT; + m.m_vfs_lsockdriver_getset.req_id = (sockid_t)who_e; + m.m_vfs_lsockdriver_getset.sock_id = sock_id; + m.m_vfs_lsockdriver_getset.level = level; + m.m_vfs_lsockdriver_getset.name = name; + m.m_vfs_lsockdriver_getset.grant = grant; + m.m_vfs_lsockdriver_getset.len = len; + + /* Send the request, and wait for the reply. */ + r = sdev_sendrec(sp, &m); + + /* Free resources. */ + (void)cpf_revoke(grant); + + if (r != OK) + return r; /* socket driver died */ + + /* Parse and return the reply. 
*/ + if (m.m_type != SDEV_REPLY) { + printf("VFS: %d sent bad reply type %d for call %d\n", + sp->smap_endpt, m.m_type, job_call_nr); + return EIO; + } + + return m.m_lsockdriver_vfs_reply.status; +} + +/* + * Send and receive a "get" request: getsockopt, getsockname, or getpeername. + */ +static int +sdev_get(dev_t dev, int type, int level, int name, vir_bytes addr, + unsigned int * len) +{ + struct smap *sp; + sockid_t sock_id; + cp_grant_id_t grant; + message m; + int r; + + assert(type == SDEV_GETSOCKOPT || type == SDEV_GETSOCKNAME || + type == SDEV_GETPEERNAME); + + if ((sp = get_smap_by_dev(dev, &sock_id)) == NULL) + return EIO; + + /* Allocate resources. */ + grant = cpf_grant_magic(sp->smap_endpt, who_e, addr, *len, CPF_WRITE); + if (!GRANT_VALID(grant)) + panic("VFS: cpf_grant_magic failed"); + + /* Prepare the request message. */ + memset(&m, 0, sizeof(m)); + m.m_type = type; + m.m_vfs_lsockdriver_getset.req_id = (sockid_t)who_e; + m.m_vfs_lsockdriver_getset.sock_id = sock_id; + m.m_vfs_lsockdriver_getset.level = level; + m.m_vfs_lsockdriver_getset.name = name; + m.m_vfs_lsockdriver_getset.grant = grant; + m.m_vfs_lsockdriver_getset.len = *len; + + /* Send the request, and wait for the reply. */ + r = sdev_sendrec(sp, &m); + + /* Free resources. */ + (void)cpf_revoke(grant); + + if (r != OK) + return r; /* socket driver died */ + + /* Parse and return the reply. */ + if (m.m_type != SDEV_REPLY) { + printf("VFS: %d sent bad reply type %d for call %d\n", + sp->smap_endpt, m.m_type, job_call_nr); + return EIO; + } + + if ((r = m.m_lsockdriver_vfs_reply.status) < 0) + return r; + + *len = (unsigned int)r; + return OK; +} + +/* + * Get socket options. + */ +int +sdev_getsockopt(dev_t dev, int level, int name, vir_bytes addr, + unsigned int * len) +{ + + return sdev_get(dev, SDEV_GETSOCKOPT, level, name, addr, len); +} + +/* + * Get the local address of a socket. 
+ */ +int +sdev_getsockname(dev_t dev, vir_bytes addr, unsigned int * addr_len) +{ + + return sdev_get(dev, SDEV_GETSOCKNAME, 0, 0, addr, addr_len); +} + +/* + * Get the remote address of a socket. + */ +int +sdev_getpeername(dev_t dev, vir_bytes addr, unsigned int * addr_len) +{ + + return sdev_get(dev, SDEV_GETPEERNAME, 0, 0, addr, addr_len); +} + +/* + * Shut down socket send and receive operations. + */ +int +sdev_shutdown(dev_t dev, int how) +{ + + assert(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR); + + return sdev_simple(dev, SDEV_SHUTDOWN, how); +} + +/* + * Close the socket identified by the given socket device number. + */ +int +sdev_close(dev_t dev) +{ + + /* + * TODO: for now, we generate only nonblocking requests, because VFS as + * a whole does not yet support blocking close operations. See also + * the comment in close_filp(). All callers of sdev_close() currently + * ignore the return value, so socket drivers can already implement + * support for blocking close requests if they want, although at some + * later point we may have to introduce an additional SDEV_ flag to + * indicate whether the close call should be interrupted (keeping the + * socket open and returning EINTR, for dup2(2)) or continue in the + * background (closing the socket later and returning EINPROGRESS, for + * close(2)). + */ + return sdev_simple(dev, SDEV_CLOSE, SDEV_NONBLOCK); +} + +/* + * Initiate a select call on a socket device. Return OK iff the request was + * sent, without suspending the process. + */ +int +sdev_select(dev_t dev, int ops) +{ + struct smap *sp; + sockid_t sock_id; + message m; + int r; + + if ((sp = get_smap_by_dev(dev, &sock_id)) == NULL) + return EIO; + + /* Prepare the request message. */ + memset(&m, 0, sizeof(m)); + m.m_type = SDEV_SELECT; + m.m_vfs_lsockdriver_select.sock_id = sock_id; + m.m_vfs_lsockdriver_select.ops = ops; + + /* Send the request to the driver. 
*/ + if ((r = asynsend3(sp->smap_endpt, &m, AMF_NOREPLY)) != OK) + panic("VFS: asynsend in sdev_select failed: %d", r); + + return OK; +} + +/* + * A reply has arrived for a previous socket accept request, and the reply + * indicates that a socket has been accepted. A status is also returned; + * usually, this status is OK, but if not, the newly accepted socket must be + * closed immediately again. Process the low-level aspects of the reply, and + * call resume_accept() to let the upper socket layer handle the rest. This + * function is always called from a worker thread, and may thus block. + */ +static void +sdev_finish_accept(struct fproc * rfp, message * m_ptr) +{ + struct smap *sp; + sockid_t sock_id; + dev_t dev; + unsigned int len; + int status; + + assert(rfp->fp_sdev.callnr == VFS_ACCEPT); + assert(m_ptr->m_type == SDEV_ACCEPT_REPLY); + assert(m_ptr->m_lsockdriver_vfs_accept_reply.sock_id >= 0); + + /* Free resources. Accept requests use up to one grant. */ + if (GRANT_VALID(rfp->fp_sdev.grant[0])) + cpf_revoke(rfp->fp_sdev.grant[0]); + assert(!GRANT_VALID(rfp->fp_sdev.grant[1])); + assert(!GRANT_VALID(rfp->fp_sdev.grant[2])); + + sock_id = m_ptr->m_lsockdriver_vfs_accept_reply.sock_id; + status = m_ptr->m_lsockdriver_vfs_accept_reply.status; + len = m_ptr->m_lsockdriver_vfs_accept_reply.len; + + /* + * We do not want the upper socket layer (socket.c) to deal with smap + * and socket ID details, so we construct the new socket device number + * here. We won't use the saved listen FD to determine the smap entry + * here, since that involves file pointers and other upper-layer-only + * stuff. So we have to look it up by the source endpoint. As a + * result, we detect some driver deaths here (but not all: see below). + */ + if ((sp = get_smap_by_endpt(m_ptr->m_source)) != NULL) { + /* Leave 'status' as is, regardless of whether it is OK. 
*/ + dev = make_smap_dev(sp, sock_id); + } else { + /* + * The driver must have died while the thread was blocked on + * activation. Extremely rare, but theoretically possible. + * Some driver deaths are indicated only by a driver-up + * announcement though; resume_accept() will detect this by + * checking that the listening socket has not been invalidated. + */ + status = EIO; + dev = NO_DEV; + } + + /* Let the upper socket layer handle the rest. */ + resume_accept(rfp, status, dev, len, rfp->fp_sdev.aux.fd); +} + +/* + * Worker thread stub for finishing successful accept requests. + */ +static void +do_accept_reply(void) +{ + + sdev_finish_accept(fp, &job_m_in); +} + +/* + * With the exception of successful accept requests, this function is called + * whenever a reply is received for a socket driver request for which the + * corresponding user process was suspended (as opposed to requests which just + * suspend the worker thread), i.e., for long-lasting socket calls. This + * function is also called if the socket driver has died during a long-lasting + * socket call, in which case the given message's m_type is a negative error + * code. + * + * The division between the upper socket layer (socket.c) and the lower socket + * layer (this file) here is roughly: if resuming the system call involves no + * more than a simple replycode() call, do that here; otherwise call into the + * upper socket layer to handle the details. In any case, do not ever let the + * upper socket layer deal with reply message parsing or suspension state. + * + * This function may or may not be called from a worker thread; as such, it + * MUST NOT block its calling thread. This function is called for failed + * accept requests; successful accept requests have their replies routed + * through sdev_finish_accept() instead, because those require a worker thread. 
+ */ +static void +sdev_finish(struct fproc * rfp, message * m_ptr) +{ + unsigned int ctl_len, addr_len; + int callnr, status, flags; + + /* The suspension status must just have been cleared by the caller. */ + assert(rfp->fp_blocked_on == FP_BLOCKED_ON_NONE); + + /* + * Free resources. Every suspending call sets all grant fields, so we + * can safely revoke all of them without testing the original call. + */ + if (GRANT_VALID(rfp->fp_sdev.grant[0])) + cpf_revoke(rfp->fp_sdev.grant[0]); + if (GRANT_VALID(rfp->fp_sdev.grant[1])) + cpf_revoke(rfp->fp_sdev.grant[1]); + if (GRANT_VALID(rfp->fp_sdev.grant[2])) + cpf_revoke(rfp->fp_sdev.grant[2]); + + /* + * Now that the socket driver call has finished (or been stopped due to + * driver death), we need to finish the corresponding system call from + * the user process. The action to take depends on the system call. + */ + callnr = rfp->fp_sdev.callnr; + + switch (callnr) { + case VFS_BIND: + case VFS_CONNECT: + case VFS_WRITE: + case VFS_SENDTO: + case VFS_SENDMSG: + case VFS_IOCTL: + /* + * These calls all use the same SDEV_REPLY reply type and only + * need to reply an OK-or-error status code back to userland. + */ + if (m_ptr->m_type == SDEV_REPLY) { + status = m_ptr->m_lsockdriver_vfs_reply.status; + } else if (m_ptr->m_type < 0) { + status = m_ptr->m_type; + } else { + printf("VFS: %d sent bad reply type %d for call %d\n", + m_ptr->m_source, m_ptr->m_type, callnr); + status = EIO; + } + replycode(rfp->fp_endpoint, status); + break; + + case VFS_READ: + case VFS_RECVFROM: + case VFS_RECVMSG: + /* + * These calls use SDEV_RECV_REPLY. The action to take depends + * on the exact call. 
+ */ + ctl_len = addr_len = 0; + flags = 0; + if (m_ptr->m_type == SDEV_RECV_REPLY) { + status = m_ptr->m_lsockdriver_vfs_recv_reply.status; + ctl_len = m_ptr->m_lsockdriver_vfs_recv_reply.ctl_len; + addr_len = + m_ptr->m_lsockdriver_vfs_recv_reply.addr_len; + flags = m_ptr->m_lsockdriver_vfs_recv_reply.flags; + } else if (m_ptr->m_type < 0) { + status = m_ptr->m_type; + } else { + printf("VFS: %d sent bad reply type %d for call %d\n", + m_ptr->m_source, m_ptr->m_type, callnr); + status = EIO; + } + + switch (callnr) { + case VFS_READ: + replycode(rfp->fp_endpoint, status); + break; + case VFS_RECVFROM: + resume_recvfrom(rfp, status, addr_len); + break; + case VFS_RECVMSG: + resume_recvmsg(rfp, status, ctl_len, addr_len, flags, + rfp->fp_sdev.aux.buf); + break; + } + break; + + case VFS_ACCEPT: + /* + * This call uses SDEV_ACCEPT_REPLY. We only get here if the + * accept call has failed without creating a new socket, in + * which case we can simply call replycode() with the error. + * For nothing other than consistency, we let resume_accept() + * handle this case too. + */ + addr_len = 0; + if (m_ptr->m_type == SDEV_ACCEPT_REPLY) { + assert(m_ptr->m_lsockdriver_vfs_accept_reply.sock_id < + 0); + status = m_ptr->m_lsockdriver_vfs_accept_reply.status; + addr_len = m_ptr->m_lsockdriver_vfs_accept_reply.len; + } else if (m_ptr->m_type < 0) { + status = m_ptr->m_type; + } else { + printf("VFS: %d sent bad reply type %d for call %d\n", + m_ptr->m_source, m_ptr->m_type, callnr); + status = EIO; + } + /* + * Quick rundown of m_lsockdriver_vfs_accept_reply cases: + * + * - sock_id >= 0, status == OK: new socket accepted + * - sock_id >= 0, status != OK: new socket must be closed + * - sock_id < 0, status != OK: failure accepting socket + * - sock_id < 0, status == OK: invalid, covered right here + * + * See libsockdriver for why there are two reply fields at all. 
+ */ + if (status >= 0) { + printf("VFS: %d sent bad status %d for call %d\n", + m_ptr->m_source, status, callnr); + status = EIO; + } + resume_accept(rfp, status, NO_DEV, addr_len, + rfp->fp_sdev.aux.fd); + break; + + default: + /* + * Ultimately, enumerating all system calls that may cause + * socket I/O may prove too cumbersome. In that case, the + * callnr field could be replaced by a field that stores the + * combination of the expected reply type and the action to + * take, for example. + */ + panic("VFS: socket reply %d for unknown call %d from %d", + m_ptr->m_type, callnr, rfp->fp_endpoint); + } +} + +/* + * Abort the suspended socket call for the given process, because the + * corresponding socket driver has died. + */ +void +sdev_stop(struct fproc * rfp) +{ + message m; + + assert(rfp->fp_blocked_on == FP_BLOCKED_ON_SDEV); + + rfp->fp_blocked_on = FP_BLOCKED_ON_NONE; + + /* + * We use one single approach both here and when stopping worker + * threads: the reply message's m_type is set to an error code (always + * EIO for now) instead of an actual SDEV_ reply code. We test for + * this case in non-suspending calls as well as in sdev_finish(). + */ + m.m_type = EIO; + sdev_finish(rfp, &m); +} + +/* + * Cancel the ongoing long-lasting socket call, because the calling process has + * received a caught or terminating signal. This function is always called + * from a worker thread (as part of PM work), with 'fp' set to the process that + * issued the original system call. The calling function has just unsuspended + * the process out of _SDEV blocking state. The job of this function is to + * issue a cancel request and then block until a reply comes in; the reply may + * indicate success, in which case it must be handled accordingly. + */ +void +sdev_cancel(void) +{ + struct smap *sp; + message m; + sockid_t sock_id; + + /* The suspension status must just have been cleared by the caller.
*/ + assert(fp->fp_blocked_on == FP_BLOCKED_ON_NONE); + + if ((sp = get_smap_by_dev(fp->fp_sdev.dev, &sock_id)) != NULL) { + /* Prepare the request message. */ + memset(&m, 0, sizeof(m)); + m.m_type = SDEV_CANCEL; + m.m_vfs_lsockdriver_simple.req_id = (sockid_t)who_e; + m.m_vfs_lsockdriver_simple.sock_id = sock_id; + + /* + * Send the cancel request, and wait for a reply. The reply + * will be for the original request and must be processed + * accordingly. It is possible that the original request + * actually succeeded, because 1) the cancel request resulted + * in partial success or 2) the original reply and the cancel + * request crossed each other. It is because of the second + * case that a socket driver must not respond at all to a + * cancel operation for an unknown request. + */ + sdev_sendrec(sp, &m); + } else + m.m_type = EIO; + + /* + * Successful accept requests require special processing, but since we + * are already operating from a worker thread here, we need not spawn + * an additional worker thread for this case. + */ + if (m.m_type == SDEV_ACCEPT_REPLY && + m.m_lsockdriver_vfs_accept_reply.sock_id >= 0) + sdev_finish_accept(fp, &m); + else + sdev_finish(fp, &m); +} + +/* + * A socket driver has sent a reply to a socket request. Process it, by either + * waking up an active worker thread, finishing the system call from here, or + * (in the exceptional case of accept calls) spawning a new worker thread to + * process the reply. This function MUST NOT block its calling thread.
+ */ +void +sdev_reply(void) +{ + struct fproc *rfp; + struct smap *sp; + struct worker_thread *wp; + sockid_t req_id = -1; + dev_t dev; + int slot; + + if ((sp = get_smap_by_endpt(who_e)) == NULL) { + printf("VFS: ignoring sock dev reply from unknown driver %d\n", + who_e); + return; + } + + switch (call_nr) { + case SDEV_REPLY: + req_id = m_in.m_lsockdriver_vfs_reply.req_id; + break; + case SDEV_SOCKET_REPLY: + req_id = m_in.m_lsockdriver_vfs_socket_reply.req_id; + break; + case SDEV_ACCEPT_REPLY: + req_id = m_in.m_lsockdriver_vfs_accept_reply.req_id; + break; + case SDEV_RECV_REPLY: + req_id = m_in.m_lsockdriver_vfs_recv_reply.req_id; + break; + case SDEV_SELECT1_REPLY: + dev = make_smap_dev(sp, + m_in.m_lsockdriver_vfs_select_reply.sock_id); + select_sdev_reply1(dev, + m_in.m_lsockdriver_vfs_select_reply.status); + return; + case SDEV_SELECT2_REPLY: + dev = make_smap_dev(sp, + m_in.m_lsockdriver_vfs_select_reply.sock_id); + select_sdev_reply2(dev, + m_in.m_lsockdriver_vfs_select_reply.status); + return; + default: + printf("VFS: ignoring unknown sock dev reply %d from %d\n", + call_nr, who_e); + return; + } + + if (isokendpt((endpoint_t)req_id, &slot) != OK) { + printf("VFS: ignoring sock dev reply from %d for unknown %d\n", + who_e, req_id); + return; + } + + rfp = &fproc[slot]; + wp = rfp->fp_worker; + if (wp != NULL && wp->w_task == who_e && wp->w_drv_sendrec != NULL) { + assert(!fp_is_blocked(rfp)); + *wp->w_drv_sendrec = m_in; + wp->w_drv_sendrec = NULL; + worker_signal(wp); /* resume suspended thread */ + /* + * It is up to the worker thread to 1) check that the reply is + * of the right type for the request, and 2) keep in mind that + * the reply type may be EIO in case the socket driver died. 
+ */ + } else if (rfp->fp_blocked_on != FP_BLOCKED_ON_SDEV || + get_smap_by_dev(rfp->fp_sdev.dev, NULL) != sp) { + printf("VFS: ignoring sock dev reply, %d not blocked on %d\n", + rfp->fp_endpoint, who_e); + return; + } else if (call_nr == SDEV_ACCEPT_REPLY && + m_in.m_lsockdriver_vfs_accept_reply.sock_id >= 0) { + /* + * For accept replies that return a new socket, we need to + * spawn a worker thread, because accept calls may block (so + * there will no longer be a worker thread) and processing the + * reply requires additional blocking calls (which we cannot + * issue from the main thread). This is tricky. Under no + * circumstances may we "lose" a legitimate reply, because this + * would lead to resource leaks in the socket driver. To this + * end, we rely on the current worker thread model to + * prioritize regular work over PM work. Still, sdev_cancel() + * may end up receiving the accept reply if it was already + * blocked waiting for the reply message, and it must then + * perform the same tasks. + */ + /* + * It is possible that if all threads are in use, there is a + * "gap" between starting the thread and its activation. The + * main problem for this case is that the socket driver dies + * within that gap. For accepts, we address this with no less + * than two checks: 1) in this file, by looking up the smap + * entry by the reply source endpoint again - if the entry is + * no longer valid, the socket driver must have died; 2) in + * socket.c, by revalidating the original listening socket - if + * the listening socket has been invalidated, the driver died. + * + * Since we unsuspend the process now, a socket driver sending + * two accept replies in a row may never cause VFS to attempt + * spawning two threads; the second reply should be ignored. 
+ */ + assert(fp->fp_func == NULL); + + worker_start(rfp, do_accept_reply, &m_in, FALSE /*use_spare*/); + + /* + * TODO: I just introduced the notion of not using the fp_u + * union across yields after unsuspension, but for socket calls + * we have a lot of socket state to carry over, so I'm now + * immediately violating my own rule again here. Possible + * solutions: 1) introduce another blocking state just to mark + * the fp_u union in use (this has side effects though), 2) + * introduce a pseudo message type which covers both the accept + * reply fields and the fp_u state (do_pending_pipe does this), + * or 3) add a fp_flags flag for this purpose. In any case, + * the whole point is that we catch any attempts to reuse fp_u + * for other purposes and thus cause state corruption. This + * should not happen anyway, but it's too dangerous to leave + * entirely unchecked. --dcvmoole + */ + rfp->fp_blocked_on = FP_BLOCKED_ON_NONE; + } else { + rfp->fp_blocked_on = FP_BLOCKED_ON_NONE; + + sdev_finish(rfp, &m_in); + } +} diff --git a/minix/servers/vfs/select.c b/minix/servers/vfs/select.c index c8347d0bb..649fac424 100644 --- a/minix/servers/vfs/select.c +++ b/minix/servers/vfs/select.c @@ -60,11 +60,14 @@ static void ops2tab(int ops, int fd, struct selectentry *e); static int is_regular_file(struct filp *f); static int is_pipe(struct filp *f); static int is_char_device(struct filp *f); +static int is_sock_device(struct filp *f); static void select_lock_filp(struct filp *f, int ops); static int select_request_file(struct filp *f, int *ops, int block, struct fproc *rfp); static int select_request_char(struct filp *f, int *ops, int block, struct fproc *rfp); +static int select_request_sock(struct filp *f, int *ops, int block, + struct fproc *rfp); static int select_request_pipe(struct filp *f, int *ops, int block, struct fproc *rfp); static void select_cancel_all(struct selectentry *e); @@ -81,6 +84,7 @@ static struct fdtype { int (*type_match)(struct filp *f); } 
fdtypes[] = { { select_request_char, is_char_device }, + { select_request_sock, is_sock_device }, { select_request_file, is_regular_file }, { select_request_pipe, is_pipe }, }; @@ -198,8 +202,8 @@ int do_select(void) * other types of file is unspecified." * * In our case, terminal and pseudo-terminal devices are handled by the - * TTY major and sockets by either INET major (socket type AF_INET) or - * UDS major (socket type AF_UNIX). Additionally, we give other + * TTY and PTY character drivers respectively. Sockets are handled by + * their respective socket drivers. Additionally, we give other * character drivers the chance to handle select for any of their * device nodes. Some may not implement support for select and let * libchardriver return EBADF, which we then pass to the calling @@ -362,50 +366,27 @@ static int is_char_device(struct filp *f) } /*===========================================================================* - * select_request_char * + * is_sock_device * *===========================================================================*/ -static int select_request_char(struct filp *f, int *ops, int block, - struct fproc *rfp) +static int is_sock_device(struct filp *f) { -/* Check readiness status on a character device. Unless suitable results are - * available right now, this will only initiate the polling process, causing - * result processing to be deferred. This function MUST NOT block its calling - * thread. The given filp may or may not be locked. +/* See if this filp is a handle on a socket device. This function MUST NOT + * block its calling thread. The given filp may or may not be locked. */ - dev_t dev; - int r, rops; - struct dmap *dp; - /* Start by remapping the device node number to a "real" device number.
Those - * two are different only for CTTY_MAJOR aka /dev/tty, but that one single - * exception requires quite some extra effort here: the select code matches - * character driver replies to their requests based on the device number, so - * it needs to be aware that device numbers may be mapped. The idea is to - * perform the mapping once and store the result in the filp object, so that - * at least we don't run into problems when a process loses its controlling - * terminal while doing a select (see also free_proc). It should be noted - * that it is possible that multiple processes share the same /dev/tty filp, - * and they may not all have a controlling terminal. The ctty-less processes - * should never pass the mapping; a more problematic case is checked below. - * - * The cdev_map call also checks the major number for rough validity, so that - * we can use it to index the dmap array safely a bit later. - */ - if ((dev = cdev_map(f->filp_vno->v_sdev, rfp)) == NO_DEV) - return(ENXIO); + return (f && f->filp_vno && S_ISSOCK(f->filp_vno->v_mode)); +} - if (f->filp_char_select_dev != NO_DEV && f->filp_char_select_dev != dev) { - /* Currently, this case can occur as follows: a process with a - * controlling terminal opens /dev/tty and forks, the new child starts - * a new session, opens a new controlling terminal, and both parent and - * child call select on the /dev/tty file descriptor. If this case ever - * becomes real, a better solution may be to force-close a filp for - * /dev/tty when a new controlling terminal is opened. 
- */ - printf("VFS: file pointer has multiple controlling TTYs!\n"); - return(EIO); - } - f->filp_char_select_dev = dev; /* set before possibly suspending */ +/*===========================================================================* + * select_filter * + *===========================================================================*/ +static int select_filter(struct filp *f, int *ops, int block) +{ +/* Determine which select operations can be satisfied immediately and which + * should be requested. Used for character and socket devices. This function + * MUST NOT block its calling thread. + */ + int rops; rops = *ops; @@ -432,7 +413,7 @@ static int select_request_char(struct filp *f, int *ops, int block, if ((rops & SEL_ERR) && (f->filp_select_flags & FSF_ERR_BLOCK)) rops &= ~SEL_ERR; if (!(rops & (SEL_RD|SEL_WR|SEL_ERR))) - return(OK); + return(0); } f->filp_select_flags |= FSF_UPDATE; @@ -446,6 +427,58 @@ static int select_request_char(struct filp *f, int *ops, int block, if (f->filp_select_flags & FSF_BUSY) return(SUSPEND); + return rops; +} + +/*===========================================================================* + * select_request_char * + *===========================================================================*/ +static int select_request_char(struct filp *f, int *ops, int block, + struct fproc *rfp) +{ +/* Check readiness status on a character device. Unless suitable results are + * available right now, this will only initiate the polling process, causing + * result processing to be deferred. This function MUST NOT block its calling + * thread. The given filp may or may not be locked. + */ + dev_t dev; + int r, rops; + struct dmap *dp; + + /* Start by remapping the device node number to a "real" device number. 
Those + * two are different only for CTTY_MAJOR aka /dev/tty, but that one single + * exception requires quite some extra effort here: the select code matches + * character driver replies to their requests based on the device number, so + * it needs to be aware that device numbers may be mapped. The idea is to + * perform the mapping once and store the result in the filp object, so that + * at least we don't run into problems when a process loses its controlling + * terminal while doing a select (see also free_proc). It should be noted + * that it is possible that multiple processes share the same /dev/tty filp, + * and they may not all have a controlling terminal. The ctty-less processes + * should never pass the mapping; a more problematic case is checked below. + * + * The cdev_map call also checks the major number for rough validity, so that + * we can use it to index the dmap array safely a bit later. + */ + if ((dev = cdev_map(f->filp_vno->v_sdev, rfp)) == NO_DEV) + return(ENXIO); + + if (f->filp_select_dev != NO_DEV && f->filp_select_dev != dev) { + /* Currently, this case can occur as follows: a process with a + * controlling terminal opens /dev/tty and forks, the new child starts + * a new session, opens a new controlling terminal, and both parent and + * child call select on the /dev/tty file descriptor. If this case ever + * becomes real, a better solution may be to force-close a filp for + * /dev/tty when a new controlling terminal is opened. 
+ */ + printf("VFS: file pointer has multiple controlling TTYs!\n"); + return(EIO); + } + f->filp_select_dev = dev; /* set before possibly suspending */ + + if ((rops = select_filter(f, ops, block)) <= 0) + return(rops); /* OK or suspend: nothing to do for now */ + dp = &dmap[major(dev)]; if (dp->dmap_sel_busy) return(SUSPEND); @@ -462,6 +495,46 @@ static int select_request_char(struct filp *f, int *ops, int block, return(SUSPEND); } +/*===========================================================================* + * select_request_sock * + *===========================================================================*/ +static int select_request_sock(struct filp *f, int *ops, int block, + struct fproc *rfp __unused) +{ +/* Check readiness status on a socket device. Unless suitable results are + * available right now, this will only initiate the polling process, causing + * result processing to be deferred. This function MUST NOT block its calling + * thread. The given filp may or may not be locked. + */ + struct smap *sp; + dev_t dev; + int r, rops; + + dev = f->filp_vno->v_sdev; + + if ((sp = get_smap_by_dev(dev, NULL)) == NULL) + return(ENXIO); /* this should not happen */ + + f->filp_select_dev = dev; /* set before possibly suspending */ + + if ((rops = select_filter(f, ops, block)) <= 0) + return(rops); /* OK or suspend: nothing to do for now */ + + if (sp->smap_sel_busy) + return(SUSPEND); + + f->filp_select_flags &= ~FSF_UPDATE; + r = sdev_select(dev, rops); + if (r != OK) + return(r); + + sp->smap_sel_busy = TRUE; + sp->smap_sel_filp = f; + f->filp_select_flags |= FSF_BUSY; + + return(SUSPEND); +} + /*===========================================================================* * select_request_file * *===========================================================================*/ @@ -644,6 +717,7 @@ static void select_cancel_filp(struct filp *f) * its calling thread. 
*/ devmajor_t major; + struct smap *sp; assert(f); assert(f->filp_selectors > 0); @@ -657,16 +731,22 @@ static void select_cancel_filp(struct filp *f) f->filp_pipe_select_ops = 0; /* If this filp is the subject of an ongoing select query to a - * character device, mark the query as stale, so that this filp will - * not be checked when the result arrives. The filp select device may - * still be NO_DEV if do_select fails on the initial fd check. + * character or socket device, mark the query as stale, so that this + * filp will not be checked when the result arrives. The filp select + * device may still be NO_DEV if do_select fails on the initial fd + * check. */ - if (is_char_device(f) && f->filp_char_select_dev != NO_DEV) { - major = major(f->filp_char_select_dev); + if (is_char_device(f) && f->filp_select_dev != NO_DEV) { + major = major(f->filp_select_dev); if (dmap[major].dmap_sel_busy && dmap[major].dmap_sel_filp == f) dmap[major].dmap_sel_filp = NULL; /* leave _busy set */ - f->filp_char_select_dev = NO_DEV; + f->filp_select_dev = NO_DEV; + } else if (is_sock_device(f) && f->filp_select_dev != NO_DEV) { + if ((sp = get_smap_by_dev(f->filp_select_dev, NULL)) != NULL && + sp->smap_sel_busy && sp->smap_sel_filp == f) + sp->smap_sel_filp = NULL; /* leave _busy set */ + f->filp_select_dev = NO_DEV; } } } @@ -778,11 +858,19 @@ void select_timeout_check(int s) void select_unsuspend_by_endpt(endpoint_t proc_e) { /* Revive blocked processes when a driver has disappeared */ + struct dmap *dp; + struct smap *sp; devmajor_t major; - int fd, s; + int fd, s, is_driver; struct selectentry *se; struct filp *f; + /* Either or both of these may be NULL. 
*/ + dp = get_dmap_by_endpt(proc_e); + sp = get_smap_by_endpt(proc_e); + + is_driver = (dp != NULL || sp != NULL); + for (s = 0; s < MAXSELECTS; s++) { int wakehim = 0; se = &selecttab[s]; @@ -793,31 +881,102 @@ void select_unsuspend_by_endpt(endpoint_t proc_e) continue; } + /* Skip the more expensive "driver died" checks for non-drivers. */ + if (!is_driver) + continue; + for (fd = 0; fd < se->nfds; fd++) { - if ((f = se->filps[fd]) == NULL || !is_char_device(f)) + if ((f = se->filps[fd]) == NULL) continue; - - assert(f->filp_char_select_dev != NO_DEV); - major = major(f->filp_char_select_dev); - if (dmap_driver_match(proc_e, major)) { - se->filps[fd] = NULL; - se->error = EIO; - select_cancel_filp(f); - wakehim = 1; + if (is_char_device(f)) { + assert(f->filp_select_dev != NO_DEV); + major = major(f->filp_select_dev); + if (dmap_driver_match(proc_e, major)) { + se->filps[fd] = NULL; + se->error = EIO; + select_cancel_filp(f); + wakehim = 1; + } + } else if (sp != NULL && is_sock_device(f)) { + assert(f->filp_select_dev != NO_DEV); + if (get_smap_by_dev(f->filp_select_dev, NULL) == sp) { + se->filps[fd] = NULL; + se->error = EIO; + select_cancel_filp(f); + wakehim = 1; + } } } if (wakehim && !is_deferred(se)) select_return(se); } + + /* Any outstanding queries will never be answered, so forget about them. */ + if (dp != NULL) { + assert(dp->dmap_sel_filp == NULL); + dp->dmap_sel_busy = FALSE; + } + if (sp != NULL) { + assert(sp->smap_sel_filp == NULL); + sp->smap_sel_busy = FALSE; + } } /*===========================================================================* * select_reply1 * *===========================================================================*/ -void select_reply1(endpoint_t driver_e, devminor_t minor, int status) +static void select_reply1(struct filp *f, int status) +{ +/* Handle the initial reply to a character or socket select request. This + * function MUST NOT block its calling thread. 
+ */ + + assert(f->filp_count >= 1); + assert(f->filp_select_flags & FSF_BUSY); + + f->filp_select_flags &= ~FSF_BUSY; + + /* The select call is done now, except when + * - another process started a select on the same filp with possibly a + * different set of operations. + * - a process does a select on the same filp but using different file + * descriptors. + * - the select has a timeout. Upon receiving this reply the operations + * might not be ready yet, so we want to wait for that to ultimately + * happen. + * Therefore we need to keep remembering what the operations are. + */ + if (!(f->filp_select_flags & (FSF_UPDATE|FSF_BLOCKED))) + f->filp_select_ops = 0; /* done selecting */ + else if (status > 0 && !(f->filp_select_flags & FSF_UPDATE)) + /* there may be operations pending */ + f->filp_select_ops &= ~status; + + /* Record new filp status */ + if (!(status == 0 && (f->filp_select_flags & FSF_BLOCKED))) { + if (status > 0) { /* operations ready */ + if (status & SEL_RD) + f->filp_select_flags &= ~FSF_RD_BLOCK; + if (status & SEL_WR) + f->filp_select_flags &= ~FSF_WR_BLOCK; + if (status & SEL_ERR) + f->filp_select_flags &= ~FSF_ERR_BLOCK; + } else if (status < 0) { /* error */ + /* Always unblock upon error */ + f->filp_select_flags &= ~FSF_BLOCKED; + } + } + + filp_status(f, status); /* Tell filp owners about the results */ +} + +/*===========================================================================* + * select_cdev_reply1 * + *===========================================================================*/ +void select_cdev_reply1(endpoint_t driver_e, devminor_t minor, int status) { -/* Handle the initial reply to CDEV_SELECT request. This function MUST NOT +/* Handle the initial reply to a CDEV_SELECT request. This function MUST NOT * block its calling thread. 
*/ devmajor_t major; @@ -826,7 +985,7 @@ void select_reply1(endpoint_t driver_e, devminor_t minor, int status) struct dmap *dp; /* Figure out which device is replying */ - if ((dp = get_dmap(driver_e)) == NULL) return; + if ((dp = get_dmap_by_endpt(driver_e)) == NULL) return; major = dp-dmap; dev = makedev(major, minor); @@ -845,13 +1004,13 @@ void select_reply1(endpoint_t driver_e, devminor_t minor, int status) if ((f = dp->dmap_sel_filp) != NULL) { /* Find vnode and check we got a reply from the device we expected */ assert(is_char_device(f)); - assert(f->filp_char_select_dev != NO_DEV); - if (f->filp_char_select_dev != dev) { + assert(f->filp_select_dev != NO_DEV); + if (f->filp_select_dev != dev) { /* This should never happen. The driver may be misbehaving. * For now we assume that the reply we want will arrive later.. */ printf("VFS (%s:%d): expected reply from dev %llx not %llx\n", - __FILE__, __LINE__, f->filp_char_select_dev, dev); + __FILE__, __LINE__, f->filp_select_dev, dev); return; } } @@ -860,83 +1019,78 @@ void select_reply1(endpoint_t driver_e, devminor_t minor, int status) dp->dmap_sel_busy = FALSE; dp->dmap_sel_filp = NULL; - /* Process the select result only if the filp is valid. */ - if (f != NULL) { - assert(f->filp_count >= 1); - assert(f->filp_select_flags & FSF_BUSY); - - f->filp_select_flags &= ~FSF_BUSY; - - /* The select call is done now, except when - * - another process started a select on the same filp with possibly a - * different set of operations. - * - a process does a select on the same filp but using different file - * descriptors. - * - the select has a timeout. Upon receiving this reply the operations - * might not be ready yet, so we want to wait for that to ultimately - * happen. - * Therefore we need to keep remembering what the operations are. 
- */ - if (!(f->filp_select_flags & (FSF_UPDATE|FSF_BLOCKED))) - f->filp_select_ops = 0; /* done selecting */ - else if (status > 0 && !(f->filp_select_flags & FSF_UPDATE)) - /* there may be operations pending */ - f->filp_select_ops &= ~status; - - /* Record new filp status */ - if (!(status == 0 && (f->filp_select_flags & FSF_BLOCKED))) { - if (status > 0) { /* operations ready */ - if (status & SEL_RD) - f->filp_select_flags &= ~FSF_RD_BLOCK; - if (status & SEL_WR) - f->filp_select_flags &= ~FSF_WR_BLOCK; - if (status & SEL_ERR) - f->filp_select_flags &= ~FSF_ERR_BLOCK; - } else if (status < 0) { /* error */ - /* Always unblock upon error */ - f->filp_select_flags &= ~FSF_BLOCKED; - } - } - - filp_status(f, status); /* Tell filp owners about the results */ - } + /* Process the status change, if still applicable. */ + if (f != NULL) + select_reply1(f, status); + /* See if we should send a select request for another filp now. */ select_restart_filps(); } - /*===========================================================================* - * select_reply2 * + * select_sdev_reply1 * *===========================================================================*/ -void select_reply2(endpoint_t driver_e, devminor_t minor, int status) +void select_sdev_reply1(dev_t dev, int status) { -/* Handle secondary reply to DEV_SELECT request. A secondary reply occurs when - * the select request is 'blocking' until an operation becomes ready. This - * function MUST NOT block its calling thread. +/* Handle the initial reply to a SDEV_SELECT request. This function MUST NOT + * block its calling thread. 
*/ - int slot, found, fd; - devmajor_t major; - dev_t dev; + struct smap *sp; struct filp *f; - struct dmap *dp; - struct selectentry *se; - if (status == 0) { - printf("VFS (%s:%d): weird status (%d) to report\n", - __FILE__, __LINE__, status); + if ((sp = get_smap_by_dev(dev, NULL)) == NULL) return; - } - /* Figure out which device is replying */ - if ((dp = get_dmap(driver_e)) == NULL) { - printf("VFS (%s:%d): endpoint %d is not a known driver endpoint\n", - __FILE__, __LINE__, driver_e); + /* Get the file pointer for the socket device. */ + if (!sp->smap_sel_busy) { + printf("VFS: was not expecting a SDEV_SELECT reply from %d\n", + sp->smap_endpt); return; } - major = dp-dmap; - dev = makedev(major, minor); - /* Find all file descriptors selecting for this device */ + /* The select filp may have been set to NULL if the requestor has been + * unpaused in the meantime. In that case, we ignore the result, but we do + * look for other filps to restart later. + */ + if ((f = sp->smap_sel_filp) != NULL) { + /* Find vnode and check we got a reply from the device we expected */ + assert(is_sock_device(f)); + assert(f->filp_select_dev != NO_DEV); + if (f->filp_select_dev != dev) { + /* This should never happen. The driver may be misbehaving. + * For now we assume that the reply we want will arrive later.. + */ + printf("VFS: expected reply from sock dev %llx, not %llx\n", + f->filp_select_dev, dev); + return; + } + } + + /* We are no longer waiting for a reply from this socket driver. */ + sp->smap_sel_busy = FALSE; + sp->smap_sel_filp = NULL; + + /* Process the status change, if still applicable. */ + if (f != NULL) + select_reply1(f, status); + + /* See if we should send a select request for another filp now. 
*/ + select_restart_filps(); +} + +/*===========================================================================* + * select_reply2 * + *===========================================================================*/ +static void select_reply2(int is_char, dev_t dev, int status) +{ +/* Find all file descriptors selecting for the given character (is_char==TRUE) + * or socket (is_char==FALSE) device, update their statuses, and resume + * activities accordingly. + */ + int slot, found, fd; + struct filp *f; + struct selectentry *se; + for (slot = 0; slot < MAXSELECTS; slot++) { se = &selecttab[slot]; if (se->requestor == NULL) continue; /* empty slot */ @@ -944,9 +1098,10 @@ void select_reply2(endpoint_t driver_e, devminor_t minor, int status) found = FALSE; for (fd = 0; fd < se->nfds; fd++) { if ((f = se->filps[fd]) == NULL) continue; - if (!is_char_device(f)) continue; - assert(f->filp_char_select_dev != NO_DEV); - if (f->filp_char_select_dev != dev) continue; + if (is_char && !is_char_device(f)) continue; + if (!is_char && !is_sock_device(f)) continue; + assert(f->filp_select_dev != NO_DEV); + if (f->filp_select_dev != dev) continue; if (status > 0) { /* Operations ready */ /* Clear the replied bits from the request @@ -979,6 +1134,56 @@ void select_reply2(endpoint_t driver_e, devminor_t minor, int status) select_restart_filps(); } +/*===========================================================================* + * select_cdev_reply2 * + *===========================================================================*/ +void select_cdev_reply2(endpoint_t driver_e, devminor_t minor, int status) +{ +/* Handle a secondary reply to a CDEV_SELECT request. A secondary reply occurs + * when the select request is 'blocking' until an operation becomes ready. This + * function MUST NOT block its calling thread. 
+ */ + devmajor_t major; + struct dmap *dp; + dev_t dev; + + if (status == 0) { + printf("VFS (%s:%d): weird status (%d) to report\n", + __FILE__, __LINE__, status); + return; + } + + /* Figure out which device is replying */ + if ((dp = get_dmap_by_endpt(driver_e)) == NULL) { + printf("VFS (%s:%d): endpoint %d is not a known driver endpoint\n", + __FILE__, __LINE__, driver_e); + return; + } + major = dp-dmap; + dev = makedev(major, minor); + + select_reply2(TRUE /*is_char*/, dev, status); +} + +/*===========================================================================* + * select_sdev_reply2 * + *===========================================================================*/ +void select_sdev_reply2(dev_t dev, int status) +{ +/* Handle a secondary reply to a SDEV_SELECT request. A secondary reply occurs + * when the select request is 'blocking' until an operation becomes ready. This + * function MUST NOT block its calling thread. + */ + + if (status == 0) { + printf("VFS: weird socket device status (%d)\n", status); + + return; + } + + select_reply2(FALSE /*is_char*/, dev, status); +} + /*===========================================================================* * select_restart_filps * *===========================================================================*/ @@ -1013,14 +1218,19 @@ static void select_restart_filps(void) if (!(f->filp_select_flags & FSF_UPDATE)) /* Must be in */ continue; /* 'update' state */ - /* This function is suitable only for character devices. In - * particular, checking pipes the same way would introduce a - * serious locking problem. + /* This function is suitable only for character and socket + * devices. In particular, checking pipes the same way would + * introduce a serious locking problem. 
*/ - assert(is_char_device(f)); + assert(is_char_device(f) || is_sock_device(f)); wantops = ops = f->filp_select_ops; - r = select_request_char(f, &wantops, se->block, se->requestor); + if (is_char_device(f)) + r = select_request_char(f, &wantops, se->block, + se->requestor); + else + r = select_request_sock(f, &wantops, se->block, + se->requestor); if (r != OK && r != SUSPEND) { se->error = r; restart_proc(se); @@ -1122,7 +1332,9 @@ select_dump(void) struct selectentry *se; struct filp *f; struct dmap *dp; + struct smap *sp; dev_t dev; + sockid_t sockid; int s, fd; for (s = 0; s < MAXSELECTS; s++) { @@ -1158,6 +1370,18 @@ select_dump(void) dp->dmap_sel_filp); } else printf("unknown)\n"); + } else if (is_sock_device(f)) { + dev = f->filp_vno->v_sdev; + printf("sock (dev "); + sp = get_smap_by_dev(dev, &sockid); + if (sp != NULL) { + printf("<%d,%d>, smap busy %d filp " + "%p)\n", sp->smap_num, sockid, + sp->smap_sel_busy, + sp->smap_sel_filp); + } else + printf("<0x%"PRIx64">, smap " + "unknown)\n", dev); } else printf("unknown\n"); } diff --git a/minix/servers/vfs/smap.c b/minix/servers/vfs/smap.c new file mode 100644 index 000000000..0af6cbbe6 --- /dev/null +++ b/minix/servers/vfs/smap.c @@ -0,0 +1,273 @@ +/* + * This file contains the table with socket driver mappings. One socket driver + * may implement multiple domains (e.g., PF_INET and PF_INET6). For this + * reason, we assign a unique number to each socket driver, and use a "socket + * device map" table (smap) that maps from those numbers to information about + * socket drivers. This number is combined with a per-driver socket identifier + * to form a globally unique socket ID (64-bit, stored as dev_t). In addition, + * we use a table that maps from PF_xxx domains to socket drivers (pfmap). + */ + +#include "fs.h" +#include +#include + +static struct smap smap[NR_SOCKDEVS]; +static struct smap *pfmap[PF_MAX]; + +/* + * Initialize the socket device map table. 
+ */ +void +init_smap(void) +{ + unsigned int i; + + for (i = 0; i < __arraycount(smap); i++) { + /* + * The smap numbers are one-based so as to ensure that no + * socket will have the device number NO_DEV, which would + * create problems with eg the select code. + */ + smap[i].smap_num = i + 1; + smap[i].smap_endpt = NONE; + } + + memset(pfmap, 0, sizeof(pfmap)); +} + +/* + * Register a socket driver. This action can only be requested by RS. The + * process identified by the given DS label 'label' and endpoint 'endpt' is to + * be responsible for sockets created in the domains as given in the 'domains' + * array, which contains 'ndomains' elements. Return OK upon successful + * registration, or an error code otherwise. + */ +int +smap_map(const char * label, endpoint_t endpt, const int * domains, + unsigned int ndomains) +{ + struct smap *sp; + unsigned int i, num = 0; + int domain; + + if (ndomains <= 0 || ndomains > NR_DOMAIN) + return EINVAL; + + /* + * See if there is already a socket device map entry for this label. + * If so, the socket driver is probably being restarted, and we should + * overwrite its previous entry. + */ + sp = NULL; + for (i = 0; i < __arraycount(smap); i++) { + if (smap[i].smap_endpt != NONE && + !strcmp(smap[i].smap_label, label)) { + sp = &smap[i]; + break; + } + } + + /* + * See if all given domains are valid and not already reserved by a + * socket driver other than (if applicable) this driver's old instance. + */ + for (i = 0; i < ndomains; i++) { + domain = domains[i]; + if (domain < 0 || domain >= __arraycount(pfmap)) + return EINVAL; + if (domain == PF_UNSPEC) + return EINVAL; + if (pfmap[domain] != NULL && pfmap[domain] != sp) + return EBUSY; + } + + /* + * If we are not about to replace an existing socket device map entry, + * find a free entry, returning an error if all entries are in use. 
+ */ + if (sp == NULL) { + for (num = 0; num < __arraycount(smap); num++) + if (smap[num].smap_endpt == NONE) + break; + + if (num == __arraycount(smap)) + return ENOMEM; + } else + num = (unsigned int)(sp - smap); + + /* + * At this point, the registration will succeed, and we can start + * modifying tables. Just to be sure, unmap the domain mappings for + * the old instance, in case it is somehow registered with a different + * set of domains. Also, if the endpoint of the service has changed, + * cancel any operations involving the previous endpoint and invalidate + * any preexisting sockets. However, for stateful restarts where the + * service endpoint does not change, leave things as is. + */ + if (sp != NULL) { + if (sp->smap_endpt != endpt) { + /* + * For stateless restarts, it is common that the new + * endpoint is made ready before the old endpoint is + * exited, so we cannot wait for the exit handling code + * to do these steps, as they rely on the old socket + * mapping still being around. + */ + unsuspend_by_endpt(sp->smap_endpt); + + invalidate_filp_by_sock_drv(sp->smap_num); + } + + for (i = 0; i < __arraycount(pfmap); i++) + if (pfmap[i] == sp) + pfmap[i] = NULL; + } + + /* + * Initialize the socket driver map entry, and set up the domain map + * entries. + */ + sp = &smap[num]; + sp->smap_endpt = endpt; + strlcpy(sp->smap_label, label, sizeof(sp->smap_label)); + sp->smap_sel_busy = FALSE; + sp->smap_sel_filp = NULL; + + for (i = 0; i < ndomains; i++) + pfmap[domains[i]] = sp; + + return OK; +} + +/* + * The process with the given endpoint has exited. If the endpoint identifies + * a socket driver, deregister the driver and invalidate any sockets it owned. + */ +void +smap_unmap_by_endpt(endpoint_t endpt) +{ + struct smap *sp; + unsigned int i; + + if ((sp = get_smap_by_endpt(endpt)) == NULL) + return; + + /* + * Invalidation requires that the smap entry still be around, so do + * this before clearing the endpoint. 
+ */ + invalidate_filp_by_sock_drv(sp->smap_num); + + sp->smap_endpt = NONE; + + for (i = 0; i < __arraycount(pfmap); i++) + if (pfmap[i] == sp) + pfmap[i] = NULL; +} + +/* + * The given endpoint has announced itself as a socket driver. + */ +void +smap_endpt_up(endpoint_t endpt) +{ + struct smap *sp; + + if ((sp = get_smap_by_endpt(endpt)) == NULL) + return; + + /* + * The announcement indicates that the socket driver has either started + * anew or restarted statelessly. In the second case, none of its + * previously existing sockets will have survived, so mark them as + * invalid. + */ + invalidate_filp_by_sock_drv(sp->smap_num); +} + +/* + * Construct a device number that combines the entry number of the given socket + * map and the given per-driver socket identifier, thus constructing a unique + * identifier for the socket. Generally speaking, we use the dev_t type + * because the value is stored as special device number (sdev) on a socket node + * on PFS. We use our own bit division rather than the standard major/minor + * division because this simplifies using each half as a 32-bit value. The + * block/character device numbers and socket device numbers are in different + * namespaces, and numbers may overlap (even though this is currently + * practically impossible), so one must always test the file type first. + */ +dev_t +make_smap_dev(struct smap * sp, sockid_t sockid) +{ + + assert(sp->smap_endpt != NONE); + assert(sockid >= 0); + + return (dev_t)(((uint64_t)sp->smap_num << 32) | (uint32_t)sockid); +} + +/* + * Return a pointer to the smap structure for the socket driver associated with + * the socket device number. In addition, if the given socket ID pointer is + * not NULL, store the per-driver socket identifier in it. Return NULL if the + * given socket device number is not a socket for a valid socket driver. 
+ */ +struct smap * +get_smap_by_dev(dev_t dev, sockid_t * sockidp) +{ + struct smap *sp; + unsigned int num; + sockid_t id; + + num = (unsigned int)(dev >> 32); + id = (sockid_t)(dev & ((1ULL << 32) - 1)); + if (num == 0 || num > __arraycount(smap) || id < 0) + return NULL; + + sp = &smap[num - 1]; + assert(sp->smap_num == num); + + if (sp->smap_endpt == NONE) + return NULL; + + if (sockidp != NULL) + *sockidp = id; + return sp; +} + +/* + * Return a pointer to the smap structure for the socket driver with the given + * endpoint. Return NULL if the endpoint does not identify a socket driver. + */ +struct smap * +get_smap_by_endpt(endpoint_t endpt) +{ + unsigned int i; + + /* + * TODO: this function is used rather frequently, so it would be nice + * to get rid of the O(n) loop here. The get_dmap_by_endpt() function + * suffers from the same problem. It might be worth adding an extra + * field to the fproc structure for this. + */ + for (i = 0; i < __arraycount(smap); i++) + if (smap[i].smap_endpt == endpt) + return &smap[i]; + + return NULL; +} + +/* + * Return a pointer to the smap structure for the socket driver handling the + * given domain (protocol family). Return NULL if there is no match. + */ +struct smap * +get_smap_by_domain(int domain) +{ + + if (domain < 0 || domain >= __arraycount(pfmap)) + return NULL; + + return pfmap[domain]; /* may be NULL */ +} diff --git a/minix/servers/vfs/socket.c b/minix/servers/vfs/socket.c index 8201b0f03..dd8af0a12 100644 --- a/minix/servers/vfs/socket.c +++ b/minix/servers/vfs/socket.c @@ -1,7 +1,4 @@ /* - * IMPORTANT NOTICE: THIS FILE CONTAINS STUBS ONLY RIGHT NOW, TO ENABLE A - * SEAMLESS TRANSITION TO THE NEW API FOR PROGRAMS STATICALLY LINKED TO LIBC! - * * This file implements the upper socket layer of VFS: the BSD socket system * calls, and any associated file descriptor, file pointer, vnode, and file * system processing. 
In most cases, this layer will call into the lower @@ -35,17 +32,189 @@ */ #include "fs.h" +#include "vnode.h" +#include "file.h" #include +/* + * Convert any SOCK_xx open flags to O_xx open flags. + */ +static int +get_sock_flags(int type) +{ + int flags; + + flags = 0; + if (type & SOCK_CLOEXEC) + flags |= O_CLOEXEC; + if (type & SOCK_NONBLOCK) + flags |= O_NONBLOCK; + if (type & SOCK_NOSIGPIPE) + flags |= O_NOSIGPIPE; + + return flags; +} + +/* + * Perform cheap pre-call checks to ensure that the given number of socket FDs + * can be created for the current process. + */ +static int +check_sock_fds(int nfds) +{ + + /* + * For now, we simply check if there are enough file descriptor slots + * free in the process. Since the process is blocked on a socket call, + * this aspect will not change. Availability of file pointers, vnodes, + * and PFS nodes may vary, and is therefore less interesting to check + * here - it will have to be checked again upon completion anyway. + */ + return check_fds(fp, nfds); +} + +/* + * Create a new file descriptor, including supporting objects, for the open + * socket identified by 'dev', in the current process, using the O_xx open + * flags 'flags'. On success, return the file descriptor number. The results + * of a successful call can be undone with close_fd(), which will also close + * the socket itself. On failure, return a negative error code. In this case, + * the socket will be left open. + */ +static int +make_sock_fd(dev_t dev, int flags) +{ + struct vmnt *vmp; + struct vnode *vp; + struct filp *filp; + struct node_details res; + int r, fd; + + assert((flags & ~(O_CLOEXEC | O_NONBLOCK | O_NOSIGPIPE)) == 0); + +#if !NDEBUG + /* + * Check whether there is a socket object for the new device already. + * This is an expensive check, but if the socket driver sends us a new + * socket ID that is already in use, this is a sure sign of driver + * misbehavior. 
So far it does seem like nothing would go wrong within + * VFS in this case though, which is why this is a debug-only check. + */ + if (find_filp_by_sock_dev(dev) != NULL) { + printf("VFS: socket driver %d generated in-use socket ID!\n", + get_smap_by_dev(dev, NULL)->smap_endpt); + return EIO; + } +#endif /* !NDEBUG */ + + /* + * Get a lock on PFS. TODO: it is not clear whether locking PFS is + * needed at all, let alone which lock: map_vnode() uses a write lock, + * create_pipe() uses a read lock, and cdev_clone() uses no lock at + * all. As is, the README prescribes VMNT_READ, so that's what we use + * here. The code below largely copies the create_pipe() code anyway. + */ + if ((vmp = find_vmnt(PFS_PROC_NR)) == NULL) + panic("PFS gone"); + if ((r = lock_vmnt(vmp, VMNT_READ)) != OK) + return r; + + /* Obtain a free vnode. */ + if ((vp = get_free_vnode()) == NULL) { + unlock_vmnt(vmp); + return err_code; + } + lock_vnode(vp, VNODE_OPCL); + + /* Acquire a file descriptor. */ + if ((r = get_fd(fp, 0, R_BIT | W_BIT, &fd, &filp)) != OK) { + unlock_vnode(vp); + unlock_vmnt(vmp); + return r; + } + + /* Create a PFS node for the socket. */ + if ((r = req_newnode(PFS_PROC_NR, fp->fp_effuid, fp->fp_effgid, + S_IFSOCK | ACCESSPERMS, dev, &res)) != OK) { + unlock_filp(filp); + unlock_vnode(vp); + unlock_vmnt(vmp); + return r; + } + + /* Fill in the objects, and link them together. */ + vp->v_fs_e = res.fs_e; + vp->v_inode_nr = res.inode_nr; + vp->v_mode = res.fmode; + vp->v_sdev = dev; + vp->v_fs_count = 1; + vp->v_ref_count = 1; + vp->v_vmnt = NULL; + vp->v_dev = NO_DEV; + vp->v_size = 0; + + filp->filp_vno = vp; + filp->filp_flags = O_RDWR | flags; + filp->filp_count = 1; + + fp->fp_filp[fd] = filp; + if (flags & O_CLOEXEC) + FD_SET(fd, &fp->fp_cloexec_set); + + /* Release locks, and return the new file descriptor. */ + unlock_filp(filp); /* this also unlocks the vnode now! */ + unlock_vmnt(vmp); + + return fd; +} + /* * Create a socket. 
*/ int do_socket(void) { + int domain, type, sock_type, protocol; + dev_t dev; + int r, flags; + + domain = job_m_in.m_lc_vfs_socket.domain; + type = job_m_in.m_lc_vfs_socket.type; + protocol = job_m_in.m_lc_vfs_socket.protocol; + + /* Is there a socket driver for this domain at all? */ + if (get_smap_by_domain(domain) == NULL) + return EAFNOSUPPORT; + + /* + * Ensure that it is at least likely that after creating a socket, we + * will be able to create a file descriptor for it, along with all the + * necessary supporting objects. While it would be slightly neater to + * allocate these objects before trying to create the socket, this is + * offset by the fact that that approach results in a downright mess in + * do_socketpair() below, and with the current approach we can reuse + * the same code for accepting sockets as well. For newly created + * sockets, it is no big deal to close them right after creation; for + * newly accepted sockets, we have no choice but to do that anyway. + * Moreover, object creation failures should be rare and our approach + * does not cause significantly more overhead anyway, so the entire + * issue is largely philosophical anyway. For now, this will do. + */ + if ((r = check_sock_fds(1)) != OK) + return r; - return EAFNOSUPPORT; + sock_type = type & ~SOCK_FLAGS_MASK; + flags = get_sock_flags(type); + + if ((r = sdev_socket(domain, sock_type, protocol, &dev, + FALSE /*pair*/)) != OK) + return r; + + if ((r = make_sock_fd(dev, flags)) < 0) + (void)sdev_close(dev); + + return r; } /* @@ -54,8 +223,82 @@ do_socket(void) int do_socketpair(void) { + int domain, type, sock_type, protocol; + dev_t dev[2]; + int r, fd0, fd1, flags; + + domain = job_m_in.m_lc_vfs_socket.domain; + type = job_m_in.m_lc_vfs_socket.type; + protocol = job_m_in.m_lc_vfs_socket.protocol; + + /* Is there a socket driver for this domain at all? */ + if (get_smap_by_domain(domain) == NULL) + return EAFNOSUPPORT; + + /* + * See the lengthy comment in do_socket(). 
This time we need two of + * everything, though. + */ + if ((r = check_sock_fds(2)) != OK) + return r; + + sock_type = type & ~SOCK_FLAGS_MASK; + flags = get_sock_flags(type); + + if ((r = sdev_socket(domain, sock_type, protocol, dev, + TRUE /*pair*/)) != OK) + return r; + + if ((fd0 = make_sock_fd(dev[0], flags)) < 0) { + (void)sdev_close(dev[0]); + (void)sdev_close(dev[1]); + return fd0; + } + + if ((fd1 = make_sock_fd(dev[1], flags)) < 0) { + close_fd(fp, fd0); + (void)sdev_close(dev[1]); + return fd1; + } + + job_m_out.m_vfs_lc_fdpair.fd0 = fd0; + job_m_out.m_vfs_lc_fdpair.fd1 = fd1; + return OK; +} + +/* + * Check whether the given file descriptor identifies an open socket in the + * current process. If so, return OK, with the socket device number stored in + * 'dev' and its file pointer flags stored in 'flags' (if not NULL). If not, + * return an appropriate error code. + */ +static int +get_sock(int fd, dev_t * dev, int * flags) +{ + struct filp *filp; - return EAFNOSUPPORT; + if ((filp = get_filp(fd, VNODE_READ)) == NULL) + return err_code; + + if (!S_ISSOCK(filp->filp_vno->v_mode)) { + unlock_filp(filp); + return ENOTSOCK; + } + + *dev = filp->filp_vno->v_sdev; + if (flags != NULL) + *flags = filp->filp_flags; + + /* + * It is safe to leave the file pointer object unlocked during the + * actual call. Since the current process is blocked for the duration + * of the socket call, we know the socket's file descriptor, and thus + * its file pointer, can not possibly be freed. In addition, we will + * not be accessing the file pointer anymore later, with the exception + * of accept calls, which reacquire the lock when the reply comes in. 
+ */ + unlock_filp(filp); + return OK; } /* @@ -64,8 +307,16 @@ do_socketpair(void) int do_bind(void) { + dev_t dev; + int r, fd, flags; + + fd = job_m_in.m_lc_vfs_sockaddr.fd; - return ENOTSOCK; + if ((r = get_sock(fd, &dev, &flags)) != OK) + return r; + + return sdev_bind(dev, job_m_in.m_lc_vfs_sockaddr.addr, + job_m_in.m_lc_vfs_sockaddr.addr_len, flags); } /* @@ -74,8 +325,16 @@ do_bind(void) int do_connect(void) { + dev_t dev; + int r, fd, flags; + + fd = job_m_in.m_lc_vfs_sockaddr.fd; - return ENOTSOCK; + if ((r = get_sock(fd, &dev, &flags)) != OK) + return r; + + return sdev_connect(dev, job_m_in.m_lc_vfs_sockaddr.addr, + job_m_in.m_lc_vfs_sockaddr.addr_len, flags); } /* @@ -84,8 +343,19 @@ do_connect(void) int do_listen(void) { + dev_t dev; + int r, fd, backlog; + + fd = job_m_in.m_lc_vfs_listen.fd; + backlog = job_m_in.m_lc_vfs_listen.backlog; - return ENOTSOCK; + if ((r = get_sock(fd, &dev, NULL)) != OK) + return r; + + if (backlog < 0) + backlog = 0; + + return sdev_listen(dev, backlog); } /* @@ -94,8 +364,116 @@ do_listen(void) int do_accept(void) { + dev_t dev; + int r, fd, flags; + + fd = job_m_in.m_lc_vfs_sockaddr.fd; + + if ((r = get_sock(fd, &dev, &flags)) != OK) + return r; - return ENOTSOCK; + if ((r = check_sock_fds(1)) != OK) + return r; + + return sdev_accept(dev, job_m_in.m_lc_vfs_sockaddr.addr, + job_m_in.m_lc_vfs_sockaddr.addr_len, flags, fd); +} + +/* + * Resume a previously suspended accept(2) system call. This routine must + * cover three distinct cases, depending on the 'status' and 'dev' values: + * + * #1. If the 'status' parameter is set to OK, the accept call succeeded. In + * that case, the function is guaranteed to be called from a worker thread, + * with 'fp' set to the user process that made the system call. In that + * case, this function may block its calling thread. The 'dev' parameter + * will contain the device number of the newly accepted socket. + * #2. 
If the 'status' parameter contains a negative error code, but 'dev' is + * *not* set to NO_DEV, then the same as above applies, except that the new + * socket must be closed immediately. + * #3. If 'status' is a negative error code and 'dev' is set to NO_DEV, then + * the accept call has failed and no new socket was ever created. In this + * case, the function MUST NOT block its calling thread. + */ +void +resume_accept(struct fproc * rfp, int status, dev_t dev, unsigned int addr_len, + int listen_fd) +{ + message m; + dev_t ldev; + int r, flags; + + /* + * If the call did not succeed and no socket was created (case #3), we + * cannot and should not do more than send the error to the user + * process. + */ + if (status != OK && dev == NO_DEV) { + replycode(rfp->fp_endpoint, status); + + return; + } + + /* + * The call succeeded. The lower socket layer (sdev.c) ensures that in + * that case, we are called from a worker thread which is associated + * with the original user process. Thus, we can block the current + * thread. Start by verifying that the listening socket is still + * around. If it is not, it must have been invalidated as a result of + * a socket driver death, in which case we must report an error but + * need not close the new socket. As a side effect, obtain the + * listening socket's flags, which on BSD systems are inherited by the + * accepted socket. + */ + assert(fp == rfp); /* needed for get_sock() and make_sock_fd() */ + + if (get_sock(listen_fd, &ldev, &flags) != OK) { + replycode(rfp->fp_endpoint, EIO); + + return; + } + + /* The same socket driver must host both sockets, obviously. */ + assert(get_smap_by_dev(ldev, NULL) == get_smap_by_dev(dev, NULL)); + + /* + * If an error status was returned (case #2), we must now close the + * newly accepted socket. Effectively, this allows socket drivers to + * handle address copy failures in the cleanest possible way. 
+ */ + if (status != OK) { + (void)sdev_close(dev); + + replycode(rfp->fp_endpoint, status); + + return; + } + + /* + * A new socket has been successfully accepted (case #1). Try to + * create a file descriptor for the new socket. If this fails, we have + * to close the new socket after all. That is not great, but we have + * no way to prevent this except by preallocating all objects for the + * duration of the accept call, which is not exactly great either. + */ + flags &= O_CLOEXEC | O_NONBLOCK | O_NOSIGPIPE; + + if ((r = make_sock_fd(dev, flags)) < 0) { + (void)sdev_close(dev); + + replycode(rfp->fp_endpoint, r); + + return; + } + + /* + * The accept call has succeeded. Send a reply message with the new + * file descriptor and an address length (which may be zero). + */ + memset(&m, 0, sizeof(m)); + m.m_vfs_lc_socklen.len = addr_len; + + reply(&m, rfp->fp_endpoint, r); } /* @@ -104,8 +482,19 @@ do_accept(void) int do_sendto(void) { + dev_t dev; + int r, fd, flags; + + fd = job_m_in.m_lc_vfs_sendrecv.fd; + + if ((r = get_sock(fd, &dev, &flags)) != OK) + return r; - return ENOTSOCK; + return sdev_readwrite(dev, job_m_in.m_lc_vfs_sendrecv.buf, + job_m_in.m_lc_vfs_sendrecv.len, 0, 0, + job_m_in.m_lc_vfs_sendrecv.addr, + job_m_in.m_lc_vfs_sendrecv.addr_len, + job_m_in.m_lc_vfs_sendrecv.flags, WRITING, flags, 0); } /* @@ -114,8 +503,37 @@ do_sendto(void) int do_recvfrom(void) { + dev_t dev; + int r, fd, flags; + + fd = job_m_in.m_lc_vfs_sendrecv.fd; + + if ((r = get_sock(fd, &dev, &flags)) != OK) + return r; + + return sdev_readwrite(dev, job_m_in.m_lc_vfs_sendrecv.buf, + job_m_in.m_lc_vfs_sendrecv.len, 0, 0, + job_m_in.m_lc_vfs_sendrecv.addr, + job_m_in.m_lc_vfs_sendrecv.addr_len, + job_m_in.m_lc_vfs_sendrecv.flags, READING, flags, 0); +} + +/* + * Resume a previously suspended recvfrom(2) system call. This function MUST + * NOT block its calling thread. 
+ */ +void +resume_recvfrom(struct fproc * rfp, int status, unsigned int addr_len) +{ + message m; - return ENOTSOCK; + if (status >= 0) { + memset(&m, 0, sizeof(m)); + m.m_vfs_lc_socklen.len = addr_len; + + reply(&m, rfp->fp_endpoint, status); + } else + replycode(rfp->fp_endpoint, status); } /* @@ -124,8 +542,112 @@ do_recvfrom(void) int do_sockmsg(void) { + struct msghdr msg; + struct iovec iov; + vir_bytes msg_buf, data_buf; + size_t data_len; + dev_t dev; + int r, fd, flags; + + assert(job_call_nr == VFS_SENDMSG || job_call_nr == VFS_RECVMSG); + + fd = job_m_in.m_lc_vfs_sockmsg.fd; + msg_buf = job_m_in.m_lc_vfs_sockmsg.msgbuf; + + if ((r = get_sock(fd, &dev, &flags)) != OK) + return r; + + if ((r = sys_datacopy_wrapper(who_e, msg_buf, SELF, (vir_bytes)&msg, + sizeof(msg))) != OK) + return r; + + data_buf = 0; + data_len = 0; + if (msg.msg_iovlen > 0) { + /* + * We do not yet support vectors with more than one element; + * for this reason, libc is currently expected to consolidate + * the entire vector into a single element. Once we do add + * proper vector support, the ABI itself need not be changed. + */ + if (msg.msg_iovlen > 1) + return EMSGSIZE; + + if ((r = sys_datacopy_wrapper(who_e, (vir_bytes)msg.msg_iov, + SELF, (vir_bytes)&iov, sizeof(iov))) != OK) + return r; + + if (iov.iov_len > SSIZE_MAX) + return EINVAL; + + if (iov.iov_len > 0) { + data_buf = (vir_bytes)iov.iov_base; + data_len = iov.iov_len; + } + } + + return sdev_readwrite(dev, data_buf, data_len, + (vir_bytes)msg.msg_control, msg.msg_controllen, + (vir_bytes)msg.msg_name, msg.msg_namelen, + job_m_in.m_lc_vfs_sockmsg.flags, + (job_call_nr == VFS_RECVMSG) ? READING : WRITING, flags, + (job_call_nr == VFS_RECVMSG) ? msg_buf : 0); +} + +/* + * Resume a previously suspended recvmsg(2) system call. The 'status' + * parameter contains either the number of data bytes received or a negative + * error code. The 'msg_buf' parameter contains the user address of the msghdr + * structure. 
If a failure occurs in this function, the received data + * (including, in the worst case, references to received file descriptors) will + * be lost - while seriously ugly, this is always the calling process's fault, + * extremely hard to deal with, and on par with current behavior in other + * operating systems. This function MUST NOT block its calling thread. + */ +void +resume_recvmsg(struct fproc * rfp, int status, unsigned int ctl_len, + unsigned int addr_len, int flags, vir_bytes msg_buf) +{ + struct msghdr msg; + int r; + + if (status < 0) { + replycode(rfp->fp_endpoint, status); + + return; + } + + /* + * Unfortunately, we now need to update a subset of the fields of the + * msghdr structure. We can 1) copy in the entire structure for the + * second time, modify some fields, and copy it out in its entirety + * again, 2) copy out individual fields that have been changed, 3) save + * a copy of the original structure somewhere. The third option is the + * most efficient, but would increase the fproc structure size by quite + * a bit. The main difference between the first and second options is + * the number of kernel calls; we choose to use the first option. + */ + if ((r = sys_datacopy_wrapper(rfp->fp_endpoint, msg_buf, SELF, + (vir_bytes)&msg, sizeof(msg))) != OK) { + /* We copied it in before, how could it fail now? */ + printf("VFS: resume_recvmsg cannot copy in msghdr? (%d)\n", r); + + replycode(rfp->fp_endpoint, r); + + return; + } - return ENOTSOCK; + /* Modify and copy out the structure, and wake up the caller. 
*/ + msg.msg_controllen = ctl_len; + msg.msg_flags = flags; + if (addr_len > 0) + msg.msg_namelen = addr_len; + + if ((r = sys_datacopy_wrapper(SELF, (vir_bytes)&msg, rfp->fp_endpoint, + msg_buf, sizeof(msg))) != OK) + status = r; + + replycode(rfp->fp_endpoint, status); } /* @@ -134,8 +656,17 @@ do_sockmsg(void) int do_setsockopt(void) { + dev_t dev; + int r, fd; + + fd = job_m_in.m_lc_vfs_sockopt.fd; + + if ((r = get_sock(fd, &dev, NULL)) != OK) + return r; - return ENOTSOCK; + return sdev_setsockopt(dev, job_m_in.m_lc_vfs_sockopt.level, + job_m_in.m_lc_vfs_sockopt.name, job_m_in.m_lc_vfs_sockopt.buf, + job_m_in.m_lc_vfs_sockopt.len); } /* @@ -144,8 +675,23 @@ do_setsockopt(void) int do_getsockopt(void) { + unsigned int len; + dev_t dev; + int r, fd; - return ENOTSOCK; + fd = job_m_in.m_lc_vfs_sockopt.fd; + len = job_m_in.m_lc_vfs_sockopt.len; + + if ((r = get_sock(fd, &dev, NULL)) != OK) + return r; + + r = sdev_getsockopt(dev, job_m_in.m_lc_vfs_sockopt.level, + job_m_in.m_lc_vfs_sockopt.name, job_m_in.m_lc_vfs_sockopt.buf, + &len); + + if (r == OK) + job_m_out.m_vfs_lc_socklen.len = len; + return r; } /* @@ -154,8 +700,21 @@ do_getsockopt(void) int do_getsockname(void) { + unsigned int len; + dev_t dev; + int r, fd; + + fd = job_m_in.m_lc_vfs_sockaddr.fd; + len = job_m_in.m_lc_vfs_sockaddr.addr_len; - return ENOTSOCK; + if ((r = get_sock(fd, &dev, NULL)) != OK) + return r; + + r = sdev_getsockname(dev, job_m_in.m_lc_vfs_sockaddr.addr, &len); + + if (r == OK) + job_m_out.m_vfs_lc_socklen.len = len; + return r; } /* @@ -164,8 +723,21 @@ do_getsockname(void) int do_getpeername(void) { + unsigned int len; + dev_t dev; + int r, fd; + + fd = job_m_in.m_lc_vfs_sockaddr.fd; + len = job_m_in.m_lc_vfs_sockaddr.addr_len; + + if ((r = get_sock(fd, &dev, NULL)) != OK) + return r; - return ENOTSOCK; + r = sdev_getpeername(dev, job_m_in.m_lc_vfs_sockaddr.addr, &len); + + if (r == OK) + job_m_out.m_vfs_lc_socklen.len = len; + return r; } /* @@ -174,6 +746,17 @@ 
do_getpeername(void) int do_shutdown(void) { + dev_t dev; + int r, fd, how; + + fd = job_m_in.m_lc_vfs_shutdown.fd; + how = job_m_in.m_lc_vfs_shutdown.how; + + if ((r = get_sock(fd, &dev, NULL)) != OK) + return r; + + if (how != SHUT_RD && how != SHUT_WR && how != SHUT_RDWR) + return EINVAL; - return ENOTSOCK; + return sdev_shutdown(dev, how); } diff --git a/minix/servers/vfs/type.h b/minix/servers/vfs/type.h index 673be5f1f..58a7afadd 100644 --- a/minix/servers/vfs/type.h +++ b/minix/servers/vfs/type.h @@ -38,4 +38,14 @@ struct statvfs_cache { unsigned long f_namemax; /* maximum filename length */ }; +struct smap { + unsigned int smap_num; /* one-based number into smap array */ + endpoint_t smap_endpt; /* driver endpoint, NONE if free */ + char smap_label[LABEL_MAX]; /* driver label */ + int smap_sel_busy; /* doing initial select on socket? */ + struct filp * smap_sel_filp; /* socket being selected on */ +}; + +typedef int32_t sockid_t; + #endif