From: David van Moolenbroek
Date: Thu, 29 Sep 2016 23:07:07 +0000 (+0000)
Subject: Add lwip: a new lwIP-based TCP/IP service

Add lwip: a new lwIP-based TCP/IP service

This commit adds a new TCP/IP service to MINIX 3. At its core, the service
uses the lwIP TCP/IP stack, a choice made for maintenance reasons. The
service aims to be compatible with NetBSD userland, including its low-level
network management utilities. It also aims to support modern features such
as IPv6.

In summary, the new LWIP service has support for the following main features:

- TCP, UDP, RAW sockets with mostly standard BSD API semantics;
- IPv6 support: host mode (complete) and router mode (partial);
- most of the standard BSD API socket options (SO_);
- all of the standard BSD API message flags (MSG_);
- the most used protocol-specific socket and control options;
- a default loopback interface and the ability to create one more;
- configuration-free ethernet interfaces and driver tracking;
- queuing and multiple concurrent requests to each ethernet driver;
- standard ioctl(2)-based BSD interface management;
- radix tree backed, destination-based routing;
- routing sockets for standard BSD route reporting and management;
- multicast traffic and multicast group membership tracking;
- Berkeley Packet Filter (BPF) devices;
- standard and custom sysctl(7) nodes for many internals;
- a slab allocation based, hybrid static/dynamic memory pool model.

Many of its modules come with fairly elaborate comments that cover many
aspects of what is going on. The service is primarily a socket driver built
on top of the libsockdriver library, but for BPF devices it is at the same
time also a character driver.
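As an illustration only (not part of this change), the sketch below shows how
a userland program might use one of the new BPF devices. The interface name
"lo0" and the trivial accept-all filter are assumptions for the example.
BIOCSETIF, BIOCSETF and BIOCGBLEN are the standard NetBSD BPF ioctls;
BIOCSETF is one of the requests that the libc changes in this commit
transparently rewrite into the MINIX3-specific flat format (MINIX_BIOCSETF),
because the NetBSD structure contains a pointer that does not survive
MINIX3's grant-based memory copying.

#include <sys/types.h>
#include <sys/ioctl.h>
#include <net/if.h>
#include <net/bpf.h>
#include <err.h>
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct bpf_insn insns[] = {
		BPF_STMT(BPF_RET + BPF_K, (u_int)-1)	/* accept each packet in full */
	};
	struct bpf_program bf;
	struct ifreq ifr;
	u_int buflen;
	char *buf;
	int fd;

	if ((fd = open("/dev/bpf", O_RDWR)) < 0)
		err(1, "open");

	/* Attach the BPF device to a network interface (assumed: lo0). */
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, "lo0", sizeof(ifr.ifr_name));
	if (ioctl(fd, BIOCSETIF, &ifr) != 0)
		err(1, "BIOCSETIF");

	/* Install the filter; libc rewrites this call into MINIX_BIOCSETF. */
	bf.bf_len = __arraycount(insns);
	bf.bf_insns = insns;
	if (ioctl(fd, BIOCSETF, &bf) != 0)
		err(1, "BIOCSETF");

	/* Reads must use the device's configured buffer size. */
	if (ioctl(fd, BIOCGBLEN, &buflen) != 0)
		err(1, "BIOCGBLEN");
	if ((buf = malloc(buflen)) == NULL)
		err(1, "malloc");

	/* A successful read returns bpf_hdr-prefixed captured packets. */
	if (read(fd, buf, buflen) < 0)
		err(1, "read");

	free(buf);
	close(fd);
	return 0;
}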
Change-Id: Ib0c02736234b21143915e5fcc0fda8fe408f046f --- diff --git a/distrib/sets/lists/minix-base/mi b/distrib/sets/lists/minix-base/mi index d3e8604bb..0ee5192a8 100644 --- a/distrib/sets/lists/minix-base/mi +++ b/distrib/sets/lists/minix-base/mi @@ -194,7 +194,7 @@ ./etc/system.conf.d/hello minix-base ./etc/system.conf.d/inet minix-base obsolete ./etc/system.conf.d/ipc minix-base -./etc/system.conf.d/lwip minix-base obsolete +./etc/system.conf.d/lwip minix-base ./etc/system.conf.d/random minix-base ./etc/system.conf.d/uds minix-base ./etc/system.conf.d/usb_hub minix-base @@ -277,7 +277,7 @@ ./service/is minix-base ./service/isofs minix-base ./service/log minix-base -./service/lwip minix-base obsolete +./service/lwip minix-base ./service/memory minix-base ./service/mfs minix-base ./service/mib minix-base diff --git a/distrib/sets/lists/minix-comp/mi b/distrib/sets/lists/minix-comp/mi index 64428697e..e0cc28825 100644 --- a/distrib/sets/lists/minix-comp/mi +++ b/distrib/sets/lists/minix-comp/mi @@ -1182,6 +1182,7 @@ ./usr/include/minix/blockdriver.h minix-comp ./usr/include/minix/blockdriver_mt.h minix-comp ./usr/include/minix/board.h minix-comp +./usr/include/minix/bpf.h minix-comp ./usr/include/minix/btrace.h minix-comp ./usr/include/minix/callnr.h minix-comp ./usr/include/minix/chardriver.h minix-comp @@ -1208,6 +1209,7 @@ ./usr/include/minix/hgfs.h minix-comp ./usr/include/minix/i2c.h minix-comp ./usr/include/minix/i2cdriver.h minix-comp +./usr/include/minix/if.h minix-comp ./usr/include/minix/input.h minix-comp ./usr/include/minix/inputdriver.h minix-comp ./usr/include/minix/ioctl.h minix-comp diff --git a/distrib/sets/lists/minix-debug/mi b/distrib/sets/lists/minix-debug/mi index cf038b14b..c3c7e0980 100644 --- a/distrib/sets/lists/minix-debug/mi +++ b/distrib/sets/lists/minix-debug/mi @@ -200,7 +200,7 @@ ./usr/libdata/debug/service/is.debug minix-debug debug ./usr/libdata/debug/service/isofs.debug minix-debug debug ./usr/libdata/debug/service/log.debug minix-debug debug -./usr/libdata/debug/service/lwip.debug minix-debug debug,obsolete +./usr/libdata/debug/service/lwip.debug minix-debug debug ./usr/libdata/debug/service/memory.debug minix-debug debug ./usr/libdata/debug/service/mfs.debug minix-debug debug ./usr/libdata/debug/service/mib.debug minix-debug debug diff --git a/minix/commands/DESCRIBE/DESCRIBE.sh b/minix/commands/DESCRIBE/DESCRIBE.sh index 472dc9517..b2419c628 100644 --- a/minix/commands/DESCRIBE/DESCRIBE.sh +++ b/minix/commands/DESCRIBE/DESCRIBE.sh @@ -130,6 +130,9 @@ do ;; 6,0) des="line printer, parallel port" dev=lp ;; + 7,0) + des="Berkeley Packet Filter device" dev=bpf + ;; 9,0) des="unix98 pseudoterminal master" dev=ptmx ;; diff --git a/minix/commands/MAKEDEV/MAKEDEV.sh b/minix/commands/MAKEDEV/MAKEDEV.sh index 8279a4f46..4039a0114 100755 --- a/minix/commands/MAKEDEV/MAKEDEV.sh +++ b/minix/commands/MAKEDEV/MAKEDEV.sh @@ -33,6 +33,7 @@ RAMDISK_DEVICES=" STD_DEVICES=" ${RAMDISK_DEVICES} bmp085b1s77 bmp085b2s77 bmp085b3s77 + bpf eepromb1s50 eepromb1s51 eepromb1s52 eepromb1s53 eepromb1s54 eepromb1s55 eepromb1s56 eepromb1s57 eepromb2s50 eepromb2s51 eepromb2s52 eepromb2s53 @@ -128,6 +129,7 @@ Where key is one of the following: tty00 ... tty03 # Make serial lines ttyp0 ... ttyq0 ... 
# Make tty, pty pairs audio mixer # Make audio devices + bpf # Make /dev/bpf klog # Make /dev/klog ptmx # Make /dev/ptmx random # Make /dev/random, /dev/urandom @@ -215,6 +217,13 @@ do makedev bmp085b${bus}s77 c ${major} 0 ${uname} ${gname} 444 ;; + bpf) + # Berkeley Packet Filter device, for the LWIP service + # This is a cloning device, but some programs (e.g., dhclient) + # assume individual devices are numbered, so also create bpf0. + makedev ${dev} c 7 0 ${uname} ${gname} 600 + makedev ${dev}0 c 7 0 ${uname} ${gname} 600 + ;; c[0-3]d[0-7]) # Whole disk devices. disk=`expr ${dev} : '...\\(.\\)'` diff --git a/minix/fs/procfs/service.c b/minix/fs/procfs/service.c index e9efa1a00..fe758347c 100644 --- a/minix/fs/procfs/service.c +++ b/minix/fs/procfs/service.c @@ -125,7 +125,7 @@ service_get_policies(struct policies * pol, index_t slot) { .label = "ptyfs", .policy_str = "" }, { .label = "vbfs", .policy_str = "" }, /* net */ - { .label = "lwip", .policy_str = "" }, + { .label = "lwip", .policy_str = "reset" }, /* servers */ { .label = "devman", .policy_str = "restart" }, { .label = "ds", .policy_str = "restart" }, diff --git a/minix/include/minix/Makefile b/minix/include/minix/Makefile index 72e3afa63..2ed8564df 100644 --- a/minix/include/minix/Makefile +++ b/minix/include/minix/Makefile @@ -5,14 +5,14 @@ INCSDIR= /usr/include/minix INCS+= paths.h param.h INCS+= acpi.h audio_fw.h bitmap.h \ bdev.h blockdriver.h blockdriver_mt.h \ - board.h btrace.h \ + board.h bpf.h btrace.h \ callnr.h chardriver.h clkconf.h com.h \ config.h const.h cpufeature.h \ debug.h devio.h devman.h dmap.h \ driver.h drivers.h drvlib.h ds.h \ endpoint.h fb.h fsdriver.h fslib.h gpio.h gcov.h hash.h \ - hgfs.h i2c.h i2cdriver.h ioctl.h input.h \ - inputdriver.h ipc.h ipc_filter.h ipcconst.h \ + hgfs.h i2c.h i2cdriver.h if.h input.h inputdriver.h \ + ioctl.h ipc.h ipc_filter.h ipcconst.h \ keymap.h log.h mmio.h mthread.h minlib.h \ netdriver.h optset.h padconf.h partition.h portio.h \ priv.h procfs.h profile.h \ diff --git a/minix/include/minix/bpf.h b/minix/include/minix/bpf.h new file mode 100644 index 000000000..acc38206a --- /dev/null +++ b/minix/include/minix/bpf.h @@ -0,0 +1,42 @@ +#ifndef _MINIX_BPF_H +#define _MINIX_BPF_H + +#include + +/* + * MINIX3-specific extensions to the NetBSD Berkeley Packet Filter header. + * These extensions are necessary because NetBSD BPF uses a few ioctl(2) + * structure formats that contain pointers--something that MINIX3 has to avoid, + * due to its memory granting mechanisms. Thus, those ioctl(2) calls have to + * be converted from NetBSD to MINIX3 format. We currently do that in libc. + * This header specifies the numbers and formats for the MINIX3 versions. + * + * See for details on how things work here. + */ + +/* BIOCSETF: set BPF filter program. */ +/* + * This ioctl is an exception, as it is write-only, so we do not need the + * original structure. Also, the size of this structure is currently slightly + * over 4KB, which makes it too big for a regular ioctl call. Thus, we have to + * use a big ioctl call. Note that future changes of BPF_MAXINSNS will + * unfortunately (necessarily) change the ioctl call number. + */ +struct minix_bpf_program { + u_int mbf_len; + struct bpf_insn mbf_insns[BPF_MAXINSNS]; +}; + +#define MINIX_BIOCSETF _IOW_BIG(2, struct minix_bpf_program) + +/* BIOCGDLTLIST: retrieve list of possible data link types. 
*/ +#define MINIX_BPF_MAXDLT 256 + +struct minix_bpf_dltlist { + struct bpf_dltlist mbfl_dltlist; /* MUST be first */ + u_int mbfl_list[MINIX_BPF_MAXDLT]; +}; + +#define MINIX_BIOCGDLTLIST _IOWR('B', 119, struct minix_bpf_dltlist) + +#endif /* !_MINIX_BPF_H */ diff --git a/minix/include/minix/dmap.h b/minix/include/minix/dmap.h index 0bb46e49f..4617b807d 100644 --- a/minix/include/minix/dmap.h +++ b/minix/include/minix/dmap.h @@ -25,7 +25,7 @@ #define TTY_MAJOR 4 /* 4 = /dev/tty00 (ttys) */ #define CTTY_MAJOR 5 /* 5 = /dev/tty */ #define PRINTER_MAJOR 6 /* 6 = /dev/lp (printer driver) */ - /* 7 = (unused) */ +#define TCPIP_MAJOR 7 /* 7 = /dev/bpf (TCP/IP service) */ /* 8 = /dev/c1 */ #define PTY_MAJOR 9 /* 9 = /dev/ptyp0 (pty driver) */ /* 10 = /dev/c2 */ diff --git a/minix/include/minix/if.h b/minix/include/minix/if.h new file mode 100644 index 000000000..7a22df45a --- /dev/null +++ b/minix/include/minix/if.h @@ -0,0 +1,51 @@ +#ifndef _MINIX_IF_H +#define _MINIX_IF_H + +#include +#include + +/* + * MINIX3-specific extensions to the network interface headers. These + * extensions are necessary because NetBSD IF uses a few ioctl(2) structure + * formats that contain pointers--something that MINIX3 has to avoid, due to + * its memory granting mechanisms. Thus, those ioctl(2) calls have to be + * converted from NetBSD to MINIX3 format. We currently do that in libc. + * This header specifies the numbers and formats for the MINIX3 versions. + * + * The general idea is that we rewrite the ioctl request data to include both + * the original structure and a buffer for the array of values to which the + * original structure uses a pointer. Important: in those cases, the original + * structure is expected to be the first element of the replacement structure. + * + * There is typically no configured upper bound for the maximum number of + * values in the array, and so we pick size values that are hopefully always + * oversized and yet keep the ioctl sizes within the range of regular ioctls + * (4095 bytes, as per sys/ioccom.h). If there may be larger amounts of data, + * we have to use "big" ioctls. + * + * For the replacement ioctl codes, we use the original ioctl class and number + * with a different size. That should virtually eliminate the possibility of + * accidental collisions. + */ + +/* SIOCGIFMEDIA: retrieve interface media status and types. */ +#define MINIX_IF_MAXMEDIA 256 + +struct minix_ifmediareq { + struct ifmediareq mifm_ifm; /* MUST be first */ + int mifm_list[MINIX_IF_MAXMEDIA]; +}; + +#define MINIX_SIOCGIFMEDIA _IOWR('i', 54, struct minix_ifmediareq) + +/* SIOCIFGCLONERS: retrieve interface "cloners" (virtual types). 
*/ +#define MINIX_IF_MAXCLONERS 128 + +struct minix_if_clonereq { + struct if_clonereq mifcr_ifcr; /* MUST be first */ + char mifcr_buffer[MINIX_IF_MAXCLONERS * IFNAMSIZ]; +}; + +#define MINIX_SIOCIFGCLONERS _IOWR('i', 120, struct minix_if_clonereq) + +#endif /* !_MINIX_IF_H */ diff --git a/minix/include/minix/sysctl.h b/minix/include/minix/sysctl.h index 38a817521..39cfb2387 100644 --- a/minix/include/minix/sysctl.h +++ b/minix/include/minix/sysctl.h @@ -28,6 +28,7 @@ #define MINIX_TEST 0 #define MINIX_MIB 1 #define MINIX_PROC 2 +#define MINIX_LWIP 3 /* * These identifiers, under MINIX_TEST, are used by test87 to test the MIB diff --git a/minix/lib/libc/sys/ioctl.c b/minix/lib/libc/sys/ioctl.c index 6e2875ced..26e02066d 100644 --- a/minix/lib/libc/sys/ioctl.c +++ b/minix/lib/libc/sys/ioctl.c @@ -9,6 +9,10 @@ #include #include #include +#include +#include +#include +#include static void rewrite_i2c_netbsd_to_minix(minix_i2c_ioctl_exec_t *out, i2c_ioctl_exec_t *in); @@ -45,6 +49,199 @@ static void rewrite_i2c_minix_to_netbsd(i2c_ioctl_exec_t *out, } } +/* + * Convert a network interface related IOCTL with pointers to a flat format + * suitable for MINIX3. Return a pointer to the new data on success, or zero + * (with errno set) on failure. The original request code is given in + * 'request' and must be replaced by the new request code to be used. + */ +static vir_bytes +ioctl_convert_if_to_minix(void * data, unsigned long * request) +{ + struct minix_ifmediareq *mifm; + struct ifmediareq *ifm; + struct minix_if_clonereq *mifcr; + struct if_clonereq *ifcr; + + switch (*request) { + case SIOCGIFMEDIA: + ifm = (struct ifmediareq *)data; + + mifm = (struct minix_ifmediareq *)malloc(sizeof(*mifm)); + if (mifm != NULL) { + /* + * The count may exceed MINIX_IF_MAXMEDIA, and should + * be truncated as needed by the IF implementation. + */ + memcpy(&mifm->mifm_ifm, ifm, sizeof(*ifm)); + + *request = MINIX_SIOCGIFMEDIA; + } else + errno = ENOMEM; + + return (vir_bytes)mifm; + + case SIOCIFGCLONERS: + ifcr = (struct if_clonereq *)data; + + mifcr = (struct minix_if_clonereq *)malloc(sizeof(*mifcr)); + if (mifcr != NULL) { + /* + * The count may exceed MINIX_IF_MAXCLONERS, and should + * be truncated as needed by the IF implementation. + */ + memcpy(&mifcr->mifcr_ifcr, ifcr, sizeof(*ifcr)); + + *request = MINIX_SIOCIFGCLONERS; + } else + errno = ENOMEM; + + return (vir_bytes)mifcr; + + default: + assert(0); + + errno = ENOTTY; + return 0; + } +} + +/* + * Convert a the result of a network interface related IOCTL with pointers from + * the flat format used to make the call to MINIX3. Called on success only. + * The given request code is that of the (NetBSD-type) original. + */ +static void +ioctl_convert_if_from_minix(vir_bytes addr, void * data, unsigned long request) +{ + struct minix_ifmediareq *mifm; + struct ifmediareq *ifm; + struct minix_if_clonereq *mifcr; + struct if_clonereq *ifcr; + int count; + + switch (request) { + case SIOCGIFMEDIA: + mifm = (struct minix_ifmediareq *)addr; + ifm = (struct ifmediareq *)data; + + memcpy(ifm, &mifm->mifm_ifm, sizeof(*ifm)); + + if (ifm->ifm_ulist != NULL && ifm->ifm_count > 0) + memcpy(ifm->ifm_ulist, mifm->mifm_list, + ifm->ifm_count * sizeof(ifm->ifm_ulist[0])); + + break; + + case SIOCIFGCLONERS: + mifcr = (struct minix_if_clonereq *)addr; + ifcr = (struct if_clonereq *)data; + + memcpy(ifcr, &mifcr->mifcr_ifcr, sizeof(*ifcr)); + + count = (ifcr->ifcr_count < ifcr->ifcr_total) ? 
+ ifcr->ifcr_count : ifcr->ifcr_total; + if (ifcr->ifcr_buffer != NULL && count > 0) + memcpy(ifcr->ifcr_buffer, mifcr->mifcr_buffer, + count * IFNAMSIZ); + + break; + + default: + assert(0); + } +} + +/* + * Convert a BPF (Berkeley Packet Filter) related IOCTL with pointers to a flat + * format suitable for MINIX3. Return a pointer to the new data on success, or + * zero (with errno set) on failure. The original request code is given in + * 'request' and must be replaced by the new request code to be used. + */ +static vir_bytes +ioctl_convert_bpf_to_minix(void * data, unsigned long * request) +{ + struct minix_bpf_program *mbf; + struct bpf_program *bf; + struct minix_bpf_dltlist *mbfl; + struct bpf_dltlist *bfl; + + switch (*request) { + case BIOCSETF: + bf = (struct bpf_program *)data; + + if (bf->bf_len > __arraycount(mbf->mbf_insns)) { + errno = EINVAL; + return 0; + } + + mbf = (struct minix_bpf_program *)malloc(sizeof(*mbf)); + if (mbf != NULL) { + mbf->mbf_len = bf->bf_len; + memcpy(mbf->mbf_insns, bf->bf_insns, + bf->bf_len * sizeof(mbf->mbf_insns[0])); + + *request = MINIX_BIOCSETF; + } else + errno = ENOMEM; + + return (vir_bytes)mbf; + + case BIOCGDLTLIST: + bfl = (struct bpf_dltlist *)data; + + mbfl = (struct minix_bpf_dltlist *)malloc(sizeof(*mbfl)); + if (mbfl != NULL) { + /* + * The length may exceed MINIX_BPF_MAXDLT, and should + * be truncated as needed by the BPF implementation. + */ + memcpy(&mbfl->mbfl_dltlist, bfl, sizeof(*bfl)); + + *request = MINIX_BIOCGDLTLIST; + } else + errno = ENOMEM; + + return (vir_bytes)mbfl; + + default: + assert(0); + + errno = ENOTTY; + return 0; + } +} + +/* + * Convert a the result of BPF (Berkeley Packet Filter) related IOCTL with + * pointers from the flat format used to make the call to MINIX3. Called on + * success only. The given request code is that of the (NetBSD-type) original. + */ +static void +ioctl_convert_bpf_from_minix(vir_bytes addr, void * data, + unsigned long request) +{ + struct minix_bpf_dltlist *mbfl; + struct bpf_dltlist *bfl; + + switch (request) { + case BIOCGDLTLIST: + mbfl = (struct minix_bpf_dltlist *)addr; + bfl = (struct bpf_dltlist *)data; + + memcpy(bfl, &mbfl->mbfl_dltlist, sizeof(*bfl)); + + if (bfl->bfl_list != NULL && bfl->bfl_len > 0) + memcpy(bfl->bfl_list, mbfl->mbfl_list, + bfl->bfl_len * sizeof(bfl->bfl_list[0])); + + break; + + default: + assert(0); + } +} + /* * Library implementation of FIOCLEX and FIONCLEX. */ @@ -110,6 +307,7 @@ ioctl_to_fcntl(int fd, unsigned long request, void * data) int ioctl(int fd, unsigned long request, ...) { + minix_i2c_ioctl_exec_t i2c; int r, request_save; message m; vir_bytes addr; @@ -124,8 +322,6 @@ int ioctl(int fd, unsigned long request, ...) * To support compatibility with interfaces on other systems, certain * requests are re-written to flat structures (i.e. without pointers). */ - minix_i2c_ioctl_exec_t i2c; - request_save = request; switch (request) { @@ -142,6 +338,19 @@ int ioctl(int fd, unsigned long request, ...) addr = (vir_bytes) &i2c; request = MINIX_I2C_IOCTL_EXEC; break; + + case SIOCGIFMEDIA: + case SIOCIFGCLONERS: + if ((addr = ioctl_convert_if_to_minix(data, &request)) == 0) + return -1; /* errno has already been set */ + break; + + case BIOCSETF: + case BIOCGDLTLIST: + if ((addr = ioctl_convert_bpf_to_minix(data, &request)) == 0) + return -1; /* errno has already been set */ + break; + default: /* Keep original as-is */ addr = (vir_bytes)data; @@ -155,11 +364,30 @@ int ioctl(int fd, unsigned long request, ...) 
r = _syscall(VFS_PROC_NR, VFS_IOCTL, &m); - /* Translate back to original form */ + /* + * Translate back to original form. Do this on failure as well, as + * temporarily allocated resources may have to be freed up again. + */ switch (request_save) { case I2C_IOCTL_EXEC: rewrite_i2c_minix_to_netbsd(data, &i2c); break; + + case SIOCGIFMEDIA: + case SIOCIFGCLONERS: + if (r == 0) + ioctl_convert_if_from_minix(addr, data, request_save); + free((void *)addr); + break; + + case BIOCGDLTLIST: + if (r == 0) + ioctl_convert_bpf_from_minix(addr, data, request_save); + /* FALLTHROUGH */ + case BIOCSETF: + free((void *)addr); + break; + default: /* Nothing to do */ break; diff --git a/minix/net/Makefile b/minix/net/Makefile index 374b5e974..9de770739 100644 --- a/minix/net/Makefile +++ b/minix/net/Makefile @@ -1,6 +1,7 @@ .include .if ${MKIMAGEONLY} == "no" +SUBDIR+= lwip SUBDIR+= uds .endif # ${MKIMAGEONLY} == "no" diff --git a/minix/net/lwip/Makefile b/minix/net/lwip/Makefile new file mode 100644 index 000000000..a463dcfcd --- /dev/null +++ b/minix/net/lwip/Makefile @@ -0,0 +1,34 @@ +# Makefile for the lwIP TCP/IP socket driver service (LWIP) + +.include + +PROG= lwip +SRCS= lwip.c mempool.c pchain.c addr.c addrpol.c tcpisn.c mcast.c ipsock.c \ + pktsock.c tcpsock.c udpsock.c rawsock.c ifdev.c ifaddr.c loopif.c \ + ethif.c ndev.c rttree.c route.c rtsock.c lnksock.c lldata.c mibtree.c \ + ifconf.c bpfdev.c bpf_filter.c util.c + +FILES=${PROG}.conf +FILESNAME=${PROG} +FILESDIR= /etc/system.conf.d + +CPPFLAGS+= -I${NETBSDSRCDIR}/minix/lib/liblwip/dist/src/include +CPPFLAGS+= -I${NETBSDSRCDIR}/minix/lib/liblwip/lib + +# Disabling USE_INET6 only superficially hides IPv6 support in the service. +.if (${USE_INET6} != "no") +CPPFLAGS+= -DINET6 +.endif + +# Some warnings are the result of usage of lwIP macros. We must not generate +# errors for those, but even producing the warnings is not helpful, so we +# disable them altogether. +CPPFLAGS+= -Wno-address + +DPADD+= ${LIBLWIP} ${LIBSOCKEVENT} ${LIBSOCKDRIVER} ${LIBCHARDRIVER} \ + ${LIBSYS} ${LIBTIMERS} +LDADD+= -llwip -lsockevent -lsockdriver -lchardriver -lsys -ltimers + +WARNS?= 5 + +.include diff --git a/minix/net/lwip/addr.c b/minix/net/lwip/addr.c new file mode 100644 index 000000000..845092124 --- /dev/null +++ b/minix/net/lwip/addr.c @@ -0,0 +1,692 @@ +/* LWIP service - addr.c - socket address verification and conversion */ + +#include "lwip.h" + +/* + * Return TRUE if the given socket address is of type AF_UNSPEC, or FALSE + * otherwise. + */ +int +addr_is_unspec(const struct sockaddr * addr, socklen_t addr_len) +{ + + return (addr_len >= offsetof(struct sockaddr, sa_data) && + addr->sa_family == AF_UNSPEC); +} + +/* + * Check whether the given multicast address is generally valid. This check + * should not be moved into addr_get_inet(), as we do not want to forbid + * creating routes for such addresses, for example. We do however apply the + * restrictions here to all provided source and destination addresses. Return + * TRUE if the address is an acceptable multicast address, or FALSE otherwise. + */ +int +addr_is_valid_multicast(const ip_addr_t * ipaddr) +{ + uint8_t scope; + + assert(ip_addr_ismulticast(ipaddr)); + + /* We apply restrictions to IPv6 multicast addresses only. 
*/ + if (IP_IS_V6(ipaddr)) { + scope = ip6_addr_multicast_scope(ip_2_ip6(ipaddr)); + + if (scope == IP6_MULTICAST_SCOPE_RESERVED0 || + scope == IP6_MULTICAST_SCOPE_RESERVEDF) + return FALSE; + + /* + * We do not impose restrictions on the three defined embedded + * flags, even though we put no effort into supporting them, + * especially in terms of automatically creating routes for + * all cases. We do force the fourth flag to be zero. + * Unfortunately there is no lwIP macro to check for this flag. + */ + if (ip_2_ip6(ipaddr)->addr[0] & PP_HTONL(0x00800000UL)) + return FALSE; + + /* Prevent KAME-embedded zone IDs from entering the system. */ + if (ip6_addr_has_scope(ip_2_ip6(ipaddr), IP6_UNKNOWN) && + (ip_2_ip6(ipaddr)->addr[0] & PP_HTONL(0x0000ffffUL))) + return FALSE; + } + + return TRUE; +} + +/* + * Load a sockaddr structure, as copied from userland, as a lwIP-style IP + * address and (optionally) a port number. The expected type of IP address is + * given as 'type', which must be one of IPADDR_TYPE_{V4,ANY,V6}. If it is + * IPADDR_TYPE_V4, 'addr' is expected to point to a sockaddr_in structure. If + * it is IPADDR_TYPE_{ANY,V6}, 'addr' is expected to point to a sockaddr_in6 + * structure. For the _ANY case, the result will be an _ANY address only if it + * is the unspecified (all-zeroes) address and a _V6 address in all other + * cases. For the _V6 case, the result will always be a _V6 address. The + * length of the structure pointed to by 'addr' is given as 'addr_len'. If the + * boolean 'kame' flag is set, addresses will be interpreted to be KAME style, + * meaning that for scoped IPv6 addresses, the zone is embedded in the address + * rather than given in sin6_scope_id. On success, store the resulting IP + * address in 'ipaddr'. If 'port' is not NULL, store the port number in it; + * otherwise, ignore the port number. On any parsing failure, return an + * appropriate negative error code. + */ +int +addr_get_inet(const struct sockaddr * addr, socklen_t addr_len, uint8_t type, + ip_addr_t * ipaddr, int kame, uint16_t * port) +{ + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + ip6_addr_t *ip6addr; + uint32_t ifindex; + + switch (type) { + case IPADDR_TYPE_V4: + if (addr_len != sizeof(sin)) + return EINVAL; + + /* + * Getting around strict aliasing problems. Oh, the irony of + * doing an extra memcpy so that the compiler can do a better + * job at optimizing.. + */ + memcpy(&sin, addr, sizeof(sin)); + + if (sin.sin_family != AF_INET) + return EAFNOSUPPORT; + + ip_addr_set_ip4_u32(ipaddr, sin.sin_addr.s_addr); + + if (port != NULL) + *port = ntohs(sin.sin_port); + + return OK; + + case IPADDR_TYPE_ANY: + case IPADDR_TYPE_V6: + if (addr_len != sizeof(sin6)) + return EINVAL; + + /* Again, strict aliasing.. */ + memcpy(&sin6, addr, sizeof(sin6)); + + if (sin6.sin6_family != AF_INET6) + return EAFNOSUPPORT; + + memset(ipaddr, 0, sizeof(*ipaddr)); + + /* + * This is a bit ugly, but NetBSD does not expose s6_addr32 and + * s6_addr is a series of bytes, which is a mismatch for lwIP. + * The alternative would be another memcpy.. + */ + ip6addr = ip_2_ip6(ipaddr); + assert(sizeof(ip6addr->addr) == sizeof(sin6.sin6_addr)); + memcpy(ip6addr->addr, &sin6.sin6_addr, sizeof(ip6addr->addr)); + + /* + * If the address may have a scope, extract the zone ID. + * Where the zone ID is depends on the 'kame' parameter: KAME- + * style addresses have it embedded within the address, whereas + * non-KAME addresses use the (misnamed) sin6_scope_id field. 
+ */ + if (ip6_addr_has_scope(ip6addr, IP6_UNKNOWN)) { + if (kame) { + ifindex = + ntohl(ip6addr->addr[0]) & 0x0000ffffUL; + + ip6addr->addr[0] &= PP_HTONL(0xffff0000UL); + } else { + /* + * Reject KAME-style addresses for normal + * socket calls, to save ourselves the trouble + * of mixed address styles elsewhere. + */ + if (ip6addr->addr[0] & PP_HTONL(0x0000ffffUL)) + return EINVAL; + + ifindex = sin6.sin6_scope_id; + } + + /* + * Reject invalid zone IDs. This also enforces that + * no zone IDs wider than eight bits enter the system. + * As a side effect, it is not possible to add routes + * for invalid zones, but that should be no problem. + */ + if (ifindex != 0 && + ifdev_get_by_index(ifindex) == NULL) + return ENXIO; + + ip6_addr_set_zone(ip6addr, ifindex); + } else + ip6_addr_clear_zone(ip6addr); + + /* + * Set the type to ANY if it was ANY and the address itself is + * ANY as well. Otherwise, we are binding to a specific IPv6 + * address, so IPV6_V6ONLY stops being relevant and we should + * leave the address set to V6. Destination addresses for ANY + * are set to V6 elsewhere. + */ + if (type == IPADDR_TYPE_ANY && ip6_addr_isany(ip6addr)) + IP_SET_TYPE(ipaddr, type); + else + IP_SET_TYPE(ipaddr, IPADDR_TYPE_V6); + + if (port != NULL) + *port = ntohs(sin6.sin6_port); + + return OK; + + default: + return EAFNOSUPPORT; + } +} + +/* + * Store an lwIP-style IP address and port number as a sockaddr structure + * (sockaddr_in or sockaddr_in6, depending on the given IP address) to be + * copied to userland. The result is stored in the buffer pointed to by + * 'addr'. Before the call, 'addr_len' must be set to the size of this buffer. + * This is an internal check to prevent buffer overflows, and must not be used + * to validate input, since a mismatch will trigger a panic. After the call, + * 'addr_len' will be set to the size of the resulting structure. The lwIP- + * style address is given as 'ipaddr'. If the boolean 'kame' flag is set, the + * address will be stored KAME-style, meaning that for scoped IPv6 addresses, + * the address zone will be stored embedded in the address rather than in + * sin6_scope_id. If relevant, 'port' contains the port number in host-byte + * order; otherwise it should be set to zero. + */ +void +addr_put_inet(struct sockaddr * addr, socklen_t * addr_len, + const ip_addr_t * ipaddr, int kame, uint16_t port) +{ + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + const ip6_addr_t *ip6addr; + uint32_t zone; + + switch (IP_GET_TYPE(ipaddr)) { + case IPADDR_TYPE_V4: + if (*addr_len < sizeof(sin)) + panic("provided address buffer too small"); + + memset(&sin, 0, sizeof(sin)); + + sin.sin_len = sizeof(sin); + sin.sin_family = AF_INET; + sin.sin_port = htons(port); + sin.sin_addr.s_addr = ip_addr_get_ip4_u32(ipaddr); + + memcpy(addr, &sin, sizeof(sin)); + *addr_len = sizeof(sin); + + break; + + case IPADDR_TYPE_ANY: + case IPADDR_TYPE_V6: + if (*addr_len < sizeof(sin6)) + panic("provided address buffer too small"); + + ip6addr = ip_2_ip6(ipaddr); + + memset(&sin6, 0, sizeof(sin6)); + + sin6.sin6_len = sizeof(sin6); + sin6.sin6_family = AF_INET6; + sin6.sin6_port = htons(port); + memcpy(&sin6.sin6_addr, ip6addr->addr, sizeof(sin6.sin6_addr)); + + /* + * If the IPv6 address has a zone set, it must be scoped, and + * we put the zone in the result. It may occur that a scoped + * IPv6 address does not have a zone here though, for example + * if packet routing fails for sendto() with a zoneless address + * on an unbound socket, resulting in an RTM_MISS message.
In + * such cases, simply leave the zone index blank in the result. + */ + if (ip6_addr_has_zone(ip6addr)) { + assert(ip6_addr_has_scope(ip6addr, IP6_UNKNOWN)); + + zone = ip6_addr_zone(ip6addr); + assert(zone <= UINT8_MAX); + + if (kame) + sin6.sin6_addr.s6_addr[3] = zone; + else + sin6.sin6_scope_id = zone; + } + + memcpy(addr, &sin6, sizeof(sin6)); + *addr_len = sizeof(sin6); + + break; + + default: + panic("unknown IP address type: %u", IP_GET_TYPE(ipaddr)); + } +} + +/* + * Load a link-layer sockaddr structure (sockaddr_dl), as copied from userland, + * and return the contained name and/or hardware address. The address is + * provided as 'addr', with length 'addr_len'. On success, return OK. If + * 'name' is not NULL, it must be of size 'name_max', and will be used to store + * the (null-terminated) interface name in the given structure if present, or + * the empty string if not. If 'hwaddr' is not NULL, it will be used to store + * the hardware address in the given structure, which must in that case be + * present and exactly 'hwaddr_len' bytes long. On any parsing failure, return + * an appropriate negative error code. + */ +int +addr_get_link(const struct sockaddr * addr, socklen_t addr_len, char * name, + size_t name_max, uint8_t * hwaddr, size_t hwaddr_len) +{ + struct sockaddr_dlx sdlx; + size_t nlen, alen; + + if (addr_len < offsetof(struct sockaddr_dlx, sdlx_data)) + return EINVAL; + + /* + * We cannot prevent callers from passing in massively oversized + * sockaddr_dl structure. However, we insist that all the actual data + * be contained within the size of our sockaddr_dlx version. + */ + if (addr_len > sizeof(sdlx)) + addr_len = sizeof(sdlx); + + memcpy(&sdlx, addr, addr_len); + + if (sdlx.sdlx_family != AF_LINK) + return EAFNOSUPPORT; + + /* Address selectors are not currently supported. */ + if (sdlx.sdlx_slen != 0) + return EINVAL; + + nlen = (size_t)sdlx.sdlx_nlen; + alen = (size_t)sdlx.sdlx_alen; + + /* The nlen and alen fields are 8-bit, so no risks of overflow here. */ + if (addr_len < offsetof(struct sockaddr_dlx, sdlx_data) + nlen + alen) + return EINVAL; + + /* + * Copy out the name, truncating it if needed. The name in the + * sockaddr is not null terminated, so we have to do that. If the + * sockaddr has no name, copy out an empty name. + */ + if (name != NULL) { + assert(name_max > 0); + + if (name_max > nlen + 1) + name_max = nlen + 1; + + memcpy(name, sdlx.sdlx_data, name_max - 1); + name[name_max - 1] = '\0'; + } + + /* + * Copy over the hardware address. For simplicity, we require that the + * caller specify the exact hardware address length. + */ + if (hwaddr != NULL) { + if (alen != hwaddr_len) + return EINVAL; + + memcpy(hwaddr, sdlx.sdlx_data + nlen, hwaddr_len); + } + + return OK; +} + +/* + * Store a link-layer sockaddr structure (sockaddr_dl), to be copied to + * userland. The result is stored in the buffer pointed to by 'addr'. Before + * the call, 'addr_len' must be set to the size of this buffer. This is an + * internal check to prevent buffer overflows, and must not be used to validate + * input, since a mismatch will trigger a panic. After the call, 'addr_len' + * will be set to the size of the resulting structure. The given interface + * index 'ifindex' and (IFT_) interface type 'type' will always be stored in + * the resulting structure. If 'name' is not NULL, it must be a null- + * terminated interface name string which will be included in the structure. 
+ * If 'hwaddr' is not NULL, it must be a hardware address of length + * 'hwaddr_len', which will also be included in the structure. + */ +void +addr_put_link(struct sockaddr * addr, socklen_t * addr_len, uint32_t ifindex, + uint32_t type, const char * name, const uint8_t * hwaddr, + size_t hwaddr_len) +{ + struct sockaddr_dlx sdlx; + size_t name_len; + socklen_t len; + + name_len = (name != NULL) ? strlen(name) : 0; + + if (hwaddr == NULL) + hwaddr_len = 0; + + assert(name_len < IFNAMSIZ); + assert(hwaddr_len <= NETIF_MAX_HWADDR_LEN); + + len = offsetof(struct sockaddr_dlx, sdlx_data) + name_len + hwaddr_len; + + if (*addr_len < len) + panic("provided address buffer too small"); + + memset(&sdlx, 0, sizeof(sdlx)); + sdlx.sdlx_len = len; + sdlx.sdlx_family = AF_LINK; + sdlx.sdlx_index = ifindex; + sdlx.sdlx_type = type; + sdlx.sdlx_nlen = name_len; + sdlx.sdlx_alen = hwaddr_len; + if (name_len > 0) + memcpy(sdlx.sdlx_data, name, name_len); + if (hwaddr_len > 0) + memcpy(sdlx.sdlx_data + name_len, hwaddr, hwaddr_len); + + memcpy(addr, &sdlx, len); + *addr_len = len; +} + +/* + * Convert an IPv4 or IPv6 netmask, given as sockaddr structure 'addr', to a + * prefix length. The length of the sockaddr structure is given as 'addr_len'. + * For consistency with addr_get_inet(), the expected address type is given as + * 'type', and must be either IPADDR_TYPE_V4 or IPADDR_TYPE_V6. On success, + * return OK with the number of set prefix bits returned in 'prefix', and + * optionally with a lwIP representation of the netmask stored in 'ipaddr' (if + * not NULL). On failure, return an appropriate negative error code. Note + * that this function does not support compressed IPv4 network masks; such + * addresses must be expanded before a call to this function. + */ +int +addr_get_netmask(const struct sockaddr * addr, socklen_t addr_len, + uint8_t type, unsigned int * prefix, ip_addr_t * ipaddr) +{ + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + unsigned int byte, bit; + uint32_t val; + + switch (type) { + case IPADDR_TYPE_V4: + if (addr_len != sizeof(sin)) + return EINVAL; + + memcpy(&sin, addr, sizeof(sin)); + + if (sin.sin_family != AF_INET) + return EAFNOSUPPORT; + + val = ntohl(sin.sin_addr.s_addr); + + /* Find the first zero bit. */ + for (bit = 0; bit < IP4_BITS; bit++) + if (!(val & (1 << (IP4_BITS - bit - 1)))) + break; + + *prefix = bit; + + /* All bits after the first zero bit must also be zero. */ + if (bit < IP4_BITS && + (val & ((1 << (IP4_BITS - bit - 1)) - 1))) + return EINVAL; + + if (ipaddr != NULL) + ip_addr_set_ip4_u32(ipaddr, sin.sin_addr.s_addr); + + return OK; + + case IPADDR_TYPE_V6: + if (addr_len != sizeof(sin6)) + return EINVAL; + + memcpy(&sin6, addr, sizeof(sin6)); + + if (sin6.sin6_family != AF_INET6) + return EAFNOSUPPORT; + + /* Find the first zero bit. */ + for (byte = 0; byte < __arraycount(sin6.sin6_addr.s6_addr); + byte++) + if (sin6.sin6_addr.s6_addr[byte] != 0xff) + break; + + /* If all bits are set, there is nothing more to do. */ + if (byte == __arraycount(sin6.sin6_addr.s6_addr)) { + *prefix = __arraycount(sin6.sin6_addr.s6_addr) * NBBY; + + return OK; + } + + for (bit = 0; bit < NBBY; bit++) + if (!(sin6.sin6_addr.s6_addr[byte] & + (1 << (NBBY - bit - 1)))) + break; + + *prefix = byte * NBBY + bit; + + /* All bits after the first zero bit must also be zero. 
*/ + if (bit < NBBY && (sin6.sin6_addr.s6_addr[byte] & + ((1 << (NBBY - bit - 1)) - 1))) + return EINVAL; + + for (byte++; byte < __arraycount(sin6.sin6_addr.s6_addr); + byte++) + if (sin6.sin6_addr.s6_addr[byte] != 0) + return EINVAL; + + if (ipaddr != NULL) { + ip_addr_set_zero_ip6(ipaddr); + + memcpy(ip_2_ip6(ipaddr)->addr, &sin6.sin6_addr, + sizeof(ip_2_ip6(ipaddr)->addr)); + } + + return OK; + + default: + panic("unknown IP address type: %u", type); + } +} + +/* + * Generate a raw network mask based on the given prefix length. + */ +void +addr_make_netmask(uint8_t * addr, socklen_t addr_len, unsigned int prefix) +{ + unsigned int byte, bit; + + byte = prefix / NBBY; + bit = prefix % NBBY; + + assert(byte + !!bit <= addr_len); + + if (byte > 0) + memset(addr, 0xff, byte); + if (bit != 0) + addr[byte++] = (uint8_t)(0xff << (NBBY - bit)); + if (byte < addr_len) + memset(&addr[byte], 0, addr_len - byte); +} + +/* + * Store a network mask as a sockaddr structure, in 'addr'. Before the call, + * 'addr_len' must be set to the memory size of 'addr'. The address type is + * given as 'type', and must be either IPADDR_TYPE_V4 or IPADDR_TYPE_V6. The + * prefix length from which to generate the network mask is given as 'prefix'. + * Upon return, 'addr_len' is set to the size of the resulting sockaddr + * structure. + */ +void +addr_put_netmask(struct sockaddr * addr, socklen_t * addr_len, uint8_t type, + unsigned int prefix) +{ + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + + switch (type) { + case IPADDR_TYPE_V4: + if (*addr_len < sizeof(sin)) + panic("provided address buffer too small"); + + assert(prefix <= IP4_BITS); + + memset(&sin, 0, sizeof(sin)); + sin.sin_len = sizeof(sin); + sin.sin_family = AF_INET; + + addr_make_netmask((uint8_t *)&sin.sin_addr.s_addr, + sizeof(sin.sin_addr.s_addr), prefix); + + memcpy(addr, &sin, sizeof(sin)); + *addr_len = sizeof(sin); + + break; + + case IPADDR_TYPE_V6: + if (*addr_len < sizeof(sin6)) + panic("provided address buffer too small"); + + assert(prefix <= IP6_BITS); + + memset(&sin6, 0, sizeof(sin6)); + sin6.sin6_len = sizeof(sin6); + sin6.sin6_family = AF_INET6; + + addr_make_netmask(sin6.sin6_addr.s6_addr, + sizeof(sin6.sin6_addr.s6_addr), prefix); + + memcpy(addr, &sin6, sizeof(sin6)); + *addr_len = sizeof(sin6); + + break; + + default: + panic("unknown IP address type: %u", type); + } +} + +/* + * Normalize the given address in 'src' to the given number of prefix bits, + * setting all other bits to zero. Return the result in 'dst'. 
+ */ +void +addr_normalize(ip_addr_t * dst, const ip_addr_t * src, unsigned int prefix) +{ + unsigned int addr_len, byte, bit; + const uint8_t *srcaddr; + uint8_t type, *dstaddr; + + type = IP_GET_TYPE(src); + + memset(dst, 0, sizeof(*dst)); + IP_SET_TYPE(dst, type); + + switch (type) { + case IPADDR_TYPE_V4: + srcaddr = (const uint8_t *)&ip_2_ip4(src)->addr; + dstaddr = (uint8_t *)&ip_2_ip4(dst)->addr; + addr_len = sizeof(ip_2_ip4(src)->addr); + + break; + + case IPADDR_TYPE_V6: + ip6_addr_set_zone(ip_2_ip6(dst), ip6_addr_zone(ip_2_ip6(src))); + + srcaddr = (const uint8_t *)&ip_2_ip6(src)->addr; + dstaddr = (uint8_t *)&ip_2_ip6(dst)->addr; + addr_len = sizeof(ip_2_ip6(src)->addr); + + break; + + default: + panic("unknown IP address type: %u", type); + } + + byte = prefix / NBBY; + bit = prefix % NBBY; + + assert(byte + !!bit <= addr_len); + + if (byte > 0) + memcpy(dstaddr, srcaddr, byte); + if (bit != 0) { + dstaddr[byte] = + srcaddr[byte] & (uint8_t)(0xff << (NBBY - bit)); + byte++; + } +} + +/* + * Return the number of common bits between the given two addresses, up to the + * given maximum. Thus, return a value between 0 and 'max' inclusive. + */ +unsigned int +addr_get_common_bits(const ip_addr_t * ipaddr1, const ip_addr_t * ipaddr2, + unsigned int max) +{ + unsigned int addr_len, prefix, bit; + const uint8_t *addr1, *addr2; + uint8_t byte; + + switch (IP_GET_TYPE(ipaddr1)) { + case IPADDR_TYPE_V4: + assert(IP_IS_V4(ipaddr2)); + + addr1 = (const uint8_t *)&ip_2_ip4(ipaddr1)->addr; + addr2 = (const uint8_t *)&ip_2_ip4(ipaddr2)->addr; + addr_len = sizeof(ip_2_ip4(ipaddr1)->addr); + + break; + + case IPADDR_TYPE_V6: + assert(IP_IS_V6(ipaddr2)); + + addr1 = (const uint8_t *)&ip_2_ip6(ipaddr1)->addr; + addr2 = (const uint8_t *)&ip_2_ip6(ipaddr2)->addr; + addr_len = sizeof(ip_2_ip6(ipaddr1)->addr); + + break; + + default: + panic("unknown IP address type: %u", IP_GET_TYPE(ipaddr1)); + } + + if (addr_len > max * NBBY) + addr_len = max * NBBY; + + prefix = 0; + + for (prefix = 0; addr_len > 0; addr1++, addr2++, prefix += NBBY) { + if ((byte = (*addr1 ^ *addr2)) != 0) { + /* TODO: see if we want a lookup table for this. */ + for (bit = 0; bit < NBBY; bit++, prefix++) + if (byte & (1 << (NBBY - bit - 1))) + break; + break; + } + } + + if (prefix > max) + prefix = max; + + return prefix; +} + +/* + * Convert the given IPv4 address to an IPv4-mapped IPv6 address. 
+ */ +void +addr_make_v4mapped_v6(ip_addr_t * dst, const ip4_addr_t * src) +{ + + IP_ADDR6(dst, 0, 0, PP_HTONL(0x0000ffffUL), ip4_addr_get_u32(src)); +} diff --git a/minix/net/lwip/addr.h b/minix/net/lwip/addr.h new file mode 100644 index 000000000..0697ab70f --- /dev/null +++ b/minix/net/lwip/addr.h @@ -0,0 +1,33 @@ +#ifndef MINIX_NET_LWIP_ADDR_H +#define MINIX_NET_LWIP_ADDR_H + +int addr_is_unspec(const struct sockaddr * addr, socklen_t addr_len); + +int addr_is_valid_multicast(const ip_addr_t * ipaddr); + +int addr_get_inet(const struct sockaddr * addr, socklen_t addr_len, + uint8_t type, ip_addr_t * ipaddr, int kame, uint16_t * port); +void addr_put_inet(struct sockaddr * addr, socklen_t * addr_len, + const ip_addr_t * ipaddr, int kame, uint16_t port); + +int addr_get_link(const struct sockaddr * addr, socklen_t addr_len, + char * name, size_t name_max, uint8_t * hwaddr, size_t hwaddr_len); +void addr_put_link(struct sockaddr * addr, socklen_t * addr_len, + uint32_t ifindex, uint32_t type, const char * name, + const uint8_t * hwaddr, size_t hwaddr_len); + +int addr_get_netmask(const struct sockaddr * addr, socklen_t addr_len, + uint8_t type, unsigned int * prefix, ip_addr_t * ipaddr); +void addr_make_netmask(uint8_t * addr, socklen_t addr_len, + unsigned int prefix); +void addr_put_netmask(struct sockaddr * addr, socklen_t * addr_len, + uint8_t type, unsigned int prefix); + +void addr_normalize(ip_addr_t * dst, const ip_addr_t * src, + unsigned int prefix); +unsigned int addr_get_common_bits(const ip_addr_t * addr1, + const ip_addr_t * addr2, unsigned int max); + +void addr_make_v4mapped_v6(ip_addr_t * dst, const ip4_addr_t * src); + +#endif /* !MINIX_NET_LWIP_ADDR_H */ diff --git a/minix/net/lwip/addrpol.c b/minix/net/lwip/addrpol.c new file mode 100644 index 000000000..144f02c1c --- /dev/null +++ b/minix/net/lwip/addrpol.c @@ -0,0 +1,143 @@ +/* LWIP service - addrpol.c - address policy table and values */ +/* + * The main purpose of this module is to implement the address policy table + * described in RFC 6724. In general, the policy table is used for two + * purposes: source address selection, which is part of this service, and + * destination address selection, which is implemented in libc. NetBSD 7, the + * version that MINIX 3 is synced against at this moment, does not actually + * implement the libc part yet, though. That will change with NetBSD 8, where + * libc uses sysctl(7) to obtain the kernel's policy table, which itself can be + * changed with the new ip6addrctl(8) utility. Once we resync to NetBSD 8, we + * will also have to support this new functionality, and this module is where + * it would be implemented. Since NetBSD 7 is even lacking the necessary + * definitions, we cannot do that ahead of time, though. Thus, until then, + * this module is rather simple, as it only implements a static policy table + * used for source address selection. No changes beyond this module should be + * necessary, e.g. we are purposely not caching labels for local addresses. + */ + +#include "lwip.h" + +/* + * Address policy table. Currently hardcoded to the default of RFC 6724. + * Sorted by prefix length, so that the first match is always also the longest. 
+ */ +static const struct { + ip_addr_t ipaddr; + unsigned int prefix; + int precedence; + int label; +} addrpol_table[] = { + { IPADDR6_INIT_HOST(0, 0, 0, 1), 128, 50, 0 }, + { IPADDR6_INIT_HOST(0, 0, 0x0000ffffUL, 0), 96, 35, 4 }, + { IPADDR6_INIT_HOST(0, 0, 0, 0), 96, 1, 3 }, + { IPADDR6_INIT_HOST(0x20010000UL, 0, 0, 0), 32, 5, 5 }, + { IPADDR6_INIT_HOST(0x20020000UL, 0, 0, 0), 16, 30, 2 }, + { IPADDR6_INIT_HOST(0x3ffe0000UL, 0, 0, 0), 16, 1, 12 }, + { IPADDR6_INIT_HOST(0xfec00000UL, 0, 0, 0), 10, 1, 11 }, + { IPADDR6_INIT_HOST(0xfc000000UL, 0, 0, 0), 7, 3, 13 }, + { IPADDR6_INIT_HOST(0, 0, 0, 0), 0, 40, 1 } +}; + +/* + * Obtain the label value for the given IP address from the address policy + * table. Currently only IPv6 addresses may be given. This function is linear + * in number of address policy table entries, requiring a relatively expensive + * normalization operation for each entry, so it should not be called lightly. + * Its results should not be cached beyond local contexts either, because the + * policy table itself may be changed from userland (in the future). + * + * TODO: convert IPv4 addresses to IPv4-mapped IPv6 addresses. + * TODO: embed the interface index in link-local addresses. + */ +int +addrpol_get_label(const ip_addr_t * iporig) +{ + ip_addr_t ipaddr; + unsigned int i; + + assert(IP_IS_V6(iporig)); + + /* + * The policy table is sorted by prefix length such that the first + * match is also the one with the longest prefix, and as such the best. + */ + for (i = 0; i < __arraycount(addrpol_table); i++) { + addr_normalize(&ipaddr, iporig, addrpol_table[i].prefix); + + if (ip_addr_cmp(&addrpol_table[i].ipaddr, &ipaddr)) + return addrpol_table[i].label; + } + + /* + * We cannot possibly get here with the default policy table, because + * the last entry will always match. It is not clear what we should + * return if there is no matching entry, though. For now, we return + * the default label value for the default (::/0) entry, which is 1. + */ + return 1; +} + +/* + * Return an opaque positive value (possibly zero) that represents the scope of + * the given IP address. A larger value indicates a wider scope. The 'is_src' + * flag indicates whether the address is a source or a destination address, + * which affects the value returned for unknown addresses. A scope is a direct + * function of only the given address, so the result may be cached on a per- + * address basis without risking invalidation at any point in time. + */ +int +addrpol_get_scope(const ip_addr_t * ipaddr, int is_src) +{ + const ip6_addr_t *ip6addr; + + /* + * For now, all IPv4 addresses are considered global. This function is + * currently called only for IPv6 addresses anyway. + */ + if (IP_IS_V4(ipaddr)) + return IP6_MULTICAST_SCOPE_GLOBAL; + + assert(IP_IS_V6(ipaddr)); + + ip6addr = ip_2_ip6(ipaddr); + + /* + * These are ordered not by ascending scope, but (roughly) by expected + * likeliness to match, for performance reasons. + */ + if (ip6_addr_isglobal(ip6addr)) + return IP6_MULTICAST_SCOPE_GLOBAL; + + if (ip6_addr_islinklocal(ip6addr) || ip6_addr_isloopback(ip6addr)) + return IP6_MULTICAST_SCOPE_LINK_LOCAL; + + /* + * We deliberately deviate from RFC 6724 Sec. 3.1 by considering + * Unique-Local Addresses (ULAs) to be of smaller scope than global + * addresses, to avoid that during source address selection, a + * preferred ULA is picked over a deprecated global address when given + * a global address as destination, as that would likely result in + * broken two-way communication. 
+ */ + if (ip6_addr_isuniquelocal(ip6addr)) + return IP6_MULTICAST_SCOPE_ORGANIZATION_LOCAL; + + if (ip6_addr_ismulticast(ip6addr)) + return ip6_addr_multicast_scope(ip6addr); + + /* Site-local addresses are deprecated. */ + if (ip6_addr_issitelocal(ip6addr)) + return IP6_MULTICAST_SCOPE_SITE_LOCAL; + + /* + * If the address is a source address, give it a scope beyond global to + * make sure that a "real" global address is picked first. If the + * address is a destination address, give it a global scope so as to + * pick "real" global addresses over unknown-scope source addresses. + */ + if (is_src) + return IP6_MULTICAST_SCOPE_RESERVEDF; /* greater than GLOBAL */ + else + return IP6_MULTICAST_SCOPE_GLOBAL; +} diff --git a/minix/net/lwip/bpf_filter.c b/minix/net/lwip/bpf_filter.c new file mode 100644 index 000000000..8c0efca6f --- /dev/null +++ b/minix/net/lwip/bpf_filter.c @@ -0,0 +1,561 @@ +/* LWIP service - bpf_filter.c - Berkeley Packet Filter core implementation */ +/* + * This is basically a drop-in replacement of NetBSD's bpf_filter.c, which + * itself can be compiled for either the NetBSD kernel or for userland. On + * MINIX 3, we would like to perform certain checks that NetBSD implements only + * for its kernel (e.g., memory store access validation) while replacing the + * NetBSD kernel specifics with our own (pbuf instead of mbuf, no BPF contexts + * for now, etc.). As a result, it is easier to reimplement the whole thing, + * because there is not all that much to it. + * + * Support for the standard BSD API allows us to run standard tests against + * this module from userland, where _MINIX_SYSTEM is not defined. MINIX 3 + * specific extensions are enabled only if _MINIX_SYSTEM is defined. + */ +#include +#include +#include +#include + +#ifdef _MINIX_SYSTEM +#include "lwip.h" + +/* + * Obtain an unsigned 32-bit value in network byte order from the pbuf chain + * 'pbuf' at offset 'k'. The given offset is guaranteed to be within bounds. + */ +static uint32_t +bpf_get32_ext(const struct pbuf * pbuf, uint32_t k) +{ + uint32_t val; + unsigned int i; + + /* + * Find the pbuf that contains the first byte. We expect that most + * filters will operate only on the headers of packets, so that we + * mostly avoid going through this O(n) loop. Since only the superuser + * can open BPF devices at all, we need not be worried about abuse in + * this regard. However, it turns out that this loop is particularly + * CPU-intensive after all, we can probably improve it by caching the + * last visited pbuf, as read locality is likely high. + */ + while (k >= pbuf->len) { + k -= pbuf->len; + pbuf = pbuf->next; + assert(pbuf != NULL); + } + + /* + * We assume that every pbuf has some data, but we make no assumptions + * about any minimum amount of data per pbuf. Therefore, we may have + * to take the bytes from anywhere between one and four pbufs. + * Hopefully the compiler will unroll this loop for us. + */ + val = (uint32_t)(((u_char *)pbuf->payload)[k]) << 24; + + for (i = 0; i < 3; i++) { + if (k >= (uint32_t)pbuf->len - 1) { + k = 0; + pbuf = pbuf->next; + assert(pbuf != NULL); + } else + k++; + val = (val << 8) | (uint32_t)(((u_char *)pbuf->payload)[k]); + } + + return val; +} + +/* + * Obtain an unsigned 16-bit value in network byte order from the pbuf chain + * 'pbuf' at offset 'k'. The given offset is guaranteed to be within bounds. + */ +static uint32_t +bpf_get16_ext(const struct pbuf * pbuf, uint32_t k) +{ + + /* As above. 
*/ + while (k >= pbuf->len) { + k -= pbuf->len; + pbuf = pbuf->next; + assert(pbuf != NULL); + } + + /* + * There are only two possible cases to cover here: either the two + * bytes are in the same pbuf, or they are in subsequent ones. + */ + if (k < (uint32_t)pbuf->len - 1) { + return ((uint32_t)(((u_char *)pbuf->payload)[k]) << 8) | + (uint32_t)(((u_char *)pbuf->next->payload)[k + 1]); + } else { + assert(pbuf->next != NULL); + return ((uint32_t)(((u_char *)pbuf->payload)[k]) << 8) | + (uint32_t)(((u_char *)pbuf->next->payload)[0]); + } +} + +/* + * Obtain an unsigned 8-bit value from the pbuf chain 'pbuf' at offset 'k'. + * The given offset is guaranteed to be within bounds. + */ +static uint32_t +bpf_get8_ext(const struct pbuf * pbuf, uint32_t k) +{ + + /* As above. */ + while (k >= pbuf->len) { + k -= pbuf->len; + pbuf = pbuf->next; + assert(pbuf != NULL); + } + + return (uint32_t)(((u_char *)pbuf->payload)[k]); +} + +#endif /* _MINIX_SYSTEM */ + +/* + * Execute a BPF filter program on (the first part of) a packet, and return the + * maximum size of the packet that should be delivered to the filter owner. + * + * The 'pc' parameter points to an array of BPF instructions that together form + * the filter program to be executed. If 'pc' is NULL, the packet is fully + * accepted. Otherwise, the given program MUST have passed a previous call to + * bpf_validate(). Not doing so will allow for arbitrary memory access. + * + * The 'packet' array contains up to the whole packet. The value of 'total' + * denotes the total length of the packet; 'len' contains the size of the array + * 'packet'. Chunked storage of the packet is not supported at this time. + * + * If executing the program succeeds, the return value is the maximum number of + * bytes from the packet to be delivered. The return value may exceed the full + * packet size. If the number of bytes returned is zero, the packet is to be + * ignored. If the program fails to execute properly and return a value, a + * value of zero is returned as well, thus also indicating that the packet + * should be ignored. This is intentional: it saves filter programs from + * having to perform explicit checks on the packet they are filtering. + */ +u_int +bpf_filter(const struct bpf_insn * pc, const u_char * packet, u_int total, + u_int len) +#ifdef _MINIX_SYSTEM +{ + + return bpf_filter_ext(pc, NULL /*pbuf*/, packet, total, len); +} + +u_int +bpf_filter_ext(const struct bpf_insn * pc, const struct pbuf * pbuf, + const u_char * packet, u_int total, u_int len) +#endif /* _MINIX_SYSTEM */ +{ + uint32_t k, a, x, mem[BPF_MEMWORDS]; + + /* An empty program accepts all packets. */ + if (pc == NULL) + return UINT_MAX; + + /* + * We need not clear 'mem': the checker guarantees that each memory + * store word is always written before it is read. + */ + a = 0; + x = 0; + + /* Execute the program. */ + for (;; pc++) { + k = pc->k; + + switch (pc->code) { + case BPF_LD+BPF_W+BPF_IND: /* A <- P[X+k:4] */ + if (k + x < k) + return 0; + k += x; + /* FALLTHROUGH */ + case BPF_LD+BPF_W+BPF_ABS: /* A <- P[k:4] */ + /* + * 'k' may have any value, so check bounds in such a + * way that 'k' cannot possibly overflow and wrap. 
+ */ + if (len >= 3 && k < len - 3) + a = ((uint32_t)packet[k] << 24) | + ((uint32_t)packet[k + 1] << 16) | + ((uint32_t)packet[k + 2] << 8) | + (uint32_t)packet[k + 3]; +#ifdef _MINIX_SYSTEM + else if (total >= 3 && k < total - 3) + a = bpf_get32_ext(pbuf, k); +#endif /* _MINIX_SYSTEM */ + else + return 0; + break; + case BPF_LD+BPF_H+BPF_IND: /* A <- P[X+k:2] */ + if (k + x < k) + return 0; + k += x; + /* FALLTHROUGH */ + case BPF_LD+BPF_H+BPF_ABS: /* A <- P[k:2] */ + /* As above. */ + if (len >= 1 && k < len - 1) + a = ((uint32_t)packet[k] << 8) | + (uint32_t)packet[k + 1]; +#ifdef _MINIX_SYSTEM + else if (total >= 1 && k < total - 1) + a = bpf_get16_ext(pbuf, k); +#endif /* _MINIX_SYSTEM */ + else + return 0; + break; + case BPF_LD+BPF_B+BPF_IND: /* A <- P[X+k:1] */ + if (k + x < k) + return 0; + k += x; + /* FALLTHROUGH */ + case BPF_LD+BPF_B+BPF_ABS: /* A <- P[k:1] */ + if (k < len) + a = (uint32_t)packet[k]; +#ifdef _MINIX_SYSTEM + else if (k < total) + a = bpf_get8_ext(pbuf, k); +#endif /* _MINIX_SYSTEM */ + else + return 0; + break; + case BPF_LD+BPF_W+BPF_LEN: /* A <- len */ + a = total; + break; + case BPF_LD+BPF_IMM: /* A <- k */ + a = k; + break; + case BPF_LD+BPF_MEM: /* A <- M[k] */ + a = mem[k]; + break; + + case BPF_LDX+BPF_IMM: /* X <- k */ + x = k; + break; + case BPF_LDX+BPF_MEM: /* X <- M[k] */ + x = mem[k]; + break; + case BPF_LDX+BPF_LEN: /* X <- len */ + x = total; + break; + case BPF_LDX+BPF_B+BPF_MSH: /* X <- 4*(P[k:1]&0xf) */ + if (k < len) + x = ((uint32_t)packet[k] & 0xf) << 2; +#ifdef _MINIX_SYSTEM + else if (k < total) + x = (bpf_get8_ext(pbuf, k) & 0xf) << 2; +#endif /* _MINIX_SYSTEM */ + else + return 0; + break; + + case BPF_ST: /* M[k] <- A */ + mem[k] = a; + break; + + case BPF_STX: /* M[k] <- X */ + mem[k] = x; + break; + + case BPF_ALU+BPF_ADD+BPF_K: /* A <- A + k */ + a += k; + break; + case BPF_ALU+BPF_SUB+BPF_K: /* A <- A - k */ + a -= k; + break; + case BPF_ALU+BPF_MUL+BPF_K: /* A <- A * k */ + a *= k; + break; + case BPF_ALU+BPF_DIV+BPF_K: /* A <- A / k */ + a /= k; + break; + case BPF_ALU+BPF_MOD+BPF_K: /* A <- A % k */ + a %= k; + break; + case BPF_ALU+BPF_AND+BPF_K: /* A <- A & k */ + a &= k; + break; + case BPF_ALU+BPF_OR+BPF_K: /* A <- A | k */ + a |= k; + break; + case BPF_ALU+BPF_XOR+BPF_K: /* A <- A ^ k */ + a ^= k; + break; + case BPF_ALU+BPF_LSH+BPF_K: /* A <- A << k */ + a <<= k; + break; + case BPF_ALU+BPF_RSH+BPF_K: /* A <- A >> k */ + a >>= k; + break; + case BPF_ALU+BPF_ADD+BPF_X: /* A <- A + X */ + a += x; + break; + case BPF_ALU+BPF_SUB+BPF_X: /* A <- A - X */ + a -= x; + break; + case BPF_ALU+BPF_MUL+BPF_X: /* A <- A * X */ + a *= x; + break; + case BPF_ALU+BPF_DIV+BPF_X: /* A <- A / X */ + if (x == 0) + return 0; + a /= x; + break; + case BPF_ALU+BPF_MOD+BPF_X: /* A <- A % X */ + if (x == 0) + return 0; + a %= x; + break; + case BPF_ALU+BPF_AND+BPF_X: /* A <- A & X */ + a &= x; + break; + case BPF_ALU+BPF_OR+BPF_X: /* A <- A | X */ + a |= x; + break; + case BPF_ALU+BPF_XOR+BPF_X: /* A <- A ^ X */ + a ^= x; + break; + case BPF_ALU+BPF_LSH+BPF_X: /* A <- A << X */ + if (x >= 32) + return 0; + a <<= x; + break; + case BPF_ALU+BPF_RSH+BPF_X: /* A <- A >> X */ + if (x >= 32) + return 0; + a >>= x; + break; + case BPF_ALU+BPF_NEG: /* A <- -A */ + a = -a; + break; + + case BPF_JMP+BPF_JA: /* pc += k */ + pc += k; + break; + case BPF_JMP+BPF_JGT+BPF_K: /* pc += (A > k) ? jt : jf */ + pc += (a > k) ? pc->jt : pc->jf; + break; + case BPF_JMP+BPF_JGE+BPF_K: /* pc += (A >= k) ? jt : jf */ + pc += (a >= k) ? 
pc->jt : pc->jf; + break; + case BPF_JMP+BPF_JEQ+BPF_K: /* pc += (A == k) ? jt : jf */ + pc += (a == k) ? pc->jt : pc->jf; + break; + case BPF_JMP+BPF_JSET+BPF_K: /* pc += (A & k) ? jt : jf */ + pc += (a & k) ? pc->jt : pc->jf; + break; + case BPF_JMP+BPF_JGT+BPF_X: /* pc += (A > X) ? jt : jf */ + pc += (a > x) ? pc->jt : pc->jf; + break; + case BPF_JMP+BPF_JGE+BPF_X: /* pc += (A >= X) ? jt : jf */ + pc += (a >= x) ? pc->jt : pc->jf; + break; + case BPF_JMP+BPF_JEQ+BPF_X: /* pc += (A == X) ? jt : jf */ + pc += (a == x) ? pc->jt : pc->jf; + break; + case BPF_JMP+BPF_JSET+BPF_X: /* pc += (A & X) ? jt : jf */ + pc += (a & x) ? pc->jt : pc->jf; + break; + + case BPF_RET+BPF_A: /* accept A bytes */ + return a; + case BPF_RET+BPF_K: /* accept K bytes */ + return k; + + case BPF_MISC+BPF_TAX: /* X <- A */ + x = a; + break; + case BPF_MISC+BPF_TXA: /* A <- X */ + a = x; + break; + + default: /* unknown instruction */ + return 0; + } + } + + /* NOTREACHED */ +} + +/* + * In order to avoid having to perform explicit memory allocation, we store + * some validation state on the stack, using data types that are as small as + * possible for the current definitions. The data types, and in fact the whole + * assumption that we can store the state on the stack, may need to be revised + * if certain constants are increased in the future. As of writing, the + * validation routine uses a little over 1KB of stack memory. + */ +#if BPF_MEMWORDS <= 16 /* value as of writing: 16 */ +typedef uint16_t meminv_t; +#else +#error "increased BPF_MEMWORDS may require code revision" +#endif + +#if BPF_MAXINSNS > 2048 /* value as of writing: 512 */ +#error "increased BPF_MAXINSNS may require code revision" +#endif + +/* + * Verify that the given filter program is safe to execute, by performing as + * many static validity checks as possible. The program is given as 'insns', + * which must be an array of 'ninsns' BPF instructions. Unlike bpf_filter(), + * this function does not accept empty filter programs. The function returns 1 + * if the program was successfully validated, or 0 if the program should not be + * accepted. + */ +int +bpf_validate(const struct bpf_insn * insns, int ninsns) +{ + bitchunk_t reachable[BITMAP_CHUNKS(BPF_MAXINSNS)]; + meminv_t invalid, meminv[BPF_MAXINSNS]; + const struct bpf_insn *insn; + u_int pc, count, target; + int advance; + + if (insns == NULL || ninsns <= 0 || ninsns > BPF_MAXINSNS) + return 0; + count = (u_int)ninsns; + + memset(reachable, 0, sizeof(reachable[0]) * BITMAP_CHUNKS(count)); + memset(meminv, 0, sizeof(meminv[0]) * count); + + SET_BIT(reachable, 0); + meminv[0] = (meminv_t)~0; + + for (pc = 0; pc < count; pc++) { + /* We completely ignore instructions that are not reachable. 
*/ + if (!GET_BIT(reachable, pc)) + continue; + + invalid = meminv[pc]; + advance = 1; + + insn = &insns[pc]; + + switch (insn->code) { + case BPF_LD+BPF_W+BPF_ABS: + case BPF_LD+BPF_H+BPF_ABS: + case BPF_LD+BPF_B+BPF_ABS: + case BPF_LD+BPF_W+BPF_IND: + case BPF_LD+BPF_H+BPF_IND: + case BPF_LD+BPF_B+BPF_IND: + case BPF_LD+BPF_LEN: + case BPF_LD+BPF_IMM: + case BPF_LDX+BPF_IMM: + case BPF_LDX+BPF_LEN: + case BPF_LDX+BPF_B+BPF_MSH: + case BPF_ALU+BPF_ADD+BPF_K: + case BPF_ALU+BPF_SUB+BPF_K: + case BPF_ALU+BPF_MUL+BPF_K: + case BPF_ALU+BPF_AND+BPF_K: + case BPF_ALU+BPF_OR+BPF_K: + case BPF_ALU+BPF_XOR+BPF_K: + case BPF_ALU+BPF_ADD+BPF_X: + case BPF_ALU+BPF_SUB+BPF_X: + case BPF_ALU+BPF_MUL+BPF_X: + case BPF_ALU+BPF_DIV+BPF_X: + case BPF_ALU+BPF_MOD+BPF_X: + case BPF_ALU+BPF_AND+BPF_X: + case BPF_ALU+BPF_OR+BPF_X: + case BPF_ALU+BPF_XOR+BPF_X: + case BPF_ALU+BPF_LSH+BPF_X: + case BPF_ALU+BPF_RSH+BPF_X: + case BPF_ALU+BPF_NEG: + case BPF_MISC+BPF_TAX: + case BPF_MISC+BPF_TXA: + /* Nothing we can check for these. */ + break; + case BPF_ALU+BPF_DIV+BPF_K: + case BPF_ALU+BPF_MOD+BPF_K: + /* No division by zero. */ + if (insn->k == 0) + return 0; + break; + case BPF_ALU+BPF_LSH+BPF_K: + case BPF_ALU+BPF_RSH+BPF_K: + /* Do not invoke undefined behavior. */ + if (insn->k >= 32) + return 0; + break; + case BPF_LD+BPF_MEM: + case BPF_LDX+BPF_MEM: + /* + * Only allow loading words that have been stored in + * all execution paths leading up to this instruction. + */ + if (insn->k >= BPF_MEMWORDS || + (invalid & (1 << insn->k))) + return 0; + break; + case BPF_ST: + case BPF_STX: + if (insn->k >= BPF_MEMWORDS) + return 0; + invalid &= ~(1 << insn->k); + break; + case BPF_JMP+BPF_JA: + /* + * Make sure that the target instruction of the jump is + * still part of the program, and mark it as reachable. + */ + if (insn->k >= count - pc - 1) + return 0; + target = pc + insn->k + 1; + SET_BIT(reachable, target); + meminv[target] |= invalid; + advance = 0; + break; + case BPF_JMP+BPF_JGT+BPF_K: + case BPF_JMP+BPF_JGE+BPF_K: + case BPF_JMP+BPF_JEQ+BPF_K: + case BPF_JMP+BPF_JSET+BPF_K: + case BPF_JMP+BPF_JGT+BPF_X: + case BPF_JMP+BPF_JGE+BPF_X: + case BPF_JMP+BPF_JEQ+BPF_X: + case BPF_JMP+BPF_JSET+BPF_X: + /* + * Make sure that both target instructions are still + * part of the program, and mark both as reachable. + * There is no chance that the additions will overflow. + */ + target = pc + insn->jt + 1; + if (target >= count) + return 0; + SET_BIT(reachable, target); + meminv[target] |= invalid; + + target = pc + insn->jf + 1; + if (target >= count) + return 0; + SET_BIT(reachable, target); + meminv[target] |= invalid; + + advance = 0; + break; + case BPF_RET+BPF_A: + case BPF_RET+BPF_K: + advance = 0; + break; + default: + return 0; + } + + /* + * After most instructions, we simply advance to the next. For + * one thing, this means that there must be a next instruction + * at all. + */ + if (advance) { + if (pc + 1 == count) + return 0; + SET_BIT(reachable, pc + 1); + meminv[pc + 1] |= invalid; + } + } + + /* The program has passed all our basic tests. */ + return 1; +} diff --git a/minix/net/lwip/bpfdev.c b/minix/net/lwip/bpfdev.c new file mode 100644 index 000000000..3e12c8dac --- /dev/null +++ b/minix/net/lwip/bpfdev.c @@ -0,0 +1,1365 @@ +/* LWIP service - bpfdev.c - Berkeley Packet Filter (/dev/bpf) interface */ +/* + * BPF is a cloning device: opening /dev/bpf returns a new BPF device which is + * independent from any other opened BPF devices. 
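As an aside on the bpf_filter()/bpf_validate() pair completed above: the following standalone sketch shows how a caller might use them on a contiguous frame. It is illustrative only and not part of the patch; the sample program (accept IPv4 frames in full, ignore everything else) is made up, BPF_STMT/BPF_JUMP are the standard <net/bpf.h> helper macros, and the two prototypes are repeated locally in case the header does not expose them in this context.

    #include <sys/types.h>
    #include <net/bpf.h>

    /* Prototypes as defined in the file above. */
    u_int bpf_filter(const struct bpf_insn *, const u_char *, u_int, u_int);
    int bpf_validate(const struct bpf_insn *, int);

    /*
     * Accept the whole packet if the ethernet type field (two bytes at
     * offset 12) equals 0x0800 (IPv4); otherwise ignore the packet.
     */
    static const struct bpf_insn ipv4_prog[] = {
        BPF_STMT(BPF_LD+BPF_H+BPF_ABS, 12),              /* A <- P[12:2] */
        BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, 0x0800, 0, 1),   /* IPv4? */
        BPF_STMT(BPF_RET+BPF_K, (u_int)-1),              /* accept all bytes */
        BPF_STMT(BPF_RET+BPF_K, 0),                      /* ignore packet */
    };

    /*
     * Validate the program once, then run it on a contiguous frame.  The
     * nonzero return value is the number of bytes to deliver; zero means
     * that the packet is to be ignored.
     */
    static u_int
    filter_demo(const u_char * frame, u_int framelen)
    {

        if (!bpf_validate(ipv4_prog,
            (int)(sizeof(ipv4_prog) / sizeof(ipv4_prog[0]))))
            return 0;    /* the program would be rejected */

        return bpf_filter(ipv4_prog, frame, framelen, framelen);
    }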
We assume that each BPF
+ * device is used by one single user process, and this implementation therefore
+ * does not support multiple concurrent device calls on the same BPF device.
+ *
+ * Packet buffering basically follows the BSD model: each BPF device that is
+ * configured (that is, it has been attached to an interface) has two buffers,
+ * each of the configured size: a store buffer, where new packets are stored,
+ * and a hold buffer, which is typically full and awaiting retrieval through a
+ * read call from userland.  The buffers are swapped ("rotated") when the store
+ * buffer is filled up and the hold buffer is empty - if the hold buffer is not
+ * empty either, additional packets are dropped.
+ *
+ * These buffers are allocated when the BPF device is attached to an interface.
+ * The interface may later disappear, in which case the BPF device is detached
+ * from it, allowing any final packets to be read before read requests start
+ * returning I/O errors.  The buffers are freed only when the device is closed.
+ */
+
+#include "lwip.h"
+#include "bpfdev.h"
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/*
+ * Make sure that our implementation matches the BPF version in the NetBSD
+ * headers.  If they change the version number, we may have to make changes
+ * here accordingly.
+ */
+#if BPF_MAJOR_VERSION != 1 || BPF_MINOR_VERSION != 1
+#error "NetBSD BPF version has changed"
+#endif
+
+/* The number of BPF devices. */
+#define NR_BPFDEV		16
+
+/* BPF receive buffer size: allowed range and default. */
+#define BPF_BUF_MIN		BPF_WORDALIGN(sizeof(struct bpf_hdr))
+#define BPF_BUF_DEF		32768
+#define BPF_BUF_MAX		262144
+
+/*
+ * By opening /dev/bpf, one will obtain a cloned device with a different minor
+ * number, which maps to one of the BPF devices.
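For orientation, here is a rough userland sketch of the usage model described above, with minimal error handling: open the cloning device, bind it to an interface with BIOCSETIF, and read one full buffer at a time. The interface name is made up for illustration, and the sketch is not part of the patch.

    #include <sys/ioctl.h>
    #include <sys/types.h>
    #include <net/bpf.h>
    #include <net/if.h>
    #include <fcntl.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    int
    main(void)
    {
        struct ifreq ifr;
        u_int bufsize;
        char *buf;
        ssize_t len;
        int fd;

        /* Opening /dev/bpf yields an independent, cloned BPF device. */
        if ((fd = open("/dev/bpf", O_RDWR)) < 0)
            return 1;

        /* Attach the device to a network interface (name is made up). */
        memset(&ifr, 0, sizeof(ifr));
        strlcpy(ifr.ifr_name, "lo0", sizeof(ifr.ifr_name));
        if (ioctl(fd, BIOCSETIF, &ifr) < 0)
            return 1;

        /* Reads must be for the full buffer size; obtain it first. */
        if (ioctl(fd, BIOCGBLEN, &bufsize) < 0)
            return 1;
        if ((buf = malloc(bufsize)) == NULL)
            return 1;

        /* Each successful read returns a buffer of captured packets. */
        while ((len = read(fd, buf, bufsize)) > 0) {
            /* ... walk the bpf_hdr records in buf[0..len) ... */
        }

        free(buf);
        close(fd);
        return 0;
    }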
+ */ +#define BPFDEV_MINOR 0 /* minor number of /dev/bpf */ +#define BPFDEV_BASE_MINOR 1 /* base minor number for BPF devices */ + +static struct bpfdev { + struct bpfdev_link bpf_link; /* structure link, MUST be first */ + TAILQ_ENTRY(bpfdev) bpf_next; /* next on free or interface list */ + struct ifdev *bpf_ifdev; /* associated interface, or NULL */ + unsigned int bpf_flags; /* flags (BPFF_) */ + size_t bpf_size; /* size of packet buffers */ + char *bpf_sbuf; /* store buffer (mmap'd, or NULL) */ + char *bpf_hbuf; /* hold buffer (mmap'd, or NULL) */ + size_t bpf_slen; /* used part of store buffer */ + size_t bpf_hlen; /* used part of hold buffer */ + struct bpf_insn *bpf_filter; /* verified BPF filter, or NULL */ + size_t bpf_filterlen; /* length of filter, for munmap */ + pid_t bpf_pid; /* process ID of last using process */ + clock_t bpf_timeout; /* timeout for read calls (0 = none) */ + struct { /* state for pending read request */ + endpoint_t br_endpt; /* reading endpoint, or NONE */ + cp_grant_id_t br_grant; /* grant for reader's buffer */ + cdev_id_t br_id; /* read request identifier */ + minix_timer_t br_timer; /* timer for read timeout */ + } bpf_read; + struct { /* state for pending select request */ + endpoint_t bs_endpt; /* selecting endpoint, or NONE */ + unsigned int bs_selops; /* pending select operations */ + } bpf_select; + struct { /* packet capture statistics */ + uint64_t bs_recv; /* # of packets run through filter */ + uint64_t bs_drop; /* # of packets dropped: buffer full */ + uint64_t bs_capt; /* # of packets accepted by filter */ + } bpf_stat; +} bpf_array[NR_BPFDEV]; + +#define BPFF_IN_USE 0x01 /* this BPF device object is in use */ +#define BPFF_PROMISC 0x02 /* promiscuous mode enabled */ +#define BPFF_IMMEDIATE 0x04 /* immediate mode is enabled */ +#define BPFF_SEESENT 0x08 /* also process host-sent packets */ +#define BPFF_HDRCMPLT 0x10 /* do not fill in link-layer source */ +#define BPFF_FEEDBACK 0x20 /* feed back written packet as input */ + +static TAILQ_HEAD(, bpfdev_link) bpfl_freelist; /* list of free BPF devices */ + +static struct bpf_stat bpf_stat; + +static ssize_t bpfdev_peers(struct rmib_call *, struct rmib_node *, + struct rmib_oldp *, struct rmib_newp *); + +/* The CTL_NET NET_BPF subtree. All nodes are dynamically numbered. */ +static struct rmib_node net_bpf_table[] = { + RMIB_INT(RMIB_RO, BPF_BUF_MAX, "maxbufsize", + "Maximum size for data capture buffer"), /* TODO: read-write */ + RMIB_STRUCT(RMIB_RO, sizeof(bpf_stat), &bpf_stat, "stats", + "BPF stats"), + RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0, bpfdev_peers, "peers", + "BPF peers"), +}; + +static struct rmib_node net_bpf_node = + RMIB_NODE(RMIB_RO, net_bpf_table, "bpf", "BPF options"); + +/* + * Initialize the BPF module. + */ +void +bpfdev_init(void) +{ + const int mib[] = { CTL_NET, NET_BPF }; + unsigned int slot; + int r; + + /* Initialize data structures. */ + TAILQ_INIT(&bpfl_freelist); + + for (slot = 0; slot < __arraycount(bpf_array); slot++) { + bpf_array[slot].bpf_flags = 0; + + TAILQ_INSERT_TAIL(&bpfl_freelist, &bpf_array[slot].bpf_link, + bpfl_next); + } + + memset(&bpf_stat, 0, sizeof(bpf_stat)); + + /* Register the "net.bpf" subtree with the MIB service. */ + if ((r = rmib_register(mib, __arraycount(mib), &net_bpf_node)) != OK) + panic("unable to register net.bpf RMIB tree: %d", r); +} + +/* + * Given a BPF device object, return the corresponding minor number. 
+ */ +static devminor_t +bpfdev_get_minor(struct bpfdev * bpfdev) +{ + + assert(bpfdev != NULL); + + return BPFDEV_BASE_MINOR + (devminor_t)(bpfdev - bpf_array); +} + +/* + * Given a minor number, return the corresponding BPF device object, or NULL if + * the minor number does not identify a BPF device. + */ +static struct bpfdev * +bpfdev_get_by_minor(devminor_t minor) +{ + + if (minor < BPFDEV_BASE_MINOR || + (unsigned int)minor >= BPFDEV_BASE_MINOR + __arraycount(bpf_array)) + return NULL; + + return &bpf_array[minor - BPFDEV_BASE_MINOR]; +} + +/* + * Open a BPF device, returning a cloned device instance. + */ +static int +bpfdev_open(devminor_t minor, int access __unused, endpoint_t user_endpt) +{ + struct bpfdev_link *bpfl; + struct bpfdev *bpf; + + /* Disallow opening cloned devices through device nodes. */ + if (minor != BPFDEV_MINOR) + return ENXIO; + + if (TAILQ_EMPTY(&bpfl_freelist)) + return ENOBUFS; + + bpfl = TAILQ_FIRST(&bpfl_freelist); + TAILQ_REMOVE(&bpfl_freelist, bpfl, bpfl_next); + + bpf = (struct bpfdev *)bpfl; + + memset(bpf, 0, sizeof(*bpf)); + + bpf->bpf_flags = BPFF_IN_USE | BPFF_SEESENT; + bpf->bpf_size = BPF_BUF_DEF; + bpf->bpf_pid = getnpid(user_endpt); + bpf->bpf_read.br_endpt = NONE; + bpf->bpf_select.bs_endpt = NONE; + + return CDEV_CLONED | bpfdev_get_minor(bpf); +} + +/* + * Close a BPF device. + */ +static int +bpfdev_close(devminor_t minor) +{ + struct bpfdev *bpf; + + if ((bpf = bpfdev_get_by_minor(minor)) == NULL) + return EINVAL; + + /* + * There cannot possibly be a pending read request, so we never need to + * cancel the read timer from here either. + */ + assert(bpf->bpf_read.br_endpt == NONE); + + if (bpf->bpf_sbuf != NULL) { + assert(bpf->bpf_hbuf != NULL); + + if (munmap(bpf->bpf_sbuf, bpf->bpf_size) != 0) + panic("munmap failed: %d", -errno); + if (munmap(bpf->bpf_hbuf, bpf->bpf_size) != 0) + panic("munmap failed: %d", -errno); + + bpf->bpf_sbuf = NULL; + bpf->bpf_hbuf = NULL; + } else + assert(bpf->bpf_hbuf == NULL); + + if (bpf->bpf_filter != NULL) { + assert(bpf->bpf_filterlen > 0); + + if (munmap(bpf->bpf_filter, bpf->bpf_filterlen) != 0) + panic("munmap failed: %d", -errno); + + bpf->bpf_filter = NULL; + } + + /* + * If the BPF device was attached to an interface, and that interface + * has not disappeared in the meantime, detach from it now. + */ + if (bpf->bpf_ifdev != NULL) { + if (bpf->bpf_flags & BPFF_PROMISC) + ifdev_clear_promisc(bpf->bpf_ifdev); + + ifdev_detach_bpf(bpf->bpf_ifdev, &bpf->bpf_link); + + bpf->bpf_ifdev = NULL; + } + + bpf->bpf_flags = 0; /* mark as no longer in use */ + + TAILQ_INSERT_HEAD(&bpfl_freelist, &bpf->bpf_link, bpfl_next); + + return OK; +} + +/* + * Rotate buffers for the BPF device, by swapping the store buffer and the hold + * buffer. + */ +static void +bpfdev_rotate(struct bpfdev * bpf) +{ + char *buf; + size_t len; + + /* + * When rotating, the store buffer may or may not be empty, but the + * hold buffer must always be empty. + */ + assert(bpf->bpf_hlen == 0); + + buf = bpf->bpf_sbuf; + len = bpf->bpf_slen; + bpf->bpf_sbuf = bpf->bpf_hbuf; + bpf->bpf_slen = bpf->bpf_hlen; + bpf->bpf_hbuf = buf; + bpf->bpf_hlen = len; +} + +/* + * Test whether any of the given select operations are ready on the BPF device, + * and return the set of ready operations. 
+ */ +static unsigned int +bpfdev_test_select(struct bpfdev * bpf, unsigned int ops) +{ + unsigned int ready_ops; + + ready_ops = 0; + + /* + * The BPF device is ready for reading if the hold buffer is not empty + * (i.e.: the store buffer has been filled up completely and was + * therefore rotated) or if immediate mode is set and the store buffer + * is not empty (i.e.: any packet is available at all). In the latter + * case, the buffers will be rotated during the read. We do not + * support applying the read timeout to selects and maintaining state + * between the select and the following read, because despite that + * libpcap claims that it is the right behavior, that is just insane. + */ + if (ops & CDEV_OP_RD) { + if (bpf->bpf_ifdev == NULL) + ready_ops |= CDEV_OP_RD; + else if (bpf->bpf_hlen > 0) + ready_ops |= CDEV_OP_RD; + else if ((bpf->bpf_flags & BPFF_IMMEDIATE) && + bpf->bpf_slen > 0) + ready_ops |= CDEV_OP_RD; + } + + if (ops & CDEV_OP_WR) + ready_ops |= CDEV_OP_WR; + + return ready_ops; +} + +/* + * There has been a state change on the BPF device. If now possible, resume a + * pending select query, if any. + */ +static void +bpfdev_resume_select(struct bpfdev * bpf) +{ + unsigned int ops, ready_ops; + endpoint_t endpt; + + /* First see if there is a pending select request at all. */ + if ((endpt = bpf->bpf_select.bs_endpt) == NONE) + return; + ops = bpf->bpf_select.bs_selops; + + assert(ops != 0); + + /* Then see if any of the pending operations are now ready. */ + if ((ready_ops = bpfdev_test_select(bpf, ops)) == 0) + return; + + /* If so, notify VFS about the ready operations. */ + chardriver_reply_select(bpf->bpf_select.bs_endpt, + bpfdev_get_minor(bpf), ready_ops); + + /* + * Forget about the ready operations. If that leaves no pending + * operations, forget about the select request altogether. + */ + if ((bpf->bpf_select.bs_selops &= ~ready_ops) == 0) + bpf->bpf_select.bs_endpt = NONE; +} + +/* + * There has been a state change on the BPF device. If now possible, resume a + * pending read request, if any. If the call is a result of a timeout, + * 'is_timeout' is set. In that case, the read request must be resumed with an + * EAGAIN error if no packets are available, and the running timer must be + * canceled. Otherwise, the resumption is due to a full buffer or a + * disappeared interface, and 'is_timeout' is not set. In this case, the read + * request must be resumed with an I/O error if no packets are available. + */ +static void +bpfdev_resume_read(struct bpfdev * bpf, int is_timeout) +{ + ssize_t r; + + assert(bpf->bpf_read.br_endpt != NONE); + + /* + * If the hold buffer is still empty, see if the store buffer has + * any packets to copy out. + */ + if (bpf->bpf_hlen == 0) + bpfdev_rotate(bpf); + + /* Return any available packets, or otherwise an error. */ + if (bpf->bpf_hlen > 0) { + assert(bpf->bpf_hlen <= bpf->bpf_size); + + r = sys_safecopyto(bpf->bpf_read.br_endpt, + bpf->bpf_read.br_grant, 0, (vir_bytes)bpf->bpf_hbuf, + bpf->bpf_hlen); + + if (r == OK) { + r = (ssize_t)bpf->bpf_hlen; + + bpf->bpf_hlen = 0; + + assert(bpf->bpf_slen != bpf->bpf_size); + + /* + * Allow readers to get the last packets after the + * interface has disappeared, before getting errors. + */ + if (bpf->bpf_ifdev == NULL) + bpfdev_rotate(bpf); + } + } else + r = (is_timeout) ? EAGAIN : EIO; + + chardriver_reply_task(bpf->bpf_read.br_endpt, bpf->bpf_read.br_id, r); + + bpf->bpf_read.br_endpt = NONE; + + /* Was there still a timer running? Then cancel it now. 
*/ + if (bpf->bpf_timeout > 0 && !is_timeout) + cancel_timer(&bpf->bpf_read.br_timer); +} + +/* + * A read timeout has triggered for the BPF device. Wake up the pending read + * request. + */ +static void +bpfdev_timeout(int arg) +{ + struct bpfdev *bpf; + + assert(arg >= 0 && (unsigned int)arg < __arraycount(bpf_array)); + + bpf = &bpf_array[arg]; + + assert(bpf->bpf_read.br_endpt != NONE); + + bpfdev_resume_read(bpf, TRUE /*is_timeout*/); +} + +/* + * Read from a BPF device. + */ +static ssize_t +bpfdev_read(devminor_t minor, uint64_t position, endpoint_t endpt, + cp_grant_id_t grant, size_t size, int flags, cdev_id_t id) +{ + struct bpfdev *bpf; + ssize_t r; + int suspend; + + if ((bpf = bpfdev_get_by_minor(minor)) == NULL) + return EINVAL; + + /* Allow only one read call at a time. */ + if (bpf->bpf_read.br_endpt != NONE) + return EIO; + + /* Has this BPF device been configured at all yet? */ + if (bpf->bpf_sbuf == NULL) + return EINVAL; + + /* + * Does the read call size match the entire buffer size? This is a + * ridiculous requirement but it makes our job quite a bit easier.. + */ + if (size != bpf->bpf_size) + return EINVAL; + + /* + * Following standard receive semantics, if the interface is gone, + * return all the packets that were pending before returning an error. + * This requires extra buffer rotations after read completion, too. + */ + if (bpf->bpf_ifdev == NULL && bpf->bpf_hlen == 0) + return EIO; + + /* + * If immediate mode is not enabled, we should always suspend the read + * call if the hold buffer is empty. If immediate mode is enabled, we + * should only suspend the read call if both buffers are empty, and + * return data from the hold buffer or otherwise the store buffer, + * whichever is not empty. A non-blocking call behaves as though + * immediate mode is enabled, except it will return EAGAIN instead of + * suspending the read call if both buffers are empty. Thus, we may + * have to rotate buffers for both immediate mode and non-blocking + * calls. The latter is necessary for libpcap to behave correctly. + */ + if ((flags & CDEV_NONBLOCK) || (bpf->bpf_flags & BPFF_IMMEDIATE)) + suspend = (bpf->bpf_hlen == 0 && bpf->bpf_slen == 0); + else + suspend = (bpf->bpf_hlen == 0); + + if (suspend) { + if (flags & CDEV_NONBLOCK) + return EAGAIN; + + /* Suspend the read call for later. */ + bpf->bpf_read.br_endpt = endpt; + bpf->bpf_read.br_grant = grant; + bpf->bpf_read.br_id = id; + + /* Set a timer if requested. */ + if (bpf->bpf_timeout > 0) + set_timer(&bpf->bpf_read.br_timer, bpf->bpf_timeout, + bpfdev_timeout, (int)(bpf - bpf_array)); + + return EDONTREPLY; + } + + /* If we get here, either buffer has data; rotate buffers if needed. */ + if (bpf->bpf_hlen == 0) + bpfdev_rotate(bpf); + assert(bpf->bpf_hlen > 0); + + if ((r = sys_safecopyto(endpt, grant, 0, (vir_bytes)bpf->bpf_hbuf, + bpf->bpf_hlen)) != OK) + return r; + + r = (ssize_t)bpf->bpf_hlen; + + bpf->bpf_hlen = 0; + + /* + * If the store buffer is exactly full, rotate it now. Also, if the + * interface has disappeared, the store buffer will never fill up. + * Rotate it so that the application will get any remaining data before + * getting errors about the interface being gone. + */ + if (bpf->bpf_slen == bpf->bpf_size || bpf->bpf_ifdev == NULL) + bpfdev_rotate(bpf); + + return r; +} + +/* + * Write to a BPF device. 
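Since a single read(2) hands back an entire hold buffer, a consumer has to walk the capture records inside it. Below is a small sketch of that walk, using the standard bpf_hdr layout and BPF_WORDALIGN stepping that the capture code further down also uses; handle_packet() is a hypothetical callback and the sketch is not part of the patch.

    #include <net/bpf.h>
    #include <string.h>

    /* Hypothetical per-packet callback. */
    void handle_packet(const unsigned char *pkt, size_t caplen, size_t datalen);

    /*
     * Walk all capture records in a buffer as returned by one read(2) call
     * on a BPF device.  Each record consists of a bpf_hdr, the captured
     * bytes, and padding up to the next BPF_WORDALIGN boundary.
     */
    static void
    walk_records(const unsigned char *buf, size_t len)
    {
        struct bpf_hdr bh;
        size_t off = 0;

        while (off + sizeof(bh) <= len) {
            memcpy(&bh, &buf[off], sizeof(bh));

            if (off + bh.bh_hdrlen + bh.bh_caplen > len)
                break;    /* malformed record; stop */

            handle_packet(&buf[off + bh.bh_hdrlen], bh.bh_caplen,
                bh.bh_datalen);

            off += BPF_WORDALIGN(bh.bh_hdrlen + bh.bh_caplen);
        }
    }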
+ */ +static ssize_t +bpfdev_write(devminor_t minor, uint64_t position, endpoint_t endpt, + cp_grant_id_t grant, size_t size, int flags, cdev_id_t id) +{ + struct bpfdev *bpf; + struct pbuf *pbuf, *pptr, *pcopy; + size_t off; + err_t err; + int r; + + if ((bpf = bpfdev_get_by_minor(minor)) == NULL) + return EINVAL; + + if (bpf->bpf_ifdev == NULL) + return EINVAL; + + /* VFS skips zero-sized I/O calls right now, but that may change. */ + if (size == 0) + return 0; /* nothing to do */ + + if (size > ifdev_get_hdrlen(bpf->bpf_ifdev) + + ifdev_get_mtu(bpf->bpf_ifdev)) + return EMSGSIZE; + + if ((pbuf = pchain_alloc(PBUF_LINK, size)) == NULL) + return ENOMEM; + + /* TODO: turn this into a series of vector copies. */ + off = 0; + for (pptr = pbuf; pptr != NULL; pptr = pptr->next) { + if ((r = sys_safecopyfrom(endpt, grant, off, + (vir_bytes)pptr->payload, pptr->len)) != OK) { + pbuf_free(pbuf); + + return r; + } + off += pptr->len; + } + assert(off == size); + + /* + * In feedback mode, we cannot use the same packet buffers for both + * output and input, so make a copy. We do this before calling the + * output function, which may change part of the buffers, because the + * BSDs take this approach as well. + */ + if (bpf->bpf_flags & BPFF_FEEDBACK) { + if ((pcopy = pchain_alloc(PBUF_LINK, size)) == NULL) { + pbuf_free(pbuf); + + return ENOMEM; + } + + if (pbuf_copy(pcopy, pbuf) != ERR_OK) + panic("unexpected pbuf copy failure"); + } else + pcopy = NULL; + + /* Pass in the packet as output, and free it again. */ + err = ifdev_output(bpf->bpf_ifdev, pbuf, NULL /*netif*/, + TRUE /*to_bpf*/, !!(bpf->bpf_flags & BPFF_HDRCMPLT)); + + pbuf_free(pbuf); + + /* In feedback mode, pass in the copy as input, if output succeeded. */ + if (err == ERR_OK && (bpf->bpf_flags & BPFF_FEEDBACK)) + ifdev_input(bpf->bpf_ifdev, pcopy, NULL /*netif*/, + FALSE /*to_bpf*/); + else if (pcopy != NULL) + pbuf_free(pcopy); + + return (err == ERR_OK) ? (ssize_t)size : util_convert_err(err); +} + +/* + * Attach a BPF device to a network interface, using the interface name given + * in an ifreq structure. As side effect, allocate hold and store buffers for + * the device. These buffers will stay allocated until the device is closed, + * even though the interface may disappear before that. Return OK if the BPF + * device was successfully attached to the interface, or a negative error code + * otherwise. + */ +static int +bpfdev_attach(struct bpfdev * bpf, struct ifreq * ifr) +{ + struct ifdev *ifdev; + void *sbuf, *hbuf; + + /* Find the interface with the given name. */ + ifr->ifr_name[sizeof(ifr->ifr_name) - 1] = '\0'; + if ((ifdev = ifdev_find_by_name(ifr->ifr_name)) == NULL) + return ENXIO; + + /* + * Allocate a store buffer and a hold buffer. Preallocate the memory, + * or we might get killed later during low-memory conditions. + */ + if ((sbuf = (char *)mmap(NULL, bpf->bpf_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE | MAP_PREALLOC, -1, 0)) == MAP_FAILED) + return ENOMEM; + + if ((hbuf = (char *)mmap(NULL, bpf->bpf_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE | MAP_PREALLOC, -1, 0)) == MAP_FAILED) { + (void)munmap(sbuf, bpf->bpf_size); + + return ENOMEM; + } + + bpf->bpf_ifdev = ifdev; + bpf->bpf_sbuf = sbuf; + bpf->bpf_hbuf = hbuf; + assert(bpf->bpf_slen == 0); + assert(bpf->bpf_hlen == 0); + + ifdev_attach_bpf(ifdev, &bpf->bpf_link); + + return OK; +} + +/* + * Detach the BPF device from its interface, which is about to disappear. 
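The write path just shown also lets userland inject raw frames. A hypothetical helper sketch follows: with the header-complete flag set, the service leaves the caller's source address untouched (see ethif_hdrcmplt() further down); otherwise it fills it in. The helper name and calling convention are made up for illustration.

    #include <sys/ioctl.h>
    #include <sys/types.h>
    #include <net/bpf.h>
    #include <unistd.h>

    /*
     * Inject one link-layer frame on the interface that the BPF device 'fd'
     * has been attached to.  With hdrcmplt nonzero, the caller-provided
     * source address is kept as is; otherwise the service fills it in.
     */
    static int
    inject_frame(int fd, const void *frame, size_t len, int hdrcmplt)
    {
        u_int uval = (hdrcmplt != 0);

        if (ioctl(fd, BIOCSHDRCMPLT, &uval) < 0)
            return -1;

        return (write(fd, frame, len) == (ssize_t)len) ? 0 : -1;
    }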
 */
+void
+bpfdev_detach(struct bpfdev_link * bpfl)
+{
+	struct bpfdev *bpf = (struct bpfdev *)bpfl;
+
+	assert(bpf->bpf_flags & BPFF_IN_USE);
+	assert(bpf->bpf_ifdev != NULL);
+
+	/*
+	 * We deliberately leave the buffers allocated here, for two reasons:
+	 *
+	 * 1) it lets applications read any last packets in the buffers;
+	 * 2) it prevents reattaching the BPF device to another interface.
+	 */
+	bpf->bpf_ifdev = NULL;
+
+	/*
+	 * Resume pending read and select requests, returning any data left,
+	 * or an error if none.
+	 */
+	if (bpf->bpf_hlen == 0)
+		bpfdev_rotate(bpf);
+
+	if (bpf->bpf_read.br_endpt != NONE)
+		bpfdev_resume_read(bpf, FALSE /*is_timeout*/);
+
+	bpfdev_resume_select(bpf);
+}
+
+/*
+ * Flush the given BPF device, resetting its buffer contents and statistics
+ * counters.
+ */
+static void
+bpfdev_flush(struct bpfdev * bpf)
+{
+
+	bpf->bpf_slen = 0;
+	bpf->bpf_hlen = 0;
+
+	bpf->bpf_stat.bs_recv = 0;
+	bpf->bpf_stat.bs_drop = 0;
+	bpf->bpf_stat.bs_capt = 0;
+}
+
+/*
+ * Install a filter program on the BPF device.  A new filter replaces any old
+ * one.  A zero-sized filter simply clears a previous filter.  On success,
+ * perform a flush and return OK.  On failure, return a negative error code
+ * without making any modifications to the current filter.
+ */
+static int
+bpfdev_setfilter(struct bpfdev * bpf, endpoint_t endpt, cp_grant_id_t grant)
+{
+	struct bpf_insn *filter;
+	unsigned int count;
+	size_t len;
+	int r;
+
+	if ((r = sys_safecopyfrom(endpt, grant,
+	    offsetof(struct minix_bpf_program, mbf_len), (vir_bytes)&count,
+	    sizeof(count))) != OK)
+		return r;
+
+	if (count > BPF_MAXINSNS)
+		return EINVAL;
+	len = count * sizeof(struct bpf_insn);
+
+	if (len > 0) {
+		if ((filter = (struct bpf_insn *)mmap(NULL, len,
+		    PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0)) ==
+		    MAP_FAILED)
+			return ENOMEM;
+
+		if ((r = sys_safecopyfrom(endpt, grant,
+		    offsetof(struct minix_bpf_program, mbf_insns),
+		    (vir_bytes)filter, len)) != OK) {
+			(void)munmap(filter, len);
+
+			return r;
+		}
+
+		if (!bpf_validate(filter, count)) {
+			(void)munmap(filter, len);
+
+			return EINVAL;
+		}
+	} else
+		filter = NULL;
+
+	if (bpf->bpf_filter != NULL)
+		(void)munmap(bpf->bpf_filter, bpf->bpf_filterlen);
+
+	bpf->bpf_filter = filter;
+	bpf->bpf_filterlen = len;
+
+	bpfdev_flush(bpf);
+
+	return OK;
+}
+
+/*
+ * Process an I/O control request on the BPF device.
+ */
+static int
+bpfdev_ioctl(devminor_t minor, unsigned long request, endpoint_t endpt,
+	cp_grant_id_t grant, int flags, endpoint_t user_endpt, cdev_id_t id)
+{
+	struct bpfdev *bpf;
+	struct bpf_stat bs;
+	struct bpf_version bv;
+	struct bpf_dltlist bfl;
+	struct timeval tv;
+	struct ifreq ifr;
+	unsigned int uval;
+	int r, val;
+
+	if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
+		return EINVAL;
+
+	/*
+	 * We do not support multiple concurrent requests in this module.  That
+	 * not only means that we forbid a read(2) call on a BPF device object
+	 * while another read(2) is already pending: we also disallow IOCTL
+	 * calls while such a read(2) call is in progress.  This restriction
+	 * should never be a problem for user programs, and allows us to rely
+	 * on the fact that no settings can change between the start and end
+	 * of any read call.  As a side note, pending select(2) queries may be
+	 * similarly affected, and will also not be fully accurate if any
+	 * options are changed while pending.
+ */ + if (bpf->bpf_read.br_endpt != NONE) + return EIO; + + bpf->bpf_pid = getnpid(user_endpt); + + /* These are in order of the NetBSD BIOC.. IOCTL numbers. */ + switch (request) { + case BIOCGBLEN: + uval = bpf->bpf_size; + + return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval, + sizeof(uval)); + + case BIOCSBLEN: + if (bpf->bpf_sbuf != NULL) + return EINVAL; + + if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval, + sizeof(uval))) != OK) + return r; + + if (uval < BPF_BUF_MIN) + uval = BPF_BUF_MIN; + else if (uval > BPF_BUF_MAX) + uval = BPF_BUF_MAX; + + /* Is this the right thing to do? It doesn't matter for us. */ + uval = BPF_WORDALIGN(uval); + + if ((r = sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval, + sizeof(uval))) != OK) + return r; + + bpf->bpf_size = uval; + + return OK; + + case MINIX_BIOCSETF: + return bpfdev_setfilter(bpf, endpt, grant); + + case BIOCPROMISC: + if (bpf->bpf_ifdev == NULL) + return EINVAL; + + if (!(bpf->bpf_flags & BPFF_PROMISC)) { + if (!ifdev_set_promisc(bpf->bpf_ifdev)) + return EINVAL; + + bpf->bpf_flags |= BPFF_PROMISC; + } + + return OK; + + case BIOCFLUSH: + bpfdev_flush(bpf); + + return OK; + + case BIOCGDLT: + if (bpf->bpf_ifdev == NULL) + return EINVAL; + + /* TODO: support for type configuration per BPF device. */ + uval = ifdev_get_dlt(bpf->bpf_ifdev); + + return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval, + sizeof(uval)); + + case BIOCGETIF: + if (bpf->bpf_ifdev == NULL) + return EINVAL; + + memset(&ifr, 0, sizeof(ifr)); + strlcpy(ifr.ifr_name, ifdev_get_name(bpf->bpf_ifdev), + sizeof(ifr.ifr_name)); + + return sys_safecopyto(endpt, grant, 0, (vir_bytes)&ifr, + sizeof(ifr)); + + case BIOCSETIF: + /* + * Test on the presence of a buffer rather than on an interface + * since the latter may disappear and thus be reset to NULL, in + * which case we do not want to allow rebinding to another. + */ + if (bpf->bpf_sbuf != NULL) + return EINVAL; + + if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&ifr, + sizeof(ifr))) != OK) + return r; + + return bpfdev_attach(bpf, &ifr); + + case BIOCGSTATS: + /* + * Why do we not embed a bpf_stat structure directly in the + * BPF device structure? Well, bpf_stat has massive padding.. + */ + memset(&bs, 0, sizeof(bs)); + bs.bs_recv = bpf->bpf_stat.bs_recv; + bs.bs_drop = bpf->bpf_stat.bs_drop; + bs.bs_capt = bpf->bpf_stat.bs_capt; + + return sys_safecopyto(endpt, grant, 0, (vir_bytes)&bs, + sizeof(bs)); + + case BIOCIMMEDIATE: + if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval, + sizeof(uval))) != OK) + return r; + + if (uval) + bpf->bpf_flags |= BPFF_IMMEDIATE; + else + bpf->bpf_flags &= ~BPFF_IMMEDIATE; + + return OK; + + case BIOCVERSION: + memset(&bv, 0, sizeof(bv)); + bv.bv_major = BPF_MAJOR_VERSION; + bv.bv_minor = BPF_MINOR_VERSION; + + return sys_safecopyto(endpt, grant, 0, (vir_bytes)&bv, + sizeof(bv)); + + case BIOCGHDRCMPLT: + uval = !!(bpf->bpf_flags & BPFF_HDRCMPLT); + + return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval, + sizeof(uval)); + + case BIOCSHDRCMPLT: + if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval, + sizeof(uval))) != OK) + return r; + + if (uval) + bpf->bpf_flags |= BPFF_HDRCMPLT; + else + bpf->bpf_flags &= ~BPFF_HDRCMPLT; + + return OK; + + case BIOCSDLT: + if (bpf->bpf_ifdev == NULL) + return EINVAL; + + if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval, + sizeof(uval))) != OK) + return r; + + /* TODO: support for type configuration per BPF device. 
*/ + if (uval != ifdev_get_dlt(bpf->bpf_ifdev)) + return EINVAL; + + return OK; + + case MINIX_BIOCGDLTLIST: + if (bpf->bpf_ifdev == NULL) + return EINVAL; + + if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&bfl, + sizeof(bfl))) != OK) + return r; + + if (bfl.bfl_list != NULL) { + if (bfl.bfl_len < 1) + return ENOMEM; + + /* + * Copy out the 'list', which consists of one entry. + * If we were to produce multiple entries, we would + * have to check against the MINIX_BPF_MAXDLT limit. + */ + uval = ifdev_get_dlt(bpf->bpf_ifdev); + + if ((r = sys_safecopyto(endpt, grant, + offsetof(struct minix_bpf_dltlist, mbfl_list), + (vir_bytes)&uval, sizeof(uval))) != OK) + return r; + } + bfl.bfl_len = 1; + + return sys_safecopyto(endpt, grant, 0, (vir_bytes)&bfl, + sizeof(bfl)); + + case BIOCGSEESENT: + uval = !!(bpf->bpf_flags & BPFF_SEESENT); + + return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval, + sizeof(uval)); + + case BIOCSSEESENT: + if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval, + sizeof(uval))) != OK) + return r; + + if (uval) + bpf->bpf_flags |= BPFF_SEESENT; + else + bpf->bpf_flags &= ~BPFF_SEESENT; + + return OK; + + case BIOCSRTIMEOUT: + if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&tv, + sizeof(tv))) != OK) + return r; + + if ((r = util_timeval_to_ticks(&tv, &bpf->bpf_timeout)) != OK) + return r; + + return OK; + + case BIOCGRTIMEOUT: + util_ticks_to_timeval(bpf->bpf_timeout, &tv); + + return sys_safecopyto(endpt, grant, 0, (vir_bytes)&tv, + sizeof(tv)); + + case BIOCGFEEDBACK: + uval = !!(bpf->bpf_flags & BPFF_FEEDBACK); + + return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval, + sizeof(uval)); + + case BIOCSFEEDBACK: + if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval, + sizeof(uval))) != OK) + return r; + + if (uval) + bpf->bpf_flags |= BPFF_FEEDBACK; + else + bpf->bpf_flags &= ~BPFF_FEEDBACK; + + return OK; + + case FIONREAD: + val = 0; + if (bpf->bpf_hlen > 0) + val = bpf->bpf_hlen; + else if ((bpf->bpf_flags & BPFF_IMMEDIATE) && + bpf->bpf_slen > 0) + val = bpf->bpf_slen; + else + val = 0; + + return sys_safecopyto(endpt, grant, 0, (vir_bytes)&val, + sizeof(val)); + + default: + return ENOTTY; + } +} + +/* + * Cancel a previously suspended request on a BPF device. Since only read + * requests may be suspended (select is handled differently), the cancel + * request must be for a read request. Note that character devices currently + * (still) behave slightly differently from socket devices here: while socket + * drivers are supposed to respond to the original request, character drivers + * must respond to the original request from the cancel callback. + */ +static int +bpfdev_cancel(devminor_t minor, endpoint_t endpt, cdev_id_t id) +{ + struct bpfdev *bpf; + + if ((bpf = bpfdev_get_by_minor(minor)) == NULL) + return EDONTREPLY; + + /* Is this a cancel request for the currently pending read request? */ + if (bpf->bpf_read.br_endpt != endpt || bpf->bpf_read.br_id != id) + return EDONTREPLY; + + /* If so, cancel the read request. */ + if (bpf->bpf_timeout > 0) + cancel_timer(&bpf->bpf_read.br_timer); + + bpf->bpf_read.br_endpt = NONE; + + return EINTR; /* the return value for the canceled read request */ +} + +/* + * Perform a select query on a BPF device. 
+ */ +static int +bpfdev_select(devminor_t minor, unsigned int ops, endpoint_t endpt) +{ + struct bpfdev *bpf; + unsigned int r, notify; + + if ((bpf = bpfdev_get_by_minor(minor)) == NULL) + return EINVAL; + + notify = (ops & CDEV_NOTIFY); + ops &= (CDEV_OP_RD | CDEV_OP_WR | CDEV_OP_ERR); + + r = bpfdev_test_select(bpf, ops); + + /* + * For the operations that were not immediately ready, if requested, + * save the select request for later. + */ + ops &= ~r; + + if (ops != 0 && notify) { + if (bpf->bpf_select.bs_endpt != NONE) { + /* Merge in the operations with any earlier request. */ + if (bpf->bpf_select.bs_endpt != endpt) + return EIO; + bpf->bpf_select.bs_selops |= ops; + } else { + bpf->bpf_select.bs_endpt = endpt; + bpf->bpf_select.bs_selops = ops; + } + } + + return r; +} + +/* + * Process an incoming packet on the interface to which the given BPF device is + * attached. If the packet passes the filter (if any), store as much as + * requested of it in the store buffer, rotating buffers if needed and resuming + * suspended read and select requests as appropriate. This function is also + * called through bpfdev_output() below. + */ +void +bpfdev_input(struct bpfdev_link * bpfl, const struct pbuf * pbuf) +{ + struct bpfdev *bpf = (struct bpfdev *)bpfl; + struct timespec ts; + struct bpf_hdr bh; + const struct pbuf *pptr; + size_t caplen, hdrlen, totlen, off, chunk; + int hfull; + + /* + * Apparently bs_recv is the counter of packets that were run through + * the filter, not the number of packets that were or could be received + * by the user (which is what I got from the manual page.. oh well). + */ + bpf->bpf_stat.bs_recv++; + bpf_stat.bs_recv++; + + /* + * Run the packet through the BPF device's filter to see whether the + * packet should be stored and if so, how much of it. If no filter is + * set, all packets will be stored in their entirety. + */ + caplen = bpf_filter_ext(bpf->bpf_filter, pbuf, (u_char *)pbuf->payload, + pbuf->tot_len, pbuf->len); + + if (caplen == 0) + return; /* no match; ignore packet */ + + if (caplen > pbuf->tot_len) + caplen = pbuf->tot_len; + + /* Truncate packet entries to the full size of the buffers. */ + hdrlen = BPF_WORDALIGN(sizeof(bh)); + totlen = BPF_WORDALIGN(hdrlen + caplen); + + if (totlen > bpf->bpf_size) { + totlen = bpf->bpf_size; + caplen = totlen - hdrlen; + } + assert(totlen >= hdrlen); + + bpf->bpf_stat.bs_capt++; + bpf_stat.bs_capt++; + + assert(bpf->bpf_sbuf != NULL); + if (totlen > bpf->bpf_size - bpf->bpf_slen) { + /* + * If the store buffer is full and the hold buffer is not + * empty, we cannot swap the two buffers, and so we must drop + * the current packet. + */ + if (bpf->bpf_hlen > 0) { + bpf->bpf_stat.bs_drop++; + bpf_stat.bs_drop++; + + return; + } + + /* + * Rotate the buffers: the hold buffer will now be "full" and + * ready to be read - it may not actually be entirely full, but + * we could not fit this packet and we are not going to deliver + * packets out of order.. + */ + bpfdev_rotate(bpf); + + hfull = TRUE; + } else + hfull = FALSE; + + /* + * Retrieve the capture time for the packet. Ideally this would be + * done only once per accepted packet, but we do not expect many BPF + * devices to be receiving the same packets often enough to make that + * worth it. + */ + clock_time(&ts); + + /* + * Copy the packet into the store buffer, including a newly generated + * header. Zero any padding areas, even if strictly not necessary. 
+ */ + memset(&bh, 0, sizeof(bh)); + bh.bh_tstamp.tv_sec = ts.tv_sec; + bh.bh_tstamp.tv_usec = ts.tv_nsec / 1000; + bh.bh_caplen = caplen; + bh.bh_datalen = pbuf->tot_len; + bh.bh_hdrlen = hdrlen; + + assert(bpf->bpf_sbuf != NULL); + off = bpf->bpf_slen; + + memcpy(&bpf->bpf_sbuf[off], &bh, sizeof(bh)); + if (hdrlen > sizeof(bh)) + memset(&bpf->bpf_sbuf[off + sizeof(bh)], 0, + hdrlen - sizeof(bh)); + off += hdrlen; + + for (pptr = pbuf; pptr != NULL && caplen > 0; pptr = pptr->next) { + chunk = pptr->len; + if (chunk > caplen) + chunk = caplen; + + memcpy(&bpf->bpf_sbuf[off], pptr->payload, chunk); + + off += chunk; + caplen -= chunk; + } + + assert(off <= bpf->bpf_slen + totlen); + if (bpf->bpf_slen + totlen > off) + memset(&bpf->bpf_sbuf[off], 0, bpf->bpf_slen + totlen - off); + + bpf->bpf_slen += totlen; + + /* + * Edge case: if the hold buffer is empty and the store buffer is now + * exactly full, rotate buffers so that the packets can be read + * immediately, without waiting for the next packet to cause rotation. + */ + if (bpf->bpf_hlen == 0 && bpf->bpf_slen == bpf->bpf_size) { + bpfdev_rotate(bpf); + + hfull = TRUE; + } + + /* + * If the hold buffer is now full, or if immediate mode is enabled, + * then we now have data to deliver to userland. See if we can wake up + * any read or select call (either but not both here). + */ + if (hfull || (bpf->bpf_flags & BPFF_IMMEDIATE)) { + if (bpf->bpf_read.br_endpt != NONE) + bpfdev_resume_read(bpf, FALSE /*is_timeout*/); + else + bpfdev_resume_select(bpf); + } +} + +/* + * Process an outgoing packet on the interface to which the given BPF device is + * attached. If the BPF device is configured to capture outgoing packets as + * well, attempt to capture the packet as per bpfdev_input(). + */ +void +bpfdev_output(struct bpfdev_link * bpfl, const struct pbuf * pbuf) +{ + struct bpfdev *bpf = (struct bpfdev *)bpfl; + + if (bpf->bpf_flags & BPFF_SEESENT) + bpfdev_input(bpfl, pbuf); +} + +/* + * Fill the given 'bde' structure with information about BPF device 'bpf'. + */ +static void +bpfdev_get_info(struct bpf_d_ext * bde, const struct bpfdev * bpf) +{ + + bde->bde_bufsize = bpf->bpf_size; + bde->bde_promisc = !!(bpf->bpf_flags & BPFF_PROMISC); + bde->bde_state = BPF_IDLE; + bde->bde_immediate = !!(bpf->bpf_flags & BPFF_IMMEDIATE); + bde->bde_hdrcmplt = !!(bpf->bpf_flags & BPFF_HDRCMPLT); + bde->bde_seesent = !!(bpf->bpf_flags & BPFF_SEESENT); + /* + * NetBSD updates the process ID upon device open, close, ioctl, and + * poll. From those, only open and ioctl make sense for us. Sadly + * there is no way to indicate "no known PID" to netstat(1), so we + * cannot even save just the endpoint and look up the corresponding PID + * later, since the user process may be gone by then. + */ + bde->bde_pid = bpf->bpf_pid; + bde->bde_rcount = bpf->bpf_stat.bs_recv; + bde->bde_dcount = bpf->bpf_stat.bs_drop; + bde->bde_ccount = bpf->bpf_stat.bs_capt; + if (bpf->bpf_ifdev != NULL) + strlcpy(bde->bde_ifname, ifdev_get_name(bpf->bpf_ifdev), + sizeof(bde->bde_ifname)); +} + +/* + * Obtain statistics about open BPF devices ("peers"). This node may be + * accessed by the superuser only. Used by netstat(1). 
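The per-device counters maintained here are also what BIOCGSTATS reports back to userland. A small, hypothetical sketch of reading them (not part of the patch):

    #include <sys/ioctl.h>
    #include <net/bpf.h>
    #include <stdint.h>
    #include <stdio.h>

    /*
     * Print the capture statistics of the BPF device open on 'fd': packets
     * run through the filter, packets dropped due to full buffers, and
     * packets accepted by the filter.
     */
    static int
    print_bpf_stats(int fd)
    {
        struct bpf_stat bs;

        if (ioctl(fd, BIOCGSTATS, &bs) < 0)
            return -1;

        printf("recv %ju drop %ju capt %ju\n", (uintmax_t)bs.bs_recv,
            (uintmax_t)bs.bs_drop, (uintmax_t)bs.bs_capt);

        return 0;
    }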
+ */ +static ssize_t +bpfdev_peers(struct rmib_call * call, struct rmib_node * node __unused, + struct rmib_oldp * oldp, struct rmib_newp * newp __unused) +{ + struct bpfdev *bpf; + struct bpf_d_ext bde; + unsigned int slot; + ssize_t off; + int r, size, max; + + if (!(call->call_flags & RMIB_FLAG_AUTH)) + return EPERM; + + if (call->call_namelen != 2) + return EINVAL; + + size = call->call_name[0]; + if (size < 0 || (size_t)size > sizeof(bde)) + return EINVAL; + if (size == 0) + size = sizeof(bde); + max = call->call_name[1]; + + off = 0; + + for (slot = 0; slot < __arraycount(bpf_array); slot++) { + bpf = &bpf_array[slot]; + + if (!(bpf->bpf_flags & BPFF_IN_USE)) + continue; + + if (rmib_inrange(oldp, off)) { + memset(&bde, 0, sizeof(bde)); + + bpfdev_get_info(&bde, bpf); + + if ((r = rmib_copyout(oldp, off, &bde, size)) < 0) + return r; + } + + off += sizeof(bde); + if (max > 0 && --max == 0) + break; + } + + /* No slack needed: netstat(1) resizes its buffer as needed. */ + return off; +} + +static const struct chardriver bpfdev_tab = { + .cdr_open = bpfdev_open, + .cdr_close = bpfdev_close, + .cdr_read = bpfdev_read, + .cdr_write = bpfdev_write, + .cdr_ioctl = bpfdev_ioctl, + .cdr_cancel = bpfdev_cancel, + .cdr_select = bpfdev_select +}; + +/* + * Process a character driver request. Since the LWIP service offers character + * devices for BPF only, it must be a request for a BPF device. + */ +void +bpfdev_process(message * m_ptr, int ipc_status) +{ + + chardriver_process(&bpfdev_tab, m_ptr, ipc_status); +} diff --git a/minix/net/lwip/bpfdev.h b/minix/net/lwip/bpfdev.h new file mode 100644 index 000000000..17f1a9e67 --- /dev/null +++ b/minix/net/lwip/bpfdev.h @@ -0,0 +1,18 @@ +#ifndef MINIX_NET_LWIP_BPFDEV_H +#define MINIX_NET_LWIP_BPFDEV_H + +/* + * BPF link structure, used to abstract away the details of the BPF structure + * from other modules. + */ +struct bpfdev_link { + TAILQ_ENTRY(bpfdev_link) bpfl_next; +}; + +void bpfdev_init(void); +void bpfdev_process(message * m_ptr, int ipc_status); +void bpfdev_detach(struct bpfdev_link * bpf); +void bpfdev_input(struct bpfdev_link * bpf, const struct pbuf * pbuf); +void bpfdev_output(struct bpfdev_link * bpf, const struct pbuf * pbuf); + +#endif /* !MINIX_NET_LWIP_BPFDEV_H */ diff --git a/minix/net/lwip/ethif.c b/minix/net/lwip/ethif.c new file mode 100644 index 000000000..863b12e48 --- /dev/null +++ b/minix/net/lwip/ethif.c @@ -0,0 +1,1718 @@ +/* LWIP service - ethif.c - ethernet interfaces */ +/* + * The most important aspect of this module is to maintain a send queue for the + * interface. This send queue consists of packets to send. At times, the user + * may request a change to the driver configuration. While configuration + * requests would ideally be enqueued in the send queue, this has proven too + * problematic to work in practice, especially since out-of-memory conditions + * may prevent configuration requests from being accepted immediately in such a + * model. Instead, we take a simple and blunt approach: configuration requests + * "cut in line" and thus take precedence over pending packets in the send + * queue. This may not always be entirely correct: for example, packets may be + * transmitted with the old ethernet address after the network device has + * already been reconfigured to receive from a new ethernet address. However, + * this should not be a real problem, and we take care explicitly of perhaps + * the most problematic case: packets not getting checksummed due to checksum + * offloading configuration changes. 
+ *
+ * Even with this blunt approach, we maintain three concurrent configurations:
+ * the active, the pending, and the wanted configuration.  The active one is
+ * the last known active configuration at the network driver.  It is used not
+ * only to report whether the device is in RUNNING state, but also to replay
+ * the active configuration to a restarted driver.  The pending configuration
+ * is a partially new configuration that has been given to ndev to send to the
+ * driver, but not yet acknowledged by the driver.  Finally, the wanted
+ * configuration is the latest one that has yet to be given to ndev.
+ *
+ * Each configuration has a bitmask indicating which part of the configuration
+ * has changed, in order to limit work on the driver side.  This is also the
+ * reason that the pending and wanted configurations are separate: if e.g. a
+ * media change is pending at the driver, and the user also requests a mode
+ * change, we do not want the media change to be repeated after it has been
+ * acknowledged by the driver, just to change the mode as well.  In this
+ * example the pending configuration will have NDEV_SET_MEDIA set, and the
+ * wanted configuration will have NDEV_SET_MODE set.  Once acknowledged, the
+ * pending bitmask is cleared and the wanted bitmask is tested to see if
+ * another configuration change should be given to ndev.  Technically, this
+ * could lead to starvation of actual packet transmission, but we expect
+ * configuration changes to be very rare, since they are always user-initiated.
+ *
+ * It is important to note for understanding the code that for some fields
+ * (mode, flags, caps), the three configurations are cascading: even though the
+ * wanted configuration may not have NDEV_SET_MODE set, its mode field will
+ * still contain the most recently requested mode; that is, the mode in the
+ * pending configuration if that one has NDEV_SET_MODE set, or otherwise the
+ * mode in the active configuration.  For that reason, we carefully merge
+ * configuration requests into the next level (wanted -> pending -> active),
+ * updating just the fields that have been changed by the previous level.  This
+ * approach simplifies obtaining current values a lot, but is not very obvious.
+ *
+ * Also, we never send multiple configuration requests at once, even though
+ * ndev would let us do that: we use a single array for the list of multicast
+ * ethernet addresses that we send to the driver, which the driver may retrieve
+ * (using a memory grant) at any time.  We necessarily recompute the multicast
+ * list before sending a configuration request, and thus, sending multiple
+ * requests at once may lead to the driver retrieving a corrupted list.
+ */ + +#include "lwip.h" +#include "ethif.h" + +#include "lwip/etharp.h" +#include "lwip/ethip6.h" +#include "lwip/igmp.h" +#include "lwip/mld6.h" + +#include + +#define ETHIF_MAX_MTU 1500 /* maximum MTU value for ethernet */ +#define ETHIF_DEF_MTU ETHIF_MAX_MTU /* default MTU value that we use */ + +#define ETHIF_MCAST_MAX 8 /* maximum number of multicast addresses */ + +struct ethif { + struct ifdev ethif_ifdev; /* interface device, MUST be first */ + ndev_id_t ethif_ndev; /* network device ID */ + unsigned int ethif_flags; /* interface flags (ETHIFF_) */ + uint32_t ethif_caps; /* driver capabilities (NDEV_CAPS_) */ + uint32_t ethif_media; /* driver-reported media type (IFM_) */ + struct ndev_conf ethif_active; /* active configuration (at driver) */ + struct ndev_conf ethif_pending; /* pending configuration (at ndev) */ + struct ndev_conf ethif_wanted; /* desired configuration (waiting) */ + struct ndev_hwaddr ethif_mclist[ETHIF_MCAST_MAX]; /* multicast list */ + struct { /* send queue (packet/conf refs) */ + struct pbuf *es_head; /* first (oldest) request reference */ + struct pbuf **es_unsentp; /* ptr-ptr to first unsent request */ + struct pbuf **es_tailp; /* ptr-ptr for adding new requests */ + unsigned int es_count; /* buffer count, see ETHIF_PBUF_.. */ + } ethif_snd; + struct { /* receive queue (packets) */ + struct pbuf *er_head; /* first (oldest) request buffer */ + struct pbuf **er_tailp; /* ptr-ptr for adding new requests */ + } ethif_rcv; + SIMPLEQ_ENTRY(ethif) ethif_next; /* next in free list */ +} ethif_array[NR_NDEV]; /* any other value would be suboptimal */ + +#define ethif_get_name(ethif) (ifdev_get_name(&(ethif)->ethif_ifdev)) +#define ethif_get_netif(ethif) (ifdev_get_netif(&(ethif)->ethif_ifdev)) + +#define ETHIFF_DISABLED 0x01 /* driver has disappeared */ +#define ETHIFF_FIRST_CONF 0x02 /* first configuration request sent */ + +/* + * Send queue limit settings. Both are counted in number of pbuf objects. + * ETHIF_PBUF_MIN is the minimum number of pbuf objects that can always be + * enqueued on a particular interface's send queue. It should be at least the + * number of pbufs for one single packet after being reduced to the ndev limit, + * so NDEV_IOV_MAX (8) is a natural fit. The ETHIF_PBUF_MAX_n values define + * the maximum number of pbufs that may be used by all interface send queues + * combined, whichever of the two is smaller. The resulting number must be set + * fairly high, because at any time there may be a lot of active TCP sockets + * that all generate a (multi-pbuf) packet as a result of a clock tick. It is + * currently a function of the size of the buffer pool, capped to a value that + * is a function of the number of TCP sockets (assuming one packet per socket; + * up to MSS/BUFSIZE+1 data pbufs, one header pbuf, one extra as margin). The + * difference between the per-interface guaranteed minimum and the global + * maximum is what makes up a pool of "spares", which are really just tokens + * allowing for enqueuing of that many pbufs. + */ +#define ETHIF_PBUF_MIN (NDEV_IOV_MAX) +#define ETHIF_PBUF_MAX_1 (mempool_cur_buffers() >> 1) +#define ETHIF_PBUF_MAX_2 (NR_TCPSOCK * (TCP_MSS / MEMPOOL_BUFSIZE + 3)) + +static unsigned int ethif_spares; + +static SIMPLEQ_HEAD(, ethif) ethif_freelist; /* free ethif objects */ + +static const struct ifdev_ops ethif_ops; + +#ifdef INET6 +static ip6_addr_t ethif_ip6addr_allnodes_ll; +#endif /* INET6 */ + +/* + * Initialize the ethernet interfaces module. 
 */
+void
+ethif_init(void)
+{
+	unsigned int slot;
+
+	/* Initialize the list of free ethif objects. */
+	SIMPLEQ_INIT(&ethif_freelist);
+
+	for (slot = 0; slot < __arraycount(ethif_array); slot++)
+		SIMPLEQ_INSERT_TAIL(&ethif_freelist, &ethif_array[slot],
+		    ethif_next);
+
+	/* Initialize the number of in-use spare tokens. */
+	ethif_spares = 0;
+
+#ifdef INET6
+	/* Preinitialize the link-local all-nodes IPv6 multicast address. */
+	ip6_addr_set_allnodes_linklocal(&ethif_ip6addr_allnodes_ll);
+#endif /* INET6 */
+}
+
+/*
+ * As the result of some event, the NetBSD-style interface flags for this
+ * interface may have changed.  Recompute and update the flags as appropriate.
+ */
+static void
+ethif_update_ifflags(struct ethif * ethif)
+{
+	unsigned int ifflags;
+
+	ifflags = ifdev_get_ifflags(&ethif->ethif_ifdev);
+
+	/* These are the flags that we might update here. */
+	ifflags &= ~(IFF_RUNNING | IFF_ALLMULTI);
+
+	/*
+	 * For us, the RUNNING flag indicates that -as far as we know- the
+	 * network device is fully operational and has its I/O engines running.
+	 * This is a reflection of the current state, not of any intention, so
+	 * we look at the active configuration here.  We use the same approach
+	 * for the one other receive state flag here (ALLMULTI).
+	 */
+	if ((ethif->ethif_flags &
+	    (ETHIFF_DISABLED | ETHIFF_FIRST_CONF)) == 0 &&
+	    ethif->ethif_active.nconf_mode != NDEV_MODE_DOWN) {
+		ifflags |= IFF_RUNNING;
+
+		if (ethif->ethif_active.nconf_mode & NDEV_MODE_MCAST_ALL)
+			ifflags |= IFF_ALLMULTI;
+	}
+
+	ifdev_update_ifflags(&ethif->ethif_ifdev, ifflags);
+}
+
+/*
+ * Add a multicast hardware receive address into the set of hardware addresses
+ * in the given configuration, if the given address is not already in the
+ * configuration's set.  Adjust the configuration's mode as needed.  Return
+ * TRUE if the address was added, and FALSE if the address could not be added
+ * due to a full list (of 'max' elements), in which case the mode is changed
+ * from receiving from listed multicast addresses to receiving from all
+ * multicast addresses.
+ */
+static int
+ethif_add_mcast(struct ndev_conf * nconf, unsigned int max,
+	struct ndev_hwaddr * hwaddr)
+{
+	unsigned int slot;
+
+	/*
+	 * See if the hardware address is already in the list we produced so
+	 * far.  This makes the multicast list generation O(n^2) but we do not
+	 * expect many entries nor is the list size large anyway.
+	 */
+	for (slot = 0; slot < nconf->nconf_mccount; slot++)
+		if (!memcmp(&nconf->nconf_mclist[slot], hwaddr,
+		    sizeof(*hwaddr)))
+			return TRUE;
+
+	if (nconf->nconf_mccount < max) {
+		memcpy(&nconf->nconf_mclist[slot], hwaddr, sizeof(*hwaddr));
+		nconf->nconf_mccount++;
+
+		nconf->nconf_mode |= NDEV_MODE_MCAST_LIST;
+
+		return TRUE;
+	} else {
+		nconf->nconf_mode &= ~NDEV_MODE_MCAST_LIST;
+		nconf->nconf_mode |= NDEV_MODE_MCAST_ALL;
+
+		return FALSE;
+	}
+}
+
+/*
+ * Add the ethernet hardware address derived from the given IPv4 multicast
+ * address, to the list of multicast addresses.
+ */
+static int
+ethif_add_mcast_v4(struct ndev_conf * nconf, unsigned int max,
+	const ip4_addr_t * ip4addr)
+{
+	struct ndev_hwaddr hwaddr;
+
+	/* 01:00:5e:xx:xx:xx with the lower 23 bits of the IPv4 address.
 */
+	hwaddr.nhwa_addr[0] = LL_IP4_MULTICAST_ADDR_0;
+	hwaddr.nhwa_addr[1] = LL_IP4_MULTICAST_ADDR_1;
+	hwaddr.nhwa_addr[2] = LL_IP4_MULTICAST_ADDR_2;
+	hwaddr.nhwa_addr[3] = (ip4_addr_get_u32(ip4addr) >> 16) & 0x7f;
+	hwaddr.nhwa_addr[4] = (ip4_addr_get_u32(ip4addr) >> 8) & 0xff;
+	hwaddr.nhwa_addr[5] = (ip4_addr_get_u32(ip4addr) >> 0) & 0xff;
+
+	return ethif_add_mcast(nconf, max, &hwaddr);
+}
+
+/*
+ * Add the ethernet hardware address derived from the given IPv6 multicast
+ * address, to the list of multicast addresses.
+ */
+static int
+ethif_add_mcast_v6(struct ndev_conf * nconf, unsigned int max,
+	const ip6_addr_t * ip6addr)
+{
+	struct ndev_hwaddr hwaddr;
+
+	/* 33:33:xx:xx:xx:xx with the lower 32 bits of the IPv6 address. */
+	hwaddr.nhwa_addr[0] = LL_IP6_MULTICAST_ADDR_0;
+	hwaddr.nhwa_addr[1] = LL_IP6_MULTICAST_ADDR_1;
+	memcpy(&hwaddr.nhwa_addr[2], &ip6addr->addr[3], sizeof(uint32_t));
+
+	return ethif_add_mcast(nconf, max, &hwaddr);
+}
+
+/*
+ * Set up the multicast mode for a configuration that is to be sent to a
+ * network driver, generating a multicast receive address list for the driver
+ * as applicable.
+ */
+static void
+ethif_gen_mcast(struct ethif * ethif, struct ndev_conf * nconf)
+{
+	struct igmp_group *group4;
+	struct mld_group *group6;
+	unsigned int max;
+
+	/* Make sure that multicast is supported at all for this interface. */
+	if (!(ethif->ethif_caps & NDEV_CAP_MCAST))
+		return;
+
+	/* Make sure the mode is being (re)configured to be up. */
+	if (!(nconf->nconf_set & NDEV_SET_MODE) ||
+	    nconf->nconf_mode == NDEV_MODE_DOWN)
+		return;
+
+	/* Recompute the desired multicast flags. */
+	nconf->nconf_mode &= ~(NDEV_MODE_MCAST_LIST | NDEV_MODE_MCAST_ALL);
+
+	/* If promiscuous mode is enabled, receive all multicast packets. */
+	if (nconf->nconf_mode & NDEV_MODE_PROMISC) {
+		nconf->nconf_mode |= NDEV_MODE_MCAST_ALL;
+
+		return;
+	}
+
+	/*
+	 * Map all IGMP/MLD6 multicast addresses to ethernet addresses, merging
+	 * any duplicates to save slots.  We have to add the MLD6 all-nodes
+	 * multicast address ourselves, which also means the list is never
+	 * empty unless compiling with USE_INET6=no.  If the list is too small
+	 * for all addresses, opt to receive all multicast packets instead.
+	 */
+	nconf->nconf_mclist = ethif->ethif_mclist;
+	nconf->nconf_mccount = 0;
+	max = __arraycount(ethif->ethif_mclist);
+
+	for (group4 = netif_igmp_data(ethif_get_netif(ethif)); group4 != NULL;
+	    group4 = group4->next)
+		if (!ethif_add_mcast_v4(nconf, max, &group4->group_address))
+			return;
+
+#ifdef INET6
+	if (!ethif_add_mcast_v6(nconf, max, &ethif_ip6addr_allnodes_ll))
+		return;
+#endif /* INET6 */
+
+	for (group6 = netif_mld6_data(ethif_get_netif(ethif)); group6 != NULL;
+	    group6 = group6->next)
+		if (!ethif_add_mcast_v6(nconf, max, &group6->group_address))
+			return;
+}
+
+/*
+ * Merge a source configuration into a destination configuration, copying any
+ * fields intended to be set from the source into the destination and clearing
+ * the "set" mask in the source, without changing the source fields, so that
+ * the source will reflect the destination's contents.
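For reference, the address mapping performed by ethif_add_mcast_v4() and ethif_add_mcast_v6() above is the standard IPv4/IPv6 multicast-to-ethernet mapping. The standalone sketch below, which is not part of the patch, shows it on plain byte arrays, with example groups 224.1.2.3 and ff02::1.

    #include <stdint.h>
    #include <string.h>

    /*
     * Map an IPv4 multicast group address (given as four bytes in network
     * order) to its ethernet multicast address: 01:00:5e followed by the low
     * 23 bits of the group.  For example, 224.1.2.3 maps to 01:00:5e:01:02:03.
     */
    static void
    ip4_mcast_to_ether(const uint8_t ip[4], uint8_t mac[6])
    {
        mac[0] = 0x01;
        mac[1] = 0x00;
        mac[2] = 0x5e;
        mac[3] = ip[1] & 0x7f;    /* second address byte, top bit cleared */
        mac[4] = ip[2];
        mac[5] = ip[3];
    }

    /*
     * Similarly, an IPv6 multicast group maps to 33:33 followed by the low
     * 32 bits of the group; ff02::1 (all nodes) becomes 33:33:00:00:00:01.
     */
    static void
    ip6_mcast_to_ether(const uint8_t ip[16], uint8_t mac[6])
    {
        mac[0] = 0x33;
        mac[1] = 0x33;
        memcpy(&mac[2], &ip[12], 4);
    }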
+ */ +static void +ethif_merge_conf(struct ndev_conf * dconf, struct ndev_conf * sconf) +{ + + dconf->nconf_set |= sconf->nconf_set; + + if (sconf->nconf_set & NDEV_SET_MODE) + dconf->nconf_mode = sconf->nconf_mode; + if (sconf->nconf_set & NDEV_SET_CAPS) + dconf->nconf_caps = sconf->nconf_caps; + if (sconf->nconf_set & NDEV_SET_FLAGS) + dconf->nconf_flags = sconf->nconf_flags; + if (sconf->nconf_set & NDEV_SET_MEDIA) + dconf->nconf_media = sconf->nconf_media; + if (sconf->nconf_set & NDEV_SET_HWADDR) + memcpy(&dconf->nconf_hwaddr, &sconf->nconf_hwaddr, + sizeof(dconf->nconf_hwaddr)); + + sconf->nconf_set = 0; +} + +/* + * Return TRUE if we can and should try to pass a configuration request to the + * ndev layer on this interface, or FALSE otherwise. + */ +static int +ethif_can_conf(struct ethif * ethif) +{ + + /* Is there a configuration change waiting? The common case is no. */ + if (ethif->ethif_wanted.nconf_set == 0) + return FALSE; + + /* + * Is there a configuration change pending already? Then wait for it + * to be acknowledged first. + */ + if (ethif->ethif_pending.nconf_set != 0) + return FALSE; + + /* Make sure the interface is in the appropriate state. */ + if (ethif->ethif_flags & ETHIFF_DISABLED) + return FALSE; + + /* First let all current packet send requests finish. */ + return (ethif->ethif_snd.es_unsentp == ðif->ethif_snd.es_head); +} + +/* + * Return TRUE if we can and should try to pass the next unsent packet send + * request to the ndev layer on this interface, or FALSE otherwise. + */ +static int +ethif_can_send(struct ethif * ethif) +{ + + /* Is there anything to hand to ndev at all? The common case is no. */ + if (*ethif->ethif_snd.es_unsentp == NULL) + return FALSE; + + /* + * Is there a configuration change pending? Then we cannot send + * packets yet. Always let all configuration changes through first. + */ + if (ethif->ethif_pending.nconf_set != 0 || + ethif->ethif_wanted.nconf_set != 0) + return FALSE; + + /* Make sure the interface is in the appropriate state. */ + if ((ethif->ethif_flags & (ETHIFF_DISABLED | ETHIFF_FIRST_CONF)) != 0) + return FALSE; + + return TRUE; +} + +/* + * Return TRUE if we can and should try to receive packets on this interface + * and are ready to accept received packets, or FALSE otherwise. + */ +static int +ethif_can_recv(struct ethif * ethif) +{ + + if ((ethif->ethif_flags & (ETHIFF_DISABLED | ETHIFF_FIRST_CONF)) != 0) + return FALSE; + + /* + * We do not check the link status here. There is no reason not to + * spawn receive requests, or accept received packets, while the link + * is reported to be down. + */ + return ifdev_is_up(ðif->ethif_ifdev); +} + +/* + * Polling function, invoked after each message loop iteration. Check whether + * any configuration change or packets can be sent to the driver, and whether + * any new packet receive requests can be enqueued at the driver. + */ +static void +ethif_poll(struct ifdev * ifdev) +{ + struct ethif *ethif = (struct ethif *)ifdev; + struct pbuf *pbuf, *pref; + + /* + * If a configuration request is desired, see if we can send it to the + * driver now. Otherwise, attempt to send any packets if possible. + * In both cases, a failure of the ndev call indicates that we should + * try again later. + */ + if (ethif_can_conf(ethif)) { + ethif_gen_mcast(ethif, ðif->ethif_wanted); + + /* + * On success, move the wanted configuration into the pending + * slot. Otherwise, try again on the next poll iteration. 
+ */ + if (ndev_conf(ethif->ethif_ndev, ðif->ethif_wanted) == OK) + ethif_merge_conf(ðif->ethif_pending, + ðif->ethif_wanted); + } else { + while (ethif_can_send(ethif)) { + pref = *ethif->ethif_snd.es_unsentp; + + if (pref->type == PBUF_REF) + pbuf = (struct pbuf *)pref->payload; + else + pbuf = pref; + + if (ndev_send(ethif->ethif_ndev, pbuf) == OK) + ethif->ethif_snd.es_unsentp = + pchain_end(pref); + else + break; + } + } + + /* + * Attempt to create additional receive requests for the driver, if + * applicable. We currently do not set a limit on the maximum number + * of concurrently pending receive requests here, because the maximum + * in ndev is already quite low. That may have to be changed one day. + */ + while (ethif_can_recv(ethif) && ndev_can_recv(ethif->ethif_ndev)) { + /* + * Allocate a buffer for the network device driver to copy the + * received packet into. Allocation may fail if no buffers are + * available at this time; in that case simply try again later. + * We add room for a VLAN tag even though we do not support + * such tags just yet. + */ + if ((pbuf = pchain_alloc(PBUF_RAW, ETH_PAD_LEN + ETH_HDR_LEN + + ETHIF_MAX_MTU + NDEV_ETH_PACKET_TAG)) == NULL) + break; + + /* + * Effectively throw away two bytes in order to align TCP/IP + * header fields to 32 bits. See the short discussion in + * lwipopts.h as to why we are not using lwIP's ETH_PAD_SIZE. + */ + util_pbuf_header(pbuf, -ETH_PAD_LEN); + + /* + * Send the request to the driver. This may still fail due to + * grant allocation failure, in which case we try again later. + */ + if (ndev_recv(ethif->ethif_ndev, pbuf) != OK) { + pbuf_free(pbuf); + + break; + } + + /* + * Hold on to the packet buffer until the receive request + * completes or is aborted, or the driver disappears. + */ + *ethif->ethif_rcv.er_tailp = pbuf; + ethif->ethif_rcv.er_tailp = pchain_end(pbuf); + } +} + +/* + * Complete the link-layer header of the packet by filling in a source address. + * This is relevant for BPF-generated packets only, and thus we can safely + * modify the given pbuf. + */ +static void +ethif_hdrcmplt(struct ifdev * ifdev, struct pbuf * pbuf) +{ + struct netif *netif; + + /* Make sure there is an ethernet packet header at all. */ + if (pbuf->len < ETH_HDR_LEN) + return; + + netif = ifdev_get_netif(ifdev); + + /* + * Insert the source ethernet address into the packet. The source + * address is located right after the destination address at the start + * of the packet. + */ + memcpy((uint8_t *)pbuf->payload + netif->hwaddr_len, netif->hwaddr, + netif->hwaddr_len); +} + +/* + * Return TRUE if the given additional number of spare tokens may be used, or + * FALSE if the limit has been reached. Each spare token represents one + * enqueued pbuf. The limit must be such that we do not impede normal traffic + * but also do not spend the entire buffer pool on enqueued packets. + */ +static int +ethif_can_spare(unsigned int spares) +{ + unsigned int max; + + /* + * Use the configured maximum, which depends on the current size of the + * buffer pool. + */ + max = ETHIF_PBUF_MAX_1; + + /* + * However, limit the total to a value based on the maximum number of + * TCP packets that can, in the worst case, be expected to queue up at + * any single moment. + */ + if (max > ETHIF_PBUF_MAX_2) + max = ETHIF_PBUF_MAX_2; + + return (spares + ethif_spares <= max - ETHIF_PBUF_MIN * NR_NDEV); +} + +/* + * Process a packet as output on an ethernet interface. 
+ */ +static err_t +ethif_output(struct ifdev * ifdev, struct pbuf * pbuf, struct netif * netif) +{ + struct ethif *ethif = (struct ethif *)ifdev; + struct pbuf *pref, *pcopy; + size_t padding; + unsigned int count, spares; + + /* Packets must never be sent on behalf of another interface. */ + assert(netif == NULL); + + /* + * The caller already rejects packets while the interface or link is + * down. We do want to keep enqueuing packets while the driver is + * restarting, so do not check ETHIFF_DISABLED or ETHIFF_FIRST_CONF. + */ + + /* + * Reject oversized packets immediately. This should not happen. + * Undersized packets are padded below. + */ + if (pbuf->tot_len > NDEV_ETH_PACKET_MAX) { + printf("LWIP: attempt to send oversized ethernet packet " + "(size %u)\n", pbuf->tot_len); + util_stacktrace(); + + return ERR_MEM; + } + + /* + * The original lwIP idea for processing output packets is that we make + * a copy of the packet here, so that lwIP is free to do whatever it + * wants with the original packet (e.g., keep on the TCP retransmission + * queue). More recently, lwIP has made progress towards allowing the + * packet to be referenced only, decreasing the reference count only + * once the packet has been actually sent. For many embedded systems, + * that change now allows zero-copy transmission with direct DMA from + * the provided packet buffer. We are not so lucky: we have to make an + * additional inter-process copy anyway. We do however use the same + * referencing system to avoid having to make yet another copy of the + * packet here. + * + * There was previously a check on (pbuf->ref > 1) here, to ensure that + * we would never enqueue packets that are retransmitted while we were + * still in the process of sending the initial copy. Now that for ARP + * and NDP queuing, packets are referenced rather than copied (lwIP + * patch #9272), we can no longer perform that check: packets may + * legitimately have a reference count of 2 at this point. The second + * reference will be dropped by the caller immediately after we return. + */ + + /* + * There are two cases in which we need to make a copy of the packet + * after all: + * + * 1) in the case that the packet needs to be padded in order to reach + * the minimum ethernet packet size (for drivers' convenience); + * 2) in the (much more exceptional) case that the given pbuf chain + * exceeds the maximum vector size for network driver requests. + */ + if (NDEV_ETH_PACKET_MIN > pbuf->tot_len) + padding = NDEV_ETH_PACKET_MIN - pbuf->tot_len; + else + padding = 0; + + count = pbuf_clen(pbuf); + + if (padding != 0 || count > NDEV_IOV_MAX) { + pcopy = pchain_alloc(PBUF_RAW, pbuf->tot_len + padding); + if (pcopy == NULL) { + ifdev_output_drop(ifdev); + + return ERR_MEM; + } + + if (pbuf_copy(pcopy, pbuf) != ERR_OK) + panic("unexpected pbuf copy failure"); + + if (padding > 0) { + /* + * This restriction can be lifted if needed, but it + * involves hairy pbuf traversal and our standard pool + * size should be way in excess of the minimum packet + * size. + */ + assert(pcopy->len == pbuf->tot_len + padding); + + memset((char *)pcopy->payload + pbuf->tot_len, 0, + padding); + } + + count = pbuf_clen(pcopy); + assert(count <= NDEV_IOV_MAX); + + pbuf = pcopy; + } else + pcopy = NULL; + + /* + * Restrict the size of the send queue, so that it will not exhaust the + * buffer pool. 
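+ *
+ * Each interface may always use up to ETHIF_PBUF_MIN buffers for its own
+ * send queue; only buffers beyond that count against the global "spares"
+ * limit checked by ethif_can_spare() above. Purely as an illustration,
+ * with a made-up value: if ETHIF_PBUF_MIN were, say, 4, then enqueuing a
+ * three-buffer packet while es_count is already 10 would cost three
+ * spares, while enqueuing it while es_count is 2 would cost just one.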
+ */ + if (ethif->ethif_snd.es_count >= ETHIF_PBUF_MIN) + spares = count; + else if (ethif->ethif_snd.es_count + count > ETHIF_PBUF_MIN) + spares = ethif->ethif_snd.es_count + count - ETHIF_PBUF_MIN; + else + spares = 0; + + if (spares > 0 && !ethif_can_spare(spares)) { + if (pcopy != NULL) + pbuf_free(pcopy); + + ifdev_output_drop(ifdev); + + return ERR_MEM; + } + + /* + * A side effect of the referencing approach is that we cannot touch + * the last pbuf's "next" pointer. Thus, we need another way of + * linking together the buffers on the send queue. We use a linked + * list of PBUF_REF-type buffers for this instead. However, do this + * only when we have not made a copy of the original pbuf, because then + * we might as well use the copy instead. + */ + if (pcopy == NULL) { + if ((pref = pbuf_alloc(PBUF_RAW, 0, PBUF_REF)) == NULL) { + ifdev_output_drop(ifdev); + + return ERR_MEM; + } + + pbuf_ref(pbuf); + + pref->payload = pbuf; + pref->tot_len = 0; + pref->len = count; + } else + pref = pcopy; + + /* If the send queue was empty so far, set the IFF_OACTIVE flag. */ + if (ethif->ethif_snd.es_head == NULL) + ifdev_update_ifflags(ðif->ethif_ifdev, + ifdev_get_ifflags(ðif->ethif_ifdev) | IFF_OACTIVE); + + /* + * Enqueue the packet on the send queue. It will be sent from the + * polling function as soon as possible. TODO: see if sending it from + * here makes any performance difference at all. + */ + *ethif->ethif_snd.es_tailp = pref; + ethif->ethif_snd.es_tailp = pchain_end(pref); + + ethif->ethif_snd.es_count += count; + ethif_spares += spares; + + return ERR_OK; +} + +/* + * Transmit an ethernet packet on an ethernet interface, as requested by lwIP. + */ +static err_t +ethif_linkoutput(struct netif * netif, struct pbuf * pbuf) +{ + struct ifdev *ifdev = netif_get_ifdev(netif); + + /* + * Let ifdev make the callback to our output function, so that it can + * pass the packet to BPF devices and generically update statistics. + */ + return ifdev_output(ifdev, pbuf, NULL /*netif*/, TRUE /*to_bpf*/, + TRUE /*hdrcmplt*/); +} + +/* + * The multicast address list has changed. See to it that the change will make + * it to the network driver at some point. + */ +static err_t +ethif_set_mcast(struct ethif * ethif) +{ + + /* + * Simply generate a mode change request, unless the interface is down. + * Once the mode change request is about to be sent to the driver, we + * will recompute the multicast settings. + */ + if (ifdev_is_up(ðif->ethif_ifdev)) + ethif->ethif_wanted.nconf_set |= NDEV_SET_MODE; + + return ERR_OK; +} + +/* + * An IPv4 multicast address has been added to or removed from the list of IPv4 + * multicast addresses. + */ +static err_t +ethif_set_mcast_v4(struct netif * netif, const ip4_addr_t * group __unused, + enum netif_mac_filter_action action __unused) +{ + + return ethif_set_mcast((struct ethif *)netif_get_ifdev(netif)); +} + +/* + * An IPv6 multicast address has been added to or removed from the list of IPv6 + * multicast addresses. + */ +static err_t +ethif_set_mcast_v6(struct netif * netif, const ip6_addr_t * group __unused, + enum netif_mac_filter_action action __unused) +{ + + return ethif_set_mcast((struct ethif *)netif_get_ifdev(netif)); +} + +/* + * Initialization function for an ethernet-type netif interface, called from + * lwIP at interface creation time. + */ +static err_t +ethif_init_netif(struct ifdev * ifdev, struct netif * netif) +{ + struct ethif *ethif = (struct ethif *)ifdev; + + /* + * Fill in a dummy name. 
Since it is only two characters, do not + * bother trying to reuse part of the given name. If this name is ever + * actually used anywhere, the dummy should suffice for debugging. + */ + netif->name[0] = 'e'; + netif->name[1] = 'n'; + + netif->linkoutput = ethif_linkoutput; + + memset(netif->hwaddr, 0, sizeof(netif->hwaddr)); + + /* + * Set the netif flags, partially based on the capabilities reported by + * the network device driver. The reason that we do this now is that + * lwIP tests for some of these flags and starts appropriate submodules + * (e.g., IGMP) right after returning from this function. If we set + * the flags later, we also have to take over management of those + * submodules, which is something we'd rather avoid. For this reason + * in particular, we also do not support capability mask changes after + * driver restarts - see ethif_enable(). + */ + netif->flags = NETIF_FLAG_ETHARP | NETIF_FLAG_ETHERNET; + + if (ethif->ethif_caps & NDEV_CAP_BCAST) + netif->flags |= NETIF_FLAG_BROADCAST; + + if (ethif->ethif_caps & NDEV_CAP_MCAST) { + /* The IGMP code adds the all-stations multicast entry. */ + netif->igmp_mac_filter = ethif_set_mcast_v4; + + netif->flags |= NETIF_FLAG_IGMP; + + /* For MLD6 we have to add the all-nodes entry ourselves. */ + netif->mld_mac_filter = ethif_set_mcast_v6; + + netif->flags |= NETIF_FLAG_MLD6; + } + + return ERR_OK; +} + +/* + * The ndev layer reports that a new network device driver has appeared, with + * the given ndev identifier, a driver-given name, and a certain set of + * capabilities. Create a new ethernet interface object for it. On success, + * return a pointer to the object (for later callbacks from ndev). In that + * case, the ndev layer will always immediately call ethif_enable() afterwards. + * On failure, return NULL, in which case ndev will forget about the driver. + */ +struct ethif * +ethif_add(ndev_id_t id, const char * name, uint32_t caps) +{ + struct ethif *ethif; + unsigned int ifflags; + int r; + + /* + * First make sure that the interface name is valid, unique, and not + * reserved for virtual interface types. + */ + if ((r = ifdev_check_name(name, NULL /*vtype_slot*/)) != OK) { + /* + * There is some risk in printing bad stuff, but this may help + * in preventing serious driver writer frustration.. + */ + printf("LWIP: invalid driver name '%s' (%d)\n", name, r); + + return NULL; + } + + /* Then see if there is a free ethernet interface object available. */ + if (SIMPLEQ_EMPTY(ðif_freelist)) { + printf("LWIP: out of slots for driver name '%s'\n", name); + + return NULL; + } + + /* + * All good; set up the interface. First initialize the object, since + * adding the interface to lwIP might spawn some activity right away. + */ + ethif = SIMPLEQ_FIRST(ðif_freelist); + SIMPLEQ_REMOVE_HEAD(ðif_freelist, ethif_next); + + /* Initialize the ethif structure. */ + memset(ethif, 0, sizeof(*ethif)); + ethif->ethif_ndev = id; + ethif->ethif_flags = ETHIFF_DISABLED; + ethif->ethif_caps = caps; + + ethif->ethif_snd.es_head = NULL; + ethif->ethif_snd.es_unsentp = ðif->ethif_snd.es_head; + ethif->ethif_snd.es_tailp = ðif->ethif_snd.es_head; + ethif->ethif_snd.es_count = 0; + + ethif->ethif_rcv.er_head = NULL; + ethif->ethif_rcv.er_tailp = ðif->ethif_rcv.er_head; + + /* + * Set all the three configurations to the same initial values. 
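+ * A change normally travels through them in a fixed order: userland
+ * requests (ethif_set_ifflags(), ethif_set_ifcap(), and so on) update
+ * ethif_wanted; ethif_poll() passes the wanted changes to ndev_conf()
+ * and merges them into ethif_pending; and once the driver acknowledges
+ * the request, ethif_configured() merges them into ethif_active.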
Since + * any change to the configuration will go through all three, this + * allows us to obtain various parts of the status (in particular, the + * mode, flags, enabled capabilities, and media type selection) from + * any of the three without having to consult the others. Note that + * the hardware address is set to a indeterminate initial value, as it + * is left to the network driver unless specifically overridden. + */ + ethif->ethif_active.nconf_set = 0; + ethif->ethif_active.nconf_mode = NDEV_MODE_DOWN; + ethif->ethif_active.nconf_flags = 0; + ethif->ethif_active.nconf_caps = 0; + ethif->ethif_active.nconf_media = + IFM_MAKEWORD(IFM_ETHER, IFM_AUTO, 0, 0); + memcpy(ðif->ethif_pending, ðif->ethif_active, + sizeof(ethif->ethif_pending)); + memcpy(ðif->ethif_wanted, ðif->ethif_pending, + sizeof(ethif->ethif_wanted)); + + /* + * Compute the initial NetBSD-style interface flags. The IFF_SIMPLEX + * interface flag is always enabled because we do not support network + * drivers that are receiving their own packets. In particular, lwIP + * currently does not deal well with receiving back its own multicast + * packets, which leads to IPv6 DAD failures. The other two flags + * (IFF_BROADCAST, IFF_MULTICAST) denote capabilities, not enabled + * receipt modes. + */ + ifflags = IFF_SIMPLEX; + if (caps & NDEV_CAP_BCAST) + ifflags |= IFF_BROADCAST; + if (caps & NDEV_CAP_MCAST) + ifflags |= IFF_MULTICAST; + + /* Finally, add the interface to ifdev and lwIP. This cannot fail. */ + ifdev_add(ðif->ethif_ifdev, name, ifflags, IFT_ETHER, ETH_HDR_LEN, + ETHARP_HWADDR_LEN, DLT_EN10MB, ETHIF_DEF_MTU, + ND6_IFF_PERFORMNUD | ND6_IFF_AUTO_LINKLOCAL, ðif_ops); + + return ethif; +} + +/* + * The link status and/or media type of an ethernet interface has changed. + */ +static void +ethif_set_status(struct ethif * ethif, uint32_t link, uint32_t media) +{ + unsigned int iflink; + + /* We save the media type locally for now. */ + ethif->ethif_media = media; + + /* Let the ifdev module handle the details of the link change. */ + switch (link) { + case NDEV_LINK_UP: iflink = LINK_STATE_UP; break; + case NDEV_LINK_DOWN: iflink = LINK_STATE_DOWN; break; + default: iflink = LINK_STATE_UNKNOWN; break; + } + + ifdev_update_link(ðif->ethif_ifdev, iflink); +} + +/* + * The ndev layer reports that a previously added or disabled network device + * driver has been (re)enabled. Start by initializing the driver. Return TRUE + * if the interface could indeed be enabled, or FALSE if it should be forgotten + * altogether after all. + */ +int +ethif_enable(struct ethif * ethif, const char * name, + const struct ndev_hwaddr * hwaddr, uint8_t hwaddr_len, uint32_t caps, + uint32_t link, uint32_t media) +{ + int r; + + assert(ethif->ethif_flags & ETHIFF_DISABLED); + + /* + * One disadvantage of keeping service labels and ethernet driver names + * disjunct is that the ethernet driver may mess with its name between + * restarts. Ultimately we may end up renaming our ethernet drivers + * such that their labels match their names, in which case we no longer + * need the drivers themselves to produce a name, and we can retire + * this check. + */ + if (name != NULL && strcmp(ethif_get_name(ethif), name)) { + printf("LWIP: driver '%s' restarted with name '%s'\n", + ethif_get_name(ethif), name); + + return FALSE; + } + + /* + * The hardware address length is just a sanity check for now. After + * the initialization reply, we assume the same length is used for all + * addresses, which is also the maximum, namely 48 bits (six bytes). 
+ */ + if (hwaddr_len != ETHARP_HWADDR_LEN) { + printf("LWIP: driver '%s' reports hwaddr length %u\n", + ethif_get_name(ethif), hwaddr_len); + + return FALSE; + } + + /* + * If the driver has changed its available capabilities as a result of + * a restart, we have a problem: we may already have configured the + * interface's netif object to make use of some of those + * capabilities. TODO: we can deal with some cases (e.g., disappearing + * checksum offloading capabilities) with some effort, and with other + * cases (e.g., disappearing multicast support) with a LOT more effort. + */ + if (ethif->ethif_caps != caps) { + printf("LWIP: driver '%s' changed capabilities\n", + ethif_get_name(ethif)); + + return FALSE; + } + + /* + * Set the hardware address on the interface, unless a request is + * currently pending to change it, in which case the new address has + * been set already and we do not want to revert that change. If not, + * we always set the address, because it may have changed as part of a + * driver restart and we do not want to get out of sync with it, nor + * can we necessarily change it back. + */ + if (!(ethif->ethif_active.nconf_set & NDEV_SET_HWADDR) && + !(ethif->ethif_pending.nconf_set & NDEV_SET_HWADDR)) + ifdev_update_hwaddr(&ethif->ethif_ifdev, hwaddr->nhwa_addr, + (name == NULL) /*is_factory*/); + + /* + * At this point, only one more thing can fail: it is possible that we + * do not manage to send the first configuration request due to memory + * shortage. This is extremely unlikely to happen, so send the conf + * request first and forget the entire driver if it fails. + */ + /* + * Always generate a new multicast list before sending a configuration + * request, and at no other time (since there may be a grant for it). + */ + ethif_gen_mcast(ethif, &ethif->ethif_active); + + if ((r = ndev_conf(ethif->ethif_ndev, &ethif->ethif_active)) != OK) { + printf("LWIP: sending first configuration to '%s' failed " + "(%d)\n", ethif_get_name(ethif), r); + + return FALSE; + } + + ethif_set_status(ethif, link, media); + + ethif->ethif_flags &= ~ETHIFF_DISABLED; + ethif->ethif_flags |= ETHIFF_FIRST_CONF; + + return TRUE; +} + +/* + * The configuration change stored in the "pending" slot of the given ethif + * object has been acknowledged by the network device driver (or the driver has + * died, see ethif_disable()). Apply changes to the "active" slot of the given + * ethif object, as well as previously delayed changes to lwIP through netif. + */ +static void +ethif_post_conf(struct ethif * ethif) +{ + struct ndev_conf *nconf; + unsigned int flags; + + nconf = &ethif->ethif_pending; + + /* + * Now that the driver configuration has changed, we know that the + * new checksum settings will be applied to all sent and received + * packets, and we can disable checksumming flags in netif as desired. + * Enabling checksumming flags has already been done earlier on.
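+ * For example, when the user enables TCP transmit checksum offloading
+ * (IFCAP_CSUM_TCPv4_Tx and IFCAP_CSUM_TCPv6_Tx together), the
+ * NETIF_CHECKSUM_GEN_TCP flag is left enabled by ethif_set_ifcap(), so
+ * that lwIP keeps generating TCP checksums itself; only here, after the
+ * driver has acknowledged the new configuration, is that flag cleared.
+ * Disabling such offloading takes effect in netif right away instead.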
+ */ + if (nconf->nconf_set & NDEV_SET_CAPS) { + flags = ethif_get_netif(ethif)->chksum_flags; + + if (nconf->nconf_caps & NDEV_CAP_CS_IP4_TX) + flags &= ~NETIF_CHECKSUM_GEN_IP; + if (nconf->nconf_caps & NDEV_CAP_CS_IP4_RX) + flags &= ~NETIF_CHECKSUM_CHECK_IP; + if (nconf->nconf_caps & NDEV_CAP_CS_UDP_TX) + flags &= ~NETIF_CHECKSUM_GEN_UDP; + if (nconf->nconf_caps & NDEV_CAP_CS_UDP_RX) + flags &= ~NETIF_CHECKSUM_CHECK_UDP; + if (nconf->nconf_caps & NDEV_CAP_CS_TCP_TX) + flags &= ~NETIF_CHECKSUM_GEN_TCP; + if (nconf->nconf_caps & NDEV_CAP_CS_TCP_RX) + flags &= ~NETIF_CHECKSUM_CHECK_TCP; + + NETIF_SET_CHECKSUM_CTRL(ethif_get_netif(ethif), flags); + } + + /* + * Merge any individual parts of the now acknowledged configuration + * changes into the active configuration. The result is that we are + * able to reapply these changes at any time should the network driver + * be restarted. In addition, by only setting bits for fields that + * have actually changed, we can later tell whether the user wanted the + * change or ethif should just take over what the driver reports after + * a restart; this is important for HW-address and media settings. + */ + ethif_merge_conf(ðif->ethif_active, ðif->ethif_pending); +} + +/* + * All receive requests have been canceled at the ndev layer, because the + * network device driver has been restarted or shut down. Clear the receive + * queue, freeing any packets in it. + */ +static void +ethif_drain(struct ethif * ethif) +{ + struct pbuf *pbuf, **pnext; + + while ((pbuf = ethif->ethif_rcv.er_head) != NULL) { + pnext = pchain_end(pbuf); + + if ((ethif->ethif_rcv.er_head = *pnext) == NULL) + ethif->ethif_rcv.er_tailp = ðif->ethif_rcv.er_head; + + *pnext = NULL; + pbuf_free(pbuf); + } +} + +/* + * The network device driver has stopped working (i.e., crashed), but has not + * been shut down completely, and is expect to come back later. + */ +void +ethif_disable(struct ethif * ethif) +{ + + /* + * We assume, optimistically, that a new instance of the driver will be + * brought up soon after which we can continue operating as before. As + * such, we do not want to change most of the user-visible state until + * we know for sure that our optimism was in vain. In particular, we + * do *not* want to change the following parts of the state here: + * + * - the contents of the send queue; + * - the state of the interface (up or down); + * - the state and media type of the physical link. + * + * The main user-visible indication of the crash will be that the + * interface does not have the IFF_RUNNING flag set. + */ + + /* + * If a configuration request was pending, it will be lost now. Highly + * unintuitively, make the requested configuration the *active* one, + * just as though the request completed successfully. This works, + * because once the driver comes back, the active configuration will be + * replayed as initial configuration. Therefore, by pretending that + * the current request went through, we ensure that it too will be sent + * to the new instance--before anything else is allowed to happen. + */ + if (ethif->ethif_pending.nconf_set != 0) + ethif_post_conf(ethif); + + /* + * Any packet send requests have been lost, too, and likewise forgotten + * by ndev. Thus, we need to forget that we sent any packets, so that + * they will be resent after the driver comes back up. That *may* + * cause packet duplication, but that is preferable over packet loss. 
+ */ + ethif->ethif_snd.es_unsentp = ðif->ethif_snd.es_head; + + /* + * We fully restart the receive queue, because all receive requests + * have been forgotten by ndev as well now and it is easier to simply + * reconstruct the receive queue in its entirety later on. + */ + ethif_drain(ethif); + + /* Make sure we do not attempt to initiate new requests for now. */ + ethif->ethif_flags &= ~ETHIFF_FIRST_CONF; + ethif->ethif_flags |= ETHIFF_DISABLED; +} + +/* + * Dequeue and discard the packet at the head of the send queue. + */ +static void +ethif_dequeue_send(struct ethif * ethif) +{ + struct pbuf *pref, *pbuf, **pnext; + unsigned int count, spares; + + /* + * The send queue is a linked list of reference buffers, each of which + * links to the actual packet. Dequeue the first reference buffer. + */ + pref = ethif->ethif_snd.es_head; + assert(pref != NULL); + + pnext = pchain_end(pref); + + if (ethif->ethif_snd.es_unsentp == pnext) + ethif->ethif_snd.es_unsentp = ðif->ethif_snd.es_head; + + if ((ethif->ethif_snd.es_head = *pnext) == NULL) + ethif->ethif_snd.es_tailp = ðif->ethif_snd.es_head; + + /* Do this before possibly calling pbuf_clen() below.. */ + *pnext = NULL; + + /* + * If we never made a copy of the original packet, we now have it + * pointed to by a reference buffer. If so, decrease the reference + * count of the actual packet, thereby freeing it if lwIP itself was + * already done with. Otherwise, the copy of the packet is the + * reference buffer itself. In both cases we need to free that buffer. + */ + if (pref->type == PBUF_REF) { + pbuf = (struct pbuf *)pref->payload; + + pbuf_free(pbuf); + + count = pref->len; + } else + count = pbuf_clen(pref); + + assert(count > 0); + assert(ethif->ethif_snd.es_count >= count); + ethif->ethif_snd.es_count -= count; + + if (ethif->ethif_snd.es_count >= ETHIF_PBUF_MIN) + spares = count; + else if (ethif->ethif_snd.es_count + count > ETHIF_PBUF_MIN) + spares = ethif->ethif_snd.es_count + count - ETHIF_PBUF_MIN; + else + spares = 0; + + assert(ethif_spares >= spares); + ethif_spares -= spares; + + /* Free the reference buffer as well. */ + pbuf_free(pref); + + /* If the send queue is now empty, clear the IFF_OACTIVE flag. */ + if (ethif->ethif_snd.es_head == NULL) + ifdev_update_ifflags(ðif->ethif_ifdev, + ifdev_get_ifflags(ðif->ethif_ifdev) & ~IFF_OACTIVE); +} + +/* + * The ndev layer reports that a network device driver has been permanently + * shut down. Remove the corresponding ethernet interface from the system. + */ +void +ethif_remove(struct ethif * ethif) +{ + int r; + + /* Clear the send and receive queues. */ + while (ethif->ethif_snd.es_head != NULL) + ethif_dequeue_send(ethif); + + ethif_drain(ethif); + + /* Let the ifdev module deal with most other removal aspects. */ + if ((r = ifdev_remove(ðif->ethif_ifdev)) != OK) + panic("unable to remove ethernet interface: %d", r); + + /* Finally, readd the ethif object to the free list. */ + SIMPLEQ_INSERT_HEAD(ðif_freelist, ethif, ethif_next); +} + +/* + * The ndev layer reports that the (oldest) pending configuration request has + * completed with the given result. + */ +void +ethif_configured(struct ethif * ethif, int32_t result) +{ + + /* + * The driver is not supposed to return failure in response to a + * configure result. If it does, we have no proper way to recover, as + * we may already have applied part of the new configuration to netif. + * For now, just report failure and then pretend success. 
+ */ + if (result < 0) { + printf("LWIP: driver '%s' replied with conf result %d\n", + ethif_get_name(ethif), result); + + result = 0; + } + + if (ethif->ethif_flags & ETHIFF_FIRST_CONF) + ethif->ethif_flags &= ~ETHIFF_FIRST_CONF; + else + ethif_post_conf(ethif); + + /* + * For now, the result is simply a boolean value indicating whether the + * driver is using the all-multicast receive mode instead of the + * multicast-list receive mode. We can turn it into a bitmap later. + */ + if (result != 0) { + ethif->ethif_active.nconf_mode &= ~NDEV_MODE_MCAST_LIST; + ethif->ethif_active.nconf_mode |= NDEV_MODE_MCAST_ALL; + } + + /* The interface flags may have changed now, so update them. */ + ethif_update_ifflags(ethif); + + /* Regular operation will resume from the polling function. */ +} + +/* + * The ndev layer reports that the first packet on the send queue has been + * successfully transmitted with 'result' set to OK, or dropped if 'result' is + * negative. The latter may happen if the interface was taken down while there + * were still packets in transit. + */ +void +ethif_sent(struct ethif * ethif, int32_t result) +{ + + ethif_dequeue_send(ethif); + + if (result < 0) + ifdev_output_drop(&ethif->ethif_ifdev); + + /* More requests may be sent from the polling function now. */ +} + +/* + * The ndev layer reports that the first buffer on the receive queue has been + * filled with a packet of 'result' bytes, or if 'result' is negative, the + * receive request has been aborted. + */ +void +ethif_received(struct ethif * ethif, int32_t result) +{ + struct pbuf *pbuf, *pwalk, **pnext; + size_t left; + + /* + * Start by removing the first buffer chain off the receive queue. The + * ndev layer guarantees that a receive request was outstanding, so the + * queue can never be empty here. + */ + if ((pbuf = ethif->ethif_rcv.er_head) == NULL) + panic("driver received packet but queue empty"); + + pnext = pchain_end(pbuf); + + if ((ethif->ethif_rcv.er_head = *pnext) == NULL) + ethif->ethif_rcv.er_tailp = &ethif->ethif_rcv.er_head; + *pnext = NULL; + + /* Decide if we can and should deliver a packet to the layers above. */ + if (result <= 0 || !ethif_can_recv(ethif)) { + pbuf_free(pbuf); + + return; + } + + if (result > pbuf->tot_len) { + printf("LWIP: driver '%s' returned bad packet size (%zd)\n", + ethif_get_name(ethif), (ssize_t)result); + + pbuf_free(pbuf); + + return; + } + + /* + * The packet often does not use all of the buffers, or at least not + * all of the last buffer. Adjust lengths for the buffers that contain + * part of the packet, and free the remaining (unused) buffers, if any. + */ + left = (size_t)result; + + for (pwalk = pbuf; ; pwalk = pwalk->next) { + pwalk->tot_len = left; + if (pwalk->len > left) + pwalk->len = left; + left -= pwalk->len; + if (left == 0) + break; + } + + if (pwalk->next != NULL) { + pbuf_free(pwalk->next); + + pwalk->next = NULL; + } + + /* + * Finally, hand off the packet to the layers above. We go through + * ifdev so that it can pass the packet to BPF devices and update + * statistics and all that. + */ + ifdev_input(&ethif->ethif_ifdev, pbuf, NULL /*netif*/, + TRUE /*to_bpf*/); +} + +/* + * The ndev layer reports a network driver status update. If anything has + * changed since the last status, we may have to take action. The given + * statistics counters are relative to the previous status report.
+ */ +void +ethif_status(struct ethif * ethif, uint32_t link, uint32_t media, + uint32_t oerror, uint32_t coll, uint32_t ierror, uint32_t iqdrop) +{ + struct if_data *ifdata; + + ethif_set_status(ethif, link, media); + + ifdata = ifdev_get_ifdata(ðif->ethif_ifdev); + ifdata->ifi_oerrors += oerror; + ifdata->ifi_collisions += coll; + ifdata->ifi_ierrors += ierror; + ifdata->ifi_iqdrops += iqdrop; +} + +/* + * Set NetBSD-style interface flags (IFF_) for an ethernet interface. + */ +static int +ethif_set_ifflags(struct ifdev * ifdev, unsigned int ifflags) +{ + struct ethif *ethif = (struct ethif *)ifdev; + uint32_t mode, flags; + + /* + * We do not support IFF_NOARP at this time, because lwIP does not: the + * idea of IFF_NOARP is that only static ARP entries are used, but lwIP + * does not support separating static from dynamic ARP operation. The + * flag does not appear to be particularly widely used anyway. + */ + if ((ifflags & ~(IFF_UP | IFF_DEBUG | IFF_LINK0 | IFF_LINK1 | + IFF_LINK2)) != 0) + return EINVAL; + + mode = ethif->ethif_wanted.nconf_mode; + if ((ifflags & IFF_UP) && mode == NDEV_MODE_DOWN) { + mode = NDEV_MODE_UP; + + /* Always enable broadcast receipt when supported. */ + if (ethif->ethif_caps & NDEV_CAP_BCAST) + mode |= NDEV_MODE_BCAST; + + if (ifdev_is_promisc(ifdev)) + mode |= NDEV_MODE_PROMISC; + + /* + * The multicast flags will be set right before we send the + * request to the driver. + */ + } else if (!(ifflags & IFF_UP) && mode != NDEV_MODE_DOWN) + ethif->ethif_wanted.nconf_mode = NDEV_MODE_DOWN; + + if (mode != ethif->ethif_wanted.nconf_mode) { + ethif->ethif_wanted.nconf_mode = mode; + ethif->ethif_wanted.nconf_set |= NDEV_SET_MODE; + } + + /* + * Some of the interface flags (UP, DEBUG, PROMISC, LINK[0-2]) are a + * reflection of the intended state as set by userland before, so that + * a userland utility will never not see the flag it just set (or the + * other way around). These flags therefore do not necessarily reflect + * what is actually going on at that moment. We cannot have both. + */ + flags = 0; + if (ifflags & IFF_DEBUG) + flags |= NDEV_FLAG_DEBUG; + if (ifflags & IFF_LINK0) + flags |= NDEV_FLAG_LINK0; + if (ifflags & IFF_LINK1) + flags |= NDEV_FLAG_LINK1; + if (ifflags & IFF_LINK2) + flags |= NDEV_FLAG_LINK2; + + if (flags != ethif->ethif_wanted.nconf_flags) { + ethif->ethif_wanted.nconf_flags = flags; + ethif->ethif_wanted.nconf_set |= NDEV_SET_FLAGS; + } + + /* The changes will be picked up from the polling function. */ + return OK; +} + +/* + * Convert a bitmask of ndev-layer capabilities (NDEV_CAP_) to NetBSD-style + * interface capabilities (IFCAP_). + */ +static uint64_t +ethif_cap_to_ifcap(uint32_t caps) +{ + uint64_t ifcap; + + ifcap = 0; + if (caps & NDEV_CAP_CS_IP4_TX) + ifcap |= IFCAP_CSUM_IPv4_Tx; + if (caps & NDEV_CAP_CS_IP4_RX) + ifcap |= IFCAP_CSUM_IPv4_Rx; + if (caps & NDEV_CAP_CS_UDP_TX) + ifcap |= IFCAP_CSUM_UDPv4_Tx | IFCAP_CSUM_UDPv6_Tx; + if (caps & NDEV_CAP_CS_UDP_RX) + ifcap |= IFCAP_CSUM_UDPv4_Rx | IFCAP_CSUM_UDPv6_Rx; + if (caps & NDEV_CAP_CS_TCP_TX) + ifcap |= IFCAP_CSUM_TCPv4_Tx | IFCAP_CSUM_TCPv6_Tx; + if (caps & NDEV_CAP_CS_TCP_RX) + ifcap |= IFCAP_CSUM_TCPv4_Rx | IFCAP_CSUM_TCPv6_Rx; + + return ifcap; +} + +/* + * Retrieve potential and enabled NetBSD-style interface capabilities (IFCAP_). 
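+ * The translation is symmetric with ethif_cap_to_ifcap() above: for
+ * example, a driver that reports NDEV_CAP_CS_TCP_TX is shown as having
+ * both IFCAP_CSUM_TCPv4_Tx and IFCAP_CSUM_TCPv6_Tx among its potential
+ * capabilities, and ethif_set_ifcap() below maps that pair back onto
+ * NDEV_CAP_CS_TCP_TX only when both bits are enabled together.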
+ */ +static void +ethif_get_ifcap(struct ifdev * ifdev, uint64_t * ifcap, uint64_t * ifena) +{ + struct ethif *ethif = (struct ethif *)ifdev; + + *ifcap = ethif_cap_to_ifcap(ethif->ethif_caps); + *ifena = ethif_cap_to_ifcap(ethif->ethif_wanted.nconf_caps); +} + +/* + * Set NetBSD-style enabled interface capabilities (IFCAP_). + */ +static int +ethif_set_ifcap(struct ifdev * ifdev, uint64_t ifcap) +{ + struct ethif *ethif = (struct ethif *)ifdev; + unsigned int flags; + uint32_t caps; + + if (ifcap & ~(IFCAP_CSUM_IPv4_Tx | IFCAP_CSUM_IPv4_Rx | + IFCAP_CSUM_UDPv4_Tx | IFCAP_CSUM_UDPv6_Tx | + IFCAP_CSUM_UDPv4_Rx | IFCAP_CSUM_UDPv6_Rx | + IFCAP_CSUM_TCPv4_Tx | IFCAP_CSUM_TCPv6_Tx | + IFCAP_CSUM_TCPv4_Rx | IFCAP_CSUM_TCPv6_Rx)) + return EINVAL; + + /* + * Some IPv4/IPv6 flags need to be set together in order to be picked + * up. Unfortunately, that is all we can do given that lwIP does not + * distinguish IPv4/IPv6 when it comes to TCP/UDP checksum flags. + */ + caps = 0; + if (ifcap & IFCAP_CSUM_IPv4_Tx) + caps |= NDEV_CAP_CS_IP4_TX; + if (ifcap & IFCAP_CSUM_IPv4_Rx) + caps |= NDEV_CAP_CS_IP4_RX; + if ((ifcap & (IFCAP_CSUM_UDPv4_Tx | IFCAP_CSUM_UDPv6_Tx)) == + (IFCAP_CSUM_UDPv4_Tx | IFCAP_CSUM_UDPv6_Tx)) + caps |= NDEV_CAP_CS_UDP_TX; + if ((ifcap & (IFCAP_CSUM_UDPv4_Rx | IFCAP_CSUM_UDPv6_Rx)) == + (IFCAP_CSUM_UDPv4_Rx | IFCAP_CSUM_UDPv6_Rx)) + caps |= NDEV_CAP_CS_UDP_RX; + if ((ifcap & (IFCAP_CSUM_TCPv4_Tx | IFCAP_CSUM_TCPv6_Tx)) == + (IFCAP_CSUM_TCPv4_Tx | IFCAP_CSUM_TCPv6_Tx)) + caps |= NDEV_CAP_CS_TCP_TX; + if ((ifcap & (IFCAP_CSUM_TCPv4_Rx | IFCAP_CSUM_TCPv6_Rx)) == + (IFCAP_CSUM_TCPv4_Rx | IFCAP_CSUM_TCPv6_Rx)) + caps |= NDEV_CAP_CS_TCP_RX; + + /* + * When changing checksumming capabilities, we have to make sure that + * we only ever checksum too much and never too little. This means + * that we enable any checksum options in netif here, and disable any + * checksum options in netif only after driver configuration. + * + * Note that we have to draw the line somewhere with this kind of + * self-protection, and that line is short of TCP retransmission: we + * see it as lwIP's job to compute checksums for retransmitted TCP + * packets if they were saved across checksum changes. Even though + * lwIP may not care, there is little we can do about that anyway. + */ + if (ethif->ethif_wanted.nconf_caps != caps) { + flags = ethif_get_netif(ethif)->chksum_flags; + + if (!(caps & NDEV_CAP_CS_IP4_TX)) + flags |= NETIF_CHECKSUM_GEN_IP; + if (!(caps & NDEV_CAP_CS_IP4_RX)) + flags |= NETIF_CHECKSUM_CHECK_IP; + if (!(caps & NDEV_CAP_CS_UDP_TX)) + flags |= NETIF_CHECKSUM_GEN_UDP; + if (!(caps & NDEV_CAP_CS_UDP_RX)) + flags |= NETIF_CHECKSUM_CHECK_UDP; + if (!(caps & NDEV_CAP_CS_TCP_TX)) + flags |= NETIF_CHECKSUM_GEN_TCP; + if (!(caps & NDEV_CAP_CS_TCP_RX)) + flags |= NETIF_CHECKSUM_CHECK_TCP; + + NETIF_SET_CHECKSUM_CTRL(ethif_get_netif(ethif), flags); + + ethif->ethif_wanted.nconf_caps = caps; + ethif->ethif_wanted.nconf_set |= NDEV_SET_CAPS; + } + + /* The changes will be picked up from the polling function. */ + return OK; +} + +/* + * Retrieve NetBSD-style interface media type (IFM_). Return both the current + * media type selection and the driver-reported active media type. + */ +static void +ethif_get_ifmedia(struct ifdev * ifdev, int * ifcurrent, int * ifactive) +{ + struct ethif *ethif = (struct ethif *)ifdev; + + /* + * For the current select, report back whatever the user gave us, even + * if it has not reached the driver at all yet. 
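+ * For example, right after userland selects (IFM_ETHER | IFM_100_TX |
+ * IFM_FDX), that exact value is reported back as the current selection,
+ * while the active media type remains whatever the driver last reported
+ * through ethif_status().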
+ */ + *ifcurrent = (int)ethif->ethif_wanted.nconf_media; + *ifactive = (int)ethif->ethif_media; +} + +/* + * Set current NetBSD-style interface media type (IFM_). + */ +static int +ethif_set_ifmedia(struct ifdev * ifdev, int ifmedia) +{ + struct ethif *ethif = (struct ethif *)ifdev; + + /* + * We currently completely lack the infrastructure to suspend the + * current IOCTL call until the driver replies (or disappears). + * Therefore we have no choice but to return success here, even if the + * driver cannot accept the change. The driver does notify us of media + * changes, so the user may observe the new active media type later. + * Also note that the new media type may not be the requested type, + * which is why we do not perform any checks against the wanted or + * active media types. + */ + ethif->ethif_wanted.nconf_media = (uint32_t)ifmedia; + ethif->ethif_wanted.nconf_set |= NDEV_SET_MEDIA; + + /* The change will be picked up from the polling function. */ + return OK; +} + +/* + * Enable or disable promiscuous mode on the interface. + */ +static void +ethif_set_promisc(struct ifdev * ifdev, int promisc) +{ + struct ethif *ethif = (struct ethif *)ifdev; + + if (ethif->ethif_wanted.nconf_mode != NDEV_MODE_DOWN) { + if (promisc) + ethif->ethif_wanted.nconf_mode |= NDEV_MODE_PROMISC; + else + ethif->ethif_wanted.nconf_mode &= ~NDEV_MODE_PROMISC; + ethif->ethif_wanted.nconf_set |= NDEV_SET_MODE; + } + + /* The change will be picked up from the polling function. */ +} + +/* + * Set the hardware address on the interface. + */ +static int +ethif_set_hwaddr(struct ifdev * ifdev, const uint8_t * hwaddr) +{ + struct ethif *ethif = (struct ethif *)ifdev; + + if (!(ethif->ethif_caps & NDEV_CAP_HWADDR)) + return EINVAL; + + memcpy(ðif->ethif_wanted.nconf_hwaddr.nhwa_addr, hwaddr, + ETHARP_HWADDR_LEN); + ethif->ethif_wanted.nconf_set |= NDEV_SET_HWADDR; + + /* The change will be picked up from the polling function. */ + return OK; +} + +/* + * Set the Maximum Transmission Unit for this interface. Return TRUE if the + * new value is acceptable, in which case the caller will do the rest. Return + * FALSE otherwise. 
+ */ +static int +ethif_set_mtu(struct ifdev * ifdev __unused, unsigned int mtu) +{ + + return (mtu <= ETHIF_MAX_MTU); +} + +static const struct ifdev_ops ethif_ops = { + .iop_init = ethif_init_netif, + .iop_input = netif_input, + .iop_output = ethif_output, + .iop_output_v4 = etharp_output, + .iop_output_v6 = ethip6_output, + .iop_hdrcmplt = ethif_hdrcmplt, + .iop_poll = ethif_poll, + .iop_set_ifflags = ethif_set_ifflags, + .iop_get_ifcap = ethif_get_ifcap, + .iop_set_ifcap = ethif_set_ifcap, + .iop_get_ifmedia = ethif_get_ifmedia, + .iop_set_ifmedia = ethif_set_ifmedia, + .iop_set_promisc = ethif_set_promisc, + .iop_set_hwaddr = ethif_set_hwaddr, + .iop_set_mtu = ethif_set_mtu, +}; diff --git a/minix/net/lwip/ethif.h b/minix/net/lwip/ethif.h new file mode 100644 index 000000000..8fef8eb20 --- /dev/null +++ b/minix/net/lwip/ethif.h @@ -0,0 +1,24 @@ +#ifndef MINIX_NET_LWIP_ETHIF_H +#define MINIX_NET_LWIP_ETHIF_H + +#include "ndev.h" + +struct ethif; + +void ethif_init(void); + +struct ethif *ethif_add(ndev_id_t id, const char * name, uint32_t caps); +int ethif_enable(struct ethif * ethif, const char * name, + const struct ndev_hwaddr * hwaddr, uint8_t hwaddr_len, uint32_t caps, + uint32_t link, uint32_t media); +void ethif_disable(struct ethif * ethif); +void ethif_remove(struct ethif * ethif); + +void ethif_configured(struct ethif * ethif, int32_t result); +void ethif_sent(struct ethif * ethif, int32_t result); +void ethif_received(struct ethif * ethif, int32_t result); + +void ethif_status(struct ethif * ethif, uint32_t link, uint32_t media, + uint32_t oerror, uint32_t coll, uint32_t ierror, uint32_t iqdrop); + +#endif /* !MINIX_NET_LWIP_ETHIF_H */ diff --git a/minix/net/lwip/ifaddr.c b/minix/net/lwip/ifaddr.c new file mode 100644 index 000000000..17cb6b58b --- /dev/null +++ b/minix/net/lwip/ifaddr.c @@ -0,0 +1,2224 @@ +/* LWIP service - ifaddr.c - network interface address management */ +/* + * This module is an exception to the regular source organization of this + * service, in that it manages part of another module's data structures, namely + * ifdev. As such, it should be seen as logically part of ifdev. It is + * separated only to keep the source code more manageable. Still, this module + * may use direct access only on the address-related fields of the ifdev + * structure, so that those may one day be moved into an ifaddr-specific + * substructure within ifdev. + */ +/* + * We manage three types of addresses here: IPv4 addresses (ifaddr_v4), + * IPv6 addresses (ifaddr_v6), and link-layer a.k.a. MAC addresses (ifaddr_dl). + * + * Managing IPv4 addresses is easy. lwIP supports only one IPv4 address per + * netif. While it would be possible to construct a model where one ifdev + * consists of multiple netifs (with one IPv4 address each), we do not support + * this--mostly because it is a pain to keep state synchronized between the + * netifs in that case. Such support can still be added later; the IPv4 API + * exposed from here does support multiple IPv4 addresses already just in case, + * as does much of the code using the API. + * + * For IPv4 addresses we maintain only one extra piece of information here, + * which is whether an IPv4 address has been set at all. This is because for + * our userland (DHCP clients in particular), we must allow assigning 0.0.0.0 + * as address to an interface. We do not use the lwIP per-netif IPv4 gateway + * field, nor the concept of a "default netif", in both cases because we + * override all (routing) decisions that would use those settings.
lwIP does + * not allow a broadcast address to be set, so support for broadcast addresses + * is botched here: we disregard custom broadcast addresses given to us, and + * instead expose the broadcast address that is used within lwIP. + * + * Managing IPv6 addresses is much more complicated. First of all, even though + * lwIP supports stateless address autoconfiguration (SLAAC) as per RFC 4862, + * we disable that and instead make dhcpcd(8) responsible for all IPv6 address + * configuration. dhcpcd(8) will set addresses and routes as necessary, the + * latter of which are used in lwIP through our routing hooks (in the route + * module). This approach, which is in line with where NetBSD is headed, + * allows us to work around a number of lwIP limitations. As a result we do + * differ in this respect from NetBSD, which may switch between kernel-only, + * dhcpcd-only, and hybrid autoconfiguration, mainly throught the accept_rtadv + * sysctl(7) node. Writing to this node has no real effect on MINIX 3. + * + * All IPv6 addresses have a prefix length, which is almost but not quite the + * same as IPv4's subnet masks (see RFC 5942). We must maintain the per- + * address prefix length ourselves, as lwIP supports IPv6 prefix lengths of 64 + * bits only. Our dhcpcd(8)-based approach allows us to work around that. + * + * All IPv6 addresses also have a state and a lifetime, both of which are + * managed by lwIP. Unlike for IPv4, address-derived routes and routing socket + * messages are only created for addresses that are "valid", which means that + * they are in either PREFERRED or DEPRECATED state. This means that we have + * to be aware of all address state transitions between "valid" and "not + * valid", some of which (namely address duplication detection and lifetime + * expirations) are initiated by lwIP. As such, we need to keep shadow state + * for each address, and use a callback to detect whether state has changed. + * + * For understanding of this module as well as lwIP, it is important to note + * that "valid" is not the opposite of "invalid" in this context: "not valid" + * includes the address states INVALID, DUPLICATED, and TENTATIVE, while + * "invalid"/INVALID simply means that the address slot is free. + * + * Each IPv6 address also has associated flags. We support an AUTOCONF flag + * which indicates that no subnet route should be added for the address; on + * MINIX 3, dhcpcd(8) is modified to pass in that flag when appropriate, thus + * solving a problem that NetBSD suffers from, namely that it does not know + * whether a userland-given route is static (implying a subnet) or auto- + * configured (implying no subnet, again as per RFC 5942), leading to it doing + * the wrong thing in dhcpcd-only autoconfiguration mode. The TEMPORARY flag, + * for privacy addresses (RFC 4941) should be the same as on NetBSD; it is + * currently used only in source address selection (RFC 6724). We override + * lwIP's IPv6 source address selection algorithm to include support for not + * just this flag, but also label and proper longest-common-prefix comparisons. + * Finally, there is an HWBASED flag to make sure that when the link-layer + * address is changed, the IPv6 link-local address is changed accordingly only + * if the previous link-local address was also autogenerated from a link-layer + * address and not set manually by userland. 
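+ * As an illustration of the latter: with the usual EUI-64 construction,
+ * a link-layer address of 00:11:22:33:44:55 would yield the
+ * autogenerated link-local address fe80::211:22ff:fe33:4455, and a later
+ * change of the link-layer address regenerates the link-local address
+ * only as long as the HWBASED flag is still set on it.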
+ * + * Finally, we support multiple link-layer addresses per interface, but only + * because NetBSD's ifconfig(8) uses an API that expects such multi-address + * support. At any time, only one of the addresses is marked as "active", + * which means it is used as MAC address in outgoing packets. We support only + * one MAC address per device driver, so the support for additional, inactive + * link-layer addresses is there exclusively for ifconfig(8) interoperability. + * + * All interfaces, including those that do not have MAC addresses at all (e.g., + * loopback interfaces), do have one link-layer address. This is expected in + * particular by getifaddrs(3), which only recognizes interfaces that have a + * link-layer address. + * + * Many features are still missing here, especially for IP addresses. For + * example, we do not yet support destination addresses at all yet, simply + * because there is no interface type that uses them. For IPv6, more work is + * to be done to support proper netif status transitions versus address states, + * fallout from address duplication, and various ND6_IFF_ flags. + */ + +#include "lwip.h" +#include "rtsock.h" +#include "route.h" + +#include "lwip/etharp.h" + +#include +#include + +/* + * Routing flags for local address and local network routing entries. This + * may later have to be refined, for example in order not to set RTF_CLONING + * for routes on interfaces that do not have link-layer addressing. + * + * IMPORTANT: as of NetBSD 8, RTF_CLONING has been renamed to RTF_CONNECTED. + */ +#define IFADDR_HOST_RTFLAGS (RTF_UP | RTF_HOST | RTF_LOCAL) +#define IFADDR_NET_RTFLAGS (RTF_UP | RTF_CLONING) + +/* Address-related sysctl(7) settings. */ +int ifaddr_auto_linklocal = 1; /* different from NetBSD, see its usage */ +int ifaddr_accept_rtadv = 0; /* settable but completely disregarded */ + +/* + * Initialize the local address administration for an interface that is in the + * process of being created. + */ +void +ifaddr_init(struct ifdev * ifdev) +{ + unsigned int i; + + ifdev->ifdev_v4set = FALSE; + + for (i = 0; i < LWIP_IPV6_NUM_ADDRESSES; i++) + ifdev->ifdev_v6state[i] = IP6_ADDR_INVALID; + + for (i = 0; i < __arraycount(ifdev->ifdev_hwlist); i++) + ifdev->ifdev_hwlist[i].ifhwa_flags = 0; +} + +/* + * Find an IPv4 address locally assigned to a interface. The IPv4 address is + * given as 'addr'. The interface is given as 'ifdev'. On success, return OK, + * with the IPv4 address number stored in 'num'. On failure, return a negative + * error code. + */ +int +ifaddr_v4_find(struct ifdev * ifdev, const struct sockaddr_in * addr, + ifaddr_v4_num_t * num) +{ + ip_addr_t ipaddr; + int r; + + if ((r = addr_get_inet((const struct sockaddr *)addr, sizeof(*addr), + IPADDR_TYPE_V4, &ipaddr, TRUE /*kame*/, NULL /*port*/)) != OK) + return r; + + if (!ifdev->ifdev_v4set || + !ip_addr_cmp(netif_ip_addr4(ifdev_get_netif(ifdev)), &ipaddr)) + return EADDRNOTAVAIL; + + *num = 0; + return OK; +} + +/* + * Enumerate IPv4 addresses locally assigned to the given interface 'ifdev'. + * The caller should set 'nump' to 0 initially, and increase it by one between + * a successful call and the next enumeration call. Return TRUE on success, + * meaning that starting from the given value of 'nump' there is at least one + * IPv4 address, of which the number is stored in 'nump' on return. Return + * FALSE if there are no more IPv4 addresses locally assigned to the interface. 
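+ *
+ * A caller would typically enumerate addresses along these lines (a
+ * sketch only; 'num' is an ifaddr_v4_num_t, and 'addr', 'mask', 'bcast',
+ * and 'dest' are hypothetical, suitably preinitialized sockaddr_in
+ * locals, see ifaddr_v4_get() below):
+ *
+ *	for (num = 0; ifaddr_v4_enum(ifdev, &num); num++)
+ *		(void)ifaddr_v4_get(ifdev, num, &addr, &mask, &bcast,
+ *		    &dest);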
+ */ +int +ifaddr_v4_enum(struct ifdev * ifdev, ifaddr_v4_num_t * num) +{ + + /* + * For now, we support only up to one IPv4 address per interface. That + * address must actually be set if we are to return it. + */ + return (*num == 0 && ifdev->ifdev_v4set); +} + +/* + * Obtain information about the IPv4 address 'num' assigned to the interface + * 'ifdev'. On success, return OK, with the IPv4 address stored in 'addr', the + * network mask stored in 'mask', the broadcast address stored in 'bcast', and the + * destination address stored in 'dest'. Each of these pointers may be NULL. + * The interface may not have a broadcast and/or destination address; in that + * case, their corresponding structures are not filled in at all, and thus must + * be preinitialized by the caller to a default state. The reason for not + * zeroing them is that some callers use the same buffer for both. On failure, + * return a negative error code. + */ +int +ifaddr_v4_get(struct ifdev * ifdev, ifaddr_v4_num_t num, + struct sockaddr_in * addr, struct sockaddr_in * mask, + struct sockaddr_in * bcast, struct sockaddr_in * dest) +{ + const ip_addr_t *ipaddr, *netmask; + struct netif *netif; + ip_addr_t broad; + socklen_t addr_len; + + if (!ifaddr_v4_enum(ifdev, &num)) + return EADDRNOTAVAIL; + + netif = ifdev_get_netif(ifdev); + + if (addr != NULL) { + addr_len = sizeof(*addr); + + addr_put_inet((struct sockaddr *)addr, &addr_len, + netif_ip_addr4(netif), TRUE /*kame*/, 0 /*port*/); + } + + if (mask != NULL) { + addr_len = sizeof(*mask); + + /* + * Do not bother using addr_put_netmask() here, as we would + * then first have to compute the prefix length.. + */ + addr_put_inet((struct sockaddr *)mask, &addr_len, + netif_ip_netmask4(netif), TRUE /*kame*/, 0 /*port*/); + } + + if (bcast != NULL) { + if (netif->flags & NETIF_FLAG_BROADCAST) { + /* Fake a broadcast address. */ + ipaddr = netif_ip_addr4(netif); + netmask = netif_ip_netmask4(netif); + + ip_addr_set_ip4_u32(&broad, + ip_addr_get_ip4_u32(ipaddr) | + ~ip_addr_get_ip4_u32(netmask)); + + addr_len = sizeof(*bcast); + + addr_put_inet((struct sockaddr *)bcast, &addr_len, + &broad, TRUE /*kame*/, 0 /*port*/); + } else { + bcast->sin_len = 0; + bcast->sin_family = AF_UNSPEC; + } + } + + if (dest != NULL) { + /* TODO: dest */ + dest->sin_len = 0; + dest->sin_family = AF_UNSPEC; + } + + return OK; +} + +/* + * Obtain NetBSD-style state flags (IN_IFF_) for the given local IPv4 address. + * The given number must identify an existing address. Return the flags. + */ +int +ifaddr_v4_get_flags(struct ifdev * ifdev, ifaddr_v4_num_t num) +{ + + /* IPv4 per-address flags are not supported yet. */ + return 0; +} + +/* + * Determine whether there should be a local subnet route for the given + * assigned IPv4 address, and if so, compute the subnet mask to add. Return + * TRUE if a local subnet route should be added, and return the network base + * address in 'netbase' and the number of prefix bits in 'prefixp'. Return + * FALSE if no subnet route should be added for the assigned address. + */ +static unsigned int +ifaddr_v4_netroute(struct ifdev * ifdev, ifaddr_v4_num_t num, + ip_addr_t * netbase, unsigned int * prefixp) +{ + const ip_addr_t *ipaddr, *netmask; + unsigned int prefix; + uint32_t val; + + /* Do not add subnet routes for loopback interfaces.
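+ * For other interfaces, the subnet route follows from the assigned
+ * address and netmask: for example, 192.0.2.5 with netmask 255.255.255.0
+ * yields the network base 192.0.2.0 with a prefix length of 24, whereas
+ * a full-width netmask of 255.255.255.255 results in no subnet route at
+ * all (see the /32 check below).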
*/ + if (ifdev_is_loopback(ifdev)) + return FALSE; + + assert(num == 0); + assert(ifdev->ifdev_v4set); + + ipaddr = netif_ip_addr4(ifdev_get_netif(ifdev)); + netmask = netif_ip_netmask4(ifdev_get_netif(ifdev)); + + /* + * If the subnet is a /32, skip adding a local host route: not only + * would it not be useful, it would fail anyway because we currently do + * not support adding a host-type route and a full-width net-type route + * for the same IP address. + */ + if (ip_addr_get_ip4_u32(netmask) == PP_HTONL(0xffffffffUL)) + return FALSE; + + /* Compute the network base address. */ + ip_addr_set_ip4_u32(netbase, + ip_addr_get_ip4_u32(ipaddr) & ip_addr_get_ip4_u32(netmask)); + + /* Find the number of prefix bits of the netmask. TODO: improve.. */ + val = ntohl(ip_addr_get_ip4_u32(netmask)); + + for (prefix = 0; prefix < IP4_BITS; prefix++) + if (!(val & (1 << (IP4_BITS - prefix - 1)))) + break; + + *prefixp = prefix; + return TRUE; +} + +/* + * A local IPv4 address has been added to an interface. The interface is given + * as 'ifdev', and the number of the just-added IPv4 address is given as 'num'. + * Generate a routing socket message and add local routes as appropriate. + */ +static void +ifaddr_v4_added(struct ifdev * ifdev, ifaddr_v4_num_t num) +{ + const ip_addr_t *ipaddr; + ip_addr_t netbase; + unsigned int prefix; + + assert(num == 0); + assert(ifdev->ifdev_v4set); + + /* Report the addition of the interface address. */ + rtsock_msg_addr_v4(ifdev, RTM_NEWADDR, num); + + /* + * Add the local host route. This will always succeed: for addition, + * we just checked with route_can_add(); when updating, we first remove + * the exact same route. For now, we forbid users from messing with + * RTF_LOCAL routes directly, since nothing good (and a whole lot of + * bad) can come out of that, so the routes will not change under us. + * + * Why are we not using lo0 for this route, like the BSDs do? Because + * that approach is not compatible with link-local addresses. Instead, + * we intercept outgoing traffic to the local address, and redirect it + * over lo0, bypassing routing. If we did not do this, we would never + * know the originally intended zone of the outgoing packet. As an + * intended side effect, the traffic does show up on lo0 with BPF, just + * like on BSDs. Similarly, we do not need to set a gateway here. + * + * We currently do not use the routing tables for lookups on local + * addresses - see ifaddr_v6_map() as to why. If we ever do, that adds + * another reason that the interface associated with the route must be + * the interface that owns the address (and not, say, lo0). + */ + ipaddr = netif_ip_addr4(ifdev_get_netif(ifdev)); + + (void)route_add(ipaddr, IP4_BITS, NULL /*gateway*/, ifdev, + IFADDR_HOST_RTFLAGS, NULL /*rtr*/); + + /* + * Add the local network route, if the rules say that we should. Even + * then, adding the route may fail for various reasons, but this route + * is not essential and so we ignore failures here. + */ + if (ifaddr_v4_netroute(ifdev, num, &netbase, &prefix)) + (void)route_add(&netbase, prefix, NULL /*gateway*/, ifdev, + IFADDR_NET_RTFLAGS, NULL /*rtr*/); +} + +/* + * A particular local IPv4 address is being deleted. See if there is another + * local IPv4 address assigned to another interface that should have the same + * local subnet route (but didn't, as such duplicate routes can obviously not + * be added), and if so, readd the route for that other address. 
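+ * For example, if two interfaces are both configured with an address
+ * inside 192.0.2.0/24, only one of them carries the local subnet route;
+ * when the address owning that route is deleted, the same /24 route is
+ * recreated here on behalf of the other interface.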
+ */ +static void +ifaddr_v4_dupcheck(struct ifdev * oifdev, const ip_addr_t * onetbase, + unsigned int oprefix) +{ + struct ifdev *ifdev; + ip_addr_t netbase; + unsigned int prefix; + + for (ifdev = NULL; (ifdev = ifdev_enum(ifdev)) != NULL; ) { + if (ifdev == oifdev || !ifdev->ifdev_v4set) + continue; + + if (ifaddr_v4_netroute(ifdev, (ifaddr_v4_num_t)0, &netbase, + &prefix) && prefix == oprefix && + ip_addr_cmp(&netbase, onetbase)) { + (void)route_add(&netbase, prefix, NULL /*gateway*/, + ifdev, IFADDR_NET_RTFLAGS, NULL /*rtr*/); + + return; + } + } +} + +/* + * A local IPv4 address is about to be deleted from an interface, or the + * interface itself is about to be destroyed. Generate a routing socket + * message about this and delete local routes as appropriate. The interface is + * given as 'ifdev', and the number of the IPv4 address that is about to be + * deleted is given as 'num'. + */ +static void +ifaddr_v4_deleted(struct ifdev * ifdev, ifaddr_v4_num_t num) +{ + struct route_entry *route; + ip_addr_t netbase; + unsigned int prefix; + + assert(num == 0); + assert(ifdev->ifdev_v4set); + + /* Delete the local network route, if we tried adding it at all. */ + if (ifaddr_v4_netroute(ifdev, num, &netbase, &prefix) && + (route = route_find(&netbase, prefix, + FALSE /*is_host*/)) != NULL && + route_get_flags(route) == IFADDR_NET_RTFLAGS) { + route_delete(route, NULL /*rtr*/); + + /* + * Readd the local network route for another interface, if that + * interface has a local address on the very same network. + */ + ifaddr_v4_dupcheck(ifdev, &netbase, prefix); + } + + /* Delete the local host route. */ + if ((route = route_find(netif_ip_addr4(ifdev_get_netif(ifdev)), + IP4_BITS, TRUE /*is_host*/)) != NULL) + route_delete(route, NULL /*rtr*/); + + /* Report the deletion of the interface address. */ + rtsock_msg_addr_v4(ifdev, RTM_DELADDR, num); +} + +/* + * Add or update an IPv4 address on an interface. The interface is given as + * 'ifdev'. The address to add or update is pointed to by 'addr', which must + * always be a pointer to a valid address. For DHCP clients it must be + * possible to add the 'any' address (0.0.0.0). The network mask, broadcast + * address, and destination address parameters 'mask', 'bcast', and 'dest' + * (respectively) may be NULL pointers or pointers to AF_UNSPEC addresses, and + * will be disregarded if they are. If 'mask' and/or 'bcast' are NULL when + * adding an address, default values will be computed for them. The 'flags' + * field may contain NetBSD-style address flags (IN_IFF_). Return OK if the + * address was successfully added or updated, or a negative error code if not. + */ +int +ifaddr_v4_add(struct ifdev * ifdev, const struct sockaddr_in * addr, + const struct sockaddr_in * mask, const struct sockaddr_in * bcast, + const struct sockaddr_in * dest, int flags) +{ + ip_addr_t ipaddr, netmask, broad; + ip4_addr_t ip4zero; + struct netif *netif; + unsigned int dummy; + uint32_t val; + int r; + + assert(addr != NULL); + + if ((r = addr_get_inet((const struct sockaddr *)addr, sizeof(*addr), + IPADDR_TYPE_V4, &ipaddr, TRUE /*kame*/, NULL /*port*/)) != OK) + return r; + + /* Forbid multicast (class D) and experimental (class E) addresses. 
*/ + val = ntohl(ip_addr_get_ip4_u32(&ipaddr)); + + if (ip_addr_ismulticast(&ipaddr) || IP_EXPERIMENTAL(val)) + return EINVAL; + + if (mask != NULL && mask->sin_family != AF_UNSPEC) { + if ((r = addr_get_netmask((const struct sockaddr *)mask, + sizeof(*mask), IPADDR_TYPE_V4, &dummy, &netmask)) != OK) + return r; + } else { + /* + * Generate a netmask based on IP class. Old, obsolete stuff, + * but we can't have no netmask. + */ + if (IN_CLASSA(val)) + ip_addr_set_ip4_u32(&netmask, PP_HTONL(IN_CLASSA_NET)); + else if (IN_CLASSB(val)) + ip_addr_set_ip4_u32(&netmask, PP_HTONL(IN_CLASSB_NET)); + else if (IN_CLASSC(val)) + ip_addr_set_ip4_u32(&netmask, PP_HTONL(IN_CLASSC_NET)); + else /* should not trigger */ + ip_addr_set_ip4_u32(&netmask, PP_HTONL(IN_CLASSD_NET)); + } + + if (bcast != NULL && bcast->sin_family != AF_UNSPEC) { + if ((r = addr_get_inet((const struct sockaddr *)bcast, + sizeof(*bcast), IPADDR_TYPE_V4, &broad, TRUE /*kame*/, + NULL /*port*/)) != OK) + return r; + + /* + * lwIP does not allow setting the broadcast address, so we + * must ensure that the given address is what lwIP uses anyway. + * No need to perform byte order swaps here. + */ + if (ip_addr_get_ip4_u32(&broad) != + (ip_addr_get_ip4_u32(&ipaddr) | + ~ip_addr_get_ip4_u32(&netmask))) + return EINVAL; + } + + /* TODO: dest (note: may be NULL) */ + + /* + * We currently do not support any IPv4 address flags. Even though + * supporting them would make maintaining dhcpcd(8) easier, lwIP does + * not offers the means to implement them properly. + */ + if (flags != 0) + return EINVAL; + + netif = ifdev_get_netif(ifdev); + + /* Should we add a new address, or update an existing one? */ + if (!ifdev->ifdev_v4set || + !ip_addr_cmp(netif_ip_addr4(netif), &ipaddr)) { + /* + * Add a new address. lwIP supports only one IPv4 address per + * netif. + */ + if (ifdev->ifdev_v4set) + return ENOBUFS; /* TODO: a better error code */ + + /* + * It must be possible to add the address to the routing table, + * so make sure that we can add such a route later on. The + * error code should be accurate for most real-world cases. + */ + if (!route_can_add(&ipaddr, IP4_BITS, TRUE /*is_host*/)) + return EEXIST; + + ip4_addr_set_zero(&ip4zero); + + netif_set_addr(netif, ip_2_ip4(&ipaddr), ip_2_ip4(&netmask), + &ip4zero); + + ifdev->ifdev_v4set = TRUE; + } else { + /* + * Update an existing address. First report the address as + * deleted. Do not actually delete the address in netif, + * because that would cause problems with its changing IP + * addresses on existing sockets. + */ + ifaddr_v4_deleted(ifdev, (ifaddr_v4_num_t)0); + + /* Update the one part that may have actually changed. */ + netif_set_netmask(netif, ip_2_ip4(&netmask)); + } + + /* In both cases, we now need to report the address as added. */ + ifaddr_v4_added(ifdev, (ifaddr_v4_num_t)0); + + return OK; +} + +/* + * Delete an IPv4 address from an interface. The given address number 'num' + * must have been obtained from ifaddr_v4_find() or ifaddr_v4_enum() on the + * same interface just before. This function always succeeds. + */ +void +ifaddr_v4_del(struct ifdev * ifdev, ifaddr_v4_num_t num) +{ + ip4_addr_t ip4zero; + + assert(num == 0); + assert(ifdev->ifdev_v4set); + + /* + * Report the address as deleted. Always do this first, because the + * reporting requires that the address is still there. + */ + ifaddr_v4_deleted(ifdev, num); + + /* Then actually delete the address. 
*/ + ip4_addr_set_zero(&ip4zero); + + netif_set_addr(ifdev_get_netif(ifdev), &ip4zero, &ip4zero, &ip4zero); + + ifdev->ifdev_v4set = FALSE; +} + +/* + * Announce all IPv4 addresses associated with the given interface as deleted, + * Used (only) right before the interface is destroyed. + */ +void +ifaddr_v4_clear(struct ifdev * ifdev) +{ + + if (ifdev->ifdev_v4set) + ifaddr_v4_deleted(ifdev, (ifaddr_v4_num_t)0); +} + +/* + * Return the first interface device that owns the given IPv4 address, or NULL + * if it is not a valid local IPv4 address. + */ +struct ifdev * +ifaddr_v4_map_by_addr(const ip4_addr_t * ip4addr) +{ + struct ifdev *ifdev; + + /* + * It would be nice to be able to do a route lookup on an RTF_LOCAL + * entry here, but we do not do this for IPv6 either - see the comment + * in ifaddr_v6_map() - and it is much less needed here, because each + * interface has at most one IPv4 address. + */ + for (ifdev = NULL; (ifdev = ifdev_enum(ifdev)) != NULL; ) { + if (ifdev->ifdev_v4set && + ip4_addr_cmp(netif_ip4_addr(ifdev_get_netif(ifdev)), + ip4addr)) + return ifdev; + } + + return NULL; +} + +/* + * Return the first interface device for which the given IPv4 address is on a + * configured local subnet, or NULL if no match was found. + */ +static struct ifdev * +ifaddr_v4_map_by_subnet(const ip4_addr_t * ip4addr) +{ + struct ifdev *ifdev; + struct netif *netif; + uint32_t addr1, addr2, mask; + + addr1 = ip4_addr_get_u32(ip4addr); + + /* + * Here, we must never do a route lookup, because this routine is used + * for SO_DONTROUTE/MSG_DONTROUTE. + */ + for (ifdev = NULL; (ifdev = ifdev_enum(ifdev)) != NULL; ) { + if (!ifdev->ifdev_v4set) + continue; + + netif = ifdev_get_netif(ifdev); + + addr2 = ip4_addr_get_u32(netif_ip4_addr(netif)); + mask = ip4_addr_get_u32(netif_ip4_netmask(netif)); + + if ((addr1 & mask) == (addr2 & mask)) + return ifdev; + } + + return NULL; +} + +/* + * Return TRUE if the given local IPv6 interface address is valid (= preferred + * or deprecated), or FALSE if it is not (= tentative or duplicated). The + * address slot must be in use, that is, it must not be free (= invalid). + */ +static int +ifaddr_v6_isvalid(struct ifdev * ifdev, ifaddr_v6_num_t num) +{ + int state; + + state = ifdev->ifdev_v6state[num]; + + /* Note that 'valid' and 'invalid' are not each other's inverse! */ + assert(!ip6_addr_isinvalid(state)); + + return ip6_addr_isvalid(state); +} + +/* + * Find an IPv6 address assigned to the given interface that matches the given + * IPv6 address. Return TRUE if a match was found, with its number stored in + * 'nump'. Return FALSE if the address is not assigned to the interface. + */ +static int +ifaddr_v6_match(struct ifdev * ifdev, const ip_addr_t * ipaddr, + ifaddr_v6_num_t * nump) +{ + int8_t i; + + assert(IP_IS_V6(ipaddr)); + + i = netif_get_ip6_addr_match(ifdev_get_netif(ifdev), ip_2_ip6(ipaddr)); + if (i < 0) + return FALSE; + + *nump = i; + return TRUE; +} + +/* + * Find an IPv6 address locally assigned to a interface. The IPv6 address is + * given as 'addr6', and must use KAME-style embedding for zones. The + * interface is given as 'ifdev'. On success, return OK, with the IPv6 address + * number stored in 'num'. On failure, return a negative error code. This + * function also returns tentative and duplicated addresses. 
+ */ +int +ifaddr_v6_find(struct ifdev * ifdev, const struct sockaddr_in6 * addr6, + ifaddr_v6_num_t * nump) +{ + ip_addr_t ipaddr; + int r; + + if ((r = addr_get_inet((const struct sockaddr *)addr6, sizeof(*addr6), + IPADDR_TYPE_V6, &ipaddr, TRUE /*kame*/, NULL /*port*/)) != OK) + return r; + + if (ip6_addr_has_zone(ip_2_ip6(&ipaddr)) && + ip6_addr_zone(ip_2_ip6(&ipaddr)) != ifdev_get_index(ifdev)) + return EADDRNOTAVAIL; + + if (!ifaddr_v6_match(ifdev, &ipaddr, nump)) + return EADDRNOTAVAIL; + + return OK; +} + +/* + * Enumerate IPv6 addresses locally assigned to the given interface 'ifdev'. + * The caller should set 'nump' to 0 initially, and increase it by one between + * a successful call and the next enumeration call. Return TRUE on success, + * meaning that starting from the given value of 'nump' there is at least one + * IPv6 address, of which the number is stored in 'nump' on return. Return + * FALSE if there are no more IPv6 addresses locally assigned to the interface. + * This function also returns tentative and duplicated address entries. + */ +int +ifaddr_v6_enum(struct ifdev * ifdev, ifaddr_v6_num_t * nump) +{ + ifaddr_v6_num_t num; + + for (num = *nump; num < LWIP_IPV6_NUM_ADDRESSES; num++) { + if (!ip6_addr_isinvalid(ifdev->ifdev_v6state[num])) { + *nump = num; + return TRUE; + } + } + + return FALSE; +} + +/* + * Obtain information about the IPv6 address 'num' assigned to the interface + * 'ifdev'. Store the IPv6 address in 'addr6', the network mask in 'mask6', + * and the destination address in 'dest6'. Each of these pointers may be NULL. + * The returned addresses use KAME-style embedding for zones. This function + * also returns tentative and duplicated addresses. It always succeeds. + */ +void +ifaddr_v6_get(struct ifdev * ifdev, ifaddr_v6_num_t num, + struct sockaddr_in6 * addr6, struct sockaddr_in6 * mask6, + struct sockaddr_in6 * dest6) +{ + struct netif *netif; + socklen_t addr_len; + + /* + * Due to route message generation upon address addition and deletion, + * either the ifdev_v6state or the netif state may not yet have been + * updated here. + */ + assert(!ip6_addr_isinvalid(ifdev->ifdev_v6state[num]) || + !ip6_addr_isinvalid(netif_ip6_addr_state(ifdev_get_netif(ifdev), + (int)num))); + + netif = ifdev_get_netif(ifdev); + + if (addr6 != NULL) { + addr_len = sizeof(*addr6); + + (void)addr_put_inet((struct sockaddr *)addr6, &addr_len, + netif_ip_addr6(netif, (int)num), TRUE /*kame*/, + 0 /*port*/); + } + + if (mask6 != NULL) { + addr_len = sizeof(*mask6); + + addr_put_netmask((struct sockaddr *)mask6, &addr_len, + IPADDR_TYPE_V6, ifdev->ifdev_v6prefix[num]); + } + + if (dest6 != NULL) { + /* TODO: dest6 */ + dest6->sin6_len = 0; + dest6->sin6_family = AF_UNSPEC; + } +} + +/* + * Obtain NetBSD-style state flags (IN6_IFF_) for the given local IPv6 address. + * The given number must identify an existing address. Return the flags. 
+ */ +int +ifaddr_v6_get_flags(struct ifdev * ifdev, ifaddr_v6_num_t num) +{ + int state, flags; + + state = ifdev->ifdev_v6state[num]; + + assert(!ip6_addr_isinvalid(state)); + + flags = 0; + if (ip6_addr_isduplicated(state)) + flags |= IN6_IFF_DUPLICATED; + if (ip6_addr_istentative(state)) + flags |= IN6_IFF_TENTATIVE; + if (ip6_addr_isdeprecated(state)) + flags |= IN6_IFF_DEPRECATED; + if (ifdev->ifdev_v6flags[num] & IFADDR_V6F_AUTOCONF) + flags |= IN6_IFF_AUTOCONF; + if (ifdev->ifdev_v6flags[num] & IFADDR_V6F_TEMPORARY) + flags |= IN6_IFF_TEMPORARY; + + return flags; +} + +/* + * Obtain lifetime information about the given local IPv6 address. The given + * 'lifetime' structure is filled as a result. This function always succeeds. + */ +void +ifaddr_v6_get_lifetime(struct ifdev * ifdev, ifaddr_v6_num_t num, + struct in6_addrlifetime * lifetime) +{ + struct netif *netif; + uint32_t valid_life, pref_life; + time_t now; + + assert(!ip6_addr_isinvalid(ifdev->ifdev_v6state[num])); + + netif = ifdev_get_netif(ifdev); + + valid_life = netif_ip6_addr_valid_life(netif, (int)num); + pref_life = netif_ip6_addr_pref_life(netif, (int)num); + + /* + * Represent 'static' as 'infinite' to userland. This applies only to + * link-local addresses, which do not have lifetimes at all. + */ + if (ip6_addr_life_isstatic(valid_life)) { + valid_life = IP6_ADDR_LIFE_INFINITE; + pref_life = IP6_ADDR_LIFE_INFINITE; + } + + now = clock_time(NULL); + + /* + * TODO: the _vltime and _pltime values filled in here are not correct. + * They should be set to the originally assigned values rather than the + * current ones. Getting this right would mean we'd have to save the + * original values. So far it does not look like userland needs that.. + */ + memset(lifetime, 0, sizeof(*lifetime)); + lifetime->ia6t_vltime = valid_life; + lifetime->ia6t_pltime = pref_life; + if (!ip6_addr_life_isinfinite(valid_life)) + lifetime->ia6t_expire = now + valid_life; + if (!ip6_addr_life_isinfinite(pref_life)) + lifetime->ia6t_preferred = now + pref_life; +} + +/* + * Determine whether there should be a local subnet route for the given + * assigned IPv6 address, and if so, compute the subnet mask to add. Return + * TRUE if a local subnet route should be added, and return the network base + * address in 'netbase' and the number of prefix bits in 'prefixp'. Return + * FALSE if no subnet route should be added for the assigned address. + */ +static unsigned int +ifaddr_v6_netroute(struct ifdev * ifdev, ifaddr_v6_num_t num, + ip_addr_t * netbase, unsigned int * prefixp) +{ + const ip_addr_t *ipaddr; + + ipaddr = netif_ip_addr6(ifdev_get_netif(ifdev), (int)num); + + /* + * A local network route should be added only if all of the following + * conditions are met: + * + * 1) The address is not auto-configured. Autoconfigured addresses do + * not have an implied subnet, as explained in RFC 5942. + * Consistency with respect to subnet routes is why we do not allow + * changing the AUTOCONF flag after an address has been added. + * 2) The subnet assignment is not a /128 prefix. Not only would such + * a route not be useful, adding it would fail anyway because we + * currently do not support adding a host-type route and a + * full-width net-type route for the same IP address. + * 3) If the interface is a loopback device, the address is not a link- + * local address. This appears to be what NetBSD does, but + * additional loopback-related exceptions may be needed here. 
+ */ + if ((ifdev->ifdev_v6flags[num] & IFADDR_V6F_AUTOCONF) || + ifdev->ifdev_v6prefix[num] == IP6_BITS || + (ifdev_is_loopback(ifdev) && + ip6_addr_islinklocal(ip_2_ip6(ipaddr)))) + return FALSE; + + addr_normalize(netbase, ipaddr, ifdev->ifdev_v6prefix[num]); + + *prefixp = ifdev->ifdev_v6prefix[num]; + return TRUE; +} + +/* + * A local IPv6 has become valid (preferred or deprecated) after previously + * being invalid (tentative, duplicated, or free). Report the addition of the + * now-usable address, and add appropriate routes to the IPv6 routing table. + * + * This function is *not* called immediately when an address is added, but + * rather when the address becomes valid (meaning it is no longer tentative, + * and thus supposedly collision-free). For that reason, unlike for IPv4, this + * function is only ever called indirectly, through the netif status callback. + */ +static void +ifaddr_v6_added(struct ifdev * ifdev, ifaddr_v6_num_t num) +{ + const ip_addr_t *ipaddr; + ip_addr_t base; + ip6_addr_t *base6; + unsigned int prefix; + + /* Check the netif as ifdev_v6state is not yet updated here. */ + assert(!ip6_addr_isinvalid(netif_ip6_addr_state(ifdev_get_netif(ifdev), + (int)num))); + + /* Report the addition of the interface address. */ + rtsock_msg_addr_v6(ifdev, RTM_NEWADDR, num); + + /* + * Add the local host route. This will always succeed. See the IPv4 + * version of this code for more information. + */ + ipaddr = netif_ip_addr6(ifdev_get_netif(ifdev), (int)num); + + (void)route_add(ipaddr, IP6_BITS, NULL /*gateway*/, ifdev, + IFADDR_HOST_RTFLAGS, NULL /*rtr*/); + + /* + * Add the local network route, if the rules say that we should. Even + * then, adding the route may fail for various reasons, but this route + * is not essential and so we ignore failures here. + */ + if (ifaddr_v6_netroute(ifdev, num, &base, &prefix)) + (void)route_add(&base, prefix, NULL /*gateway*/, ifdev, + IFADDR_NET_RTFLAGS, NULL /*rtr*/); + + /* + * Add the node-local and link-local scope multicast routes. These are + * interface-specific rather than address-specific. They are (re)added + * for every address, and never deleted until interface destruction. + */ + ip_addr_set_zero_ip6(&base); + base6 = ip_2_ip6(&base); + + base6->addr[0] = htonl(0xff010000UL | ifdev_get_index(ifdev)); + + (void)route_add(&base, 32, NULL /*gateway*/, ifdev, IFADDR_NET_RTFLAGS, + NULL /*rtr*/); + + base6->addr[0] = htonl(0xff020000UL | ifdev_get_index(ifdev)); + + (void)route_add(&base, 32, NULL /*gateway*/, ifdev, IFADDR_NET_RTFLAGS, + NULL /*rtr*/); +} + +/* + * A particular local IPv6 address is being deleted. See if there is another + * local IPv6 address assigned that should have the same local subnet route + * (but didn't, as such duplicate routes can obviously not be added), and if + * so, readd the route for that other address, possibly for the same interface. 
+ */ +static void +ifaddr_v6_dupcheck(struct ifdev * oifdev, const ip_addr_t * onetbase, + unsigned int oprefix) +{ + struct ifdev *ifdev; + ip_addr_t netbase; + unsigned int prefix; + ifaddr_v6_num_t num; + + for (ifdev = NULL; (ifdev = ifdev_enum(ifdev)) != NULL; ) { + if (ifdev == oifdev) + continue; + + for (num = 0; num < LWIP_IPV6_NUM_ADDRESSES; num++) { + if (ip6_addr_isinvalid(ifdev->ifdev_v6state[num]) || + !ifaddr_v6_isvalid(ifdev, num)) + continue; + + if (!ifaddr_v6_netroute(ifdev, num, &netbase, &prefix)) + continue; + + if (prefix != oprefix || + !ip_addr_cmp(&netbase, onetbase)) + continue; + + (void)route_add(&netbase, prefix, NULL /*gateway*/, + ifdev, IFADDR_NET_RTFLAGS, NULL /*rtr*/); + + return; + } + } +} + +/* + * A local IPv6 has become invalid (tentative, duplicated, or free) after + * previously being valid (preferred or deprecated). Report the deletion of + * the previously-usable address, and remove previously added routes from the + * IPv6 routing table. + * + * This function is not always called for every deleted address: instead, it is + * called only when the address was previously valid, meaning that + * ifaddr_v6_added() was invoked on it before as well. Unlike for IPv4, this + * function is typically called indirectly, through the netif status callback. + */ +static void +ifaddr_v6_deleted(struct ifdev * ifdev, ifaddr_v6_num_t num) +{ + struct route_entry *route; + const ip_addr_t *ipaddr; + ip_addr_t netbase; + unsigned int prefix; + + assert(!ip6_addr_isinvalid(ifdev->ifdev_v6state[num])); + + ipaddr = netif_ip_addr6(ifdev_get_netif(ifdev), (int)num); + + /* Delete the local network route, if we tried adding it at all. */ + if (ifaddr_v6_netroute(ifdev, num, &netbase, &prefix) && + (route = route_find(&netbase, prefix, + FALSE /*is_host*/)) != NULL && + route_get_flags(route) == IFADDR_NET_RTFLAGS) { + route_delete(route, NULL /*rtr*/); + + /* + * Readd the local network route for another interface, if that + * interface has a local address on the very same network. + * Skip scoped (e.g., link-local) addresses, for which the + * routes are unique anyway. + */ + if (!ip6_addr_has_scope(ip_2_ip6(ipaddr), IP6_UNICAST)) + ifaddr_v6_dupcheck(ifdev, &netbase, prefix); + } + + /* Delete the local host route. */ + if ((route = route_find(ipaddr, IP6_BITS, TRUE /*is_host*/)) != NULL) + route_delete(route, NULL /*rtr*/); + + /* Report the deletion of the interface address. */ + rtsock_msg_addr_v6(ifdev, RTM_DELADDR, num); +} + +/* + * Add or update an IPv6 address on an interface. The interface is given as + * 'ifdev'. The IPv6 address to add or update is pointed to by 'addr6', which + * must always be a pointer to a valid address. The network mask is given as + * 'mask6', but may be NULL when updating an address. The same applies to the + * destination address 'dest6'. The given IPv6 address and destination address + * must use KAME-style embedding for zones. The flags field 'flags' contains + * a set of NetBSD-style address flags (IN6_IFF_). The 'lifetime' parameter + * always points to lifetime information to be set or updated. Return OK if + * the address was successfully added or updated, or a negative error code + * otherwise. 
+ */ +int +ifaddr_v6_add(struct ifdev * ifdev, const struct sockaddr_in6 * addr6, + const struct sockaddr_in6 * mask6, const struct sockaddr_in6 * dest6, + int flags, const struct in6_addrlifetime * lifetime) +{ + ip_addr_t ipaddr; + ip6_addr_t *ip6addr; + struct netif *netif; + unsigned int prefix; + ifaddr_v6_num_t num; + uint32_t valid_life; + int r, state; + + netif = ifdev_get_netif(ifdev); + + /* + * Somewhat curiously, NetBSD ignores the zone ID for these requests, + * rather than rejecting requests with a zone ID that does not match + * the associated interface's. We have no reason to be stricter, and + * so we overwrite whatever zone was given.. + */ + if ((r = addr_get_inet((const struct sockaddr *)addr6, sizeof(*addr6), + IPADDR_TYPE_V6, &ipaddr, TRUE /*kame*/, NULL /*port*/)) != OK) + return r; + + /* + * Forbid locally-assigned multicast addresses. Not only are those + * absolutely disallowed in theory, we also assume all locally assigned + * addresses are unicast in various places in practice. + */ + if (ip_addr_ismulticast(&ipaddr)) + return EINVAL; + + ip6_addr_assign_zone(ip_2_ip6(&ipaddr), IP6_UNICAST, netif); + + /* + * The netmask needs to be there only when adding a new address, but if + * a netmask is given, it must be valid. Note that lwIP itself + * supports only /64 subnets; however, due to our custom routing hooks, + * combined with giving lifetimes to all addresses (except the primary + * link-local address, which is a /64), we control all routing + * decisions that would otherwise be affected by that lwIP limitation. + */ + if (mask6 != NULL && mask6->sin6_family != AF_UNSPEC) { + if ((r = addr_get_netmask((const struct sockaddr *)mask6, + sizeof(*mask6), IPADDR_TYPE_V6, &prefix, + NULL /*ipaddr*/)) != OK) + return r; + } else + prefix = 0; + + /* TODO: dest6 (note: may be NULL) */ + + /* TODO: support for IN6_IFF_ANYCAST and IN6_IFF_DETACHED. */ + if (flags & ~(IN6_IFF_TENTATIVE | IN6_IFF_DEPRECATED | IN6_IFF_NODAD | + IN6_IFF_AUTOCONF | IN6_IFF_TEMPORARY)) + return EINVAL; + + /* Should we add a new address, or update an existing one? */ + ip6addr = ip_2_ip6(&ipaddr); + + if (!ifaddr_v6_match(ifdev, &ipaddr, &num)) { + /* Add a new address. */ + if (prefix == 0) + return EINVAL; + + /* + * It must be possible to add the address to the routing table, + * so make sure that we can add such a route later on. The + * error code should be accurate for most real-world cases. + */ + if (!route_can_add(&ipaddr, IP6_BITS, TRUE /*is_host*/)) + return EEXIST; + + /* + * As an exception, if the given address is a link-local + * address and there is no link-local address in slot 0, use + * slot 0 to store this address. This requires a /64 prefix + * length, because lwIP will use an implied /64 subnet for it. + */ + if (ip6_addr_isinvalid(ifdev->ifdev_v6state[0]) && + ip6_addr_islinklocal(ip6addr) && prefix == 64) { + num = (ifaddr_v6_num_t)0; + + /* + * Such link-local addresses are not considered to be + * autoconfigured, because they always have an implied + * subnet. Therefore, clear that flag. + */ + flags &= ~IN6_IFF_AUTOCONF; + } else { + /* + * Find a free slot. We bypass netif_ip6_addr_add() as + * it makes things more, rather than less, complicated + * for us here. 
*/
+			for (num = 1; num < LWIP_IPV6_NUM_ADDRESSES; num++) {
+				state = ifdev->ifdev_v6state[num];
+
+				if (ip6_addr_isinvalid(state))
+					break;
+			}
+
+			if (num == LWIP_IPV6_NUM_ADDRESSES)
+				return ENOBUFS;	/* TODO: a better error code */
+		}
+
+		assert(ip6_addr_isinvalid(netif_ip6_addr_state(netif, num)));
+
+		/*
+		 * We bypass the standard netif IPv6 address assignment
+		 * functions here, because we may want to change the state of
+		 * the address to something particular (rather than always
+		 * tentative) and set the state only when we're otherwise done.
+		 */
+		netif->ip6_addr[num] = ipaddr;
+
+		ifdev->ifdev_v6prefix[num] = prefix;
+
+		/*
+		 * New addresses are always DAD-tested for collisions first,
+		 * except on loopback interfaces, which would simply get back
+		 * their own DAD requests and conclude there is a collision..
+		 */
+		if (flags & IN6_IFF_TENTATIVE)
+			state = IP6_ADDR_TENTATIVE;
+		else if (flags & IN6_IFF_DEPRECATED)
+			state = IP6_ADDR_VALID;
+		else if (ifdev_is_loopback(ifdev) || (flags & IN6_IFF_NODAD))
+			state = IP6_ADDR_PREFERRED;
+		else
+			state = IP6_ADDR_TENTATIVE;
+
+		ifdev->ifdev_v6flags[num] = 0;
+		if (flags & IN6_IFF_AUTOCONF)
+			ifdev->ifdev_v6flags[num] |= IFADDR_V6F_AUTOCONF;
+		if (flags & IN6_IFF_TEMPORARY)
+			ifdev->ifdev_v6flags[num] |= IFADDR_V6F_TEMPORARY;
+
+		/* Precompute the address scope as well. */
+		ifdev->ifdev_v6scope[num] =
+		    addrpol_get_scope(&ipaddr, TRUE /*is_src*/);
+	} else {
+		/*
+		 * Update an existing address.  Since no fundamental aspects
+		 * of the address may change, we also do not need to delete
+		 * and readd the address here.
+		 */
+		if (prefix != 0 && prefix != ifdev->ifdev_v6prefix[num])
+			return EINVAL;
+
+		/* TODO: figure out exactly what userland wants here.. */
+		if (flags & IN6_IFF_TENTATIVE)
+			state = IP6_ADDR_TENTATIVE;
+		else if (flags & IN6_IFF_DEPRECATED)
+			state = IP6_ADDR_VALID;
+		else
+			state = IP6_ADDR_PREFERRED;
+
+		/*
+		 * Leave the AUTOCONF flag as is, because otherwise we might
+		 * also have to add or delete a subnet route here.
+		 */
+		if (flags & IN6_IFF_TEMPORARY)
+			ifdev->ifdev_v6flags[num] |= IFADDR_V6F_TEMPORARY;
+		else
+			ifdev->ifdev_v6flags[num] &= ~IFADDR_V6F_TEMPORARY;
+	}
+
+	/*
+	 * In our implementation, all addresses except the first link-local
+	 * address (which is always stored in slot 0) have a lifetime and are
+	 * thus not static as far as lwIP is concerned.  The result is that all
+	 * those addresses are considered to be /128 assignments, leaving the
+	 * routing decisions entirely to us, which is exactly what we want.  As
+	 * such we have to be careful not to assign a valid lifetime of 0
+	 * ("static").  For preferred lifetimes, 0 is not a special value,
+	 * though.  Either value may be 0xffffffff, which denotes "infinite".
+	 *
+	 * As for those routing decisions: we use the AUTOCONF flag as the
+	 * indication of whether or not to add a subnet (= on-link prefix)
+	 * route for the address.  See also ifaddr_v6_added().
+	 */
+	if (num != 0) {
+		valid_life = lifetime->ia6t_vltime;
+		if (ip6_addr_life_isstatic(valid_life))
+			valid_life++;
+		netif_ip6_addr_set_valid_life(netif, (int)num, valid_life);
+		netif_ip6_addr_set_pref_life(netif, (int)num,
+		    lifetime->ia6t_pltime);
+	}
+
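+	/*
+	 * For illustration: a request with ia6t_vltime = 3600 and
+	 * ia6t_pltime = 1800 leaves the address usable for an hour and
+	 * preferred for half an hour, while a value of 0xffffffff keeps the
+	 * corresponding expiry time reported by ifaddr_v6_get_lifetime() at
+	 * zero ("never").  A valid lifetime of exactly 0 would be taken by
+	 * lwIP to mean "static", which is why it is bumped to 1 above.
+	 */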
+	/*
+	 * The lifetime of address slot 0 is initialized to, and remains at all
+	 * times, zero ("static").  All other slots have an actual lifetime.
+	 */
+	assert(netif_ip6_addr_isstatic(netif, (int)num) == !num);
+
+	/*
+	 * Change the address state last, as this may immediately trigger
+	 * reports and route addition etc, although usually it will not:
+	 * addresses are typically added as tentative, and ifaddr_v6_added()
+	 * will be called only once the address is valid.
+	 */
+	netif_ip6_addr_set_state(netif, (int)num, state);
+
+	return OK;
+}
+
+/*
+ * Delete an IPv6 address from an interface.  The given address number must
+ * have been obtained through ifaddr_v6_find() or ifaddr_v6_enum().
+ * This function always succeeds.
+ */
+void
+ifaddr_v6_del(struct ifdev * ifdev, ifaddr_v6_num_t num)
+{
+
+	assert(num < LWIP_IPV6_NUM_ADDRESSES);
+	assert(!ip6_addr_isinvalid(ifdev->ifdev_v6state[num]));
+
+	/* The state change will also trigger ifaddr_v6_deleted() if needed. */
+	netif_ip6_addr_set_state(ifdev_get_netif(ifdev), (int)num,
+	    IP6_ADDR_INVALID);
+}
+
+/*
+ * Announce all IPv6 addresses associated with the given interface as deleted.
+ * Used (only) right before the interface is destroyed.
+ */
+void
+ifaddr_v6_clear(struct ifdev * ifdev)
+{
+	ifaddr_v6_num_t num;
+
+	for (num = 0; ifaddr_v6_enum(ifdev, &num); num++) {
+		if (ifaddr_v6_isvalid(ifdev, num))
+			ifaddr_v6_deleted(ifdev, num);
+	}
+}
+
+/*
+ * Check state changes on local IPv6 addresses and update shadow state
+ * accordingly.
+ */
+void
+ifaddr_v6_check(struct ifdev * ifdev)
+{
+	struct netif *netif;
+	ifaddr_v6_num_t num;
+	int old_state, new_state, was_valid, is_valid;
+
+	netif = ifdev_get_netif(ifdev);
+
+	for (num = 0; num < LWIP_IPV6_NUM_ADDRESSES; num++) {
+		/*
+		 * Since we compile lwIP without support for stateless
+		 * autoconfiguration, there will be no cases where new
+		 * addresses appear out of nowhere.  As such, we can rely on
+		 * all necessary fields already being initialized here.
+		 */
+		old_state = ifdev->ifdev_v6state[num];
+		new_state = netif_ip6_addr_state(netif, num);
+
+		if (old_state == new_state)
+			continue;
+
+		was_valid = ip6_addr_isvalid(old_state);
+		is_valid = ip6_addr_isvalid(new_state);
+
+		if (was_valid != is_valid) {
+			if (is_valid)
+				ifaddr_v6_added(ifdev, num);
+			else
+				ifaddr_v6_deleted(ifdev, num);
+		}
+
+		ifdev->ifdev_v6state[num] = new_state;
+
+		/*
+		 * TODO: implement the requirements for dealing with duplicated
+		 * addresses, in particular the link-local address, as
+		 * specified by RFC 4862 Sec. 5.4.5.  NetBSD uses the
+		 * ND6_IFF_IFDISABLED flag for this, essentially disabling
+		 * the interface completely when that flag is set.
+		 */
+	}
+}
+
+/*
+ * A change in the interface and/or link status has resulted in both now being
+ * up.  Set the link-local address, if any, to tentative state.  Exempt
+ * loopback interfaces, which would just see their own requests as collisions.
+ *
+ * TODO: the current implementation is the absolute minimum required for
+ * dhcpcd(8) to function somewhat properly, but there is much more to be
+ * decided and done when it comes to dealing with status changes..
+ */
+void
+ifaddr_v6_set_up(struct ifdev * ifdev)
+{
+
+	if (!ifdev_is_loopback(ifdev) &&
+	    !ip6_addr_isinvalid(ifdev->ifdev_v6state[0]))
+		netif_ip6_addr_set_state(ifdev_get_netif(ifdev), 0,
+		    IP6_ADDR_TENTATIVE);
+}
+
+/*
+ * Check whether all conditions are met for (re)assigning a link-local IPv6
+ * address, and if so, do just that.
+ */
+void
+ifaddr_v6_set_linklocal(struct ifdev * ifdev)
+{
+
+	/*
+	 * A few conditions must be met for link-local address assignment.
+ * First of all, link-local address assignment must be enabled both + * globally and on the interface. The BSDs use the global setting as + * an initial value for the link-local setting, but if we do this, it + * would basically be impossible to change the global setting and have + * any effect. Thus, we use the global setting as an additional + * requirement, with as reasoning that people will typically disable + * the global setting in order to assign no IPv6 addresses at all. + */ + if (!(ifdev_get_nd6flags(ifdev) & ND6_IFF_AUTO_LINKLOCAL) || + !ifaddr_auto_linklocal) + return; + + /* + * Second, the interface must be up. This is an artificial requirement + * that allows for the above settings to be changed at all: if we + * assigned a link-local address as soon as we could (see below), this + * would leave virtually no opportunity to change the settings. Once + * assigned, a link-local address is never removed automatically. + */ + if (!ifdev_is_up(ifdev)) + return; + + /* + * A proper (48-bit) hardware address must be set. Interfaces without + * hardware addresses (e.g., loopback devices) do not have this kind of + * auto-assignment. It may take a while for the driver to get back to + * us with its initial hardware address, so wait for at least that. + * Also update the link-local address upon subsequent (user-initiated) + * changes to the hardware address, as long as if the IPv6 address has + * not been overridden by userland by then. + */ + if (ifdev_get_hwlen(ifdev) != ETHARP_HWADDR_LEN || + !(ifdev->ifdev_hwlist[0].ifhwa_flags & IFHWAF_VALID)) + return; + + if (!ip6_addr_isinvalid(ifdev->ifdev_v6state[0]) && + (ifdev->ifdev_v6flags[0] & IFADDR_V6F_HWBASED)) + return; + + /* + * All conditions are met. Set or replace the interface's IPv6 + * link-local address. This uses the first IPv6 address slot, which + * will be skipped when adding non-link-local addresses. We first + * delete the old address if any, in order to force invalidation of + * bound sockets, because setting the new address does not (currently) + * consider sockets. + */ + if (!ip6_addr_isinvalid(ifdev->ifdev_v6state[0])) + ifaddr_v6_del(ifdev, (ifaddr_v6_num_t)0); + +#ifdef INET6 + ifdev->ifdev_v6flags[0] = IFADDR_V6F_HWBASED; + ifdev->ifdev_v6prefix[0] = 64; + netif_create_ip6_linklocal_address(ifdev_get_netif(ifdev), + 1 /*from_mac_48bit*/); + assert(!ip6_addr_isinvalid(ifdev->ifdev_v6state[0])); + + ifdev->ifdev_v6scope[0] = + addrpol_get_scope(netif_ip_addr6(ifdev_get_netif(ifdev), 0), + TRUE /*is_src*/); +#endif /* INET6 */ +} + +/* + * Return the first interface device that owns the given (non-any) IPv6 + * address, or NULL if it is not a valid local IPv6 address. Addresses that + * exist but are not usable ("usually assigned" in the RFC4862 sense) are + * considered not valid in this context. + */ +struct ifdev * +ifaddr_v6_map_by_addr(const ip6_addr_t * ip6addr) +{ + struct ifdev *ifdev; + struct netif *netif; + ifaddr_v6_num_t num; + + /* + * It would be nice to be able to do a route lookup on an RTF_LOCAL + * entry here, but this approach would currently have two problems. + * + * 1) link-local addresses would require a lookup with a different + * embedded zone for each possible interface, requiring a loop over + * all interfaces after all; we could do a route lookup for global + * addresses only, but then there's also the issue that.. 
+ * 2) once we get the interface from the route, we still have to check + * check the state of the address, as done below, and that requires + * us to go through all the interface addresses after all; we could + * embed the local address number in the RTF_LOCAL routing entry but + * that would get rather messy API-wise. + * + * Still, if it turns out that this function is a bottleneck, the above + * workarounds should offer a way forward for the common case. + */ + for (ifdev = NULL; (ifdev = ifdev_enum(ifdev)) != NULL; ) { + netif = ifdev_get_netif(ifdev); + + for (num = 0; num < LWIP_IPV6_NUM_ADDRESSES; num++) { + if (ip6_addr_isinvalid(ifdev->ifdev_v6state[num])) + continue; + + /* + * An address may be used as a local address only if it + * is preferred or deprecated, not if it is tentative + * or duplicated. + */ + if (!ifaddr_v6_isvalid(ifdev, num)) + continue; + + /* + * Ignore the zone if the given address does not have + * one set. Otherwise, the zone must match. + */ + if (ip6_addr_cmp_zoneless(netif_ip6_addr(netif, num), + ip6addr) && (!ip6_addr_has_zone(ip6addr) || + ip6_addr_test_zone(ip6addr, netif))) + return ifdev; + } + } + + return NULL; +} + +/* + * Return the first interface device for which the given IPv6 address is on a + * configured local subnet, or NULL if no match was found. + */ +static struct ifdev * +ifaddr_v6_map_by_subnet(const ip_addr_t * ipaddr) +{ + const ip_addr_t *addr; + struct ifdev *ifdev; + struct netif *netif; + ifaddr_v6_num_t num; + unsigned int prefix; + + assert(IP_IS_V6(ipaddr)); + + for (ifdev = NULL; (ifdev = ifdev_enum(ifdev)) != NULL; ) { + netif = ifdev_get_netif(ifdev); + + if (ip6_addr_has_zone(ip_2_ip6(ipaddr)) && + !ip6_addr_test_zone(ip_2_ip6(ipaddr), netif)) + continue; + + for (num = 0; num < LWIP_IPV6_NUM_ADDRESSES; num++) { + if (ip6_addr_isinvalid(ifdev->ifdev_v6state[num])) + continue; + + if (!ifaddr_v6_isvalid(ifdev, num)) + continue; + + addr = netif_ip_addr6(netif, num); + + /* + * For addresses with no implied subnet, check against + * the full address, so as to match only that address. + */ + if (ifdev->ifdev_v6flags[num] & IFADDR_V6F_AUTOCONF) + prefix = IP6_BITS; + else + prefix = ifdev->ifdev_v6prefix[num]; + + if (addr_get_common_bits(ipaddr, addr, prefix) == + prefix) + return ifdev; + } + } + + return NULL; +} + +/* + * Select an IPv6 source address for communication to the given destination + * address on the given interface. Return the selected source address, or NULL + * if no appropriate source address could be found. This function implements + * RFC 6724 Sec. 5, and is very close to a drop-in replacement for lwIP's own + * ip6_select_source_address() function. We can do a slightly better job + * because we have more information (for Rules 6 and 7) and can offer a more + * complete, less lightweight implementation (for Rule 8). + * + * In summary, this is the implementation status of the rules: + * + * - Rules 1, 2, 3: fully implemented + * - Rules 4, 5, 5.5: not applicable + * - Rules 6, 7, 8: fully implemented + * + * Note that for rule 2, scope decisions are left to the addrpol module, which + * makes a deliberate exception from the RFC for Unique-Local Addresses. + * + * The given destination address may not be properly zoned. 
+ */ +static const ip_addr_t * +ifaddr_v6_select(struct ifdev * ifdev, const ip_addr_t * dest_addr) +{ + const ip_addr_t *cand_addr, *best_addr; + int dest_scope, cand_scope, best_scope; + int dest_label, cand_label, best_label = 0 /*gcc*/; + uint8_t cand_pref, best_pref = 0 /*gcc*/; + uint8_t cand_temp, best_temp = 0 /*gcc*/; + int cand_bits, best_bits = 0 /*gcc*/; + ifaddr_v6_num_t num, best_num; + + assert(ifdev != NULL); + assert(IP_IS_V6(dest_addr)); + + dest_scope = addrpol_get_scope(dest_addr, FALSE /*is_src*/); + dest_label = -1; /* obtain only when necessary */ + + best_addr = NULL; + best_num = -1; + + for (num = 0; num < LWIP_IPV6_NUM_ADDRESSES; num++) { + /* Consider only valid (preferred and deprecated) addresses. */ + if (!ip6_addr_isvalid(ifdev->ifdev_v6state[num])) + continue; + + cand_addr = netif_ip_addr6(ifdev_get_netif(ifdev), (int)num); + + /* Rule 1 */ + if (ip6_addr_cmp_zoneless(ip_2_ip6(cand_addr), + ip_2_ip6(dest_addr))) + return cand_addr; + + cand_scope = ifdev->ifdev_v6scope[num]; + cand_pref = ip6_addr_ispreferred(ifdev->ifdev_v6state[num]); + cand_temp = (ifdev->ifdev_v6flags[num] & IFADDR_V6F_TEMPORARY); + cand_label = -1; + cand_bits = -1; + + /* + * The following monster of an if-condition relies on order of + * evaluation to obtain the more expensive-to-compute values + * only when strictly necessary. We use a shortcut for Rule 6: + * labels are computed based on longest matching prefix, so if + * Rule 6 prefers the candidate address, Rule 8 would have + * preferred the candidate address as well. Therefore, skip + * even computing labels when Rule 7 would not prefer either + * address, i.e. the "temporary" state of the candidate and the + * best address are equal. For complete ties (which exist, + * because Rule 8 - longest common prefix - checks up to the + * subnet size), as "policy" we always pick the first address. + */ +#define ADDRPOL_GET_LABEL(addr, label) \ + (label != -1 || (label = addrpol_get_label(addr), 1)) +#define ADDR_GET_COMMON_BITS(addr1, addr2, num, bits) \ + (bits != -1 || (bits = (int) \ + addr_get_common_bits(addr1, addr2, ifdev->ifdev_v6prefix[num]), 1)) + + if (best_addr == NULL || /* no alternative yet */ + /* Rule 2 */ + (cand_scope < best_scope && cand_scope >= dest_scope) || + (cand_scope > best_scope && best_scope < dest_scope) || + (cand_scope == best_scope && + /* Rule 3 */ + (cand_pref > best_pref || (cand_pref == best_pref && + /* Rule 6 */ + ((cand_temp != best_temp && /* shortcut, part 1 */ + ADDRPOL_GET_LABEL(dest_addr, dest_label) && + ADDRPOL_GET_LABEL(cand_addr, cand_label) && + ADDRPOL_GET_LABEL(best_addr, best_label) && + cand_label == dest_label && best_label != dest_label) || + ((cand_temp == best_temp || /* shortcut, part 2 */ + ((cand_label == dest_label) == + (best_label == dest_label))) && + /* Rule 7 */ + (cand_temp > best_temp || (cand_temp == best_temp && + /* Rule 8 */ + ADDR_GET_COMMON_BITS(cand_addr, dest_addr, num, + cand_bits) && + ADDR_GET_COMMON_BITS(best_addr, dest_addr, best_num, + best_bits) && + cand_bits > best_bits)))))))) { + /* We found a new "winning" candidate. */ + best_addr = cand_addr; + best_scope = cand_scope; + best_pref = cand_pref; + best_temp = cand_temp; + best_label = cand_label; + best_bits = cand_bits; + best_num = num; + } + } + + /* Return the best candidate, if any. */ + return best_addr; +} + +/* + * Pick an IPv6 source address locally assigned to the given interface, for use + * with the given IPv6 destination address. 
See ifaddr_v6_select() on why we
+ * override lwIP's version of this function.
+ *
+ * This is a full replacement of the corresponding lwIP function, which should
+ * be overridden with weak symbols, using patches against the lwIP source code.
+ * As such, the lwIP headers should already provide the correct prototype for
+ * this function.  If not, something will have changed in the lwIP
+ * implementation, and this code must be revised accordingly.
+ *
+ * Important: there are currently no tests that will detect that overriding is
+ * broken, since our test code (necessarily) uses the code path that calls
+ * ifaddr_v6_select() directly, even though there are other places in the lwIP
+ * source code that explicitly call this function.
+ */
+const ip_addr_t *
+ip6_select_source_address(struct netif * netif, const ip6_addr_t * dest_addr)
+{
+	ip_addr_t ipaddr;
+
+	ip_addr_copy_from_ip6(ipaddr, *dest_addr);
+
+	return ifaddr_v6_select(netif_get_ifdev(netif), &ipaddr);
+}
+
+/*
+ * Find and return the interface to which the given address is assigned as a
+ * local (source) address, or NULL if the given address is not a local address
+ * for any interface.  The 'any' address as well as IPv4-mapped IPv6 addresses
+ * are not supported and will yield NULL.
+ */
+struct ifdev *
+ifaddr_map_by_addr(const ip_addr_t * ipaddr)
+{
+
+	switch (IP_GET_TYPE(ipaddr)) {
+	case IPADDR_TYPE_V4:
+		return ifaddr_v4_map_by_addr(ip_2_ip4(ipaddr));
+
+	case IPADDR_TYPE_V6:
+		if (ip6_addr_isipv4mappedipv6(ip_2_ip6(ipaddr)))
+			return NULL;
+
+		return ifaddr_v6_map_by_addr(ip_2_ip6(ipaddr));
+
+	case IPADDR_TYPE_ANY:
+		return NULL;
+
+	default:
+		panic("unknown IP address type: %u", IP_GET_TYPE(ipaddr));
+	}
+}
+
+/*
+ * Find and return an interface that has a local network configured that
+ * contains the given address, or NULL if there is no match.  If there are
+ * multiple matches, an arbitrary one is returned.  The 'any' address as well
+ * as IPv4-mapped IPv6 addresses are not supported and will yield NULL.
+ */
+struct ifdev *
+ifaddr_map_by_subnet(const ip_addr_t * ipaddr)
+{
+
+	switch (IP_GET_TYPE(ipaddr)) {
+	case IPADDR_TYPE_V4:
+		return ifaddr_v4_map_by_subnet(ip_2_ip4(ipaddr));
+
+	case IPADDR_TYPE_V6:
+		if (ip6_addr_isipv4mappedipv6(ip_2_ip6(ipaddr)))
+			return NULL;
+
+		return ifaddr_v6_map_by_subnet(ipaddr);
+
+	case IPADDR_TYPE_ANY:
+		return NULL;
+
+	default:
+		panic("unknown IP address type: %u", IP_GET_TYPE(ipaddr));
+	}
+}
+
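The mapping helpers above and ifaddr_select(), defined just below, are the building blocks a caller would presumably use to obtain a source address for traffic on a not-yet-bound socket. A minimal usage sketch follows; the helper name and the EHOSTUNREACH error choice are illustrative only and not part of this commit.

static int
example_pick_source(const ip_addr_t * dst_addr, ip_addr_t * src_addr)
{
	struct ifdev *ifdev;
	const ip_addr_t *addr;

	/* Resolve both the outgoing interface and a suitable source. */
	if ((addr = ifaddr_select(dst_addr, NULL /*ifdev*/, &ifdev)) == NULL)
		return EHOSTUNREACH;	/* no route, or no usable address */

	*src_addr = *addr;
	return OK;
}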
+/*
+ * Select a local address to use as source address for the given destination
+ * address.  If 'ifdev' is not NULL, it points to the interface from which to
+ * select a source address.  If 'ifdev' is NULL, this function will attempt to
+ * select an interface as well.  On success, return the selected source
+ * address, and if 'ifdevp' is not NULL, store the selected interface in it.
+ * On failure, return NULL.
+ */
+const ip_addr_t *
+ifaddr_select(const ip_addr_t * dst_addr, struct ifdev * ifdev,
+	struct ifdev ** ifdevp)
+{
+	struct route_entry *route;
+	const ip6_addr_t *ip6addr;
+
+	/*
+	 * If no interface is provided yet, start by determining the interface.
+	 * If the destination address has a zone, this step is easy.  Otherwise
+	 * we have to do a routing query on the destination address.
+	 */
+	if (ifdev == NULL) {
+		ip6addr = ip_2_ip6(dst_addr);
+
+		if (IP_IS_V6(dst_addr) && ip6_addr_has_zone(ip6addr)) {
+			ifdev = ifdev_get_by_index(ip6_addr_zone(ip6addr));
+
+			if (ifdev == NULL)
+				return NULL;
+		} else {
+			if ((route = route_lookup(dst_addr)) == NULL)
+				return NULL;
+
+			ifdev = route_get_ifdev(route);
+		}
+	}
+
+	if (ifdevp != NULL)
+		*ifdevp = ifdev;
+
+	/*
+	 * We have found an interface.  Now select an IP address assigned to
+	 * that interface.  For IPv4, this is easy: each interface has only one
+	 * local address (if that).  For IPv6, we may have to select one of the
+	 * locally assigned addresses: global, link-local, etc.
+	 */
+	switch (IP_GET_TYPE(dst_addr)) {
+	case IPADDR_TYPE_V4:
+		/* Use the IPv4 source address if one is set at all. */
+		if (!ifdev->ifdev_v4set)
+			return NULL;
+
+		return netif_ip_addr4(ifdev_get_netif(ifdev));
+
+	case IPADDR_TYPE_V6:
+		return ifaddr_v6_select(ifdev, dst_addr);
+
+	default:
+		panic("unknown IP address type: %u", IP_GET_TYPE(dst_addr));
+	}
+}
+
+/*
+ * Check the given IPv6 address for a zone violation against the given
+ * interface--that is, a scoped address leaving its original zone if used in
+ * the context of the interface.  Return TRUE if the address is zone-
+ * incompatible with the interface, and thus must not be used in packets sent
+ * to that interface.  Return FALSE if there is no such zone incompatibility.
+ */
+int
+ifaddr_is_zone_mismatch(const ip6_addr_t * ipaddr, struct ifdev * ifdev)
+{
+
+	/*
+	 * The IPv6 loopback address (::1) has an implicit link-local scope,
+	 * with a zone corresponding to the interface it is assigned to.  We
+	 * take a shortcut by assuming that the loopback address is assigned to
+	 * the primary loopback interface.
+	 */
+	if (ip6_addr_isloopback(ipaddr))
+		return (ifdev != ifdev_get_loopback());
+
+	/* Zoned addresses must not leave their zone. */
+	if (ip6_addr_has_zone(ipaddr))
+		return !ip6_addr_test_zone(ipaddr, ifdev_get_netif(ifdev));
+
+	return FALSE;
+}
+
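Several functions in this file take or return sockaddrs that "use KAME-style embedding for zones". As a rough sketch of what that means (illustrative only, not code from this commit): for a scoped address such as a link-local fe80::1 with zone (interface index) 2, the zone is carried in bytes 2 and 3 of the 128-bit address itself rather than in sin6_scope_id, so the embedded form reads fe80:2::1.

static void
example_kame_embed(struct in6_addr * in6, uint32_t zone)
{

	/* fe80::1 with zone 2 becomes fe80:2::1 in the embedded form. */
	in6->s6_addr[2] = (zone >> 8) & 0xff;
	in6->s6_addr[3] = zone & 0xff;
}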
+/*
+ * Find a data link (hardware) address locally assigned to an interface.  The
+ * address is given as 'addr', and the length of the memory area that contains
+ * 'addr' is given as 'addr_len'.  The interface is given as 'ifdev'.  On
+ * success, return OK, with the data link address number stored in 'num'.  For
+ * interfaces that do not support hardware addresses, if the given address
+ * provides a zero-length hardware address, always return successfully with 0
+ * stored in 'nump'.  On failure, return a negative error code.
+ */
+int
+ifaddr_dl_find(struct ifdev * ifdev, const struct sockaddr_dlx * addr,
+	socklen_t addr_len, ifaddr_dl_num_t * nump)
+{
+	uint8_t hwaddr[NETIF_MAX_HWADDR_LEN];
+	ifaddr_dl_num_t num;
+	int r;
+
+	if ((r = addr_get_link((const struct sockaddr *)addr, addr_len,
+	    NULL /*name*/, 0 /*name_max*/, hwaddr,
+	    ifdev_get_hwlen(ifdev))) != OK)
+		return r;
+
+	/*
+	 * For interfaces without hardware addresses, after passing the above
+	 * sanity checks (which guarantee that the searched-for address is of
+	 * zero length), return the pseudo-entry zero, which yields an entry
+	 * with a zero-sized hardware address once obtained.  This is required
+	 * for at least ifconfig(8).
+	 */
+	if (ifdev->ifdev_ops->iop_set_hwaddr == NULL) {
+		*nump = 0;
+		return OK;
+	}
+
+	for (num = 0; (size_t)num < __arraycount(ifdev->ifdev_hwlist); num++) {
+		if ((ifdev->ifdev_hwlist[num].ifhwa_flags & IFHWAF_VALID) &&
+		    !memcmp(ifdev->ifdev_hwlist[num].ifhwa_addr, hwaddr,
+		    ifdev_get_hwlen(ifdev))) {
+			*nump = num;
+			return OK;
+		}
+	}
+
+	return EADDRNOTAVAIL;
+}
+
+/*
+ * Enumerate data link (hardware) addresses locally assigned to the given
+ * interface 'ifdev'.  The caller should set 'nump' to 0 initially, and
+ * increase it by one between a successful call and the next enumeration call.
+ * Return TRUE on success, meaning that starting from the given value of 'nump'
+ * there is at least one data link address, of which the number is stored in
+ * 'nump' on return.  Return FALSE if there are no more data link addresses
+ * locally assigned to the interface.
+ */
+int
+ifaddr_dl_enum(struct ifdev * ifdev, ifaddr_dl_num_t * num)
+{
+
+	/*
+	 * If hardware addresses are not supported, or if no hardware address
+	 * has been added to this interface yet (this shouldn't happen but
+	 * still), there is always one entry with a (zero-sized) address.
+	 * That is required for the IFP (name) entry as used by getifaddrs(3).
+	 */
+	if (ifdev->ifdev_ops->iop_set_hwaddr == NULL ||
+	    !(ifdev->ifdev_hwlist[0].ifhwa_flags & IFHWAF_VALID))
+		return (*num == 0);
+
+	for (; (size_t)*num < __arraycount(ifdev->ifdev_hwlist); (*num)++) {
+		if (ifdev->ifdev_hwlist[*num].ifhwa_flags & IFHWAF_VALID)
+			return TRUE;
+	}
+
+	return FALSE;
+}
+
+/*
+ * Retrieve a data link (hardware) address for an interface.  For interfaces
+ * that support hardware addresses, 'num' must be a number returned by
+ * ifaddr_dl_find() or ifaddr_dl_enum().  For others, 'num' must be zero, and a
+ * pseudo-address of zero size will be returned.  The address will be stored in
+ * 'addr'.  This function always succeeds.
+ */
+void
+ifaddr_dl_get(struct ifdev * ifdev, ifaddr_dl_num_t num,
+	struct sockaddr_dlx * addr)
+{
+	const uint8_t *hwaddr;
+	size_t hwaddr_len;
+	socklen_t addr_len;
+
+	if ((hwaddr_len = ifdev_get_hwlen(ifdev)) > 0) {
+		/*
+		 * Note that if we have no hardware addresses yet (which should
+		 * not happen but still), the first entry may not be marked as
+		 * valid yet.  Ignore it, and return an all-zeroes address.
+		 */
+		hwaddr = ifdev->ifdev_hwlist[num].ifhwa_addr;
+	} else
+		hwaddr = NULL;
+
+	addr_len = sizeof(*addr);
+
+	addr_put_link((struct sockaddr *)addr, &addr_len,
+	    ifdev_get_index(ifdev), ifdev_get_iftype(ifdev),
+	    ifdev_get_name(ifdev), hwaddr, hwaddr_len);
+}
+
+/*
+ * Obtain NetBSD-style state flags (IFLR_) for the given local data link
+ * address.  The given number may be 0, in which case that slot's state may not
+ * be valid.  Otherwise, the given number must identify an existing address.
+ * Return the flags, or 0 if the slot was not valid.
+ */
+int
+ifaddr_dl_get_flags(struct ifdev * ifdev, ifaddr_dl_num_t num)
+{
+	int flags;
+
+	assert(num >= 0 && (size_t)num < __arraycount(ifdev->ifdev_hwlist));
+
+	if (!(ifdev->ifdev_hwlist[num].ifhwa_flags & IFHWAF_VALID))
+		return 0;
+
+	flags = (num == 0) ? IFLR_ACTIVE : 0;
+
+	if (ifdev->ifdev_hwlist[num].ifhwa_flags & IFHWAF_FACTORY)
+		flags |= IFLR_FACTORY;
+
+	return flags;
+}
+
+/*
+ * Scan the list of hardware addresses of the given interface for a particular
+ * hardware address, as well as for an available entry.  Return the entry found
+ * or -1 if the given hardware address was not found.
Independently, return an + * available entry in 'availp' or -1 if no entries are available. + */ +static ifaddr_dl_num_t +ifaddr_dl_scan(struct ifdev * ifdev, const uint8_t * hwaddr, + ifaddr_dl_num_t * availp) +{ + ifaddr_dl_num_t num, found, avail; + + found = avail = -1; + + for (num = 0; (size_t)num < __arraycount(ifdev->ifdev_hwlist); num++) { + if (!(ifdev->ifdev_hwlist[num].ifhwa_flags & IFHWAF_VALID)) { + if (avail == -1) + avail = num; + } else if (!memcmp(ifdev->ifdev_hwlist[num].ifhwa_addr, hwaddr, + ifdev_get_hwlen(ifdev))) + found = num; + } + + *availp = avail; + return found; +} + +/* + * Set a hardware address entry in the hardware address list of the given + * interface. + */ +static void +ifaddr_dl_set(struct ifdev * ifdev, ifaddr_dl_num_t num, + const uint8_t * hwaddr, int is_factory) +{ + + memcpy(&ifdev->ifdev_hwlist[num].ifhwa_addr, hwaddr, + ifdev_get_hwlen(ifdev)); + + ifdev->ifdev_hwlist[num].ifhwa_flags = IFHWAF_VALID; + if (is_factory) + ifdev->ifdev_hwlist[num].ifhwa_flags |= IFHWAF_FACTORY; + + rtsock_msg_addr_dl(ifdev, RTM_NEWADDR, num); +} + +/* + * Mark a new hardware address as active, after it has already been activated + * on the hardware and in local administration. The active slot is always slot + * zero, so swap slots if needed. + */ +static void +ifaddr_dl_activate(struct ifdev * ifdev, ifaddr_dl_num_t num) +{ + struct ifdev_hwaddr tmp; + struct netif *netif; + size_t sz; + + assert(num != -1); + + /* The given slot may be zero if this is the initial address. */ + if (num != 0) { + sz = sizeof(tmp); + memcpy(&tmp, &ifdev->ifdev_hwlist[0], sz); + memcpy(&ifdev->ifdev_hwlist[0], &ifdev->ifdev_hwlist[num], sz); + memcpy(&ifdev->ifdev_hwlist[num], &tmp, sz); + } + + netif = ifdev_get_netif(ifdev); + + /* Tell lwIP and routing sockets. */ + memcpy(&netif->hwaddr, &ifdev->ifdev_hwlist[0].ifhwa_addr, + ifdev_get_hwlen(ifdev)); + + rtsock_msg_addr_dl(ifdev, RTM_CHGADDR, 0); + + /* See if we can and should generate a link-local IPv6 address now. */ + ifaddr_v6_set_linklocal(ifdev); +} + +/* + * Add a data link (hardware) address to an interface, or if it already exists, + * update its associated flags (IFLR_). + */ +int +ifaddr_dl_add(struct ifdev * ifdev, const struct sockaddr_dlx * addr, + socklen_t addr_len, int flags) +{ + uint8_t hwaddr[NETIF_MAX_HWADDR_LEN]; + ifaddr_dl_num_t found, avail; + int r; + + /* + * If this interface type does not support setting hardware addresses, + * refuse the call. If the interface type supports it but the + * underlying hardware does not, we cannot report failure here, though. + * In that case, attempts to activate an address will fail instead. + */ + if (ifdev->ifdev_ops->iop_set_hwaddr == NULL) + return EINVAL; + + if ((r = addr_get_link((const struct sockaddr *)addr, addr_len, + NULL /*name*/, 0 /*name_max*/, hwaddr, + ifdev_get_hwlen(ifdev))) != OK) + return r; + + /* + * Find the slot for the given hardware address. Also find the slot of + * the active address, and a free slot. All of these may not exist. + */ + found = ifaddr_dl_scan(ifdev, hwaddr, &avail); + + if (found == -1) { + if (avail == -1) + return ENOBUFS; /* TODO: a better error code */ + found = avail; + } + + /* + * If we are asked to activate this address, try that first: this may + * fail if the network device does not support setting addresses, in + * which case we want to fail without causing routing socket noise. 
*/
+	if ((flags & IFLR_ACTIVE) && found != 0 &&
+	    (r = ifdev->ifdev_ops->iop_set_hwaddr(ifdev, hwaddr)) != OK)
+		return r;
+
+	/*
+	 * If this is a new address, add and announce it.  Otherwise, just
+	 * update its flags.
+	 */
+	if (found == avail) {
+		ifaddr_dl_set(ifdev, found, hwaddr,
+		    (flags & IFLR_FACTORY));
+	} else {
+		ifdev->ifdev_hwlist[found].ifhwa_flags &= ~IFHWAF_FACTORY;
+		if (flags & IFLR_FACTORY)
+			ifdev->ifdev_hwlist[found].ifhwa_flags |=
+			    IFHWAF_FACTORY;
+	}
+
+	/*
+	 * Activate the address if requested, swapping slots as needed.  It is
+	 * not possible to deactivate the active address by changing its flags.
+	 */
+	if ((flags & IFLR_ACTIVE) && found != 0)
+		ifaddr_dl_activate(ifdev, found);
+
+	return OK;
+}
+
+/*
+ * Delete a data link (hardware) address from an interface.
+ */
+int
+ifaddr_dl_del(struct ifdev * ifdev, ifaddr_dl_num_t num)
+{
+
+	if (ifdev->ifdev_ops->iop_set_hwaddr == NULL)
+		return EINVAL;
+
+	assert(num >= 0 && (size_t)num < __arraycount(ifdev->ifdev_hwlist));
+	assert(ifdev->ifdev_hwlist[num].ifhwa_flags & IFHWAF_VALID);
+
+	/* It is not possible to delete the active address. */
+	if (num == 0)
+		return EBUSY;
+
+	rtsock_msg_addr_dl(ifdev, RTM_DELADDR, num);
+
+	ifdev->ifdev_hwlist[num].ifhwa_flags = 0;
+
+	return OK;
+}
+
+/*
+ * Announce all data link (hardware) addresses associated with the given
+ * interface as deleted, including the active address.  Used (only) right
+ * before the interface is destroyed.
+ */
+void
+ifaddr_dl_clear(struct ifdev * ifdev)
+{
+	ifaddr_dl_num_t num;
+
+	/*
+	 * Do the active address last, because all announcements carry the
+	 * active address's hardware address as well.
+	 */
+	for (num = 1; ifaddr_dl_enum(ifdev, &num); num++)
+		rtsock_msg_addr_dl(ifdev, RTM_DELADDR, num);
+
+	if (ifdev->ifdev_hwlist[0].ifhwa_flags & IFHWAF_VALID)
+		rtsock_msg_addr_dl(ifdev, RTM_DELADDR, (ifaddr_dl_num_t)0);
+}
+
+/*
+ * Update the interface's active hardware address.  If the 'is_factory' flag is
+ * set, the address is the factory (driver-given) address.  This function may
+ * only be called from ifdev_update_hwaddr().
+ */
+void
+ifaddr_dl_update(struct ifdev * ifdev, const uint8_t * hwaddr, int is_factory)
+{
+	ifaddr_dl_num_t found, avail;
+
+	/*
+	 * Find the slot for the given hardware address.  Also find the slot of
+	 * the active address, and a free slot.  All of these may not exist.
+	 */
+	found = ifaddr_dl_scan(ifdev, hwaddr, &avail);
+
+	/* If the given address is already the active one, do nothing. */
+	if (found == 0) {
+		/* Factory addresses are always added first! */
+		assert(!is_factory);
+
+		return;
+	}
+
+	if (found == -1) {
+		/*
+		 * If the given address is not in the list, add it.  If the
+		 * list is full, first remove any non-active address.  The user
+		 * won't like this, but it preserves correctness without too
+		 * many complications, because this case is unlikely to happen.
+		 */
+		if (avail == -1) {
+			found = 1;
+
+			(void)ifaddr_dl_del(ifdev, found);
+		} else
+			found = avail;
+
+		ifaddr_dl_set(ifdev, found, hwaddr, is_factory);
+	}
+
+	ifaddr_dl_activate(ifdev, found);
+}
diff --git a/minix/net/lwip/ifaddr.h b/minix/net/lwip/ifaddr.h
new file mode 100644
index 000000000..4ea7d9794
--- /dev/null
+++ b/minix/net/lwip/ifaddr.h
@@ -0,0 +1,70 @@
+#ifndef MINIX_NET_LWIP_IFADDR_H
+#define MINIX_NET_LWIP_IFADDR_H
+
+/* Possible values of ifdev_v6flags[] elements.
*/ +#define IFADDR_V6F_AUTOCONF 0x01 /* autoconfigured address, no subnet */ +#define IFADDR_V6F_TEMPORARY 0x02 /* temporary (privacy) address */ +#define IFADDR_V6F_HWBASED 0x04 /* auto-derived from MAC address */ + +typedef int ifaddr_v4_num_t; /* interface IPv4 address number */ +typedef int ifaddr_v6_num_t; /* interface IPv6 address number */ +typedef int ifaddr_dl_num_t; /* interface link address number */ + +extern int ifaddr_auto_linklocal; +extern int ifaddr_accept_rtadv; + +void ifaddr_init(struct ifdev * ifdev); + +int ifaddr_v4_find(struct ifdev * ifdev, const struct sockaddr_in * addr, + ifaddr_v4_num_t * num); +int ifaddr_v4_enum(struct ifdev * ifdev, ifaddr_v4_num_t * num); +int ifaddr_v4_get(struct ifdev * ifdev, ifaddr_v4_num_t num, + struct sockaddr_in * addr, struct sockaddr_in * mask, + struct sockaddr_in * bcast, struct sockaddr_in * dest); +int ifaddr_v4_get_flags(struct ifdev * ifdev, ifaddr_v4_num_t num); +int ifaddr_v4_add(struct ifdev * ifdev, const struct sockaddr_in * addr, + const struct sockaddr_in * mask, const struct sockaddr_in * bcast, + const struct sockaddr_in * dest, int flags); +void ifaddr_v4_del(struct ifdev * ifdev, ifaddr_v4_num_t num); +void ifaddr_v4_clear(struct ifdev * ifdev); +struct ifdev *ifaddr_v4_map_by_addr(const ip4_addr_t * ip4addr); + +int ifaddr_v6_find(struct ifdev * ifdev, const struct sockaddr_in6 * addr6, + ifaddr_v6_num_t * num); +int ifaddr_v6_enum(struct ifdev * ifdev, ifaddr_v6_num_t * num); +void ifaddr_v6_get(struct ifdev * ifdev, ifaddr_v6_num_t num, + struct sockaddr_in6 * addr6, struct sockaddr_in6 * mask6, + struct sockaddr_in6 * dest6); +int ifaddr_v6_get_flags(struct ifdev * ifdev, ifaddr_v6_num_t num); +void ifaddr_v6_get_lifetime(struct ifdev * ifdev, ifaddr_v6_num_t num, + struct in6_addrlifetime * lifetime); +int ifaddr_v6_add(struct ifdev * ifdev, const struct sockaddr_in6 * addr6, + const struct sockaddr_in6 * mask6, const struct sockaddr_in6 * dest6, + int flags, const struct in6_addrlifetime * lifetime); +void ifaddr_v6_del(struct ifdev * ifdev, ifaddr_v6_num_t num); +void ifaddr_v6_clear(struct ifdev * ifdev); +void ifaddr_v6_check(struct ifdev * ifdev); +void ifaddr_v6_set_up(struct ifdev * ifdev); +void ifaddr_v6_set_linklocal(struct ifdev * ifdev); +struct ifdev *ifaddr_v6_map_by_addr(const ip6_addr_t * ip6addr); + +struct ifdev *ifaddr_map_by_addr(const ip_addr_t * ipaddr); +struct ifdev *ifaddr_map_by_subnet(const ip_addr_t * ipaddr); +const ip_addr_t *ifaddr_select(const ip_addr_t * dst_addr, + struct ifdev * ifdev, struct ifdev ** ifdevp); +int ifaddr_is_zone_mismatch(const ip6_addr_t * ipaddr, struct ifdev * ifdev); + +int ifaddr_dl_find(struct ifdev * ifdev, const struct sockaddr_dlx * addr, + socklen_t addr_len, ifaddr_dl_num_t * num); +int ifaddr_dl_enum(struct ifdev * ifdev, ifaddr_dl_num_t * num); +void ifaddr_dl_get(struct ifdev * ifdev, ifaddr_dl_num_t num, + struct sockaddr_dlx * addr); +int ifaddr_dl_get_flags(struct ifdev * ifdev, ifaddr_dl_num_t num); +int ifaddr_dl_add(struct ifdev * ifdev, const struct sockaddr_dlx * addr, + socklen_t addr_len, int flags); +int ifaddr_dl_del(struct ifdev * ifdev, ifaddr_dl_num_t num); +void ifaddr_dl_clear(struct ifdev * ifdev); +void ifaddr_dl_update(struct ifdev * ifdev, const uint8_t * hwaddr, + int is_factory); + +#endif /* !MINIX_NET_LWIP_IFADDR_H */ diff --git a/minix/net/lwip/ifconf.c b/minix/net/lwip/ifconf.c new file mode 100644 index 000000000..e1a48c7f8 --- /dev/null +++ b/minix/net/lwip/ifconf.c @@ -0,0 +1,930 @@ +/* LWIP service - ifconf.c - 
interface configuration */ + +#include "lwip.h" +#include "ifaddr.h" +#include "lldata.h" + +#include +#include + +#define LOOPBACK_IFNAME "lo0" /* name of the loopback interface */ + +/* + * Initialize the first loopback device, which is present by default. + */ +void +ifconf_init(void) +{ + const struct sockaddr_in addr = { + .sin_family = AF_INET, + .sin_addr = { htonl(INADDR_LOOPBACK) } + }; + struct sockaddr_in6 ll_addr6 = { + .sin6_family = AF_INET6, + }; + const struct sockaddr_in6 lo_addr6 = { + .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_LOOPBACK_INIT + }; + const struct in6_addrlifetime lifetime = { + .ia6t_vltime = ND6_INFINITE_LIFETIME, + .ia6t_pltime = ND6_INFINITE_LIFETIME + }; + struct sockaddr_in6 mask6; + struct ifdev *ifdev; + socklen_t addr_len; + int r; + + if ((r = ifdev_create(LOOPBACK_IFNAME)) != OK) + panic("unable to create loopback interface: %d", r); + + if ((ifdev = ifdev_find_by_name(LOOPBACK_IFNAME)) == NULL) + panic("unable to find loopback interface"); + + if ((r = ifaddr_v4_add(ifdev, &addr, NULL, NULL, NULL, 0)) != OK) + panic("unable to set IPv4 address on loopback interface: %d", + r); + + addr_len = sizeof(mask6); + addr_put_netmask((struct sockaddr *)&mask6, &addr_len, IPADDR_TYPE_V6, + 64 /*prefix*/); + + ll_addr6.sin6_addr.s6_addr[0] = 0xfe; + ll_addr6.sin6_addr.s6_addr[1] = 0x80; + ll_addr6.sin6_addr.s6_addr[15] = ifdev_get_index(ifdev); + + if ((r = ifaddr_v6_add(ifdev, &ll_addr6, &mask6, NULL, 0, + &lifetime)) != OK) + panic("unable to set IPv6 address on loopback interface: %d", + r); + + addr_len = sizeof(mask6); + addr_put_netmask((struct sockaddr *)&mask6, &addr_len, IPADDR_TYPE_V6, + 128 /*prefix*/); + + if ((r = ifaddr_v6_add(ifdev, &lo_addr6, &mask6, NULL, 0, + &lifetime)) != OK) + panic("unable to set IPv6 address on loopback interface: %d", + r); + + if ((r = ifdev_set_ifflags(ifdev, IFF_UP)) != OK) + panic("unable to bring up loopback interface"); +} + +/* + * Process an address family independent IOCTL request with an "ifreq" + * structure. + */ +static int +ifconf_ioctl_ifreq(unsigned long request, const struct sockdriver_data * data) +{ + struct ifdev *ifdev; + struct ifreq ifr; + int r; + + if ((r = sockdriver_copyin(data, 0, &ifr, sizeof(ifr))) != OK) + return r; + + if (request != SIOCIFCREATE) { + ifr.ifr_name[sizeof(ifr.ifr_name) - 1] = '\0'; + + if ((ifdev = ifdev_find_by_name(ifr.ifr_name)) == NULL) + return ENXIO; + } else + ifdev = NULL; + + switch (request) { + case SIOCGIFFLAGS: + ifr.ifr_flags = ifdev_get_ifflags(ifdev); + + return sockdriver_copyout(data, 0, &ifr, sizeof(ifr)); + + case SIOCSIFFLAGS: + /* + * Unfortunately, ifr_flags is a signed integer and the sign + * bit is in fact used as a flag, so without explicit casting + * we end up setting all upper bits of the (full) integer. If + * NetBSD ever extends the field, this assert should trigger.. + */ + assert(sizeof(ifr.ifr_flags) == sizeof(short)); + + return ifdev_set_ifflags(ifdev, (unsigned short)ifr.ifr_flags); + + case SIOCGIFMETRIC: + ifr.ifr_metric = ifdev_get_metric(ifdev); + + return sockdriver_copyout(data, 0, &ifr, sizeof(ifr)); + + case SIOCSIFMETRIC: + /* The metric is not used within the operating system. 
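+		 * As a rough, hypothetical userland sketch (assuming 'fd' is a
+		 * socket on which these requests may be issued, and superuser
+		 * privileges for the set side), the value simply round-trips:
+		 *
+		 *	struct ifreq ifr;
+		 *
+		 *	memset(&ifr, 0, sizeof(ifr));
+		 *	strlcpy(ifr.ifr_name, "lo0", sizeof(ifr.ifr_name));
+		 *	ifr.ifr_metric = 5;
+		 *	(void)ioctl(fd, SIOCSIFMETRIC, &ifr);	(stored, nothing else)
+		 *	(void)ioctl(fd, SIOCGIFMETRIC, &ifr);	(reads back 5)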
*/ + ifdev_set_metric(ifdev, ifr.ifr_metric); + + return OK; + + case SIOCSIFMEDIA: + return ifdev_set_ifmedia(ifdev, ifr.ifr_media); + + case SIOCGIFMTU: + ifr.ifr_mtu = ifdev_get_mtu(ifdev); + + return sockdriver_copyout(data, 0, &ifr, sizeof(ifr)); + + case SIOCSIFMTU: + return ifdev_set_mtu(ifdev, ifr.ifr_mtu); + + case SIOCIFCREATE: + if (memchr(ifr.ifr_name, '\0', sizeof(ifr.ifr_name)) == NULL) + return EINVAL; + + return ifdev_create(ifr.ifr_name); + + case SIOCIFDESTROY: + return ifdev_destroy(ifdev); + + case SIOCGIFDLT: + ifr.ifr_dlt = ifdev_get_dlt(ifdev); + + return sockdriver_copyout(data, 0, &ifr, sizeof(ifr)); + + case SIOCGIFINDEX: + ifr.ifr_index = ifdev_get_index(ifdev); + + return sockdriver_copyout(data, 0, &ifr, sizeof(ifr)); + + default: + return ENOTTY; + } +} + +/* + * Process an address family independent IOCTL request with an "ifcapreq" + * structure. + */ +static int +ifconf_ioctl_ifcap(unsigned long request, + const struct sockdriver_data * data) +{ + struct ifdev *ifdev; + struct ifcapreq ifcr; + int r; + + if ((r = sockdriver_copyin(data, 0, &ifcr, sizeof(ifcr))) != OK) + return r; + + ifcr.ifcr_name[sizeof(ifcr.ifcr_name) - 1] = '\0'; + + if ((ifdev = ifdev_find_by_name(ifcr.ifcr_name)) == NULL) + return ENXIO; + + switch (request) { + case SIOCSIFCAP: + return ifdev_set_ifcap(ifdev, ifcr.ifcr_capenable); + + case SIOCGIFCAP: + ifdev_get_ifcap(ifdev, &ifcr.ifcr_capabilities, + &ifcr.ifcr_capenable); + + return sockdriver_copyout(data, 0, &ifcr, sizeof(ifcr)); + + default: + return ENOTTY; + } +} + +/* + * Process an address family independent IOCTL request with an "ifmediareq" + * structure. + */ +static int +ifconf_ioctl_ifmedia(unsigned long request, + const struct sockdriver_data * data) +{ + struct ifdev *ifdev; + struct ifmediareq ifm; + int r; + + if ((r = sockdriver_copyin(data, 0, &ifm, sizeof(ifm))) != OK) + return r; + + ifm.ifm_name[sizeof(ifm.ifm_name) - 1] = '\0'; + + if ((ifdev = ifdev_find_by_name(ifm.ifm_name)) == NULL) + return ENXIO; + + switch (request) { + case MINIX_SIOCGIFMEDIA: + if ((r = ifdev_get_ifmedia(ifdev, &ifm.ifm_current, + &ifm.ifm_active)) != OK) + return r; + ifm.ifm_mask = 0; + + switch (ifdev_get_link(ifdev)) { + case LINK_STATE_UP: + ifm.ifm_status = IFM_AVALID | IFM_ACTIVE; + break; + case LINK_STATE_DOWN: + ifm.ifm_status = IFM_AVALID; + break; + default: + ifm.ifm_status = 0; + break; + } + + /* + * TODO: support for the list of supported media types. This + * one is not easy, because we cannot simply suspend the IOCTL + * and query the driver. For now, return only entry (which is + * the minimum for ifconfig(8) not to complain), namely the + * currently selected one. + */ + if (ifm.ifm_ulist != NULL) { + if (ifm.ifm_count < 1) + return ENOMEM; + + /* + * Copy out the 'list', which consists of one entry. + * If we were to produce multiple entries, we would + * have to check against the MINIX_IF_MAXMEDIA limit. + */ + if ((r = sockdriver_copyout(data, + offsetof(struct minix_ifmediareq, mifm_list), + &ifm.ifm_current, sizeof(ifm.ifm_current))) != OK) + return r; + } + ifm.ifm_count = 1; + + return sockdriver_copyout(data, 0, &ifm, sizeof(ifm)); + + default: + return ENOTTY; + } +} + +/* + * Process an address family independent IOCTL request with an "if_clonereq" + * structure. 
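+ *
+ * As a rough, hypothetical userland sketch (assuming the standard NetBSD
+ * SIOCIFGCLONERS request, presumed to be translated into the
+ * MINIX_SIOCIFGCLONERS layout before it reaches this service): the caller
+ * says how many IFNAMSIZ-sized slots it has, and learns from ifcr_total how
+ * many cloners exist in total:
+ *
+ *	char buf[4][IFNAMSIZ];
+ *	struct if_clonereq ifcr;
+ *
+ *	memset(&ifcr, 0, sizeof(ifcr));
+ *	ifcr.ifcr_count = 4;
+ *	ifcr.ifcr_buffer = &buf[0][0];
+ *	if (ioctl(fd, SIOCIFGCLONERS, &ifcr) == 0 && ifcr.ifcr_total > 0)
+ *		printf("%s\n", buf[0]);	(e.g. "lo")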
+ */ +static int +ifconf_ioctl_ifclone(unsigned long request, + const struct sockdriver_data * data) +{ + struct if_clonereq ifcr; + const char *ptr; + char name[IFNAMSIZ]; + size_t off; + unsigned int num; + int r; + + if ((r = sockdriver_copyin(data, 0, &ifcr, sizeof(ifcr))) != OK) + return r; + + if (ifcr.ifcr_count < 0) + return EINVAL; + + off = offsetof(struct minix_if_clonereq, mifcr_buffer); + + for (num = 0; (ptr = ifdev_enum_vtypes(num)) != NULL; num++) { + /* Prevent overflow in case we ever have over 128 vtypes.. */ + if (num == MINIX_IF_MAXCLONERS) + break; + + if (ifcr.ifcr_buffer == NULL || + num >= (unsigned int)ifcr.ifcr_count) + continue; + + memset(name, 0, sizeof(name)); + strlcpy(name, ptr, sizeof(name)); + + if ((r = sockdriver_copyout(data, off, name, + sizeof(name))) != OK) + return r; + + off += sizeof(name); + } + + ifcr.ifcr_total = num; + + return sockdriver_copyout(data, 0, &ifcr, sizeof(ifcr)); +} + +/* + * Process an address family independent IOCTL request with an "if_addrprefreq" + * structure. + */ +static int +ifconf_ioctl_ifaddrpref(unsigned long request, + const struct sockdriver_data * data) +{ + struct ifdev *ifdev; + struct if_addrprefreq ifap; + int r; + + if ((r = sockdriver_copyin(data, 0, &ifap, sizeof(ifap))) != OK) + return r; + + ifap.ifap_name[sizeof(ifap.ifap_name) - 1] = '\0'; + + if ((ifdev = ifdev_find_by_name(ifap.ifap_name)) == NULL) + return ENXIO; + + /* + * For now, we simply support only a preference of 0. We do not try to + * look up the given address, nor do we return the looked up address. + */ + switch (request) { + case SIOCSIFADDRPREF: + if (ifap.ifap_preference != 0) + return EINVAL; + + return OK; + + case SIOCGIFADDRPREF: + ifap.ifap_preference = 0; + + return sockdriver_copyout(data, 0, &ifap, sizeof(ifap)); + + default: + return ENOTTY; + } +} + +/* + * Process an IOCTL request for AF_INET with an "ifreq" structure. + */ +static int +ifconf_ioctl_v4_ifreq(unsigned long request, + const struct sockdriver_data * data) +{ + struct sockaddr_in addr, mask, bcast, dest, *sin = NULL /*gcc*/; + struct ifdev *ifdev; + struct ifreq ifr; + ifaddr_v4_num_t num; + int r, flags; + + if ((r = sockdriver_copyin(data, 0, &ifr, sizeof(ifr))) != OK) + return r; + + ifr.ifr_name[sizeof(ifr.ifr_name) - 1] = '\0'; + + if ((ifdev = ifdev_find_by_name(ifr.ifr_name)) == NULL) + return ENXIO; + + switch (request) { + case SIOCGIFADDR: + case SIOCGIFNETMASK: + case SIOCGIFBRDADDR: + case SIOCGIFDSTADDR: + /* Retrieve all addresses, then copy out the desired one. 
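+		 * As a rough, hypothetical userland sketch (assuming 'fd' is an
+		 * AF_INET socket), retrieving the primary address of lo0 would
+		 * look like this:
+		 *
+		 *	struct ifreq ifr;
+		 *	struct sockaddr_in *sin;
+		 *
+		 *	memset(&ifr, 0, sizeof(ifr));
+		 *	strlcpy(ifr.ifr_name, "lo0", sizeof(ifr.ifr_name));
+		 *	if (ioctl(fd, SIOCGIFADDR, &ifr) == 0) {
+		 *		sin = (struct sockaddr_in *)&ifr.ifr_addr;
+		 *		puts(inet_ntoa(sin->sin_addr));	(prints 127.0.0.1)
+		 *	}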
*/ + switch (request) { + case SIOCGIFADDR: sin = &addr; break; + case SIOCGIFNETMASK: sin = &mask; break; + case SIOCGIFBRDADDR: sin = &bcast; break; + case SIOCGIFDSTADDR: sin = &dest; break; + } + + sin->sin_len = 0; + + if ((r = ifaddr_v4_get(ifdev, (ifaddr_v4_num_t)0, &addr, &mask, + &bcast, &dest)) != OK) + return r; + + if (sin->sin_len == 0) /* not filled in */ + return EADDRNOTAVAIL; + + memcpy(&ifr.ifr_addr, sin, sizeof(*sin)); + + return sockdriver_copyout(data, 0, &ifr, sizeof(ifr)); + + case SIOCGIFAFLAG_IN: + if ((r = ifaddr_v4_find(ifdev, + (struct sockaddr_in *)&ifr.ifr_addr, &num)) != OK) + return r; + + ifr.ifr_addrflags = ifaddr_v4_get_flags(ifdev, num); + + return sockdriver_copyout(data, 0, &ifr, sizeof(ifr)); + + case SIOCSIFADDR: + /* + * This one is slightly different from the rest, in that we + * either set or update the primary address: if we set it, we + * must let _add() generate a matching netmask automatically, + * while if we update it, _add() would fail unless we first + * delete the old entry. + */ + sin = (struct sockaddr_in *)&ifr.ifr_addr; + + if ((r = ifaddr_v4_get(ifdev, (ifaddr_v4_num_t)0, &addr, &mask, + &bcast, &dest)) == OK) { + flags = ifaddr_v4_get_flags(ifdev, (ifaddr_v4_num_t)0); + + ifaddr_v4_del(ifdev, (ifaddr_v4_num_t)0); + + /* + * If setting the new address fails, reinstating the + * old address should always work. This is really ugly + * as it generates routing socket noise, but this call + * is deprecated anyway. + */ + if ((r = ifaddr_v4_add(ifdev, sin, &mask, &bcast, + &dest, 0 /*flags*/)) != OK) + (void)ifaddr_v4_add(ifdev, &addr, &mask, + &bcast, &dest, flags); + + return r; + } else + return ifaddr_v4_add(ifdev, sin, NULL /*mask*/, + NULL /*bcast*/, NULL /*dest*/, 0 /*flags*/); + + case SIOCSIFNETMASK: + case SIOCSIFBRDADDR: + case SIOCSIFDSTADDR: + /* These calls only update the existing primary address. */ + if ((r = ifaddr_v4_get(ifdev, (ifaddr_v4_num_t)0, &addr, &mask, + &bcast, &dest)) != OK) + return r; + + sin = (struct sockaddr_in *)&ifr.ifr_addr; + + switch (request) { + case SIOCSIFNETMASK: memcpy(&mask, sin, sizeof(mask)); break; + case SIOCSIFBRDADDR: memcpy(&bcast, sin, sizeof(bcast)); break; + case SIOCSIFDSTADDR: memcpy(&dest, sin, sizeof(dest)); break; + } + + return ifaddr_v4_add(ifdev, &addr, &mask, &bcast, &dest, + ifaddr_v4_get_flags(ifdev, (ifaddr_v4_num_t)0)); + + case SIOCDIFADDR: + if ((r = ifaddr_v4_find(ifdev, + (struct sockaddr_in *)&ifr.ifr_addr, &num)) != OK) + return r; + + ifaddr_v4_del(ifdev, num); + + return OK; + + default: + return ENOTTY; + } +} + +/* + * Process an IOCTL request for AF_INET with an "ifaliasreq" structure. 
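+ *
+ * As a rough, hypothetical userland sketch (assuming 'fd' is an AF_INET
+ * socket and the caller is root), SIOCAIFADDR adds an address and netmask
+ * pair in a single call:
+ *
+ *	struct ifaliasreq ifra;
+ *	struct sockaddr_in *sin;
+ *
+ *	memset(&ifra, 0, sizeof(ifra));
+ *	strlcpy(ifra.ifra_name, "lo0", sizeof(ifra.ifra_name));
+ *	sin = (struct sockaddr_in *)&ifra.ifra_addr;
+ *	sin->sin_len = sizeof(*sin);
+ *	sin->sin_family = AF_INET;
+ *	sin->sin_addr.s_addr = inet_addr("10.0.0.1");
+ *	sin = (struct sockaddr_in *)&ifra.ifra_mask;
+ *	sin->sin_len = sizeof(*sin);
+ *	sin->sin_family = AF_INET;
+ *	sin->sin_addr.s_addr = inet_addr("255.255.255.0");
+ *	if (ioctl(fd, SIOCAIFADDR, &ifra) != 0)
+ *		err(1, "SIOCAIFADDR");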
+ */ +static int +ifconf_ioctl_v4_ifalias(unsigned long request, + const struct sockdriver_data * data) +{ + struct ifdev *ifdev; + struct ifaliasreq ifra; + struct sockaddr_in dest; + ifaddr_v4_num_t num; + int r; + + if ((r = sockdriver_copyin(data, 0, &ifra, sizeof(ifra))) != OK) + return r; + + ifra.ifra_name[sizeof(ifra.ifra_name) - 1] = '\0'; + + if ((ifdev = ifdev_find_by_name(ifra.ifra_name)) == NULL) + return ENXIO; + + switch (request) { + case SIOCAIFADDR: + return ifaddr_v4_add(ifdev, + (struct sockaddr_in *)&ifra.ifra_addr, + (struct sockaddr_in *)&ifra.ifra_mask, + (struct sockaddr_in *)&ifra.ifra_broadaddr, + (struct sockaddr_in *)&ifra.ifra_dstaddr, 0 /*flags*/); + + case SIOCGIFALIAS: + if ((r = ifaddr_v4_find(ifdev, + (struct sockaddr_in *)&ifra.ifra_addr, &num)) != OK) + return r; + + /* + * The broadcast and destination address are stored in the same + * ifaliasreq field. We cannot pass a pointer to the same + * field to ifaddr_v4_get(). So, use a temporary variable. + */ + (void)ifaddr_v4_get(ifdev, num, + (struct sockaddr_in *)&ifra.ifra_addr, + (struct sockaddr_in *)&ifra.ifra_mask, + (struct sockaddr_in *)&ifra.ifra_broadaddr, &dest); + + if (ifra.ifra_broadaddr.sa_len == 0) + memcpy(&ifra.ifra_dstaddr, &dest, sizeof(dest)); + + return sockdriver_copyout(data, 0, &ifra, sizeof(ifra)); + + default: + return ENOTTY; + } +} + +/* + * Process an IOCTL request for AF_INET. + */ +static int +ifconf_ioctl_v4(unsigned long request, const struct sockdriver_data * data, + endpoint_t user_endpt) +{ + + switch (request) { + case SIOCSIFADDR: + case SIOCSIFDSTADDR: + case SIOCSIFBRDADDR: + case SIOCSIFNETMASK: + case SIOCDIFADDR: + if (!util_is_root(user_endpt)) + return EPERM; + + /* FALLTHROUGH */ + case SIOCGIFADDR: + case SIOCGIFDSTADDR: + case SIOCGIFBRDADDR: + case SIOCGIFNETMASK: + case SIOCGIFAFLAG_IN: + return ifconf_ioctl_v4_ifreq(request, data); + + case SIOCAIFADDR: + if (!util_is_root(user_endpt)) + return EPERM; + + /* FALLTHROUGH */ + case SIOCGIFALIAS: + return ifconf_ioctl_v4_ifalias(request, data); + + default: + return ENOTTY; + } +} + +#ifdef INET6 +/* + * Process an IOCTL request for AF_INET6 with an "in6_ifreq" structure. + */ +static int +ifconf_ioctl_v6_ifreq(unsigned long request, + const struct sockdriver_data * data) +{ + struct ifdev *ifdev; + struct in6_ifreq ifr; + ifaddr_v6_num_t num; + int r; + + if ((r = sockdriver_copyin(data, 0, &ifr, sizeof(ifr))) != OK) + return r; + + ifr.ifr_name[sizeof(ifr.ifr_name) - 1] = '\0'; + + if ((ifdev = ifdev_find_by_name(ifr.ifr_name)) == NULL) + return ENXIO; + + if ((r = ifaddr_v6_find(ifdev, &ifr.ifr_addr, &num)) != OK) + return r; + + switch (request) { + case SIOCGIFADDR_IN6: + /* This IOCTL basically checks if the given address exists. */ + ifaddr_v6_get(ifdev, num, &ifr.ifr_addr, NULL, NULL); + + return sockdriver_copyout(data, 0, &ifr, sizeof(ifr)); + + case SIOCDIFADDR_IN6: + ifaddr_v6_del(ifdev, num); + + return OK; + + case SIOCGIFNETMASK_IN6: + ifaddr_v6_get(ifdev, num, NULL, &ifr.ifr_addr, NULL); + + return sockdriver_copyout(data, 0, &ifr, sizeof(ifr)); + + case SIOCGIFAFLAG_IN6: + ifr.ifr_ifru.ifru_flags6 = ifaddr_v6_get_flags(ifdev, num); + + return sockdriver_copyout(data, 0, &ifr, sizeof(ifr)); + + case SIOCGIFALIFETIME_IN6: + ifaddr_v6_get_lifetime(ifdev, num, + &ifr.ifr_ifru.ifru_lifetime); + + return sockdriver_copyout(data, 0, &ifr, sizeof(ifr)); + + default: + return ENOTTY; + } +} + +/* + * Process an IOCTL request for AF_INET6 with an "in6_aliasreq" structure. 
+ */ +static int +ifconf_ioctl_v6_ifalias(unsigned long request, + const struct sockdriver_data * data) +{ + struct ifdev *ifdev; + struct in6_aliasreq ifra; + int r; + + if ((r = sockdriver_copyin(data, 0, &ifra, sizeof(ifra))) != OK) + return r; + + ifra.ifra_name[sizeof(ifra.ifra_name) - 1] = '\0'; + + if ((ifdev = ifdev_find_by_name(ifra.ifra_name)) == NULL) + return ENXIO; + + switch (request) { + case SIOCAIFADDR_IN6: + return ifaddr_v6_add(ifdev, &ifra.ifra_addr, + &ifra.ifra_prefixmask, &ifra.ifra_dstaddr, + ifra.ifra_flags, &ifra.ifra_lifetime); + + default: + return ENOTTY; + } +} + +/* + * Process an IOCTL request for AF_INET6 with an "in6_ndireq" structure. + */ +static int +ifconf_ioctl_v6_ndireq(unsigned long request, + const struct sockdriver_data * data) +{ + struct ifdev *ifdev; + struct in6_ndireq ndi; + int r; + + if ((r = sockdriver_copyin(data, 0, &ndi, sizeof(ndi))) != OK) + return r; + + ndi.ifname[sizeof(ndi.ifname) - 1] = '\0'; + + if ((ifdev = ifdev_find_by_name(ndi.ifname)) == NULL) + return ENXIO; + + switch (request) { + case SIOCGIFINFO_IN6: + memset(&ndi.ndi, 0, sizeof(ndi.ndi)); + + ndi.ndi.linkmtu = ifdev_get_mtu(ifdev); + ndi.ndi.flags = ifdev_get_nd6flags(ifdev); + ndi.ndi.initialized = 1; + /* TODO: all the other fields.. */ + + return sockdriver_copyout(data, 0, &ndi, sizeof(ndi)); + + case SIOCSIFINFO_IN6: + /* TODO: all the other fields.. */ + + /* FALLTHROUGH */ + case SIOCSIFINFO_FLAGS: + return ifdev_set_nd6flags(ifdev, ndi.ndi.flags); + + default: + return ENOTTY; + } +} + +/* + * Process an IOCTL request for AF_INET6 with an "in6_nbrinfo" structure. + */ +static int +ifconf_ioctl_v6_nbrinfo(unsigned long request, + const struct sockdriver_data * data) +{ + struct ifdev *ifdev; + struct sockaddr_in6 addr; + struct in6_nbrinfo nbri; + lldata_ndp_num_t num; + int r; + + if ((r = sockdriver_copyin(data, 0, &nbri, sizeof(nbri))) != OK) + return r; + + nbri.ifname[sizeof(nbri.ifname) - 1] = '\0'; + + if ((ifdev = ifdev_find_by_name(nbri.ifname)) == NULL) + return ENXIO; + + switch (request) { + case SIOCGNBRINFO_IN6: + /* + * Convert the given in6_addr to a full sockaddr_in6, mainly + * for internal consistency. It would have been nice if the + * KAME management API had had any sort of consistency itself. + */ + memset(&addr, 0, sizeof(addr)); + addr.sin6_family = AF_INET6; + memcpy(&addr.sin6_addr.s6_addr, &nbri.addr, + sizeof(addr.sin6_addr.s6_addr)); + + if ((r = lldata_ndp_find(ifdev, &addr, &num)) != OK) + return r; + + lldata_ndp_get_info(num, &nbri.asked, &nbri.isrouter, + &nbri.state, &nbri.expire); + + return sockdriver_copyout(data, 0, &nbri, sizeof(nbri)); + + default: + return ENOTTY; + } +} + +/* + * Process an IOCTL request for AF_INET6. 
+ */ +static int +ifconf_ioctl_v6(unsigned long request, const struct sockdriver_data * data, + endpoint_t user_endpt) +{ + + switch (request) { + case SIOCDIFADDR_IN6: + if (!util_is_root(user_endpt)) + return EPERM; + + /* FALLTHROUGH */ + case SIOCGIFADDR_IN6: + case SIOCGIFNETMASK_IN6: + case SIOCGIFAFLAG_IN6: + case SIOCGIFALIFETIME_IN6: + return ifconf_ioctl_v6_ifreq(request, data); + + case SIOCAIFADDR_IN6: + if (!util_is_root(user_endpt)) + return EPERM; + + return ifconf_ioctl_v6_ifalias(request, data); + + case SIOCSIFINFO_IN6: + case SIOCSIFINFO_FLAGS: + if (!util_is_root(user_endpt)) + return EPERM; + + /* FALLTHROUGH */ + case SIOCGIFINFO_IN6: + return ifconf_ioctl_v6_ndireq(request, data); + + case SIOCGNBRINFO_IN6: + return ifconf_ioctl_v6_nbrinfo(request, data); + + default: + return ENOTTY; + } +} +#endif /* INET6 */ + +/* + * Process an IOCTL request for AF_LINK with an "if_laddrreq" structure. + */ +static int +ifconf_ioctl_dl_lifaddr(unsigned long request, + const struct sockdriver_data * data) +{ + struct ifdev *ifdev; + struct if_laddrreq iflr; + ifaddr_dl_num_t num; + int r; + + if ((r = sockdriver_copyin(data, 0, &iflr, sizeof(iflr))) != OK) + return r; + + iflr.iflr_name[sizeof(iflr.iflr_name) - 1] = '\0'; + + if ((ifdev = ifdev_find_by_name(iflr.iflr_name)) == NULL) + return ENXIO; + + switch (request) { + case SIOCGLIFADDR: + if (iflr.flags & IFLR_PREFIX) { + /* We ignore the prefix length, like NetBSD does. */ + if ((r = ifaddr_dl_find(ifdev, + (struct sockaddr_dlx *)&iflr.addr, + sizeof(iflr.addr), &num)) != OK) + return r; + } else + num = (ifaddr_dl_num_t)0; /* this always works */ + + ifaddr_dl_get(ifdev, num, (struct sockaddr_dlx *)&iflr.addr); + iflr.flags = ifaddr_dl_get_flags(ifdev, num); + memset(&iflr.dstaddr, 0, sizeof(iflr.dstaddr)); + + return sockdriver_copyout(data, 0, &iflr, sizeof(iflr)); + + case SIOCALIFADDR: + return ifaddr_dl_add(ifdev, (struct sockaddr_dlx *)&iflr.addr, + sizeof(iflr.addr), iflr.flags); + + case SIOCDLIFADDR: + if ((r = ifaddr_dl_find(ifdev, + (struct sockaddr_dlx *)&iflr.addr, sizeof(iflr.addr), + &num)) != OK) + return r; + + return ifaddr_dl_del(ifdev, num); + + default: + return ENOTTY; + } +} + +/* + * Process an IOCTL request for AF_LINK. + */ +static int +ifconf_ioctl_dl(unsigned long request, const struct sockdriver_data * data, + endpoint_t user_endpt) +{ + + switch (request) { + case SIOCALIFADDR: + case SIOCDLIFADDR: + if (!util_is_root(user_endpt)) + return EPERM; + + /* FALLTHROUGH */ + case SIOCGLIFADDR: + return ifconf_ioctl_dl_lifaddr(request, data); + + default: + return ENOTTY; + } +} + +/* + * Process an IOCTL request. This routine is shared between TCP, UDP, RAW, and + * link sockets. The given socket may be used to obtain the target domain: + * AF_INET, AF_INET6, or AF_LINK. 
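+ *
+ * As a rough, hypothetical illustration: requests that are not in the
+ * family-independent list below are dispatched purely on that domain, so
+ * with 'ifr' a struct ifreq naming an existing interface,
+ *
+ *	int s4 = socket(AF_INET, SOCK_DGRAM, 0);
+ *	int s6 = socket(AF_INET6, SOCK_DGRAM, 0);
+ *
+ *	ioctl(s4, SIOCGIFADDR, &ifr);	(handled by the AF_INET code)
+ *	ioctl(s6, SIOCGIFADDR, &ifr);	(fails with ENOTTY: wrong domain)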
+ */ +int +ifconf_ioctl(struct sock * sock, unsigned long request, + const struct sockdriver_data * data, endpoint_t user_endpt) +{ + int domain; + + domain = sockevent_get_domain(sock); + + switch (request) { + case SIOCSIFFLAGS: + case SIOCSIFMETRIC: + case SIOCSIFMEDIA: + case SIOCSIFMTU: + case SIOCIFCREATE: + case SIOCIFDESTROY: + if (!util_is_root(user_endpt)) + return EPERM; + + /* FALLTHROUGH */ + case SIOCGIFFLAGS: + case SIOCGIFMETRIC: + case SIOCGIFMTU: + case SIOCGIFDLT: + case SIOCGIFINDEX: + return ifconf_ioctl_ifreq(request, data); + + case SIOCSIFCAP: + if (!util_is_root(user_endpt)) + return EPERM; + + /* FALLTHROUGH */ + case SIOCGIFCAP: + return ifconf_ioctl_ifcap(request, data); + + case MINIX_SIOCGIFMEDIA: + return ifconf_ioctl_ifmedia(request, data); + + case MINIX_SIOCIFGCLONERS: + return ifconf_ioctl_ifclone(request, data); + + case SIOCSIFADDRPREF: + if (!util_is_root(user_endpt)) + return EPERM; + + /* FALLTHROUGH */ + case SIOCGIFADDRPREF: + return ifconf_ioctl_ifaddrpref(request, data); + + default: + switch (domain) { + case AF_INET: + return ifconf_ioctl_v4(request, data, user_endpt); + +#ifdef INET6 + case AF_INET6: + return ifconf_ioctl_v6(request, data, user_endpt); +#endif /* INET6 */ + + case AF_LINK: + return ifconf_ioctl_dl(request, data, user_endpt); + + default: + return ENOTTY; + } + } +} diff --git a/minix/net/lwip/ifdev.c b/minix/net/lwip/ifdev.c new file mode 100644 index 000000000..1808a2a6a --- /dev/null +++ b/minix/net/lwip/ifdev.c @@ -0,0 +1,1064 @@ +/* LWIP service - ifdev.c - network interface devices */ + +#include "lwip.h" +#include "mcast.h" +#include "ifaddr.h" +#include "rtsock.h" +#include "route.h" +#include "bpfdev.h" + +#include + +/* + * The highest possible interface index number, plus one. We currently let + * lwIP choose the interface index. lwIP will generate a number between 1 and + * 255 inclusive. For efficiency, we use an array to look up an interface + * device object by its index. Thus, this array must be large enough to be + * indexed by the largest possible index number generated by lwIP. lwIP uses + * an unsigned 8-bit field to store the index number. + */ +#define MAX_IFDEV (UINT8_MAX + 1) + +/* The table is indexed by the interface index minus one. */ +static struct ifdev *ifdev_table[MAX_IFDEV]; /* index-based lookup table */ + +static TAILQ_HEAD(, ifdev) ifdev_list; /* list of active interfaces */ + +static struct ifdev *ifdev_loopback; /* loopback interface */ + +/* + * The maximum number of virtual interface types--that is, interface types for + * which interfaces may be created and destroyed dynamically. The BSDs call + * these "clones". There should be enough slots for all types, which are + * registered by their respective modules through ifdev_register(). Increase + * as necessary. + */ +#define MAX_VTYPE 4 + +static struct { + const char *ifvt_name; /* interface name without digits (e.g. "lo") */ + size_t ifvt_namelen; /* length of the name, excluding null term. */ + int (*ifvt_create)(const char *); /* ifdev create function */ +} ifdev_vtype[MAX_VTYPE]; + +static unsigned int ifdev_vtypes; /* number of in-use vtype slots */ + +#define IFDEV_MIN_MTU 1280 /* minimum interface MTU, required by IPv6 */ + +/* + * Initialize the network interface devices module. This call must be issued + * before any virtual interfaces are initialized, because the virtual types + * array is initialized here. 
+ */ +void +ifdev_init(void) +{ + + memset(ifdev_table, 0, sizeof(ifdev_table)); + + TAILQ_INIT(&ifdev_list); + + memset(ifdev_vtype, 0, sizeof(ifdev_vtype)); + ifdev_vtypes = 0; +} + +/* + * Check all active interfaces to see if any tasks need to be performed. This + * function is called as part of each message loop iteration. + */ +void +ifdev_poll(void) +{ + struct ifdev *ifdev; + + /* + * Call the polling function of the active interfaces. Note that + * interfaces may not remove themselves as a result of polling! + */ + TAILQ_FOREACH(ifdev, &ifdev_list, ifdev_next) { + if (ifdev->ifdev_ops->iop_poll != NULL) + ifdev->ifdev_ops->iop_poll(ifdev); + } +} + +/* + * Handle an incoming packet on an interface. This function assumes ownership + * of the packet buffers: the caller must no longer refer to it afterward. For + * packets looped back for a non-loopback interface, 'ifdev' is the loopback + * interface and 'netif' is the original (non-loopback) interface's netif. For + * other packets, 'ifdev' is the actual interface and 'netif' is NULL. The + * packet is passed to BPF devices only if 'to_bpf' is set. + */ +void +ifdev_input(struct ifdev * ifdev, struct pbuf * pbuf, struct netif * netif, + int to_bpf) +{ + struct bpfdev_link *bpfl; + err_t err; + + /* + * Looped-back packets are captured on the loopback device, not on the + * original interface. Similarly, we account the traffic to the + * loopback interface. This is a policy decision (inspired by NetBSD's + * behavior) and may be changed later. + */ + if (to_bpf) { + TAILQ_FOREACH(bpfl, &ifdev->ifdev_bpf, bpfl_next) + bpfdev_input(bpfl, pbuf); + } + + ifdev->ifdev_data.ifi_ipackets++; + ifdev->ifdev_data.ifi_ibytes += pbuf->tot_len; + + if (pbuf->flags & PBUF_FLAG_LLMCAST) + ifdev->ifdev_data.ifi_imcasts++; + + /* + * For looped-back packets, we must bypass the regular netif input + * function (as that one is for link-layer packet handling) and instead + * pass it directly to the IP-layer packet handling function of lwIP. + */ + if (netif != NULL) + err = ip_input(pbuf, netif); + else + err = ifdev->ifdev_netif.input(pbuf, &ifdev->ifdev_netif); + + if (err != ERR_OK) + pbuf_free(pbuf); +} + +/* + * Handle an outgoing packet on an interface. Return ERR_OK if the packet was + * transmitted or another lwIP ERR_ error code upon failure. Either way, the + * caller is responsible for freeing the packet buffers. If the packet is + * to be looped back to a non-loopback interface (because its destination is a + * local address), 'ifdev' is the loopback interface and 'netif' is set to the + * original interface's netif. In all other cases, 'ifdev' is the packet's + * source interface and 'netif' is NULL. The packet is passed to attached BPF + * devices only if 'to_bpf' is set. If 'hdrcmplt' is set, the source address + * of the data link header is already filled in; otherwise, the source address + * must be set to the device's source address, if applicable. + */ +err_t +ifdev_output(struct ifdev * ifdev, struct pbuf * pbuf, struct netif * netif, + int to_bpf, int hdrcmplt) +{ + struct bpfdev_link *bpfl; + + /* + * If the interface and/or the link is down, discard the packet without + * reporting it to BPF or the actual interface module. + */ + if (!ifdev_is_up(ifdev) || !ifdev_is_link_up(ifdev)) + return ERR_IF; /* this should translate to ENETDOWN */ + + /* + * If the link-layer header is not yet complete, fill in the source + * address now. This exception applies to BPF-generated packets only. 
+ * Complete the header before passing the packet back to BPF, which + * should see the completed version of the packet. + */ + if (!hdrcmplt && ifdev->ifdev_ops->iop_hdrcmplt != NULL) + ifdev->ifdev_ops->iop_hdrcmplt(ifdev, pbuf); + + /* + * As in ifdev_input(), we use the loopback interface for BPF and + * statistics even if the packet originates from a non-loopback device. + */ + if (to_bpf) { + TAILQ_FOREACH(bpfl, &ifdev->ifdev_bpf, bpfl_next) + bpfdev_output(bpfl, pbuf); + } + + ifdev->ifdev_data.ifi_opackets++; + ifdev->ifdev_data.ifi_obytes += pbuf->tot_len; + + /* + * TODO: this is rather imprecise, because it works only when we set + * the pbuf flag explicitly ourselves. That happens only for UDP/RAW + * packets, and not for (e.g.) ND6 multicast traffic. We have reasons + * to set the flags ourselves anyway, namely to support MSG_MCAST and + * MSG_BCAST on loopback interfaces, but they should be complemented by + * additional checks here on, say, the destination ethernet address. + */ + if (pbuf->flags & PBUF_FLAG_LLMCAST) + ifdev->ifdev_data.ifi_omcasts++; + + return ifdev->ifdev_ops->iop_output(ifdev, pbuf, netif); +} + +/* + * Transmit an IPv4 packet on an interface, as requested by lwIP. Pass on the + * packet to the interface's link processor (e.g., etharp), unless the packet + * should be rejected or blackholed according to route information, or it is to + * be looped back into the interface. The latter may occur if the destination + * address belongs to the interface. In that case, we send the packet over a + * loopback interface instead. In addition, if this is a multicast packet that + * should be looped back, send a copy over a loopback interface as well. + * Loopback interfaces themselves are exempt from these special cases. + */ +static err_t +ifdev_output_v4(struct netif * netif, struct pbuf * pbuf, + const ip4_addr_t * ipaddr) +{ + struct ifdev *ifdev = netif_get_ifdev(netif); + err_t err; + + assert(ifdev_loopback != NULL); + + /* Check for reject/blackhole routes. */ + if (!route_output_v4(ifdev, ipaddr, &err)) + return err; + + /* Handle looping of multicast packets on non-loopback interfaces. */ + if (!ifdev_is_loopback(ifdev) && (pbuf->flags & PBUF_FLAG_MCASTLOOP)) + (void)ifdev_output(ifdev_loopback, pbuf, netif, + FALSE /*to_bpf*/, TRUE /*hdrcmplt*/); + + /* Divert packets sent to the local interface address. */ + if (!ifdev_is_loopback(ifdev) && ifdev->ifdev_v4set && + ip4_addr_cmp(netif_ip4_addr(&ifdev->ifdev_netif), ipaddr)) + ifdev = ifdev_loopback; + else + netif = NULL; + + if (ifdev->ifdev_ops->iop_output_v4 != NULL) + return ifdev->ifdev_ops->iop_output_v4(ifdev_get_netif(ifdev), + pbuf, ipaddr); + else + return ifdev_output(ifdev, pbuf, netif, TRUE /*to_bpf*/, + TRUE /*hdrcmplt*/); +} + +/* + * Transmit an IPv6 packet on an interface, as requested by lwIP. As for IPv4. + */ +static err_t +ifdev_output_v6(struct netif * netif, struct pbuf * pbuf, + const ip6_addr_t * ipaddr) +{ + struct ifdev *ifdev = netif_get_ifdev(netif); + err_t err; + + assert(ifdev_loopback != NULL); + + /* Check for reject/blackhole routes. */ + if (!route_output_v6(ifdev, ipaddr, &err)) + return err; + + /* Handle looping of multicast packets on non-loopback interfaces. */ + if (!ifdev_is_loopback(ifdev) && (pbuf->flags & PBUF_FLAG_MCASTLOOP)) + (void)ifdev_output(ifdev_loopback, pbuf, netif, + FALSE /*to_bpf*/, TRUE /*hdrcmplt*/); + + /* Divert packets sent to the local interface address. 
*/ + if (!ifdev_is_loopback(ifdev) && + (netif_get_ip6_addr_match(&ifdev->ifdev_netif, ipaddr) != -1 || + ip6_addr_ismulticast_iflocal(ipaddr))) + ifdev = ifdev_loopback; + else + netif = NULL; + + if (ifdev->ifdev_ops->iop_output_v6 != NULL) + return ifdev->ifdev_ops->iop_output_v6(ifdev_get_netif(ifdev), + pbuf, ipaddr); + else + return ifdev_output(ifdev, pbuf, netif, TRUE /*to_bpf*/, + TRUE /*hdrcmplt*/); +} + +/* + * Status callback function, called by lwIP whenever certain status changes are + * made on the netif. These changes may be initiated either by lwIP itself or + * by us. We use this callback to check lwIP-initiated state changes on local + * IPv6 addresses, using shadow state to filter out self-initiated changes. + * + * One day we might switch to the extended netif callback mechanism offered by + * lwIP. Currently, netif state changes are rare and it takes us little effort + * to find out whether anything changed, so there is no immediate need. + */ +static void +ifdev_status_callback(struct netif * netif) +{ + struct ifdev *ifdev = netif_get_ifdev(netif); + + ifaddr_v6_check(ifdev); +} + +/* + * Initialize the netif structure for a new interface. Most of this is handled + * by the specific interface module. + */ +static err_t +ifdev_init_netif(struct netif * netif) +{ + struct ifdev *ifdev = netif_get_ifdev(netif); + + assert(ifdev != NULL); + + netif->output = ifdev_output_v4; + netif->output_ip6 = ifdev_output_v6; + + netif->hwaddr_len = ifdev->ifdev_data.ifi_addrlen; + netif->mtu = ifdev->ifdev_data.ifi_mtu; + + netif_set_status_callback(netif, ifdev_status_callback); + + return ifdev->ifdev_ops->iop_init(ifdev, netif); +} + +/* + * Retrieve an interface device by its interface index. Return a pointer to + * the interface device if found, or NULL otherwise. If the given interface + * index is zero, this function will always return NULL. + */ +struct ifdev * +ifdev_get_by_index(uint32_t ifindex) +{ + + if (ifindex >= __arraycount(ifdev_table)) + return NULL; + + return ifdev_table[ifindex]; +} + +/* + * Find an interface device by its name. Return a pointer to the interface + * device if found, or NULL otherwise. + */ +struct ifdev * +ifdev_find_by_name(const char * name) +{ + struct ifdev *ifdev; + + TAILQ_FOREACH(ifdev, &ifdev_list, ifdev_next) { + if (!strcmp(ifdev->ifdev_name, name)) + return ifdev; + } + + return NULL; +} + +/* + * Given either NULL or a previously returned interface device object pointer, + * return the first or next interface device object pointer, or NULL if there + * are no more. + */ +struct ifdev * +ifdev_enum(struct ifdev * last) +{ + + if (last == NULL) + return TAILQ_FIRST(&ifdev_list); + else + return TAILQ_NEXT(last, ifdev_next); +} + +/* + * Attach a BPF device as listener to this interface. + */ +void +ifdev_attach_bpf(struct ifdev * ifdev, struct bpfdev_link * bpfl) +{ + + TAILQ_INSERT_TAIL(&ifdev->ifdev_bpf, bpfl, bpfl_next); +} + +/* + * Detach a previously attached BPF device from this interface. + */ +void +ifdev_detach_bpf(struct ifdev * ifdev, struct bpfdev_link * bpfl) +{ + + TAILQ_REMOVE(&ifdev->ifdev_bpf, bpfl, bpfl_next); +} + +/* + * Register the calling party as interested in putting the interface in + * promiscuous mode. There may be multiple such parties, each of which can + * call this function once, after which they must call ifdev_clear_promisc() + * later. If possible, the interface is put in promiscuous mode if there is at + * least one interested party. Return TRUE on success, or FALSE on failure. 
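+ *
+ * A rough sketch of the intended usage pattern (hypothetical caller, for
+ * example the BPF code when a listener enables promiscuous mode):
+ *
+ *	if (!ifdev_set_promisc(ifdev))
+ *		return EINVAL;	(or another suitable error)
+ *	...receive traffic with the interface in promiscuous mode...
+ *	ifdev_clear_promisc(ifdev);	(exactly once per successful call)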
+ */ +int +ifdev_set_promisc(struct ifdev * ifdev) +{ + + /* + * A bit silly, but we want to retain the ability to fail this call for + * other reasons in the future, with BPF handling that case properly. + */ + if (ifdev->ifdev_promisc == UINT_MAX) + return FALSE; + + if (ifdev->ifdev_promisc++ == 0) { + ifdev_update_ifflags(ifdev, + ifdev->ifdev_ifflags | IFF_PROMISC); + + if (ifdev->ifdev_ops->iop_set_promisc != NULL) + ifdev->ifdev_ops->iop_set_promisc(ifdev, TRUE); + } + + return TRUE; +} + +/* + * Deregister a previously registered party interested in putting the interface + * in promiscuous mode. Once the last party deregisters, the device is pulled + * out of promiscuous mode. + */ +void +ifdev_clear_promisc(struct ifdev * ifdev) +{ + + assert(ifdev->ifdev_promisc > 0); + + if (--ifdev->ifdev_promisc == 0) { + if (ifdev->ifdev_ops->iop_set_promisc != NULL) + ifdev->ifdev_ops->iop_set_promisc(ifdev, FALSE); + + ifdev_update_ifflags(ifdev, + ifdev->ifdev_ifflags & ~IFF_PROMISC); + } +} + +/* + * Set NetBSD-style interface flags (IFF_) for an interface. + */ +int +ifdev_set_ifflags(struct ifdev * ifdev, unsigned int ifflags) +{ + int r; + + /* Check and update only the subset of flags that may be changed. */ + ifflags &= ~(IFF_CANTCHANGE | IFF_LOOPBACK); + + /* + * Important: the callback function may call ifdev_update_ifflags() + * itself immediately, to update read-only flags such as IFF_RUNNING + * based on read-write flags such as IFF_UP. So as to make that work.. + * + * 1) this function MUST succeed if the callback function succeeds; + * 2) this function MUST NOT make assumptions about the ifdev_ifflags + * field across the callback invocation. + * + * Conversely, the callback function should be aware that the flags + * field will still be updated with the flags. In this model, it is + * not possible for the callback function to silently change any of the + * given flags. If that is ever necessary, API changes are needed. + */ + if ((r = ifdev->ifdev_ops->iop_set_ifflags(ifdev, ifflags)) != OK) + return r; + + /* + * On success, merge the updated subset with the subset that may not be + * changed. + */ + ifflags |= ifdev->ifdev_ifflags & (IFF_CANTCHANGE | IFF_LOOPBACK); + + ifdev_update_ifflags(ifdev, ifflags); + + return OK; +} + +/* + * Update NetBSD-style interface flags (IFF_) for an interface, and perform any + * required operations as a result of certain flags changing. This function + * bypasses all input checks and directly changes the flags field to exactly + * the given set of flags. + */ +void +ifdev_update_ifflags(struct ifdev * ifdev, unsigned int ifflags) +{ + struct netif *netif; + + /* + * First update the flags field itself. The new value should be + * visible in the routing messages generated below, for example. + */ + ifdev->ifdev_ifflags = ifflags; + + /* + * Then perform operations as a result of the flags field changing. + * For now, this is relevant for IFF_UP only. + */ + netif = ifdev_get_netif(ifdev); + + if ((ifflags & IFF_UP) && !netif_is_up(netif)) { + netif_set_up(netif); + + rtsock_msg_ifinfo(ifdev); + + /* + * Check if all conditions are now met for link-local IPv6 + * address assignment. + */ + ifaddr_v6_set_linklocal(ifdev); + + /* See if we should also reset address states now. 
*/ + if (netif_is_link_up(netif)) + ifaddr_v6_set_up(ifdev); + } else if (!(ifflags & IFF_UP) && netif_is_up(netif)) { + netif_set_down(netif); + + rtsock_msg_ifinfo(ifdev); + } +} + +/* + * Retrieve NetBSD-style interface capabilities (IFCAP_) for an interface: both + * the supported and the enabled capabilities. + */ +void +ifdev_get_ifcap(struct ifdev * ifdev, uint64_t * ifcap, uint64_t * ifena) +{ + + *ifcap = 0; + *ifena = 0; + + if (ifdev->ifdev_ops->iop_get_ifcap != NULL) + ifdev->ifdev_ops->iop_get_ifcap(ifdev, ifcap, ifena); +} + +/* + * Set enabled NetBSD-style interface capabilities (IFCAP_) for an interface. + */ +int +ifdev_set_ifcap(struct ifdev * ifdev, uint64_t ifena) +{ + + if (ifdev->ifdev_ops->iop_set_ifcap != NULL) + return ifdev->ifdev_ops->iop_set_ifcap(ifdev, ifena); + else + return EINVAL; +} + +/* + * Retrieve NetBSD-style media type (IFM_) for an interface. Return OK on + * success, with the current media type selection stored in 'ifcurrent', the + * driver-reported active media type in 'ifactive', and the link status in + * 'ifstatus'. Return a negative error code on failure. + */ +int +ifdev_get_ifmedia(struct ifdev * ifdev, int * ifcurrent, int * ifactive) +{ + + if (ifdev->ifdev_ops->iop_get_ifmedia == NULL) + return ENOTTY; + + ifdev->ifdev_ops->iop_get_ifmedia(ifdev, ifcurrent, ifactive); + + return OK; +} + +/* + * Set NetBSD-style media type (IFM_) for an interface. Return OK on success, + * or a negative error code on failure. + */ +int +ifdev_set_ifmedia(struct ifdev * ifdev, int ifmedia) +{ + + if (ifdev->ifdev_ops->iop_set_ifmedia == NULL) + return ENOTTY; + + if (ifmedia < 0) + return EINVAL; + + return ifdev->ifdev_ops->iop_set_ifmedia(ifdev, ifmedia); +} + +/* + * Set the Maximum Transmission Unit for an interface. Return OK on success, + * or a negative error code on failure. + */ +int +ifdev_set_mtu(struct ifdev * ifdev, unsigned int mtu) +{ + + if (ifdev->ifdev_ops->iop_set_mtu == NULL) + return ENOTTY; + + if (mtu < IFDEV_MIN_MTU || mtu > UINT16_MAX || + !ifdev->ifdev_ops->iop_set_mtu(ifdev, mtu)) + return EINVAL; + + ifdev->ifdev_data.ifi_mtu = mtu; + ifdev->ifdev_netif.mtu = mtu; + + return OK; +} + +/* + * Set IPv6 Neighbor Discovery related flags. + */ +int +ifdev_set_nd6flags(struct ifdev * ifdev, uint32_t nd6flags) +{ + + /* For now, refuse setting any flags that are not even known. */ + if ((nd6flags & ~(ND6_IFF_PERFORMNUD | ND6_IFF_ACCEPT_RTADV | + ND6_IFF_IFDISABLED | ND6_IFF_OVERRIDE_RTADV | + ND6_IFF_AUTO_LINKLOCAL)) != 0) + return EINVAL; + + /* + * Unfortunately, the mismatch between NetBSD and lwIP requires us to + * support but butcher ND6 flags. The current status is as follows: + * + * - ND6_IFF_PERFORMNUD: set by default as lwIP always implements NUD; + * changes are disregarded but possible, for dhcpcd(8). + * - ND6_IFF_ACCEPT_RTADV: disregarded but settable, for dhcpcd(8); in + * our case, lwIP always processes router advertisements but never + * autoconfigures addresses, so this flag has no meaning for us. + * - ND6_IFF_IFDISABLED: not supported; can only be cleared; we could + * probably do detection of link-local address collision and set this + * flag (and disable the interface if set) when that happens; TODO. + * - ND6_IFF_OVERRIDE_RTADV: same as _ACCEPT_ above. + * - ND6_IFF_AUTO_LINKLOCAL: supported, but not initialized based on + * the corresponding sysctl(7) flag for reasons mentioned in ifaddr. 
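+ *
+ * As a rough, hypothetical userland sketch (assuming 's6' is an AF_INET6
+ * socket and the caller is root), dhcpcd(8)-style code toggles these flags
+ * through the ndireq requests handled in ifconf.c:
+ *
+ *	struct in6_ndireq ndi;
+ *
+ *	memset(&ndi, 0, sizeof(ndi));
+ *	strlcpy(ndi.ifname, "lo0", sizeof(ndi.ifname));
+ *	ioctl(s6, SIOCGIFINFO_IN6, &ndi);
+ *	ndi.ndi.flags |= ND6_IFF_ACCEPT_RTADV;
+ *	ioctl(s6, SIOCSIFINFO_FLAGS, &ndi);	(accepted, but disregarded here)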
+ */ + if (nd6flags & ND6_IFF_IFDISABLED) + return EINVAL; + + ifdev->ifdev_nd6flags = nd6flags; + + return OK; +} + +/* + * Report an update to the interface's active hardware address that is *not* + * the result of a user action. If the 'is_factory' flag is set, the address + * is the factory (driver-given) address. This function is for use by + * interface modules, to update the internal state to their current external + * state. + */ +void +ifdev_update_hwaddr(struct ifdev * ifdev, const uint8_t * hwaddr, + int is_factory) +{ + + return ifaddr_dl_update(ifdev, hwaddr, is_factory); +} + +/* + * Insert a new interface device into the list of interface devices, at a + * location determined by policy. + */ +static void +ifdev_insert(struct ifdev * ifdev) +{ + struct ifdev *ifdev2; + const char *p; + unsigned int unit, unit2; + size_t namelen; + int found; + + /* + * While NetBSD can set up all interfaces in the order it wants them to + * appear in, we do not have such luxury: network device drivers come + * up and report to us in no particular predefined order, and we have + * no way to know how many and which will appear. The result is that + * we always have to create the loopback device first, something that + * is explicitly said to be bad in NetBSD. Instead, we create an + * illusion of a reasonable order by performing insertion sort on the + * interface list, using (for now) these rules, ordered by priority: + * + * 1. same-named devices are sorted by their unit number; + * 2. loopback interfaces are inserted after all other interfaces; + * 3. new devices are added at the end of their type category. + * + * In the future, other forms of real-vs-virtual sorting may be added. + */ + + /* First check for same-named devices (#1). */ + for (p = ifdev->ifdev_name; *p != '\0' && (*p < '0' || *p > '9'); p++); + + namelen = (size_t)(p - ifdev->ifdev_name); + + for (unit = 0; *p >= '0' && *p <= '9'; p++) + unit = unit * 10 + *p - '0'; + + found = FALSE; + TAILQ_FOREACH(ifdev2, &ifdev_list, ifdev_next) { + if (!strncmp(ifdev->ifdev_name, ifdev2->ifdev_name, namelen) && + *(p = &ifdev2->ifdev_name[namelen]) >= '0' && *p <= '9') { + for (unit2 = 0; *p >= '0' && *p <= '9'; p++) + unit2 = unit2 * 10 + *p - '0'; + + assert(unit != unit2); + + found = TRUE; + if (unit2 > unit) + break; + } else if (found) + break; + } + + if (found) { + if (ifdev2 != NULL) + TAILQ_INSERT_BEFORE(ifdev2, ifdev, ifdev_next); + else + TAILQ_INSERT_TAIL(&ifdev_list, ifdev, ifdev_next); + + return; + } + + /* + * No same-named device found. Is this a loopback interface? If not, + * insert before the first loopback device, if any. + */ + if (!ifdev_is_loopback(ifdev)) { + TAILQ_FOREACH(ifdev2, &ifdev_list, ifdev_next) { + if (ifdev_is_loopback(ifdev2)) { + TAILQ_INSERT_BEFORE(ifdev2, ifdev, ifdev_next); + + return; + } + } + } + + /* + * The given device is not a loopback device, or there was no loopback + * device in the list, possibly because it was empty. Add to the tail. + */ + TAILQ_INSERT_TAIL(&ifdev_list, ifdev, ifdev_next); +} + +/* + * Add and initialize an interface device. 
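+ *
+ * A rough sketch of a call as a (hypothetical) ethernet-style interface
+ * module might issue it; the constants are the usual NetBSD ones, the
+ * flag choices are illustrative only, and the "myif" names are made up:
+ *
+ *	ifdev_add(&mdev->mdev_ifdev, "myif0", IFF_BROADCAST | IFF_MULTICAST,
+ *	    IFT_ETHER, sizeof(struct ether_header), ETHER_ADDR_LEN,
+ *	    DLT_EN10MB, ETHERMTU, ND6_IFF_PERFORMNUD, &myif_ops);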
+ */ +void +ifdev_add(struct ifdev * ifdev, const char * name, unsigned int ifflags, + unsigned int iftype, size_t hdrlen, size_t addrlen, unsigned int dlt, + unsigned int mtu, uint32_t nd6flags, const struct ifdev_ops * iop) +{ + unsigned int ifindex; + ip4_addr_t ip4addr_any, ip4addr_none; + + /* + * Since the call to netif_add() may end up invoking some of our + * callbacks (the add-multicast-address ones in particular), make sure + * that everything else is set up first. We cannot set up the index + * mapping until netif_add() returns, but this is currently no problem. + */ + strlcpy(ifdev->ifdev_name, name, sizeof(ifdev->ifdev_name)); + ifdev->ifdev_ifflags = 0; /* will be updated below */ + ifdev->ifdev_dlt = dlt; + ifdev->ifdev_nd6flags = nd6flags; + ifdev->ifdev_ops = iop; + + memset(&ifdev->ifdev_data, 0, sizeof(ifdev->ifdev_data)); + + assert(addrlen <= NETIF_MAX_HWADDR_LEN); + assert(mtu >= IFDEV_MIN_MTU && mtu <= UINT16_MAX); + + ifdev->ifdev_data.ifi_type = iftype; + ifdev->ifdev_data.ifi_hdrlen = hdrlen; + ifdev->ifdev_data.ifi_addrlen = addrlen; + ifdev->ifdev_data.ifi_link_state = LINK_STATE_UNKNOWN; + ifdev->ifdev_data.ifi_mtu = mtu; + + TAILQ_INIT(&ifdev->ifdev_bpf); + + ifaddr_init(ifdev); + + /* + * We have to assign an IPv4 address at netif addition time, but we may + * not have one yet, so pass in an "any" address for now. Hopefully + * lwIP will not mistake this for a real IPv4 address if we happen to + * enable the interface with only an IPv6 address later on. + */ + ip4_addr_set_any(&ip4addr_any); + ip4_addr_set_u32(&ip4addr_none, PP_HTONL(INADDR_NONE)); + + /* + * Insert the new interface device into a sensible place in the current + * list of interfaces. + */ + ifdev_insert(ifdev); + + /* + * netif_add() can fail only as a result of the initialization callback + * failing, which is something that should never happen in our case. + */ + if (netif_add(&ifdev->ifdev_netif, &ip4addr_any, &ip4addr_none, + &ip4addr_any, ifdev, ifdev_init_netif, iop->iop_input) == NULL) + panic("unable to add netif"); + + /* + * Set up the index mapping. Since interface index zero never + * generated, table slot zero is always NULL. We could shift all + * elements by one to save four bytes, but there's no real point. + */ + ifindex = netif_get_index(&ifdev->ifdev_netif); + + if (ifindex == 0 || ifindex >= __arraycount(ifdev_table)) + panic("invalid lwIP-generated interface index %u", ifindex); + + ifdev_table[ifindex] = ifdev; + + /* + * Set the initial interface flags. Use the regular procedure for this + * just in case the interface module is crazy enough to set the + * interface up right away (which is never a good idea but still). + */ + ifdev_update_ifflags(ifdev, ifflags); + + /* + * If this is the first loopback interface to be registered, save it as + * the loopback interface that we will use to loop back self-destined + * packets on other interfaces. Do this after setting the interface + * flags, since those are what we use to perform this loopback check. + */ + if (ifdev_loopback == NULL && ifdev_is_loopback(ifdev)) + ifdev_loopback = ifdev; + + /* Finally, announce the new interface. */ + rtsock_msg_ifannounce(ifdev, TRUE /*arrival*/); +} + +/* + * Remove an interface device. Return OK on success, or a negative error code + * on failure. Only loopback interfaces may be refused for removal. 
+ */ +int +ifdev_remove(struct ifdev * ifdev) +{ + struct bpfdev_link *bpfl; + + /* + * If this is the loopback interface used to loop back packets for + * other interfaces (typically lo0), we cannot afford to get rid of it. + */ + if (ifdev == ifdev_loopback) + return EPERM; + + /* + * Take down the interface for the purpose of sending a routing + * message. NetBSD sends a RTM_IFINFO even if the interface was down + * already, and so we do not check whether IFF_UP was set at all here. + */ + ifdev_update_ifflags(ifdev, ifdev->ifdev_ifflags & ~IFF_UP); + + /* + * Report all associated addresses as deleted. It is not necessary to + * actually delete the addresses, nor is that even possible in all + * cases. In particular, the active hardware address cannot be + * deleted. Since the active hardware address is used in all address + * change announcements, delete it at the very end. + */ + ifaddr_v4_clear(ifdev); + ifaddr_v6_clear(ifdev); + ifaddr_dl_clear(ifdev); + + /* + * Delete all remaining routes associated with the interface. These + * are reported as well. We do this after clearing the addresses so as + * not to confuse the route deletion part of clearing addresses. + */ + route_clear(ifdev); + + /* Finally, announce the interface itself as gone. */ + rtsock_msg_ifannounce(ifdev, FALSE /*arrival*/); + + /* + * Free up all per-socket multicast membership structures associated to + * the interface. There is no need to leave the multicast groups. + */ + mcast_clear(ifdev); + + /* + * Also tell attached BPF devices that the interface is now gone. Do + * not bother to reset the list. + */ + TAILQ_FOREACH(bpfl, &ifdev->ifdev_bpf, bpfl_next) + bpfdev_detach(bpfl); + + /* Then perform the actual interface removal. */ + netif_remove(&ifdev->ifdev_netif); + + TAILQ_REMOVE(&ifdev_list, ifdev, ifdev_next); + + assert(ifdev_table[ifdev_get_index(ifdev)] == ifdev); + ifdev_table[ifdev_get_index(ifdev)] = NULL; + + return OK; +} + +/* + * Return the loopback interface. + */ +struct ifdev * +ifdev_get_loopback(void) +{ + + assert(ifdev_loopback != NULL); + + return ifdev_loopback; +} + +/* + * Report an update of the link state of the given interface, to 'unknown', + * 'up', or 'down', using NetBSD's LINK_STATE_ values. The link state is + * changed in the associated lwIP netif, and is reported on monitoring routing + * sockets. This function is for use by interface modules, to update the + * internal state to their current external state. + */ +void +ifdev_update_link(struct ifdev * ifdev, int iflink) +{ + struct netif *netif; + int was_up, is_up; + + ifdev->ifdev_data.ifi_link_state = iflink; + + /* + * For netif, 'up' and 'unknown' are the same link state: we simply try + * to send and receive packets in both cases. Thus, transitions from + * and to the 'down' link state are the ones that matter. + */ + netif = ifdev_get_netif(ifdev); + + was_up = netif_is_link_up(netif); + is_up = (iflink != LINK_STATE_DOWN); + + if (was_up != is_up) { + if (is_up) { + netif_set_link_up(netif); + + /* See if we should also reset address states now. */ + if (ifdev_is_up(ifdev)) + ifaddr_v6_set_up(ifdev); + } else + netif_set_link_down(netif); + + rtsock_msg_ifinfo(ifdev); + } +} + +/* + * Register a virtual interface type, using a name prefix and a function that + * is called when creation of a virtual interface of that type is requested. 
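+ *
+ * A rough, hypothetical sketch of a virtual interface module using this,
+ * where myif_create() is assumed to allocate its own structure and call
+ * ifdev_add() with the full given name:
+ *
+ *	static int
+ *	myif_create(const char * name)
+ *	{
+ *		...allocate a myif instance and ifdev_add() it as 'name'...
+ *		return OK;
+ *	}
+ *
+ *	void
+ *	myif_init(void)
+ *	{
+ *		ifdev_register("myif", myif_create);
+ *	}
+ *
+ * After that, "ifconfig myif0 create" (SIOCIFCREATE) reaches
+ * myif_create("myif0") through ifdev_create().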
*/ +void +ifdev_register(const char * name, int (* create)(const char *)) +{ + + if (ifdev_vtypes == __arraycount(ifdev_vtype)) + panic("too few slots for all virtual interface types"); + + ifdev_vtype[ifdev_vtypes].ifvt_name = name; + ifdev_vtype[ifdev_vtypes].ifvt_namelen = strlen(name); + ifdev_vtype[ifdev_vtypes].ifvt_create = create; + ifdev_vtypes++; +} + +/* + * Verify that the given name is a valid interface name that can be used for + * creating a new interface. In particular, check that the given name is a + * valid interface name, consisting of an alphabetic string (the interface type + * or driver name) followed by a number string (the unit or instance number). + * Furthermore, make sure that the name does not already exist. Finally, see + * if the name prefix is reserved for a virtual interface type. If the given + * 'vtype_slot' pointer is not NULL, the prefix must be, and the virtual type + * slot number is returned in 'vtype_slot' on success. If 'vtype_slot' is + * NULL, the name must not have a virtual interface prefix, and an error is + * returned if it is. Since vtype slot numbers are meaningless outside of this + * module, external callers must always pass in NULL. This function returns OK + * on success or a negative error code on error. + */ +int +ifdev_check_name(const char * name, unsigned int * vtype_slot) +{ + const char *p; + size_t namelen; + unsigned int slot; + + /* + * First see if the name is valid at all. TODO: decide if we want to + * allow uppercase letters, dashes, and/or underscores. + */ + for (p = name; *p >= 'a' && *p <= 'z'; p++); + + if (p == name || *p == '\0') + return EINVAL; + + namelen = (size_t)(p - name); + + for (; *p >= '0' && *p <= '9'; p++); + + if (*p != '\0') + return EINVAL; + + /* Then make sure that it does not already exist. */ + if (ifdev_find_by_name(name) != NULL) + return EEXIST; + + /* See if there is a matching virtual interface type for the name. */ + for (slot = 0; slot < ifdev_vtypes; slot++) { + if (ifdev_vtype[slot].ifvt_namelen == namelen && + !strncmp(ifdev_vtype[slot].ifvt_name, name, namelen)) + break; + } + + /* The interpretation of the result depends on 'vtype_slot'. */ + if (vtype_slot != NULL) { + if (slot == ifdev_vtypes) + return EINVAL; + + *vtype_slot = slot; + } else if (slot != ifdev_vtypes) + return EINVAL; + + return OK; +} + +/* + * Create a new virtual interface. The virtual interface type is based on the + * given name (without unit number). Return OK if the virtual interface has + * been successfully created, or a negative error code otherwise. This + * function is used both for the SIOCIFCREATE ioctl and internally. + */ +int +ifdev_create(const char * name) +{ + unsigned int slot; + int r; + + /* Verify that the given name is an acceptable interface name. */ + if ((r = ifdev_check_name(name, &slot)) != OK) + return EINVAL; + + /* Let the virtual interface implementation handle the rest. */ + return ifdev_vtype[slot].ifvt_create(name); +} + +/* + * Destroy an interface, if possible. + */ +int +ifdev_destroy(struct ifdev * ifdev) +{ + + if (ifdev->ifdev_ops->iop_destroy == NULL) + return EINVAL; + + return ifdev->ifdev_ops->iop_destroy(ifdev); +} + +/* + * Enumerate the names of currently supported virtual interface types. Return + * a pointer to the null-terminated name prefix of the Nth virtual interface + * type if the (zero-based) N value is within range, or NULL otherwise.
+ */ +const char * +ifdev_enum_vtypes(unsigned int num) +{ + + if (num < ifdev_vtypes) + return ifdev_vtype[num].ifvt_name; + else + return NULL; +} diff --git a/minix/net/lwip/ifdev.h b/minix/net/lwip/ifdev.h new file mode 100644 index 000000000..16206f906 --- /dev/null +++ b/minix/net/lwip/ifdev.h @@ -0,0 +1,155 @@ +#ifndef MINIX_NET_LWIP_IFDEV_H +#define MINIX_NET_LWIP_IFDEV_H + +#include +#include +#include +#include + +/* + * NetBSD makes setting a hardware address through ifconfig(8) a whole lot + * harder than it needs to be, namely by keeping a list of possible hardware + * addresses and marking one of them as active. For us, that level of extra + * flexibility is completely useless. In order to shield individual interface + * modules from having to deal with the rather extended interface for the list + * management, we maintain the list in ifdev and simply use a iop_set_hwaddr() + * call to the modules when the active address changes. This setting is the + * maximum number of hardware addresses in the list maintained by ifdev. It + * should be at least 2, or changing hardware addresses will not be possible. + */ +#define IFDEV_NUM_HWADDRS 3 + +struct ifdev; +struct bpfdev_link; +struct sockaddr_dlx; + +/* Interface operations table. */ +struct ifdev_ops { + err_t (* iop_init)(struct ifdev * ifdev, struct netif * netif); + err_t (* iop_input)(struct pbuf * pbuf, struct netif * netif); + err_t (* iop_output)(struct ifdev * ifdev, struct pbuf * pbuf, + struct netif * netif); + err_t (* iop_output_v4)(struct netif * netif, struct pbuf * pbuf, + const ip4_addr_t * ipaddr); + err_t (* iop_output_v6)(struct netif * netif, struct pbuf * pbuf, + const ip6_addr_t * ipaddr); + void (* iop_hdrcmplt)(struct ifdev * ifdev, struct pbuf * pbuf); + void (* iop_poll)(struct ifdev * ifdev); + int (* iop_set_ifflags)(struct ifdev * ifdev, unsigned int ifflags); + void (* iop_get_ifcap)(struct ifdev * ifdev, uint64_t * ifcap, + uint64_t * ifena); + int (* iop_set_ifcap)(struct ifdev * ifdev, uint64_t ifcap); + void (* iop_get_ifmedia)(struct ifdev * ifdev, int * ifcurrent, + int * ifactive); + int (* iop_set_ifmedia)(struct ifdev * ifdev, int ifmedia); + void (* iop_set_promisc)(struct ifdev * ifdev, int promisc); + int (* iop_set_hwaddr)(struct ifdev * ifdev, const uint8_t * hwaddr); + int (* iop_set_mtu)(struct ifdev * ifdev, unsigned int mtu); + int (* iop_destroy)(struct ifdev * ifdev); +}; + +/* Hardware address list entry. The first entry, if any, is the active one. */ +struct ifdev_hwaddr { + uint8_t ifhwa_addr[NETIF_MAX_HWADDR_LEN]; + uint8_t ifhwa_flags; +}; +#define IFHWAF_VALID 0x01 /* entry contains an address */ +#define IFHWAF_FACTORY 0x02 /* factory (device-given) address */ + +/* Interface structure. */ +struct ifdev { + TAILQ_ENTRY(ifdev) ifdev_next; /* list of active interfaces */ + char ifdev_name[IFNAMSIZ]; /* interface name, null terminated */ + unsigned int ifdev_ifflags; /* NetBSD-style interface flags */ + unsigned int ifdev_dlt; /* data link type (DLT_) */ + unsigned int ifdev_promisc; /* number of promiscuity requestors */ + struct netif ifdev_netif; /* lwIP interface structure */ + struct if_data ifdev_data; /* NetBSD-style interface data */ + char ifdev_v4set; /* interface has an IPv4 address? 
*/ + uint8_t ifdev_v6prefix[LWIP_IPV6_NUM_ADDRESSES]; /* IPv6 prefixes */ + uint8_t ifdev_v6flags[LWIP_IPV6_NUM_ADDRESSES]; /* v6 address flags */ + uint8_t ifdev_v6state[LWIP_IPV6_NUM_ADDRESSES]; /* v6 shadow states */ + uint8_t ifdev_v6scope[LWIP_IPV6_NUM_ADDRESSES]; /* cached v6 scopes */ + struct ifdev_hwaddr ifdev_hwlist[IFDEV_NUM_HWADDRS]; /* HW addr's */ + uint32_t ifdev_nd6flags; /* ND6-related flags (ND6_IFF_) */ + const struct ifdev_ops *ifdev_ops; /* interface operations table */ + TAILQ_HEAD(, bpfdev_link) ifdev_bpf; /* list of attached BPF devices */ +}; + +#define ifdev_get_name(ifdev) ((ifdev)->ifdev_name) +#define ifdev_get_ifflags(ifdev) ((ifdev)->ifdev_ifflags) +#define ifdev_get_dlt(ifdev) ((ifdev)->ifdev_dlt) +#define ifdev_is_promisc(ifdev) ((ifdev)->ifdev_promisc != 0) +#define ifdev_get_netif(ifdev) (&(ifdev)->ifdev_netif) +#define ifdev_get_nd6flags(ifdev) ((ifdev)->ifdev_nd6flags) +#define ifdev_get_iftype(ifdev) ((ifdev)->ifdev_data.ifi_type) +#define ifdev_get_hwlen(ifdev) ((ifdev)->ifdev_data.ifi_addrlen) +#define ifdev_get_hdrlen(ifdev) ((ifdev)->ifdev_data.ifi_hdrlen) +#define ifdev_get_link(ifdev) ((ifdev)->ifdev_data.ifi_link_state) +#define ifdev_get_mtu(ifdev) ((ifdev)->ifdev_data.ifi_mtu) +#define ifdev_get_metric(ifdev) ((ifdev)->ifdev_data.ifi_metric) +#define ifdev_get_ifdata(ifdev) (&(ifdev)->ifdev_data) +#define ifdev_is_loopback(ifdev) ((ifdev)->ifdev_ifflags & IFF_LOOPBACK) +#define ifdev_is_up(ifdev) ((ifdev)->ifdev_ifflags & IFF_UP) +#define ifdev_is_link_up(ifdev) (netif_is_link_up(&(ifdev)->ifdev_netif)) +#define ifdev_set_metric(ifdev, metric) \ + ((void)((ifdev)->ifdev_data.ifi_metric = (metric))) +#define ifdev_get_index(ifdev) \ + ((uint32_t)(netif_get_index(ifdev_get_netif(ifdev)))) + +#define ifdev_output_drop(ifdev) ((ifdev)->ifdev_data.ifi_oerrors++) + +#define netif_get_ifdev(netif) ((struct ifdev *)(netif)->state) + +void ifdev_init(void); +void ifdev_poll(void); + +void ifdev_register(const char * name, int (* create)(const char *)); + +void ifdev_input(struct ifdev * ifdev, struct pbuf * pbuf, + struct netif * netif, int to_bpf); +err_t ifdev_output(struct ifdev * ifdev, struct pbuf * pbuf, + struct netif * netif, int to_bpf, int hdrcmplt); + +void ifdev_attach_bpf(struct ifdev * ifdev, struct bpfdev_link * bpfl); +void ifdev_detach_bpf(struct ifdev * ifdev, struct bpfdev_link * bpfl); + +struct ifdev *ifdev_get_by_index(uint32_t ifindex); +struct ifdev *ifdev_find_by_name(const char * name); +struct ifdev *ifdev_enum(struct ifdev * last); + +int ifdev_check_name(const char * name, unsigned int * vtype_slot); + +int ifdev_set_promisc(struct ifdev * ifdev); +void ifdev_clear_promisc(struct ifdev * ifdev); + +int ifdev_set_ifflags(struct ifdev * ifdev, unsigned int ifflags); +void ifdev_update_ifflags(struct ifdev * ifdev, unsigned int ifflags); + +void ifdev_get_ifcap(struct ifdev * ifdev, uint64_t * ifcap, + uint64_t * ifena); +int ifdev_set_ifcap(struct ifdev * ifdev, uint64_t ifena); + +int ifdev_get_ifmedia(struct ifdev * ifdev, int * ifcurrent, int * ifactive); +int ifdev_set_ifmedia(struct ifdev * ifdev, int ifmedia); + +int ifdev_set_mtu(struct ifdev * ifdev, unsigned int mtu); + +int ifdev_set_nd6flags(struct ifdev * ifdev, uint32_t nd6flags); + +void ifdev_add(struct ifdev * ifdev, const char * name, unsigned int ifflags, + unsigned int iftype, size_t hdrlen, size_t addrlen, unsigned int dlt, + unsigned int mtu, uint32_t nd6flags, const struct ifdev_ops * iop); +int ifdev_remove(struct ifdev * ifdev); + +struct ifdev 
*ifdev_get_loopback(void);
+
+void ifdev_update_link(struct ifdev * ifdev, int link);
+void ifdev_update_hwaddr(struct ifdev * ifdev, const uint8_t * hwaddr,
+	int is_factory);
+
+int ifdev_create(const char * name);
+int ifdev_destroy(struct ifdev * ifdev);
+const char *ifdev_enum_vtypes(unsigned int num);
+
+#endif /* !MINIX_NET_LWIP_IFDEV_H */
diff --git a/minix/net/lwip/ipsock.c b/minix/net/lwip/ipsock.c
new file mode 100644
index 000000000..f6e973e11
--- /dev/null
+++ b/minix/net/lwip/ipsock.c
@@ -0,0 +1,761 @@
+/* LWIP service - ipsock.c - shared IP-level socket code */
+
+#include "lwip.h"
+#include "ifaddr.h"
+
+#define ip6_hdr __netbsd_ip6_hdr /* conflicting definitions */
+#include
+#include
+#include
+#include
+#undef ip6_hdr
+
+/* The following are sysctl(7) settings. */
+int lwip_ip4_forward = 0;		/* We patch lwIP to check these.. */
+int lwip_ip6_forward = 0;		/* ..two settings at run time. */
+static int ipsock_v6only = 1;
+
+/* The CTL_NET PF_INET IPPROTO_IP subtree. */
+static struct rmib_node net_inet_ip_table[] = {
+/* 1*/	[IPCTL_FORWARDING]	= RMIB_INTPTR(RMIB_RW, &lwip_ip4_forward,
+				    "forwarding",
+				    "Enable forwarding of INET datagrams"),
+/* 3*/	[IPCTL_DEFTTL]		= RMIB_INT(RMIB_RO, IP_DEFAULT_TTL, "ttl",
+				    "Default TTL for an INET datagram"),
+/*23*/	[IPCTL_LOOPBACKCKSUM]	= RMIB_FUNC(RMIB_RW | CTLTYPE_INT, sizeof(int),
+				    loopif_cksum, "do_loopback_cksum",
+				    "Perform IP checksum on loopback"),
+};
+
+static struct rmib_node net_inet_ip_node =
+    RMIB_NODE(RMIB_RO, net_inet_ip_table, "ip", "IPv4 related settings");
+
+/* The CTL_NET PF_INET6 IPPROTO_IPV6 subtree. */
+static struct rmib_node net_inet6_ip6_table[] = {
+/* 1*/	[IPV6CTL_FORWARDING]	= RMIB_INTPTR(RMIB_RW, &lwip_ip6_forward,
+				    "forwarding",
+				    "Enable forwarding of INET6 datagrams"),
+				/*
+				 * The following functionality is not
+				 * implemented in lwIP at this time.
+				 */
+/* 2*/	[IPV6CTL_SENDREDIRECTS]	= RMIB_INT(RMIB_RO, 0, "redirect", "Enable "
+				    "sending of ICMPv6 redirect messages"),
+/* 3*/	[IPV6CTL_DEFHLIM]	= RMIB_INT(RMIB_RO, IP_DEFAULT_TTL, "hlim",
+				    "Hop limit for an INET6 datagram"),
+/*12*/	[IPV6CTL_ACCEPT_RTADV]	= RMIB_INTPTR(RMIB_RW, &ifaddr_accept_rtadv,
+				    "accept_rtadv",
+				    "Accept router advertisements"),
+/*16*/	[IPV6CTL_DAD_COUNT]	= RMIB_INT(RMIB_RO,
+				    LWIP_IPV6_DUP_DETECT_ATTEMPTS, "dad_count",
+				    "Number of Duplicate Address Detection "
+				    "probes to send"),
+/*24*/	[IPV6CTL_V6ONLY]	= RMIB_INTPTR(RMIB_RW, &ipsock_v6only,
+				    "v6only", "Disallow PF_INET6 sockets from "
+				    "connecting to PF_INET sockets"),
+				/*
+				 * The following setting is significantly
+				 * different from NetBSD, and therefore it has
+				 * a somewhat different description as well.
+				 */
+/*35*/	[IPV6CTL_AUTO_LINKLOCAL]= RMIB_INTPTR(RMIB_RW, &ifaddr_auto_linklocal,
+				    "auto_linklocal", "Enable global support "
+				    "for adding IPv6 link-local addresses to "
+				    "interfaces"),
+				/*
+				 * Temporary addresses are managed entirely by
+				 * userland. We only maintain the settings.
+				 */
+/*+0*/	[IPV6CTL_MAXID]		= RMIB_INT(RMIB_RW, 0, "use_tempaddr",
+				    "Use temporary address"),
+/*+1*/	[IPV6CTL_MAXID + 1]	= RMIB_INT(RMIB_RW, 86400, "temppltime",
+				    "Preferred lifetime of a temporary "
+				    "address"),
+/*+2*/	[IPV6CTL_MAXID + 2]	= RMIB_INT(RMIB_RW, 604800, "tempvltime",
+				    "Valid lifetime of a temporary address"),
+};
+
+static struct rmib_node net_inet6_ip6_node =
+    RMIB_NODE(RMIB_RO, net_inet6_ip6_table, "ip6", "IPv6 related settings");
+
+/*
+ * Initialize the IP sockets module.
+ */ +void +ipsock_init(void) +{ + + /* + * Register the net.inet.ip and net.inet6.ip6 subtrees. Unlike for the + * specific protocols (TCP/UDP/RAW), here the IPv4 and IPv6 subtrees + * are and must be separate, even though many settings are shared + * between the two at the lwIP level. Ultimately we may have to split + * the subtrees for the specific protocols, too, though.. + */ + mibtree_register_inet(AF_INET, IPPROTO_IP, &net_inet_ip_node); + mibtree_register_inet(AF_INET6, IPPROTO_IPV6, &net_inet6_ip6_node); +} + +/* + * Return the lwIP IP address type (IPADDR_TYPE_) for the given IP socket. + */ +static int +ipsock_get_type(struct ipsock * ip) +{ + + if (!(ip->ip_flags & IPF_IPV6)) + return IPADDR_TYPE_V4; + else if (ip->ip_flags & IPF_V6ONLY) + return IPADDR_TYPE_V6; + else + return IPADDR_TYPE_ANY; +} + +/* + * Create an IP socket, for the given (PF_/AF_) domain and initial send and + * receive buffer sizes. Return the lwIP IP address type that should be used + * to create the corresponding PCB. Return a pointer to the libsockevent + * socket in 'sockp'. This function must not allocate any resources in any + * form, as socket creation may still fail later, in which case no destruction + * function is called. + */ +int +ipsock_socket(struct ipsock * ip, int domain, size_t sndbuf, size_t rcvbuf, + struct sock ** sockp) +{ + + ip->ip_flags = (domain == AF_INET6) ? IPF_IPV6 : 0; + + if (domain == AF_INET6 && ipsock_v6only) + ip->ip_flags |= IPF_V6ONLY; + + ip->ip_sndbuf = sndbuf; + ip->ip_rcvbuf = rcvbuf; + + /* Important: when adding settings here, also change ipsock_clone(). */ + + *sockp = &ip->ip_sock; + + return ipsock_get_type(ip); +} + +/* + * Clone the given socket 'ip' into the new socket 'newip', using the socket + * identifier 'newid'. In particular, tell libsockevent about the clone and + * copy over any settings from 'ip' to 'newip' that can be inherited on a + * socket. Cloning is used for new TCP connections arriving on listening TCP + * sockets. This function must not fail. + */ +void +ipsock_clone(struct ipsock * ip, struct ipsock * newip, sockid_t newid) +{ + + sockevent_clone(&ip->ip_sock, &newip->ip_sock, newid); + + /* Inherit all settings from the original socket. */ + newip->ip_flags = ip->ip_flags; + newip->ip_sndbuf = ip->ip_sndbuf; + newip->ip_rcvbuf = ip->ip_rcvbuf; +} + +/* + * Create an address for the given socket, taking into account whether + * the socket is IPv4, IPv6, or mixed. The generated address, stored in + * 'ipaddr', will have the same type as returned from the ipsock_socket() call. + */ +void +ipsock_get_any_addr(struct ipsock * ip, ip_addr_t * ipaddr) +{ + + ip_addr_set_any(ipsock_is_ipv6(ip), ipaddr); + + if (ipsock_is_ipv6(ip) && !ipsock_is_v6only(ip)) + IP_SET_TYPE(ipaddr, IPADDR_TYPE_ANY); +} + +/* + * Verify whether the given (properly scoped) IP address is a valid source + * address for the given IP socket. The 'allow_mcast' flag indicates whether + * the source address is allowed to be a multicast address. Return OK on + * success. If 'ifdevp' is not NULL, it is filled with either the interface + * that owns the address, or NULL if the address is (while valid) not + * associated with a particular interface. On failure, return a negative error + * code. This function must be called, in one way or another, for every source + * address used for binding or sending on a IP-layer socket. 
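+ *
+ * A minimal caller sketch, assuming only the semantics documented here
+ * ('ip' is the socket and 'ipaddr' a parsed, properly scoped source
+ * address; multicast sources are not allowed in this example):
+ *
+ *	struct ifdev *ifdev;
+ *	int r;
+ *
+ *	if ((r = ipsock_check_src_addr(ip, &ipaddr, FALSE, &ifdev)) != OK)
+ *		return r;
+ *
+ * ..after which 'ifdev' identifies the interface owning the address, or is
+ * NULL for addresses that are valid but not tied to one interface.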
+ */ +int +ipsock_check_src_addr(struct ipsock * ip, ip_addr_t * ipaddr, int allow_mcast, + struct ifdev ** ifdevp) +{ + ip6_addr_t *ip6addr; + struct ifdev *ifdev; + uint32_t inaddr, zone; + int is_mcast; + + /* + * TODO: for now, forbid binding to multicast addresses. Callers that + * never allow multicast addresses anyway (e.g., IPV6_PKTINFO) should + * do their own check for this; the one here may eventually be removed. + */ + is_mcast = ip_addr_ismulticast(ipaddr); + + if (is_mcast && !allow_mcast) + return EADDRNOTAVAIL; + + if (IP_IS_V6(ipaddr)) { + /* + * The given address must not have a KAME-style embedded zone. + * This check is already performed in addr_get_inet(), but we + * have to replicate it here because not all source addresses + * go through addr_get_inet(). + */ + ip6addr = ip_2_ip6(ipaddr); + + if (ip6_addr_has_scope(ip6addr, IP6_UNKNOWN) && + (ip6addr->addr[0] & PP_HTONL(0x0000ffffUL))) + return EINVAL; + + /* + * lwIP does not support IPv4-mapped IPv6 addresses, so these + * must be converted to plain IPv4 addresses instead. The IPv4 + * 'any' address is not supported in this form. In V6ONLY + * mode, refuse connecting or sending to IPv4-mapped addresses + * at all. + */ + if (ip6_addr_isipv4mappedipv6(ip6addr)) { + if (ipsock_is_v6only(ip)) + return EINVAL; + + inaddr = ip6addr->addr[3]; + + if (inaddr == PP_HTONL(INADDR_ANY)) + return EADDRNOTAVAIL; + + ip_addr_set_ip4_u32(ipaddr, inaddr); + } + } + + ifdev = NULL; + + if (!ip_addr_isany(ipaddr)) { + if (IP_IS_V6(ipaddr) && + ip6_addr_lacks_zone(ip_2_ip6(ipaddr), IP6_UNKNOWN)) + return EADDRNOTAVAIL; + + /* + * If the address is a unicast address, it must be assigned to + * an interface. Otherwise, if it is a zoned multicast + * address, the zone denotes the interface. For global + * multicast addresses, we cannot determine an interface. + */ + if (!is_mcast) { + if ((ifdev = ifaddr_map_by_addr(ipaddr)) == NULL) + return EADDRNOTAVAIL; + } else { + /* Some multicast addresses are not acceptable. */ + if (!addr_is_valid_multicast(ipaddr)) + return EINVAL; + + if (IP_IS_V6(ipaddr) && + ip6_addr_has_zone(ip_2_ip6(ipaddr))) { + zone = ip6_addr_zone(ip_2_ip6(ipaddr)); + + if ((ifdev = ifdev_get_by_index(zone)) == NULL) + return ENXIO; + } + } + } + + if (ifdevp != NULL) + *ifdevp = ifdev; + + return OK; +} + +/* + * Retrieve and validate a source address for use in a socket bind call on + * socket 'ip'. The user-provided address is given as 'addr', with length + * 'addr_len'. The socket's current local IP address and port are given as + * 'local_ip' and 'local_port', respectively; for raw sockets, the given local + * port number is always zero. The caller's endpoint is given as 'user_endpt', + * used to make sure only root can bind to local port numbers. The boolean + * 'allow_mcast' flag indicates whether the source address is allowed to be a + * multicast address. On success, return OK with the source IP address stored + * in 'src_addr' and, if 'src_port' is not NULL, the port number to bind to + * stored in 'portp'. Otherwise, return a negative error code. This function + * performs all the tasks necessary before the socket can be bound using a lwIP + * call. + */ +int +ipsock_get_src_addr(struct ipsock * ip, const struct sockaddr * addr, + socklen_t addr_len, endpoint_t user_endpt, ip_addr_t * local_ip, + uint16_t local_port, int allow_mcast, ip_addr_t * src_addr, + uint16_t * src_port) +{ + uint16_t port; + int r; + + /* + * If the socket has been bound already, it cannot be bound again. 
+	 * We check this by checking whether the current local port is
+	 * non-zero. This rule does not apply to raw sockets, but raw sockets
+	 * have no port numbers anyway, so this conveniently works out.
+	 * However, raw sockets may not be rebound after being connected, but
+	 * that is checked before we even get here.
+	 */
+	if (local_port != 0)
+		return EINVAL;
+
+	/* Parse the user-provided address. */
+	if ((r = addr_get_inet(addr, addr_len, ipsock_get_type(ip), src_addr,
+	    FALSE /*kame*/, &port)) != OK)
+		return r;
+
+	/* Validate the user-provided address. */
+	if ((r = ipsock_check_src_addr(ip, src_addr, allow_mcast,
+	    NULL /*ifdevp*/)) != OK)
+		return r;
+
+	/*
+	 * If we are interested in port numbers at all (for non-raw sockets,
+	 * meaning 'src_port' is not NULL), make sure that only the superuser
+	 * can bind to privileged port numbers. For raw sockets, only the
+	 * superuser can open a socket anyway, so we need no check here.
+	 */
+	if (src_port != NULL) {
+		if (port != 0 && port < IPPORT_RESERVED &&
+		    !util_is_root(user_endpt))
+			return EACCES;
+
+		*src_port = port;
+	}
+
+	return OK;
+}
+
+/*
+ * Retrieve and validate a destination address for use in a socket connect or
+ * sendto call. The user-provided address is given as 'addr', with length
+ * 'addr_len'. The socket's current local IP address is given as 'local_addr'.
+ * On success, return OK with the destination IP address stored in 'dst_addr'
+ * and, if 'dst_port' is not NULL, the destination port number stored in
+ * 'dst_port'. Otherwise, return a negative error code. This function must be
+ * called, in one way or another, for every destination address used for
+ * connecting or sending on an IP-layer socket.
+ */
+int
+ipsock_get_dst_addr(struct ipsock * ip, const struct sockaddr * addr,
+	socklen_t addr_len, const ip_addr_t * local_addr, ip_addr_t * dst_addr,
+	uint16_t * dst_port)
+{
+	uint16_t port;
+	int r;
+
+	/* Parse the user-provided address. */
+	if ((r = addr_get_inet(addr, addr_len, ipsock_get_type(ip), dst_addr,
+	    FALSE /*kame*/, &port)) != OK)
+		return r;
+
+	/* Destination addresses are always specific. */
+	if (IP_GET_TYPE(dst_addr) == IPADDR_TYPE_ANY)
+		IP_SET_TYPE(dst_addr, IPADDR_TYPE_V6);
+
+	/*
+	 * lwIP does not support IPv4-mapped IPv6 addresses, so these must be
+	 * converted to plain IPv4 addresses instead. In V6ONLY mode, refuse
+	 * connecting or sending to IPv4-mapped addresses at all.
+	 */
+	if (IP_IS_V6(dst_addr) &&
+	    ip6_addr_isipv4mappedipv6(ip_2_ip6(dst_addr))) {
+		if (ipsock_is_v6only(ip))
+			return EINVAL;
+
+		ip_addr_set_ip4_u32(dst_addr, ip_2_ip6(dst_addr)->addr[3]);
+	}
+
+	/*
+	 * Now make sure that the local and remote addresses are of the same
+	 * family. The local address may be of type IPADDR_TYPE_ANY, which is
+	 * allowed for both IPv4 and IPv6. Even for connectionless socket
+	 * types we must perform this check as part of connect calls (as well
+	 * as sendto calls!) because otherwise we will create problems for
+	 * sysctl-based socket enumeration (i.e., netstat), which uses the
+	 * local IP address type to determine the socket family.
+	 */
+	if (IP_GET_TYPE(local_addr) != IPADDR_TYPE_ANY &&
+	    IP_IS_V6(local_addr) != IP_IS_V6(dst_addr))
+		return EINVAL;
+
+	/*
+	 * TODO: on NetBSD, an 'any' destination address is replaced with a
+	 * local interface address.
+	 */
+	if (ip_addr_isany(dst_addr))
+		return EHOSTUNREACH;
+
+	/*
+	 * If the address is a multicast address, the multicast address itself
+	 * must be valid.
+ */ + if (ip_addr_ismulticast(dst_addr) && + !addr_is_valid_multicast(dst_addr)) + return EINVAL; + + /* + * TODO: decide whether to add a zone to a scoped IPv6 address that + * lacks a zone. For now, we let lwIP handle this, as lwIP itself + * will always add the zone at some point. If anything changes there, + * this would be the place to set the zone (using a route lookup). + */ + + /* + * For now, we do not forbid or alter any other particular destination + * addresses. + */ + + if (dst_port != NULL) { + /* + * Disallow connecting/sending to port zero. There is no error + * code that applies well to this case, so we copy NetBSD's. + */ + if (port == 0) + return EADDRNOTAVAIL; + + *dst_port = port; + } + + return OK; +} + +/* + * Store the address 'ipaddr' associated with the socket 'ip' (for example, it + * may be the local or remote IP address of the socket) as a sockaddr structure + * in 'addr'. A port number is provided as 'port' (in host-byte order) if + * relevant, and zero is passed in otherwise. This function MUST only be + * called from contexts where 'addr' is a buffer provided by libsockevent or + * libsockdriver, meaning that it is of size SOCKADDR_MAX. The value pointed + * to by 'addr_len' is not expected to be initialized in calls to this function + * (and will typically zero). On return, 'addr_len' is filled with the length + * of the address generated in 'addr'. This function never fails. + */ +void +ipsock_put_addr(struct ipsock * ip, struct sockaddr * addr, + socklen_t * addr_len, ip_addr_t * ipaddr, uint16_t port) +{ + ip_addr_t mappedaddr; + + /* + * If the socket is an AF_INET6-type socket, and the given address is + * an IPv4-type address, store it as an IPv4-mapped IPv6 address. + */ + if (ipsock_is_ipv6(ip) && IP_IS_V4(ipaddr)) { + addr_make_v4mapped_v6(&mappedaddr, ip_2_ip4(ipaddr)); + + ipaddr = &mappedaddr; + } + + /* + * We have good reasons to keep the sockdriver and sockevent APIs as + * they are, namely, defaulting 'addr_len' to zero such that the caller + * must provide a non-zero length (only) when returning a valid + * address. The consequence here is that we have to know the size of + * the provided buffer. For libsockevent callbacks, we are always + * guaranteed to get a buffer of at least this size. + */ + *addr_len = SOCKADDR_MAX; + + addr_put_inet(addr, addr_len, ipaddr, FALSE /*kame*/, port); +} + +/* + * Set socket options on an IP socket. 
+ */ +int +ipsock_setsockopt(struct ipsock * ip, int level, int name, + const struct sockdriver_data * data, socklen_t len, + struct ipopts * ipopts) +{ + int r, val, allow; + uint8_t type; + + switch (level) { + case SOL_SOCKET: + switch (name) { + case SO_SNDBUF: + if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), + len)) != OK) + return r; + + if (val <= 0 || (size_t)val < ipopts->sndmin || + (size_t)val > ipopts->sndmax) + return EINVAL; + + ip->ip_sndbuf = val; + + return OK; + + case SO_RCVBUF: + if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), + len)) != OK) + return r; + + if (val <= 0 || (size_t)val < ipopts->rcvmin || + (size_t)val > ipopts->rcvmax) + return EINVAL; + + ip->ip_rcvbuf = val; + + return OK; + } + + break; + + case IPPROTO_IP: + if (ipsock_is_ipv6(ip)) + break; + + switch (name) { + case IP_TOS: + if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), + len)) != OK) + return r; + + if (val < 0 || val > UINT8_MAX) + return EINVAL; + + *ipopts->tos = (uint8_t)val; + + return OK; + + case IP_TTL: + if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), + len)) != OK) + return r; + + if (val < 0 || val > UINT8_MAX) + return EINVAL; + + *ipopts->ttl = (uint8_t)val; + + return OK; + } + + break; + + case IPPROTO_IPV6: + if (!ipsock_is_ipv6(ip)) + break; + + switch (name) { + case IPV6_UNICAST_HOPS: + if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), + len)) != OK) + return r; + + if (val < -1 || val > UINT8_MAX) + return EINVAL; + + if (val == -1) + val = IP_DEFAULT_TTL; + + *ipopts->ttl = val; + + return OK; + + case IPV6_TCLASS: + if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), + len)) != OK) + return r; + + if (val < -1 || val > UINT8_MAX) + return EINVAL; + + if (val == -1) + val = 0; + + *ipopts->tos = val; + + return OK; + + case IPV6_V6ONLY: + if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), + len)) != OK) + return r; + + /* + * If the socket has been bound to an actual address, + * we still allow the option to be changed, but it no + * longer has any effect. + */ + type = IP_GET_TYPE(ipopts->local_ip); + allow = (type == IPADDR_TYPE_ANY || + (type == IPADDR_TYPE_V6 && + ip_addr_isany(ipopts->local_ip))); + + if (val) { + ip->ip_flags |= IPF_V6ONLY; + + type = IPADDR_TYPE_V6; + } else { + ip->ip_flags &= ~IPF_V6ONLY; + + type = IPADDR_TYPE_ANY; + } + + if (allow) + IP_SET_TYPE(ipopts->local_ip, type); + + return OK; + } + + break; + } + + return ENOPROTOOPT; +} + +/* + * Retrieve socket options on an IP socket. 
+ */ +int +ipsock_getsockopt(struct ipsock * ip, int level, int name, + const struct sockdriver_data * data, socklen_t * len, + struct ipopts * ipopts) +{ + int val; + + switch (level) { + case SOL_SOCKET: + switch (name) { + case SO_SNDBUF: + val = ip->ip_sndbuf; + + return sockdriver_copyout_opt(data, &val, sizeof(val), + len); + + case SO_RCVBUF: + val = ip->ip_rcvbuf; + + return sockdriver_copyout_opt(data, &val, sizeof(val), + len); + } + + break; + + case IPPROTO_IP: + if (ipsock_is_ipv6(ip)) + break; + + switch (name) { + case IP_TOS: + val = (int)*ipopts->tos; + + return sockdriver_copyout_opt(data, &val, sizeof(val), + len); + + case IP_TTL: + val = (int)*ipopts->ttl; + + return sockdriver_copyout_opt(data, &val, sizeof(val), + len); + } + + break; + + case IPPROTO_IPV6: + if (!ipsock_is_ipv6(ip)) + break; + + switch (name) { + case IPV6_UNICAST_HOPS: + val = *ipopts->ttl; + + return sockdriver_copyout_opt(data, &val, sizeof(val), + len); + + case IPV6_TCLASS: + val = *ipopts->tos; + + return sockdriver_copyout_opt(data, &val, sizeof(val), + len); + + case IPV6_V6ONLY: + val = !!(ip->ip_flags & IPF_V6ONLY); + + return sockdriver_copyout_opt(data, &val, sizeof(val), + len); + } + + break; + } + + return ENOPROTOOPT; +} + +/* + * Fill the given kinfo_pcb sysctl(7) structure with IP-level information. + */ +void +ipsock_get_info(struct kinfo_pcb * ki, const ip_addr_t * local_ip, + uint16_t local_port, const ip_addr_t * remote_ip, uint16_t remote_port) +{ + ip_addr_t ipaddr; + socklen_t len; + uint8_t type; + + len = sizeof(ki->ki_spad); /* use this for the full size, not ki_src */ + + addr_put_inet(&ki->ki_src, &len, local_ip, TRUE /*kame*/, local_port); + + /* + * At this point, the local IP address type has already been used to + * determine whether this is an IPv4 or IPv6 socket. While not ideal, + * that is the best we can do: we cannot use IPv4-mapped IPv6 addresses + * in lwIP PCBs, we cannot store the original type in those PCBs, and + * we also cannot rely on the PCB having an associated ipsock object + * anymore. We also cannot use the ipsock only when present: it could + * make a TCP PCB "jump" from IPv6 to IPv4 in the netstat listing when + * it goes into TIME_WAIT state, for example. + * + * So, use *only* the type of the local IP address to determine whether + * this is an IPv4 or an IPv6 socket. At the same time, do *not* rely + * on the remote IP address being IPv4 for a local IPv4 address; it may + * be of type IPADDR_TYPE_V6 for an unconnected socket bound to an + * IPv4-mapped IPv6 address. Pretty messy, but we're limited by what + * lwIP offers here. Since it's just netstat, it need not be perfect. + */ + if ((type = IP_GET_TYPE(local_ip)) == IPADDR_TYPE_V4) { + if (!ip_addr_isany(local_ip) || local_port != 0) + ki->ki_prstate = INP_BOUND; + + /* + * Make sure the returned socket address types are consistent. + * The only case where the remote IP address is not IPv4 here + * is when it is not set yet, so there is no need to check + * whether it is the 'any' address: it always is. + */ + if (IP_GET_TYPE(remote_ip) != IPADDR_TYPE_V4) { + ip_addr_set_zero_ip4(&ipaddr); + + remote_ip = &ipaddr; + } + } else { + if (!ip_addr_isany(local_ip) || local_port != 0) + ki->ki_prstate = IN6P_BOUND; + if (type != IPADDR_TYPE_ANY) + ki->ki_pflags |= IN6P_IPV6_V6ONLY; + } + + len = sizeof(ki->ki_dpad); /* use this for the full size, not ki_dst */ + + addr_put_inet(&ki->ki_dst, &len, remote_ip, TRUE /*kame*/, + remote_port); + + /* Check the type of the *local* IP address here. 
See above. */ + if (!ip_addr_isany(remote_ip) || remote_port != 0) { + if (type == IPADDR_TYPE_V4) + ki->ki_prstate = INP_CONNECTED; + else + ki->ki_prstate = IN6P_CONNECTED; + } +} diff --git a/minix/net/lwip/ipsock.h b/minix/net/lwip/ipsock.h new file mode 100644 index 000000000..0e8a8302f --- /dev/null +++ b/minix/net/lwip/ipsock.h @@ -0,0 +1,95 @@ +#ifndef MINIX_NET_LWIP_IPSOCK_H +#define MINIX_NET_LWIP_IPSOCK_H + +/* IP-level socket, shared by TCP, UDP, and RAW. */ +struct ipsock { + struct sock ip_sock; /* socket object, MUST be first */ + unsigned int ip_flags; /* all socket flags */ + size_t ip_sndbuf; /* send buffer size */ + size_t ip_rcvbuf; /* receive buffer size */ +}; + +/* + * Socket flags. In order to reduce memory consumption, all these flags are + * stored in the same field (ipsock.ip_flags) and thus must not overlap between + * the same users of the field, and that is why they are all here. For + * example, UDPF/PKTF/IPF should all be unique, and TCPF/IPF should be unique, + * but UDPF/PKTF may overlap with TCPF and UDPF may overlap with RAWF. In + * practice, we have no UDPF or RAWF flags and plenty of space to make all + * flags unique anyway. + */ +#define IPF_IPV6 0x0000001 /* socket is IPv6 */ +#define IPF_V6ONLY 0x0000002 /* socket is IPv6 only */ + +#define PKTF_RECVINFO 0x0000010 /* receive ancillary PKTINFO */ +#define PKTF_RECVTTL 0x0000020 /* receive ancillary TTL */ +#define PKTF_RECVTOS 0x0000040 /* receive ancillary TOS */ +#define PKTF_MCAWARE 0x0000080 /* owner is multicast aware */ + +#define TCPF_CONNECTING 0x0001000 /* attempting to connect */ +#define TCPF_SENT_FIN 0x0002000 /* send FIN when possible */ +#define TCPF_RCVD_FIN 0x0004000 /* received FIN from peer */ +#define TCPF_FULL 0x0008000 /* PCB send buffer is full */ +#define TCPF_OOM 0x0010000 /* memory allocation failed */ + +#define ipsock_get_sock(ip) (&(ip)->ip_sock) +#define ipsock_is_ipv6(ip) ((ip)->ip_flags & IPF_IPV6) +#define ipsock_is_v6only(ip) ((ip)->ip_flags & IPF_V6ONLY) +#define ipsock_get_flags(ip) ((ip)->ip_flags) +#define ipsock_get_flag(ip,fl) ((ip)->ip_flags & (fl)) +#define ipsock_set_flag(ip,fl) ((ip)->ip_flags |= (fl)) +#define ipsock_clear_flag(ip,fl) ((ip)->ip_flags &= ~(fl)) +#define ipsock_get_sndbuf(ip) ((ip)->ip_sndbuf) +#define ipsock_get_rcvbuf(ip) ((ip)->ip_rcvbuf) + +/* + * IP-level option pointers. This is necessary because even though lwIP's + * TCP, UDP, and RAW PCBs share the same initial fields, the C standard does + * not permit generic access to such initial fields (due to both possible + * padding differences and strict-aliasing rules). The fields in this + * structure are therefore pointers to the initial fields of each of the PCB + * structures. If lwIP ever groups its IP PCB fields into a single structure + * and uses that structure as first field of each of the other PCBs, then we + * will be able to replace this structure with a pointer to the IP PCB instead. + * For convenience we also carry the send and receive buffer limits here. 
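+ *
+ * As a sketch of the intended use (the PCB field names follow lwIP's shared
+ * IP PCB layout, and the buffer limit constants and the 'pcb'/'ipsock'
+ * variables are illustrative rather than defined in this header), a
+ * UDP-style socket module would fill the structure with pointers into its
+ * own PCB and then delegate to the shared option code:
+ *
+ *	struct ipopts ipopts;
+ *
+ *	ipopts.local_ip = &pcb->local_ip;
+ *	ipopts.remote_ip = &pcb->remote_ip;
+ *	ipopts.tos = &pcb->tos;
+ *	ipopts.ttl = &pcb->ttl;
+ *	ipopts.sndmin = UDP_SNDBUF_MIN;
+ *	ipopts.sndmax = UDP_SNDBUF_MAX;
+ *	ipopts.rcvmin = UDP_RCVBUF_MIN;
+ *	ipopts.rcvmax = UDP_RCVBUF_MAX;
+ *
+ *	return ipsock_setsockopt(ipsock, level, name, data, len, &ipopts);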
+ */ +struct ipopts { + ip_addr_t *local_ip; + ip_addr_t *remote_ip; + uint8_t *tos; + uint8_t *ttl; + size_t sndmin; + size_t sndmax; + size_t rcvmin; + size_t rcvmax; +}; + +struct ifdev; + +void ipsock_init(void); +int ipsock_socket(struct ipsock * ip, int domain, size_t sndbuf, size_t rcvbuf, + struct sock ** sockp); +void ipsock_clone(struct ipsock * ip, struct ipsock * newip, sockid_t newid); +void ipsock_get_any_addr(struct ipsock * ip, ip_addr_t * ipaddr); +int ipsock_check_src_addr(struct ipsock * ip, ip_addr_t * ipaddr, + int allow_mcast, struct ifdev ** ifdevp); +int ipsock_get_src_addr(struct ipsock * ip, const struct sockaddr * addr, + socklen_t addr_len, endpoint_t user_endpt, ip_addr_t * local_ip, + uint16_t local_port, int allow_mcast, ip_addr_t * ipaddr, + uint16_t * portp); +int ipsock_get_dst_addr(struct ipsock * ip, const struct sockaddr * addr, + socklen_t addr_len, const ip_addr_t * local_addr, ip_addr_t * dst_addr, + uint16_t * dst_port); +void ipsock_put_addr(struct ipsock * ip, struct sockaddr * addr, + socklen_t * addr_len, ip_addr_t * ipaddr, uint16_t port); +int ipsock_setsockopt(struct ipsock * ip, int level, int name, + const struct sockdriver_data * data, socklen_t len, + struct ipopts * ipopts); +int ipsock_getsockopt(struct ipsock * ip, int level, int name, + const struct sockdriver_data * data, socklen_t * len, + struct ipopts * ipopts); +void ipsock_get_info(struct kinfo_pcb * ki, const ip_addr_t * local_ip, + uint16_t local_port, const ip_addr_t * remote_ip, + uint16_t remote_port); + +#endif /* !MINIX_NET_LWIP_IPSOCK_H */ diff --git a/minix/net/lwip/lldata.c b/minix/net/lwip/lldata.c new file mode 100644 index 000000000..80050c78c --- /dev/null +++ b/minix/net/lwip/lldata.c @@ -0,0 +1,584 @@ +/* LWIP service - lldata.c - link-layer (ARP, NDP) data related routines */ +/* + * This module is largely isolated from the regular routing code. There are + * two reasons for that. First, mixing link-layer routes with regular routes + * would not work well due to the fact that lwIP keeps these data structures + * entirely separate. Second, as of version 8, NetBSD keeps the IP-layer and + * link-layer routing separate as well. + * + * Unfortunately, lwIP does not provide much in the way of implementing the + * functionality that would be expected for this module. As such, the current + * implementation is very restricted and simple. + * + * For ARP table entries, lwIP only allows for adding and deleting static + * entries. Non-static entries cannot be deleted. Incomplete (pending) + * entries cannot even be enumerated, nor can (e.g.) expiry information be + * obtained. The lwIP ARP datastructures are completely hidden, so there is no + * way to overcome these limitations without changing lwIP itself. As a + * result, not all functionality of the arp(8) userland utility is supported. + * + * For NDP table entries, lwIP offers no API at all. However, since the data + * structures are exposed directly, we can use those to implement full support + * for exposing information in a read-only way. However, manipulating data + * structures directly from here is too risky, nor does lwIP currently support + * the concept of static NDP table entries. Therefore, adding, changing, and + * deleting NDP entries is currently not supported, and will also first require + * changes to lwIP itself. + * + * The ndp(8) userland utility is also able to show and manipulate various + * other neighbor discovery related tables and settings. We support only a + * small subset of them. 
The main reason for this is that the other tables, + * in particular the prefix and default router lists, are not relevant: on + * MINIX 3, these are always managed fully in userland (usually dhcpcd(8)), and + * we even hardcode lwIP not to parse Router Advertisement messages at all, so + * even though those tables are still part of lwIP, they are always empty. + * Other ndp(8) functionality are unsupported for similar reasons. + */ + +#include "lwip.h" +#include "lldata.h" +#include "route.h" +#include "rtsock.h" + +#include "lwip/etharp.h" +#include "lwip/nd6.h" +#include "lwip/priv/nd6_priv.h" /* for neighbor_cache */ + +/* + * Process a routing command specifically for an ARP table entry. Return OK if + * the routing command has been processed successfully and a routing socket + * reply message has already been generated. Return a negative error code on + * failure, in which case the caller will generate a reply message instead. + */ +static int +lldata_arp_process(unsigned int type, const ip_addr_t * dst_addr, + const struct eth_addr * gw_addr, struct ifdev * ifdev, + unsigned int flags, const struct rtsock_request * rtr) +{ + const ip4_addr_t *ip4addr; + struct eth_addr ethaddr, *ethptr; + struct netif *netif; + lldata_arp_num_t num; + err_t err; + + netif = (ifdev != NULL) ? ifdev_get_netif(ifdev) : NULL; + + num = etharp_find_addr(netif, ip_2_ip4(dst_addr), ðptr, &ip4addr); + + if (type != RTM_ADD && num < 0) + return ESRCH; + else if (type == RTM_ADD && num >= 0) + return EEXIST; + + switch (type) { + case RTM_CHANGE: + /* + * This request is not used by arp(8), so keep things simple. + * For RTM_ADD we support only static entries; we support only + * those too here, and thus we can use delete-and-readd. If + * the ethernet address is not being changed, try readding the + * entry with the previous ethernet address. + */ + if (gw_addr == NULL) + gw_addr = ethptr; + + if (etharp_remove_static_entry(ip_2_ip4(dst_addr)) != ERR_OK) + return EPERM; + + /* FALLTHROUGH */ + case RTM_ADD: + assert(gw_addr != NULL); + + memcpy(ðaddr, gw_addr, sizeof(ethaddr)); + + /* + * Adding static, permanent, unpublished, non-proxy entries is + * all that lwIP supports right now. We also do not get to + * specify the interface, and the way lwIP picks the interface + * may in fact result in a different one. + */ + if ((err = etharp_add_static_entry(ip_2_ip4(dst_addr), + ðaddr)) != ERR_OK) + return util_convert_err(err); + + if ((num = etharp_find_addr(NULL /*netif*/, ip_2_ip4(dst_addr), + ðptr, &ip4addr)) < 0) + panic("unable to find just-added static ARP entry"); + + /* FALLTHROUGH */ + case RTM_LOCK: + case RTM_GET: + rtsock_msg_arp(num, type, rtr); + + return OK; + + case RTM_DELETE: + memcpy(ðaddr, ethptr, sizeof(ethaddr)); + + if (etharp_remove_static_entry(ip_2_ip4(dst_addr)) != ERR_OK) + return EPERM; + + /* + * FIXME: the following block is a hack, because we cannot + * predict whether the above removal will succeed, while at the + * same time we need the entry to be present in order to report + * the deleted address to the routing socket. We temporarily + * readd and then remove the entry just for the purpose of + * generating the routing socket reply. There are other ways + * to resolve this, but only a better lwIP etharp API would + * allow us to resolve this problem cleanly. 
+ */ + (void)etharp_add_static_entry(ip_2_ip4(dst_addr), ðaddr); + + num = etharp_find_addr(NULL /*netif*/, ip_2_ip4(dst_addr), + ðptr, &ip4addr); + assert(num >= 0); + + rtsock_msg_arp(num, type, rtr); + + (void)etharp_remove_static_entry(ip_2_ip4(dst_addr)); + + return OK; + + default: + return EINVAL; + } +} + +/* + * Enumerate ARP table entries. Return TRUE if there is at least one more ARP + * table entry, of which the number is stored in 'num'. The caller should set + * 'num' to 0 initially, and increase it by one between a successful call and + * the next call. Return FALSE if there are no more ARP table entries. + */ +int +lldata_arp_enum(lldata_arp_num_t * num) +{ + ip4_addr_t *ip4addr; + struct netif *netif; + struct eth_addr *ethaddr; + + for (; *num < ARP_TABLE_SIZE; ++*num) { + if (etharp_get_entry(*num, &ip4addr, &netif, ðaddr)) + return TRUE; + } + + return FALSE; +} + +/* + * Obtain information about the ARP table entry identified by 'num'. The IPv4 + * address of the entry is stored in 'addr'. Its ethernet address is stored in + * 'gateway'. The associated interface is stored in 'ifdevp', and the entry's + * routing flags (RTF_) are stored in 'flagsp'. + */ +void +lldata_arp_get(lldata_arp_num_t num, struct sockaddr_in * addr, + struct sockaddr_dlx * gateway, struct ifdev ** ifdevp, + unsigned int * flagsp) +{ + ip_addr_t ipaddr; + ip4_addr_t *ip4addr; + struct netif *netif; + struct ifdev *ifdev; + struct eth_addr *ethaddr; + socklen_t addr_len; + + if (!etharp_get_entry(num, &ip4addr, &netif, ðaddr)) + panic("request for invalid ARP entry"); + + ip_addr_copy_from_ip4(ipaddr, *ip4addr); + + assert(netif != NULL); + ifdev = netif_get_ifdev(netif); + + addr_len = sizeof(*addr); + + addr_put_inet((struct sockaddr *)addr, &addr_len, &ipaddr, + TRUE /*kame*/, 0 /*port*/); + + addr_len = sizeof(*gateway); + + addr_put_link((struct sockaddr *)gateway, &addr_len, + ifdev_get_index(ifdev), ifdev_get_iftype(ifdev), NULL /*name*/, + ethaddr->addr, sizeof(ethaddr->addr)); + + *ifdevp = ifdev; + + /* + * TODO: this is not necessarily accurate, but lwIP does not provide us + * with information as to whether this is a static entry or not.. + */ + *flagsp = RTF_HOST | RTF_LLINFO | RTF_LLDATA | RTF_STATIC | RTF_CLONED; +} + +/* + * Obtain information about the ND6 neighbor cache entry 'i', which must be a + * number between 0 (inclusive) and LWIP_ND6_NUM_NEIGHBORS (exclusive). If an + * entry with this number exists, return a pointer to its IPv6 address, and + * additional information in each of the given pointers if not NULL. The + * associated interface is stored in 'netif'. If the entry has an associated + * link-layer address, a pointer to it is stored in 'lladdr'. The entry's + * state (ND6_{INCOMPLETE,REACHABLE,STALE,DELAY,PROBE}) is stored in 'state'. + * The 'isrouter' parameter is filled with a boolean value indicating whether + * the entry is for a router. For ND6_INCOMPLETE and ND6_PROBE, the number of + * probes sent so far is stored in 'probes_sent'; for other states, the value + * is set to zero. For ND6_REACHABLE and ND6_DELAY, the time until expiration + * in ND6_TMR_INTERVAL-millisecond units is stored in 'expire_time'; for other + * states, the value is set to zero. If an entry with number 'i' does not + * exist, NULL is returned. + * + * TODO: upstream this function to lwIP. 
+ */ +static const ip6_addr_t * +nd6_get_neighbor_cache_entry(int8_t i, struct netif ** netif, + const uint8_t ** lladdr, uint8_t * state, uint8_t * isrouter, + uint32_t * probes_sent, uint32_t * expire_time) +{ + + if (i < 0 || i >= LWIP_ND6_NUM_NEIGHBORS || + neighbor_cache[i].state == ND6_NO_ENTRY) + return NULL; + + if (netif != NULL) + *netif = neighbor_cache[i].netif; + + if (lladdr != NULL) { + if (neighbor_cache[i].state != ND6_INCOMPLETE) + *lladdr = neighbor_cache[i].lladdr; + else + *lladdr = NULL; + } + + if (state != NULL) + *state = neighbor_cache[i].state; + + if (isrouter != NULL) + *isrouter = neighbor_cache[i].isrouter; + + if (probes_sent != NULL) { + if (neighbor_cache[i].state == ND6_INCOMPLETE || + neighbor_cache[i].state == ND6_PROBE) + *probes_sent = neighbor_cache[i].counter.probes_sent; + else + *probes_sent = 0; + } + + if (expire_time != NULL) { + switch (neighbor_cache[i].state) { + case ND6_REACHABLE: + *expire_time = + neighbor_cache[i].counter.reachable_time / + ND6_TMR_INTERVAL; + break; + case ND6_DELAY: + *expire_time = neighbor_cache[i].counter.delay_time; + break; + case ND6_INCOMPLETE: + case ND6_PROBE: + /* Probes are sent once per timer tick. */ + *expire_time = (LWIP_ND6_MAX_MULTICAST_SOLICIT + 1 - + neighbor_cache[i].counter.probes_sent) * + (ND6_TMR_INTERVAL / 1000); + break; + default: + /* Stale entries do not expire; they get replaced. */ + *expire_time = 0; + break; + } + } + + return &neighbor_cache[i].next_hop_address; +} + +/* + * Find a neighbor cache entry by IPv6 address. Return its index number if + * found, or -1 if not. This is a reimplementation of the exact same function + * internal to lwIP. + * + * TODO: make this function public in lwIP. + */ +static int8_t +nd6_find_neighbor_cache_entry(const ip6_addr_t * addr) +{ + int8_t i; + + for (i = 0; i < LWIP_ND6_NUM_NEIGHBORS; i++) { + if (ip6_addr_cmp(addr, &neighbor_cache[i].next_hop_address)) + return i; + } + + return -1; +} + +/* + * Find an NDP table entry based on the given interface and IPv6 address. On + * success, return OK, with the entry's index number stored in 'nump'. On + * failure, return an appropriate error code. + */ +int +lldata_ndp_find(struct ifdev * ifdev, const struct sockaddr_in6 * addr, + lldata_ndp_num_t * nump) +{ + ip_addr_t ipaddr; + int8_t i; + int r; + + if ((r = addr_get_inet((const struct sockaddr *)addr, sizeof(*addr), + IPADDR_TYPE_V6, &ipaddr, TRUE /*kame*/, NULL /*port*/)) != OK) + return r; + + /* + * For given link-local addresses, no zone may be provided in the + * address at all. In such cases, add the zone ourselves, using the + * given interface. + */ + if (ip6_addr_lacks_zone(ip_2_ip6(&ipaddr), IP6_UNKNOWN)) + ip6_addr_assign_zone(ip_2_ip6(&ipaddr), IP6_UNKNOWN, + ifdev_get_netif(ifdev)); + + i = nd6_find_neighbor_cache_entry(ip_2_ip6(&ipaddr)); + if (i < 0) + return ESRCH; + + /* + * We should compare the neighbor cache entry's associated netif to + * the given ifdev, but since the lwIP neighbor cache is currently not + * keyed by netif anyway (i.e. the internal lookups are purely by IPv6 + * address as well), doing so makes little sense in practice. + */ + + *nump = (lldata_ndp_num_t)i; + return OK; +} + +/* + * Process a routing command specifically for an NDP table entry. Return OK if + * the routing command has been processed successfully and a routing socket + * reply message has already been generated. Return a negative error code on + * failure, in which case the caller will generate a reply message instead. 
+ */ +static int +lldata_ndp_process(unsigned int type, const ip_addr_t * dst_addr, + const struct eth_addr * gw_addr, + struct ifdev * ifdev, unsigned int flags, + const struct rtsock_request * rtr) +{ + lldata_ndp_num_t num; + + num = (lldata_ndp_num_t) + nd6_find_neighbor_cache_entry(ip_2_ip6(dst_addr)); + + if (type != RTM_ADD && num < 0) + return ESRCH; + else if (type == RTM_ADD && num >= 0) + return EEXIST; + + switch (type) { + case RTM_LOCK: + case RTM_GET: + rtsock_msg_arp(num, type, rtr); + + return OK; + + case RTM_ADD: + case RTM_CHANGE: + case RTM_DELETE: + /* TODO: add lwIP support to implement these commands. */ + return ENOSYS; + + default: + return EINVAL; + } +} + +/* + * Enumerate NDP table entries. Return TRUE if there is at least one more NDP + * table entry, of which the number is stored in 'num'. The caller should set + * 'num' to 0 initially, and increase it by one between a successful call and + * the next call. Return FALSE if there are no more NDP table entries. + */ +int +lldata_ndp_enum(lldata_ndp_num_t * num) +{ + + for (; *num < LWIP_ND6_NUM_NEIGHBORS; ++*num) { + if (nd6_get_neighbor_cache_entry(*num, NULL /*netif*/, + NULL /*lladdr*/, NULL /*state*/, NULL /*isrouter*/, + NULL /*probes_sent*/, NULL /*expire_time*/) != NULL) + return TRUE; + } + + return FALSE; +} + +/* + * Obtain information about the NDP table entry identified by 'num'. The IPv6 + * address of the entry is stored in 'addr'. Its ethernet address is stored in + * 'gateway'. The associated interface is stored in 'ifdevp', and the entry's + * routing flags (RTF_) are stored in 'flagsp'. + */ +void +lldata_ndp_get(lldata_ndp_num_t num, struct sockaddr_in6 * addr, + struct sockaddr_dlx * gateway, struct ifdev ** ifdevp, + unsigned int * flagsp) +{ + const ip6_addr_t *ip6addr; + ip_addr_t ipaddr; + struct netif *netif; + struct ifdev *ifdev; + const uint8_t *lladdr; + socklen_t addr_len; + + ip6addr = nd6_get_neighbor_cache_entry(num, &netif, &lladdr, + NULL /*state*/, NULL /*isrouter*/, NULL /*probes_sent*/, + NULL /*expire_time*/); + assert(ip6addr != NULL); + + ip_addr_copy_from_ip6(ipaddr, *ip6addr); + + ifdev = netif_get_ifdev(netif); + assert(ifdev != NULL); + + addr_len = sizeof(*addr); + + addr_put_inet((struct sockaddr *)addr, &addr_len, &ipaddr, + TRUE /*kame*/, 0 /*port*/); + + addr_len = sizeof(*gateway); + + addr_put_link((struct sockaddr *)gateway, &addr_len, + ifdev_get_index(ifdev), ifdev_get_iftype(ifdev), NULL /*name*/, + lladdr, ifdev_get_hwlen(ifdev)); + + *ifdevp = ifdev; + *flagsp = RTF_HOST | RTF_LLINFO | RTF_LLDATA | RTF_CLONED; +} + +/* + * Obtain information about the NDP table entry with the number 'num', which + * must be obtained through a previous call to lldata_ndp_find(). On return, + * 'asked' is filled with the number of probes sent so far (0 if inapplicable), + * 'isrouter' is set to 1 or 0 depending on whether the entry is for a router, + * 'state' is set to the entry's state (ND6_LLINFO_), and 'expire' is set to + * either the UNIX timestamp of expiry for the entry; 0 for permanent entries. + * None of the given pointers must be NULL. This function always succeeds. 
+ */ +void +lldata_ndp_get_info(lldata_ndp_num_t num, long * asked, int * isrouter, + int * state, int * expire) +{ + uint32_t nd6_probes_sent = 0 /*gcc*/, nd6_expire_time = 0 /*gcc*/; + uint8_t nd6_state = 0 /*gcc*/, nd6_isrouter = 0 /*gcc*/; + + (void)nd6_get_neighbor_cache_entry(num, NULL /*netif*/, + NULL /*lladdr*/, &nd6_state, &nd6_isrouter, &nd6_probes_sent, + &nd6_expire_time); + + *asked = (long)nd6_probes_sent; + + *isrouter = !!nd6_isrouter; + + switch (nd6_state) { + case ND6_INCOMPLETE: *state = ND6_LLINFO_INCOMPLETE; break; + case ND6_REACHABLE: *state = ND6_LLINFO_REACHABLE; break; + case ND6_STALE: *state = ND6_LLINFO_STALE; break; + case ND6_DELAY: *state = ND6_LLINFO_DELAY; break; + case ND6_PROBE: *state = ND6_LLINFO_PROBE; break; + default: panic("unknown ND6 state %u", nd6_state); + } + + if (nd6_expire_time != 0) + *expire = clock_time(NULL) + + (int)nd6_expire_time * (ND6_TMR_INTERVAL / 1000); + else + *expire = 0; +} + +/* + * Process a routing command specifically for a link-layer route, as one of the + * specific continuations of processing started by route_process(). The RTM_ + * routing command is given as 'type'. The route destination is given as + * 'dst_addr'; its address type determines whether the operation is for ARP or + * NDP. The sockaddr structure for 'gateway' is passed on as is and may have + * to be parsed here if not NULL. 'ifdev' is the interface to be associated + * with the route; it is non-NULL only if an interface name (IFP) or address + * (IFA) was given. The RTF_ flags field has been checked against the globally + * supported flags, but may have to be checked for flags that do not apply to + * ARP/NDP routes. Return OK or a negative error code, following the same + * semantics as route_process(). + */ +int +lldata_process(unsigned int type, const ip_addr_t * dst_addr, + const struct sockaddr * gateway, struct ifdev * ifdev, + unsigned int flags, const struct rtsock_request * rtr) +{ + const struct route_entry *route; + struct eth_addr ethaddr, *gw_addr; + int r; + + assert(flags & RTF_LLDATA); + + /* + * It seems that RTF_UP does not apply to link-layer routing entries. + * We basically accept any flags that we can return, but we do not + * actually check most of them anywhere. + */ + if ((flags & ~(RTF_HOST | RTF_LLINFO | RTF_LLDATA | RTF_STATIC | + RTF_CLONED | RTF_ANNOUNCE)) != 0) + return EINVAL; + + gw_addr = NULL; + + if (type == RTM_ADD || type == RTM_CHANGE) { + /* + * Link-layer entries are always host entries. Not all + * requests pass in this flag though, so check only when the + * flags are supposed to be set. + */ + if ((type == RTM_ADD || type == RTM_CHANGE) && + !(flags & RTF_HOST)) + return EINVAL; + + /* lwIP does not support publishing custom entries. */ + if (flags & RTF_ANNOUNCE) + return ENOSYS; + + /* RTF_GATEWAY is always cleared for link-layer entries. */ + if (gateway != NULL) { + if ((r = addr_get_link(gateway, gateway->sa_len, + NULL /*name*/, 0 /*name_max*/, ethaddr.addr, + sizeof(ethaddr.addr))) != OK) + return r; + + gw_addr = ðaddr; + } + + if (type == RTM_ADD) { + if (gateway == NULL) + return EINVAL; + + /* + * If no interface has been specified, see if the + * destination address is on a locally connected + * network. If so, use that network's interface. + * Otherwise reject the request altogether: we must + * have an interface to which to associate the entry. 
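+	 * For example, a plain "arp -s 192.168.0.9 aa:bb:cc:dd:ee:ff" names
+	 * no interface; if a non-gateway route covering 192.168.0.9 exists,
+	 * the entry is associated with that route's interface, and otherwise
+	 * the request fails with ENETUNREACH.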
+ */ + if (ifdev == NULL) { + if ((route = route_lookup(dst_addr)) != NULL && + !(route_get_flags(route) & RTF_GATEWAY)) + ifdev = route_get_ifdev(route); + else + return ENETUNREACH; + } + } + } + + if (IP_IS_V4(dst_addr)) + return lldata_arp_process(type, dst_addr, gw_addr, ifdev, + flags, rtr); + else + return lldata_ndp_process(type, dst_addr, gw_addr, ifdev, + flags, rtr); +} diff --git a/minix/net/lwip/lldata.h b/minix/net/lwip/lldata.h new file mode 100644 index 000000000..b7e5c85d0 --- /dev/null +++ b/minix/net/lwip/lldata.h @@ -0,0 +1,27 @@ +#ifndef MINIX_NET_LWIP_LLDATA_H +#define MINIX_NET_LWIP_LLDATA_H + +struct rtsock_request; + +typedef int lldata_arp_num_t; /* ARP table entry number */ +typedef int lldata_ndp_num_t; /* NDP table entry number */ + +int lldata_arp_enum(lldata_arp_num_t * num); +void lldata_arp_get(lldata_arp_num_t num, struct sockaddr_in * addr, + struct sockaddr_dlx * gateway, struct ifdev ** ifdevp, + unsigned int * flagsp); + +int lldata_ndp_find(struct ifdev * ifdev, + const struct sockaddr_in6 * addr, lldata_ndp_num_t * nump); +int lldata_ndp_enum(lldata_ndp_num_t * num); +void lldata_ndp_get(lldata_ndp_num_t num, struct sockaddr_in6 * addr, + struct sockaddr_dlx * gateway, struct ifdev ** ifdevp, + unsigned int * flagsp); +void lldata_ndp_get_info(lldata_ndp_num_t num, long * asked, int * isrouter, + int * state, int * expire); + +int lldata_process(unsigned int type, const ip_addr_t * dst_addr, + const struct sockaddr * gateway, struct ifdev * ifdev, + unsigned int flags, const struct rtsock_request * rtr); + +#endif /* !MINIX_NET_LWIP_LLDATA_H */ diff --git a/minix/net/lwip/lnksock.c b/minix/net/lwip/lnksock.c new file mode 100644 index 000000000..a6e1e7a28 --- /dev/null +++ b/minix/net/lwip/lnksock.c @@ -0,0 +1,77 @@ +/* LWIP service - lnksock.c - link sockets */ +/* + * This module contains absolutely minimal support for AF_LINK type sockets, + * because for now we need them only to support a specific set of IOCTLs, as + * required by for example ifconfig(8). + */ + +#include "lwip.h" + +/* The number of link sockets. */ +#define NR_LNKSOCK 4 + +static struct lnksock { + struct sock lnk_sock; /* socket object, MUST be first */ + SIMPLEQ_ENTRY(lnksock) lnk_next; /* next in free list */ +} lnk_array[NR_LNKSOCK]; + +static SIMPLEQ_HEAD(, lnksock) lnk_freelist; /* list of free link sockets */ + +static const struct sockevent_ops lnksock_ops; + +/* + * Initialize the link sockets module. + */ +void +lnksock_init(void) +{ + unsigned int slot; + + /* Initialize the list of free link sockets. */ + SIMPLEQ_INIT(&lnk_freelist); + + for (slot = 0; slot < __arraycount(lnk_array); slot++) + SIMPLEQ_INSERT_TAIL(&lnk_freelist, &lnk_array[slot], lnk_next); +} + +/* + * Create a link socket. + */ +sockid_t +lnksock_socket(int type, int protocol, struct sock ** sockp, + const struct sockevent_ops ** ops) +{ + struct lnksock *lnk; + + if (type != SOCK_DGRAM) + return EPROTOTYPE; + + if (protocol != 0) + return EPROTONOSUPPORT; + + if (SIMPLEQ_EMPTY(&lnk_freelist)) + return ENOBUFS; + + lnk = SIMPLEQ_FIRST(&lnk_freelist); + SIMPLEQ_REMOVE_HEAD(&lnk_freelist, lnk_next); + + *sockp = &lnk->lnk_sock; + *ops = &lnksock_ops; + return SOCKID_LNK | (sockid_t)(lnk - lnk_array); +} + +/* + * Free up a closed link socket. 
+ */ +static void +lnksock_free(struct sock * sock) +{ + struct lnksock *lnk = (struct lnksock *)sock; + + SIMPLEQ_INSERT_HEAD(&lnk_freelist, lnk, lnk_next); +} + +static const struct sockevent_ops lnksock_ops = { + .sop_ioctl = ifconf_ioctl, + .sop_free = lnksock_free +}; diff --git a/minix/net/lwip/loopif.c b/minix/net/lwip/loopif.c new file mode 100644 index 000000000..db21db09a --- /dev/null +++ b/minix/net/lwip/loopif.c @@ -0,0 +1,420 @@ +/* LWIP service - loopif.c - loopback interfaces */ +/* + * There is always at least one loopback device. This device is used also to + * loop back packets sent on other interfaces to the local interface address. + * Therefore, not all packets on the loopback device have a source or + * destination address corresponding to the loopback device. + */ + +#include "lwip.h" + +/* + * As a safety measure, if lwIP somehow gets stuck in a loop replying to its + * own packets on a loopback interface, stop with immediately feeding packets + * back into lwIP after this many packets. The remaining packets will still be + * delivered, but not before the main message loop has had a chance to run. + */ +#define LOOPIF_LIMIT 65536 + +/* + * The MTU is restricted to 65531 bytes, because we need space for a 4-byte + * header to identify the original interface of the packet. + */ +#define LOOPIF_MAX_MTU (UINT16_MAX - sizeof(uint32_t)) /* maximum MTU */ +#define LOOPIF_DEF_MTU LOOPIF_MAX_MTU /* default MTU */ + +#define NR_LOOPIF 2 /* number of loopback devices */ + +struct loopif { + struct ifdev loopif_ifdev; /* interface device, MUST be first */ + struct pbuf *loopif_head; /* head of pending loopback packets */ + struct pbuf **loopif_tailp; /* tail ptr-ptr of pending packets */ + TAILQ_ENTRY(loopif) loopif_next; /* next in free list */ +} loopif_array[NR_LOOPIF]; + +static TAILQ_HEAD(, loopif) loopif_freelist; /* free loop interfaces list */ +static TAILQ_HEAD(, loopif) loopif_activelist; /* active loop interfaces */ + +#define loopif_get_netif(loopif) (ifdev_get_netif(&(loopif)->loopif_ifdev)) + +static unsigned int loopif_cksum_flags; + +static int loopif_create(const char *name); + +static const struct ifdev_ops loopif_ops; + +/* + * Initialize the loopback interface module. + */ +void +loopif_init(void) +{ + unsigned int slot; + + /* Initialize the lists of loopback interfaces. */ + TAILQ_INIT(&loopif_freelist); + TAILQ_INIT(&loopif_activelist); + + for (slot = 0; slot < __arraycount(loopif_array); slot++) + TAILQ_INSERT_TAIL(&loopif_freelist, &loopif_array[slot], + loopif_next); + + /* + * The default is to perform no checksumming on loopback interfaces, + * except for ICMP messages because otherwise we would need additional + * changes in the code receiving those. In fact, for future + * compatibility, disable only those flags that we manage ourselves. + */ + loopif_cksum_flags = NETIF_CHECKSUM_ENABLE_ALL & + ~(NETIF_CHECKSUM_GEN_IP | NETIF_CHECKSUM_CHECK_IP | + NETIF_CHECKSUM_GEN_UDP | NETIF_CHECKSUM_CHECK_UDP | + NETIF_CHECKSUM_GEN_TCP | NETIF_CHECKSUM_CHECK_TCP); + + /* Tell the ifdev module that users may create more loopif devices. */ + ifdev_register("lo", loopif_create); +} + +/* + * Polling function, invoked after each message loop iteration. Forward any + * packets received on the output side of the loopback device during this + * loop iteration, to the input side of the device. 
+ */ +static void +loopif_poll(struct ifdev * ifdev) +{ + struct loopif *loopif = (struct loopif *)ifdev; + struct pbuf *pbuf, **pnext; + struct ifdev *oifdev; + struct netif *netif; + uint32_t oifindex; + unsigned int count; + static int warned = FALSE; + + count = 0; + + while ((pbuf = loopif->loopif_head) != NULL) { + /* + * Prevent endless loops. Keep in mind that packets may be + * added to the queue as part of processing packets from the + * queue here, so the queue itself will never reach this + * length. As such the limit can (and must) be fairly high. + * + * In any case, if this warning is shown, that basically means + * that a bug in lwIP has been triggered. There should be no + * such bugs, so if there are, they should be fixed in lwIP. + */ + if (count++ == LOOPIF_LIMIT) { + if (!warned) { + printf("LWIP: excess loopback traffic, " + "throttling output\n"); + warned = TRUE; + } + + break; + } + + pnext = pchain_end(pbuf); + + if ((loopif->loopif_head = *pnext) == NULL) + loopif->loopif_tailp = &loopif->loopif_head; + *pnext = NULL; + + /* + * Get the original interface for the packet, which if non-zero + * must also be used to pass the packet back to. The interface + * should still exist in all cases, but better safe than sorry. + */ + memcpy(&oifindex, pbuf->payload, sizeof(oifindex)); + + util_pbuf_header(pbuf, -(int)sizeof(oifindex)); + + if (oifindex != 0 && + (oifdev = ifdev_get_by_index(oifindex)) != NULL) + netif = ifdev_get_netif(oifdev); + else + netif = NULL; + + /* + * Loopback devices hand packets to BPF on output only. Doing + * so on input as well would duplicate all captured packets. + */ + ifdev_input(ifdev, pbuf, netif, FALSE /*to_bpf*/); + } +} + +/* + * Process a packet as output on a loopback interface. Packets cannot be + * passed back into lwIP right away, nor can the original packets be passed + * back into lwIP. Therefore, make a copy of the packet, and pass it back to + * lwIP at the end of the current message loop iteration. + */ +static err_t +loopif_output(struct ifdev * ifdev, struct pbuf * pbuf, struct netif * netif) +{ + struct loopif *loopif = (struct loopif *)ifdev; + struct ifdev *oifdev; + struct pbuf *pcopy; + uint32_t oifindex; + + /* Reject oversized packets immediately. This should not happen. */ + if (pbuf->tot_len > UINT16_MAX - sizeof(oifindex)) { + printf("LWIP: attempt to send oversized loopback packet\n"); + + return ERR_MEM; + } + + /* + * If the service is low on memory, this is a likely place where + * allocation failures will occur. Thus, do not print anything here. + * The user can diagnose such problems with interface statistics. + */ + pcopy = pchain_alloc(PBUF_RAW, sizeof(oifindex) + pbuf->tot_len); + if (pcopy == NULL) { + ifdev_output_drop(ifdev); + + return ERR_MEM; + } + + /* + * If the packet was purposely diverted from a non-loopback interface + * to this interface, we have to remember the original interface, so + * that we can pass back the packet to that interface as well. If we + * don't, packets to link-local addresses assigned to non-loopback + * interfaces will not be processed correctly. 
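+ * For example, a packet sent to a link-local address assigned to an
+ * ethernet interface is diverted to this loopback interface, but should be
+ * delivered as though it arrived on that ethernet interface; otherwise, for
+ * instance because of address zone checks, it may not be accepted on input.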
+ */ + if (netif != NULL) { + oifdev = netif_get_ifdev(netif); + oifindex = ifdev_get_index(oifdev); + } else + oifindex = 0; + + assert(pcopy->len >= sizeof(oifindex)); + + memcpy(pcopy->payload, &oifindex, sizeof(oifindex)); + + util_pbuf_header(pcopy, -(int)sizeof(oifindex)); + + if (pbuf_copy(pcopy, pbuf) != ERR_OK) + panic("unexpected pbuf copy failure"); + + pcopy->flags |= pbuf->flags & (PBUF_FLAG_LLMCAST | PBUF_FLAG_LLBCAST); + + util_pbuf_header(pcopy, sizeof(oifindex)); + + *loopif->loopif_tailp = pcopy; + loopif->loopif_tailp = pchain_end(pcopy); + + return ERR_OK; +} + +/* + * Initialization function for a loopback-type netif interface, called from + * lwIP at interface creation time. + */ +static err_t +loopif_init_netif(struct ifdev * ifdev, struct netif * netif) +{ + + netif->name[0] = 'l'; + netif->name[1] = 'o'; + + /* + * FIXME: unfortunately, lwIP does not allow one to enable multicast on + * an interface without also enabling multicast management traffic + * (that is, IGMP and MLD). Thus, for now, joining multicast groups + * and assigning local IPv6 addresses will incur such traffic even on + * loopback interfaces. For now this is preferable over not supporting + * multicast on loopback interfaces at all. + */ + netif->flags |= NETIF_FLAG_IGMP | NETIF_FLAG_MLD6; + + NETIF_SET_CHECKSUM_CTRL(netif, loopif_cksum_flags); + + return ERR_OK; +} + +/* + * Create a new loopback device. + */ +static int +loopif_create(const char * name) +{ + struct loopif *loopif; + + /* Find a free loopback interface slot, if available. */ + if (TAILQ_EMPTY(&loopif_freelist)) + return ENOBUFS; + + loopif = TAILQ_FIRST(&loopif_freelist); + TAILQ_REMOVE(&loopif_freelist, loopif, loopif_next); + + /* Initialize the loopif structure. */ + TAILQ_INSERT_HEAD(&loopif_activelist, loopif, loopif_next); + + loopif->loopif_head = NULL; + loopif->loopif_tailp = &loopif->loopif_head; + + /* + * For simplicity and efficiency, we do not prepend the address family + * (IPv4/IPv6) to the packet for BPF, which means our loopback devices + * are of type DLT_RAW rather than (NetBSD's) DLT_NULL. + */ + ifdev_add(&loopif->loopif_ifdev, name, IFF_LOOPBACK | IFF_MULTICAST, + IFT_LOOP, 0 /*hdrlen*/, 0 /*addrlen*/, DLT_RAW, LOOPIF_MAX_MTU, + 0 /*nd6flags*/, &loopif_ops); + + ifdev_update_link(&loopif->loopif_ifdev, LINK_STATE_UP); + + return OK; +} + +/* + * Destroy an existing loopback device. + */ +static int +loopif_destroy(struct ifdev * ifdev) +{ + struct loopif *loopif = (struct loopif *)ifdev; + struct pbuf *pbuf, **pnext; + int r; + + /* + * The ifdev module may refuse to remove this interface if it is the + * loopback interface used to loop back packets for other interfaces. + */ + if ((r = ifdev_remove(&loopif->loopif_ifdev)) != OK) + return r; + + /* + * Clean up. The loopback queue can be non-empty only if we have been + * throttling in case of a feedback loop. + */ + while ((pbuf = loopif->loopif_head) != NULL) { + pnext = pchain_end(pbuf); + + if ((loopif->loopif_head = *pnext) == NULL) + loopif->loopif_tailp = &loopif->loopif_head; + *pnext = NULL; + + pbuf_free(pbuf); + } + + TAILQ_REMOVE(&loopif_activelist, loopif, loopif_next); + + TAILQ_INSERT_HEAD(&loopif_freelist, loopif, loopif_next); + + return OK; +} + +/* + * Set NetBSD-style interface flags (IFF_) for a loopback interface. + */ +static int +loopif_set_ifflags(struct ifdev * ifdev, unsigned int ifflags) +{ + struct loopif *loopif = (struct loopif *)ifdev; + + /* + * Only the IFF_UP flag may be set and cleared. 
We adjust the + * IFF_RUNNING flag immediately based on this flag. This is a bit + * dangerous, but the caller takes this possibility into account. + */ + if ((ifflags & ~IFF_UP) != 0) + return EINVAL; + + if (ifflags & IFF_UP) + ifdev_update_ifflags(&loopif->loopif_ifdev, + ifdev_get_ifflags(&loopif->loopif_ifdev) | IFF_RUNNING); + else + ifdev_update_ifflags(&loopif->loopif_ifdev, + ifdev_get_ifflags(&loopif->loopif_ifdev) & ~IFF_RUNNING); + + return OK; +} + +/* + * Set the Maximum Transmission Unit for this interface. Return TRUE if the + * new value is acceptable, in which case the caller will do the rest. Return + * FALSE otherwise. + */ +static int +loopif_set_mtu(struct ifdev * ifdev __unused, unsigned int mtu) +{ + + return (mtu <= LOOPIF_MAX_MTU); +} + +static const struct ifdev_ops loopif_ops = { + .iop_init = loopif_init_netif, + .iop_input = ip_input, + .iop_output = loopif_output, + .iop_poll = loopif_poll, + .iop_set_ifflags = loopif_set_ifflags, + .iop_set_mtu = loopif_set_mtu, + .iop_destroy = loopif_destroy, +}; + +/* + * Set and/or retrieve a per-protocol loopback checksumming option through + * sysctl(7). + */ +ssize_t +loopif_cksum(struct rmib_call * call, struct rmib_node * node __unused, + struct rmib_oldp * oldp, struct rmib_newp * newp) +{ + struct loopif *loopif; + unsigned int flags; + int r, val; + + /* + * The third name field is the protocol. We ignore the domain (the + * second field), thus sharing settings between PF_INET and PF_INET6. + * This is necessary because lwIP does not support TCP/UDP checksumming + * flags on a per-domain basis. + */ + switch (call->call_oname[2]) { + case IPPROTO_IP: + flags = NETIF_CHECKSUM_GEN_IP | NETIF_CHECKSUM_CHECK_IP; + break; + case IPPROTO_UDP: + flags = NETIF_CHECKSUM_GEN_UDP | NETIF_CHECKSUM_CHECK_UDP; + break; + case IPPROTO_TCP: + flags = NETIF_CHECKSUM_GEN_TCP | NETIF_CHECKSUM_CHECK_TCP; + break; + default: + return EINVAL; + } + + /* Copy out the old (current) checksumming option. */ + if (oldp != NULL) { + val = !!(loopif_cksum_flags & flags); + + if ((r = rmib_copyout(oldp, 0, &val, sizeof(val))) < 0) + return r; + } + + if (newp != NULL) { + if ((r = rmib_copyin(newp, &val, sizeof(val))) != OK) + return r; + + if (val) + loopif_cksum_flags |= flags; + else + loopif_cksum_flags &= ~flags; + + /* + * Apply the new checksum flags to all loopback interfaces. + * Technically, this may result in dropped packets when + * enabling checksumming on a throttled loopif, but that is a + * case so rare and unimportant that we ignore it. + */ + TAILQ_FOREACH(loopif, &loopif_activelist, loopif_next) { + NETIF_SET_CHECKSUM_CTRL(loopif_get_netif(loopif), + loopif_cksum_flags); + } + } + + /* Return the length of the node. */ + return sizeof(val); +} diff --git a/minix/net/lwip/lwip.c b/minix/net/lwip/lwip.c new file mode 100644 index 000000000..1f602ab0a --- /dev/null +++ b/minix/net/lwip/lwip.c @@ -0,0 +1,382 @@ +/* LWIP service - lwip.c - main program and dispatch code */ + +#include "lwip.h" +#include "tcpisn.h" +#include "mcast.h" +#include "ethif.h" +#include "rtsock.h" +#include "route.h" +#include "bpfdev.h" + +#include "lwip/init.h" +#include "lwip/sys.h" +#include "lwip/timeouts.h" +#include "arch/cc.h" + +static int running, recheck_timer; +static minix_timer_t lwip_timer; + +static void expire_lwip_timer(int); + +/* + * Return the system uptime in milliseconds. 
Also remember that lwIP retrieved + * the system uptime during this call, so that we know to check for timer + * updates at the end of the current iteration of the message loop. + */ +uint32_t +sys_now(void) +{ + + recheck_timer = TRUE; + + /* TODO: avoid 64-bit arithmetic if possible. */ + return (uint32_t)(((uint64_t)getticks() * 1000) / sys_hz()); +} + +/* + * Check if and when lwIP has its next timeout, and set or cancel our timer + * accordingly. + */ +static void +set_lwip_timer(void) +{ + uint32_t next_timeout; + clock_t ticks; + + /* Ask lwIP when the next alarm is supposed to go off, if any. */ + next_timeout = sys_timeouts_sleeptime(); + + /* + * Set or update the lwIP timer. We rely on set_timer() asking the + * kernel for an alarm only if the timeout is different from the one we + * gave it last time (if at all). However, due to conversions between + * absolute and relative times, and the fact that we cannot guarantee + * that the uptime itself does not change while executing these + * routines, set_timer() will sometimes be issuing a kernel call even + * if the alarm has not changed. Not a huge deal, but fixing this will + * require a different interface to lwIP and/or the timers library. + */ + if (next_timeout != (uint32_t)-1) { + /* + * Round up the next timeout (which is in milliseconds) to the + * number of clock ticks to add to the current time. Avoid any + * potential for overflows, no matter how unrealistic.. + */ + if (next_timeout > TMRDIFF_MAX / sys_hz()) + ticks = TMRDIFF_MAX; + else + ticks = (next_timeout * sys_hz() + 999) / 1000; + + set_timer(&lwip_timer, ticks, expire_lwip_timer, 0 /*unused*/); + } else + cancel_timer(&lwip_timer); /* not really needed.. */ +} + +/* + * The timer for lwIP timeouts has gone off. Check timeouts, and possibly set + * a new timer. + */ +static void +expire_lwip_timer(int arg __unused) +{ + + /* Let lwIP do its work. */ + sys_check_timeouts(); + + /* + * See if we have to update our timer for the next lwIP timer. Doing + * this here, rather than from the main loop, avoids one kernel call. + */ + set_lwip_timer(); + + recheck_timer = FALSE; +} + +/* + * Check whether we should adjust our local timer based on a change in the next + * lwIP timeout. + */ +static void +check_lwip_timer(void) +{ + + /* + * We make the assumption that whenever lwIP starts a timer, it will + * need to retrieve the current time. Thus, whenever sys_now() is + * called, we set the 'recheck_timer' flag. Here, we check whether to + * (re)set our lwIP timer only if the flag is set. As a result, we do + * not have to mess with timers for literally every incoming message. + * + * When lwIP stops a timer, it does not call sys_now(), and thus, we + * may miss such updates. However, timers being stopped should be rare + * and getting too many alarm messages is not a big deal. + */ + if (!recheck_timer) + return; + + set_lwip_timer(); + + /* Reset the flag for the next message loop iteration. */ + recheck_timer = FALSE; +} + +/* + * Return a random number, for use by lwIP. + */ +uint32_t +lwip_hook_rand(void) +{ + + /* + * The current known uses of this hook are for selection of initial + * TCP/UDP port numbers and for multicast-related timer randomness. + * The former case exists only to avoid picking the same starting port + * numbers after a reboot. After that, simple sequential iteration of + * the port numbers is used. The latter case varies the response time + * for sending multicast messages. 
Thus, none of the current uses of + * this function require proper randomness, and so we use the simplest + * approach, with time-based initialization to cover the reboot case. + * The sequential port number selection could be improved upon, but + * such an extension would probably bypass this hook anyway. + */ + return lrand48(); +} + +/* + * Create a new socket, with the given domain, type, and protocol, for the user + * process identified by 'user_endpt'. On success, return the new socket's + * identifier, with the libsockevent socket stored in 'sock' and an operations + * table stored in 'ops'. On failure, return a negative error code. + */ +static sockid_t +alloc_socket(int domain, int type, int protocol, endpoint_t user_endpt, + struct sock ** sock, const struct sockevent_ops **ops) +{ + + switch (domain) { + case PF_INET: +#ifdef INET6 + case PF_INET6: +#endif /* INET6 */ + switch (type) { + case SOCK_STREAM: + return tcpsock_socket(domain, protocol, sock, ops); + + case SOCK_DGRAM: + return udpsock_socket(domain, protocol, sock, ops); + + case SOCK_RAW: + if (!util_is_root(user_endpt)) + return EACCES; + + return rawsock_socket(domain, protocol, sock, ops); + + default: + return EPROTOTYPE; + } + + case PF_ROUTE: + return rtsock_socket(type, protocol, sock, ops); + + case PF_LINK: + return lnksock_socket(type, protocol, sock, ops); + + default: + /* This means that the service has been misconfigured. */ + printf("socket() with unsupported domain %d\n", domain); + + return EAFNOSUPPORT; + } +} + +/* + * Initialize the service. + */ +static int +init(int type __unused, sef_init_info_t * init __unused) +{ + + /* + * Initialize the random number seed. See the lwip_hook_rand() comment + * on why this weak random number source is currently sufficient. + */ + srand48(clock_time(NULL)); + + /* Initialize the lwIP library. */ + lwip_init(); + + /* Initialize the socket events library. */ + sockevent_init(alloc_socket); + + /* Initialize various helper modules. */ + mempool_init(); + tcpisn_init(); + mcast_init(); + + /* Initialize the high-level socket modules. */ + ipsock_init(); + tcpsock_init(); + udpsock_init(); + rawsock_init(); + + /* Initialize the various network interface modules. */ + ifdev_init(); + loopif_init(); + ethif_init(); + + /* Initialize the network device driver module. */ + ndev_init(); + + /* Initialize the low-level socket modules. */ + rtsock_init(); + lnksock_init(); + + /* Initialize the routing module. */ + route_init(); + + /* Initialize other device modules. */ + bpfdev_init(); + + /* + * Initialize the MIB module, after all other modules have registered + * their subtrees with this module. + */ + mibtree_init(); + + /* + * After everything else has been initialized, set up the default + * configuration - in particular, a loopback interface. + */ + ifconf_init(); + + /* + * Initialize the master timer for all the lwIP timers. Just in case + * lwIP starts a timer right away, perform a first check upon entry of + * the message loop. + */ + init_timer(&lwip_timer); + + recheck_timer = TRUE; + + running = TRUE; + + return OK; +} + +/* + * Perform initialization using the System Event Framework (SEF). + */ +static void +startup(void) +{ + + sef_setcb_init_fresh(init); + /* + * This service requires stateless restarts, in that several parts of + * the system (including VFS and drivers) expect that if restarted, + * this service comes back up with a new endpoint. Therefore, do not + * set a _restart callback here. + * + * TODO: support for live update. 
+ * + * TODO: support for immediate shutdown if no sockets are in use, as + * also done by UDS. For now, we never shut down immediately, giving + * other processes the opportunity to close sockets on system shutdown. + */ + + sef_startup(); +} + +/* + * The lwIP-based TCP/IP sockets driver. + */ +int +main(void) +{ + message m; + int r, ipc_status; + + startup(); + + while (running) { + /* + * For various reasons, the loopback interface does not pass + * packets back into the stack right away. Instead, it queues + * them up for later processing. We do that processing here. + */ + ifdev_poll(); + + /* + * Unfortunately, lwIP does not tell us when it starts or stops + * timers. This means that we have to check ourselves every + * time we have called into lwIP. For simplicity, we perform + * the check here. + */ + check_lwip_timer(); + + if ((r = sef_receive_status(ANY, &m, &ipc_status)) != OK) { + if (r == EINTR) + continue; /* sef_cancel() was called */ + + panic("sef_receive_status failed: %d", r); + } + + /* Process the received message. */ + if (is_ipc_notify(ipc_status)) { + switch (m.m_source) { + case CLOCK: + expire_timers(m.m_notify.timestamp); + + break; + + case DS_PROC_NR: + /* Network drivers went up and/or down. */ + ndev_check(); + + break; + + default: + printf("unexpected notify from %d\n", + m.m_source); + } + + continue; + } + + switch (m.m_source) { + case MIB_PROC_NR: + rmib_process(&m, ipc_status); + + break; + + case VFS_PROC_NR: + /* Is this a socket device request? */ + if (IS_SDEV_RQ(m.m_type)) { + sockevent_process(&m, ipc_status); + + break; + } + + /* Is this a character (or block) device request? */ + if (IS_CDEV_RQ(m.m_type) || IS_BDEV_RQ(m.m_type)) { + bpfdev_process(&m, ipc_status); + + break; + } + + /* FALLTHROUGH */ + default: + /* Is this a network device driver response? */ + if (IS_NDEV_RS(m.m_type)) { + ndev_process(&m, ipc_status); + + break; + } + + printf("unexpected message %d from %d\n", + m.m_type, m.m_source); + } + } + + return 0; +} diff --git a/minix/net/lwip/lwip.conf b/minix/net/lwip/lwip.conf new file mode 100644 index 000000000..3179721e3 --- /dev/null +++ b/minix/net/lwip/lwip.conf @@ -0,0 +1,10 @@ +service lwip +{ + domain + INET INET6 ROUTE LINK + ; + system KILL; # for SIGPIPE + ipc + SYSTEM vfs rs vm mib + ; +}; diff --git a/minix/net/lwip/lwip.h b/minix/net/lwip/lwip.h new file mode 100644 index 000000000..2f65ab8ee --- /dev/null +++ b/minix/net/lwip/lwip.h @@ -0,0 +1,130 @@ +#ifndef MINIX_NET_LWIP_LWIP_H +#define MINIX_NET_LWIP_LWIP_H + +#include +#include +#include +#include +#include +#include + +#include "lwip/ip.h" +#include "lwiphooks.h" + +#include "addr.h" +#include "ipsock.h" +#include "ifdev.h" +#include "util.h" + +/* + * The standard sockaddr_dl is an absolute pain, because the actual structure + * is dynamically sized, while the standard definition is neither the minimum + * nor the maximum size. We use our own version, which uses the maximum size + * that we will ever produce and accept. This greatly simplifies dealing with + * this structure while also limiting stack usage a bit. 
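+ * As a rough illustration of the resulting size (the exact figures depend
+ * on IFNAMSIZ and NETIF_MAX_HWADDR_LEN): with the common values of 16 and
+ * 6, respectively, the structure consists of 8 fixed bytes plus a 22-byte
+ * data area, or 30 bytes in total.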
+ */ +struct sockaddr_dlx { + uint8_t sdlx_len; /* actual length of this structure */ + sa_family_t sdlx_family; /* address family, always AF_LINK */ + uint16_t sdlx_index; /* interface index */ + uint8_t sdlx_type; /* interface type (IFT_) */ + uint8_t sdlx_nlen; /* interface name length, w/o nul */ + uint8_t sdlx_alen; /* link-layer address length */ + uint8_t sdlx_slen; /* selector length, always 0 */ + uint8_t sdlx_data[IFNAMSIZ + NETIF_MAX_HWADDR_LEN]; +}; + +STATIC_SOCKADDR_MAX_ASSERT(sockaddr_in); +STATIC_SOCKADDR_MAX_ASSERT(sockaddr_in6); +STATIC_SOCKADDR_MAX_ASSERT(sockaddr_dlx); + +/* This is our own, much smaller internal version of sockaddr_storage. */ +union sockaddr_any { + struct sockaddr sa; + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + struct sockaddr_dlx sdlx; +}; + +/* Number of bits in each of the types of IP addresses. */ +#define IP4_BITS 32 /* number of bits in an IPv4 address */ +#define IP6_BITS 128 /* number of bits in an IPv6 address */ + +/* + * Each socket module maintains its own set of sockets, but all sockets must be + * given globally unique identifiers. Therefore, we use these modifier masks, + * which are bitwise OR'ed with the per-module socket identifiers. + */ +#define SOCKID_TCP 0x00000000 +#define SOCKID_UDP 0x00100000 +#define SOCKID_RAW 0x00200000 +#define SOCKID_RT 0x00400000 +#define SOCKID_LNK 0x00800000 + +/* + * Static remote MIB node identifiers for nodes that are dynamically numbered + * on NetBSD, because they do not have a corresponding protocol family number. + */ +#define NET_INTERFACES (PF_MAX) /* net.interfaces (TODO) */ +#define NET_BPF (PF_MAX + 1) /* net.bpf */ + +#define ROOT_EUID 0 /* effective user ID of superuser */ + +/* + * Function declarations. Modules with more extended interfaces have their own + * header files. 
+ */ + +/* mempool.c */ +void mempool_init(void); +unsigned int mempool_cur_buffers(void); +unsigned int mempool_max_buffers(void); + +/* pchain.c */ +struct pbuf **pchain_end(struct pbuf * pbuf); +size_t pchain_size(struct pbuf * pbuf); + +/* addrpol.c */ +int addrpol_get_label(const ip_addr_t * ipaddr); +int addrpol_get_scope(const ip_addr_t * ipaddr, int is_src); + +/* tcpsock.c */ +void tcpsock_init(void); +sockid_t tcpsock_socket(int domain, int protocol, struct sock ** sock, + const struct sockevent_ops ** ops); + +/* udpsock.c */ +void udpsock_init(void); +sockid_t udpsock_socket(int domain, int protocol, struct sock ** sock, + const struct sockevent_ops ** ops); + +/* rawsock.c */ +void rawsock_init(void); +sockid_t rawsock_socket(int domain, int protocol, struct sock ** sock, + const struct sockevent_ops ** ops); + +/* loopif.c */ +void loopif_init(void); +ssize_t loopif_cksum(struct rmib_call * call, struct rmib_node * node, + struct rmib_oldp * oldp, struct rmib_newp * newp); + +/* lnksock.c */ +void lnksock_init(void); +sockid_t lnksock_socket(int type, int protocol, struct sock ** sock, + const struct sockevent_ops ** ops); + +/* mibtree.c */ +void mibtree_init(void); +void mibtree_register_inet(int domain, int protocol, struct rmib_node * node); +void mibtree_register_lwip(struct rmib_node * node); + +/* ifconf.c */ +void ifconf_init(void); +int ifconf_ioctl(struct sock * sock, unsigned long request, + const struct sockdriver_data * data, endpoint_t user_endpt); + +/* bpf_filter.c */ +u_int bpf_filter_ext(const struct bpf_insn * pc, const struct pbuf * pbuf, + const u_char * packet, u_int total, u_int len); + +#endif /* !MINIX_NET_LWIP_LWIP_H */ diff --git a/minix/net/lwip/mcast.c b/minix/net/lwip/mcast.c new file mode 100644 index 000000000..832dce1a4 --- /dev/null +++ b/minix/net/lwip/mcast.c @@ -0,0 +1,283 @@ +/* LWIP service - mcast.c - per-socket multicast membership tracking */ +/* + * Each socket has a linked list of multicast groups of which it is a member. + * The linked list consists of 'mcast_member' elements. There is both a global + * limit (the number of elements in 'mcast_array') and a per-socket limit on + * group membership. Since multiple sockets may join the same multicast + * groups, there is not a one-to-one relationship between our membership + * structures and the lwIP IGMP/MLD membership structures. Moreover, linking + * to the latter structures directly is not intended by lwIP, so we have to + * keep our own tracking independent, which in particular means that we have to + * make a copy of the multicast group address. + * + * We currently put no effort into saving memory on storing that group address. + * Optimization is complicated by the fact that we have to be able to remove + * membership structures when their corresponding interface disappears, which + * currently involves removal without knowing about the corresponding socket, + * and therefore the socket's address family. All of this can be changed. + * + * There is no function to test whether a particular socket is a member of a + * multicast group. The pktsock module currently makes the assumption that if + * a socket has been joined to any multicast groups, or set any multicast + * options, the application is multicast aware and therefore able to figure out + * whether it is interested in particular packets, and so we do not filter + * incoming packets against the receiving socket's multicast list. This should + * be more or less in line with what W. Richard Stevens say that the BSDs do. 
+ */ + +#include "lwip.h" +#include "mcast.h" + +#include "lwip/igmp.h" +#include "lwip/mld6.h" + +/* + * The per-socket limit on group membership. In theory, the limit should be + * high enough that a single socket can join a particular multicast group on + * all interfaces that support multicast. In practice, we set it a bit lower + * to prevent one socket from using up half of the entries per address family. + * Setting it to IP_MAX_MEMBERSHIPS is definitely excessive right now.. + */ +#define MAX_GROUPS_PER_SOCKET 8 + +static struct mcast_member { + LIST_ENTRY(mcast_member) mm_next; /* next in socket, free list */ + struct ifdev * mm_ifdev; /* interface (NULL: free) */ + ip_addr_t mm_group; /* group address */ +} mcast_array[NR_IPV4_MCAST_GROUP + NR_IPV6_MCAST_GROUP]; + +static LIST_HEAD(, mcast_member) mcast_freelist; + +/* + * Initialize the per-socket multicast membership module. + */ +void +mcast_init(void) +{ + unsigned int slot; + + /* Initialize the list of free multicast membership entries. */ + LIST_INIT(&mcast_freelist); + + for (slot = 0; slot < __arraycount(mcast_array); slot++) { + mcast_array[slot].mm_ifdev = NULL; + + LIST_INSERT_HEAD(&mcast_freelist, &mcast_array[slot], mm_next); + } +} + +/* + * Reset the multicast head for a socket. The socket must not have any + * previous multicast group memberships. + */ +void +mcast_reset(struct mcast_head * mcast_head) +{ + + LIST_INIT(&mcast_head->mh_list); +} + +/* + * Attempt to add a per-socket multicast membership association. The given + * 'mcast_head' pointer is part of a socket. The 'group' parameter is the + * multicast group to join. It is a properly zoned address, but has not been + * checked in any other way. If 'ifdev' is not NULL, it is the interface for + * the membership; if it is NULL, an interface will be selected using routing. + * Return OK if the membership has been successfully removed, or a negative + * error code otherwise. + */ +int +mcast_join(struct mcast_head * mcast_head, const ip_addr_t * group, + struct ifdev * ifdev) +{ + struct mcast_member *mm; + struct netif *netif; + unsigned int count; + err_t err; + + /* + * The callers of this function perform only checks that depend on the + * address family. We check everything else here. + */ + if (!ip_addr_ismulticast(group)) + return EADDRNOTAVAIL; + + if (!addr_is_valid_multicast(group)) + return EINVAL; + + /* + * If no interface was specified, pick one with a routing query. Note + * that scoped IPv6 addresses do require an interface to be specified. + */ + if (ifdev == NULL) { + netif = ip_route(IP46_ADDR_ANY(IP_GET_TYPE(group)), group); + + if (netif == NULL) + return EHOSTUNREACH; + + ifdev = netif_get_ifdev(netif); + } + + assert(ifdev != NULL); + assert(!IP_IS_V6(group) || + !ip6_addr_lacks_zone(ip_2_ip6(group), IP6_MULTICAST)); + + /* The interface must support multicast. */ + if (!(ifdev_get_ifflags(ifdev) & IFF_MULTICAST)) + return EADDRNOTAVAIL; + + /* + * First see if this socket is already joined to the given group, which + * is an error. While looking, also count the number of groups the + * socket has joined already, to enforce the per-socket limit. + */ + count = 0; + + LIST_FOREACH(mm, &mcast_head->mh_list, mm_next) { + if (mm->mm_ifdev == ifdev && ip_addr_cmp(&mm->mm_group, group)) + return EEXIST; + + count++; + } + + if (count >= MAX_GROUPS_PER_SOCKET) + return ENOBUFS; + + /* Do we have a free membership structure available? 
*/ + if (LIST_EMPTY(&mcast_freelist)) + return ENOBUFS; + + /* + * Nothing can go wrong as far as we are concerned. Ask lwIP to join + * the multicast group. This may result in a multicast list update at + * the driver end. + */ + netif = ifdev_get_netif(ifdev); + + if (IP_IS_V6(group)) + err = mld6_joingroup_netif(netif, ip_2_ip6(group)); + else + err = igmp_joingroup_netif(netif, ip_2_ip4(group)); + + if (err != ERR_OK) + return util_convert_err(err); + + /* + * Success. Allocate, initialize, and attach a membership structure to + * the socket. + */ + mm = LIST_FIRST(&mcast_freelist); + + LIST_REMOVE(mm, mm_next); + + mm->mm_ifdev = ifdev; + mm->mm_group = *group; + + LIST_INSERT_HEAD(&mcast_head->mh_list, mm, mm_next); + + return OK; +} + +/* + * Free the given per-socket multicast membership structure, which must + * previously have been associated with a socket. If 'leave_group' is set, + * also tell lwIP to leave the corresponding multicast group. + */ +static void +mcast_free(struct mcast_member * mm, int leave_group) +{ + struct netif *netif; + err_t err; + + assert(mm->mm_ifdev != NULL); + + if (leave_group) { + netif = ifdev_get_netif(mm->mm_ifdev); + + if (IP_IS_V6(&mm->mm_group)) + err = mld6_leavegroup_netif(netif, + ip_2_ip6(&mm->mm_group)); + else + err = igmp_leavegroup_netif(netif, + ip_2_ip4(&mm->mm_group)); + + if (err != ERR_OK) + panic("lwIP multicast membership desynchronization"); + } + + LIST_REMOVE(mm, mm_next); + + mm->mm_ifdev = NULL; + + LIST_INSERT_HEAD(&mcast_freelist, mm, mm_next); +} + +/* + * Attempt to remove a per-socket multicast membership association. The given + * 'mcast_head' pointer is part of a socket. The 'group' parameter is the + * multicast group to leave. It is a properly zoned address, but has not been + * checked in any other way. If 'ifdev' is not NULL, it is the interface of + * the membership; if it is NULL, a membership matching the address on any + * interface will suffice. As such, the parameter requirements mirror those of + * mcast_join(). Return OK if the membership has been successfully removed, or + * a negative error code otherwise. + */ +int +mcast_leave(struct mcast_head * mcast_head, const ip_addr_t * group, + struct ifdev * ifdev) +{ + struct mcast_member *mm; + + /* + * Look up a matching entry. The fact that we must find a match for + * the given address and interface, keeps us from having to perform + * various other checks, such as whether the given address is a + * multicast address at all. The exact error codes are not specified. + */ + LIST_FOREACH(mm, &mcast_head->mh_list, mm_next) { + if ((ifdev == NULL || mm->mm_ifdev == ifdev) && + ip_addr_cmp(&mm->mm_group, group)) + break; + } + + if (mm == NULL) + return ESRCH; + + mcast_free(mm, TRUE /*leave_group*/); + + return OK; +} + +/* + * Remove all per-socket multicast membership associations of the given socket. + * This function is called when the socket is closed. + */ +void +mcast_leave_all(struct mcast_head * mcast_head) +{ + struct mcast_member *mm; + + while (!LIST_EMPTY(&mcast_head->mh_list)) { + mm = LIST_FIRST(&mcast_head->mh_list); + + mcast_free(mm, TRUE /*leave_group*/); + } +} + +/* + * The given interface is about to disappear. Remove and free any per-socket + * multicast membership structures associated with the interface, without + * leaving the multicast group itself (as that will happen a bit later anyway). 
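+ * Since the sockets owning these memberships are not known at this point,
+ * the global membership array is scanned directly rather than through any
+ * per-socket list.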
+ */ +void +mcast_clear(struct ifdev * ifdev) +{ + unsigned int slot; + + for (slot = 0; slot < __arraycount(mcast_array); slot++) { + if (mcast_array[slot].mm_ifdev != ifdev) + continue; + + mcast_free(&mcast_array[slot], FALSE /*leave_group*/); + } +} diff --git a/minix/net/lwip/mcast.h b/minix/net/lwip/mcast.h new file mode 100644 index 000000000..2b962503f --- /dev/null +++ b/minix/net/lwip/mcast.h @@ -0,0 +1,21 @@ +#ifndef MINIX_NET_LWIP_MCAST_H +#define MINIX_NET_LWIP_MCAST_H + +struct mcast_member; + +struct mcast_head { + LIST_HEAD(, mcast_member) mh_list; +}; + +#define mcast_isempty(mcast_head) (LIST_EMPTY(&(mcast_head)->mh_list)) + +void mcast_init(void); +void mcast_reset(struct mcast_head * mcast_head); +int mcast_join(struct mcast_head * mcast_head, const ip_addr_t * group, + struct ifdev * ifdev); +int mcast_leave(struct mcast_head * mcast_head, const ip_addr_t * group, + struct ifdev * ifdev); +void mcast_leave_all(struct mcast_head * mcast_head); +void mcast_clear(struct ifdev * ifdev); + +#endif /* !MINIX_NET_LWIP_MCAST_H */ diff --git a/minix/net/lwip/mempool.c b/minix/net/lwip/mempool.c new file mode 100644 index 000000000..11d18a048 --- /dev/null +++ b/minix/net/lwip/mempool.c @@ -0,0 +1,821 @@ +/* LWIP service - mempool.c - memory pool management and slab allocation */ +/* + * This module should be considered a replacement for lwIP's PBUF_POOL and + * custom-pools functionality. lwIP's PBUF_POOL system allows a PBUF_POOL type + * allocation for a moderately large amount of memory, for example for a full- + * sized packet, to be turned into a chain of "pbuf" buffers, each of a static + * size. Most of lwIP can deal with such pbuf chains, because many other types + * of allocations also end up consisting of pbuf chains. However, lwIP will + * never use PBUF_POOL for its own memory allocations, and use PBUF_RAM + * allocations instead. Such PBUF_RAM allocations always return one single + * pbuf with a contiguous memory area. lwIP's custom pools support allows such + * PBUF_RAM allocations to draw from user-defined pools of statically allocated + * memory, as an alternative to turning such allocations into malloc() calls. + * + * However, lwIP itself does not offer a way to combine these two pool systems: + * the PBUF_POOL buffer pool and the custom pools are completely separate. We + * want to be able to draw both kinds of memory from the same pool. This is + * the first reason that we are using our own memory pools. The second is + * something that lwIP could never offer anyway: we would like to provide a + * certain amount of static/preallocated memory for those types of allocations, + * but optionally also add a much larger amount of dynamic memory when needed. + * + * In order to make this module work, we do not use PBUF_POOL anywhere. + * Instead, we use chained static-sized PBUF_RAM allocations for all types of + * allocations that we manage ourselves--see pchain_alloc(). We tell lwIP to + * use the functions in this module to do the malloc-type allocations for those + * PBUF_RAM buffers. As such, this module manages all PBUF_RAM allocations, + * both from our own code and from lwIP. Note that we do still use lwIP's own + * pools for various lwIP structures. We do want to keep the isolation + * provided by the use of such pools, even though that means that we have to + * provision some of those pools for the worst case, resulting in some memory + * overhead that is unnecessary for the common case. 
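+ *
+ * (As noted at the bottom of this file, lwIP's own malloc(), calloc(), and
+ * free() calls are routed to mempool_malloc(), mempool_calloc(), and
+ * mempool_free(), so that all PBUF_RAM memory passes through this module.)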
+ * + * With PBUF_RAM allocation redirection system in place, this module has to + * manage the memory for those allocations. It does this based on the + * assertion that there are three main classes of PBUF_RAM allocation sizes: + * + * - "large" allocations: these are allocations for up to MEMPOOL_BUFSIZE bytes + * of PBUF_RAM data, where MEMPOOL_BUFSIZE is the allocation granularity that + * we have picked for the individual buffers in larger chains. It is set to + * 512 bytes right now, mainly to keep pbuf chains for full-sized ethernet + * packets short, which has many performance advantages. Since the pbuf + * header itself also takes some space (16 bytes, right now), this results in + * allocations seen by mempool_malloc() of up to just over 512 bytes. + * - "small" allocations: these are allocations mostly for packet headers, as + * needed by lwIP to prepend to (mainly TCP) packet data that we give to it. + * The size of these allocations varies, but most are 76 bytes (80 bytes if + * we ever add VLAN support), plus once again the pbuf header. + * - "excessive" allocations: these are allocations larger than the maximum + * we have configured, effectively requesting contiguous memory of (possibly + * far) more than 512 bytes. We do not make such allocations ourselves, as + * we only ever create pbuf chains. Thus, any such allocations come from + * lwIP. There are a few locations in lwIP that attempt to make those kinds + * of allocations, but we replace one important case in the lwIP code with + * a chained allocation, (currently) leaving only one case: allocation of + * ICMP ping reply packets. In this module, we outright *deny* any excessive + * allocations. Practically, that means that no replies are generated for + * requests exceeding around 460 bytes, which is in fact not bad, especially + * since we have multicast ICMP ping replying enabled. If any new cases of + * excessive allocations are added to lwIP in the future, we will have to + * deal with those on a case-by-case basis, but for now this should be all. + * + * This module caters to the first two types of allocations. For large buffer + * allocations, it provides a standard slab allocator, with a hardcoded slab + * size of MEMPOOL_LARGE_COUNT buffers with a 512-byte data area each. One + * slab is allocated at service start-up; additional slabs up to a configured + * maximum are allocated on demand. Once fallen out of use, all but one slabs + * will be freed after a while, using a timer. The current per-slab count of + * 512 large buffers, combined with the buffer size of 512 plus the pbuf header + * plus a bit of extra overhead, results in about 266 KB per slab. + * + * For small buffer allocations, there are two facilities. First, there is a + * static pool of small buffers. This pool currently provides 256 small-sized + * buffers, mainly in order to allow packet headers to be produced even in low- + * memory conditions. In addition, small buffers may be formed by allocating + * and then splitting up one large buffer. The module is currently configured + * to split one large buffer into four small buffers, which yields a small + * buffer size of just over 100 bytes--enough for the packet headers while + * leaving little slack on either side. + * + * It is important to note that large and small buffer allocations are freed up + * through the same function, with no information on the original allocation + * size. As a result, we have to distinguish between large and small buffers + * using a unified system. 
In particular, this module prepends each of its + * allocations by a single pointer, which points to a header structure that is + * at the very beginning of the slab that contains the allocated buffer. That + * header structure contains information about the type of slab (large or + * small) as well as some accounting information used by both types. + * + * For large-buffer slabs, this header is part of a larger structure with for + * example the slab's list of free buffers. This larger structure is then + * followed by the actual buffers in the slab. + * + * For small-buffer slabs, the header is followed directly by the actual small + * buffers. Thus, when a large buffer is split up into four small buffers, the + * data area of that large buffer consists of a small-type slab header and four + * small buffers. The large buffer itself is simply considered in use, as + * though it was allocated for regular data. This nesting approach saves a lot + * of memory for small allocations, at the cost of a bit more computation. + * + * It should be noted that all allocations should be (and are) pointer-aligned. + * Normally lwIP would check for this, but we cannot tell lwIP the platform + * pointer size without hardcoding that size. This module performs proper + * alignment of all buffers itself though, regardless of the pointer size. + */ + +#include "lwip.h" + +#include + +/* Alignment to pointer sizes. */ +#define MEMPOOL_ALIGN_DOWN(s) ((s) & ~(sizeof(void *) - 1)) +#define MEMPOOL_ALIGN_UP(s) MEMPOOL_ALIGN_DOWN((s) + sizeof(void *) - 1) + +/* Large buffers: per-slab count and data area size. */ +#define MEMPOOL_LARGE_COUNT 512 +#define MEMPOOL_LARGE_SIZE \ + (MEMPOOL_ALIGN_UP(sizeof(struct pbuf)) + MEMPOOL_BUFSIZE) + +/* Small buffers: per-slab count and data area size. */ +#define MEMPOOL_SMALL_COUNT 4 +#define MEMPOOL_SMALL_SIZE \ + (MEMPOOL_ALIGN_DOWN(MEMPOOL_LARGE_SIZE / MEMPOOL_SMALL_COUNT) - \ + sizeof(struct mempool_header)) + +/* Memory pool slab header, part of both small and large slabs. */ +struct mempool_header { + union { + struct { + uint8_t mhui_flags; + uint32_t mhui_inuse; + } mhu_info; + void *mhu_align; /* force pointer alignment */ + } mh_u; +}; +#define mh_flags mh_u.mhu_info.mhui_flags +#define mh_inuse mh_u.mhu_info.mhui_inuse + +/* Header flags. */ +#define MHF_SMALL 0x01 /* slab is for small buffers, not large ones */ +#define MHF_STATIC 0x02 /* small slab is statically allocated */ +#define MHF_MARKED 0x04 /* large empty slab is up for deallocation */ + +/* + * Large buffer. When allocated, mlb_header points to the (header of) the + * containing large slab, and mlb_data is returned for arbitrary use by the + * user of the buffer. When free, mlb_header is NULL and instead mlb_header2 + * points to the containing slab (allowing for double-free detection), and the + * buffer is on the slab's free list by using mlb_next. + */ +struct mempool_large_buf { + struct mempool_header *mlb_header; + union { + struct { + struct mempool_header *mlbuf_header2; + LIST_ENTRY(mempool_large_buf) mlbuf_next; + } mlbu_free; + char mlbu_data[MEMPOOL_LARGE_SIZE]; + } mlb_u; +}; +#define mlb_header2 mlb_u.mlbu_free.mlbuf_header2 +#define mlb_next mlb_u.mlbu_free.mlbuf_next +#define mlb_data mlb_u.mlbu_data + +/* Small buffer. Same idea, different size. 
*/ +struct mempool_small_buf { + struct mempool_header *msb_header; + union { + struct { + struct mempool_header *msbuf_header2; + TAILQ_ENTRY(mempool_small_buf) msbuf_next; + } msbu_free; + char msbu_data[MEMPOOL_SMALL_SIZE]; + } msb_u; +}; +#define msb_header2 msb_u.msbu_free.msbuf_header2 +#define msb_next msb_u.msbu_free.msbuf_next +#define msb_data msb_u.msbu_data + +/* + * A large slab, including header, other per-slab fields, and large buffers. + * Each of these structures is on exactly one of three slab lists, depending + * on whether all its buffers are free (empty), some but not all of its buffers + * are in use (partial), or all of its buffers are in use (full). The mls_next + * field is used for that list. The mls_free field is the per-slab list of + * free buffers. + */ +struct mempool_large_slab { + struct mempool_header mls_header; /* MUST be first */ + LIST_ENTRY(mempool_large_slab) mls_next; + LIST_HEAD(, mempool_large_buf) mls_free; + struct mempool_large_buf mls_buf[MEMPOOL_LARGE_COUNT]; +}; + +/* The three slab lists for large slabs, as described above. */ +static LIST_HEAD(, mempool_large_slab) mempool_empty_slabs; +static LIST_HEAD(, mempool_large_slab) mempool_partial_slabs; +static LIST_HEAD(, mempool_large_slab) mempool_full_slabs; + +/* + * A small slab, including header and small buffers. We use unified free lists + * for small buffers, and these small slabs are not part of any lists + * themselves, so we need neither of the two fields from large slabs for that. + */ +struct mempool_small_slab { + struct mempool_header mss_header; /* MUST be first */ + struct mempool_small_buf mss_buf[MEMPOOL_SMALL_COUNT]; +}; + +/* + * The free lists for static small buffers (from the static pool, see below) + * and dynamic small buffers (as obtained by splitting large buffers). + */ +static TAILQ_HEAD(, mempool_small_buf) mempool_small_static_freelist; +static TAILQ_HEAD(, mempool_small_buf) mempool_small_dynamic_freelist; + +/* + * A static pool of small buffers. Small buffers are somewhat more important + * than large buffers, because they are used for packet headers. The purpose + * of this static pool is to be able to make progress even if all large buffers + * are allocated for data, typically in the case that the system is low on + * memory. Note that the number of static small buffers is the given number of + * small slabs multiplied by MEMPOOL_SMALL_COUNT, hence the division. + */ +#define MEMPOOL_SMALL_SLABS (256 / MEMPOOL_SMALL_COUNT) + +static struct mempool_small_slab mempool_small_pool[MEMPOOL_SMALL_SLABS]; + +/* + * The following setting (mempool_max_slabs) can be changed through sysctl(7). + * As such it may be set by userland to a completely arbitrary value and must + * be sanity-checked before any actual use. The default is picked such that + * all TCP sockets can fill up their send and receive queues: (TCP_SNDBUF_DEF + + * TCP_RCVBUF_DEF) * NR_TCPSOCK / (MEMPOOL_BUFSIZE * MEMPOOL_LARGE_COUNT) = + * (32768 + 32768) * 256 / (512 * 512) = 64. We put in the resulting number + * rather than the formula because not all those definitions are public. 
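+ * At the roughly 266 KB per slab mentioned earlier, 64 slabs indeed add up
+ * to the approximately 17 MB noted next to the definition below.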
+ */ +#define MEMPOOL_DEFAULT_MAX_SLABS 64 /* about 17 MB of memory */ + +static int mempool_max_slabs; /* maximum number of large slabs */ +static int mempool_nr_slabs; /* current number of large slabs */ + +static int mempool_nr_large; /* current number of large buffers */ +static int mempool_used_large; /* large buffers currently in use */ +static int mempool_used_small; /* small buffers currently in use */ + +/* + * Number of clock ticks between timer invocations. The timer is used to + * deallocate unused slabs. + */ +#define MEMPOOL_TIMER_TICKS (10 * sys_hz()) + +static minix_timer_t mempool_timer; + +static int mempool_defer_alloc; /* allocation failed, defer next try */ + +/* The CTL_MINIX MINIX_LWIP "mempool" subtree. Dynamically numbered. */ +static struct rmib_node minix_lwip_mempool_table[] = { + RMIB_INTPTR(RMIB_RW, &mempool_max_slabs, "slab_max", + "Maximum number of memory slabs (configurable)"), + RMIB_INTPTR(RMIB_RO, &mempool_nr_slabs, "slab_num", + "Current number of memory slabs"), + RMIB_INT(RMIB_RO, sizeof(struct mempool_large_slab), "slab_size", + "Byte size of a single memory slab"), + RMIB_INT(RMIB_RO, MEMPOOL_LARGE_COUNT, "slab_bufs", + "Number of large buffers per memory slab"), + RMIB_INTPTR(RMIB_RO, &mempool_nr_large, "large_num", + "Current total number of large buffers"), + RMIB_INTPTR(RMIB_RO, &mempool_used_large, "large_used", + "Current number of used large buffers"), + RMIB_INT(RMIB_RO, MEMPOOL_LARGE_SIZE, "large_size", + "Byte size of a single large buffer"), + RMIB_INTPTR(RMIB_RO, &mempool_used_small, "small_used", + "Current number of used small buffers"), + RMIB_INT(RMIB_RO, MEMPOOL_SMALL_SIZE, "small_size", + "Byte size of a single small buffer"), +}; + +static struct rmib_node minix_lwip_mempool_node = + RMIB_NODE(RMIB_RO, minix_lwip_mempool_table, "mempool", + "Memory pool settings"); + +/* + * Initialize the given "slab" of small buffers. The slab may either come from + * the statically allocated pool ('is_static' is TRUE) or a single large buffer + * that we aim to chop up into small buffers. + */ +static void +mempool_prepare_small(struct mempool_small_slab * mss, int is_static) +{ + struct mempool_small_buf *msb; + unsigned int count; + + mss->mss_header.mh_flags = MHF_SMALL | ((is_static) ? MHF_STATIC : 0); + mss->mss_header.mh_inuse = 0; + + msb = mss->mss_buf; + + for (count = 0; count < MEMPOOL_SMALL_COUNT; count++, msb++) { + msb->msb_header = NULL; + msb->msb_header2 = &mss->mss_header; + + if (is_static) + TAILQ_INSERT_HEAD(&mempool_small_static_freelist, msb, + msb_next); + else + TAILQ_INSERT_HEAD(&mempool_small_dynamic_freelist, msb, + msb_next); + } +} + +/* + * Allocate a new slab for large buffers, if allowed by policy and possible. + */ +static void +mempool_new_slab(void) +{ + struct mempool_large_slab *mls; + struct mempool_large_buf *mlb; + unsigned int count; + + /* + * See if allocating a new slab would result in overrunning the + * configured maximum number of large buffers. Round the maximum, + * which is probably what the user intended. + */ + if (mempool_cur_buffers() + MEMPOOL_LARGE_COUNT / 2 > + mempool_max_buffers()) { + assert(mempool_nr_slabs > 0); + + return; + } + + /* + * If a previous allocation failed before during this timer interval, + * do not try again now. + */ + if (mempool_defer_alloc) + return; + + /* + * Allocate the slab. Preallocate the memory, or we might crash later + * during low-memory conditions. If allocation fails, simply do + * nothing further. The caller will check the free lists. 
+ */ + mls = (struct mempool_large_slab *)mmap(NULL, + sizeof(struct mempool_large_slab), PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE | MAP_PREALLOC, -1, 0); + + if (mls == MAP_FAILED) { + if (mempool_nr_slabs == 0) + panic("unable to allocate initial memory pool"); + + /* + * Do not keep hammering VM with mmap requests when the system + * is out of memory. Try again after the next timer tick. + */ + mempool_defer_alloc = TRUE; + + return; + } + + /* Initialize the new slab. */ + mls->mls_header.mh_flags = 0; + mls->mls_header.mh_inuse = 0; + + mlb = mls->mls_buf; + + LIST_INIT(&mls->mls_free); + + for (count = 0; count < MEMPOOL_LARGE_COUNT; count++, mlb++) { + mlb->mlb_header = NULL; + mlb->mlb_header2 = &mls->mls_header; + + LIST_INSERT_HEAD(&mls->mls_free, mlb, mlb_next); + } + + LIST_INSERT_HEAD(&mempool_empty_slabs, mls, mls_next); + + mempool_nr_slabs++; + mempool_nr_large += MEMPOOL_LARGE_COUNT; +} + +/* + * Deallocate a slab for large buffers, if allowed. + */ +static void +mempool_destroy_slab(struct mempool_large_slab * mls) +{ + + assert(mempool_nr_slabs > 0); + + assert(!(mls->mls_header.mh_flags & MHF_SMALL)); + assert(mls->mls_header.mh_inuse == 0); + + /* Never deallocate the last large slab. */ + if (mempool_nr_slabs == 1) + return; + + LIST_REMOVE(mls, mls_next); + + if (munmap(mls, sizeof(*mls)) != 0) + panic("munmap failed: %d", -errno); + + assert(mempool_nr_large > MEMPOOL_LARGE_COUNT); + mempool_nr_large -= MEMPOOL_LARGE_COUNT; + mempool_nr_slabs--; +} + +/* + * Regular timer. Deallocate empty slabs already marked for deallocation, and + * mark any other empty slabs for deallocation. + */ +static void +mempool_tick(int arg __unused) +{ + struct mempool_large_slab *mls, *tmls; + + /* + * Go through all the empty slabs, destroying marked slabs and marking + * unmarked slabs. + */ + LIST_FOREACH_SAFE(mls, &mempool_empty_slabs, mls_next, tmls) { + if (mls->mls_header.mh_flags & MHF_MARKED) + mempool_destroy_slab(mls); + else + mls->mls_header.mh_flags |= MHF_MARKED; + } + + /* + * If allocation failed during the last interval, allow a new attempt + * during the next. + */ + mempool_defer_alloc = FALSE; + + /* Set the next timer. */ + set_timer(&mempool_timer, MEMPOOL_TIMER_TICKS, mempool_tick, 0); +} + +/* + * Initialize the memory pool module. + */ +void +mempool_init(void) +{ + unsigned int slot; + + /* These checks are for absolutely essential points. */ + assert(sizeof(void *) == MEM_ALIGNMENT); + assert(sizeof(struct mempool_small_slab) <= MEMPOOL_LARGE_SIZE); + assert(offsetof(struct mempool_small_buf, msb_data) == sizeof(void *)); + assert(offsetof(struct mempool_large_buf, mlb_data) == sizeof(void *)); + + /* Initialize module-local variables. */ + LIST_INIT(&mempool_empty_slabs); + LIST_INIT(&mempool_partial_slabs); + LIST_INIT(&mempool_full_slabs); + + TAILQ_INIT(&mempool_small_static_freelist); + TAILQ_INIT(&mempool_small_dynamic_freelist); + + mempool_max_slabs = MEMPOOL_DEFAULT_MAX_SLABS; + mempool_nr_slabs = 0; + + mempool_nr_large = 0; + mempool_used_large = 0; + mempool_used_small = 0; + + mempool_defer_alloc = FALSE; + + /* Initialize the static pool of small buffers. */ + for (slot = 0; slot < __arraycount(mempool_small_pool); slot++) + mempool_prepare_small(&mempool_small_pool[slot], + TRUE /*is_static*/); + + /* + * Allocate one large slab. The service needs at least one large slab + * for basic operation, and therefore will never deallocate the last. + */ + mempool_new_slab(); + + /* Set a regular low-frequency timer to deallocate unused slabs. 
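+ * With MEMPOOL_TIMER_TICKS at ten seconds' worth of clock ticks, an empty
+ * slab is first marked and only destroyed one tick later, so it survives
+ * for at least ten seconds after falling out of use.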
*/ + set_timer(&mempool_timer, MEMPOOL_TIMER_TICKS, mempool_tick, 0); + + /* Register the minix.lwip.mempool subtree. */ + mibtree_register_lwip(&minix_lwip_mempool_node); +} + +/* + * Return the total number of large buffers currently in the system, regardless + * of allocation status. + */ +unsigned int +mempool_cur_buffers(void) +{ + + return mempool_nr_large; +} + +/* + * Return the maximum number of large buffers that the system has been allowed + * to allocate. Note that due to low-memory conditions, this maximum may not + * be allocated in practice even when desired. + */ +unsigned int +mempool_max_buffers(void) +{ + + if (mempool_max_slabs <= 1) + return MEMPOOL_LARGE_COUNT; + + if ((size_t)mempool_max_slabs > + INT_MAX / sizeof(struct mempool_large_slab)) + return INT_MAX / sizeof(struct mempool_large_slab); + + return (size_t)mempool_max_slabs * MEMPOOL_LARGE_COUNT; +} + +/* + * Allocate a large buffer, either by taking one off a free list or by + * allocating a new large slab. On success, return a pointer to the data area + * of the large buffer. This data area is exactly MEMPOOL_LARGE_SIZE bytes in + * size. If no large buffer could be allocated, return NULL. + */ +static void * +mempool_alloc_large(void) +{ + struct mempool_large_slab *mls; + struct mempool_large_buf *mlb; + + /* + * Find a large slab that has free large blocks. As is standard for + * slab allocation, favor partially used slabs over empty slabs for + * eventual consolidation. If both lists are empty, try allocating a + * new slab. If that fails, we are out of memory, and return NULL. + */ + if (!LIST_EMPTY(&mempool_partial_slabs)) + mls = LIST_FIRST(&mempool_partial_slabs); + else { + if (LIST_EMPTY(&mempool_empty_slabs)) { + mempool_new_slab(); + + if (LIST_EMPTY(&mempool_empty_slabs)) + return NULL; /* out of memory */ + } + + mls = LIST_FIRST(&mempool_empty_slabs); + } + + /* Allocate a block from the slab that we picked. */ + assert(mls != NULL); + assert(!LIST_EMPTY(&mls->mls_free)); + + mlb = LIST_FIRST(&mls->mls_free); + LIST_REMOVE(mlb, mlb_next); + + assert(mlb->mlb_header == NULL); + assert(mlb->mlb_header2 == &mls->mls_header); + + mlb->mlb_header = &mls->mls_header; + + /* + * Adjust accounting for the large slab, which may involve moving it + * to another list. + */ + assert(mls->mls_header.mh_inuse < MEMPOOL_LARGE_COUNT); + mls->mls_header.mh_inuse++; + + if (mls->mls_header.mh_inuse == MEMPOOL_LARGE_COUNT) { + LIST_REMOVE(mls, mls_next); + + LIST_INSERT_HEAD(&mempool_full_slabs, mls, mls_next); + } else if (mls->mls_header.mh_inuse == 1) { + LIST_REMOVE(mls, mls_next); + + LIST_INSERT_HEAD(&mempool_partial_slabs, mls, mls_next); + } + + assert(mempool_used_large < mempool_nr_large); + mempool_used_large++; + + /* Return the block's data area. */ + return (void *)mlb->mlb_data; +} + +/* + * Allocate a small buffer, either by taking one off a free list or by + * allocating a large buffer and splitting it up in new free small buffers. On + * success, return a pointer to the data area of the small buffer. This data + * area is exactly MEMPOOL_SMALL_SIZE bytes in size. If no small buffer could + * be allocated, return NULL. + */ +static void * +mempool_alloc_small(void) +{ + struct mempool_small_slab *mss; + struct mempool_small_buf *msb; + struct mempool_header *mh; + + /* + * Find a free small block and take it off the free list. Try the + * static free list before the dynamic one, so that after a peak in + * buffer usage we are likely to be able to free up the dynamic slabs + * quickly. 
If both lists are empty, try allocating a large block to + * divvy up into small blocks. If that fails, we are out of memory. + */ + if (!TAILQ_EMPTY(&mempool_small_static_freelist)) { + msb = TAILQ_FIRST(&mempool_small_static_freelist); + + TAILQ_REMOVE(&mempool_small_static_freelist, msb, msb_next); + } else { + if (TAILQ_EMPTY(&mempool_small_dynamic_freelist)) { + mss = + (struct mempool_small_slab *)mempool_alloc_large(); + + if (mss == NULL) + return NULL; /* out of memory */ + + /* Initialize the small slab, including its blocks. */ + mempool_prepare_small(mss, FALSE /*is_static*/); + } + + msb = TAILQ_FIRST(&mempool_small_dynamic_freelist); + assert(msb != NULL); + + TAILQ_REMOVE(&mempool_small_dynamic_freelist, msb, msb_next); + } + + /* Mark the small block as allocated, and return its data area. */ + assert(msb != NULL); + + assert(msb->msb_header == NULL); + assert(msb->msb_header2 != NULL); + + mh = msb->msb_header2; + msb->msb_header = mh; + + assert(mh->mh_inuse < MEMPOOL_SMALL_COUNT); + mh->mh_inuse++; + + mempool_used_small++; + + return (void *)msb->msb_data; +} + +/* + * Memory pool wrapper function for malloc() calls from lwIP. + */ +void * +mempool_malloc(size_t size) +{ + + /* + * It is currently expected that there will be allocation attempts for + * sizes larger than our large size, in particular for ICMP ping + * replies as described elsewhere. As such, we cannot print any + * warnings here. For now, refusing these excessive allocations should + * not be a problem in practice. + */ + if (size > MEMPOOL_LARGE_SIZE) + return NULL; + + if (size <= MEMPOOL_SMALL_SIZE) + return mempool_alloc_small(); + else + return mempool_alloc_large(); +} + +/* + * Memory pool wrapper function for free() calls from lwIP. + */ +void +mempool_free(void * ptr) +{ + struct mempool_large_slab *mls; + struct mempool_large_buf *mlb; + struct mempool_small_slab *mss; + struct mempool_small_buf *msb; + struct mempool_header *mh; + unsigned int count; + + /* + * Get a pointer to the slab header, which is right before the data + * area for both large and small buffers. This pointer is NULL if the + * buffer is free, which would indicate that something is very wrong. + */ + ptr = (void *)((char *)ptr - sizeof(mh)); + + memcpy(&mh, ptr, sizeof(mh)); + + if (mh == NULL) + panic("mempool_free called on unallocated object!"); + + /* + * If the slab header says that the slab is for small buffers, deal + * with that case first. If we free up the last small buffer of a + * dynamically allocated small slab, we also free up the entire small + * slab, which is in fact the data area of a large buffer. + */ + if (mh->mh_flags & MHF_SMALL) { + /* + * Move the small buffer onto the appropriate small free list. + */ + msb = (struct mempool_small_buf *)ptr; + + msb->msb_header2 = mh; + msb->msb_header = NULL; + + /* + * Simple heuristic, unless the buffer is static: favor reuse + * of small buffers in containers that are already in use + * for other small buffers as well, for consolidation. 
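+ * (In the remaining case, where this was the last buffer in use in a
+ * dynamically allocated small slab, the entire slab is torn down again
+ * right below, so the tail insertion there is only transient.)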
+ */ + if (mh->mh_flags & MHF_STATIC) + TAILQ_INSERT_HEAD(&mempool_small_static_freelist, msb, + msb_next); + else if (mh->mh_inuse > 1) + TAILQ_INSERT_HEAD(&mempool_small_dynamic_freelist, msb, + msb_next); + else + TAILQ_INSERT_TAIL(&mempool_small_dynamic_freelist, msb, + msb_next); + + assert(mh->mh_inuse > 0); + mh->mh_inuse--; + + assert(mempool_used_small > 0); + mempool_used_small--; + + /* + * If the small buffer is statically allocated, or it was not + * the last allocated small buffer in its containing large + * buffer, then we are done. + */ + if (mh->mh_inuse > 0 || (mh->mh_flags & MHF_STATIC)) + return; + + /* + * Otherwise, free the containing large buffer as well. First, + * remove all its small buffers from the free list. + */ + mss = (struct mempool_small_slab *)mh; + msb = mss->mss_buf; + + for (count = 0; count < MEMPOOL_SMALL_COUNT; count++, msb++) { + assert(msb->msb_header == NULL); + assert(msb->msb_header2 == mh); + + TAILQ_REMOVE(&mempool_small_dynamic_freelist, msb, + msb_next); + } + + /* Then, fall through to the large-buffer free code. */ + ptr = (void *)((char *)mh - sizeof(mh)); + + memcpy(&mh, ptr, sizeof(mh)); + + assert(mh != NULL); + assert(!(mh->mh_flags & MHF_SMALL)); + } + + /* + * Move the large buffer onto the free list of the large slab to which + * it belongs. + */ + mls = (struct mempool_large_slab *)mh; + mlb = (struct mempool_large_buf *)ptr; + + mlb->mlb_header2 = &mls->mls_header; + mlb->mlb_header = NULL; + + LIST_INSERT_HEAD(&mls->mls_free, mlb, mlb_next); + + /* + * Adjust accounting for the large slab, which may involve moving it + * to another list. + */ + assert(mls->mls_header.mh_inuse > 0); + mls->mls_header.mh_inuse--; + + if (mls->mls_header.mh_inuse == 0) { + LIST_REMOVE(mls, mls_next); + + LIST_INSERT_HEAD(&mempool_empty_slabs, mls, mls_next); + + mls->mls_header.mh_flags &= ~MHF_MARKED; + } else if (mls->mls_header.mh_inuse == MEMPOOL_LARGE_COUNT - 1) { + LIST_REMOVE(mls, mls_next); + + LIST_INSERT_HEAD(&mempool_partial_slabs, mls, mls_next); + } + + assert(mempool_used_large > 0); + mempool_used_large--; +} + +/* + * Memory pool wrapper function for calloc() calls from lwIP. + */ +void * +mempool_calloc(size_t num, size_t size) +{ + void *ptr; + size_t total; + + /* + * Standard overflow check. This can be improved, but it doesn't have + * to be, because in practice lwIP never calls calloc() anyway. + */ + if (num > 0 && size > 0 && (size_t)-1 / size < num) + return NULL; + + total = num * size; + + if ((ptr = mempool_malloc(total)) == NULL) + return NULL; + + memset(ptr, 0, total); + + return ptr; +} diff --git a/minix/net/lwip/mibtree.c b/minix/net/lwip/mibtree.c new file mode 100644 index 000000000..847e7ab00 --- /dev/null +++ b/minix/net/lwip/mibtree.c @@ -0,0 +1,141 @@ +/* LWIP service - mibtree.c - sysctl support for */ +/* + * This file acts as a dispatcher for the net.inet, net.inet6, and minix.lwip + * sysctl trees. It does not cover the other net.* trees; these are taken care + * of in other source files. 
+ */ + +#include "lwip.h" + +#include + +#define MAX_PROTO 6 /* maximum # of INET protocols with subtrees */ + +static struct rmib_indir net_inet_indir[MAX_PROTO]; +static unsigned int net_inet_indir_count = 0; +static struct rmib_node net_inet_node = + RMIB_SNODE(RMIB_RO, net_inet_indir, "inet", "PF_INET related settings"); + +#ifdef INET6 +static struct rmib_indir net_inet6_indir[MAX_PROTO]; +static unsigned int net_inet6_indir_count = 0; +static struct rmib_node net_inet6_node = + RMIB_SNODE(RMIB_RO, net_inet6_indir, "inet6", "PF_INET6 related settings"); +#endif /* INET6 */ + +#define MAX_LWIP 4 /* maximum # of miscellaneous LWIP subtrees */ + +static struct rmib_indir minix_lwip_indir[MAX_LWIP]; +static unsigned int minix_lwip_indir_count = 0; +static struct rmib_node minix_lwip_node = + RMIB_SNODE(RMIB_RO, minix_lwip_indir, "lwip", + "LWIP service information and settings"); + +/* + * Initialize the status module by registering the net.inet, net.inet6, and + * minix.lwip trees with the MIB service. Other modules must have added all + * subtrees to those trees through mibtree_register_*() before this point. + */ +void +mibtree_init(void) +{ + const int inet_mib[] = { CTL_NET, PF_INET }; +#ifdef INET6 + const int inet6_mib[] = { CTL_NET, PF_INET6 }; +#endif /* INET6 */ + const int lwip_mib[] = { CTL_MINIX, MINIX_LWIP }; + int r; + + /* + * Register the "net.inet", "net.inet6", and "minix.lwip" subtrees with + * the MIB service. + * + * These calls only return local failures. Remote failures (in the MIB + * service) are silently ignored. So, we can safely panic on failure. + */ + if ((r = rmib_register(inet_mib, __arraycount(inet_mib), + &net_inet_node)) != OK) + panic("unable to register net.inet RMIB tree: %d", r); + +#ifdef INET6 + if ((r = rmib_register(inet6_mib, __arraycount(inet6_mib), + &net_inet6_node)) != OK) + panic("unable to register net.inet6 RMIB tree: %d", r); +#endif /* INET6 */ + + if ((r = rmib_register(lwip_mib, __arraycount(lwip_mib), + &minix_lwip_node)) != OK) + panic("unable to register minix.lwip RMIB tree: %d", r); +} + +/* + * Add a subtree to the local net.inet or net.inet6 tree. This function must + * only be called *before* mibtree_init(), as the latter will register the + * final tree with the MIB service. + */ +void +mibtree_register_inet(int domain, int protocol, struct rmib_node * node) +{ + struct rmib_node *parent; + struct rmib_indir *indir; + unsigned int i, *count; + + switch (domain) { + case PF_INET: + parent = &net_inet_node; + indir = net_inet_indir; + count = &net_inet_indir_count; + break; + case PF_INET6: +#ifdef INET6 + parent = &net_inet6_node; + indir = net_inet6_indir; + count = &net_inet6_indir_count; + break; +#else /* !INET6 */ + return; +#endif /* !INET6 */ + default: + panic("invalid domain %d", domain); + } + + assert(*count < MAX_PROTO); + + /* Insertion sort. */ + for (i = 0; i < *count; i++) { + assert(indir[i].rindir_id != (unsigned int)protocol); + + if (indir[i].rindir_id > (unsigned int)protocol) + break; + } + + if (i < *count) + memmove(&indir[i + 1], &indir[i], + sizeof(indir[0]) * (*count - i)); + + indir[i].rindir_id = protocol; + indir[i].rindir_node = node; + parent->rnode_size = ++*count; +} + +/* + * Add a miscellaneous subtree to the local minix.lwip tree. This function + * must only be called *before* mibtree_init(), as the latter will register the + * final tree with the MIB service. Note that the given subtrees are numbered + * arbitrarily. 
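To illustrate the registration order described above, a hedged sketch of a module adding its own subtrees before mibtree_init() runs; the node and function names below are made up, but the mempool module's registration of its minix.lwip.mempool node earlier in this patch follows the same pattern:

	/* Hypothetical module initialization; must run before mibtree_init(). */
	static struct rmib_node net_inet_example_node;	/* an RMIB_NODE, details omitted */
	static struct rmib_node minix_lwip_example_node;

	void
	example_init(void)
	{

		mibtree_register_inet(PF_INET, IPPROTO_UDP, &net_inet_example_node);
		mibtree_register_lwip(&minix_lwip_example_node);
	}
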
We use sparse trees here only to avoid having to declare + * external variables, which is a bit of a hack, but with the expected low + * number of miscellaneous subtrees there will be no performance penalty. + */ +void +mibtree_register_lwip(struct rmib_node * node) +{ + unsigned int i; + + i = minix_lwip_indir_count; + + assert(i < __arraycount(minix_lwip_indir)); + + minix_lwip_indir[i].rindir_id = i; + minix_lwip_indir[i].rindir_node = node; + minix_lwip_node.rnode_size = ++minix_lwip_indir_count; +} diff --git a/minix/net/lwip/ndev.c b/minix/net/lwip/ndev.c new file mode 100644 index 000000000..d806b5af7 --- /dev/null +++ b/minix/net/lwip/ndev.c @@ -0,0 +1,1019 @@ +/* LWIP service - ndev.c - network driver communication module */ +/* + * There is almost a one-to-one mapping between network device driver (ndev) + * objects and ethernet interface (ethif) objects, with as major difference + * that there may be an ndev object but not an ethif object for a driver that + * is known to exist but has not yet replied to our initialization request: + * without the information from the initialization request, there is no point + * creating an ethif object just yet, while we do need to track the driver + * process. TODO: it would be nice if unanswered init requests timed out and + * caused the removal of the ndev object after a while. + * + * Beyond that, this module aims to abstract away the low-level details of + * communication, memory grants, and driver restarts. Driver restarts are not + * fully transparent to the ethif module because it needs to reinitialize + * driver state only it knows about after a restart. Drivers that are in the + * process of restarting and therefore not operational are said to be disabled. + * + * From this module's point of view, a network driver is one of two states: + * initializing, where it has yet to respond to our initialization request, and + * active, where it is expected to accept and respond to all other requests. + * This module does not keep track of higher-level states and rules however; + * that is left to the ethif layer on one side, and the network driver itself + * on the other side. One important example is the interface being up or down: + * the ndev layer will happily forward send and receive requests when the + * interface is down, but these requests will be (resp.) dropped and rejected + * by the network driver in that state, and will not be generated by the ethif + * layer when the layer is down. Imposing barriers between configure and send + * requests is also left to the other parties. + * + * In this module, each active network driver has a send queue and a receive + * queue. The send queue is shared for packet send requests and configuration + * change requests. The receive queue is used for packet receive requests + * only. Each queue has a maximum depth, which is the minimum of a value + * provided by the network driver during initialization and local restrictions. + * These local restrictions are different for the two queue types: the receive + * queue is always bounded to a hardcoded value, while the send queue has a + * guaranteed minimum depth but may use up to the driver's maximum using spare + * entries. For both, a minimum depth is always available, since it is not + * possible to cancel individual send or receive requests after they have been + * sent to a particular driver. This does mean that we necessarily waste a + * large number of request structures in the common case. 
+ * + * The general API model does not support the notion of blocking calls. While + * it would make sense to retrieve e.g. error statistics from the driver only + * when requested by userland, implementing this without threads would be + * seriously complicated, because such requests can have many origins (ioctl, + * PF_ROUTE message, sysctl). Instead, we rely on drivers updating us with the + * latest information on everything at all times, so that we can hand over a + * cached copy of (e.g.) those error statistics right away. We provide a means + * for drivers to perform rate limiting of such status updates (to prevent + * overflowing asynsend queues), by replying to these status messages. That + * means that there is a request-response combo going in the opposite direction + * of the regular messages. + * + * TODO: in the future we will want to obtain the list of supported media modes + * (IFM_) from drivers, so that userland can view the list. Given the above + * model, the easiest way would be to obtain a copy of the full list, limited + * to a configured number of entries, at driver initialization time. This + * would require that the initialization request also involve a memory grant. + * + * If necessary, it would not be too much work to split off this module into + * its own libndev library. For now, there is no point in doing this and the + * tighter coupling allows us to optimize just a little but (see pbuf usage). + */ + +#include "lwip.h" +#include "ndev.h" +#include "ethif.h" + +#define LABEL_MAX 16 /* FIXME: this should be in a system header */ + +#define NDEV_SENDQ 2 /* minimum guaranteed send queue depth */ +#define NDEV_RECVQ 2 /* guaranteed receive queue depth */ +#define NREQ_SPARES 8 /* spare send queue (request) objects */ +#define NR_NREQ ((NDEV_SENDQ + NDEV_RECVQ) * NR_NDEV + NREQ_SPARES) + +static SIMPLEQ_HEAD(, ndev_req) nreq_freelist; + +static struct ndev_req { + SIMPLEQ_ENTRY(ndev_req) nreq_next; /* next request in queue */ + int nreq_type; /* type of request message */ + cp_grant_id_t nreq_grant[NDEV_IOV_MAX]; /* grants for request */ +} nreq_array[NR_NREQ]; + +static unsigned int nreq_spares; /* number of free spare objects */ + +struct ndev_queue { + uint32_t nq_head; /* ID of oldest pending request */ + uint8_t nq_count; /* current nr of pending requests */ + uint8_t nq_max; /* maximum nr of pending requests */ + SIMPLEQ_HEAD(, ndev_req) nq_req; /* queue of pending requests */ +}; + +static struct ndev { + endpoint_t ndev_endpt; /* driver endpoint */ + char ndev_label[LABEL_MAX]; /* driver label */ + struct ethif *ndev_ethif; /* ethif object, or NULL if init'ing */ + struct ndev_queue ndev_sendq; /* packet send and configure queue */ + struct ndev_queue ndev_recvq; /* packet receive queue */ +} ndev_array[NR_NDEV]; + +static ndev_id_t ndev_max; /* highest driver count ever seen */ + +/* + * This macro checks whether the network driver is active rather than + * initializing. See above for more information. + */ +#define NDEV_ACTIVE(ndev) ((ndev)->ndev_sendq.nq_max > 0) + +static int ndev_pending; /* number of initializing drivers */ + +/* The CTL_MINIX MINIX_LWIP "drivers" subtree. Dynamically numbered. 
*/ +static struct rmib_node minix_lwip_drivers_table[] = { + RMIB_INTPTR(RMIB_RO, &ndev_pending, "pending", + "Number of drivers currently initializing"), +}; + +static struct rmib_node minix_lwip_drivers_node = + RMIB_NODE(RMIB_RO, minix_lwip_drivers_table, "drivers", + "Network driver information"); + +/* + * Initialize the network driver communication module. + */ +void +ndev_init(void) +{ + unsigned int slot; + int r; + + /* Initialize local variables. */ + ndev_max = 0; + + SIMPLEQ_INIT(&nreq_freelist); + + for (slot = 0; slot < __arraycount(nreq_array); slot++) + SIMPLEQ_INSERT_TAIL(&nreq_freelist, &nreq_array[slot], + nreq_next); + + nreq_spares = NREQ_SPARES; + + /* + * Preallocate the total number of grants that we could possibly need + * concurrently. Even though it is extremely unlikely that we will + * ever need that many grants in practice, the alternative is runtime + * dynamic memory (re)allocation which is something we prefer to avoid + * altogether. At time of writing, we end up preallocating 320 grants + * using up a total of a bit under 9KB of memory. + */ + cpf_prealloc(NR_NREQ * NDEV_IOV_MAX); + + + /* + * Not needed, just for ultimate safety: start off all queues with + * wildly different request sequence numbers, to minimize the chance + * that any two replies will ever be confused. + */ + for (slot = 0; slot < __arraycount(ndev_array); slot++) { + ndev_array[slot].ndev_sendq.nq_head = slot << 21; + ndev_array[slot].ndev_recvq.nq_head = (slot * 2 + 1) << 20; + } + + /* Subscribe to Data Store (DS) events from network drivers. */ + if ((r = ds_subscribe("drv\\.net\\..*", + DSF_INITIAL | DSF_OVERWRITE)) != OK) + panic("unable to subscribe to driver events: %d", r); + + /* + * Keep track of how many drivers are in "pending" state, which means + * that they have not yet replied to our initialization request. + */ + ndev_pending = 0; + + /* Register the minix.lwip.drivers subtree. */ + mibtree_register_lwip(&minix_lwip_drivers_node); +} + +/* + * Initialize a queue for first use. + */ +static void +ndev_queue_init(struct ndev_queue * nq) +{ + + /* + * Only ever increase sequence numbers, to minimize the chance that + * two (e.g. from different driver instances) happen to be the same. + */ + nq->nq_head++; + + nq->nq_count = 0; + nq->nq_max = 0; + SIMPLEQ_INIT(&nq->nq_req); +} + +/* + * Advance the given request queue, freeing up the request at the head of the + * queue including any grants in use for it. + */ +static void +ndev_queue_advance(struct ndev_queue * nq) +{ + struct ndev_req * nreq; + cp_grant_id_t grant; + unsigned int i; + + nreq = SIMPLEQ_FIRST(&nq->nq_req); + + for (i = 0; i < __arraycount(nreq->nreq_grant); i++) { + grant = nreq->nreq_grant[i]; + + if (!GRANT_VALID(grant)) + break; + + /* TODO: make the safecopies code stop using errno. */ + if (cpf_revoke(grant) != 0) + panic("unable to revoke grant: %d", -errno); + } + + if (nreq->nreq_type != NDEV_RECV && nq->nq_count > NDEV_SENDQ) { + nreq_spares++; + + assert(nreq_spares <= NREQ_SPARES); + } + + SIMPLEQ_REMOVE_HEAD(&nq->nq_req, nreq_next); + + SIMPLEQ_INSERT_HEAD(&nreq_freelist, nreq, nreq_next); + + nq->nq_head++; + nq->nq_count--; +} + +/* + * Clear any outstanding requests from the given queue and reset it to a + * pre-initialization state. + */ +static void +ndev_queue_reset(struct ndev_queue * nq) +{ + + while (nq->nq_count > 0) { + assert(!SIMPLEQ_EMPTY(&nq->nq_req)); + + ndev_queue_advance(nq); + } + + nq->nq_max = 0; +} + +/* + * Obtain a request object for use in a new request. 
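As a quick check on the preallocation figure quoted a little earlier (320 grants), using the constants defined above and assuming NDEV_IOV_MAX is 8 (it is defined elsewhere and not shown in this hunk):

	NR_NREQ = (NDEV_SENDQ + NDEV_RECVQ) * NR_NDEV + NREQ_SPARES
	        = (2 + 2) * 8 + 8 = 40 request objects
	40 request objects * 8 grants each = 320 preallocated grants
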
Return the request + * object, with its request type field set to 'type', and with the request + * sequence ID returned in 'seq'. Return NULL if no request objects are + * available for the given request type. If the caller does send off the + * request, a call to ndev_queue_add() must follow immediately after. If the + * caller fails to send off the request for other reasons, it need not do + * anything: this function does not perform any actions that need to be undone. + */ +static struct ndev_req * +ndev_queue_get(struct ndev_queue * nq, int type, uint32_t * seq) +{ + struct ndev_req *nreq; + + /* Has the hard queue depth limit been reached? */ + if (nq->nq_count == nq->nq_max) + return NULL; + + /* + * For send requests, we may use request objects from a shared "spares" + * pool, if available. + */ + if (type != NDEV_RECV && nq->nq_count >= NDEV_SENDQ && + nreq_spares == 0) + return NULL; + + assert(!SIMPLEQ_EMPTY(&nreq_freelist)); + nreq = SIMPLEQ_FIRST(&nreq_freelist); + + nreq->nreq_type = type; + + *seq = nq->nq_head + nq->nq_count; + + return nreq; +} + +/* + * Add a successfully sent request to the given queue. The request must have + * been obtained using ndev_queue_get() directly before the call to this + * function. This function never fails. + */ +static void +ndev_queue_add(struct ndev_queue * nq, struct ndev_req * nreq) +{ + + if (nreq->nreq_type != NDEV_RECV && nq->nq_count >= NDEV_SENDQ) { + assert(nreq_spares > 0); + + nreq_spares--; + } + + SIMPLEQ_REMOVE_HEAD(&nreq_freelist, nreq_next); + + SIMPLEQ_INSERT_TAIL(&nq->nq_req, nreq, nreq_next); + + nq->nq_count++; +} + +/* + * Remove the head of the given request queue, but only if it matches the given + * request type and sequence ID. Return TRUE if the head was indeed removed, + * or FALSE if the head of the request queue (if any) did not match the given + * type and/or sequence ID. + */ +static int +ndev_queue_remove(struct ndev_queue * nq, int type, uint32_t seq) +{ + struct ndev_req *nreq; + + if (nq->nq_count < 1 || nq->nq_head != seq) + return FALSE; + + assert(!SIMPLEQ_EMPTY(&nq->nq_req)); + nreq = SIMPLEQ_FIRST(&nq->nq_req); + + if (nreq->nreq_type != type) + return FALSE; + + ndev_queue_advance(nq); + + return TRUE; +} + +/* + * Send an initialization request to a driver. If this is a new driver, the + * ethif module does not get to know about the driver until it answers to this + * request, as the ethif module needs much of what the reply contains. On the + * other hand, if this is a restarted driver, it will stay disabled until the + * init reply comes in. + */ +static void +ndev_send_init(struct ndev * ndev) +{ + message m; + int r; + + memset(&m, 0, sizeof(m)); + m.m_type = NDEV_INIT; + m.m_ndev_netdriver_init.id = ndev->ndev_sendq.nq_head; + + if ((r = asynsend3(ndev->ndev_endpt, &m, AMF_NOREPLY)) != OK) + panic("asynsend to driver failed: %d", r); +} + +/* + * A network device driver has been started or restarted. + */ +static void +ndev_up(const char * label, endpoint_t endpt) +{ + static int reported = FALSE; + struct ndev *ndev; + ndev_id_t slot; + + /* + * First see if we already had an entry for this driver. If so, it has + * been restarted, and we need to report it as not running to ethif. + */ + ndev = NULL; + + for (slot = 0; slot < ndev_max; slot++) { + if (ndev_array[slot].ndev_endpt == NONE) { + if (ndev == NULL) + ndev = &ndev_array[slot]; + + continue; + } + + if (!strcmp(ndev_array[slot].ndev_label, label)) { + /* Cancel any ongoing requests. 
*/ + ndev_queue_reset(&ndev_array[slot].ndev_sendq); + ndev_queue_reset(&ndev_array[slot].ndev_recvq); + + if (ndev_array[slot].ndev_ethif != NULL) { + ethif_disable(ndev_array[slot].ndev_ethif); + + ndev_pending++; + } + + ndev_array[slot].ndev_endpt = endpt; + + /* Attempt to resume communication. */ + ndev_send_init(&ndev_array[slot]); + + return; + } + } + + if (ndev == NULL) { + /* + * If there is no free slot for this driver in our table, we + * necessarily have to ignore the driver altogether. We report + * such cases once, so that the user can recompile if desired. + */ + if (ndev_max == __arraycount(ndev_array)) { + if (!reported) { + printf("LWIP: not enough ndev slots!\n"); + + reported = TRUE; + } + return; + } + + ndev = &ndev_array[ndev_max++]; + } + + /* Initialize the slot. */ + ndev->ndev_endpt = endpt; + strlcpy(ndev->ndev_label, label, sizeof(ndev->ndev_label)); + ndev->ndev_ethif = NULL; + ndev_queue_init(&ndev->ndev_sendq); + ndev_queue_init(&ndev->ndev_recvq); + + ndev_send_init(ndev); + + ndev_pending++; +} + +/* + * A network device driver has been terminated. + */ +static void +ndev_down(struct ndev * ndev) +{ + + /* Cancel any ongoing requests. */ + ndev_queue_reset(&ndev->ndev_sendq); + ndev_queue_reset(&ndev->ndev_recvq); + + /* + * If this ndev object had a corresponding ethif object, tell the ethif + * layer that the device is really gone now. + */ + if (ndev->ndev_ethif != NULL) + ethif_remove(ndev->ndev_ethif); + else + ndev_pending--; + + /* Remove the driver from our own administration. */ + ndev->ndev_endpt = NONE; + + while (ndev_max > 0 && ndev_array[ndev_max - 1].ndev_endpt == NONE) + ndev_max--; +} + +/* + * The DS service has notified us of changes to our subscriptions. That means + * that network drivers may have been started, restarted, and/or shut down. + * Find out what has changed, and act accordingly. + */ +void +ndev_check(void) +{ + static const char *prefix = "drv.net."; + char key[DS_MAX_KEYLEN], *label; + size_t prefixlen; + endpoint_t endpt; + uint32_t val; + ndev_id_t slot; + int r; + + prefixlen = strlen(prefix); + + /* Check whether any drivers have been (re)started. */ + while ((r = ds_check(key, NULL, &endpt)) == OK) { + if (strncmp(key, prefix, prefixlen) != 0 || endpt == NONE) + continue; + + if (ds_retrieve_u32(key, &val) != OK || val != DS_DRIVER_UP) + continue; + + label = &key[prefixlen]; + if (label[0] == '\0' || memchr(label, '\0', LABEL_MAX) == NULL) + continue; + + ndev_up(label, endpt); + } + + if (r != ENOENT) + printf("LWIP: DS check failed (%d)\n", r); + + /* + * Check whether the drivers we currently know about are still up. The + * ones that are not are really gone. It is no problem that we recheck + * any drivers that have just been reported by ds_check() above. + * However, we cannot check the same key: while the driver is being + * restarted, its driver status is already gone from DS. Instead, see + * if there is still an entry for its label, as that entry remains in + * existence during the restart. The associated endpoint may still + * change however, so do not check that part: in such cases we will get + * a driver-up announcement later anyway. + */ + for (slot = 0; slot < ndev_max; slot++) { + if (ndev_array[slot].ndev_endpt == NONE) + continue; + + if (ds_retrieve_label_endpt(ndev_array[slot].ndev_label, + &endpt) != OK) + ndev_down(&ndev_array[slot]); + } +} + +/* + * A network device driver has sent a reply to our initialization request. 
+ */ +static void +ndev_init_reply(struct ndev * ndev, const message * m_ptr) +{ + struct ndev_hwaddr hwaddr; + uint8_t hwaddr_len, max_send, max_recv; + const char *name; + int enabled; + + /* + * Make sure that we were waiting for a reply to an initialization + * request, and that this is the reply to that request. + */ + if (NDEV_ACTIVE(ndev) || + m_ptr->m_netdriver_ndev_init_reply.id != ndev->ndev_sendq.nq_head) + return; + + /* + * Do just enough sanity checking on the data to pass it up to the + * ethif layer, which will check the rest (e.g., name duplicates). + */ + if (memchr(m_ptr->m_netdriver_ndev_init_reply.name, '\0', + sizeof(m_ptr->m_netdriver_ndev_init_reply.name)) == NULL || + m_ptr->m_netdriver_ndev_init_reply.name[0] == '\0') { + printf("LWIP: driver %d provided invalid name\n", + m_ptr->m_source); + + ndev_down(ndev); + + return; + } + + hwaddr_len = m_ptr->m_netdriver_ndev_init_reply.hwaddr_len; + if (hwaddr_len < 1 || hwaddr_len > __arraycount(hwaddr.nhwa_addr)) { + printf("LWIP: driver %d provided invalid HW-addr length\n", + m_ptr->m_source); + + ndev_down(ndev); + + return; + } + + if ((max_send = m_ptr->m_netdriver_ndev_init_reply.max_send) < 1 || + (max_recv = m_ptr->m_netdriver_ndev_init_reply.max_recv) < 1) { + printf("LWIP: driver %d provided invalid queue maximum\n", + m_ptr->m_source); + + ndev_down(ndev); + + return; + } + + /* + * If the driver is new, allocate a new ethif object for it. On + * success, or if the driver was restarted, (re)enable the interface. + * Both calls may fail, in which case we should forget about the + * driver. It may continue to send us messages, which we should then + * discard. + */ + name = m_ptr->m_netdriver_ndev_init_reply.name; + + if (ndev->ndev_ethif == NULL) { + ndev->ndev_ethif = ethif_add((ndev_id_t)(ndev - ndev_array), + name, m_ptr->m_netdriver_ndev_init_reply.caps); + name = NULL; + } + + if (ndev->ndev_ethif != NULL) { + /* + * Set the maximum numbers of pending requests (for each + * direction) first, because enabling the interface may cause + * the ethif layer to start sending requests immediately. + */ + ndev->ndev_sendq.nq_max = max_send; + ndev->ndev_sendq.nq_head++; + + /* + * Limit the maximum number of concurrently pending receive + * requests to our configured maximum. For send requests, we + * use a more dynamic approach with spare request objects. + */ + if (max_recv > NDEV_RECVQ) + max_recv = NDEV_RECVQ; + ndev->ndev_recvq.nq_max = max_recv; + ndev->ndev_recvq.nq_head++; + + memset(&hwaddr, 0, sizeof(hwaddr)); + memcpy(hwaddr.nhwa_addr, + m_ptr->m_netdriver_ndev_init_reply.hwaddr, hwaddr_len); + + /* + * Provide a NULL pointer for the name if we have only just + * added the interface at all. The callee may use this to + * determine whether the driver is new or has been restarted. + */ + enabled = ethif_enable(ndev->ndev_ethif, name, &hwaddr, + m_ptr->m_netdriver_ndev_init_reply.hwaddr_len, + m_ptr->m_netdriver_ndev_init_reply.caps, + m_ptr->m_netdriver_ndev_init_reply.link, + m_ptr->m_netdriver_ndev_init_reply.media); + } else + enabled = FALSE; + + /* + * If we did not manage to enable the interface, remove it again, + * possibly also from the ethif layer. + */ + if (!enabled) + ndev_down(ndev); + else + ndev_pending--; +} + +/* + * Request that a network device driver change its configuration. 
This + * function allows for configuration of various different driver and device + * aspects: the I/O mode (and multicast receipt list), the enabled (sub)set of + * capabilities, the driver-specific flags, and the hardware address. Each of + * these settings may be changed by setting the corresponding NDEV_SET_ flag in + * the 'set' field of the given configuration structure. It is explicitly + * allowed to generate a request with no NDEV_SET_ flags; such a request will + * be sent to the driver and ultimately generate a response. Return OK if the + * configuration request was sent to the driver, EBUSY if no (more) requests + * can be sent to the driver right now, or ENOMEM on grant allocation failure. + */ +int +ndev_conf(ndev_id_t id, const struct ndev_conf * nconf) +{ + struct ndev *ndev; + struct ndev_req *nreq; + uint32_t seq; + message m; + cp_grant_id_t grant; + int r; + + assert(id < __arraycount(ndev_array)); + ndev = &ndev_array[id]; + + assert(ndev->ndev_endpt != NONE); + assert(NDEV_ACTIVE(ndev)); + + if ((nreq = ndev_queue_get(&ndev->ndev_sendq, NDEV_CONF, + &seq)) == NULL) + return EBUSY; + + memset(&m, 0, sizeof(m)); + m.m_type = NDEV_CONF; + m.m_ndev_netdriver_conf.id = seq; + m.m_ndev_netdriver_conf.set = nconf->nconf_set; + + grant = GRANT_INVALID; + + if (nconf->nconf_set & NDEV_SET_MODE) { + m.m_ndev_netdriver_conf.mode = nconf->nconf_mode; + + if (nconf->nconf_mode & NDEV_MODE_MCAST_LIST) { + assert(nconf->nconf_mclist != NULL); + assert(nconf->nconf_mccount != 0); + + grant = cpf_grant_direct(ndev->ndev_endpt, + (vir_bytes)nconf->nconf_mclist, + sizeof(nconf->nconf_mclist[0]) * + nconf->nconf_mccount, CPF_READ); + + if (!GRANT_VALID(grant)) + return ENOMEM; + + m.m_ndev_netdriver_conf.mcast_count = + nconf->nconf_mccount; + } + } + + m.m_ndev_netdriver_conf.mcast_grant = grant; + + if (nconf->nconf_set & NDEV_SET_CAPS) + m.m_ndev_netdriver_conf.caps = nconf->nconf_caps; + + if (nconf->nconf_set & NDEV_SET_FLAGS) + m.m_ndev_netdriver_conf.flags = nconf->nconf_flags; + + if (nconf->nconf_set & NDEV_SET_MEDIA) + m.m_ndev_netdriver_conf.media = nconf->nconf_media; + + if (nconf->nconf_set & NDEV_SET_HWADDR) + memcpy(m.m_ndev_netdriver_conf.hwaddr, + nconf->nconf_hwaddr.nhwa_addr, + __arraycount(m.m_ndev_netdriver_conf.hwaddr)); + + if ((r = asynsend3(ndev->ndev_endpt, &m, AMF_NOREPLY)) != OK) + panic("asynsend to driver failed: %d", r); + + nreq->nreq_grant[0] = grant; /* may also be invalid */ + nreq->nreq_grant[1] = GRANT_INVALID; + + ndev_queue_add(&ndev->ndev_sendq, nreq); + + return OK; +} + +/* + * The network device driver has sent a reply to a configuration request. + */ +static void +ndev_conf_reply(struct ndev * ndev, const message * m_ptr) +{ + + /* + * Was this the request we were waiting for? If so, remove it from the + * send queue. Otherwise, ignore this reply message. + */ + if (!NDEV_ACTIVE(ndev) || !ndev_queue_remove(&ndev->ndev_sendq, + NDEV_CONF, m_ptr->m_netdriver_ndev_reply.id)) + return; + + /* Tell the ethif layer about the updated configuration. */ + assert(ndev->ndev_ethif != NULL); + + ethif_configured(ndev->ndev_ethif, + m_ptr->m_netdriver_ndev_reply.result); +} + +/* + * Construct a packet send or receive request and send it off to a network + * driver. The given pbuf chain may be part of a queue. Return OK if the + * request was successfully sent, or ENOMEM on grant allocation failure. 
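For illustration, a hedged sketch of how a caller, in practice the ethif layer (whose code is not part of this hunk), might use ndev_conf() above to install a multicast receive list; the helper name is made up, while the NDEV_SET_ and NDEV_MODE_ constants are the ones referenced by ndev_conf() itself:

	static int
	example_set_mcast_list(ndev_id_t id, struct ndev_hwaddr * list, size_t count)
	{
		struct ndev_conf nconf;

		memset(&nconf, 0, sizeof(nconf));
		nconf.nconf_set = NDEV_SET_MODE;
		nconf.nconf_mode = NDEV_MODE_MCAST_LIST;	/* plus any other mode bits */
		nconf.nconf_mclist = list;
		nconf.nconf_mccount = count;

		/* OK if sent, EBUSY if the send queue is full, ENOMEM on grant failure. */
		return ndev_conf(id, &nconf);
	}
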
+ */ +static int +ndev_transfer(struct ndev * ndev, const struct pbuf * pbuf, int do_send, + uint32_t seq, struct ndev_req * nreq) +{ + cp_grant_id_t grant; + message m; + unsigned int i; + size_t left; + int r; + + memset(&m, 0, sizeof(m)); + m.m_type = (do_send) ? NDEV_SEND : NDEV_RECV; + m.m_ndev_netdriver_transfer.id = seq; + + left = pbuf->tot_len; + + for (i = 0; left > 0; i++) { + assert(i < NDEV_IOV_MAX); + + grant = cpf_grant_direct(ndev->ndev_endpt, + (vir_bytes)pbuf->payload, pbuf->len, + (do_send) ? CPF_READ : CPF_WRITE); + + if (!GRANT_VALID(grant)) { + while (i-- > 0) + (void)cpf_revoke(nreq->nreq_grant[i]); + + return ENOMEM; + } + + m.m_ndev_netdriver_transfer.grant[i] = grant; + m.m_ndev_netdriver_transfer.len[i] = pbuf->len; + + nreq->nreq_grant[i] = grant; + + assert(left >= pbuf->len); + left -= pbuf->len; + pbuf = pbuf->next; + } + + m.m_ndev_netdriver_transfer.count = i; + + /* + * Unless the array is full, an invalid grant marks the end of the list + * of invalid grants. + */ + if (i < __arraycount(nreq->nreq_grant)) + nreq->nreq_grant[i] = GRANT_INVALID; + + if ((r = asynsend3(ndev->ndev_endpt, &m, AMF_NOREPLY)) != OK) + panic("asynsend to driver failed: %d", r); + + return OK; +} + +/* + * Send a packet to the given network driver. Return OK if the packet is sent + * off to the driver, EBUSY if no (more) packets can be sent to the driver at + * this time, or ENOMEM on grant allocation failure. + * + * The use of 'pbuf' in this interface is a bit ugly, but it saves us from + * having to go through an intermediate representation (e.g. an iovec array) + * for the data being sent. The same applies to ndev_receive(). + */ +int +ndev_send(ndev_id_t id, const struct pbuf * pbuf) +{ + struct ndev *ndev; + struct ndev_req *nreq; + uint32_t seq; + int r; + + assert(id < __arraycount(ndev_array)); + ndev = &ndev_array[id]; + + assert(ndev->ndev_endpt != NONE); + assert(NDEV_ACTIVE(ndev)); + + if ((nreq = ndev_queue_get(&ndev->ndev_sendq, NDEV_SEND, + &seq)) == NULL) + return EBUSY; + + if ((r = ndev_transfer(ndev, pbuf, TRUE /*do_send*/, seq, nreq)) != OK) + return r; + + ndev_queue_add(&ndev->ndev_sendq, nreq); + + return OK; +} + +/* + * The network device driver has sent a reply to a send request. + */ +static void +ndev_send_reply(struct ndev * ndev, const message * m_ptr) +{ + + /* + * Was this the request we were waiting for? If so, remove it from the + * send queue. Otherwise, ignore this reply message. + */ + if (!NDEV_ACTIVE(ndev) || !ndev_queue_remove(&ndev->ndev_sendq, + NDEV_SEND, m_ptr->m_netdriver_ndev_reply.id)) + return; + + /* Tell the ethif layer about the result of the transmission. */ + assert(ndev->ndev_ethif != NULL); + + ethif_sent(ndev->ndev_ethif, + m_ptr->m_netdriver_ndev_reply.result); +} + +/* + * Return TRUE if a new receive request can be spawned for a particular network + * driver, or FALSE if its queue of receive requests is full. This call exists + * merely to avoid needless buffer allocatin in the case that ndev_recv() is + * going to return EBUSY anyway. + */ +int +ndev_can_recv(ndev_id_t id) +{ + struct ndev *ndev; + + assert(id < __arraycount(ndev_array)); + ndev = &ndev_array[id]; + + assert(ndev->ndev_endpt != NONE); + assert(NDEV_ACTIVE(ndev)); + + return (ndev->ndev_recvq.nq_count < ndev->ndev_recvq.nq_max); +} + +/* + * Start the process of receiving a packet from a network driver. The packet + * will be stored in the given pbuf chain upon completion. 
Return OK if the + * receive request is sent to the driver, EBUSY if the maximum number of + * concurrent receive requests has been reached for this driver, or ENOMEM on + * grant allocation failure. + */ +int +ndev_recv(ndev_id_t id, struct pbuf * pbuf) +{ + struct ndev *ndev; + struct ndev_req *nreq; + uint32_t seq; + int r; + + assert(id < __arraycount(ndev_array)); + ndev = &ndev_array[id]; + + assert(ndev->ndev_endpt != NONE); + assert(NDEV_ACTIVE(ndev)); + + if ((nreq = ndev_queue_get(&ndev->ndev_recvq, NDEV_RECV, + &seq)) == NULL) + return EBUSY; + + if ((r = ndev_transfer(ndev, pbuf, FALSE /*do_send*/, seq, + nreq)) != OK) + return r; + + ndev_queue_add(&ndev->ndev_recvq, nreq); + + return OK; +} + +/* + * The network device driver has sent a reply to a receive request. + */ +static void +ndev_recv_reply(struct ndev * ndev, const message * m_ptr) +{ + + /* + * Was this the request we were waiting for? If so, remove it from the + * receive queue. Otherwise, ignore this reply message. + */ + if (!NDEV_ACTIVE(ndev) || !ndev_queue_remove(&ndev->ndev_recvq, + NDEV_RECV, m_ptr->m_netdriver_ndev_reply.id)) + return; + + /* Tell the ethif layer about the result of the receipt. */ + assert(ndev->ndev_ethif != NULL); + + ethif_received(ndev->ndev_ethif, + m_ptr->m_netdriver_ndev_reply.result); +} + +/* + * A network device driver sent a status report to us. Process it and send a + * reply. + */ +static void +ndev_status(struct ndev * ndev, const message * m_ptr) +{ + message m; + int r; + + if (!NDEV_ACTIVE(ndev)) + return; + + /* Tell the ethif layer about the status update. */ + assert(ndev->ndev_ethif != NULL); + + ethif_status(ndev->ndev_ethif, m_ptr->m_netdriver_ndev_status.link, + m_ptr->m_netdriver_ndev_status.media, + m_ptr->m_netdriver_ndev_status.oerror, + m_ptr->m_netdriver_ndev_status.coll, + m_ptr->m_netdriver_ndev_status.ierror, + m_ptr->m_netdriver_ndev_status.iqdrop); + + /* + * Send a reply, so that the driver knows it can send a new status + * update without risking asynsend queue overflows. The ID of these + * messages is chosen by the driver and and we simply echo it. + */ + memset(&m, 0, sizeof(m)); + m.m_type = NDEV_STATUS_REPLY; + m.m_ndev_netdriver_status_reply.id = m_ptr->m_netdriver_ndev_status.id; + + if ((r = asynsend(m_ptr->m_source, &m)) != OK) + panic("asynsend to driver failed: %d", r); +} + +/* + * Process a network driver reply message. + */ +void +ndev_process(const message * m_ptr, int ipc_status) +{ + struct ndev *ndev; + endpoint_t endpt; + ndev_id_t slot; + + /* Find the slot of the driver that sent the message, if any. */ + endpt = m_ptr->m_source; + + for (slot = 0, ndev = ndev_array; slot < ndev_max; slot++, ndev++) + if (ndev->ndev_endpt == endpt) + break; + + /* + * If we cannot find a slot for the driver, drop the message. We may + * be ignoring the driver because it misbehaved or we are out of slots. + */ + if (slot == ndev_max) + return; + + /* + * Process the reply message. For future compatibility, ignore any + * unrecognized message types. 
+ */ + switch (m_ptr->m_type) { + case NDEV_INIT_REPLY: + ndev_init_reply(ndev, m_ptr); + + break; + + case NDEV_CONF_REPLY: + ndev_conf_reply(ndev, m_ptr); + + break; + + case NDEV_SEND_REPLY: + ndev_send_reply(ndev, m_ptr); + + break; + + case NDEV_RECV_REPLY: + ndev_recv_reply(ndev, m_ptr); + + break; + + case NDEV_STATUS: + ndev_status(ndev, m_ptr); + + break; + } +} diff --git a/minix/net/lwip/ndev.h b/minix/net/lwip/ndev.h new file mode 100644 index 000000000..8b70cd7e6 --- /dev/null +++ b/minix/net/lwip/ndev.h @@ -0,0 +1,33 @@ +#ifndef MINIX_NET_LWIP_NDEV_H +#define MINIX_NET_LWIP_NDEV_H + +/* The maximum supported number of network device drivers. */ +#define NR_NDEV 8 + +typedef uint32_t ndev_id_t; + +struct ndev_hwaddr { + uint8_t nhwa_addr[NDEV_HWADDR_MAX]; +}; + +struct ndev_conf { + uint32_t nconf_set; /* fields to set (NDEV_SET_) */ + uint32_t nconf_mode; /* desired mode (NDEV_MODE_) */ + struct ndev_hwaddr *nconf_mclist; /* multicast list pointer */ + size_t nconf_mccount; /* multicast list count */ + uint32_t nconf_caps; /* capabilities (NDEV_CAP_) */ + uint32_t nconf_flags; /* flags to set (NDEV_FLAG_) */ + uint32_t nconf_media; /* media selection (IFM_) */ + struct ndev_hwaddr nconf_hwaddr; /* desired hardware address */ +}; + +void ndev_init(void); +void ndev_check(void); +void ndev_process(const message * m_ptr, int ipc_status); + +int ndev_conf(ndev_id_t id, const struct ndev_conf * nconf); +int ndev_send(ndev_id_t id, const struct pbuf * pbuf); +int ndev_can_recv(ndev_id_t id); +int ndev_recv(ndev_id_t id, struct pbuf * pbuf); + +#endif /* !MINIX_NET_LWIP_NDEV_H */ diff --git a/minix/net/lwip/pchain.c b/minix/net/lwip/pchain.c new file mode 100644 index 000000000..0e8058aa1 --- /dev/null +++ b/minix/net/lwip/pchain.c @@ -0,0 +1,154 @@ +/* LWIP service - pchain.c - pbuf chain utility functions */ + +#include "lwip.h" + +/* + * Allocate a chain of pbuf buffers as though it were a PBUF_POOL allocation, + * except that each buffer is of type PBUF_RAM. Return the pbuf chain on + * success, or NULL on memory allocation failure. + */ +struct pbuf * +pchain_alloc(int layer, size_t size) +{ + struct pbuf *pbuf, *phead, **pnext; + size_t chunk, left; + int offset = 0; + + /* + * Check for length overflow. Note that we do this before prepending + * the header, because otherwise we could never send a full-sized + * (65535-byte) IP packet. This does mean that we are generating a + * pbuf chain that has over 64KB worth of allocated space, but our + * header hiding ensures that tot_len stays under 64KB. A check in + * pbuf_header() prevents that later header adjustments end up lifting + * tot_len over this limit. + */ + if (size > UINT16_MAX) + return NULL; + + /* + * Unfortunately, we have no choice but to replicate this block from + * lwIP's pbuf_alloc() code. It is however unlikely that the offsets + * change for the currently supported layer types, and we do not need + * to support any layer types that we do not use ourselves. 
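To make the layer offsets concrete, a worked example assuming lwIP's default header size settings (no extra link encapsulation, a 14-byte Ethernet header, a 40-byte IP header reservation when IPv6 is compiled in, and a 20-byte transport header); the actual values depend on the lwIP configuration used by this service:

	PBUF_TRANSPORT:	0 + 14 + 40 + 20 = 74 bytes of reserved header space
	PBUF_IP:	0 + 14 + 40      = 54 bytes
	PBUF_LINK:	0 + 14           = 14 bytes
	PBUF_RAW_TX:	0 bytes
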
+ */ + switch (layer) { + case PBUF_TRANSPORT: + offset = PBUF_LINK_ENCAPSULATION_HLEN + PBUF_LINK_HLEN + + PBUF_IP_HLEN + PBUF_TRANSPORT_HLEN; + break; + case PBUF_IP: + offset = PBUF_LINK_ENCAPSULATION_HLEN + PBUF_LINK_HLEN + + PBUF_IP_HLEN; + break; + case PBUF_LINK: + offset = PBUF_LINK_ENCAPSULATION_HLEN + PBUF_LINK_HLEN; + break; + case PBUF_RAW_TX: + offset = PBUF_LINK_ENCAPSULATION_HLEN; + break; + case PBUF_RAW: + offset = 0; + break; + default: + panic("invalid pbuf layer: %d", layer); + } + + chunk = size + offset; + if (chunk > MEMPOOL_BUFSIZE) + chunk = MEMPOOL_BUFSIZE; + + if ((phead = pbuf_alloc(PBUF_RAW, chunk, PBUF_RAM)) == NULL) + return NULL; + + if (offset > 0) + util_pbuf_header(phead, -offset); + + phead->tot_len = size; + + pnext = &phead->next; + + for (left = size - (chunk - offset); left > 0; left -= chunk) { + chunk = (left < MEMPOOL_BUFSIZE) ? left : MEMPOOL_BUFSIZE; + + if ((pbuf = pbuf_alloc(PBUF_RAW, chunk, PBUF_RAM)) == NULL) { + /* + * Adjust tot_len to match the actual length of the + * chain so far, just in case pbuf_free() starts caring + * about this in the future. + */ + for (pbuf = phead; pbuf != NULL; pbuf = pbuf->next) + pbuf->tot_len -= left; + + pbuf_free(phead); + + return NULL; + } + + pbuf->tot_len = left; + + *pnext = pbuf; + pnext = &pbuf->next; + } + + return phead; +} + +/* + * Given the (non-empty) chain of buffers 'pbuf', return a pointer to the + * 'next' field of the last buffer in the chain. This function is packet queue + * friendly. A packet queue is a queue of packet chains, where each chain is + * delimited using the 'tot_len' field. As a result, while the pointer + * returned is never NULL, the value pointed to by the returned pointer may or + * may not be NULL (and will point to the next chain if not NULL). As notable + * exception, in cases where the buffer type is a single PBUF_REF, 'tot_len' + * may be zero and 'len' may be non-zero. In such cases, the chain consists of + * that single buffer only. This function must handle that case as well. + */ +struct pbuf ** +pchain_end(struct pbuf * pbuf) +{ + + assert(pbuf != NULL); + + while (pbuf->tot_len > pbuf->len) { + pbuf = pbuf->next; + + assert(pbuf != NULL); + } + + return &pbuf->next; +} + +/* + * Given the (non-empty) chain of buffers 'pbuf', return a byte size estimation + * of the memory used by the chain, rounded up to pool buffer sizes. This + * function is packet queue friendly. + */ +size_t +pchain_size(struct pbuf * pbuf) +{ + size_t size; + + assert(pbuf != NULL); + + /* + * Count the first buffer separately, as its length may be seriously + * off due to header hiding. While the caller should always provide + * exactly the same pbuf chain twice if it intends to get back the same + * size twice, this also protects against accidental size differences + * due to header hiding in that case. + */ + size = MEMPOOL_BUFSIZE; + + /* + * Round up the size of the rest of the chain to whole buffers. 
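A short worked example of the rounding performed below, assuming for illustration that MEMPOOL_BUFSIZE is 512 (its actual value is defined in the memory pool header, not in this hunk). For a chain whose first buffer holds 40 bytes after header hiding and whose tot_len is 1400:

	size  = 512				(first buffer, counted as a whole pool buffer)
	size += (1400 - 40) + 512 - 1		(size is now 2383)
	size -= 2383 % 512			(size is now 2048: the first buffer plus three more)

so pchain_size() would report 2048 bytes for this chain.
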
+ */ + if (pbuf->tot_len > pbuf->len) { + size += pbuf->tot_len - pbuf->len + MEMPOOL_BUFSIZE - 1; + + size -= size % MEMPOOL_BUFSIZE; + } + + return size; +} diff --git a/minix/net/lwip/pktsock.c b/minix/net/lwip/pktsock.c new file mode 100644 index 000000000..5ddb9b55a --- /dev/null +++ b/minix/net/lwip/pktsock.c @@ -0,0 +1,1236 @@ +/* LWIP service - pktsock.c - packet code shared between UDP and RAW */ + +#include "lwip.h" +#include "pktsock.h" +#include "ifaddr.h" + +/* + * This buffer should be much bigger (at least 10KB, according to RFC 3542), + * but we do not support the ancillary options that take so much space anyway. + */ +#define PKTSOCK_CTLBUF_SIZE 256 + +static char pktsock_ctlbuf[PKTSOCK_CTLBUF_SIZE]; + +/* + * Header structures with ancillary data for received packets. The reason that + * we do not simply use a generic pkthdr structure with ip_addr_t source and + * destination addresses, is that for UDP packets, we put this structure in + * place of the received (ethernet and IP headers), and such a full structure + * (including IPv6-size addresses) would not fit in the header space for IPv4 + * packets. So instead we use two address structures, one for IPv4 and one for + * IPv6, and a generic header structure on top of it, which also identifies + * which address structure is underneath. The combination of the address + * structure and the header structure must fit in the IP header. The IPv6 + * packet header is already so close to the limit here that we have to use + * packed addresses. For IPv4 we use the regular addresses for simplicity. + */ +struct pkthdr { + uint16_t port; /* source port number (UDP only) */ + uint8_t dstif; /* interface that received the pkt */ + uint8_t addrif; /* interface that accepted the pkt */ + uint8_t tos; /* TOS/TC value from the IP header */ + uint8_t ttl; /* TTL/HL value from the IP header */ + uint8_t flags; /* packet flags (PKTHF_) */ + uint8_t _unused; /* all that is still available.. */ +}; + +#define PKTHF_IPV6 0x01 /* packet has IPv6 header */ +#define PKTHF_MCAST 0x02 /* packet has multicast destination */ +#define PKTHF_BCAST 0x04 /* packet has broadcast destination */ + +struct pktaddr4 { + ip4_addr_t srcaddr; + ip4_addr_t dstaddr; +}; + +struct pktaddr6 { + ip6_addr_p_t srcaddr; + ip6_addr_p_t dstaddr; +}; + +/* + * Create a packet socket. Relay parameters and return values to and from the + * IP module's socket creation function. This function must not allocate any + * resources in any form, as socket creation may still fail later, in which + * case no destruction function is called. + */ +int +pktsock_socket(struct pktsock * pkt, int domain, size_t sndbuf, size_t rcvbuf, + struct sock ** sockp) +{ + + pkt->pkt_rcvhead = NULL; + pkt->pkt_rcvtailp = &pkt->pkt_rcvhead; + pkt->pkt_rcvlen = 0; + + mcast_reset(&pkt->pkt_mcast); + + memset(&pkt->pkt_srcaddr, 0, sizeof(pkt->pkt_srcaddr)); + pkt->pkt_ifindex = 0; + + /* + * Any PKTF_ type flags should be initialized on the socket only after + * the following call, as this call will clear the flags field. For + * now, no PKTF_ flags need to be set by default, though. + */ + return ipsock_socket(&pkt->pkt_ipsock, domain, sndbuf, rcvbuf, sockp); +} + +/* + * Return TRUE if the given packet can and should be received on the given + * socket, or FALSE if there is a reason not to receive the packet. 
+ */ +static int +pktsock_may_recv(struct pktsock * pkt, struct pbuf * pbuf) +{ + + /* + * By policy, multicast packets should not be received on sockets of + * which the owning application is not multicast aware. + */ + if (ip_addr_ismulticast(ip_current_dest_addr()) && + !(ipsock_get_flag(&pkt->pkt_ipsock, PKTF_MCAWARE))) + return FALSE; + + /* + * Due to fragment reassembly, we might end up with packets that take + * up more buffer space than their byte size, even after rounding up + * the latter. The user probably does not want packets to get dropped + * for that reason, e.g. when they set a 64K limit and the packet ends + * up being estimated as 65K and dropped. So, we test against + * 'pbuf->tot_len' rather than the rounded-up packet size. However, + * 'pkt->pkt_rcvlen' itself is increased by the rounded-up packet size + * when enqueuing the packet, so that we still count the memory + * consumption (generally) conservatively, which is what we want. + */ + return (pkt->pkt_rcvlen + pbuf->tot_len <= + ipsock_get_rcvbuf(&pkt->pkt_ipsock)); +} + +/* + * Check whether the given packet can and should be received on the given + * socket. If so, return the amount of space for ancillary information that + * will be necessary for the packet. If not, return a negative value. + */ +int +pktsock_test_input(struct pktsock * pkt, struct pbuf * pbuf) +{ + + /* + * This check will be done again in pktsock_input(), but this function + * is called for raw packets only (not for UDP packets) and, if this + * (cheap) check fails, we can avoid a (rather expensive) packet copy. + */ + if (!pktsock_may_recv(pkt, pbuf)) + return -1; + + if (ip_current_is_v6()) + return (int)(sizeof(struct pktaddr6) + sizeof(struct pkthdr)); + else + return (int)(sizeof(struct pktaddr4) + sizeof(struct pkthdr)); +} + +/* + * A packet has arrived on a packet socket. We own the given packet buffer, + * and so we must free it if we do not want to keep it. + */ +void +pktsock_input(struct pktsock * pkt, struct pbuf * pbuf, + const ip_addr_t * srcaddr, uint16_t port) +{ + struct pktaddr4 pktaddr4; + struct pktaddr6 pktaddr6; + struct pkthdr pkthdr; + void *pktaddr; + struct ifdev *ifdev; + size_t pktaddrlen; + + /* + * We are going to mess with the packet's header and contents, so we + * must be the exclusive owner of the packet. For UDP packets, lwIP + * must have made a copy for us in case of non-exclusive delivery + * (e.g., multicast packets). For raw packets, we have made a copy of + * the packet ourselves just before the call to this function. + */ + if (pbuf->ref != 1) + panic("input packet has multiple references!"); + + /* If the packet should not be received on this socket, drop it. */ + if (!pktsock_may_recv(pkt, pbuf)) { + pbuf_free(pbuf); + + return; + } + + /* + * Enqueue the packet. Overwrite the leading IP header with packet + * information that is used at the time of receipt by userland. The + * data structures are such that the information always fits in what + * was the IP header. The reference count check earlier ensures that + * we never overwrite part of a packet that is still in use elsewhere. 
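To illustrate the in-place header trick described above from the consuming side: the stored information can later be recovered by mirroring the same util_pbuf_header() adjustments. A minimal sketch (the real consumer lives elsewhere in pktsock.c and is not part of this hunk; the function name is made up):

	static void
	example_peek_pkthdr(struct pbuf * pbuf, struct pkthdr * hdr)
	{

		/* Expose the pkthdr that pktsock_input() stored before the payload. */
		util_pbuf_header(pbuf, sizeof(*hdr));

		memcpy(hdr, pbuf->payload, sizeof(*hdr));

		/* Hide it again, restoring the original payload pointer. */
		util_pbuf_header(pbuf, -(int)sizeof(*hdr));
	}
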
+ */ + if (ip_current_is_v6()) { + assert(IP_IS_V6(srcaddr)); + assert(ip6_current_dest_addr() != NULL); + + ip6_addr_copy_to_packed(pktaddr6.srcaddr, *ip_2_ip6(srcaddr)); + ip6_addr_copy_to_packed(pktaddr6.dstaddr, + *ip6_current_dest_addr()); + pktaddr = &pktaddr6; + pktaddrlen = sizeof(pktaddr6); + + assert(pktaddrlen + sizeof(pkthdr) <= IP6_HLEN); + + pkthdr.tos = IP6H_TC(ip6_current_header()); + pkthdr.ttl = IP6H_HOPLIM(ip6_current_header()); + pkthdr.flags = PKTHF_IPV6; + } else { + assert(IP_IS_V4(srcaddr)); + assert(ip4_current_dest_addr() != NULL); + + memcpy(&pktaddr4.srcaddr, ip_2_ip4(srcaddr), + sizeof(pktaddr4.srcaddr)); + memcpy(&pktaddr4.dstaddr, ip4_current_dest_addr(), + sizeof(pktaddr4.srcaddr)); + pktaddr = &pktaddr4; + pktaddrlen = sizeof(pktaddr4); + + assert(pktaddrlen + sizeof(pkthdr) <= IP_HLEN); + + pkthdr.tos = IPH_TOS(ip4_current_header()); + pkthdr.ttl = IPH_TTL(ip4_current_header()); + pkthdr.flags = 0; + } + + /* + * Save both the interface on which the packet was received (for + * PKTINFO) and the interface that owns the destination address of the + * packet (for the source address's zone ID). + */ + assert(ip_current_input_netif() != NULL); + ifdev = netif_get_ifdev(ip_current_input_netif()); + pkthdr.dstif = (uint16_t)ifdev_get_index(ifdev); + + assert(ip_current_netif() != NULL); + ifdev = netif_get_ifdev(ip_current_netif()); + pkthdr.addrif = (uint16_t)ifdev_get_index(ifdev); + + if ((pbuf->flags & PBUF_FLAG_LLMCAST) || + ip_addr_ismulticast(ip_current_dest_addr())) + pkthdr.flags |= PKTHF_MCAST; + else if ((pbuf->flags & PBUF_FLAG_LLBCAST) || + ip_addr_isbroadcast(ip_current_dest_addr(), ip_current_netif())) + pkthdr.flags |= PKTHF_BCAST; + + pkthdr.port = port; + + util_pbuf_header(pbuf, sizeof(pkthdr)); + + memcpy(pbuf->payload, &pkthdr, sizeof(pkthdr)); + + util_pbuf_header(pbuf, pktaddrlen); + + memcpy(pbuf->payload, pktaddr, pktaddrlen); + + util_pbuf_header(pbuf, -(int)(sizeof(pkthdr) + pktaddrlen)); + + *pkt->pkt_rcvtailp = pbuf; + pkt->pkt_rcvtailp = pchain_end(pbuf); + pkt->pkt_rcvlen += pchain_size(pbuf); + + sockevent_raise(ipsock_get_sock(&pkt->pkt_ipsock), SEV_RECV); +} + +/* + * Obtain interface and source address information for an outgoing packet. In + * particular, parse any IPV6_PKTINFO options provided as either sticky options + * on the socket 'pkt' or as ancillary options in the packet options 'pkto'. + * On success, return OK, with 'ifdevp' set to either the outgoing interface to + * use for the packet, or NULL if no outgoing interface was specified using + * either of the aforementioned options. If, and only if, 'ifdevp' is set to + * an actual interface (i.e., not NULL), then 'src_addrp' is filled with either + * a locally owned, validated, unicast address to use as source of the packet, + * or the unspecified ('any') address if no source address was specified using + * the options. On failure, return a negative error code. + */ +int +pktsock_get_pktinfo(struct pktsock * pkt, struct pktopt * pkto, + struct ifdev ** ifdevp, ip_addr_t * src_addrp) +{ + struct ifdev *ifdev, *ifdev2; + ip_addr_t ipaddr; + uint32_t ifindex; + int r; + + /* We support only IPV6_PKTINFO. IP_PKTINFO is not supported. */ + if (!ipsock_is_ipv6(&pkt->pkt_ipsock)) { + *ifdevp = NULL; + return OK; + } + + /* + * TODO: we are spending a lot of effort on initializing and copying + * stuff around, even just to find out whether there is anything to do + * at all here. See if this can be optimized. 
+ */ + ip_addr_set_zero_ip6(&ipaddr); + + /* + * Ancillary data takes precedence over sticky options. We treat the + * source address and interface index fields as separate, overriding + * each earlier value only if non-zero. TODO: is that correct? + */ + if (pkto->pkto_flags & PKTOF_PKTINFO) { + memcpy(ip_2_ip6(&ipaddr)->addr, &pkto->pkto_srcaddr.addr, + sizeof(ip_2_ip6(&ipaddr)->addr)); + ifindex = pkto->pkto_ifindex; + } else + ifindex = 0; + + if (ip6_addr_isany(ip_2_ip6(&ipaddr))) + memcpy(ip_2_ip6(&ipaddr)->addr, &pkt->pkt_srcaddr.addr, + sizeof(ip_2_ip6(&ipaddr)->addr)); + if (ifindex == 0) + ifindex = pkt->pkt_ifindex; + + /* If both fields are blank, there is nothing more to do. */ + if (ip6_addr_isany(ip_2_ip6(&ipaddr)) && ifindex == 0) { + *ifdevp = NULL; + return OK; + } + + /* If an interface index is specified, it must be valid. */ + ifdev = NULL; + + if (ifindex != 0 && (ifdev = ifdev_get_by_index(ifindex)) == NULL) + return ENXIO; + + /* + * Use the interface index to set a zone on the source address, if the + * source address has a scope. + */ + if (ip6_addr_has_scope(ip_2_ip6(&ipaddr), IP6_UNKNOWN)) { + if (ifindex == 0) + return EADDRNOTAVAIL; + + ip6_addr_set_zone(ip_2_ip6(&ipaddr), ifindex); + } + + /* + * We need to validate the given address just as thoroughly as an + * address given through bind(). If we don't, we could allow forged + * source addresses etcetera. To be sure: this call may change the + * address to an IPv4 type address if needed. + */ + if ((r = ipsock_check_src_addr(pktsock_get_ipsock(pkt), &ipaddr, + FALSE /*allow_mcast*/, &ifdev2)) != OK) + return r; + + if (ifdev2 != NULL) { + if (ifdev == NULL) + ifdev = ifdev2; + else if (ifdev != ifdev2) + return EADDRNOTAVAIL; + } else { + /* + * There should be no cases where the (non-multicast) address + * successfully parsed, is not unspecified, and yet did not map + * to an interface. Eliminate the possibility anyway by + * throwing an error for this case. As a result, we are left + * with one of two cases: + * + * 1) ifdevp is not NULL, and src_addrp is unspecified; + * 2) ifdevp is not NULL, and src_addrp is a locally assigned + * (unicast) address. + * + * This is why we need not fill src_addrp when ifdevp is NULL. + */ + if (!ip_addr_isany(&ipaddr)) + return EADDRNOTAVAIL; + } + + *ifdevp = ifdev; + if (ifdev != NULL) + *src_addrp = ipaddr; + return OK; +} + +/* + * Parse a chunk of user-provided control data, on an IPv4 socket provided as + * 'pkt'. The control chunk is given as 'cmsg', and the length of the data + * following the control header (possibly zero) is given as 'len'. On success, + * return OK, with any parsed options merged into the set of packet options + * 'pkto'. On failure, return a negative error code. + */ +static int +pktsock_parse_ctl_v4(struct pktsock * pkt __unused, struct cmsghdr * cmsg, + socklen_t len, struct pktopt * pkto) +{ + uint8_t byte; + int val; + + if (cmsg->cmsg_level != IPPROTO_IP) + return EAFNOSUPPORT; + + switch (cmsg->cmsg_type) { + case IP_TOS: + /* + * Some userland code (bind's libisc in particular) supplies + * a single byte instead of a full integer for this option. + * We go out of our way to accept that format, too. 
+ */ + if (len != sizeof(val) && len != sizeof(byte)) + return EINVAL; + + if (len == sizeof(byte)) { + memcpy(&byte, CMSG_DATA(cmsg), sizeof(byte)); + val = (int)byte; + } else + memcpy(&val, CMSG_DATA(cmsg), sizeof(val)); + + if (val < 0 || val > UINT8_MAX) + return EINVAL; + + pkto->pkto_flags |= PKTOF_TOS; + pkto->pkto_tos = (uint8_t)val; + + return OK; + + case IP_TTL: + if (len != sizeof(val)) + return EINVAL; + + memcpy(&val, CMSG_DATA(cmsg), sizeof(val)); + + if (val < 0 || val > UINT8_MAX) + return EINVAL; + + pkto->pkto_flags |= PKTOF_TTL; + pkto->pkto_ttl = (uint8_t)val; + + return OK; + + /* + * Implementing IP_PKTINFO might be a bit harder than its IPV6_PKTINFO + * sibling, because it would require the use of zone IDs (interface + * indices) for IPv4, which is not supported yet. + */ + } + + return EINVAL; +} + +/* + * Parse a chunk of user-provided control data, on an IPv6 socket provided as + * 'pkt'. The control chunk is given as 'cmsg', and the length of the data + * following the control header (possibly zero) is given as 'len'. On success, + * return OK, with any parsed options merged into the set of packet options + * 'pkto'. On failure, return a negative error code. + */ +static int +pktsock_parse_ctl_v6(struct pktsock * pkt, struct cmsghdr * cmsg, + socklen_t len, struct pktopt * pkto) +{ + struct in6_pktinfo ipi6; + int val; + + if (cmsg->cmsg_level != IPPROTO_IPV6) + return EAFNOSUPPORT; + + switch (cmsg->cmsg_type) { + case IPV6_TCLASS: + if (len != sizeof(val)) + return EINVAL; + + memcpy(&val, CMSG_DATA(cmsg), sizeof(val)); + + if (val < -1 || val > UINT8_MAX) + return EINVAL; + + if (val == -1) + val = 0; + + pkto->pkto_flags |= PKTOF_TOS; + pkto->pkto_tos = (uint8_t)val; + + return OK; + + case IPV6_HOPLIMIT: + if (len != sizeof(val)) + return EINVAL; + + memcpy(&val, CMSG_DATA(cmsg), sizeof(val)); + + if (val < -1 || val > UINT8_MAX) + return EINVAL; + + if (val == -1) + val = IP_DEFAULT_TTL; + + pkto->pkto_flags |= PKTOF_TTL; + pkto->pkto_ttl = (uint8_t)val; + + return OK; + + case IPV6_PKTINFO: + if (len != sizeof(ipi6)) + return EINVAL; + + memcpy(&ipi6, CMSG_DATA(cmsg), sizeof(ipi6)); + + pkto->pkto_flags |= PKTOF_PKTINFO; + memcpy(&pkto->pkto_srcaddr.addr, &ipi6.ipi6_addr, + sizeof(pkto->pkto_srcaddr.addr)); + pkto->pkto_ifindex = ipi6.ipi6_ifindex; + + return OK; + + case IPV6_USE_MIN_MTU: + if (len != sizeof(int)) + return EINVAL; + + memcpy(&val, CMSG_DATA(cmsg), sizeof(val)); + + if (val < -1 || val > 1) + return EINVAL; + + /* TODO: not supported by lwIP, but needed by applications. */ + return OK; + } + + return EINVAL; +} + +/* + * Copy in and parse control data, as part of sending a packet on socket 'pkt'. + * The control data is accessible through 'ctl', with a user-provided length of + * 'ctl_len'. On success, return OK, with any parsed packet options stored in + * 'pkto'. On failure, return a negative error code. + */ +int +pktsock_get_ctl(struct pktsock * pkt, const struct sockdriver_data * ctl, + socklen_t ctl_len, struct pktopt * pkto) +{ + struct msghdr msghdr; + struct cmsghdr *cmsg; + socklen_t left, len; + int r; + + /* The default: no packet options are being overridden. */ + assert(pkto->pkto_flags == 0); + + /* If no control length is given, we are done here. */ + if (ctl_len == 0) + return OK; + + /* + * For now, we put a rather aggressive limit on the size of the control + * data. We copy in and parse the whole thing in a single buffer. 
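For reference, the control data parsed here is what a userland application produces with the standard CMSG macros. A minimal, hedged example of a sender supplying IPV6_PKTINFO for an outgoing datagram (error checking omitted; the helper name is made up):

	#include <sys/types.h>
	#include <sys/socket.h>
	#include <sys/uio.h>
	#include <netinet/in.h>
	#include <string.h>

	static ssize_t
	send_with_pktinfo(int fd, void * buf, size_t len, struct sockaddr_in6 * dst,
		unsigned int ifindex)
	{
		char ctl[CMSG_SPACE(sizeof(struct in6_pktinfo))];
		struct in6_pktinfo ipi6;
		struct cmsghdr *cmsg;
		struct msghdr msg;
		struct iovec iov;

		memset(ctl, 0, sizeof(ctl));
		memset(&ipi6, 0, sizeof(ipi6));
		ipi6.ipi6_ifindex = ifindex;	/* source address left unspecified */

		iov.iov_base = buf;
		iov.iov_len = len;

		memset(&msg, 0, sizeof(msg));
		msg.msg_name = dst;
		msg.msg_namelen = sizeof(*dst);
		msg.msg_iov = &iov;
		msg.msg_iovlen = 1;
		msg.msg_control = ctl;
		msg.msg_controllen = sizeof(ctl);

		cmsg = CMSG_FIRSTHDR(&msg);
		cmsg->cmsg_level = IPPROTO_IPV6;
		cmsg->cmsg_type = IPV6_PKTINFO;
		cmsg->cmsg_len = CMSG_LEN(sizeof(ipi6));
		memcpy(CMSG_DATA(cmsg), &ipi6, sizeof(ipi6));

		return sendmsg(fd, &msg, 0);
	}
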
+ */ + if (ctl_len > sizeof(pktsock_ctlbuf)) { + printf("LWIP: too much control data given (%u bytes)\n", + ctl_len); + + return ENOBUFS; + } + + if ((r = sockdriver_copyin(ctl, 0, pktsock_ctlbuf, ctl_len)) != OK) + return r; + + memset(&msghdr, 0, sizeof(msghdr)); + msghdr.msg_control = pktsock_ctlbuf; + msghdr.msg_controllen = ctl_len; + + for (cmsg = CMSG_FIRSTHDR(&msghdr); cmsg != NULL; + cmsg = CMSG_NXTHDR(&msghdr, cmsg)) { + /* Check for bogus lengths. */ + assert((socklen_t)((char *)cmsg - pktsock_ctlbuf) <= ctl_len); + left = ctl_len - (socklen_t)((char *)cmsg - pktsock_ctlbuf); + assert(left >= CMSG_LEN(0)); /* guaranteed by CMSG_xxHDR */ + + if (cmsg->cmsg_len < CMSG_LEN(0) || cmsg->cmsg_len > left) { + printf("LWIP: malformed control data rejected\n"); + + return EINVAL; + } + + len = cmsg->cmsg_len - CMSG_LEN(0); + + if (ipsock_is_ipv6(&pkt->pkt_ipsock)) + r = pktsock_parse_ctl_v6(pkt, cmsg, len, pkto); + else + r = pktsock_parse_ctl_v4(pkt, cmsg, len, pkto); + + if (r != OK) + return r; + } + + return OK; +} + +/* + * Copy in the packet data from the calling user process, and store it in the + * buffer 'pbuf' that must already have been allocated with the appropriate + * size. + */ +int +pktsock_get_data(struct pktsock * pkt, const struct sockdriver_data * data, + size_t len, struct pbuf * pbuf) + +{ + + return util_copy_data(data, len, 0, pbuf, 0, TRUE /*copy_in*/); +} + +/* + * Dequeue and free the head of the receive queue of a packet socket. + */ +static void +pktsock_dequeue(struct pktsock * pkt) +{ + struct pbuf *pbuf, **pnext; + size_t size; + + pbuf = pkt->pkt_rcvhead; + assert(pbuf != NULL); + + pnext = pchain_end(pbuf); + size = pchain_size(pbuf); + + if ((pkt->pkt_rcvhead = *pnext) == NULL) + pkt->pkt_rcvtailp = &pkt->pkt_rcvhead; + + assert(pkt->pkt_rcvlen >= size); + pkt->pkt_rcvlen -= size; + + *pnext = NULL; + pbuf_free(pbuf); +} + +/* + * Perform preliminary checks on a receive request. + */ +int +pktsock_pre_recv(struct sock * sock __unused, endpoint_t user_endpt __unused, + int flags) +{ + + /* + * We accept the same flags across all socket types in LWIP, and then + * simply ignore the ones we do not support for packet sockets. + */ + if ((flags & ~(MSG_PEEK | MSG_WAITALL)) != 0) + return EOPNOTSUPP; + + return OK; +} + +/* + * Add a chunk of control data to the global control buffer, starting from + * offset 'off'. The chunk has the given level and type, and its data is given + * in the buffer 'ptr' with size 'len'. Return the (padded) size of the chunk + * that was generated as a result. + */ +static size_t +pktsock_add_ctl(int level, int type, void * ptr, socklen_t len, size_t off) +{ + struct cmsghdr cmsg; + size_t size; + + size = CMSG_SPACE(len); + + /* + * The global control buffer must be large enough to store one chunk + * of each of the supported options. If this panic triggers, increase + * PKTSOCK_CTLBUF_SIZE by as much as needed. + */ + if (off + size > sizeof(pktsock_ctlbuf)) + panic("control buffer too small, increase " + "PKTSOCK_CTLBUF_SIZE"); + + memset(&cmsg, 0, sizeof(cmsg)); + cmsg.cmsg_len = CMSG_LEN(len); + cmsg.cmsg_level = level; + cmsg.cmsg_type = type; + + /* + * Clear any padding space. This can be optimized, but in any case we + * must be careful not to copy out any bytes that have not been + * initialized at all. 
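+ * The padding exists because each chunk is laid out using CMSG_SPACE(), which
+ * rounds up to the alignment boundary of a possible next control message,
+ * while only CMSG_LEN() bytes of it are actually filled in below.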
+ */ + memset(&pktsock_ctlbuf[off], 0, size); + + memcpy(&pktsock_ctlbuf[off], &cmsg, sizeof(cmsg)); + memcpy(CMSG_DATA((struct cmsghdr *)&pktsock_ctlbuf[off]), ptr, len); + + return size; +} + +/* + * Generate and copy out control data, as part of delivering a packet from + * socket 'pkt' to userland. The control data buffer is given as 'ctl', with + * a user-given length of 'ctl_len' bytes. The packet's header information is + * provided as 'pkthdr', and its source and destination addresses as 'pktaddr', + * which may be a pktaddr4 or pktaddr6 structure depending on the value of the + * PKTHF_IPV6 flag in the 'flags' field in 'pkthdr'. Note that we support + * dual-stack sockets, and as such it is possible that the socket is of domain + * AF_INET6 while the received packet is an IPv4 packet. On success, return + * the size of the control data copied out (possibly zero). If more control + * data were generated than copied out, also merge the MSG_CTRUNC flag into + * 'rflags'. On failure, return a negative error code. + */ +static int +pktsock_put_ctl(struct pktsock * pkt, const struct sockdriver_data * ctl, + socklen_t ctl_len, struct pkthdr * pkthdr, void * pktaddr, + int * rflags) +{ + struct pktaddr6 *pktaddr6; + struct pktaddr4 *pktaddr4; + struct in_pktinfo ipi; + struct in6_pktinfo ipi6; + ip_addr_t ipaddr; + unsigned int flags; + uint8_t byte; + size_t off; + int r, val; + + flags = ipsock_get_flags(&pkt->pkt_ipsock); + + if (!(flags & (PKTF_RECVINFO | PKTF_RECVTOS | PKTF_RECVTTL))) + return 0; + + /* + * Important: all generated control chunks must fit in the global + * control buffer together. When adding more options here, ensure that + * the control buffer remains large enough to receive all options at + * once. See also the panic in pktsock_add_ctl(). + */ + off = 0; + + /* + * IPv6 sockets may receive IPv4 packets. The ancillary data is in the + * format corresponding to the socket, which means we may have to + * convert any IPv4 addresses from the packet to IPv4-mapped IPv6 + * addresses for the ancillary data, just like the source address.
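+ * For example, an IPv4 destination address of 10.0.0.1 (an address picked for
+ * illustration only) would be reported in the IPV6_PKTINFO control message as
+ * the IPv4-mapped IPv6 address ::ffff:10.0.0.1.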
+ */ + if (ipsock_is_ipv6(&pkt->pkt_ipsock)) { + if (flags & PKTF_RECVTTL) { + val = pkthdr->ttl; + + off += pktsock_add_ctl(IPPROTO_IPV6, IPV6_HOPLIMIT, + &val, sizeof(val), off); + } + + if (flags & PKTF_RECVTOS) { + val = pkthdr->tos; + + off += pktsock_add_ctl(IPPROTO_IPV6, IPV6_TCLASS, &val, + sizeof(val), off); + } + + if (flags & PKTF_RECVINFO) { + memset(&ipi6, 0, sizeof(ipi6)); + + if (pkthdr->flags & PKTHF_IPV6) { + pktaddr6 = (struct pktaddr6 *)pktaddr; + memcpy(&ipi6.ipi6_addr, &pktaddr6->dstaddr, + sizeof(ipi6.ipi6_addr)); + } else { + pktaddr4 = (struct pktaddr4 *)pktaddr; + + addr_make_v4mapped_v6(&ipaddr, + &pktaddr4->dstaddr); + + memcpy(&ipi6.ipi6_addr, + ip_2_ip6(&ipaddr)->addr, + sizeof(ipi6.ipi6_addr)); + } + ipi6.ipi6_ifindex = pkthdr->dstif; + + off += pktsock_add_ctl(IPPROTO_IPV6, IPV6_PKTINFO, + &ipi6, sizeof(ipi6), off); + } + } else { + if (flags & PKTF_RECVTTL) { + byte = pkthdr->ttl; + + off += pktsock_add_ctl(IPPROTO_IP, IP_TTL, &byte, + sizeof(byte), off); + } + + if (flags & PKTF_RECVINFO) { + assert(!(pkthdr->flags & PKTHF_IPV6)); + pktaddr4 = (struct pktaddr4 *)pktaddr; + + memset(&ipi, 0, sizeof(ipi)); + memcpy(&ipi.ipi_addr, &pktaddr4->dstaddr, + sizeof(ipi.ipi_addr)); + ipi.ipi_ifindex = pkthdr->dstif; + + off += pktsock_add_ctl(IPPROTO_IP, IP_PKTINFO, &ipi, + sizeof(ipi), off); + } + } + + assert(off > 0); + + if (ctl_len >= off) + ctl_len = off; + else + *rflags |= MSG_CTRUNC; + + if (ctl_len > 0 && + (r = sockdriver_copyout(ctl, 0, pktsock_ctlbuf, ctl_len)) != OK) + return r; + + return ctl_len; +} + +/* + * Receive data on a packet socket. + */ +int +pktsock_recv(struct sock * sock, const struct sockdriver_data * data, + size_t len, size_t * off, const struct sockdriver_data * ctl, + socklen_t ctl_len, socklen_t * ctl_off, struct sockaddr * addr, + socklen_t * addr_len, endpoint_t user_endpt __unused, int flags, + size_t min __unused, int * rflags) +{ + struct pktsock *pkt = (struct pktsock *)sock; + struct pktaddr4 pktaddr4; + struct pktaddr6 pktaddr6; + struct pkthdr pkthdr; + void *pktaddr; + struct pbuf *pbuf; + ip_addr_t srcaddr; + int r; + + if ((pbuf = pkt->pkt_rcvhead) == NULL) + return SUSPEND; + + /* + * Get the ancillary data for the packet. The format of the ancillary + * data depends on the received packet type, which may be different + * from the socket type. + */ + util_pbuf_header(pbuf, sizeof(pkthdr)); + + memcpy(&pkthdr, pbuf->payload, sizeof(pkthdr)); + + if (pkthdr.flags & PKTHF_IPV6) { + util_pbuf_header(pbuf, sizeof(pktaddr6)); + + memcpy(&pktaddr6, pbuf->payload, sizeof(pktaddr6)); + pktaddr = &pktaddr6; + + ip_addr_copy_from_ip6_packed(srcaddr, pktaddr6.srcaddr); + if (ip6_addr_has_scope(ip_2_ip6(&srcaddr), IP6_UNICAST)) + ip6_addr_set_zone(ip_2_ip6(&srcaddr), pkthdr.addrif); + + util_pbuf_header(pbuf, + -(int)(sizeof(pkthdr) + sizeof(pktaddr6))); + } else { + util_pbuf_header(pbuf, sizeof(pktaddr4)); + + memcpy(&pktaddr4, pbuf->payload, sizeof(pktaddr4)); + pktaddr = &pktaddr4; + + ip_addr_copy_from_ip4(srcaddr, pktaddr4.srcaddr); + + util_pbuf_header(pbuf, + -(int)(sizeof(pkthdr) + sizeof(pktaddr4))); + } + + /* Copy out the packet data to the calling user process. */ + if (len >= pbuf->tot_len) + len = pbuf->tot_len; + else + *rflags |= MSG_TRUNC; + + r = util_copy_data(data, len, 0, pbuf, 0, FALSE /*copy_in*/); + + if (r != OK) + return r; + + /* Generate and copy out ancillary (control) data, if requested. 
*/ + if ((r = pktsock_put_ctl(pkt, ctl, ctl_len, &pkthdr, pktaddr, + rflags)) < 0) + return r; + + /* Store the source IP address. */ + ipsock_put_addr(&pkt->pkt_ipsock, addr, addr_len, &srcaddr, + pkthdr.port); + + /* Set multicast or broadcast message flag, if applicable. */ + if (pkthdr.flags & PKTHF_MCAST) + *rflags |= MSG_MCAST; + else if (pkthdr.flags & PKTHF_BCAST) + *rflags |= MSG_BCAST; + + /* Discard the packet now, unless we were instructed to peek only. */ + if (!(flags & MSG_PEEK)) + pktsock_dequeue(pkt); + + /* Return the received part of the packet length. */ + *off = len; + *ctl_off = r; + return OK; +} + +/* + * Test whether data can be received on a packet socket, and if so, how many + * bytes of data. + */ +int +pktsock_test_recv(struct sock * sock, size_t min __unused, size_t * size) +{ + struct pktsock *pkt = (struct pktsock *)sock; + + if (pkt->pkt_rcvhead == NULL) + return SUSPEND; + + if (size != NULL) + *size = pkt->pkt_rcvhead->tot_len; + return OK; +} + +/* + * The caller has performed a multicast operation on the given socket. Thus, + * the caller is multicast aware. Remember this, because that means the socket + * may also receive traffic to multicast destinations. + */ +void +pktsock_set_mcaware(struct pktsock * pkt) +{ + + ipsock_set_flag(&pkt->pkt_ipsock, PKTF_MCAWARE); +} + +/* + * Set socket options on a packet socket. + */ +int +pktsock_setsockopt(struct pktsock * pkt, int level, int name, + const struct sockdriver_data * data, socklen_t len, + struct ipopts * ipopts) +{ + struct ip_mreq imr; + struct ipv6_mreq ipv6mr; + struct in6_pktinfo ipi6; + ip_addr_t ipaddr, ifaddr; + struct ifdev *ifdev; + unsigned int flag; + uint32_t ifindex; + int r, val, has_scope; + + switch (level) { + case IPPROTO_IP: + if (ipsock_is_ipv6(&pkt->pkt_ipsock)) + break; + + switch (name) { + case IP_ADD_MEMBERSHIP: + case IP_DROP_MEMBERSHIP: + pktsock_set_mcaware(pkt); + + if ((r = sockdriver_copyin_opt(data, &imr, sizeof(imr), + len)) != OK) + return r; + + ip_addr_set_ip4_u32(&ipaddr, imr.imr_multiaddr.s_addr); + ip_addr_set_ip4_u32(&ifaddr, imr.imr_interface.s_addr); + + if (!ip_addr_isany(&ifaddr)) { + ifdev = ifaddr_map_by_addr(&ifaddr); + + if (ifdev == NULL) + return EADDRNOTAVAIL; + } else + ifdev = NULL; + + if (name == IP_ADD_MEMBERSHIP) + r = mcast_join(&pkt->pkt_mcast, &ipaddr, + ifdev); + else + r = mcast_leave(&pkt->pkt_mcast, &ipaddr, + ifdev); + + return r; + + case IP_RECVTTL: + case IP_RECVPKTINFO: + if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), + len)) != OK) + return r; + + switch (name) { + case IP_RECVTTL: flag = PKTF_RECVTTL; break; + case IP_RECVPKTINFO: flag = PKTF_RECVINFO; break; + default: flag = 0; assert(0); break; + } + + if (val) + ipsock_set_flag(&pkt->pkt_ipsock, flag); + else + ipsock_clear_flag(&pkt->pkt_ipsock, flag); + + return OK; + } + + break; + + case IPPROTO_IPV6: + if (!ipsock_is_ipv6(&pkt->pkt_ipsock)) + break; + + switch (name) { + case IPV6_JOIN_GROUP: + case IPV6_LEAVE_GROUP: + pktsock_set_mcaware(pkt); + + if ((r = sockdriver_copyin_opt(data, &ipv6mr, + sizeof(ipv6mr), len)) != OK) + return r; + + ip_addr_set_zero_ip6(&ipaddr); + memcpy(ip_2_ip6(&ipaddr)->addr, + &ipv6mr.ipv6mr_multiaddr, + sizeof(ip_2_ip6(&ipaddr)->addr)); + + /* + * We currently do not support joining IPv4 multicast + * groups on IPv6 sockets. 
The reason for this is that + * this would require decisions on what to do if the + * socket is set to V6ONLY later, as well as various + * additional exceptions for a case that hopefully + * doesn't occur in practice anyway. + */ + if (ip6_addr_isipv4mappedipv6(ip_2_ip6(&ipaddr))) + return EADDRNOTAVAIL; + + has_scope = ip6_addr_has_scope(ip_2_ip6(&ipaddr), + IP6_UNKNOWN); + + if ((ifindex = ipv6mr.ipv6mr_interface) != 0) { + ifdev = ifdev_get_by_index(ifindex); + + if (ifdev == NULL) + return ENXIO; + + if (has_scope) + ip6_addr_set_zone(ip_2_ip6(&ipaddr), + ifindex); + } else { + if (has_scope) + return EADDRNOTAVAIL; + + ifdev = NULL; + } + + if (name == IPV6_JOIN_GROUP) + r = mcast_join(&pkt->pkt_mcast, &ipaddr, + ifdev); + else + r = mcast_leave(&pkt->pkt_mcast, &ipaddr, + ifdev); + + return r; + + case IPV6_USE_MIN_MTU: + if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), + len)) != OK) + return r; + + if (val < -1 || val > 1) + return EINVAL; + + /* + * lwIP does not support path MTU discovery, so do + * nothing. TODO: see if this is actually good enough. + */ + return OK; + + case IPV6_PKTINFO: + if ((r = sockdriver_copyin_opt(data, &ipi6, + sizeof(ipi6), len)) != OK) + return r; + + /* + * Simply copy in what is given. The values will be + * parsed only once a packet is sent, in + * pktsock_get_pktinfo(). Otherwise, if we perform + * checks here, they may be outdated by the time the + * values are actually used. + */ + memcpy(&pkt->pkt_srcaddr.addr, &ipi6.ipi6_addr, + sizeof(pkt->pkt_srcaddr.addr)); + pkt->pkt_ifindex = ipi6.ipi6_ifindex; + + return OK; + + case IPV6_RECVPKTINFO: + case IPV6_RECVHOPLIMIT: + case IPV6_RECVTCLASS: + if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), + len)) != OK) + return r; + + switch (name) { + case IPV6_RECVPKTINFO: flag = PKTF_RECVINFO; break; + case IPV6_RECVHOPLIMIT: flag = PKTF_RECVTTL; break; + case IPV6_RECVTCLASS: flag = PKTF_RECVTOS; break; + default: flag = 0; assert(0); break; + } + + if (val) + ipsock_set_flag(&pkt->pkt_ipsock, flag); + else + ipsock_clear_flag(&pkt->pkt_ipsock, flag); + + return OK; + } + + break; + } + + return ipsock_setsockopt(&pkt->pkt_ipsock, level, name, data, len, + ipopts); +} + +/* + * Retrieve socket options on a packet socket. + */ +int +pktsock_getsockopt(struct pktsock * pkt, int level, int name, + const struct sockdriver_data * data, socklen_t * len, + struct ipopts * ipopts) +{ + struct in6_pktinfo ipi6; + unsigned int flag; + int val; + + switch (level) { + case IPPROTO_IP: + if (ipsock_is_ipv6(&pkt->pkt_ipsock)) + break; + + switch (name) { + case IP_RECVTTL: + case IP_RECVPKTINFO: + switch (name) { + case IP_RECVTTL: flag = PKTF_RECVTTL; break; + case IP_RECVPKTINFO: flag = PKTF_RECVINFO; break; + default: flag = 0; assert(0); break; + } + + val = !!(ipsock_get_flag(&pkt->pkt_ipsock, flag)); + + return sockdriver_copyout_opt(data, &val, sizeof(val), + len); + } + + break; + + case IPPROTO_IPV6: + if (!ipsock_is_ipv6(&pkt->pkt_ipsock)) + break; + + switch (name) { + case IPV6_USE_MIN_MTU: + /* + * TODO: sort out exactly what lwIP actually supports + * in the way of path MTU discovery. Value 1 means + * that path MTU discovery is disabled and packets are + * sent at the minimum MTU (RFC 3542). + */ + val = 1; + + return sockdriver_copyout_opt(data, &val, sizeof(val), + len); + + case IPV6_PKTINFO: + memset(&ipi6, 0, sizeof(ipi6)); + + /* + * Simply copy out whatever was given before. These + * fields are initialized to zero on socket creation. 
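+ * As a result, a getsockopt(IPV6_PKTINFO) call reports the last
+ * sticky value set through setsockopt(), or an all-zeroes
+ * structure if no such call was ever made on the socket.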
+ */ + memcpy(&ipi6.ipi6_addr, &pkt->pkt_srcaddr.addr, + sizeof(ipi6.ipi6_addr)); + ipi6.ipi6_ifindex = pkt->pkt_ifindex; + + return sockdriver_copyout_opt(data, &ipi6, + sizeof(ipi6), len); + + case IPV6_RECVPKTINFO: + case IPV6_RECVHOPLIMIT: + case IPV6_RECVTCLASS: + switch (name) { + case IPV6_RECVPKTINFO: flag = PKTF_RECVINFO; break; + case IPV6_RECVHOPLIMIT: flag = PKTF_RECVTTL; break; + case IPV6_RECVTCLASS: flag = PKTF_RECVTOS; break; + default: flag = 0; assert(0); break; + } + + val = !!(ipsock_get_flag(&pkt->pkt_ipsock, flag)); + + return sockdriver_copyout_opt(data, &val, sizeof(val), + len); + } + + break; + } + + return ipsock_getsockopt(&pkt->pkt_ipsock, level, name, data, len, + ipopts); +} + +/* + * Drain the receive queue of a packet socket. + */ +static void +pktsock_drain(struct pktsock * pkt) +{ + + while (pkt->pkt_rcvhead != NULL) + pktsock_dequeue(pkt); + + assert(pkt->pkt_rcvlen == 0); + assert(pkt->pkt_rcvtailp == &pkt->pkt_rcvhead); +} + +/* + * Shut down a packet socket for reading and/or writing. + */ +void +pktsock_shutdown(struct pktsock * pkt, unsigned int mask) +{ + + if (mask & SFL_SHUT_RD) + pktsock_drain(pkt); +} + +/* + * Close a packet socket. + */ +void +pktsock_close(struct pktsock * pkt) +{ + + pktsock_drain(pkt); + + mcast_leave_all(&pkt->pkt_mcast); +} + +/* + * Return the rounded-up number of bytes in the packet socket's receive queue, + * for sysctl(7). NetBSD returns the used portion of each buffer, but that + * would be quite some extra effort for us (TODO). + */ +size_t +pktsock_get_recvlen(struct pktsock * pkt) +{ + + return pkt->pkt_rcvlen; +} diff --git a/minix/net/lwip/pktsock.h b/minix/net/lwip/pktsock.h new file mode 100644 index 000000000..9635a7f1f --- /dev/null +++ b/minix/net/lwip/pktsock.h @@ -0,0 +1,63 @@ +#ifndef MINIX_NET_LWIP_PKTSOCK_H +#define MINIX_NET_LWIP_PKTSOCK_H + +#include "mcast.h" + +/* Packet-level socket, shared by UDP and RAW. */ +struct pktsock { + struct ipsock pkt_ipsock; /* IP socket object, MUST be first */ + struct pbuf *pkt_rcvhead; /* receive buffer, first packet */ + struct pbuf **pkt_rcvtailp; /* receive buffer, last ptr-ptr */ + size_t pkt_rcvlen; /* receive buffer, length in bytes */ + struct mcast_head pkt_mcast; /* multicast membership list */ + ip6_addr_p_t pkt_srcaddr; /* IPV6_PKTINFO: source address */ + uint32_t pkt_ifindex; /* IPV6_KPTINFO: interface index */ +}; + +#define pktsock_get_ipsock(pkt) (&(pkt)->pkt_ipsock) +#define pktsock_get_ifindex(pkt) ((pkt)->pkt_ifindex) + +/* Options when sending packets. */ +struct pktopt { + uint8_t pkto_flags; /* packet send flags (PKTOF_) */ + uint8_t pkto_tos; /* type of service for the packet */ + uint8_t pkto_ttl; /* time-to-live for the packet */ + uint8_t pkto_mcast_ttl; /* time-to-live for multicast packet */ + ip6_addr_p_t pkto_srcaddr; /* IPV6_PKTINFO: source address */ + unsigned int pkto_ifindex; /* IPV6_PKTINFO: interface index */ +}; + +#define PKTOF_TTL 0x01 /* send packet with custom TTL value */ +#define PKTOF_TOS 0x02 /* send packet with custom TOS value */ +#define PKTOF_PKTINFO 0x04 /* send packet with src addr, on if. 
*/ + +int pktsock_socket(struct pktsock * pkt, int domain, size_t sndbuf, + size_t rcvbuf, struct sock ** sockp); +int pktsock_test_input(struct pktsock * pkt, struct pbuf * pbuf); +void pktsock_input(struct pktsock * pkt, struct pbuf * pbuf, + const ip_addr_t * srcaddr, uint16_t port); +int pktsock_get_pktinfo(struct pktsock * pkt, struct pktopt * pkto, + struct ifdev ** ifdevp, ip_addr_t * src_addrp); +int pktsock_get_ctl(struct pktsock * pkt, const struct sockdriver_data * ctl, + socklen_t ctl_len, struct pktopt * pkto); +int pktsock_get_data(struct pktsock * pkt, const struct sockdriver_data * data, + size_t len, struct pbuf * pbuf); +int pktsock_pre_recv(struct sock * sock, endpoint_t user_endpt, int flags); +int pktsock_recv(struct sock * sock, const struct sockdriver_data * data, + size_t len, size_t * off, const struct sockdriver_data * ctl, + socklen_t ctl_len, socklen_t * ctl_off, struct sockaddr * addr, + socklen_t * addr_len, endpoint_t user_endpt, int flags, size_t min, + int * rflags); +int pktsock_test_recv(struct sock * sock, size_t min, size_t * size); +void pktsock_set_mcaware(struct pktsock * pkt); +int pktsock_setsockopt(struct pktsock * pkt, int level, int name, + const struct sockdriver_data * data, socklen_t len, + struct ipopts * ipopts); +int pktsock_getsockopt(struct pktsock * pkt, int level, int name, + const struct sockdriver_data * data, socklen_t * len, + struct ipopts * ipopts); +void pktsock_shutdown(struct pktsock * pkt, unsigned int mask); +void pktsock_close(struct pktsock * pkt); +size_t pktsock_get_recvlen(struct pktsock * pkt); + +#endif /* !MINIX_NET_LWIP_PKTSOCK_H */ diff --git a/minix/net/lwip/rawsock.c b/minix/net/lwip/rawsock.c new file mode 100644 index 000000000..d00df01e4 --- /dev/null +++ b/minix/net/lwip/rawsock.c @@ -0,0 +1,1341 @@ +/* LWIP service - rawsock.c - RAW sockets */ +/* + * For IPv6 sockets, this module attempts to implement a part of RFC 3542, but + * currently not more than what is supported by lwIP and/or what is expected by + * a handful of standard utilities (dhcpcd, ping6, traceroute6..). + * + * For general understanding, be aware that IPv4 raw sockets always receive + * packets including the IP header, and may be used to send packets including + * the IP header if IP_HDRINCL is set, while IPv6 raw sockets always send and + * receive actual payloads only, using ancillary (control) data to set and + * retrieve per-packet IP header fields. + * + * For packet headers we follow general BSD semantics. For example, some IPv4 + * header fields are swapped both when sending and when receiving. Also, like + * on NetBSD, IPPROTO_RAW is not a special value in any way. + */ + +#include "lwip.h" +#include "ifaddr.h" +#include "pktsock.h" + +#include "lwip/raw.h" +#include "lwip/inet_chksum.h" + +#include +#include +#include +#include + +/* The number of RAW sockets. Inherited from the lwIP configuration. */ +#define NR_RAWSOCK MEMP_NUM_RAW_PCB + +/* + * Outgoing packets are not getting buffered, so the send buffer size simply + * determines the maximum size for sent packets. The send buffer maximum is + * therefore limited to the maximum size of a single packet (64K-1 bytes), + * which is already enforced by lwIP's 16-bit length parameter to pbuf_alloc(). + * + * The actual transmission may enforce a lower limit, though. The full packet + * size must not exceed the same 64K-1 limit, and that includes any headers + * that still have to be prepended to the given packet. 
The size of those + * headers depends on the socket type (IPv4/IPv6) and the IP_HDRINCL setting. + * + * The default is equal to the maximum here, because if a (by definition, + * privileged) application wishes to send large raw packets, it probably has a + * good reason, and we do not want to get in its way. + */ +#define RAW_MAX_PAYLOAD (UINT16_MAX) + +#define RAW_SNDBUF_MIN 1 /* minimum RAW send buffer size */ +#define RAW_SNDBUF_DEF RAW_MAX_PAYLOAD /* default RAW send buffer size */ +#define RAW_SNDBUF_MAX RAW_MAX_PAYLOAD /* maximum RAW send buffer size */ +#define RAW_RCVBUF_MIN MEMPOOL_BUFSIZE /* minimum RAW receive buffer size */ +#define RAW_RCVBUF_DEF 32768 /* default RAW receive buffer size */ +#define RAW_RCVBUF_MAX 65536 /* maximum RAW receive buffer size */ + +static struct rawsock { + struct pktsock raw_pktsock; /* packet socket object */ + struct raw_pcb *raw_pcb; /* lwIP RAW control block */ + TAILQ_ENTRY(rawsock) raw_next; /* next in active/free list */ + struct icmp6_filter raw_icmp6filter; /* ICMPv6 type filter */ +} raw_array[NR_RAWSOCK]; + +static TAILQ_HEAD(, rawsock) raw_freelist; /* list of free RAW sockets */ +static TAILQ_HEAD(, rawsock) raw_activelist; /* list, in-use RAW sockets */ + +static const struct sockevent_ops rawsock_ops; + +#define rawsock_get_sock(raw) (ipsock_get_sock(rawsock_get_ipsock(raw))) +#define rawsock_get_ipsock(raw) (pktsock_get_ipsock(&(raw)->raw_pktsock)) +#define rawsock_is_ipv6(raw) (ipsock_is_ipv6(rawsock_get_ipsock(raw))) +#define rawsock_is_v6only(raw) (ipsock_is_v6only(rawsock_get_ipsock(raw))) +#define rawsock_is_conn(raw) \ + (raw_flags((raw)->raw_pcb) & RAW_FLAGS_CONNECTED) +#define rawsock_is_hdrincl(raw) \ + (raw_flags((raw)->raw_pcb) & RAW_FLAGS_HDRINCL) + +static ssize_t rawsock_pcblist(struct rmib_call *, struct rmib_node *, + struct rmib_oldp *, struct rmib_newp *); + +/* The CTL_NET {PF_INET,PF_INET6} IPPROTO_RAW subtree. */ +/* All dynamically numbered; the sendspace/recvspace entries are ours. */ +static struct rmib_node net_inet_raw_table[] = { + RMIB_INT(RMIB_RO, RAW_SNDBUF_DEF, "sendspace", + "Default RAW send buffer size"), + RMIB_INT(RMIB_RO, RAW_RCVBUF_DEF, "recvspace", + "Default RAW receive buffer size"), + RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0, rawsock_pcblist, "pcblist", + "RAW IP protocol control block list"), +}; + +static struct rmib_node net_inet_raw_node = + RMIB_NODE(RMIB_RO, net_inet_raw_table, "raw", "RAW IPv4 settings"); +static struct rmib_node net_inet6_raw6_node = + RMIB_NODE(RMIB_RO, net_inet_raw_table, "raw6", "RAW IPv6 settings"); + +/* + * Initialize the raw sockets module. + */ +void +rawsock_init(void) +{ + unsigned int slot; + + /* Initialize the list of free RAW sockets. */ + TAILQ_INIT(&raw_freelist); + + for (slot = 0; slot < __arraycount(raw_array); slot++) + TAILQ_INSERT_TAIL(&raw_freelist, &raw_array[slot], raw_next); + + /* Initialize the list of active RAW sockets. */ + TAILQ_INIT(&raw_activelist); + + /* Register the net.inet.raw and net.inet6.raw6 RMIB subtrees. */ + mibtree_register_inet(PF_INET, IPPROTO_RAW, &net_inet_raw_node); + mibtree_register_inet(PF_INET6, IPPROTO_RAW, &net_inet6_raw6_node); +} + +/* + * Check whether the given arrived IPv6 packet is fit to be received on the + * given raw socket. + */ +static int +rawsock_check_v6(struct rawsock * raw, struct pbuf * pbuf) +{ + uint8_t type; + + assert(rawsock_is_ipv6(raw)); + + /* + * For ICMPv6 packets, test against the configured type filter. 
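+ * The filter is installed by the user through the ICMP6_FILTER socket option,
+ * typically with the standard RFC 3542 macros. For instance, a ping6(8)-like
+ * caller might request only echo replies (hypothetical userland code):
+ *
+ *   int fd = socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6);
+ *   struct icmp6_filter filt;
+ *
+ *   ICMP6_FILTER_SETBLOCKALL(&filt);
+ *   ICMP6_FILTER_SETPASS(ICMP6_ECHO_REPLY, &filt);
+ *   (void)setsockopt(fd, IPPROTO_ICMPV6, ICMP6_FILTER, &filt, sizeof(filt));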
+ */ + if (raw->raw_pcb->protocol == IPPROTO_ICMPV6) { + if (pbuf->len < offsetof(struct icmp6_hdr, icmp6_dataun)) + return FALSE; + + memcpy(&type, &((struct icmp6_hdr *)pbuf->payload)->icmp6_type, + sizeof(type)); + + if (!ICMP6_FILTER_WILLPASS((int)type, &raw->raw_icmp6filter)) + return FALSE; + } + + /* + * For ICMPv6 packets, or if IPV6_CHECKSUM is enabled, we have to + * verify the checksum of the packet before passing it to the user. + * This is costly, but it needs to be done and lwIP is not doing it for + * us (as of writing, anyway), even though it maintains the offset.. + */ + if (raw->raw_pcb->chksum_reqd && + (pbuf->tot_len < raw->raw_pcb->chksum_offset + sizeof(uint16_t) || + ip6_chksum_pseudo(pbuf, raw->raw_pcb->protocol, pbuf->tot_len, + ip6_current_src_addr(), ip6_current_dest_addr()) != 0)) { + return FALSE; + } + + /* No reason to filter out this packet. */ + return TRUE; +} + +/* + * Adjust the given arrived IPv4 packet by changing the length and offset + * fields to host-byte order, as is done by the BSDs. This effectively mirrors + * the swapping part of the preparation done on IPv4 packets being sent if the + * IP_HDRINCL socket option is enabled. + */ +static void +rawsock_adjust_v4(struct pbuf * pbuf) +{ + struct ip_hdr *iphdr; + + if (pbuf->len < sizeof(struct ip_hdr)) + return; + + iphdr = (struct ip_hdr *)pbuf->payload; + + /* + * W. Richard Stevens also mentions ip_id, but at least on NetBSD that + * field seems to be swapped neither when sending nor when receiving.. + */ + IPH_LEN(iphdr) = htons(IPH_LEN(iphdr)); + IPH_OFFSET(iphdr) = htons(IPH_OFFSET(iphdr)); +} + +/* + * A packet has arrived on a raw socket. Since the same packet may have to be + * delivered to multiple raw sockets, we always return 0 (= not consumed) from + * this function. As such, we must make a copy of the given packet if we want + * to keep it, and never free it. + */ +static uint8_t +rawsock_input(void * arg, struct raw_pcb * pcb __unused, struct pbuf * psrc, + const ip_addr_t * srcaddr) +{ + struct rawsock *raw = (struct rawsock *)arg; + struct pbuf *pbuf; + int off, hdrlen; + + assert(raw->raw_pcb == pcb); + + /* + * If adding this packet would cause the receive buffer to go beyond + * the current limit, drop the new packet. This is just an estimation, + * because the copy we are about to make may not take the exact same + * amount of memory, due to the fact that 1) the pbuf we're given has + * an unknown set of headers in front of it, and 2) we need to store + * extra information in our copy. The return value of this call, if + * not -1, is the number of bytes we need to reserve to store that + * extra information. + */ + if ((hdrlen = pktsock_test_input(&raw->raw_pktsock, psrc)) < 0) + return 0; + + /* + * Raw IPv6 sockets receive only the actual packet data, whereas raw + * IPv4 sockets receive the IP header as well. + */ + if (ip_current_is_v6()) { + off = ip_current_header_tot_len(); + + util_pbuf_header(psrc, -off); + + if (!rawsock_check_v6(raw, psrc)) { + util_pbuf_header(psrc, off); + + return 0; + } + } else { + /* + * For IPv6 sockets, drop the packet if it was sent as an IPv4 + * packet and checksumming is enabled (this includes ICMPv6). + * Otherwise, the packet would bypass the above checks that we + * perform on IPv6 packets. Applications that want to use a + * dual-stack protocol with checksumming will have to do the + * checksum verification part themselves. 
Presumably the two + * different pseudoheaders would result in different checksums + * anyhow, so it would be useless to try to support that. + * + * Beyond that, for IPv4 packets on IPv6 sockets, hide the IPv4 + * header. + */ + if (rawsock_is_ipv6(raw)) { + if (raw->raw_pcb->chksum_reqd) + return 0; + + off = IP_HLEN; + + util_pbuf_header(psrc, -off); + } else + off = 0; + } + + /* + * We need to make a copy of the incoming packet. If we eat the one + * given to us, this will 1) stop any other raw sockets from getting + * the same packet, 2) allow a single raw socket to discard all TCP/UDP + * traffic, and 3) present us with a problem on how to store ancillary + * data. Raw sockets are not that performance critical so the extra + * copy -even when not always necessary- is not that big of a deal. + */ + if ((pbuf = pchain_alloc(PBUF_RAW, hdrlen + psrc->tot_len)) == NULL) { + if (off > 0) + util_pbuf_header(psrc, off); + + return 0; + } + + util_pbuf_header(pbuf, -hdrlen); + + if (pbuf_copy(pbuf, psrc) != ERR_OK) + panic("unexpected pbuf copy failure"); + + pbuf->flags |= psrc->flags & (PBUF_FLAG_LLMCAST | PBUF_FLAG_LLBCAST); + + if (off > 0) + util_pbuf_header(psrc, off); + + if (!rawsock_is_ipv6(raw)) + rawsock_adjust_v4(pbuf); + + pktsock_input(&raw->raw_pktsock, pbuf, srcaddr, 0); + + return 0; +} + +/* + * Create a raw socket. + */ +sockid_t +rawsock_socket(int domain, int protocol, struct sock ** sockp, + const struct sockevent_ops ** ops) +{ + struct rawsock *raw; + unsigned int flags; + uint8_t ip_type; + + if (protocol < 0 || protocol > UINT8_MAX) + return EPROTONOSUPPORT; + + if (TAILQ_EMPTY(&raw_freelist)) + return ENOBUFS; + + raw = TAILQ_FIRST(&raw_freelist); + + /* + * Initialize the structure. Do not memset it to zero, as it is still + * part of the linked free list. Initialization may still fail. + */ + + ip_type = pktsock_socket(&raw->raw_pktsock, domain, RAW_SNDBUF_DEF, + RAW_RCVBUF_DEF, sockp); + + /* We should have enough PCBs so this call should not fail.. */ + if ((raw->raw_pcb = raw_new_ip_type(ip_type, protocol)) == NULL) + return ENOBUFS; + raw_recv(raw->raw_pcb, rawsock_input, (void *)raw); + + /* By default, the multicast TTL is 1 and looping is enabled. */ + raw_set_multicast_ttl(raw->raw_pcb, 1); + + flags = raw_flags(raw->raw_pcb); + raw_setflags(raw->raw_pcb, flags | RAW_FLAGS_MULTICAST_LOOP); + + /* + * For ICMPv6, checksum generation and verification is mandatory and + * type filtering of incoming packets is supported (RFC 3542). For all + * other IPv6 protocols, checksumming may be turned on by the user. + */ + if (rawsock_is_ipv6(raw) && protocol == IPPROTO_ICMPV6) { + raw->raw_pcb->chksum_reqd = 1; + raw->raw_pcb->chksum_offset = + offsetof(struct icmp6_hdr, icmp6_cksum); + + ICMP6_FILTER_SETPASSALL(&raw->raw_icmp6filter); + } else + raw->raw_pcb->chksum_reqd = 0; + + TAILQ_REMOVE(&raw_freelist, raw, raw_next); + + TAILQ_INSERT_TAIL(&raw_activelist, raw, raw_next); + + *ops = &rawsock_ops; + return SOCKID_RAW | (sockid_t)(raw - raw_array); +} + +/* + * Bind a raw socket to a local address. + */ +static int +rawsock_bind(struct sock * sock, const struct sockaddr * addr, + socklen_t addr_len, endpoint_t user_endpt) +{ + struct rawsock *raw = (struct rawsock *)sock; + ip_addr_t ipaddr; + err_t err; + int r; + + /* + * Raw sockets may be rebound even if that is not too useful. 
However, + * we do not allow (re)binding when the socket is connected, so as to + * eliminate any problems with source and destination type mismatches: + * such mismatches are detected at connect time, and rebinding would + * avoid those, possibly triggering lwIP asserts as a result. + */ + if (rawsock_is_conn(raw)) + return EINVAL; + + if ((r = ipsock_get_src_addr(rawsock_get_ipsock(raw), addr, addr_len, + user_endpt, &raw->raw_pcb->local_ip, 0 /*local_port*/, + TRUE /*allow_mcast*/, &ipaddr, NULL /*portp*/)) != OK) + return r; + + err = raw_bind(raw->raw_pcb, &ipaddr); + + return util_convert_err(err); +} + +/* + * Connect a raw socket to a remote address. + */ +static int +rawsock_connect(struct sock * sock, const struct sockaddr * addr, + socklen_t addr_len, endpoint_t user_endpt __unused) +{ + struct rawsock *raw = (struct rawsock *)sock; + const ip_addr_t *src_addr; + ip_addr_t dst_addr; + struct ifdev *ifdev; + uint32_t ifindex, ifindex2; + err_t err; + int r; + + /* + * One may "unconnect" socket by providing an address with family + * AF_UNSPEC. + */ + if (addr_is_unspec(addr, addr_len)) { + raw_disconnect(raw->raw_pcb); + + return OK; + } + + if ((r = ipsock_get_dst_addr(rawsock_get_ipsock(raw), addr, addr_len, + &raw->raw_pcb->local_ip, &dst_addr, NULL /*dst_port*/)) != OK) + return r; + + /* + * Bind explicitly to a source address if the PCB is not bound to one + * yet. This is expected in the BSD socket API, but lwIP does not do + * it for us. + */ + if (ip_addr_isany(&raw->raw_pcb->local_ip)) { + /* Help the multicast case a bit, if possible. */ + ifdev = NULL; + if (ip_addr_ismulticast(&dst_addr)) { + ifindex = pktsock_get_ifindex(&raw->raw_pktsock); + ifindex2 = raw_get_multicast_netif_index(raw->raw_pcb); + if (ifindex == 0) + ifindex = ifindex2; + + if (ifindex != 0) { + ifdev = ifdev_get_by_index(ifindex); + + if (ifdev == NULL) + return ENXIO; + } + } + + src_addr = ifaddr_select(&dst_addr, ifdev, NULL /*ifdevp*/); + + if (src_addr == NULL) + return EHOSTUNREACH; + + err = raw_bind(raw->raw_pcb, src_addr); + + if (err != ERR_OK) + return util_convert_err(err); + } + + /* + * Connecting a raw socket serves two main purposes: 1) the socket uses + * the address as destination when sending, and 2) the socket receives + * packets from only the connected address. + */ + err = raw_connect(raw->raw_pcb, &dst_addr); + + if (err != ERR_OK) + return util_convert_err(err); + + return OK; +} + +/* + * Perform preliminary checks on a send request. + */ +static int +rawsock_pre_send(struct sock * sock, size_t len, socklen_t ctl_len __unused, + const struct sockaddr * addr, socklen_t addr_len __unused, + endpoint_t user_endpt __unused, int flags) +{ + struct rawsock *raw = (struct rawsock *)sock; + + if ((flags & ~MSG_DONTROUTE) != 0) + return EOPNOTSUPP; + + if (!rawsock_is_conn(raw) && addr == NULL) + return EDESTADDRREQ; + + /* + * This is only one part of the length check. The rest is done from + * rawsock_send(), once we have more information. + */ + if (len > ipsock_get_sndbuf(rawsock_get_ipsock(raw))) + return EMSGSIZE; + + return OK; +} + +/* + * Swap IP-level options between the RAW PCB and the packet options structure, + * for all options that have their flag set in the packet options structure. + * This function is called twice when sending a packet. The result is that the + * flagged options are overridden for only the packet being sent. 
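+ * That is, the first call saves the PCB's current values into 'pkto' while
+ * installing the per-packet values, and the second call, made right after the
+ * packet has been sent, swaps the saved values back in.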
+ */ +static void +rawsock_swap_opt(struct rawsock * raw, struct pktopt * pkto) +{ + uint8_t tos, ttl, mcast_ttl; + + if (pkto->pkto_flags & PKTOF_TOS) { + tos = raw->raw_pcb->tos; + raw->raw_pcb->tos = pkto->pkto_tos; + pkto->pkto_tos = tos; + } + + if (pkto->pkto_flags & PKTOF_TTL) { + ttl = raw->raw_pcb->ttl; + mcast_ttl = raw_get_multicast_ttl(raw->raw_pcb); + raw->raw_pcb->ttl = pkto->pkto_ttl; + raw_set_multicast_ttl(raw->raw_pcb, pkto->pkto_ttl); + pkto->pkto_ttl = ttl; + pkto->pkto_mcast_ttl = mcast_ttl; + } +} + +/* + * We are about to send the given packet that already includes an IPv4 header, + * because the IP_HDRINCL option is enabled on a raw IPv4 socket. Prepare the + * IPv4 header for sending, by modifying a few fields in it, as expected by + * userland. + */ +static int +rawsock_prepare_hdrincl(struct rawsock * raw, struct pbuf * pbuf, + const ip_addr_t * src_addr) +{ + struct ip_hdr *iphdr; + size_t hlen; + + /* + * lwIP obtains the destination address from the IP packet header in + * this case, so make sure the packet has a full-sized header. + */ + if (pbuf->len < sizeof(struct ip_hdr)) + return EINVAL; + + iphdr = (struct ip_hdr *)pbuf->payload; + + /* + * Fill in the source address if it is not set, and do the byte + * swapping and checksum computation common for the BSDs, without which + * ping(8) and traceroute(8) do not work properly. We consider this a + * convenience feature, so malformed packets are simply sent as is. + * TODO: deal with type punning.. + */ + hlen = (size_t)IPH_HL(iphdr) << 2; + + if (pbuf->len >= hlen) { + /* Fill in the source address if it is blank. */ + if (iphdr->src.addr == PP_HTONL(INADDR_ANY)) { + assert(IP_IS_V4(src_addr)); + + iphdr->src.addr = ip_addr_get_ip4_u32(src_addr); + } + + IPH_LEN(iphdr) = htons(IPH_LEN(iphdr)); + IPH_OFFSET(iphdr) = htons(IPH_OFFSET(iphdr)); + IPH_CHKSUM(iphdr) = 0; + + IPH_CHKSUM(iphdr) = inet_chksum(iphdr, hlen); + } + + return OK; +} + +/* + * Send a packet on a raw socket. + */ +static int +rawsock_send(struct sock * sock, const struct sockdriver_data * data, + size_t len, size_t * off, const struct sockdriver_data * ctl __unused, + socklen_t ctl_len __unused, socklen_t * ctl_off __unused, + const struct sockaddr * addr, socklen_t addr_len, + endpoint_t user_endpt __unused, int flags, size_t min __unused) +{ + struct rawsock *raw = (struct rawsock *)sock; + struct pktopt pktopt; + struct pbuf *pbuf; + struct ifdev *ifdev; + struct netif *netif; + const ip_addr_t *dst_addrp, *src_addrp; + ip_addr_t src_addr, dst_addr; /* for storage only; not always used! */ + size_t hdrlen; + uint32_t ifindex; + err_t err; + int r; + + /* Copy in and parse any packet options. */ + pktopt.pkto_flags = 0; + + if ((r = pktsock_get_ctl(&raw->raw_pktsock, ctl, ctl_len, + &pktopt)) != OK) + return r; + + /* + * For a more in-depth explanation of what is going on here, see the + * udpsock module, which has largely the same code but with more + * elaborate comments. + */ + + /* + * Start by checking whether the source address and/or the outgoing + * interface are overridden using sticky and/or ancillary options. + */ + if ((r = pktsock_get_pktinfo(&raw->raw_pktsock, &pktopt, &ifdev, + &src_addr)) != OK) + return r; + + if (ifdev != NULL && !ip_addr_isany(&src_addr)) { + /* This is guaranteed to be a proper local unicast address. */ + src_addrp = &src_addr; + } else { + src_addrp = &raw->raw_pcb->local_ip; + + /* + * If the socket is bound to a multicast address, use the + * unspecified ('any') address as source address instead. 
A + * real source address will then be selected further below. + */ + if (ip_addr_ismulticast(src_addrp)) + src_addrp = IP46_ADDR_ANY(IP_GET_TYPE(src_addrp)); + } + + /* + * Determine the destination address to use. If the socket is + * connected, always ignore any address provided in the send call. + */ + if (!rawsock_is_conn(raw)) { + assert(addr != NULL); /* already checked in pre_send */ + + if ((r = ipsock_get_dst_addr(rawsock_get_ipsock(raw), addr, + addr_len, src_addrp, &dst_addr, NULL /*dst_port*/)) != OK) + return r; + + dst_addrp = &dst_addr; + } else + dst_addrp = &raw->raw_pcb->remote_ip; + + /* + * If the destination is a multicast address, select the outgoing + * interface based on the multicast interface index, if one is set. + * This must however *not* override an interface index already + * specified using IPV6_PKTINFO, as per RFC 3542 Sec. 6.7. + */ + if (ifdev == NULL && ip_addr_ismulticast(dst_addrp)) { + ifindex = raw_get_multicast_netif_index(raw->raw_pcb); + + if (ifindex != NETIF_NO_INDEX) + ifdev = ifdev_get_by_index(ifindex); /* (may fail) */ + } + + /* + * If an interface has been determined already now, the send operation + * will bypass routing. In that case, we must perform our own checks + * on address zone violations, because those will not be made anywhere + * else. Subsequent steps below will never introduce violations. + */ + if (ifdev != NULL && IP_IS_V6(dst_addrp)) { + if (ifaddr_is_zone_mismatch(ip_2_ip6(dst_addrp), ifdev)) + return EHOSTUNREACH; + + if (IP_IS_V6(src_addrp) && + ifaddr_is_zone_mismatch(ip_2_ip6(src_addrp), ifdev)) + return EHOSTUNREACH; + } + + /* + * If we do not yet have an interface at this point, perform a route + * lookup to determine the outgoing interface, unless MSG_DONTROUTE is + * set. + */ + if (ifdev == NULL) { + if (!(flags & MSG_DONTROUTE)) { + /* + * ip_route() should never be called with an + * IPADDR_TYPE_ANY type address. This is a lwIP- + * internal requirement; while we override both routing + * functions, we do not deviate from it. + */ + if (IP_IS_ANY_TYPE_VAL(*src_addrp)) + src_addrp = + IP46_ADDR_ANY(IP_GET_TYPE(dst_addrp)); + + /* Perform the route lookup. */ + if ((netif = ip_route(src_addrp, dst_addrp)) == NULL) + return EHOSTUNREACH; + + ifdev = netif_get_ifdev(netif); + } else { + if ((ifdev = ifaddr_map_by_subnet(dst_addrp)) == NULL) + return EHOSTUNREACH; + } + } + + /* + * At this point we have an outgoing interface. If we do not have a + * source address yet, pick one now. As a sidenote, if the destination + * address is scoped but has no zone, we could also fill in the zone + * now. We let lwIP handle that instead, though. + */ + assert(ifdev != NULL); + + if (ip_addr_isany(src_addrp)) { + src_addrp = ifaddr_select(dst_addrp, ifdev, NULL /*ifdevp*/); + + if (src_addrp == NULL) + return EHOSTUNREACH; + } + + /* + * Now that we know the full conditions of what we are about to send, + * check whether the packet size leaves enough room for lwIP to prepend + * headers. If so, allocate a chain of pbufs for the packet. + */ + assert(len <= RAW_MAX_PAYLOAD); + + if (rawsock_is_hdrincl(raw)) + hdrlen = 0; + else if (IP_IS_V6(dst_addrp)) + hdrlen = IP6_HLEN; + else + hdrlen = IP_HLEN; + + if (hdrlen + len > RAW_MAX_PAYLOAD) + return EMSGSIZE; + + if ((pbuf = pchain_alloc(PBUF_IP, len)) == NULL) + return ENOBUFS; + + /* Copy in the packet data. 
*/ + if ((r = pktsock_get_data(&raw->raw_pktsock, data, len, pbuf)) != OK) { + pbuf_free(pbuf); + + return r; + } + + /* + * If the user has turned on IPV6_CHECKSUM, ensure that the packet is + * not only large enough to have the checksum stored at the configured + * place, but also that the checksum fits within the first pbuf: if we + * do not test this here, an assert will trigger in lwIP later. Also + * zero out the checksum field first, because lwIP does not do that. + */ + if (raw->raw_pcb->chksum_reqd) { + if (pbuf->len < raw->raw_pcb->chksum_offset + + sizeof(uint16_t)) { + pbuf_free(pbuf); + + return EINVAL; + } + + memset((char *)pbuf->payload + raw->raw_pcb->chksum_offset, 0, + sizeof(uint16_t)); + } + + /* + * For sockets where an IPv4 header is already included in the packet, + * we need to alter a few header fields to be compatible with BSD. + */ + if (rawsock_is_hdrincl(raw) && + (r = rawsock_prepare_hdrincl(raw, pbuf, src_addrp)) != OK) { + pbuf_free(pbuf); + + return r; + } + + /* Set broadcast/multicast flags for accounting purposes. */ + if (ip_addr_ismulticast(dst_addrp)) + pbuf->flags |= PBUF_FLAG_LLMCAST; + else if (ip_addr_isbroadcast(dst_addrp, ifdev_get_netif(ifdev))) + pbuf->flags |= PBUF_FLAG_LLBCAST; + + /* Send the packet. */ + rawsock_swap_opt(raw, &pktopt); + + assert(!ip_addr_isany(src_addrp)); + assert(!ip_addr_ismulticast(src_addrp)); + + err = raw_sendto_if_src(raw->raw_pcb, pbuf, dst_addrp, + ifdev_get_netif(ifdev), src_addrp); + + rawsock_swap_opt(raw, &pktopt); + + /* Free the pbuf again. */ + pbuf_free(pbuf); + + /* + * On success, make sure to return the size of the sent packet as well. + * As an aside: ctl_off need not be updated, as it is not returned. + */ + if ((r = util_convert_err(err)) == OK) + *off = len; + return r; +} + +/* + * Update the set of flag-type socket options on a raw socket. + */ +static void +rawsock_setsockmask(struct sock * sock, unsigned int mask) +{ + struct rawsock *raw = (struct rawsock *)sock; + + /* + * FIXME: raw sockets are not supposed to have a broardcast check, so + * perhaps just remove this and instead always set SOF_BROADCAST? + */ + if (mask & SO_BROADCAST) + ip_set_option(raw->raw_pcb, SOF_BROADCAST); + else + ip_reset_option(raw->raw_pcb, SOF_BROADCAST); +} + +/* + * Prepare a helper structure for IP-level option processing. + */ +static void +rawsock_get_ipopts(struct rawsock * raw, struct ipopts * ipopts) +{ + + ipopts->local_ip = &raw->raw_pcb->local_ip; + ipopts->remote_ip = &raw->raw_pcb->remote_ip; + ipopts->tos = &raw->raw_pcb->tos; + ipopts->ttl = &raw->raw_pcb->ttl; + ipopts->sndmin = RAW_SNDBUF_MIN; + ipopts->sndmax = RAW_SNDBUF_MAX; + ipopts->rcvmin = RAW_RCVBUF_MIN; + ipopts->rcvmax = RAW_RCVBUF_MAX; +} + +/* + * Set socket options on a raw socket. + */ +static int +rawsock_setsockopt(struct sock * sock, int level, int name, + const struct sockdriver_data * data, socklen_t len) +{ + struct rawsock *raw = (struct rawsock *)sock; + struct ipopts ipopts; + struct icmp6_filter filter; + ip_addr_t ipaddr; + struct in_addr in_addr; + struct ifdev *ifdev; + unsigned int flags; + uint32_t ifindex; + uint8_t byte; + int r, val; + + /* + * Unfortunately, we have to duplicate most of the multicast options + * rather than sharing them with udpsock at the pktsock level. The + * reason is that each of the PCBs have their own multicast abstraction + * functions and so we cannot merge the rest. Same for getsockopt. 
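+ * Concretely, the duplication concerns the IP_MULTICAST_{IF,LOOP,TTL} and
+ * IPV6_MULTICAST_{IF,LOOP,HOPS} cases below; the other options handled here
+ * (IP_HDRINCL, IPV6_CHECKSUM, ICMP6_FILTER) apply to raw sockets only anyway.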
+ */ + + switch (level) { + case IPPROTO_IP: + if (rawsock_is_ipv6(raw)) + break; + + switch (name) { + case IP_HDRINCL: + if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), + len)) != OK) + return r; + + if (val) { + raw_setflags(raw->raw_pcb, + raw_flags(raw->raw_pcb) | + RAW_FLAGS_HDRINCL); + } else { + raw_setflags(raw->raw_pcb, + raw_flags(raw->raw_pcb) & + ~RAW_FLAGS_HDRINCL); + } + + return OK; + + case IP_MULTICAST_IF: + pktsock_set_mcaware(&raw->raw_pktsock); + + if ((r = sockdriver_copyin_opt(data, &in_addr, + sizeof(in_addr), len)) != OK) + return r; + + ip_addr_set_ip4_u32(&ipaddr, in_addr.s_addr); + + if ((ifdev = ifaddr_map_by_addr(&ipaddr)) == NULL) + return EADDRNOTAVAIL; + + raw_set_multicast_netif_index(raw->raw_pcb, + ifdev_get_index(ifdev)); + + return OK; + + case IP_MULTICAST_LOOP: + pktsock_set_mcaware(&raw->raw_pktsock); + + if ((r = sockdriver_copyin_opt(data, &byte, + sizeof(byte), len)) != OK) + return r; + + flags = raw_flags(raw->raw_pcb); + + if (byte) + flags |= RAW_FLAGS_MULTICAST_LOOP; + else + flags &= ~RAW_FLAGS_MULTICAST_LOOP; + + raw_setflags(raw->raw_pcb, flags); + + return OK; + + case IP_MULTICAST_TTL: + pktsock_set_mcaware(&raw->raw_pktsock); + + if ((r = sockdriver_copyin_opt(data, &byte, + sizeof(byte), len)) != OK) + return r; + + raw_set_multicast_ttl(raw->raw_pcb, byte); + + return OK; + } + + break; + + case IPPROTO_IPV6: + if (!rawsock_is_ipv6(raw)) + break; + + switch (name) { + case IPV6_CHECKSUM: + /* ICMPv6 checksums are always computed. */ + if (raw->raw_pcb->protocol == IPPROTO_ICMPV6) + return EINVAL; + + if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), + len)) != OK) + return r; + + if (val == -1) { + raw->raw_pcb->chksum_reqd = 0; + + return OK; + } else if (val >= 0 && !(val & 1)) { + raw->raw_pcb->chksum_reqd = 1; + raw->raw_pcb->chksum_offset = val; + + return OK; + } else + return EINVAL; + + case IPV6_MULTICAST_IF: + pktsock_set_mcaware(&raw->raw_pktsock); + + if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), + len)) != OK) + return r; + + if (val != 0) { + ifindex = (uint32_t)val; + + ifdev = ifdev_get_by_index(ifindex); + + if (ifdev == NULL) + return ENXIO; + } else + ifindex = NETIF_NO_INDEX; + + raw_set_multicast_netif_index(raw->raw_pcb, ifindex); + + return OK; + + case IPV6_MULTICAST_LOOP: + pktsock_set_mcaware(&raw->raw_pktsock); + + if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), + len)) != OK) + return r; + + if (val < 0 || val > 1) + return EINVAL; + + flags = raw_flags(raw->raw_pcb); + + if (val) + flags |= RAW_FLAGS_MULTICAST_LOOP; + else + flags &= ~RAW_FLAGS_MULTICAST_LOOP; + + /* + * lwIP's IPv6 functionality does not actually check + * this flag at all yet. We set it in the hope that + * one day this will magically start working. + */ + raw_setflags(raw->raw_pcb, flags); + + return OK; + + case IPV6_MULTICAST_HOPS: + pktsock_set_mcaware(&raw->raw_pktsock); + + if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), + len)) != OK) + return r; + + if (val < -1 || val > UINT8_MAX) + return EINVAL; + + if (val == -1) + val = 1; + + raw_set_multicast_ttl(raw->raw_pcb, val); + + return OK; + } + + break; + + case IPPROTO_ICMPV6: + if (!rawsock_is_ipv6(raw) || + raw->raw_pcb->protocol != IPPROTO_ICMPV6) + break; + + switch (name) { + case ICMP6_FILTER: + /* Who comes up with these stupid exceptions? 
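+ * (The exception being that a zero-length option value resets the
+ * filter to pass all ICMPv6 types again, which is what the check
+ * below implements.)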
*/ + if (len == 0) { + ICMP6_FILTER_SETPASSALL(&raw->raw_icmp6filter); + + return OK; + } + + if ((r = sockdriver_copyin_opt(data, &filter, + sizeof(filter), len)) != OK) + return r; + + /* + * As always, never copy in the data into the actual + * destination, as any copy may run into a copy fault + * halfway through, potentially leaving the destination + * in a half-updated and thus corrupted state. + */ + memcpy(&raw->raw_icmp6filter, &filter, sizeof(filter)); + + return OK; + } + } + + rawsock_get_ipopts(raw, &ipopts); + + return pktsock_setsockopt(&raw->raw_pktsock, level, name, data, len, + &ipopts); +} + +/* + * Retrieve socket options on a raw socket. + */ +static int +rawsock_getsockopt(struct sock * sock, int level, int name, + const struct sockdriver_data * data, socklen_t * len) +{ + struct rawsock *raw = (struct rawsock *)sock; + struct ipopts ipopts; + const ip4_addr_t *ip4addr; + struct in_addr in_addr; + struct ifdev *ifdev; + unsigned int flags; + uint32_t ifindex; + uint8_t byte; + int val; + + switch (level) { + case IPPROTO_IP: + if (rawsock_is_ipv6(raw)) + break; + + switch (name) { + case IP_HDRINCL: + val = !!rawsock_is_hdrincl(raw); + + return sockdriver_copyout_opt(data, &val, sizeof(val), + len); + + case IP_MULTICAST_IF: + ifindex = raw_get_multicast_netif_index(raw->raw_pcb); + + /* + * Map back from the interface index to the IPv4 + * address assigned to the corresponding interface. + * Should this not work out, return the 'any' address. + */ + if (ifindex != NETIF_NO_INDEX && + (ifdev = ifdev_get_by_index(ifindex)) != NULL) { + ip4addr = + netif_ip4_addr(ifdev_get_netif(ifdev)); + + in_addr.s_addr = ip4_addr_get_u32(ip4addr); + } else + in_addr.s_addr = PP_HTONL(INADDR_ANY); + + return sockdriver_copyout_opt(data, &in_addr, + sizeof(in_addr), len); + + case IP_MULTICAST_LOOP: + flags = raw_flags(raw->raw_pcb); + + byte = !!(flags & RAW_FLAGS_MULTICAST_LOOP); + + return sockdriver_copyout_opt(data, &byte, + sizeof(byte), len); + + case IP_MULTICAST_TTL: + byte = raw_get_multicast_ttl(raw->raw_pcb); + + return sockdriver_copyout_opt(data, &byte, + sizeof(byte), len); + } + + break; + + case IPPROTO_IPV6: + if (!rawsock_is_ipv6(raw)) + break; + + switch (name) { + case IPV6_CHECKSUM: + if (raw->raw_pcb->chksum_reqd) + val = raw->raw_pcb->chksum_offset; + else + val = -1; + + return sockdriver_copyout_opt(data, &val, sizeof(val), + len); + + case IPV6_MULTICAST_IF: + ifindex = raw_get_multicast_netif_index(raw->raw_pcb); + + val = (int)ifindex; + + return sockdriver_copyout_opt(data, &val, sizeof(val), + len); + + case IPV6_MULTICAST_LOOP: + flags = raw_flags(raw->raw_pcb); + + val = !!(flags & RAW_FLAGS_MULTICAST_LOOP); + + return sockdriver_copyout_opt(data, &val, sizeof(val), + len); + + case IPV6_MULTICAST_HOPS: + val = raw_get_multicast_ttl(raw->raw_pcb); + + return sockdriver_copyout_opt(data, &val, sizeof(val), + len); + } + + break; + + case IPPROTO_ICMPV6: + if (!rawsock_is_ipv6(raw) || + raw->raw_pcb->protocol != IPPROTO_ICMPV6) + break; + + switch (name) { + case ICMP6_FILTER: + return sockdriver_copyout_opt(data, + &raw->raw_icmp6filter, + sizeof(raw->raw_icmp6filter), len); + } + + break; + } + + rawsock_get_ipopts(raw, &ipopts); + + return pktsock_getsockopt(&raw->raw_pktsock, level, name, data, len, + &ipopts); +} + +/* + * Retrieve the local socket address of a raw socket. 
+ */ +static int +rawsock_getsockname(struct sock * sock, struct sockaddr * addr, + socklen_t * addr_len) +{ + struct rawsock *raw = (struct rawsock *)sock; + + ipsock_put_addr(rawsock_get_ipsock(raw), addr, addr_len, + &raw->raw_pcb->local_ip, 0 /*port*/); + + return OK; +} + +/* + * Retrieve the remote socket address of a raw socket. + */ +static int +rawsock_getpeername(struct sock * sock, struct sockaddr * addr, + socklen_t * addr_len) +{ + struct rawsock *raw = (struct rawsock *)sock; + + if (!rawsock_is_conn(raw)) + return ENOTCONN; + + ipsock_put_addr(rawsock_get_ipsock(raw), addr, addr_len, + &raw->raw_pcb->remote_ip, 0 /*port*/); + + return OK; +} + +/* + * Shut down a raw socket for reading and/or writing. + */ +static int +rawsock_shutdown(struct sock * sock, unsigned int mask) +{ + struct rawsock *raw = (struct rawsock *)sock; + + if (mask & SFL_SHUT_RD) + raw_recv(raw->raw_pcb, NULL, NULL); + + pktsock_shutdown(&raw->raw_pktsock, mask); + + return OK; +} + +/* + * Close a raw socket. + */ +static int +rawsock_close(struct sock * sock, int force __unused) +{ + struct rawsock *raw = (struct rawsock *)sock; + + raw_recv(raw->raw_pcb, NULL, NULL); + + raw_remove(raw->raw_pcb); + raw->raw_pcb = NULL; + + pktsock_close(&raw->raw_pktsock); + + return OK; +} + +/* + * Free up a closed raw socket. + */ +static void +rawsock_free(struct sock * sock) +{ + struct rawsock *raw = (struct rawsock *)sock; + + assert(raw->raw_pcb == NULL); + + TAILQ_REMOVE(&raw_activelist, raw, raw_next); + + TAILQ_INSERT_HEAD(&raw_freelist, raw, raw_next); +} + +/* + * Fill the given kinfo_pcb sysctl(7) structure with information about the RAW + * PCB identified by the given pointer. + */ +static void +rawsock_get_info(struct kinfo_pcb * ki, const void * ptr) +{ + const struct raw_pcb *pcb = (const struct raw_pcb *)ptr; + struct rawsock *raw; + + /* We iterate our own list so we can't find "strange" PCBs. */ + raw = (struct rawsock *)pcb->recv_arg; + assert(raw >= raw_array && + raw < &raw_array[__arraycount(raw_array)]); + + ki->ki_type = SOCK_RAW; + ki->ki_protocol = pcb->protocol; + + ipsock_get_info(ki, &pcb->local_ip, 0 /*local_port*/, + &raw->raw_pcb->remote_ip, 0 /*remote_port*/); + + /* TODO: change this so that sockstat(1) may work one day. */ + ki->ki_sockaddr = (uint64_t)(uintptr_t)rawsock_get_sock(raw); + + ki->ki_rcvq = pktsock_get_recvlen(&raw->raw_pktsock); + + if (rawsock_is_hdrincl(raw)) + ki->ki_pflags |= INP_HDRINCL; +} + +/* + * Given either NULL or a previously returned RAW PCB pointer, return the first + * or next RAW PCB pointer, or NULL if there are no more. lwIP does not expose + * 'raw_pcbs', but other modules in this service may also use RAW PCBs (which + * should then stay hidden), so we iterate through our own list instead. + */ +static const void * +rawsock_enum(const void * last) +{ + const struct raw_pcb *pcb; + struct rawsock *raw; + + if (last != NULL) { + pcb = (const struct raw_pcb *)last; + + raw = (struct rawsock *)pcb->recv_arg; + assert(raw >= raw_array && + raw < &raw_array[__arraycount(raw_array)]); + + raw = TAILQ_NEXT(raw, raw_next); + } else + raw = TAILQ_FIRST(&raw_activelist); + + if (raw != NULL) + return raw->raw_pcb; + else + return NULL; +} + +/* + * Obtain the list of RAW protocol control blocks, for sysctl(7). 
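+ * This backs the net.inet.raw.pcblist and net.inet6.raw6.pcblist nodes
+ * registered above, which tools such as netstat(1) may use to enumerate the
+ * active PCBs.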
+ */ +static ssize_t +rawsock_pcblist(struct rmib_call * call, struct rmib_node * node, + struct rmib_oldp * oldp, struct rmib_newp * newp __unused) +{ + + return util_pcblist(call, oldp, rawsock_enum, rawsock_get_info); +} + +static const struct sockevent_ops rawsock_ops = { + .sop_bind = rawsock_bind, + .sop_connect = rawsock_connect, + .sop_pre_send = rawsock_pre_send, + .sop_send = rawsock_send, + .sop_pre_recv = pktsock_pre_recv, + .sop_recv = pktsock_recv, + .sop_test_recv = pktsock_test_recv, + .sop_ioctl = ifconf_ioctl, + .sop_setsockmask = rawsock_setsockmask, + .sop_setsockopt = rawsock_setsockopt, + .sop_getsockopt = rawsock_getsockopt, + .sop_getsockname = rawsock_getsockname, + .sop_getpeername = rawsock_getpeername, + .sop_shutdown = rawsock_shutdown, + .sop_close = rawsock_close, + .sop_free = rawsock_free +}; diff --git a/minix/net/lwip/route.c b/minix/net/lwip/route.c new file mode 100644 index 000000000..11a77fb33 --- /dev/null +++ b/minix/net/lwip/route.c @@ -0,0 +1,1654 @@ +/* LWIP service - route.c - route management */ +/* + * This module provides a destination-based routing implementation, roughly + * matching the routing as done traditionally by the BSDs and by current NetBSD + * in particular. As such, this implementation almost completely replaces + * lwIP's own more limited (and less rigid) routing algorithms. It does this + * using a combination of overriding lwIP functions (ip4_route, ip6_route) with + * weak-symbol patching, and lwIP-provided gateway hooks. Especially the + * former gives us a level of control that lwIP's routing hooks do not provide: + * not only does such overriding give us the ability to flag that no route was + * found at all, we also bypass a number of default decisions taken by lwIP + * where the routing hooks are not called at all. + * + * As a result, the routing tables as visible to the user are an almost + * completely accurate reflection of the routing decisions taken by this TCP/IP + * stack in practice. There is currently only one exception: for IPv4 gateway + * selection, lwIP will bypass the gateway hook if the given address is on the + * local subnet according to the locally assigned IP address and subnet mask. + * This exception should practically affect noone, though. + * + * Our routing implementation differs from NetBSD's in various aspects, though. + * Perhaps the most important one, also noted elsewhere, is that we do not + * support the coexistence of an all-bits-set network route and a host route + * for the same IP address. If necessary, this issue can be resolved. + * + * We use a custom concept of "immutable" routes for local addresses, which are + * a somewhat special case as explained in the ifaddr module. Since those + * RTF_LOCAL routes cannot be deleted, a small change is made to the route(8) + * flush-all command to skip them. Packets directed at local addresses on + * non-loopback interfaces are handled in a way that differs from NetBSD's, + * too. This is explained in the ifdev module. + * + * The BSDs support special routes that reject or blackhole packets, based on + * routing flags. We support such routes as well, but implement them somewhat + * differently from the BSDs: such packets always get routed over a loopback + * interface (regardless of their associated interface), in order to save on + * routing lookups for packets in the common case. + * + * As general rules of thumb: if there is no route to a destination, assignment + * of a local address will already fail with a "no route to host" error. 
If + * there is an RTF_REJECT route, a local address will be assigned, but actual + * packets will be routed to a loopback interface and result in a "no route to + * host" error upon reception there - this is what NetBSD seems to do too, even + * though the documentation says that RTF_REJECT routes generate ICMP messages + * instead. RTF_BLACKHOLE behaves similarly to RTF_REJECT, except that the + * packet is simply discarded upon receipt by the loopback interface. + * + * In various places, both here and elsewhere, we check to make sure that on + * routing and output, scoped IPv6 source and destination addresses never leave + * their zone. For example, a packet must not be sent to an outgoing interface + * if its source address is a link-local address with a zone for another + * interface. lwIP does not check for such violations, and so we must make + * sure that this does not happen ourselves. + * + * Normally, one would tell lwIP to use a particular default IPv4 gateway by + * associating the gateway address to a particular interface, and then setting + * that interface as default interface (netif_default). We explicitly do + * neither of these things. Instead, the routing hooks should return the + * default route whenever applicable, and the gateway hooks should return the + * default route's gateway IP address whenever needed. + * + * Due to lwIP's limited set of error codes, we do not properly distinguish + * between cases where EHOSTUNREACH or ENETUNREACH should be thrown, and throw + * the former in most cases. + */ + +#include "lwip.h" +#include "ifaddr.h" +#include "rttree.h" +#include "rtsock.h" +#include "route.h" +#include "lldata.h" + +#include "lwip/nd6.h" + +/* + * The maximum number of uint8_t bytes needed to represent a routing address. + * This value is the maximum of 4 (for IPv4) and 16 (for IPv6). + */ +#define ROUTE_ADDR_MAX (MAX(IP4_BITS, IP6_BITS) / NBBY) + +/* + * We use a shared routing entry data structure for IPv4 and IPv6 routing + * entries. The result is cleaner code at the cost of (currently) about 2.3KB + * of memory wasted (costing 12 bytes per address for three addresses for 64 of + * the 128 routing entries that would be for IPv4), although with the benefit + * that either address family may use more than half of the routing entries. + * From that 2.3KB, 1KB can be reclaimed by moving the destination address and + * mask into the rttree_entry data structure, at the cost of its generality. + */ +struct route_entry { + struct rttree_entry re_entry; /* routing tree entry */ + union pxfer_re_pu { + struct ifdev *repu_ifdev; /* associated interface */ + SIMPLEQ_ENTRY(route_entry) repu_next; /* next free pointer */ + } re_pu; + unsigned int re_flags; /* routing flags (RTF_) */ + unsigned int re_use; /* number of times used */ + uint8_t re_addr[ROUTE_ADDR_MAX]; /* destination address */ + uint8_t re_mask[ROUTE_ADDR_MAX]; /* destination mask */ + union ixfer_re_gu { + ip4_addr_p_t regu_gw4; /* gateway (IPv4) */ + ip6_addr_p_t regu_gw6; /* gateway (IPv6) */ + } re_gu; +}; +#define re_ifdev re_pu.repu_ifdev +#define re_next re_pu.repu_next +#define re_gw4 re_gu.regu_gw4 +#define re_gw6 re_gu.regu_gw6 + +/* Routes for local addresses are immutable, for reasons explained in ifdev. */ +#define route_is_immutable(route) ((route)->re_flags & RTF_LOCAL) + +/* + * We override a subset of the BSD routing flags in order to store our own + * local settings. In particular, we have to have a way to store whether a + * route is for an IPv4 or IPv6 destination address. 
We override BSD's + * RTF_DONE flag for this: RTF_DONE is only used with routing sockets, and + * never associated with actual routes. In contrast, RTF_IPV6 is only used + * with actual routes, and never sent across routing sockets. In general, + * overriding flags is preferable to adding new ones, as BSD might later add + * more flags itself as well, while it can never remove existing flags. + */ +#define RTF_IPV6 RTF_DONE /* route is for an IPv6 destination */ + +/* The total number of routing entries (IPv4 and IPv6 combined). */ +#define NR_ROUTE_ENTRY 128 + +static struct route_entry route_array[NR_ROUTE_ENTRY]; /* routing entries */ + +static SIMPLEQ_HEAD(, route_entry) route_freelist; /* free entry list */ + +/* The routing trees. There are two: one for IPv4 and one for IPv6. */ +#define ROUTE_TREE_V4 0 +#define ROUTE_TREE_V6 1 +#define NR_ROUTE_TREE 2 + +static struct rttree route_tree[NR_ROUTE_TREE]; + +/* We support a single cached routing entry per address family (IPv4, IPv6). */ +static int rtcache_v4set; +static ip4_addr_t rtcache_v4addr; +static struct route_entry *rtcache_v4route; + +static int rtcache_v6set; +static ip6_addr_t rtcache_v6addr; +static struct route_entry *rtcache_v6route; + +/* + * Initialize the routing cache. There are a lot of trivial functions here, + * but this is designed to be extended in the future. + */ +static void +rtcache_init(void) +{ + + rtcache_v4set = FALSE; + rtcache_v6set = FALSE; +} + +/* + * Look up the given IPv4 address in the routing cache. If there is a match, + * return TRUE with the associated route in 'route', possibly NULL if a + * negative result was cached. Return FALSE if the routing cache does not + * cache the given IPv4 address. + */ +static inline int +rtcache_lookup_v4(const ip4_addr_t * ipaddr, struct route_entry ** route) +{ + + if (rtcache_v4set && ip4_addr_cmp(&rtcache_v4addr, ipaddr)) { + *route = rtcache_v4route; + + return TRUE; + } else + return FALSE; +} + +/* + * Add the given IPv4 address and the given routing entry (NULL for negative + * caching) to the routing cache. + */ +static inline void +rtcache_add_v4(const ip4_addr_t * ipaddr, struct route_entry * route) +{ + + rtcache_v4addr = *ipaddr; + rtcache_v4route = route; + rtcache_v4set = TRUE; +} + +/* + * Reset the IPv4 routing cache. + */ +static void +rtcache_reset_v4(void) +{ + + rtcache_v4set = FALSE; +} + +/* + * Look up the given IPv6 address in the routing cache. If there is a match, + * return TRUE with the associated route in 'route', possibly NULL if a + * negative result was cached. Return FALSE if the routing cache does not + * cache the given IPv6 address. + */ +static inline int +rtcache_lookup_v6(const ip6_addr_t * ipaddr, struct route_entry ** route) +{ + + if (rtcache_v6set && ip6_addr_cmp(&rtcache_v6addr, ipaddr)) { + *route = rtcache_v6route; + + return TRUE; + } else + return FALSE; +} + +/* + * Add the given IPv6 address and the given routing entry (NULL for negative + * caching) to the routing cache. Caching of scoped addresses without zones is + * not supported. + */ +static inline void +rtcache_add_v6(const ip6_addr_t * ipaddr, struct route_entry * route) +{ + + rtcache_v6addr = *ipaddr; + rtcache_v6route = route; + rtcache_v6set = TRUE; +} + +/* + * Reset the IPv6 routing cache. + */ +static void +rtcache_reset_v6(void) +{ + + rtcache_v6set = FALSE; +} + +/* + * Initialize the routing module. + */ +void +route_init(void) +{ + unsigned int slot; + + /* Initialize the routing trees. 
*/
+ rttree_init(&route_tree[ROUTE_TREE_V4], IP4_BITS);
+ rttree_init(&route_tree[ROUTE_TREE_V6], IP6_BITS);
+
+ /* Initialize the list of free routing entries. */
+ SIMPLEQ_INIT(&route_freelist);
+
+ for (slot = 0; slot < __arraycount(route_array); slot++)
+ SIMPLEQ_INSERT_TAIL(&route_freelist, &route_array[slot],
+ re_next);
+
+ /* Reset the routing cache. */
+ rtcache_init();
+}
+
+/*
+ * Prepare for a routing tree operation by converting the given IPv4 address
+ * into a raw address that can be used in that routing tree operation.
+ */
+static inline void
+route_prepare_v4(const ip4_addr_t * ip4addr, uint8_t rtaddr[ROUTE_ADDR_MAX])
+{
+ uint32_t val;
+
+ val = ip4_addr_get_u32(ip4addr);
+
+ memcpy(rtaddr, &val, sizeof(val));
+}
+
+/*
+ * Prepare for a routing tree operation by converting the given IPv6 address
+ * into a raw address that can be used in that routing tree operation. If the
+ * given prefix length allows for it, also incorporate the address zone.
+ */
+static inline void
+route_prepare_v6(const ip6_addr_t * ip6addr, unsigned int prefix,
+ uint8_t rtaddr[ROUTE_ADDR_MAX])
+{
+
+ assert(sizeof(ip6addr->addr) == IP6_BITS / NBBY);
+
+ /*
+ * TODO: in most cases, we could actually return a pointer to the
+ * address contained in the given lwIP IP address structure. However,
+ * doing so would make a lot of things quite a bit messier around here,
+ * but the small performance gain may still make it worth it.
+ */
+ memcpy(rtaddr, ip6addr->addr, sizeof(ip6addr->addr));
+
+ /*
+ * Embed the zone ID into the address, KAME style. This is the
+ * easiest way to have link-local addresses for multiple interfaces
+ * coexist in a single routing tree. Do this only if the full zone ID
+ * would be included in the prefix though, or we might de-normalize the
+ * address.
+ */
+ if (ip6_addr_has_zone(ip6addr) && prefix >= 32)
+ rtaddr[3] = ip6_addr_zone(ip6addr);
+}
+
+/*
+ * Prepare for a routing tree operation by converting the given IP address into
+ * a raw address that can be used in that routing tree operation. The given
+ * address's zone ID is embedded "KAME-style" into the raw (IPv6) address when
+ * applicable and if the given prefix length allows for it. Return the index
+ * of the routing tree to use (ROUTE_TREE_V4 or ROUTE_TREE_V6).
+ */
+static unsigned int
+route_prepare(const ip_addr_t * ipaddr, unsigned int prefix,
+ uint8_t rtaddr[ROUTE_ADDR_MAX])
+{
+
+ switch (IP_GET_TYPE(ipaddr)) {
+ case IPADDR_TYPE_V4:
+ route_prepare_v4(ip_2_ip4(ipaddr), rtaddr);
+
+ return ROUTE_TREE_V4;
+
+ case IPADDR_TYPE_V6:
+ route_prepare_v6(ip_2_ip6(ipaddr), prefix, rtaddr);
+
+ return ROUTE_TREE_V6;
+
+ default:
+ panic("unknown IP address type: %u", IP_GET_TYPE(ipaddr));
+ }
+}
+
+/*
+ * The given routing tree (ROUTE_TREE_V4 or ROUTE_TREE_V6) has been updated.
+ * Invalidate any cache entries that may now have become stale, both locally
+ * and in lwIP.
+ */
+static void
+route_updated(unsigned int tree)
+{
+
+ if (tree == ROUTE_TREE_V6) {
+ rtcache_reset_v6();
+
+ /*
+ * Also clear the lwIP ND6 destination cache, which may now
+ * contain entries for the wrong gateway.
+ */
+ nd6_clear_destination_cache();
+ } else
+ rtcache_reset_v4();
+}
+
+/*
+ * Add a route to the appropriate routing table. The address, address zone,
+ * prefix, and RTF_HOST flag in the flags field make up the identity of the
+ * route. If the flags field contains RTF_GATEWAY, a gateway must be given;
+ * otherwise, it must be NULL. The route is associated with the given
+ * interface, which may not be NULL.
The caller must ensure that the flags
+ * field does not contain unsupported flags. On success, return OK, and also
+ * announce the addition. On failure, return a negative error code.
+ */
+int
+route_add(const ip_addr_t * addr, unsigned int prefix,
+ const ip_addr_t * gateway, struct ifdev * ifdev, unsigned int flags,
+ const struct rtsock_request * rtr)
+{
+ struct route_entry *route;
+ unsigned int tree, byte;
+ int r;
+
+ assert(flags & RTF_UP);
+ assert(!!(flags & RTF_GATEWAY) == (gateway != NULL));
+ assert(ifdev != NULL);
+
+ /* Get a routing entry, if any are available. */
+ if (SIMPLEQ_EMPTY(&route_freelist))
+ return ENOBUFS;
+
+ route = SIMPLEQ_FIRST(&route_freelist);
+
+ /*
+ * Perform sanity checks on the input, and fill in enough of the
+ * routing entry to be able to try and add it to the routing tree.
+ */
+ memset(route->re_addr, 0, sizeof(route->re_addr));
+
+ tree = route_prepare(addr, prefix, route->re_addr);
+
+ switch (tree) {
+ case ROUTE_TREE_V4:
+ if (prefix > IP4_BITS ||
+ (prefix != IP4_BITS && (flags & RTF_HOST)))
+ return EINVAL;
+
+ flags &= ~RTF_IPV6;
+
+ break;
+
+ case ROUTE_TREE_V6:
+ if (prefix > IP6_BITS ||
+ (prefix != IP6_BITS && (flags & RTF_HOST)))
+ return EINVAL;
+
+ flags |= RTF_IPV6;
+
+ break;
+
+ default:
+ return EINVAL;
+ }
+
+ /* Generate the (raw) network mask. This is protocol agnostic! */
+ addr_make_netmask(route->re_mask, sizeof(route->re_mask), prefix);
+
+ /* The given address must be normalized to its mask. */
+ for (byte = 0; byte < __arraycount(route->re_addr); byte++)
+ if ((route->re_addr[byte] & ~route->re_mask[byte]) != 0)
+ return EINVAL;
+
+ /*
+ * Attempt to add the routing entry. Host-type entries do not have an
+ * associated mask, enabling ever-so-slightly faster matching.
+ */
+ if ((r = rttree_add(&route_tree[tree], &route->re_entry,
+ route->re_addr, (flags & RTF_HOST) ? NULL : route->re_mask,
+ prefix)) != OK)
+ return r;
+
+ /*
+ * Success. Finish the routing entry. Remove the entry from the free
+ * list before assigning re_ifdev, as these two use the same memory.
+ */
+ SIMPLEQ_REMOVE_HEAD(&route_freelist, re_next);
+
+ route->re_ifdev = ifdev;
+ route->re_flags = flags;
+
+ /*
+ * Store the gateway if one is given. Store the address in lwIP format
+ * because that is the easiest way to use it later again. Store it as a
+ * union to keep the route entry structure as small as possible. Store
+ * the address without its zone, because the gateway's address zone is
+ * implied by its associated ifdev.
+ *
+ * If no gateway is given, this is a link-type route, i.e., a route for
+ * a local network, with all nodes directly connected and reachable.
+ */
+ if (flags & RTF_GATEWAY) {
+ if (flags & RTF_IPV6)
+ ip6_addr_copy_to_packed(route->re_gw6,
+ *ip_2_ip6(gateway));
+ else
+ ip4_addr_copy(route->re_gw4, *ip_2_ip4(gateway));
+ }
+
+ /* We have made routing changes. */
+ route_updated(tree);
+
+ /* Announce the route addition. */
+ rtsock_msg_route(route, RTM_ADD, rtr);
+
+ return OK;
+}
+
+/*
+ * Check whether it is possible to add a route for the given destination to the
+ * corresponding routing table, that is, a subsequent route_add() call for this
+ * destination address is guaranteed to succeed (if all its parameters are
+ * valid). Return TRUE if adding the route is guaranteed to succeed, or FALSE
+ * if creating a route for the given destination would fail.
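+ *
+ * A purely illustrative usage sketch (the 'addr', 'prefix', and 'ifdev'
+ * values are assumed to have been prepared and validated by the caller):
+ *
+ *	if (!route_can_add(&addr, prefix, FALSE))
+ *		return ENOBUFS;
+ *	... perform work that cannot easily be undone ...
+ *	(void)route_add(&addr, prefix, NULL, ifdev, RTF_UP, NULL);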
+ */ +int +route_can_add(const ip_addr_t * addr, unsigned int prefix, + int is_host __unused) +{ + uint8_t rtaddr[ROUTE_ADDR_MAX]; + unsigned int tree; + + tree = route_prepare(addr, prefix, rtaddr); + + /* + * The corresponding routing tree must not already contain an exact + * match for the destination. If the routing tree implementation is + * ever extended with support for coexisting host and net entries with + * the same prefix, we should also pass in 'is_host' here. + */ + if (rttree_lookup_exact(&route_tree[tree], rtaddr, prefix) != NULL) + return FALSE; + + /* There must be a routing entry on the free list as well. */ + return !SIMPLEQ_EMPTY(&route_freelist); +} + +/* + * Find a route with the exact given route identity. Return the route if + * found, or NULL if no route exists with this identity. + */ +struct route_entry * +route_find(const ip_addr_t * addr, unsigned int prefix, int is_host) +{ + struct rttree_entry *entry; + struct route_entry *route; + uint8_t rtaddr[ROUTE_ADDR_MAX]; + unsigned int tree; + + tree = route_prepare(addr, prefix, rtaddr); + + entry = rttree_lookup_exact(&route_tree[tree], rtaddr, prefix); + if (entry == NULL) + return NULL; + + route = (struct route_entry *)entry; + + /* + * As long as the routing tree code does not support coexisting host + * and net entries with the same prefix, we have to check the type. + */ + if (!!(route->re_flags & RTF_HOST) != is_host) + return NULL; + + return route; +} + +/* + * A route lookup failed for the given IP address. Generate an RTM_MISS + * message on routing sockets. + */ +static void +route_miss(const ip_addr_t * ipaddr) +{ + union sockaddr_any addr; + socklen_t addr_len; + + addr_len = sizeof(addr); + + addr_put_inet(&addr.sa, &addr_len, ipaddr, TRUE /*kame*/, 0 /*port*/); + + rtsock_msg_miss(&addr.sa); +} + +/* + * A route lookup failed for the given IPv4 address. Generate an RTM_MISS + * message on routing sockets. + */ +static void +route_miss_v4(const ip4_addr_t * ip4addr) +{ + ip_addr_t ipaddr; + + ip_addr_copy_from_ip4(ipaddr, *ip4addr); + + route_miss(&ipaddr); +} + +/* + * A route lookup failed for the given IPv6 address. Generate an RTM_MISS + * message on routing sockets. + */ +static void +route_miss_v6(const ip6_addr_t * ip6addr) +{ + ip_addr_t ipaddr; + + ip_addr_copy_from_ip6(ipaddr, *ip6addr); + + route_miss(&ipaddr); +} + +/* + * Look up the most narrow matching routing entry for the given IPv4 address. + * Return the routing entry if one exists at all, or NULL otherwise. This + * function performs caching. + */ +static inline struct route_entry * +route_lookup_v4(const ip4_addr_t * ip4addr) +{ + uint8_t rtaddr[ROUTE_ADDR_MAX]; + struct route_entry *route; + + /* + * Look up the route for the destination IP address, unless we have a + * cached route entry. We cache negatives in order to avoid generating + * lots of RTM_MISS messages for the same destination in a row. + */ + if (rtcache_lookup_v4(ip4addr, &route)) + return route; + + route_prepare_v4(ip4addr, rtaddr); + + route = (struct route_entry *) + rttree_lookup_match(&route_tree[ROUTE_TREE_V4], rtaddr); + + /* Cache the result, even if we found no route. */ + rtcache_add_v4(ip4addr, route); + + return route; +} + +/* + * Look up the most narrow matching routing entry for the given IPv6 address, + * taking into account its zone ID if applicable. Return the routing entry if + * one exists at all, or NULL otherwise. This function performs caching. 
+ */ +static inline struct route_entry * +route_lookup_v6(const ip6_addr_t * ip6addr) +{ + uint8_t rtaddr[ROUTE_ADDR_MAX]; + struct route_entry *route; + int use_cache; + + /* + * We do not support caching of addresses that should have a zone but + * do not: in different contexts, such addresses could yield different + * routes. + */ + use_cache = !ip6_addr_lacks_zone(ip6addr, IP6_UNKNOWN); + + if (use_cache && rtcache_lookup_v6(ip6addr, &route)) + return route; + + route_prepare_v6(ip6addr, IP6_BITS, rtaddr); + + route = (struct route_entry *) + rttree_lookup_match(&route_tree[ROUTE_TREE_V6], rtaddr); + + /* Cache the result, even if no route was found. */ + if (use_cache) + rtcache_add_v6(ip6addr, route); + + return route; +} + +/* + * Look up the most narrow matching routing entry for the given IP address, + * taking into account its zone ID if applicable. Return the routing entry if + * one exists at all, or NULL otherwise. This function performs caching. + */ +struct route_entry * +route_lookup(const ip_addr_t * addr) +{ + + if (IP_IS_V4(addr)) + return route_lookup_v4(ip_2_ip4(addr)); + else + return route_lookup_v6(ip_2_ip6(addr)); +} + +/* + * Change an existing routing entry. Its flags are always updated to the new + * set of given flags, although certain flags are always preserved. If the + * new flags set has RTF_GATEWAY set and 'gateway' is not NULL, update the + * gateway associated with the route. If 'ifdev' is not NULL, reassociate the + * route with the given interface; this will not affect the zone of the + * route's destination address. On success, return OK, and also announce the + * change. On failure, return a negative error code. + */ +static int +route_change(struct route_entry * route, const ip_addr_t * gateway, + struct ifdev * ifdev, unsigned int flags, + const struct rtsock_request * rtr) +{ + unsigned int tree, preserve; + + tree = (route->re_flags & RTF_IPV6) ? ROUTE_TREE_V6 : ROUTE_TREE_V4; + + /* Update the associated interface (only) if a new one is given. */ + if (ifdev != NULL) + route->re_ifdev = ifdev; + + /* + * These flags may not be changed. RTF_UP should always be set anyway. + * RTF_HOST and RTF_IPV6 are part of the route's identity. RTF_LOCAL + * should be preserved as well, although we will not get here if either + * the old or the new flags have it set anyway. + */ + preserve = RTF_UP | RTF_HOST | RTF_IPV6 | RTF_LOCAL; + + /* Always update the flags. There is no way not to. */ + route->re_flags = (route->re_flags & preserve) | (flags & ~preserve); + + /* + * If a new gateway is given *and* RTF_GATEWAY is set, update the + * gateway. If RTF_GATEWAY is not set, this is a link-type route with + * no gateway. If no new gateway is given, we keep the gateway as is. + */ + if (gateway != NULL && (flags & RTF_GATEWAY)) { + if (flags & RTF_IPV6) + ip6_addr_copy_to_packed(route->re_gw6, + *ip_2_ip6(gateway)); + else + ip4_addr_copy(route->re_gw4, *ip_2_ip4(gateway)); + } + + /* We have made routing changes. */ + route_updated(tree); + + /* Announce the route change. */ + rtsock_msg_route(route, RTM_CHANGE, rtr); + + return OK; +} + +/* + * Delete the given route, and announce its deletion. + */ +void +route_delete(struct route_entry * route, const struct rtsock_request * rtr) +{ + unsigned int tree; + + /* First announce the deletion, while the route is still around. */ + tree = (route->re_flags & RTF_IPV6) ? ROUTE_TREE_V6 : ROUTE_TREE_V4; + + rtsock_msg_route(route, RTM_DELETE, rtr); + + /* Then actually delete the route. 
*/ + rttree_delete(&route_tree[tree], &route->re_entry); + + SIMPLEQ_INSERT_HEAD(&route_freelist, route, re_next); + + /* We have made routing changes. */ + route_updated(tree); +} + +/* + * Delete all routes associated with the given interface, typically as part of + * destroying the interface. + */ +void +route_clear(struct ifdev * ifdev) +{ + struct rttree_entry *entry, *parent; + struct route_entry *route; + unsigned int tree; + + /* + * Delete all routes associated with the given interface. Fortunately, + * we need not also delete addresses zoned to the given interface, + * because no route can be created with a zone ID that does not match + * the associated interface. That is the main reason why we ignore + * zone IDs for gateways when adding or changing routes.. + */ + for (tree = 0; tree < NR_ROUTE_TREE; tree++) { + parent = NULL; + + while ((entry = rttree_enum(&route_tree[tree], + parent)) != NULL) { + route = (struct route_entry *)entry; + + if (route->re_ifdev == ifdev) + route_delete(route, NULL /*request*/); + else + parent = entry; + } + } +} + +/* + * Process a routing command specifically for an IPv4 or IPv6 route, as one of + * the specific continuations of processing started by route_process(). The + * RTM_ routing command is given as 'type'. The route destination is given as + * 'dst_addr'; its address type determines whether the operation is for IPv4 or + * IPv6. The sockaddr structures for 'mask' and 'gateway' are passed on as is + * and may have to be parsed here if not NULL. 'ifdev' is the interface to be + * associated with a route; it is non-NULL only if an interface name (IFP) or + * address (IFA) was given. The RTF_ flags field 'flags' has been checked + * against the globally supported flags, but may have to be checked for flags + * that do not apply to IPv4/IPv6 routes. Return OK or a negative error code, + * following the same semantics as route_process(). + */ +static int +route_process_inet(unsigned int type, const ip_addr_t * dst_addr, + const struct sockaddr * mask, const struct sockaddr * gateway, + struct ifdev * ifdev, unsigned int flags, + const struct rtsock_request * rtr) +{ + struct route_entry *route; + ip_addr_t gw_storage, *gw_addr; + struct ifdev *ifdev2; + uint32_t zone; + unsigned int prefix; + int r; + + assert(!(flags & RTF_LLDATA)); + + if ((flags & (RTF_DYNAMIC | RTF_MODIFIED | RTF_DONE | RTF_XRESOLVE | + RTF_LLINFO | RTF_CLONED | RTF_SRC | RTF_ANNOUNCE | + RTF_BROADCAST)) != 0) + return EINVAL; + + /* + * For network entries, a network mask must be provided in all cases. + * For host entries, the network mask is ignored, and we use a prefix + * with all bits set. + */ + if (!(flags & RTF_HOST)) { + if (mask == NULL) + return EINVAL; + + if ((r = addr_get_netmask(mask, mask->sa_len, + IP_GET_TYPE(dst_addr), &prefix, NULL /*ipaddr*/)) != OK) + return r; + } else { + if (IP_IS_V4(dst_addr)) + prefix = IP4_BITS; + else + prefix = IP6_BITS; + } + + gw_addr = NULL; + + /* + * Determine the gateway and interface for the routing entry, if + * applicable. + */ + if (type == RTM_ADD || type == RTM_CHANGE) { + /* + * The RTF_UP flag must always be set, but only if the flags + * field is used at all. 
+ */ + if (!(flags & RTF_UP)) + return EINVAL; + + if ((flags & RTF_GATEWAY) && gateway != NULL) { + if ((r = addr_get_inet(gateway, gateway->sa_len, + IP_GET_TYPE(dst_addr), &gw_storage, TRUE /*kame*/, + NULL /*port*/)) != OK) + return r; + + gw_addr = &gw_storage; + + /* + * We use the zone of the gateway to help determine the + * interface, but we do not reject a mismatching zone + * here. The reason for this is that we do not want + * routes that have zones for an interface other than + * the one associated with the route, as that could + * create a world of trouble: packets leaving their + * zone, complications with cleaning up interfaces.. + */ + if (IP_IS_V6(gw_addr) && + ip6_addr_has_zone(ip_2_ip6(gw_addr))) { + zone = ip6_addr_zone(ip_2_ip6(gw_addr)); + + ifdev2 = ifdev_get_by_index(zone); + + if (ifdev != NULL && ifdev != ifdev2) + return EINVAL; + else + ifdev = ifdev2; + } + + /* + * If we still have no interface at this point, see if + * we can find one based on just the gateway address. + * See if a locally attached network owns the address. + * That may not succeed, leaving ifdev set to NULL. + */ + if (ifdev == NULL) + ifdev = ifaddr_map_by_subnet(gw_addr); + } + + /* + * When adding routes, all necessary information must be given. + * When changing routes, we can leave some settings as is. + */ + if (type == RTM_ADD) { + if ((flags & RTF_GATEWAY) && gw_addr == NULL) + return EINVAL; + + /* TODO: try harder to find a matching interface.. */ + if (ifdev == NULL) + return ENETUNREACH; + } + } + + /* + * All route commands except RTM_ADD require that a route exists for + * the given identity, although RTM_GET, when requesting a host entry, + * may return a wider (network) route based on just the destination + * address. + */ + if (type != RTM_ADD) { + /* For RTM_GET (only), a host query may return a net route. */ + if (type == RTM_GET && (flags & RTF_HOST)) + route = route_lookup(dst_addr); + else + route = route_find(dst_addr, prefix, + !!(flags & RTF_HOST)); + + if (route == NULL) + return ESRCH; + } else + route = NULL; + + /* Process the actual routing command. */ + switch (type) { + case RTM_ADD: + return route_add(dst_addr, prefix, gw_addr, ifdev, flags, rtr); + + case RTM_CHANGE: + /* Routes for local addresses are immutable. */ + if (route_is_immutable(route)) + return EPERM; + + return route_change(route, gw_addr, ifdev, flags, rtr); + + case RTM_DELETE: + /* Routes for local addresses are immutable. */ + if (route_is_immutable(route)) + return EPERM; + + route_delete(route, rtr); + + return OK; + + case RTM_LOCK: + /* + * TODO: implement even the suggestion that we support this. + * For now, we do not keep per-route metrics, let alone change + * them dynamically ourselves, so "locking" metrics is really + * not a concept that applies to us. We may however have to + * save the lock mask and return it in queries.. + */ + /* FALLTHROUGH */ + case RTM_GET: + /* Simply generate a message for the route we just found. */ + rtsock_msg_route(route, type, rtr); + + return OK; + + default: + return EINVAL; + } +} + +/* + * Process a routing command from a routing socket. The RTM_ type of command + * is given as 'type', and is one of RTM_ADD, RTM_CHANGE, RTM_DELETE, RTM_GET, + * RTM_LOCK. In addition, the function takes a set of sockaddr pointers as + * provided by the routing command. 
Each of these sockaddr pointers may be + * NULL; if not NULL, the structure is at least large enough to contain the + * address length (sa_len) and family (sa_family), and the length never exceeds + * the amount of memory used to store the sockaddr structure. However, the + * length itself has not yet been checked against the expected protocol + * structure and could even be zero. The command's RTF_ routing flags and + * metrics are provided as well. On success, return OK, in which case the + * caller assumes that a routing socket announcement for the processed command + * has been sent already (passing on 'rtr' to the announcement function as is). + * On failure, return a negative error code; in that case, the caller will send + * a failure response on the original routing socket itself. + */ +int +route_process(unsigned int type, const struct sockaddr * dst, + const struct sockaddr * mask, const struct sockaddr * gateway, + const struct sockaddr * ifp, const struct sockaddr * ifa, + unsigned int flags, unsigned long inits, + const struct rt_metrics * rmx, const struct rtsock_request * rtr) +{ + struct ifdev *ifdev, *ifdev2; + char name[IFNAMSIZ]; + ip_addr_t dst_addr, if_addr; + uint32_t zone; + uint8_t addr_type; + int r; + + /* + * The identity of a route is determined by its destination address, + * destination zone, prefix length, and whether it is a host entry + * or not. If it is a host entry (RTF_HOST is set), the prefix length + * is implied by the protocol; otherwise it should be obtained from the + * given netmask if necessary. For link-local addresses, the zone ID + * must be embedded KAME-style in the destination address. A + * destination address must always be given. The destination address + * also determines the overall address family. + */ + if (dst == NULL) + return EINVAL; + + switch (dst->sa_family) { + case AF_INET: + addr_type = IPADDR_TYPE_V4; + break; +#ifdef INET6 + case AF_INET6: + addr_type = IPADDR_TYPE_V6; + break; +#endif /* INET6 */ + default: + return EAFNOSUPPORT; + } + + if ((r = addr_get_inet(dst, dst->sa_len, addr_type, &dst_addr, + TRUE /*kame*/, NULL /*port*/)) != OK) + return r; + + /* + * Perform a generic test on the given flags. This covers everything + * we support at all, plus a few flags we ignore. Specific route types + * may have further restrictions; those tests are performed later. + */ + if ((flags & ~(RTF_UP | RTF_GATEWAY | RTF_HOST | RTF_REJECT | + RTF_CLONING | RTF_LLINFO | RTF_LLDATA | RTF_STATIC | + RTF_BLACKHOLE | RTF_CLONED | RTF_PROTO2 | RTF_PROTO1)) != 0) + return EINVAL; + + ifdev = NULL; + + if (type == RTM_ADD || type == RTM_CHANGE) { + /* + * If an interface address or name is given, use that to + * identify the target interface. If both are given, make sure + * that both identify the same interface--a hopefully helpful + * feature to detect wrong route(8) usage (NetBSD simply takes + * IFP over IFA). An empty interface name is ignored on the + * basis that libc link_addr(3) is broken. + */ + if (ifp != NULL) { + if ((r = addr_get_link(ifp, ifp->sa_len, name, + sizeof(name), NULL /*hwaddr*/, + 0 /*hwaddr_len*/)) != OK) + return r; + + if (name[0] != '\0' && + (ifdev = ifdev_find_by_name(name)) == NULL) + return ENXIO; + } + + if (ifa != NULL) { + /* + * This is similar to retrieval of source addresses in + * ipsock, with the difference that we do not impose + * that a zone ID be given for link-local addresses. 
*/
+ if ((r = addr_get_inet(ifa, ifa->sa_len, addr_type,
+ &if_addr, TRUE /*kame*/, NULL /*port*/)) != OK)
+ return r;
+
+ if ((ifdev2 = ifaddr_map_by_addr(&if_addr)) == NULL)
+ return EADDRNOTAVAIL;
+
+ if (ifdev != NULL && ifdev != ifdev2)
+ return EINVAL;
+ else
+ ifdev = ifdev2;
+ }
+
+ /*
+ * If the destination address has a zone, then it must not
+ * conflict with the interface, if one was given. If not, we
+ * may use it to decide the interface to use for the route.
+ */
+ if (IP_IS_V6(&dst_addr) &&
+ ip6_addr_has_zone(ip_2_ip6(&dst_addr))) {
+ if (ifdev == NULL) {
+ zone = ip6_addr_zone(ip_2_ip6(&dst_addr));
+
+ ifdev = ifdev_get_by_index(zone);
+ } else {
+ if (!ip6_addr_test_zone(ip_2_ip6(&dst_addr),
+ ifdev_get_netif(ifdev)))
+ return EADDRNOTAVAIL;
+ }
+ }
+ }
+
+ /*
+ * For now, no initializers are supported by any of the sub-processing
+ * routines, so outright reject requests that set any initializers.
+ * Most importantly, we do not support per-route MTU settings (RTV_MTU)
+ * because lwIP would not use them, and we do not support non-zero
+ * expiry (RTV_EXPIRE) because for IPv4/IPv6 routes it is not a widely
+ * used feature and for ARP/NDP we would have to change lwIP. Since
+ * dhcpcd(8) does supply RTV_MTU, we have to ignore that option rather
+ * than reject it, unfortunately. arp(8) always sets RTV_EXPIRE, so we
+ * reject only non-zero expiry there.
+ */
+ if ((inits & ~(RTV_EXPIRE | RTV_MTU)) != 0 ||
+ ((inits & RTV_EXPIRE) != 0 && rmx->rmx_expire != 0))
+ return ENOSYS;
+
+ /*
+ * From here on, the processing differs for ARP, NDP, and IP routes.
+ * As of writing, our userland is from NetBSD 7, which puts link-local
+ * route entries in its main route tables. This means we would have to
+ * search for existing routes before we can determine whether, say, an
+ * RTM_GET request is for an IP or an ARP route entry. As of NetBSD 8,
+ * the link-local administration is separated, and all requests use the
+ * RTF_LLDATA flag to indicate that they are for ARP/NDP routes rather
+ * than IP routes. Since that change makes things much cleaner for us,
+ * we borrow from the future, patching arp(8) and ndp(8) to add the
+ * RTF_LLDATA flag now, so that we can implement a clean split here.
+ */
+ if (!(flags & RTF_LLDATA))
+ return route_process_inet(type, &dst_addr, mask, gateway,
+ ifdev, flags, rtr);
+ else
+ return lldata_process(type, &dst_addr, gateway, ifdev, flags,
+ rtr);
+}
+
+/*
+ * Return the routing flags (RTF_) for the given routing entry. Strip out any
+ * internal flags.
+ */
+unsigned int
+route_get_flags(const struct route_entry * route)
+{
+
+ return route->re_flags & ~RTF_IPV6;
+}
+
+/*
+ * Return TRUE if the given routing entry is for the IPv6 address family, or
+ * FALSE if it is for IPv4.
+ */
+int
+route_is_ipv6(const struct route_entry * route)
+{
+
+ return !!(route->re_flags & RTF_IPV6);
+}
+
+/*
+ * Return the interface associated with the given routing entry. The resulting
+ * interface is never NULL.
+ */
+struct ifdev *
+route_get_ifdev(const struct route_entry * route)
+{
+
+ return route->re_ifdev;
+}
+
+/*
+ * Convert the given raw routing address pointed to by 'rtaddr' into a
+ * lwIP-style IP address 'ipaddr' of type 'type', which must be IPADDR_TYPE_V4
+ * or IPADDR_TYPE_V6.
+ */
+static void
+route_get_addr(ip_addr_t * ipaddr, const uint8_t * rtaddr, uint8_t type)
+{
+ ip6_addr_t *ip6addr;
+ uint32_t val, zone;
+
+ /*
+ * Convert the routing address to a lwIP-type IP address. Take out the
+ * KAME-style embedded zone, if needed.
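+ *
+ * As an example of the KAME-style embedding used here: a link-local
+ * destination stored with zone 2 embedded, i.e. fe80:2::1, comes back
+ * as fe80::1 with its lwIP zone field set to 2.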
+ */ + memset(ipaddr, 0, sizeof(*ipaddr)); + IP_SET_TYPE(ipaddr, type); + + switch (type) { + case IPADDR_TYPE_V4: + memcpy(&val, rtaddr, sizeof(val)); + + ip_addr_set_ip4_u32(ipaddr, val); + + break; + + case IPADDR_TYPE_V6: + ip6addr = ip_2_ip6(ipaddr); + + memcpy(ip6addr->addr, rtaddr, sizeof(ip6addr->addr)); + + if (ip6_addr_has_scope(ip6addr, IP6_UNKNOWN)) { + zone = ntohl(ip6addr->addr[0]) & 0x0000ffffU; + + ip6addr->addr[0] &= PP_HTONL(0xffff0000U); + + ip6_addr_set_zone(ip6addr, zone); + } + + break; + + default: + panic("unknown IP address type: %u", type); + } +} + +/* + * Obtain information about an IPv4 or IPv6 routing entry, by filling 'addr', + * 'mask', 'gateway', and optionally (if not NULL) 'ifp' and 'ifa' with + * sockaddr-type data for each of those fields. Also store the associated + * interface in 'ifdevp', the routing entry's flags in 'flags', and the route's + * usage count in 'use'. + */ +void +route_get(const struct route_entry * route, union sockaddr_any * addr, + union sockaddr_any * mask, union sockaddr_any * gateway, + union sockaddr_any * ifp, union sockaddr_any * ifa, + struct ifdev ** ifdevp, unsigned int * flags, unsigned int * use) +{ + const ip_addr_t *src_addr; + ip_addr_t dst_addr, gw_addr; + struct ifdev *ifdev; + socklen_t addr_len; + uint8_t type; + + type = (route->re_flags & RTF_IPV6) ? IPADDR_TYPE_V6 : IPADDR_TYPE_V4; + + /* Get the destination address. */ + route_get_addr(&dst_addr, route->re_addr, type); + + addr_len = sizeof(*addr); + + addr_put_inet(&addr->sa, &addr_len, &dst_addr, TRUE /*kame*/, + 0 /*port*/); + + /* Get the network mask, if applicable. */ + if (!(route->re_flags & RTF_HOST)) { + addr_len = sizeof(*mask); + + addr_put_netmask(&mask->sa, &addr_len, type, + rttree_get_prefix(&route->re_entry)); + } else + mask->sa.sa_len = 0; + + /* Get the gateway, which may be an IP address or a local link. */ + addr_len = sizeof(*gateway); + + ifdev = route->re_ifdev; + + if (route->re_flags & RTF_GATEWAY) { + if (type == IPADDR_TYPE_V4) + ip_addr_copy_from_ip4(gw_addr, route->re_gw4); + else + ip_addr_copy_from_ip6_packed(gw_addr, route->re_gw6); + + addr_put_inet(&gateway->sa, &addr_len, &gw_addr, TRUE /*kame*/, + 0 /*port*/); + } else { + addr_put_link(&gateway->sa, &addr_len, ifdev_get_index(ifdev), + ifdev_get_iftype(ifdev), NULL /*name*/, NULL /*hwaddr*/, + 0 /*hwaddr_len*/); + } + + /* Get the associated interface name. */ + if (ifp != NULL) { + addr_len = sizeof(*ifp); + + addr_put_link(&ifp->sa, &addr_len, ifdev_get_index(ifdev), + ifdev_get_iftype(ifdev), ifdev_get_name(ifdev), + NULL /*hwaddr*/, 0 /*hwaddr_len*/); + } + + /* Get the associated source address, if we can determine one. */ + if (ifa != NULL) { + src_addr = ifaddr_select(&dst_addr, ifdev, NULL /*ifdevp*/); + + if (src_addr != NULL) { + addr_len = sizeof(*ifa); + + addr_put_inet(&ifa->sa, &addr_len, src_addr, + TRUE /*kame*/, 0 /*port*/); + } else + ifa->sa.sa_len = 0; + } + + /* Get other fields. */ + *flags = route_get_flags(route); /* strip any internal flags */ + *ifdevp = ifdev; + *use = route->re_use; +} + +/* + * Enumerate IPv4 routing entries. Return the first IPv4 routing entry if + * 'last' is NULL, or the next routing entry after 'last' if it is not NULL. + * In both cases, the return value may be NULL if there are no more routes. + */ +struct route_entry * +route_enum_v4(struct route_entry * last) +{ + + assert(last == NULL || !(last->re_flags & RTF_IPV6)); + + return (struct route_entry *)rttree_enum(&route_tree[ROUTE_TREE_V4], + (last != NULL) ? 
&last->re_entry : NULL);
+}
+
+/*
+ * Enumerate IPv6 routing entries. Return the first IPv6 routing entry if
+ * 'last' is NULL, or the next routing entry after 'last' if it is not NULL.
+ * In both cases, the return value may be NULL if there are no more routes.
+ */
+struct route_entry *
+route_enum_v6(struct route_entry * last)
+{
+
+ assert(last == NULL || (last->re_flags & RTF_IPV6));
+
+ return (struct route_entry *)rttree_enum(&route_tree[ROUTE_TREE_V6],
+ (last != NULL) ? &last->re_entry : NULL);
+}
+
+/*
+ * lwIP IPv4 routing function. Given an IPv4 destination address, look up and
+ * return the target interface, or NULL if there is no route to the address.
+ *
+ * This is a full replacement of the corresponding lwIP function, which should
+ * be overridden with weak symbols, using patches against the lwIP source code.
+ * As such, the lwIP headers should already provide the correct prototype for
+ * this function. If not, something will have changed in the lwIP
+ * implementation, and this code must be revised accordingly.
+ */
+struct netif *
+ip4_route(const ip4_addr_t * dst)
+{
+ struct route_entry *route;
+ struct ifdev *ifdev;
+
+ /*
+ * Look up the route for the destination IPv4 address. If no route is
+ * found at all, return NULL to the caller.
+ */
+ if ((route = route_lookup_v4(dst)) == NULL) {
+ route_miss_v4(dst);
+
+ return NULL;
+ }
+
+ /*
+ * For now, we increase the use counter only for actual route lookups,
+ * and not for gateway lookups or user queries. As of writing,
+ * route(8) does not print this number anyway..
+ */
+ route->re_use++;
+
+ /*
+ * For all packets that are supposed to be rejected or blackholed, use
+ * a loopback interface, regardless of the interface with which the route
+ * is associated (even though it will typically be lo0 anyway). The
+ * reason for this is that on packet output, we perform another route
+ * lookup just to check for rejection/blackholing, but for
+ * efficiency reasons, we limit such checks to loopback interfaces:
+ * loopback traffic will typically use only one IP address anyway, thus
+ * limiting route misses from such rejection/blackhole route lookups as
+ * much as we can. The lookup is implemented in route_output_v4(). We
+ * divert only if the target interface is not a loopback interface
+ * already, mainly to allow userland tests to create blackhole routes
+ * to a specific loopback interface for testing purposes.
+ *
+ * It is not correct to return NULL for RTF_REJECT routes here, because
+ * this could cause e.g. connect() calls to fail immediately, which is
+ * not how rejection should work. Related: a previous incarnation of
+ * support for these flags used a dedicated netif to eliminate the
+ * extra route lookup on regular output altogether, but in the current
+ * situation, that netif would have to be assigned (IPv4 and IPv6)
+ * addresses in order not to break e.g. connect() in the same way.
+ */
+ if ((route->re_flags & (RTF_REJECT | RTF_BLACKHOLE)) &&
+ !ifdev_is_loopback(route->re_ifdev))
+ ifdev = ifdev_get_loopback();
+ else
+ ifdev = route->re_ifdev;
+
+ return ifdev_get_netif(ifdev);
+}
+
+/*
+ * lwIP IPv4 routing hook. Since this hook is called only from lwIP's own
+ * ip4_route() implementation, this hook must never fire. If it does, either
+ * something is wrong with overriding ip4_route(), or lwIP added other places
+ * from which this hook is called. Both cases are highly problematic and must
+ * be resolved somehow, which is why we simply call panic() here.
+ */ +struct netif * +lwip_hook_ip4_route(const ip4_addr_t * dst) +{ + + panic("IPv4 routing hook called - this should not happen!"); +} + +/* + * lwIP IPv4 ARP gateway hook. + */ +const ip4_addr_t * +lwip_hook_etharp_get_gw(struct netif * netif, const ip4_addr_t * ip4addr) +{ + static ip4_addr_t gw_addr; /* may be returned to the caller */ + struct route_entry *route; + + /* Look up the route for the destination IP address. */ + if ((route = route_lookup_v4(ip4addr)) == NULL) + return NULL; + + /* + * This case could only ever trigger as a result of lwIP taking its own + * routing decisions instead of calling the IPv4 routing hook. While + * not impossible, such cases should be extremely rare. We cannot + * provide a meaningful gateway address in this case either, though. + */ + if (route->re_ifdev != netif_get_ifdev(netif)) { + printf("LWIP: unexpected interface for gateway lookup\n"); + + return NULL; + } + + /* + * If this route has a gateway, return the IP address of the gateway. + * Otherwise, the route is for a local network, and we would typically + * not get here because lwIP performs the local-network check itself. + * It is possible that the local network consists of more than one IP + * range, and the user has configured a route for the other range. In + * that case, return the IP address of the actual destination. + * + * We store a packed version of the IPv4 address, so reconstruct the + * unpacked version to a static variable first - for consistency with + * the IPv6 code. + */ + if (route->re_flags & RTF_GATEWAY) { + ip4_addr_copy(gw_addr, route->re_gw4); + + return &gw_addr; + } else + return ip4addr; +} + +/* + * lwIP IPv6 routing function. Given an IPv6 source and destination address, + * look up and return the target interface, or NULL if there is no route to the + * address. Our routing algorithm is destination-based, meaning that the + * source address must be considered only to resolve zone ambiguity. + * + * This is a full replacement of the corresponding lwIP function, which should + * be overridden with weak symbols, using patches against the lwIP source code. + * As such, the lwIP headers should already provide the correct prototype for + * this function. If not, something will have changed in the lwIP + * implementation, and this code must be revised accordingly. + */ +struct netif * +ip6_route(const ip6_addr_t * src, const ip6_addr_t * dst) +{ + struct route_entry *route; + struct ifdev *ifdev; + ip6_addr_t dst_addr; + uint32_t zone; + + assert(src != NULL); + assert(dst != NULL); + + /* + * If the destination address is scoped but has no zone, use the source + * address to determine a zone, which we then set on the destination + * address to find the route, if successful. Obviously, the interface + * is not going to be different from the zone, but we do need to check + * other aspects of the route (e.g., one might want to null-route all + * multicast traffic). In the case that no source address is given at + * all, first see if the destination address happens to be a locally + * assigned address. In theory this could yield multiple matches, so + * pick the first one. If not even that helps, we have absolutely + * nothing we can use to refine route selection. We could pick an + * arbitrary interface in that case, but we currently don't. 
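+ *
+ * As an illustration of the common case: a lookup for the zoneless
+ * link-local destination fe80::1, given a source address carrying zone 2,
+ * is effectively performed for fe80::1 with zone 2 set below.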
+ */ + zone = IP6_NO_ZONE; + + if (ip6_addr_lacks_zone(dst, IP6_UNKNOWN)) { + if (ip6_addr_has_zone(src)) + zone = ip6_addr_zone(src); + else if (!ip6_addr_isany(src)) { + if ((ifdev = ifaddr_v6_map_by_addr(src)) == NULL) + return NULL; /* should never happen */ + zone = ifdev_get_index(ifdev); + } else { + if ((ifdev = ifaddr_v6_map_by_addr(dst)) != NULL) + zone = ifdev_get_index(ifdev); + else + return NULL; /* TODO: try harder */ + } + + if (zone != IP6_NO_ZONE) { + dst_addr = *dst; + + ip6_addr_set_zone(&dst_addr, zone); + + dst = &dst_addr; + } + } + + route = route_lookup_v6(dst); + + /* + * Look up the route for the destination IPv6 address. If no route is + * found at all, return NULL to the caller. + */ + if (route == NULL) { + /* + * Since we rely on userland to create routes for on-link + * prefixes and default routers, we do not have to call lwIP's + * nd6_find_route() here. + */ + + /* Generate an RTM_MISS message. */ + route_miss_v6(dst); + + return NULL; + } + + /* + * We have found a route based on the destination address. If we did + * not pick the destination address zone based on the source address, + * we should now check for source address zone violations. Note that + * if even the destination address zone violates its target interface, + * this case will be caught by route_lookup_v6(). + */ + if (zone == IP6_NO_ZONE && + ifaddr_is_zone_mismatch(src, route->re_ifdev)) + return NULL; + + route->re_use++; + + /* + * See ip4_route() for an explanation of the use of loopback here. For + * the IPv6 case, the matching logic is in route_output_v6(). + */ + if ((route->re_flags & (RTF_REJECT | RTF_BLACKHOLE)) && + !ifdev_is_loopback(route->re_ifdev)) + ifdev = ifdev_get_loopback(); + else + ifdev = route->re_ifdev; + + /* + * If the selected interface would cause the destination address to + * leave its zone, fail route selection altogether. This case may + * trigger especially for reject routes, for which the interface change + * to loopback may introduce a zone violation. + */ + if (ip6_addr_has_zone(dst) && + !ip6_addr_test_zone(dst, ifdev_get_netif(ifdev))) + return NULL; + + return ifdev_get_netif(ifdev); +} + +/* + * lwIP IPv6 (source) routing hook. Since this hook is called only from lwIP's + * own ip6_route() implementation, this hook must never fire. If it does, + * either something is wrong with overriding ip6_route(), or lwIP added other + * places from which this hook is called. Both cases are highly problematic + * and must be resolved somehow, which is why we simply call panic() here. + */ +struct netif * +lwip_hook_ip6_route(const ip6_addr_t * src, const ip6_addr_t * dst) +{ + + panic("IPv6 routing hook called - this should not happen!"); +} + +/* + * lwIP IPv6 ND6 gateway hook. + */ +const ip6_addr_t * +lwip_hook_nd6_get_gw(struct netif * netif, const ip6_addr_t * ip6addr) +{ + static ip6_addr_t gw_addr; /* may be returned to the caller */ + struct route_entry *route; + struct ifdev *ifdev; + + ifdev = netif_get_ifdev(netif); + assert(ifdev != NULL); + + /* Look up the route for the destination IP address. */ + if ((route = route_lookup_v6(ip6addr)) == NULL) + return NULL; + + /* As for IPv4. */ + if (route->re_ifdev != ifdev) { + printf("LWIP: unexpected interface for gateway lookup\n"); + + return NULL; + } + + /* + * We save memory by storing a packed (zoneless) version of the IPv6 + * gateway address. That means we cannot return a pointer to it here. + * Instead, we have to resort to expanding the address into a static + * variable. 
The caller will immediately make a copy anyway, though. + */ + if (route->re_flags & RTF_GATEWAY) { + ip6_addr_copy_from_packed(gw_addr, route->re_gw6); + ip6_addr_assign_zone(&gw_addr, IP6_UNKNOWN, netif); + + return &gw_addr; + } else + return ip6addr; +} + +/* + * Check whether a packet is allowed to be sent to the given destination IPv4 + * address 'ipaddr' on the interface 'ifdev', according to route information. + * Return TRUE if the packet should be sent. Return FALSE if the packet should + * be rejected or discarded, with 'err' set to the error to return to lwIP. + */ +int +route_output_v4(struct ifdev * ifdev, const ip4_addr_t * ipaddr, err_t * err) +{ + const struct route_entry *route; + + /* See if we should reject/blackhole packets to this destination. */ + if (ifdev_is_loopback(ifdev) && + (route = route_lookup_v4(ipaddr)) != NULL && + (route->re_flags & (RTF_REJECT | RTF_BLACKHOLE))) { + if (route->re_flags & RTF_REJECT) + *err = ERR_RTE; + else + *err = ERR_OK; + + return FALSE; + } + + return TRUE; +} + +/* + * Check whether a packet is allowed to be sent to the given destination IPv6 + * address 'ipaddr' on the interface 'ifdev', according to route information. + * Return TRUE if the packet should be sent. Return FALSE if the packet should + * be rejected or discarded, with 'err' set to the error to return to lwIP. + */ +int +route_output_v6(struct ifdev * ifdev, const ip6_addr_t * ipaddr, err_t * err) +{ + const struct route_entry *route; + + /* Do one more zone violation test, just in case. It's cheap. */ + if (ip6_addr_has_zone(ipaddr) && + !ip6_addr_test_zone(ipaddr, ifdev_get_netif(ifdev))) { + *err = ERR_RTE; + + return FALSE; + } + + /* See if we should reject/blackhole packets to this destination. */ + if (ifdev_is_loopback(ifdev) && + (route = route_lookup_v6(ipaddr)) != NULL && + (route->re_flags & (RTF_REJECT | RTF_BLACKHOLE))) { + if (route->re_flags & RTF_REJECT) + *err = ERR_RTE; + else + *err = ERR_OK; + + return FALSE; + } + + return TRUE; +} diff --git a/minix/net/lwip/route.h b/minix/net/lwip/route.h new file mode 100644 index 000000000..5573ba478 --- /dev/null +++ b/minix/net/lwip/route.h @@ -0,0 +1,39 @@ +#ifndef MINIX_NET_LWIP_ROUTE_H +#define MINIX_NET_LWIP_ROUTE_H + +#include + +struct route_entry; +struct rtsock_request; + +void route_init(void); +int route_add(const ip_addr_t * addr, unsigned int prefix, + const ip_addr_t * gateway, struct ifdev * ifdev, unsigned int flags, + const struct rtsock_request * rtr); +int route_can_add(const ip_addr_t * addr, unsigned int prefix, int is_host); +struct route_entry *route_find(const ip_addr_t * addr, unsigned int prefix, + int is_host); +struct route_entry *route_lookup(const ip_addr_t * addr); +void route_delete(struct route_entry * route, + const struct rtsock_request * rtr); +void route_clear(struct ifdev * ifdev); +int route_process(unsigned int type, const struct sockaddr * dst, + const struct sockaddr * mask, const struct sockaddr * gateway, + const struct sockaddr * ifp, const struct sockaddr * ifa, + unsigned int flags, unsigned long inits, + const struct rt_metrics * rmx, const struct rtsock_request * rtr); +void route_get(const struct route_entry * route, union sockaddr_any * addr, + union sockaddr_any * mask, union sockaddr_any * gateway, + union sockaddr_any * ifp, union sockaddr_any * ifa, + struct ifdev ** ifdev, unsigned int * flags, unsigned int * use); +unsigned int route_get_flags(const struct route_entry * route); +struct ifdev *route_get_ifdev(const struct route_entry * route); +int 
route_is_ipv6(const struct route_entry * route); +struct route_entry *route_enum_v4(struct route_entry * last); +struct route_entry *route_enum_v6(struct route_entry * last); +int route_output_v4(struct ifdev * ifdev, const ip4_addr_t * ipaddr, + err_t * err); +int route_output_v6(struct ifdev * ifdev, const ip6_addr_t * ipaddr, + err_t * err); + +#endif /* !MINIX_NET_LWIP_ROUTE_H */ diff --git a/minix/net/lwip/rtsock.c b/minix/net/lwip/rtsock.c new file mode 100644 index 000000000..7af8bb296 --- /dev/null +++ b/minix/net/lwip/rtsock.c @@ -0,0 +1,1912 @@ +/* LWIP service - rtsock.c - routing sockets and route sysctl support */ +/* + * In a nutshell, the intended abstraction is that only this module deals with + * route messages, message headers, and RTA arrays, whereas other modules + * (ifaddr, route) are responsible for parsing and providing sockaddr_* type + * addresses, with the exception of compression and expansion which is + * particular to routing sockets. Concretely, there should be no reference to + * (e.g.) rt_msghdr outside this module, and no mention of ip_addr_t inside it. + */ + +#include "lwip.h" +#include "ifaddr.h" +#include "rtsock.h" +#include "route.h" +#include "lldata.h" + +/* The number of routing sockets. */ +#define NR_RTSOCK 8 + +/* + * The send buffer maximum determines the maximum size of requests. The + * maximum possible request size is the size of the routing message header plus + * RTAX_MAX times the maximum socket address size, including alignment. That + * currently works out to a number in the low 400s, so 512 should be fine for + * now. At this time we do not support changing the send buffer size, because + * there really is no point in doing so. Hence also no RT_SNDBUF_{MIN,DEF}. + */ +#define RT_SNDBUF_MAX 512 /* maximum RT send buffer size */ + +#define RT_RCVBUF_MIN 0 /* minimum RT receive buffer size */ +#define RT_RCVBUF_DEF 16384 /* default RT receive buffer size */ +#define RT_RCVBUF_MAX 65536 /* maximum RT receive buffer size */ + +/* Address length of routing socket address structures; two bytes only. */ +#define RTSOCK_ADDR_LEN offsetof(struct sockaddr, sa_data) + +struct rtsock_rta { + const void *rta_ptr[RTAX_MAX]; + socklen_t rta_len[RTAX_MAX]; +}; + +static const char rtsock_padbuf[RT_ROUNDUP(0)]; + +static struct rtsock { + struct sock rt_sock; /* socket object, MUST be first */ + int rt_family; /* address family filter if not zero */ + unsigned int rt_flags; /* routing socket flags (RTF_) */ + struct pbuf *rt_rcvhead; /* receive buffer, first packet */ + struct pbuf **rt_rcvtailp; /* receive buffer, last ptr-ptr */ + size_t rt_rcvlen; /* receive buffer, length in bytes */ + size_t rt_rcvbuf; /* receive buffer, maximum size */ + TAILQ_ENTRY(rtsock) rt_next; /* next in active or free list */ +} rt_array[NR_RTSOCK]; + +#define RTF_NOLOOPBACK 0x1 /* suppress reply messages */ + +static TAILQ_HEAD(, rtsock) rt_freelist; /* free routing sockets */ +static TAILQ_HEAD(, rtsock) rt_activelist; /* active routing sockets */ + +struct rtsock_request { + struct rtsock *rtr_src; /* source socket of the request */ + pid_t rtr_pid; /* process ID of requesting process */ + int rtr_seq; /* sequence number from the request */ + int rtr_getif; /* RTM_GET only: get interface info */ +}; + +static const struct sockevent_ops rtsock_ops; + +static ssize_t rtsock_info(struct rmib_call *, struct rmib_node *, + struct rmib_oldp *, struct rmib_newp *); + +/* The CTL_NET PF_ROUTE subtree. 
*/
+static struct rmib_node net_route_table[] = {
+ [0] = RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0, rtsock_info,
+ "rtable", "Routing table information"),
+};
+
+/* The CTL_NET PF_ROUTE node. */
+static struct rmib_node net_route_node =
+ RMIB_NODE(RMIB_RO, net_route_table, "route", "PF_ROUTE information");
+
+/*
+ * Initialize the routing sockets module.
+ */
+void
+rtsock_init(void)
+{
+ const int mib[] = { CTL_NET, PF_ROUTE };
+ unsigned int slot;
+ int r;
+
+ /* Initialize the list of free routing sockets. */
+ TAILQ_INIT(&rt_freelist);
+
+ for (slot = 0; slot < __arraycount(rt_array); slot++)
+ TAILQ_INSERT_TAIL(&rt_freelist, &rt_array[slot], rt_next);
+
+ /* Initialize the list of active routing sockets. */
+ TAILQ_INIT(&rt_activelist);
+
+ /* Register the "net.route" subtree with the MIB service. */
+ if ((r = rmib_register(mib, __arraycount(mib), &net_route_node)) != OK)
+ panic("unable to register net.route RMIB tree: %d", r);
+}
+
+/*
+ * Allocate a pbuf suitable for storing a routing message of 'size' bytes.
+ * Return the allocated pbuf on success, or NULL on memory allocation failure.
+ */
+static struct pbuf *
+rtsock_alloc(size_t size)
+{
+ struct pbuf *pbuf;
+
+ /*
+ * The data will currently always fit in a single pool buffer. Just in
+ * case this changes in the future, warn and fail cleanly. The rest of
+ * the code is not able to deal with buffer chains as it is, although
+ * that can be changed if necessary.
+ */
+ if (size > MEMPOOL_BUFSIZE) {
+ printf("LWIP: routing socket packet too large (%zu)\n", size);
+
+ return NULL;
+ }
+
+ pbuf = pbuf_alloc(PBUF_RAW, size, PBUF_RAM);
+
+ assert(pbuf == NULL || pbuf->tot_len == pbuf->len);
+
+ return pbuf;
+}
+
+/*
+ * Initialize a routing addresses map.
+ */
+static void
+rtsock_rta_init(struct rtsock_rta * rta)
+{
+
+ memset(rta, 0, sizeof(*rta));
+}
+
+/*
+ * Set an entry in a routing addresses map. When computing sizes, 'ptr' may be
+ * NULL.
+ */
+static void
+rtsock_rta_set(struct rtsock_rta * rta, unsigned int rtax, const void * ptr,
+ socklen_t len)
+{
+
+ assert(rtax < RTAX_MAX);
+
+ rta->rta_ptr[rtax] = ptr;
+ rta->rta_len[rtax] = len;
+}
+
+/*
+ * Copy out a message with a header and any entries in a routing addresses map,
+ * either into a pbuf allocated for this purpose, or to a RMIB (sysctl) caller,
+ * at the given offset. If no destination is given ('pbuf' and 'oldp' are
+ * both NULL), compute just the size of the resulting data. Otherwise, set the
+ * length and address mask fields in the header as a side effect. Return the
+ * number of bytes copied on success, and if 'pbuf' is not NULL, it is filled
+ * with a pointer to the newly allocated pbuf. Return a negative error code on
+ * failure. Note that when computing the size only, any actual data pointers
+ * ('hdr', 'msglen', 'addrs', and the pointers in 'rta') may be NULL or even
+ * invalid, even though the corresponding sizes should still be supplied.
+ */ +static ssize_t +rtsock_rta_finalize(void * hdr, size_t hdrlen, u_short * msglen, int * addrs, + const struct rtsock_rta * rta, struct pbuf ** pbuf, + struct rmib_oldp * oldp, ssize_t off) +{ + iovec_t iov[1 + RTAX_MAX * 2]; + size_t len, padlen, totallen; + unsigned int i, iovcnt; + int mask; + + assert(pbuf == NULL || oldp == NULL); + assert(pbuf == NULL || off == 0); + assert(RT_ROUNDUP(hdrlen) == hdrlen); + + iov[0].iov_addr = (vir_bytes)hdr; + iov[0].iov_size = hdrlen; + iovcnt = 1; + + totallen = hdrlen; + mask = 0; + + /* + * The addresses in the given RTA map, as present, should be stored in + * the numbering order of the map. + */ + for (i = 0; i < RTAX_MAX; i++) { + if (rta->rta_ptr[i] == NULL) + continue; + + if ((len = rta->rta_len[i]) > 0) { + assert(iovcnt < __arraycount(iov)); + iov[iovcnt].iov_addr = (vir_bytes)rta->rta_ptr[i]; + iov[iovcnt++].iov_size = len; + } + + /* Note that RT_ROUNDUP(0) is not 0.. */ + if ((padlen = RT_ROUNDUP(len) - len) > 0) { + assert(iovcnt < __arraycount(iov)); + iov[iovcnt].iov_addr = (vir_bytes)rtsock_padbuf; + iov[iovcnt++].iov_size = padlen; + } + + totallen += len + padlen; + mask |= (1 << i); /* convert RTAX_ to RTA_ */ + } + + /* If only the length was requested, return it now. */ + if (pbuf == NULL && oldp == NULL) + return totallen; + + /* + * Casting 'hdr' would violate C99 strict aliasing rules, but the + * address mask is not always at the same location anyway. + */ + *msglen = totallen; + *addrs = mask; + + if (pbuf != NULL) { + if ((*pbuf = rtsock_alloc(totallen)) == NULL) + return ENOMEM; + + return util_coalesce((char *)(*pbuf)->payload, totallen, iov, + iovcnt); + } else + return rmib_vcopyout(oldp, off, iov, iovcnt); +} + +/* + * Reduce the size of a network mask to the bytes actually used. It is highly + * doubtful that this extra complexity pays off in any form, but it is what the + * BSDs historically do. We currently implement compression for IPv4 only. + */ +static void +rtsock_compress_netmask(struct sockaddr * sa) +{ + struct sockaddr_in sin; + uint32_t addr; + + if (sa->sa_family != AF_INET) + return; /* nothing to do */ + + memcpy(&sin, sa, sizeof(sin)); /* no type punning.. (sigh) */ + + addr = ntohl(sin.sin_addr.s_addr); + + if (addr & 0x000000ff) + sa->sa_len = 8; + else if (addr & 0x0000ffff) + sa->sa_len = 7; + else if (addr & 0x00ffffff) + sa->sa_len = 6; + else if (addr != 0) + sa->sa_len = 5; + else + sa->sa_len = 0; +} + +/* + * Expand a possibly compressed IPv4 or IPv6 network mask, given as 'sa', into + * 'mask'. Return TRUE if expansion succeeded. In that case, the resulting + * mask must have sa.sa_len and sa.sa_family filled in correctly, and have the + * appropriate size for its address family. Return FALSE if expansion failed + * and an error should be returned to the caller. + */ +static int +rtsock_expand_netmask(union sockaddr_any * mask, const struct sockaddr * sa) +{ + + if (sa->sa_len > sizeof(*mask)) + return FALSE; + + memset(mask, 0, sizeof(*mask)); + memcpy(mask, sa, sa->sa_len); + + /* + * Amazingly, even the address family may be chopped off, in which case + * an IPv4 address is implied.
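+ * As a concrete example (illustrative values only): a 255.255.255.0 IPv4 + * mask is reduced by rtsock_compress_netmask() above to sa_len 7, that is, + * only the bytes up to and including the last nonzero mask byte, and a + * request may hand us such a truncated structure here, possibly cut short + * before even the sa_family field.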
+ */ + if (sa->sa_len >= offsetof(struct sockaddr, sa_data) && + sa->sa_family == AF_INET6) { + if (sa->sa_len > sizeof(struct sockaddr_in6)) + return FALSE; + + mask->sa.sa_len = sizeof(struct sockaddr_in6); + mask->sa.sa_family = AF_INET6; + } else { + if (sa->sa_len > sizeof(struct sockaddr_in)) + return FALSE; + + mask->sa.sa_len = sizeof(struct sockaddr_in); + mask->sa.sa_family = AF_INET; + } + + return TRUE; +} + +/* + * Create a routing socket. + */ +sockid_t +rtsock_socket(int type, int protocol, struct sock ** sockp, + const struct sockevent_ops ** ops) +{ + struct rtsock *rt; + + /* + * There is no superuser check here: regular users are allowed to issue + * (only) RTM_GET requests on routing sockets. + */ + if (type != SOCK_RAW) + return EPROTOTYPE; + + /* We could accept only the protocols we know, but this is fine too. */ + if (protocol < 0 || protocol >= AF_MAX) + return EPROTONOSUPPORT; + + if (TAILQ_EMPTY(&rt_freelist)) + return ENOBUFS; + + rt = TAILQ_FIRST(&rt_freelist); + TAILQ_REMOVE(&rt_freelist, rt, rt_next); + + rt->rt_flags = 0; + rt->rt_family = protocol; + rt->rt_rcvhead = NULL; + rt->rt_rcvtailp = &rt->rt_rcvhead; + rt->rt_rcvlen = 0; + rt->rt_rcvbuf = RT_RCVBUF_DEF; + + TAILQ_INSERT_HEAD(&rt_activelist, rt, rt_next); + + *sockp = &rt->rt_sock; + *ops = &rtsock_ops; + return SOCKID_RT | (sockid_t)(rt - rt_array); +} + +/* + * Enqueue data on the receive queue of a routing socket. The caller must have + * checked whether the receive buffer size allows for the receipt of the data. + */ +static void +rtsock_enqueue(struct rtsock * rt, struct pbuf * pbuf) +{ + + *rt->rt_rcvtailp = pbuf; + rt->rt_rcvtailp = pchain_end(pbuf); + rt->rt_rcvlen += pchain_size(pbuf); + + sockevent_raise(&rt->rt_sock, SEV_RECV); +} + +/* + * Determine whether a routing message for address family 'family', originated + * from routing socket 'rtsrc' if not NULL, should be sent to routing socket + * 'rt'. Return TRUE if the message should be sent to this socket, or FALSE + * if it should not. + */ +static int +rtsock_can_send(struct rtsock *rt, struct rtsock *rtsrc, int family) +{ + + /* Do not send anything on sockets shut down for reading. */ + if (sockevent_is_shutdown(&rt->rt_sock, SFL_SHUT_RD)) + return FALSE; + + /* + * Do not send a reply message to the source of the request if the + * source is not interested in replies to its own requests. + */ + if (rt == rtsrc && (rt->rt_flags & RTF_NOLOOPBACK)) + return FALSE; + + /* + * For address family specific messages, make sure the routing socket + * is interested in that family. Make an exception if the socket was + * the source of the request, though: we currently do not prevent user + * processes from issuing commands for the "wrong" family. + */ + if (rt->rt_family != AF_UNSPEC && family != AF_UNSPEC && + rt->rt_family != family && rt != rtsrc) + return FALSE; + + /* + * See whether the receive queue of the socket is already full. We do + * not consider the size of the current request, in order to not drop + * larger messages and then enqueue smaller ones. + */ + if (rt->rt_rcvlen >= rt->rt_rcvbuf) + return FALSE; + + /* All is well: go on and deliver the message. */ + return TRUE; +} + +/* + * Send the routing message in 'pbuf' to the given routing socket if possible, + * or check whether such a message could be sent to that socket if 'pbuf' is + * NULL. In the former case, the function takes ownership of 'pbuf'. The + * given routing socket is assumed to be the source of the routing request that + * generated this message. 
In the latter case, the function returns TRUE if + * the socket would take the message or FALSE if not. If 'family' is not + * AF_UNSPEC, it is to be the address family of the message. + */ +static int +rtsock_msg_one(struct rtsock * rt, int family, struct pbuf * pbuf) +{ + + if (rtsock_can_send(rt, rt, family)) { + if (pbuf != NULL) + rtsock_enqueue(rt, pbuf); + + return TRUE; + } else { + if (pbuf != NULL) + pbuf_free(pbuf); + + return FALSE; + } +} + +/* + * Send the routing message in 'pbuf' to all matching routing sockets, or check + * whether there are any such matching routing sockets if 'pbuf' is NULL. In + * the former case, the function takes ownership of 'pbuf'. In the latter + * case, the function returns TRUE if there are any matching sockets or FALSE + * if there are none. If 'rtsrc' is not NULL, it is to be the routing socket + * that is the source of the message. If 'family' is not AF_UNSPEC, it is to + * be the address family of the message. + */ +static int +rtsock_msg_match(struct rtsock * rtsrc, int family, struct pbuf * pbuf) +{ + struct rtsock *rt, *rtprev; + struct pbuf *pcopy; + + rtprev = NULL; + + TAILQ_FOREACH(rt, &rt_activelist, rt_next) { + if (!rtsock_can_send(rt, rtsrc, family)) + continue; + + /* + * There is at least one routing socket that is interested in + * receiving this message, and able to receive it. + */ + if (pbuf == NULL) + return TRUE; + + /* + * We need to make copies of the generated message for all but + * the last matching socket, which gets the original. If we're + * out of memory, free the original and stop: there are more + * important things to spend memory on than routing sockets. + */ + if (rtprev != NULL) { + if ((pcopy = rtsock_alloc(pbuf->tot_len)) == NULL) { + pbuf_free(pbuf); + + return TRUE; + } + + if (pbuf_copy(pcopy, pbuf) != ERR_OK) + panic("unexpected pbuf copy failure"); + + rtsock_enqueue(rtprev, pcopy); + } + + rtprev = rt; + } + + if (rtprev != NULL) + rtsock_enqueue(rtprev, pbuf); + else if (pbuf != NULL) + pbuf_free(pbuf); + + return (rtprev != NULL); +} + +/* + * Dequeue and free the head of the receive queue of a routing socket. + */ +static void +rtsock_dequeue(struct rtsock * rt) +{ + struct pbuf *pbuf, **pnext; + size_t size; + + pbuf = rt->rt_rcvhead; + assert(pbuf != NULL); + + pnext = pchain_end(pbuf); + size = pchain_size(pbuf); + + if ((rt->rt_rcvhead = *pnext) == NULL) + rt->rt_rcvtailp = &rt->rt_rcvhead; + + assert(rt->rt_rcvlen >= size); + rt->rt_rcvlen -= size; + + *pnext = NULL; + pbuf_free(pbuf); +} + +/* + * Process a routing message sent on a socket. Return OK on success, in which + * case the caller assumes that the processing routine has sent a reply to the + * user and possibly other routing sockets. Return a negative error code on + * failure, in which case the caller will send the reply to the user instead. + */ +static int +rtsock_process(struct rtsock *rt, struct rt_msghdr * rtm, char * buf, + size_t len, int is_root) +{ + struct rtsock_request rtr; + struct rtsock_rta rta; + const struct sockaddr *netmask; + struct sockaddr sa; + union sockaddr_any mask; + size_t off; + int i; + + if (rtm->rtm_msglen != len) + return EINVAL; + + if (rtm->rtm_version != RTM_VERSION) { + printf("LWIP: PID %d uses routing sockets version %u\n", + rtm->rtm_pid, rtm->rtm_version); + + return EPROTONOSUPPORT; + } + + /* + * Make sure that we won't misinterpret the rest of the message. While + * looking at the message type, also make sure non-root users can only + * ever issue RTM_GET requests. 
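+ * For reference, the layout that the extraction loop further below relies on + * is, informally (not an exact byte diagram): a struct rt_msghdr, followed by + * one socket address for each bit set in rtm_addrs, stored in increasing + * RTAX_ order and each padded out to the next RT_ROUNDUP() boundary.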
+ */ + switch (rtm->rtm_type) { + case RTM_ADD: + case RTM_DELETE: + case RTM_CHANGE: + case RTM_LOCK: + if (!is_root) + return EPERM; + + /* FALLTHROUGH */ + case RTM_GET: + break; + + default: + return EOPNOTSUPP; + } + + /* + * Extract all given addresses. We do not actually support all types + * of entries, but we cannot skip the ones we do not need either. + */ + rtsock_rta_init(&rta); + + off = sizeof(*rtm); + assert(off == RT_ROUNDUP(off)); + + for (i = 0; i < RTAX_MAX; i++) { + if (!(rtm->rtm_addrs & (1 << i))) + continue; + + if (off + offsetof(struct sockaddr, sa_data) > len) + return EINVAL; + + /* + * It is safe to access sa_len and even sa_family in all cases, + * in particular even when the structure is of size zero. + */ + assert(offsetof(struct sockaddr, sa_data) <= RT_ROUNDUP(0)); + + memcpy(&sa, &buf[off], offsetof(struct sockaddr, sa_data)); + + if (off + sa.sa_len > len) + return EINVAL; + + rtsock_rta_set(&rta, i, &buf[off], sa.sa_len); + + off += RT_ROUNDUP((size_t)sa.sa_len); + } + + /* + * Expand the given netmask if it is in compressed IPv4 form. We do + * this here because it is particular to routing sockets; we also do + * the compression in this module. Note how the compression may even + * strip off the address family; really, who came up with this ****? + */ + netmask = (const struct sockaddr *)rta.rta_ptr[RTAX_NETMASK]; + + if (netmask != NULL) { + if (!rtsock_expand_netmask(&mask, netmask)) + return EINVAL; + + rtsock_rta_set(&rta, RTAX_NETMASK, &mask, mask.sa.sa_len); + } + + /* + * Actually process the command. Pass on enough information so that a + * reply can be generated on success. The abstraction as sketched at + * the top of the file imposes that we pass quite a few parameters. + */ + rtr.rtr_src = rt; + rtr.rtr_pid = rtm->rtm_pid; + rtr.rtr_seq = rtm->rtm_seq; + rtr.rtr_getif = (rtm->rtm_type == RTM_GET && + (rta.rta_ptr[RTAX_IFP] != NULL || rta.rta_ptr[RTAX_IFA] != NULL)); + + return route_process(rtm->rtm_type, + (const struct sockaddr *)rta.rta_ptr[RTAX_DST], + (const struct sockaddr *)rta.rta_ptr[RTAX_NETMASK], + (const struct sockaddr *)rta.rta_ptr[RTAX_GATEWAY], + (const struct sockaddr *)rta.rta_ptr[RTAX_IFP], + (const struct sockaddr *)rta.rta_ptr[RTAX_IFA], + rtm->rtm_flags, rtm->rtm_inits, &rtm->rtm_rmx, &rtr); +} + +/* + * Perform preliminary checks on a send request. + */ +static int +rtsock_pre_send(struct sock * sock __unused, size_t len, + socklen_t ctl_len __unused, const struct sockaddr * addr, + socklen_t addr_len __unused, endpoint_t user_endpt __unused, int flags) +{ + + if (flags != 0) + return EOPNOTSUPP; + + if (addr != NULL) + return EISCONN; + + /* + * For the most basic failures - that is, we cannot even manage to + * receive the request - we do not generate a reply message. + */ + if (len < sizeof(struct rt_msghdr)) + return ENOBUFS; + if (len > RT_SNDBUF_MAX) + return EMSGSIZE; + + return OK; +} + +/* + * Send data on a routing socket. + */ +static int +rtsock_send(struct sock * sock, const struct sockdriver_data * data, + size_t len, size_t * offp, const struct sockdriver_data * ctl __unused, + socklen_t ctl_len __unused, socklen_t * ctl_off __unused, + const struct sockaddr * addr __unused, socklen_t addr_len __unused, + endpoint_t user_endpt, int flags __unused, size_t min __unused) +{ + struct rtsock *rt = (struct rtsock *)sock; + char buf[RT_SNDBUF_MAX] __aligned(4); + struct rt_msghdr rtm; + struct pbuf *pbuf; + uid_t euid; + int r, is_root; + + /* Copy in the request, and adjust some fields right away. 
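+ * For context, a typical userland sequence ending up here would look roughly + * like the following sketch (error handling omitted): + * + *	fd = socket(PF_ROUTE, SOCK_RAW, AF_INET); + *	write(fd, buf, ((struct rt_msghdr *)buf)->rtm_msglen); + * + * with 'buf' holding a request laid out as described in rtsock_process().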
*/ + assert(len >= sizeof(rtm)); + assert(len <= sizeof(buf)); + + if ((r = sockdriver_copyin(data, 0, buf, len)) != OK) + return r; + + memcpy(&rtm, buf, sizeof(rtm)); + rtm.rtm_errno = 0; + rtm.rtm_flags &= ~RTF_DONE; + rtm.rtm_pid = getepinfo(user_endpt, &euid, NULL /*gid*/); + + is_root = (euid == ROOT_EUID); + + /* Process the request. */ + r = rtsock_process(rt, &rtm, buf, len, is_root); + + /* + * If the request has been processed successfully, a reply has been + * sent already, possibly also to other routing sockets. Here, we + * handle the case that the request has resulted in failure, in which + * case we send a reply to the caller only. This behavior is different + * from the traditional BSD behavior, which also sends failure replies + * to other sockets. Our motivation is that while other parties are + * never going to be interested in failures anyway, it is in fact easy + * for an unprivileged user process to abuse the failure-reply system + * in order to fake other types of routing messages (e.g., RTM_IFINFO) + * to other parties. By sending failure replies only to the requestor, + * we eliminate the need for security-sensitive request validation. + */ + if (r != OK && rtsock_can_send(rt, rt, AF_UNSPEC)) { + rtm.rtm_errno = -r; + + if ((pbuf = rtsock_alloc(len)) == NULL) + return ENOMEM; + + /* For the reply, reuse the request message largely as is. */ + memcpy(pbuf->payload, &rtm, sizeof(rtm)); + if (len > sizeof(rtm)) + memcpy((uint8_t *)pbuf->payload + sizeof(rtm), + buf + sizeof(rtm), len - sizeof(rtm)); + + rtsock_enqueue(rt, pbuf); + } else if (r == OK) + *offp = len; + + return r; +} + +/* + * Perform preliminary checks on a receive request. + */ +static int +rtsock_pre_recv(struct sock * sock __unused, endpoint_t user_endpt __unused, + int flags) +{ + + /* + * We accept the same flags across all socket types in LWIP, and then + * simply ignore the ones we do not support for routing sockets. + */ + if ((flags & ~(MSG_PEEK | MSG_WAITALL)) != 0) + return EOPNOTSUPP; + + return OK; +} + +/* + * Receive data on a routing socket. + */ +static int +rtsock_recv(struct sock * sock, const struct sockdriver_data * data, + size_t len, size_t * off, const struct sockdriver_data * ctl __unused, + socklen_t ctl_len __unused, socklen_t * ctl_off __unused, + struct sockaddr * addr, socklen_t * addr_len, + endpoint_t user_endpt __unused, int flags, size_t min __unused, + int * rflags) +{ + struct rtsock *rt = (struct rtsock *)sock; + struct pbuf *pbuf; + int r; + + if ((pbuf = rt->rt_rcvhead) == NULL) + return SUSPEND; + + /* Copy out the data to the calling user process. */ + if (len >= pbuf->tot_len) + len = pbuf->tot_len; + else + *rflags |= MSG_TRUNC; + + r = util_copy_data(data, len, 0, pbuf, 0, FALSE /*copy_in*/); + + if (r != OK) + return r; + + /* Generate a dummy source address. */ + addr->sa_len = RTSOCK_ADDR_LEN; + addr->sa_family = AF_ROUTE; + *addr_len = RTSOCK_ADDR_LEN; + + /* Discard the data now, unless we were instructed to peek only. */ + if (!(flags & MSG_PEEK)) + rtsock_dequeue(rt); + + /* Return the received part of the data length. */ + *off = len; + return OK; +} + +/* + * Test whether data can be received on a routing socket, and if so, how many + * bytes of data. 
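+ * Each queued routing message is delivered by rtsock_recv() above as a + * single unit, so a reader loop along the lines of the following sketch + * (where 'handle' is a hypothetical consumer) receives exactly one message + * per iteration, with recvmsg(2) additionally reporting MSG_TRUNC when the + * buffer is too small: + * + *	char buf[2048]; + *	while ((n = read(fd, buf, sizeof(buf))) > 0) + *		handle((struct rt_msghdr *)buf, n);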
+ */ +static int +rtsock_test_recv(struct sock * sock, size_t min __unused, size_t * size) +{ + struct rtsock *rt = (struct rtsock *)sock; + + if (rt->rt_rcvhead == NULL) + return SUSPEND; + + if (size != NULL) + *size = rt->rt_rcvhead->tot_len; + return OK; +} + +/* + * Set socket options on a routing socket. + */ +static int +rtsock_setsockopt(struct sock * sock, int level, int name, + const struct sockdriver_data * data, socklen_t len) +{ + struct rtsock *rt = (struct rtsock *)sock; + int r, val; + + if (level == SOL_SOCKET) { + switch (name) { + case SO_USELOOPBACK: + if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), + len)) != OK) + return r; + + if (!val) + rt->rt_flags |= RTF_NOLOOPBACK; + else + rt->rt_flags &= ~RTF_NOLOOPBACK; + + return OK; + + case SO_RCVBUF: + if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), + len)) != OK) + return r; + + if (val < RT_RCVBUF_MIN || val > RT_RCVBUF_MAX) + return EINVAL; + + rt->rt_rcvbuf = (size_t)val; + + return OK; + } + } + + return ENOPROTOOPT; +} + +/* + * Retrieve socket options on a routing socket. + */ +static int +rtsock_getsockopt(struct sock * sock, int level, int name, + const struct sockdriver_data * data, socklen_t * len) +{ + struct rtsock *rt = (struct rtsock *)sock; + int val; + + if (level == SOL_SOCKET) { + switch (name) { + case SO_USELOOPBACK: + val = !(rt->rt_flags & RTF_NOLOOPBACK); + + return sockdriver_copyout_opt(data, &val, sizeof(val), + len); + + case SO_RCVBUF: + val = rt->rt_rcvbuf; + + return sockdriver_copyout_opt(data, &val, sizeof(val), + len); + } + } + + return ENOPROTOOPT; +} + +/* + * Retrieve the local or remote socket address of a routing socket. + */ +static int +rtsock_getname(struct sock * sock __unused, struct sockaddr * addr, + socklen_t * addr_len) +{ + + /* This is entirely useless but apparently common between OSes. */ + addr->sa_len = RTSOCK_ADDR_LEN; + addr->sa_family = AF_ROUTE; + *addr_len = RTSOCK_ADDR_LEN; + + return OK; +} + +/* + * Drain the receive queue of a routing socket. + */ +static void +rtsock_drain(struct rtsock * rt) +{ + + while (rt->rt_rcvhead != NULL) + rtsock_dequeue(rt); +} + +/* + * Shut down a routing socket for reading and/or writing. + */ +static int +rtsock_shutdown(struct sock * sock, unsigned int mask) +{ + struct rtsock *rt = (struct rtsock *)sock; + + if (mask & SFL_SHUT_RD) + rtsock_drain(rt); + + return OK; +} + +/* + * Close a routing socket. + */ +static int +rtsock_close(struct sock * sock, int force __unused) +{ + struct rtsock *rt = (struct rtsock *)sock; + + rtsock_drain(rt); + + return OK; +} + +/* + * Free up a closed routing socket. + */ +static void +rtsock_free(struct sock * sock) +{ + struct rtsock *rt = (struct rtsock *)sock; + + TAILQ_REMOVE(&rt_activelist, rt, rt_next); + + TAILQ_INSERT_HEAD(&rt_freelist, rt, rt_next); +} + +static const struct sockevent_ops rtsock_ops = { + .sop_pre_send = rtsock_pre_send, + .sop_send = rtsock_send, + .sop_pre_recv = rtsock_pre_recv, + .sop_recv = rtsock_recv, + .sop_test_recv = rtsock_test_recv, + .sop_setsockopt = rtsock_setsockopt, + .sop_getsockopt = rtsock_getsockopt, + .sop_getsockname = rtsock_getname, + .sop_getpeername = rtsock_getname, + .sop_shutdown = rtsock_shutdown, + .sop_close = rtsock_close, + .sop_free = rtsock_free +}; + +/* + * Send an interface announcement message about the given interface. If + * 'arrival' is set, the interface has just been created; otherwise, the + * interface is about to be destroyed. 
+ */ +void +rtsock_msg_ifannounce(struct ifdev * ifdev, int arrival) +{ + struct if_announcemsghdr ifan; + struct pbuf *pbuf; + + if (!rtsock_msg_match(NULL /*rtsrc*/, AF_UNSPEC, NULL /*pbuf*/)) + return; + + memset(&ifan, 0, sizeof(ifan)); + ifan.ifan_msglen = sizeof(ifan); + ifan.ifan_version = RTM_VERSION; + ifan.ifan_type = RTM_IFANNOUNCE; + ifan.ifan_index = ifdev_get_index(ifdev); + strlcpy(ifan.ifan_name, ifdev_get_name(ifdev), sizeof(ifan.ifan_name)); + ifan.ifan_what = (arrival) ? IFAN_ARRIVAL : IFAN_DEPARTURE; + + if ((pbuf = rtsock_alloc(sizeof(ifan))) == NULL) + return; + memcpy(pbuf->payload, &ifan, sizeof(ifan)); + + rtsock_msg_match(NULL /*rtsrc*/, AF_UNSPEC, pbuf); +} + +/* + * Send an interface information routing message. + */ +void +rtsock_msg_ifinfo(struct ifdev * ifdev) +{ + struct if_msghdr ifm; + struct pbuf *pbuf; + + if (!rtsock_msg_match(NULL /*rtsrc*/, AF_UNSPEC, NULL /*pbuf*/)) + return; + + memset(&ifm, 0, sizeof(ifm)); + ifm.ifm_msglen = sizeof(ifm); + ifm.ifm_version = RTM_VERSION; + ifm.ifm_type = RTM_IFINFO; + ifm.ifm_addrs = 0; + ifm.ifm_flags = ifdev_get_ifflags(ifdev); + ifm.ifm_index = ifdev_get_index(ifdev); + memcpy(&ifm.ifm_data, ifdev_get_ifdata(ifdev), sizeof(ifm.ifm_data)); + + if ((pbuf = rtsock_alloc(sizeof(ifm))) == NULL) + return; + memcpy(pbuf->payload, &ifm, sizeof(ifm)); + + rtsock_msg_match(NULL /*rtsrc*/, AF_UNSPEC, pbuf); +} + +/* + * Set up a RTA map and an interface address structure for use in a RTM_xxxADDR + * routing message. + */ +static void +rtsock_rta_init_ifam(struct rtsock_rta * rta, struct ifa_msghdr * ifam, + struct ifdev * ifdev, unsigned int type, struct sockaddr_dlx * sdlx) +{ + + memset(ifam, 0, sizeof(*ifam)); + ifam->ifam_version = RTM_VERSION; + ifam->ifam_type = type; + ifam->ifam_flags = 0; + ifam->ifam_index = ifdev_get_index(ifdev); + ifam->ifam_metric = ifdev_get_metric(ifdev); + + rtsock_rta_init(rta); + + ifaddr_dl_get(ifdev, (ifaddr_dl_num_t)0, sdlx); + + rtsock_rta_set(rta, RTAX_IFP, sdlx, sdlx->sdlx_len); +} + +/* + * Add a specific link-layer address for an interface to the given RTA map. + */ +static void +rtsock_rta_add_dl(struct rtsock_rta * rta, struct ifdev * ifdev, + ifaddr_dl_num_t num, struct sockaddr_dlx * sdlx) +{ + + /* Obtain the address data. */ + ifaddr_dl_get(ifdev, num, sdlx); + + /* Add the interface address. */ + rtsock_rta_set(rta, RTAX_IFA, sdlx, sdlx->sdlx_len); + + /* + * NetBSD also adds a RTAX_NETMASK entry here. At this moment it is + * not clear to me why, and it is a pain to make, so for now we do not. + */ +} + +/* + * Send a routing message about a new, changed, or deleted datalink address for + * the given interface. + */ +void +rtsock_msg_addr_dl(struct ifdev * ifdev, unsigned int type, + ifaddr_dl_num_t num) +{ + struct rtsock_rta rta; + struct ifa_msghdr ifam; + struct sockaddr_dlx name, addr; + struct pbuf *pbuf; + + if (!rtsock_msg_match(NULL /*rtsrc*/, AF_LINK, NULL /*pbuf*/)) + return; + + rtsock_rta_init_ifam(&rta, &ifam, ifdev, type, &name); + + rtsock_rta_add_dl(&rta, ifdev, num, &addr); + + if (rtsock_rta_finalize(&ifam, sizeof(ifam), &ifam.ifam_msglen, + &ifam.ifam_addrs, &rta, &pbuf, NULL, 0) > 0) + rtsock_msg_match(NULL /*rtsrc*/, AF_LINK, pbuf); +} + +/* + * Add a specific IPv4 address for an interface to the given RTA map. + */ +static void +rtsock_rta_add_v4(struct rtsock_rta * rta, struct ifdev * ifdev, + ifaddr_v4_num_t num, struct sockaddr_in sin[4]) +{ + + /* Obtain the address data. 
*/ + (void)ifaddr_v4_get(ifdev, num, &sin[0], &sin[1], &sin[2], &sin[3]); + + /* Add the interface address. */ + rtsock_rta_set(rta, RTAX_IFA, &sin[0], sin[0].sin_len); + + /* Add the netmask, after compressing it. */ + rtsock_compress_netmask((struct sockaddr *)&sin[1]); + + rtsock_rta_set(rta, RTAX_NETMASK, &sin[1], sin[1].sin_len); + + /* Possibly add a broadcast or destination address. */ + if (sin[2].sin_len != 0) + rtsock_rta_set(rta, RTAX_BRD, &sin[2], sin[2].sin_len); + else if (sin[3].sin_len != 0) + rtsock_rta_set(rta, RTAX_DST, &sin[3], sin[3].sin_len); +} + +/* + * Send a routing message about a new or deleted IPv4 address for the given + * interface. + */ +void +rtsock_msg_addr_v4(struct ifdev * ifdev, unsigned int type, + ifaddr_v4_num_t num) +{ + struct rtsock_rta rta; + struct ifa_msghdr ifam; + struct sockaddr_dlx name; + struct sockaddr_in sin[4]; + struct pbuf *pbuf; + + if (!rtsock_msg_match(NULL /*rtsrc*/, AF_INET, NULL /*pbuf*/)) + return; + + rtsock_rta_init_ifam(&rta, &ifam, ifdev, type, &name); + + rtsock_rta_add_v4(&rta, ifdev, num, sin); + + if (rtsock_rta_finalize(&ifam, sizeof(ifam), &ifam.ifam_msglen, + &ifam.ifam_addrs, &rta, &pbuf, NULL, 0) > 0) + rtsock_msg_match(NULL /*rtsrc*/, AF_INET, pbuf); +} + +/* + * Add a specific IPv6 address for an interface to the given RTA map. + */ +static void +rtsock_rta_add_v6(struct rtsock_rta * rta, struct ifdev * ifdev, + ifaddr_v6_num_t num, struct sockaddr_in6 sin6[3]) +{ + + /* Obtain the address data. */ + ifaddr_v6_get(ifdev, num, &sin6[0], &sin6[1], &sin6[2]); + + /* Add the interface address. */ + rtsock_rta_set(rta, RTAX_IFA, &sin6[0], sin6[0].sin6_len); + + /* Add the netmask, after compressing it (a no-op at the moment). */ + rtsock_compress_netmask((struct sockaddr *)&sin6[1]); + + rtsock_rta_set(rta, RTAX_NETMASK, &sin6[1], sin6[1].sin6_len); + + /* Possibly add a destination address. */ + if (sin6[2].sin6_len != 0) + rtsock_rta_set(rta, RTAX_DST, &sin6[2], sin6[2].sin6_len); +} + +/* + * Send a routing message about a new or deleted IPv6 address for the given + * interface. + */ +void +rtsock_msg_addr_v6(struct ifdev * ifdev, unsigned int type, + ifaddr_v6_num_t num) +{ + struct rtsock_rta rta; + struct ifa_msghdr ifam; + struct sockaddr_dlx name; + struct sockaddr_in6 sin6[3]; + struct pbuf *pbuf; + + if (!rtsock_msg_match(NULL /*rtsrc*/, AF_INET6, NULL /*pbuf*/)) + return; + + rtsock_rta_init_ifam(&rta, &ifam, ifdev, type, &name); + + rtsock_rta_add_v6(&rta, ifdev, num, sin6); + + if (rtsock_rta_finalize(&ifam, sizeof(ifam), &ifam.ifam_msglen, + &ifam.ifam_addrs, &rta, &pbuf, NULL, 0) > 0) + rtsock_msg_match(NULL /*rtsrc*/, AF_INET6, pbuf); +} + +/* + * Send an RTM_MISS routing message about an address for which no route was + * found. The caller must provide the address in the appropriate form and + * perform any per-address rate limiting. + */ +void +rtsock_msg_miss(const struct sockaddr * addr) +{ + struct rt_msghdr rtm; + struct rtsock_rta rta; + struct pbuf *pbuf; + + /* + * Unfortunately the destination address has already been generated (as + * 'addr'), which is a big part of the work. Still, skip the rest if + * there is no routing socket to deliver the message to. 
+ */ + if (!rtsock_msg_match(NULL /*rtsrc*/, addr->sa_family, NULL /*pbuf*/)) + return; + + memset(&rtm, 0, sizeof(rtm)); + rtm.rtm_version = RTM_VERSION; + rtm.rtm_type = RTM_MISS; + + rtsock_rta_init(&rta); + + rtsock_rta_set(&rta, RTAX_DST, addr, addr->sa_len); + + if (rtsock_rta_finalize(&rtm, sizeof(rtm), &rtm.rtm_msglen, + &rtm.rtm_addrs, &rta, &pbuf, NULL, 0) > 0) + rtsock_msg_match(NULL /*rtsrc*/, addr->sa_family, pbuf); +} + +/* + * Generate routing socket data for a route, for either routing socket + * broadcasting or a sysctl(7) request. The route is given as 'route'. The + * type of the message (RTM_) is given as 'type'. The resulting routing + * message header is stored in 'rtm' and an address vector is stored in 'rta'. + * The latter may point to addresses generated in 'addr', 'mask', 'gateway', + * and optionally (if not NULL) 'ifp' and 'ifa'. The caller is responsible for + * combining the results into an appropriate routing message. + */ +static void +rtsock_get_route(struct rt_msghdr * rtm, struct rtsock_rta * rta, + union sockaddr_any * addr, union sockaddr_any * mask, + union sockaddr_any * gateway, union sockaddr_any * ifp, + union sockaddr_any * ifa, const struct route_entry * route, + unsigned int type) +{ + struct ifdev *ifdev; + unsigned int flags, use; + + route_get(route, addr, mask, gateway, ifp, ifa, &ifdev, &flags, &use); + + memset(rtm, 0, sizeof(*rtm)); + rtm->rtm_version = RTM_VERSION; + rtm->rtm_type = type; + rtm->rtm_flags = flags; + rtm->rtm_index = ifdev_get_index(ifdev); + rtm->rtm_use = use; + + rtsock_rta_init(rta); + + rtsock_rta_set(rta, RTAX_DST, addr, addr->sa.sa_len); + + if (!(flags & RTF_HOST)) { + rtsock_compress_netmask(&mask->sa); + + rtsock_rta_set(rta, RTAX_NETMASK, mask, mask->sa.sa_len); + } + + rtsock_rta_set(rta, RTAX_GATEWAY, gateway, gateway->sa.sa_len); + + if (ifp != NULL) + rtsock_rta_set(rta, RTAX_IFP, ifp, ifp->sa.sa_len); + + if (ifa != NULL) + rtsock_rta_set(rta, RTAX_IFA, ifa, ifa->sa.sa_len); +} + +/* + * Send a routing message about a route, with the given type which may be one + * of RTM_ADD, RTM_CHANGE, RTM_DELETE, RTM_LOCK, and RTM_GET. The routing + * socket request information 'rtr', if not NULL, provides additional + * information about the routing socket that was the source of the request (if + * any), various fields that should be echoed, and (for RTM_GET) whether to + * add interface information to the output. + */ +void +rtsock_msg_route(const struct route_entry * route, unsigned int type, + const struct rtsock_request * rtr) +{ + union sockaddr_any addr, mask, gateway, ifp, ifa; + struct rt_msghdr rtm; + struct rtsock_rta rta; + struct rtsock *rtsrc; + struct pbuf *pbuf; + int family, getif; + + rtsrc = (rtr != NULL) ? rtr->rtr_src : NULL; + family = (route_is_ipv6(route)) ? AF_INET6 : AF_INET; + + if (!rtsock_msg_match(rtsrc, family, NULL /*pbuf*/)) + return; + + getif = (rtr != NULL && rtr->rtr_getif); + + rtsock_get_route(&rtm, &rta, &addr, &mask, &gateway, + (getif) ? &ifp : NULL, (getif) ? &ifa : NULL, route, type); + + if (rtr != NULL) { + rtm.rtm_flags |= RTF_DONE; + rtm.rtm_pid = rtr->rtr_pid; + rtm.rtm_seq = rtr->rtr_seq; + } + + if (rtsock_rta_finalize(&rtm, sizeof(rtm), &rtm.rtm_msglen, + &rtm.rtm_addrs, &rta, &pbuf, NULL, 0) > 0) + rtsock_msg_match(rtsrc, family, pbuf); +} + +/* + * Generate sysctl(7) output or length for the given routing table entry + * 'route', provided that the route passes the flags filter 'filter'. The + * address length 'addr_len' is used to compute a cheap length estimate. 
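+ * As a rough sketch of the arithmetic: for that estimate the code adds up + * sizeof(struct rt_msghdr), a rounded-up destination of 'addr_len' bytes, a + * gateway for which the full sockaddr union size is used, and (for network + * routes) a rounded-up netmask of 'addr_len' bytes; since netmask compression + * can only shrink addresses, the estimate is at least as large as what the + * copy-out path produces.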
On + * success, return the byte size of the output. If the route was not a match + * for the filter, return zero. On failure, return a negative error code. + */ +static ssize_t +rtsock_info_rtable_entry(const struct route_entry * route, unsigned int filter, + socklen_t addr_len, struct rmib_oldp * oldp, size_t off) +{ + union sockaddr_any addr, mask, gateway; + struct rt_msghdr rtm; + struct rtsock_rta rta; + unsigned int flags; + ssize_t len; + + flags = route_get_flags(route); + + /* Apparently, matching any of the flags (if given) is sufficient. */ + if (filter != 0 && !(filter & flags)) + return 0; + + /* Size (over)estimation shortcut. */ + if (oldp == NULL) { + len = sizeof(rtm) + RT_ROUNDUP(addr_len) + + RT_ROUNDUP(sizeof(gateway)); + + if (!(flags & RTF_HOST)) + len += RT_ROUNDUP(addr_len); + + return len; + } + + rtsock_get_route(&rtm, &rta, &addr, &mask, &gateway, NULL /*ifp*/, + NULL /*ifa*/, route, RTM_GET); + + return rtsock_rta_finalize(&rtm, sizeof(rtm), &rtm.rtm_msglen, + &rtm.rtm_addrs, &rta, NULL /*pbuf*/, oldp, off); +} + +/* + * Obtain routing table entries. + */ +static ssize_t +rtsock_info_rtable(struct rmib_oldp * oldp, int family, int filter) +{ + struct route_entry *route; + ssize_t r, off; + + off = 0; + + if (family == AF_UNSPEC || family == AF_INET) { + for (route = NULL; (route = route_enum_v4(route)) != NULL; ) { + if ((r = rtsock_info_rtable_entry(route, + (unsigned int)filter, sizeof(struct sockaddr_in), + oldp, off)) < 0) + return r; + off += r; + } + } + + if (family == AF_UNSPEC || family == AF_INET6) { + for (route = NULL; (route = route_enum_v6(route)) != NULL; ) { + if ((r = rtsock_info_rtable_entry(route, + (unsigned int)filter, sizeof(struct sockaddr_in6), + oldp, off)) < 0) + return r; + off += r; + } + } + + /* TODO: should we add slack here? */ + return off; +} + +/* + * Generate routing socket data for an ARP table entry, for either routing + * socket broadcasting or a sysctl(7) request. The ARP table entry number is + * given as 'num'. The type of the message (RTM_) is given as 'type'. The + * resulting routing message header is stored in 'rtm' and an address vector is + * stored in 'rta'. The latter may point to addresses generated in 'addr' and + * 'gateway'. The caller is responsible for combining the results into an + * appropriate routing message. + */ +static void +rtsock_get_arp(struct rt_msghdr * rtm, struct rtsock_rta * rta, + struct sockaddr_in * addr, struct sockaddr_dlx * gateway, + lldata_arp_num_t num, unsigned int type) +{ + struct ifdev *ifdev; + unsigned int flags; + + lldata_arp_get(num, addr, gateway, &ifdev, &flags); + + memset(rtm, 0, sizeof(*rtm)); + rtm->rtm_version = RTM_VERSION; + rtm->rtm_type = type; + rtm->rtm_flags = flags; + rtm->rtm_index = ifdev_get_index(ifdev); + + /* TODO: obtaining and reporting the proper expiry time, if any. */ + if (!(flags & RTF_STATIC)) + rtm->rtm_rmx.rmx_expire = (time_t)-1; + + rtsock_rta_init(rta); + + rtsock_rta_set(rta, RTAX_DST, addr, addr->sin_len); + + rtsock_rta_set(rta, RTAX_GATEWAY, gateway, gateway->sdlx_len); +} + +/* + * Send a routing message about an ARP table entry, with the given type which + * may be one of RTM_ADD, RTM_CHANGE, RTM_DELETE, RTM_LOCK, and RTM_GET. The + * routing socket request information 'rtr', if not NULL, provides additional + * information about the routing socket that was the source of the request (if + * any) and various fields that should be echoed.
+ */ +void +rtsock_msg_arp(lldata_arp_num_t num, unsigned int type, + const struct rtsock_request * rtr) +{ + struct sockaddr_in addr; + struct sockaddr_dlx gateway; + struct rt_msghdr rtm; + struct rtsock_rta rta; + struct pbuf *pbuf; + + assert(rtr != NULL); + + /* + * We do not maintain the link-local tables ourselves, and thus, we do + * not have a complete view of modifications to them. In order not to + * confuse userland with inconsistent updates (e.g., deletion of + * previously unreported entries), send these routing messages to the + * source of the routing request only. + */ + if (!rtsock_msg_one(rtr->rtr_src, AF_INET, NULL /*pbuf*/)) + return; + + rtsock_get_arp(&rtm, &rta, &addr, &gateway, num, type); + + if (rtr != NULL) { + rtm.rtm_flags |= RTF_DONE; + rtm.rtm_pid = rtr->rtr_pid; + rtm.rtm_seq = rtr->rtr_seq; + } + + if (rtsock_rta_finalize(&rtm, sizeof(rtm), &rtm.rtm_msglen, + &rtm.rtm_addrs, &rta, &pbuf, NULL, 0) > 0) + rtsock_msg_one(rtr->rtr_src, AF_INET, pbuf); +} + +/* + * Obtain ARP table entries. + */ +static ssize_t +rtsock_info_lltable_arp(struct rmib_oldp * oldp) +{ + struct sockaddr_in addr; + struct sockaddr_dlx gateway; + struct rt_msghdr rtm; + struct rtsock_rta rta; + lldata_arp_num_t num; + ssize_t r, off; + + off = 0; + + for (num = 0; lldata_arp_enum(&num); num++) { + /* Size (over)estimation shortcut. */ + if (oldp == NULL) { + off += sizeof(struct rt_msghdr) + + RT_ROUNDUP(sizeof(addr)) + + RT_ROUNDUP(sizeof(gateway)); + + continue; + } + + rtsock_get_arp(&rtm, &rta, &addr, &gateway, num, RTM_GET); + + if ((r = rtsock_rta_finalize(&rtm, sizeof(rtm), + &rtm.rtm_msglen, &rtm.rtm_addrs, &rta, NULL /*pbuf*/, oldp, + off)) < 0) + return r; + off += r; + } + + /* TODO: should we add slack here? */ + return off; +} + +/* + * Generate routing socket data for an NDP table entry, for either routing + * socket broadcasting or a sysctl(7) request. The NDP table entry number is + * given as 'num'. The type of the message (RTM_) is given as 'type'. The + * resulting routing message header is stored in 'rtm' and an address vector is + * stored in 'rta'. The latter may point to addresses generated in 'addr' and + * 'gateway'. The caller is responsible for combining the results into an + * appropriate routing message. + */ +static void +rtsock_get_ndp(struct rt_msghdr * rtm, struct rtsock_rta * rta, + struct sockaddr_in6 * addr, struct sockaddr_dlx * gateway, + lldata_ndp_num_t num, unsigned int type) +{ + struct ifdev *ifdev; + unsigned int flags; + + lldata_ndp_get(num, addr, gateway, &ifdev, &flags); + + memset(rtm, 0, sizeof(*rtm)); + rtm->rtm_version = RTM_VERSION; + rtm->rtm_type = type; + rtm->rtm_flags = flags; + rtm->rtm_index = ifdev_get_index(ifdev); + + rtsock_rta_init(rta); + + rtsock_rta_set(rta, RTAX_DST, addr, addr->sin6_len); + + rtsock_rta_set(rta, RTAX_GATEWAY, gateway, gateway->sdlx_len); +} + +/* + * Send a routing message about an NDP table entry, with the given type which + * may be one of RTM_ADD, RTM_CHANGE, RTM_DELETE, RTM_LOCK, and RTM_GET. The + * routing socket request information 'rtr', if not NULL, provides additional + * information about the routing socket that was the source of the request (if + * any) and various fields that should be echoed. 
+ */ +void +rtsock_msg_ndp(lldata_ndp_num_t num, unsigned int type, + const struct rtsock_request * rtr) +{ + struct sockaddr_in6 addr; + struct sockaddr_dlx gateway; + struct rt_msghdr rtm; + struct rtsock_rta rta; + struct pbuf *pbuf; + + assert(rtr != NULL); + + /* + * We do not maintain the link-local tables ourselves, and thus, we do + * not have a complete view of modifications to them. In order not to + * confuse userland with inconsistent updates (e.g., deletion of + * previously unreported entries), send these routing messages to the + * source of the routing request only. + */ + if (!rtsock_msg_one(rtr->rtr_src, AF_INET6, NULL /*pbuf*/)) + return; + + rtsock_get_ndp(&rtm, &rta, &addr, &gateway, num, type); + + if (rtr != NULL) { + rtm.rtm_flags |= RTF_DONE; + rtm.rtm_pid = rtr->rtr_pid; + rtm.rtm_seq = rtr->rtr_seq; + } + + if (rtsock_rta_finalize(&rtm, sizeof(rtm), &rtm.rtm_msglen, + &rtm.rtm_addrs, &rta, &pbuf, NULL, 0) > 0) + rtsock_msg_one(rtr->rtr_src, AF_INET6, pbuf); +} + +/* + * Obtain NDP table entries. + */ +static ssize_t +rtsock_info_lltable_ndp(struct rmib_oldp * oldp) +{ + struct rt_msghdr rtm; + struct rtsock_rta rta; + struct sockaddr_in6 addr; + struct sockaddr_dlx gateway; + lldata_ndp_num_t num; + ssize_t r, off; + + off = 0; + + for (num = 0; lldata_ndp_enum(&num); num++) { + /* Size (over)estimation shortcut. */ + if (oldp == NULL) { + off += sizeof(struct rt_msghdr) + + RT_ROUNDUP(sizeof(addr)) + + RT_ROUNDUP(sizeof(gateway)); + + continue; + } + + rtsock_get_ndp(&rtm, &rta, &addr, &gateway, num, RTM_GET); + + if ((r = rtsock_rta_finalize(&rtm, sizeof(rtm), + &rtm.rtm_msglen, &rtm.rtm_addrs, &rta, NULL /*pbuf*/, oldp, + off)) < 0) + return r; + off += r; + } + + /* TODO: should we add slack here? */ + return off; +} + +/* + * Obtain link-layer (ARP, NDP) table entries. + */ +static ssize_t +rtsock_info_lltable(struct rmib_oldp * oldp, int family) +{ + + switch (family) { + case AF_INET: + return rtsock_info_lltable_arp(oldp); + + case AF_INET6: + return rtsock_info_lltable_ndp(oldp); + + default: + return 0; + } +} + +/* + * Obtain link-layer address information for one specific interface. + */ +static ssize_t +rtsock_info_if_dl(struct ifdev * ifdev, struct ifa_msghdr * ifam, + struct rmib_oldp * oldp, ssize_t off) +{ + struct rtsock_rta rta; + struct sockaddr_dlx sdlx; + ifaddr_dl_num_t num; + ssize_t r, len; + + len = 0; + + for (num = 0; ifaddr_dl_enum(ifdev, &num); num++) { + if (oldp == NULL) { + len += sizeof(*ifam) + RT_ROUNDUP(sizeof(sdlx)); + + continue; + } + + rtsock_rta_init(&rta); + + rtsock_rta_add_dl(&rta, ifdev, num, &sdlx); + + if ((r = rtsock_rta_finalize(ifam, sizeof(*ifam), + &ifam->ifam_msglen, &ifam->ifam_addrs, &rta, NULL /*pbuf*/, + oldp, off + len)) < 0) + return r; + len += r; + } + + return len; +} + +/* + * Obtain IPv4 address information for one specific interface. + */ +static ssize_t +rtsock_info_if_v4(struct ifdev * ifdev, struct ifa_msghdr * ifam, + struct rmib_oldp * oldp, ssize_t off) +{ + struct sockaddr_in sin[4]; + struct rtsock_rta rta; + ifaddr_v4_num_t num; + ssize_t r, len; + + len = 0; + + /* + * Mostly for future compatibility, we support multiple IPv4 interface + * addresses here. Every interface has an interface address and a + * netmask. In addition, an interface may have either a broadcast or a + * destination address. + */ + for (num = 0; ifaddr_v4_enum(ifdev, &num); num++) { + /* Size (over)estimation shortcut. 
*/ + if (oldp == NULL) { + len += sizeof(*ifam) + RT_ROUNDUP(sizeof(sin[0])) * 3; + + continue; + } + + rtsock_rta_init(&rta); + + rtsock_rta_add_v4(&rta, ifdev, num, sin); + + if ((r = rtsock_rta_finalize(ifam, sizeof(*ifam), + &ifam->ifam_msglen, &ifam->ifam_addrs, &rta, NULL /*pbuf*/, + oldp, off + len)) < 0) + return r; + len += r; + } + + return len; +} + +/* + * Obtain IPv6 address information for one specific interface. + */ +static ssize_t +rtsock_info_if_v6(struct ifdev * ifdev, struct ifa_msghdr * ifam, + struct rmib_oldp * oldp, ssize_t off) +{ + struct sockaddr_in6 sin6[3]; + struct rtsock_rta rta; + ifaddr_v6_num_t num; + ssize_t r, len; + + len = 0; + + /* As with IPv4, except that IPv6 has no broadcast addresses. */ + for (num = 0; ifaddr_v6_enum(ifdev, &num); num++) { + /* Size (over)estimation shortcut. */ + if (oldp == NULL) { + len += sizeof(*ifam) + RT_ROUNDUP(sizeof(sin6[0])) * 3; + + continue; + } + + rtsock_rta_init(&rta); + + rtsock_rta_add_v6(&rta, ifdev, num, sin6); + + if ((r = rtsock_rta_finalize(ifam, sizeof(*ifam), + &ifam->ifam_msglen, &ifam->ifam_addrs, &rta, NULL /*pbuf*/, + oldp, off + len)) < 0) + return r; + len += r; + } + + return len; +} + +/* + * Obtain information for one specific interface. + */ +static ssize_t +rtsock_info_if(struct ifdev * ifdev, struct rmib_oldp * oldp, ssize_t off, + int family) +{ + struct rtsock_rta rta; + struct sockaddr_dlx sdlx; + struct if_msghdr ifm; + struct ifa_msghdr ifam; + unsigned int ifflags; + ssize_t r, len, sdlxsize; + + len = 0; + + ifflags = ifdev_get_ifflags(ifdev); + + /* Create an interface information entry. */ + rtsock_rta_init(&rta); + + if (oldp != NULL) { + memset(&ifm, 0, sizeof(ifm)); + ifm.ifm_version = RTM_VERSION; + ifm.ifm_type = RTM_IFINFO; + ifm.ifm_flags = ifflags; + ifm.ifm_index = ifdev_get_index(ifdev); + memcpy(&ifm.ifm_data, ifdev_get_ifdata(ifdev), + sizeof(ifm.ifm_data)); + } + + /* + * Generate a datalink socket address structure. TODO: see if it is + * worth obtaining just the length for the (oldp == NULL) case here. + */ + memset(&sdlx, 0, sizeof(sdlx)); + + ifaddr_dl_get(ifdev, 0, &sdlx); + + sdlxsize = RT_ROUNDUP(sdlx.sdlx_len); + + rtsock_rta_set(&rta, RTAX_IFP, &sdlx, sdlxsize); + + if ((r = rtsock_rta_finalize(&ifm, sizeof(ifm), &ifm.ifm_msglen, + &ifm.ifm_addrs, &rta, NULL /*pbuf*/, oldp, off + len)) < 0) + return r; + len += r; + + /* Generate a header for all addresses once. */ + if (oldp != NULL) { + memset(&ifam, 0, sizeof(ifam)); + ifam.ifam_version = RTM_VERSION; + ifam.ifam_type = RTM_NEWADDR; + ifam.ifam_flags = 0; + ifam.ifam_index = ifdev_get_index(ifdev); + ifam.ifam_metric = ifdev_get_metric(ifdev); + } + + /* If requested and applicable, add any datalink addresses. */ + if (family == AF_UNSPEC || family == AF_LINK) { + if ((r = rtsock_info_if_dl(ifdev, &ifam, oldp, off + len)) < 0) + return r; + len += r; + } + + /* If requested and applicable, add any IPv4 addresses. */ + if (family == AF_UNSPEC || family == AF_INET) { + if ((r = rtsock_info_if_v4(ifdev, &ifam, oldp, off + len)) < 0) + return r; + len += r; + } + + /* If requested and applicable, add any IPv6 addresses. */ + if (family == AF_UNSPEC || family == AF_INET6) { + if ((r = rtsock_info_if_v6(ifdev, &ifam, oldp, off + len)) < 0) + return r; + len += r; + } + + return len; +} + +/* + * Obtain interface information. 
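+ * The data generated per interface by rtsock_info_if() above consists, + * informally, of one RTM_IFINFO message carrying the interface's datalink + * address, followed by one RTM_NEWADDR message per assigned address. Purely + * as an illustration, a consumer such as getifaddrs(3) would typically obtain + * this with a name like { CTL_NET, PF_ROUTE, 0, AF_UNSPEC, NET_RT_IFLIST, 0 } + * and then walk the returned buffer using each message's length field.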
+ */ +static ssize_t +rtsock_info_iflist(struct rmib_oldp * oldp, int family, uint32_t ifindex) +{ + struct ifdev *ifdev; + ssize_t r, off; + + /* + * If information about a specific interface index is requested, then + * return information for just that interface. + */ + if (ifindex != 0) { + if ((ifdev = ifdev_get_by_index(ifindex)) != NULL) + return rtsock_info_if(ifdev, oldp, 0, family); + else + return 0; + } + + /* Otherwise, iterate through the list of all interfaces. */ + off = 0; + + for (ifdev = ifdev_enum(NULL); ifdev != NULL; + ifdev = ifdev_enum(ifdev)) { + + /* Avoid generating results that are never copied out. */ + if (oldp != NULL && !rmib_inrange(oldp, off)) + oldp = NULL; + + if ((r = rtsock_info_if(ifdev, oldp, off, family)) < 0) + return r; + + off += r; + } + + /* TODO: should we add slack here? */ + return off; +} + +/* + * Obtain routing table, ARP cache, and interface information through + * sysctl(7). Return the (produced, or if oldp is NULL, estimated) byte size + * of the output on success, or a negative error code on failure. + */ +static ssize_t +rtsock_info(struct rmib_call * call, struct rmib_node * node __unused, + struct rmib_oldp * oldp, struct rmib_newp * newp __unused) +{ + int family, filter; + + if (call->call_namelen != 3) + return EINVAL; + + family = call->call_name[0]; + filter = call->call_name[2]; + + switch (call->call_name[1]) { + case NET_RT_FLAGS: + /* + * Preliminary support for changes as of NetBSD 8, where by + * default, the use of this subcall implies an ARP/NDP-only + * request. + */ + if (filter == 0) + filter |= RTF_LLDATA; + + if (filter & RTF_LLDATA) { + if (family == AF_UNSPEC) + return EINVAL; + + /* + * Split off ARP/NDP handling from the normal routing + * table listing, as done since NetBSD 8. We generate + * the ARP/NDP listing from here, and keep those + * entries out of the routing table dump below. Since + * the filter is of a match-any type, and we have just + * matched a flag, no further filtering is needed here. 
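+ * (As an illustration, an ARP cache dump request would then carry a name + * such as { CTL_NET, PF_ROUTE, 0, AF_INET, NET_RT_FLAGS, RTF_LLDATA }, which + * arrives here with family AF_INET and is answered from the link-layer + * tables rather than from the routing tree.)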
+ */ + return rtsock_info_lltable(oldp, family); + } + + /* FALLTHROUGH */ + case NET_RT_DUMP: + return rtsock_info_rtable(oldp, family, filter); + + case NET_RT_IFLIST: + return rtsock_info_iflist(oldp, family, filter); + + default: + return EINVAL; + } +} diff --git a/minix/net/lwip/rtsock.h b/minix/net/lwip/rtsock.h new file mode 100644 index 000000000..a5d2960d5 --- /dev/null +++ b/minix/net/lwip/rtsock.h @@ -0,0 +1,32 @@ +#ifndef MINIX_NET_LWIP_RTSOCK_H +#define MINIX_NET_LWIP_RTSOCK_H + +#include "ifaddr.h" +#include "lldata.h" + +struct route_entry; +struct rtsock_request; + +void rtsock_init(void); +sockid_t rtsock_socket(int type, int protocol, struct sock ** sock, + const struct sockevent_ops ** ops); + +void rtsock_msg_ifannounce(struct ifdev * ifdev, int arrival); +void rtsock_msg_ifinfo(struct ifdev * ifdev); + +void rtsock_msg_addr_dl(struct ifdev * ifdev, unsigned int type, + ifaddr_dl_num_t num); +void rtsock_msg_addr_v4(struct ifdev * ifdev, unsigned int type, + ifaddr_v4_num_t num); +void rtsock_msg_addr_v6(struct ifdev * ifdev, unsigned int type, + ifaddr_v6_num_t num); + +void rtsock_msg_miss(const struct sockaddr * addr); +void rtsock_msg_route(const struct route_entry * route, unsigned int type, + const struct rtsock_request * rtr); +void rtsock_msg_arp(lldata_arp_num_t num, unsigned int type, + const struct rtsock_request * rtr); +void rtsock_msg_ndp(lldata_ndp_num_t num, unsigned int type, + const struct rtsock_request * rtr); + +#endif /* !MINIX_NET_LWIP_RTSOCK_H */ diff --git a/minix/net/lwip/rttree.c b/minix/net/lwip/rttree.c new file mode 100644 index 000000000..eb0033464 --- /dev/null +++ b/minix/net/lwip/rttree.c @@ -0,0 +1,744 @@ +/* LWIP service - rttree.c - generic routing tree data structure */ +/* + * This module implements the Net/3 binary radix (Patricia) tree as described + * in TCP/IP Illustrated Vol.2, with a few important changes. First and + * foremost, we make the assumption that all address masks are "normal", i.e., + * they can be expressed in terms of a "prefix length" or "bit count", meaning + * that the first so many bits of the mask are set and the remaining bits are + * all clear. Based on this assumption, we store routing entries not just in + * leaf nodes, but rather in a node at the bit count of the routing entry's + * mask; this node may then also have children. As a result, instead of "leaf" + * and "internal" nodes, this module instead uses "data" and "link" nodes: + * + * - Data nodes are nodes with an associated routing entry. The data node + * structure is always the first field of its corresponding routing entry + * structure. Data nodes may have zero, one, or two children. Its children + * are always a refinement of the address mask in the routing entry. + * - Link nodes are nodes with no associated routing entry. They always have + * exactly two children. As with BSD's "internal" nodes: since the tree + * needs no more than one link node per routing entry, each routing entry + * structure contains a link node, which may be used anywhere in the tree. + * + * The result of this approach is that we do not use a linked list for each + * leaf, since entries with the same address and different masks are not stored + * as part of the same leaf node. There is however still one case where a + * linked list would be necessary: the coexistence of a full-mask network entry + * and a host entry (net/32 vs host for IPv4, net/128 vs host for IPv6). 
Since + * this tree implementation is not used for ARP/ND6 (host) entries, the need to + * support that case is not as high, and so it is currently not supported. It + * can be added later if needed. In that case, the prototype of only + * rttree_find_exact() will have to be changed, since rttree_add() already + * supports the difference by passing a full mask vs passing no mask at all. + * + * There are other differences with the BSD implementation, and certainly also + * more opportunities for improving performance. For now, the implementation + * should be good enough for its intended purpose. + */ + +#include "lwip.h" +#include "rttree.h" + +#define RTTREE_BITS_TO_BYTE(bits) ((bits) >> 3) +#define RTTREE_BITS_TO_SHIFT(bits) (7 - ((bits) & 7)) +#define RTTREE_BITS_TO_BYTES(bits) (RTTREE_BITS_TO_BYTE((bits) + 7)) + +/* + * The given node is being added to the given routing tree, and just had its + * bit count assigned. Precompute any additional fields used for fast address + * access on the node. + */ +static void +rttree_precompute(struct rttree * tree __unused, struct rttree_node * node) +{ + + node->rtn_byte = RTTREE_BITS_TO_BYTE(node->rtn_bits); + node->rtn_shift = RTTREE_BITS_TO_SHIFT(node->rtn_bits); +} + +/* + * For an operation on the routing tree 'tree', test whether the bit 'bit' is + * set or clear in 'addr'. Return 1 if the address has the bit set, 0 if it + * does not. + */ +static unsigned int +rttree_test(const struct rttree * tree __unused, const void * addr, + unsigned int bit) +{ + unsigned int byte, shift; + + byte = RTTREE_BITS_TO_BYTE(bit); + shift = RTTREE_BITS_TO_SHIFT(bit); + + return (((const uint8_t *)addr)[byte] >> shift) & 1; +} + +/* + * For an operation on the routing tree 'tree', test whether a particular bit + * as identified by the routing node 'node' is set or clear in 'address', + * effectively computing the side (left or right) to take when descending down + * the tree. Return 1 if the address has the bit set, 0 if it does not. + */ +static inline unsigned int +rttree_side(const struct rttree * tree, const struct rttree_node * node, + const void * addr) +{ + + return (((const uint8_t *)addr)[node->rtn_byte] >> + node->rtn_shift) & 1; +} + +/* + * Check for the routing tree 'tree' whether the routing entry 'entry' matches + * the address 'addr' exactly. Return TRUE or FALSE depending on the outcome. + * This function must be called only on entries that have already been + * determined to span the full bit width. + */ +static inline int +rttree_equals(const struct rttree * tree, const struct rttree_entry * entry, + const void * addr) +{ + unsigned int bits; + + bits = tree->rtt_bits; + + assert(bits == entry->rte_data.rtn_bits); + + return !memcmp(entry->rte_addr, addr, RTTREE_BITS_TO_BYTE(bits)); +} + +/* + * Check for the routing tree 'tree' whether the routing entry 'entry' matches + * the address 'addr'. Return TRUE if the address is matched by the entry's + * address and mask, or FALSE if not. 
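+ * As an example with illustrative values: an entry storing 192.168.1.0 with + * mask 255.255.255.0 matches the address 192.168.1.7, because each address + * byte ANDed with the corresponding mask byte equals the stored (pre-masked) + * entry byte, whereas 192.168.2.7 fails that comparison on the third byte.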
+ */ +static inline int +rttree_match(const struct rttree * tree, const struct rttree_entry * entry, + const void * addr) +{ + const uint8_t *aptr, *aptr2, *mptr; + unsigned int bits, bytes; + + if ((bits = entry->rte_data.rtn_bits) == 0) + return TRUE; + + if ((mptr = (const uint8_t *)entry->rte_mask) == NULL) + return rttree_equals(tree, entry, addr); + + aptr = (const uint8_t *)addr; + aptr2 = (const uint8_t *)entry->rte_addr; + + for (bytes = RTTREE_BITS_TO_BYTES(bits); bytes > 0; bytes--) { + if ((*aptr & *mptr) != *aptr2) + return FALSE; + + aptr++; + aptr2++; + mptr++; + } + + return TRUE; +} + +/* + * Find the first bit that differs between the two given addresses. Return the + * bit number if found, or the full bit width if the addresses are equal. + */ +static unsigned int +rttree_diff(const struct rttree * tree, const void * addr, const void * addr2) +{ + const uint8_t *aptr, *aptr2; + unsigned int bit, i; + uint8_t b; + + aptr = (const uint8_t *)addr; + aptr2 = (const uint8_t *)addr2; + + for (bit = 0; bit < tree->rtt_bits; bit += NBBY, aptr++, aptr2++) { + if ((b = *aptr ^ *aptr2) != 0) { + for (i = 0; i < NBBY; i++) + if (b & (1 << (NBBY - i - 1))) + break; + return bit + i; + } + } + + return bit; +} + +/* + * Add a link node to the free list of the given routing tree, marking it as + * free in the process. + */ +static void +rttree_add_free(struct rttree * tree, struct rttree_node * node) +{ + + node->rtn_child[0] = NULL; + if ((node->rtn_child[1] = tree->rtt_free) != NULL) + node->rtn_child[1]->rtn_child[0] = node; + tree->rtt_free = node; + node->rtn_parent = NULL; + node->rtn_type = RTNT_FREE; +} + +/* + * Remove the given free link node from the free list. The caller must already + * have verified that the node is on the free list, and has to change the node + * type as appropriate afterward. + */ +static void +rttree_del_free(struct rttree * tree, struct rttree_node * node) +{ + + assert(node->rtn_type == RTNT_FREE); + + if (node->rtn_child[0] != NULL) + node->rtn_child[0]->rtn_child[1] = node->rtn_child[1]; + else + tree->rtt_free = node->rtn_child[1]; + if (node->rtn_child[1] != NULL) + node->rtn_child[1]->rtn_child[0] = node->rtn_child[0]; +} + +/* + * Obtain, remove, and return a free link node from the free list. This + * function must be called only when it is already known that the free list is + * not empty. The caller has to change the node type as appropriate afterward. + */ +static struct rttree_node * +rttree_get_free(struct rttree * tree) +{ + struct rttree_node * node; + + node = tree->rtt_free; + assert(node != NULL); + assert(node->rtn_type == RTNT_FREE); + + rttree_del_free(tree, node); + + return node; +} + +/* + * Initialize the given routing tree, with the given address bit width. + */ +void +rttree_init(struct rttree * tree, unsigned int bits) +{ + + tree->rtt_root = NULL; + tree->rtt_free = NULL; + tree->rtt_bits = bits; +} + +/* + * Look up the most narrow routing tree entry that matches the given address. + * Return the entry on success, or NULL if no matching entry is found. + */ +struct rttree_entry * +rttree_lookup_match(struct rttree * tree, const void * addr) +{ + struct rttree_entry *entry, *best; + struct rttree_node *node; + unsigned int side; + + /* + * The current implementation is "forward-tracking", testing all + * potentially matching entries while descending into the tree and + * remembering the "best" (narrowest matching) entry. 
The assumption + * here is that most lookups will end up returning the default route or + * another broad route, and thus quickly fail a narrower match and bail + * out early. This assumption is in part motivated by the fact that + * our routing trees do not store link-layer (ARP/ND6) entries. If + * desired, the implementation can easily be rewritten to do + * backtracking instead. + */ + best = NULL; + + for (node = tree->rtt_root; node != NULL; + node = node->rtn_child[side]) { + if (node->rtn_type == RTNT_DATA) { + entry = (struct rttree_entry *)node; + + if (!rttree_match(tree, entry, addr)) + break; + + best = entry; + } + + side = rttree_side(tree, node, addr); + } + + return best; +} + +/* + * Look up a routing entry that is an exact match for the given (full) address. + * Return the entry if it was found, or NULL otherwise. + */ +struct rttree_entry * +rttree_lookup_host(struct rttree * tree, const void * addr) +{ + struct rttree_entry *entry; + struct rttree_node *node; + unsigned int side; + + for (node = tree->rtt_root; node != NULL; + node = node->rtn_child[side]) { + if (node->rtn_type == RTNT_DATA && + node->rtn_bits == tree->rtt_bits) { + entry = (struct rttree_entry *)node; + + if (rttree_equals(tree, entry, addr)) + return entry; + + break; + } + + side = rttree_side(tree, node, addr); + } + + return NULL; +} + +/* + * Look up a routing entry that is an exact match for the given address and + * prefix length. Return the entry if found, or NULL otherwise. + */ +struct rttree_entry * +rttree_lookup_exact(struct rttree * tree, const void * addr, + unsigned int prefix) +{ + struct rttree_entry *entry; + struct rttree_node *node; + unsigned int side; + + for (node = tree->rtt_root; node != NULL && node->rtn_bits <= prefix; + node = node->rtn_child[side]) { + if (node->rtn_type == RTNT_DATA) { + entry = (struct rttree_entry *)node; + + if (!rttree_match(tree, entry, addr)) + return NULL; + + if (node->rtn_bits == prefix) + return entry; + } + + side = rttree_side(tree, node, addr); + } + + return NULL; +} + +/* + * Enumerate entries in the routing tree. If 'last' is NULL, return the first + * entry. Otherwise, return the next entry starting from 'last'. In both + * cases, if no (more) entries are present in the tree, return NULL. The order + * of the returned entries is stable across tree modifications and the function + * may be called multiple times on the same entry. More specifically, it is + * safe to continue enumeration from a previous entry after deleting its + * successor from the tree. + */ +struct rttree_entry * +rttree_enum(struct rttree * tree, struct rttree_entry * last) +{ + struct rttree_node *node, *parent; + + /* + * For the first query, we may have to return the tree root right away. + * For subsequent queries, we have to move ahead by at least one node. + */ + if (last == NULL) { + if ((node = tree->rtt_root) == NULL) + return NULL; + + if (node->rtn_type == RTNT_DATA) + return (struct rttree_entry *)node; + } else + node = &last->rte_data; + + /* A basic iterative pre-order binary-tree depth-first search. */ + do { + assert(node != NULL); + + /* Can we descend further, either left or right? */ + if (node->rtn_child[0] != NULL) + node = node->rtn_child[0]; + else if (node->rtn_child[1] != NULL) + node = node->rtn_child[1]; + else { + /* + * No. Go back up the tree, until we can go right + * where we went left before.. or run out of tree. 
+ */ + for (;; node = parent) { + if ((parent = node->rtn_parent) == NULL) + return NULL; + + if (parent->rtn_child[0] == node && + parent->rtn_child[1] != NULL) { + node = parent->rtn_child[1]; + + break; + } + } + } + + /* Skip link nodes. */ + } while (node->rtn_type != RTNT_DATA); + + return (struct rttree_entry *)node; +} + +/* + * Set the node 'node' to be part of tree 'tree', with type 'type' (either + * RTNT_DATA or RTNT_LINK) and a bit count of 'prefix'. The node is set to be + * a child of 'parent' on side 'side', unless 'parent' is NULL in which case + * the node is set to be the topmost node in the tree (and 'side' is ignored). + * The node's children are set to 'left' and 'right'; for each, if not NULL, + * its parent is set to 'node'. + */ +static void +rttree_set(struct rttree * tree, struct rttree_node * node, int type, + unsigned int prefix, struct rttree_node * parent, int side, + struct rttree_node * left, struct rttree_node * right) +{ + + assert(type == RTNT_DATA || type == RTNT_LINK); + assert(prefix <= tree->rtt_bits); + assert(side == 0 || side == 1); + + node->rtn_type = type; + node->rtn_bits = prefix; + + /* With rtn_bits assigned, precompute any derived fields. */ + rttree_precompute(tree, node); + + if ((node->rtn_parent = parent) != NULL) + parent->rtn_child[side] = node; + else + tree->rtt_root = node; + + if ((node->rtn_child[0] = left) != NULL) + left->rtn_parent = node; + if ((node->rtn_child[1] = right) != NULL) + right->rtn_parent = node; +} + +/* + * In the routing tree 'tree', replace old node 'onode' with new node 'node', + * setting the type of the latter to 'type'. The tree is updated accordingly, + * but it is left up to the caller to deal with the old node as appropriate. + */ +static void +rttree_replace(struct rttree * tree, struct rttree_node * onode, + struct rttree_node * node, int type) +{ + struct rttree_node *parent; + unsigned int side; + + /* + * Replacing one data node with another data node is not something that + * is currently being done, even if it would work. + */ + assert(onode->rtn_type != RTNT_DATA || node->rtn_type != RTNT_DATA); + assert(onode->rtn_child[0] != NULL); + assert(onode->rtn_child[1] != NULL); + + parent = onode->rtn_parent; + + side = (parent != NULL && parent->rtn_child[1] == onode); + + rttree_set(tree, node, type, onode->rtn_bits, parent, side, + onode->rtn_child[0], onode->rtn_child[1]); +} + +/* + * Add a new routing entry 'entry' to the routing tree 'tree'. The entry + * object will be initialized as a result. The address to add is given as + * 'addr', and the address mask as 'mask'. Both those pointers must be point + * to memory that is as long-lived as the routing entry; this is typically + * accomplished by storing them in a larger object that embeds 'entry'. + * However, 'mask' may be NULL, signifying a host type entry with an implied + * full mask. If not NULL, the given mask must be normalized, i.e., it must + * consist of a run of zero or more 1-bits followed by a remainder of only + * 0-bits. The number of 1-bits must also be given as a bit count 'prefix', + * even if 'mask' is NULL. The address must be normalized to its mask: no bits + * starting from bit 'prefix' must be set in 'addr'. Return OK if adding the + * routing entry succeeded, or EEXIST if an entry already exists for the + * combination of that address and mask. If the caller has already verified + * with rttree_lookup_exact() that no such entry exists, the call will succeed. 
+ */ +int +rttree_add(struct rttree * tree, struct rttree_entry * entry, + const void * addr, const void * mask, unsigned int prefix) +{ + struct rttree_node *node, *parent, *link; + struct rttree_entry *other_entry; + unsigned int bit, side, side2; + int match; + + assert(mask != NULL || prefix == tree->rtt_bits); + + /* + * We start by determining the path, bit count, and method of the + * addition. We do this with a lookup on the address, for the full + * address width--that is, not limited to the given prefix length. As + * a result, at some point we will find either a NULL pointer, or a + * data node with a width that is at least as large as the given prefix + * length. The NULL case is easy: we EXTEND the tree with our new + * entry wherever we ran into the NULL pointer. + * + * If instead we find a sufficiently wide data node, then we see if it + * is a match for the new address. If so, our new data node should + * either be INSERTed between two nodes along the path taken so far, or + * REPLACE a link node along that path with the new data node. If it + * it is not a match, then the action to take depends on whether the + * first differing bit falls within the given prefix length: if so, we + * have to BRANCH along the path, using a link node allocated for that + * differing bit; if not, we should use INSERT or REPLACE after all. + * + * As the only exceptional case, we might in fact find an entry for the + * exact same address and prefix length as what is being added. In the + * current design of the routing tree, this is always a failure case. + */ + parent = NULL; + side = 0; + other_entry = NULL; + + for (node = tree->rtt_root; node != NULL; + node = node->rtn_child[side]) { + if (node->rtn_type == RTNT_DATA) { + other_entry = (struct rttree_entry *)node; + + bit = rttree_diff(tree, other_entry->rte_addr, addr); + + match = (bit >= node->rtn_bits); + + /* Test whether the exact entry already exists. */ + if (match && node->rtn_bits == prefix) + return EEXIST; + + /* + * Test the INSERT/REPLACE and BRANCH cases. Note that + * this condition is in a terse, optimized form that + * does not map directly to the two different cases. + */ + if (!match || node->rtn_bits > prefix) { + if (bit > prefix) + bit = prefix; + break; + } + } + + parent = node; + side = rttree_side(tree, node, addr); + } + + /* + * At this point, addition is going to succeed no matter what. Start + * by initializing part of 'entry'. In particular, add the given + * entry's link node to the list of free link nodes, because the common + * case is that we end up not using it. If we do, we will just take it + * off again right away. The entry's data node will be initialized as + * part of the addition process below. + */ + entry->rte_addr = addr; + entry->rte_mask = mask; + + rttree_add_free(tree, &entry->rte_link); + + /* + * First deal with the EXTEND case. In that case we already know the + * intended parent and the side (left/right) for the addition. + */ + if (node == NULL) { + assert(parent == NULL || parent->rtn_bits < prefix); + assert(parent == NULL || parent->rtn_child[side] == NULL); + + rttree_set(tree, &entry->rte_data, RTNT_DATA, prefix, parent, + side, NULL /*left*/, NULL /*right*/); + + return OK; + } + + /* + * For the other three cases, we now have to walk back along the path + * we have taken so far in order to find the correct insertion point. 
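The walk ends with 'parent' pointing to the deepest ancestor that has fewer than 'bit' bits (possibly NULL), and 'node' pointing to that ancestor's child on the lookup path.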
+ */ + while (parent != NULL && parent->rtn_bits >= bit) { + node = parent; + + parent = node->rtn_parent; + } + + if (bit == prefix && node->rtn_bits == bit) { + /* + * The REPLACE case. Replace the link node 'node' with our new + * entry. Afterwards, mark the link node as free. + */ + assert(node->rtn_type != RTNT_DATA); + + rttree_replace(tree, node, &entry->rte_data, RTNT_DATA); + + rttree_add_free(tree, node); + } else if (bit == prefix) { + /* + * The INSERT case. Insert the data node between 'parent' and + * 'node'. Note that 'parent' may be NULL. We need to use the + * address we found earlier, as 'other_entry', to determine + * whether we should add 'node' to the left or right of the + * inserted data node. + */ + assert(node->rtn_bits > bit); + assert(parent == NULL || parent->rtn_bits < bit); + assert(other_entry != NULL); + + side = (parent != NULL && parent->rtn_child[1] == node); + + side2 = rttree_test(tree, other_entry->rte_addr, bit); + + rttree_set(tree, &entry->rte_data, RTNT_DATA, prefix, parent, + side, (!side2) ? node : NULL, (side2) ? node : NULL); + } else { + /* + * The BRANCH case. In this case, it is impossible that we + * find a link node with a bit count equal to the first + * differing bit between the address we found and the address + * we want to insert: if such a node existed, we would have + * descended down its other child during the initial lookup. + * + * Interpose a link node between 'parent' and 'current' for bit + * 'bit', with its other child set to point to 'entry'. Again, + * we need to perform an additional bit test here, because even + * though we know that the address we found during the lookup + * differs from the given address at bit 'bit', we do not know + * the value of either bit yet. + */ + assert(bit < prefix); + assert(node->rtn_bits > bit); + assert(parent == NULL || parent->rtn_bits < bit); + + link = rttree_get_free(tree); + + side = (parent != NULL && parent->rtn_child[1] == node); + + side2 = rttree_test(tree, addr, bit); + + /* Use NULL for the data node we are about to add. */ + rttree_set(tree, link, RTNT_LINK, bit, parent, side, + (side2) ? node : NULL, (!side2) ? node : NULL); + + /* This addition will replace the NULL pointer again. */ + rttree_set(tree, &entry->rte_data, RTNT_DATA, prefix, link, + side2, NULL /*left*/, NULL /*right*/); + } + + return OK; +} + +/* + * Remove a particular node 'node' from the routing tree 'tree'. The given + * node must have zero or one children. As integrity check only, if 'nonempty' + * is set, the node must have one child. If the node has one child, that child + * will be linked to the node's parent (or the tree root), thus cutting the + * node itself out of the tree. If the node has zero children, the + * corresponding slot in its parent (or the tree root) will be cleared. The + * function will return a pointer to the parent node if it too qualifies for + * removal afterwards, or NULL if no further removal action needs to be taken. 
+ */ +static struct rttree_node * +rttree_remove(struct rttree * tree, struct rttree_node * node, + int nonempty __unused) +{ + struct rttree_node *parent, *child; + unsigned int side; + + if ((child = node->rtn_child[0]) == NULL) + child = node->rtn_child[1]; + + assert(child != NULL || !nonempty); + + if ((parent = node->rtn_parent) != NULL) { + side = (parent->rtn_child[1] == node); + + parent->rtn_child[side] = child; + + if (child != NULL) + child->rtn_parent = parent; + else if (parent->rtn_type == RTNT_LINK) + return parent; + } else { + tree->rtt_root = child; + + if (child != NULL) + child->rtn_parent = NULL; + } + + return NULL; +} + +/* + * Delete the routing entry 'entry' from the routing tree 'tree'. The entry + * must have been added before. This function always succeeds. + */ +void +rttree_delete(struct rttree * tree, struct rttree_entry * entry) +{ + struct rttree_node *node, *link; + + /* + * Remove the data node from the tree. If the data node also has two + * children, we have to replace it with a link node. Otherwise, we + * have to remove it and, if it has no children at all, possibly remove + * its parent as well. + */ + node = &entry->rte_data; + + assert(node->rtn_type == RTNT_DATA); + + if (node->rtn_child[0] != NULL && node->rtn_child[1] != NULL) { + /* + * The link node we allocate here may actually be the entry's + * own link node. We do not make an exception for that case + * here, as we have to deal with the entry's link node being in + * use a bit further down anyway. + */ + link = rttree_get_free(tree); + + rttree_replace(tree, node, link, RTNT_LINK); + } else { + /* + * Remove the data node from the tree. If the node has no + * children, its removal may leave a link node with one child. + * That would be its original parent. That node must then also + * be removed from the tree, and freed up. + */ + link = rttree_remove(tree, node, FALSE /*nonempty*/); + + if (link != NULL) { + (void)rttree_remove(tree, link, TRUE /*nonempty*/); + + rttree_add_free(tree, link); + } + } + + /* + * Remove the entry's link node from either the tree or the free list, + * depending on the type currently assigned to it. If it has to be + * removed from the tree, it must be replaced with another link node. + * There will always be enough link nodes available for this to work. + */ + node = &entry->rte_link; + + if (node->rtn_type == RTNT_LINK) { + link = rttree_get_free(tree); + + rttree_replace(tree, node, link, RTNT_LINK); + } else { + assert(node->rtn_type == RTNT_FREE); + + rttree_del_free(tree, node); + } +} diff --git a/minix/net/lwip/rttree.h b/minix/net/lwip/rttree.h new file mode 100644 index 000000000..b2a24d8c7 --- /dev/null +++ b/minix/net/lwip/rttree.h @@ -0,0 +1,50 @@ +#ifndef MINIX_NET_LWIP_RTTREE_H +#define MINIX_NET_LWIP_RTTREE_H + +/* Routing table node structure. */ +struct rttree_node { + struct rttree_node *rtn_child[2]; /* left child node */ + struct rttree_node *rtn_parent; /* parent node */ + uint8_t rtn_type; /* node type (RNT_) */ + uint8_t rtn_bits; /* prefix bit count */ + uint8_t rtn_byte; /* bits-derived byte index */ + uint8_t rtn_shift; /* bits-derived shift count */ +}; + +#define RTNT_DATA 0 /* data node (entry) */ +#define RTNT_LINK 1 /* link node, in use */ +#define RTNT_FREE 2 /* link node, free */ + +/* Routing table entry structure. 
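Each entry embeds both a data node and a spare link node; the tree consumes and releases the spare link node as entries are added and deleted.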
*/ +struct rttree_entry { + struct rttree_node rte_data; /* data node - MUST be first */ + struct rttree_node rte_link; /* link node */ + const void *rte_addr; /* pointer to address */ + const void *rte_mask; /* pointer to mask */ +}; + +/* Routing table structure. */ +struct rttree { + struct rttree_node *rtt_root; /* root of the route tree */ + struct rttree_node *rtt_free; /* free internal nodes list */ + uint8_t rtt_bits; /* number of bits in address */ +}; + +#define rttree_get_addr(entry) ((entry)->rte_addr) +#define rttree_get_mask(entry) ((entry)->rte_mask) +#define rttree_get_prefix(entry) ((entry)->rte_data.rtn_bits) + +void rttree_init(struct rttree * tree, unsigned int bits); +struct rttree_entry *rttree_lookup_match(struct rttree * tree, + const void * addr); +struct rttree_entry *rttree_lookup_host(struct rttree * tree, + const void * addr); +struct rttree_entry *rttree_lookup_exact(struct rttree * tree, + const void * addr, unsigned int prefix); +struct rttree_entry *rttree_enum(struct rttree * tree, + struct rttree_entry * entry); +int rttree_add(struct rttree * tree, struct rttree_entry * entry, + const void * addr, const void * mask, unsigned int prefix); +void rttree_delete(struct rttree * tree, struct rttree_entry * entry); + +#endif /* !MINIX_NET_LWIP_RTTREE_H */ diff --git a/minix/net/lwip/tcpisn.c b/minix/net/lwip/tcpisn.c new file mode 100644 index 000000000..382cf32c2 --- /dev/null +++ b/minix/net/lwip/tcpisn.c @@ -0,0 +1,203 @@ +/* LWIP service - tcpisn.c - TCP Initial Sequence Number generation */ +/* + * This module implements the TCP ISN algorithm standardized in RFC 6528. It + * currently uses the current time, at clock tick granularity, as source for + * the 4-microsecond timer, and SHA256 as the hashing algorithm. As part of + * the input to the hash function, we use an "ISN secret" that can be set + * through the (hidden, root-only) net.inet.tcp.isn_secret sysctl(7) node. + * Ideally, the secret should remain the same across system reboots; it is left + * up to userland to take care of that. + * + * TODO: while this module provides the strongest possible implementation of + * the algorithm, it is also quite heavyweight. We should consider allowing + * for a more configurable level of strength, perhaps with the possibility for + * less powerful platforms to revert to simple use of a random number. + */ + +#include "lwip.h" +#include "tcpisn.h" + +#include + +/* + * The TCP ISN hash input consists of the TCP 4-tuple of the new connection and + * a static secret. The 4-tuple consists of two IP addresses, at most 16 bytes + * (128 bits, for IPv6) each, and two port numbers, two bytes (16 bits) each. + * We use the SHA256 input block size of 64 bytes to avoid copying, so that + * leaves us with 28 bytes of room for the static secret. We use 16 bytes, and + * leave the rest blank. As a sidenote, while hardcoding sizes is not nice, we + * really need to get the layout exactly right in this case. + */ +#define TCPISN_TUPLE_LENGTH (16 * 2 + 2 * 2) + +#if TCPISN_SECRET_LENGTH > (SHA256_BLOCK_LENGTH - TCPISN_TUPLE_LENGTH) +#error "TCP ISN secret length exceeds remainder of hash block" +#endif + +/* We are using memchr() on this, so do not remove the '32' size here! */ +static const uint8_t tcpisn_hextab[32] = "0123456789abcdef0123456789ABCDEF"; + +static uint8_t tcpisn_input[SHA256_BLOCK_LENGTH] __aligned(4); + +static int tcpisn_set; + +/* + * Initialize the TCP ISN module. 
+ */
+void
+tcpisn_init(void)
+{
+	time_t boottime;
+
+	/*
+	 * Part of the input to the hash function is kept as is between calls
+	 * to the TCP ISN hook.  In particular, we zero the entire input here,
+	 * so that the padding is zero.  We also zero the area where the secret
+	 * will be stored, but we put in the system boot time as a last effort
+	 * to try to create at least some minimal amount of unpredictability.
+	 * The boot time is by no means sufficient though, so issue a warning
+	 * if a TCP ISN is requested before an actual secret is set.  Note that
+	 * an actual secret will overwrite the boot time based pseudo-secret.
+	 */
+	memset(tcpisn_input, 0, sizeof(tcpisn_input));
+
+	(void)getuptime(NULL, NULL, &boottime);
+	memcpy(&tcpisn_input[TCPISN_TUPLE_LENGTH], &boottime,
+	    sizeof(boottime));
+
+	tcpisn_set = FALSE;
+}
+
+/*
+ * Set and/or retrieve the ISN secret.  In order to allow the secret to be
+ * set from the command line, this sysctl(7) node is a hex-encoded string.
+ */
+ssize_t
+tcpisn_secret(struct rmib_call * call __unused,
+	struct rmib_node * node __unused, struct rmib_oldp * oldp,
+	struct rmib_newp * newp)
+{
+	uint8_t secret[TCPISN_SECRET_HEX_LENGTH], byte, *p;
+	unsigned int i;
+	int r;
+
+	/* First copy out the old (current) ISN secret. */
+	if (oldp != NULL) {
+		for (i = 0; i < TCPISN_SECRET_LENGTH; i++) {
+			byte = tcpisn_input[TCPISN_TUPLE_LENGTH + i];
+			secret[i * 2] = tcpisn_hextab[byte >> 4];
+			secret[i * 2 + 1] = tcpisn_hextab[byte & 0xf];
+		}
+		secret[i * 2] = '\0';
+		assert(i * 2 + 1 == sizeof(secret));
+
+		if ((r = rmib_copyout(oldp, 0, secret, sizeof(secret))) < 0)
+			return r;
+	}
+
+	/*
+	 * Then copy in the new ISN secret.  We require the given string to be
+	 * exactly as large as we need.
+	 */
+	if (newp != NULL) {
+		/* Copy in the user-given string. */
+		if ((r = rmib_copyin(newp, secret, sizeof(secret))) != OK)
+			return r;
+		if (secret[TCPISN_SECRET_HEX_LENGTH - 1] != '\0')
+			return EINVAL;
+
+		/* Hex-decode the given string (in place). */
+		for (i = 0; i < TCPISN_SECRET_LENGTH; i++) {
+			if ((p = memchr(tcpisn_hextab, secret[i * 2],
+			    sizeof(tcpisn_hextab))) == NULL)
+				return EINVAL;
+			secret[i] = ((uint8_t)(p - tcpisn_hextab) & 0xf) << 4;
+			if ((p = memchr(tcpisn_hextab, secret[i * 2 + 1],
+			    sizeof(tcpisn_hextab))) == NULL)
+				return EINVAL;
+			secret[i] |= (uint8_t)(p - tcpisn_hextab) & 0xf;
+		}
+
+		/* Once fully validated, switch to the new secret. */
+		memcpy(&tcpisn_input[TCPISN_TUPLE_LENGTH], secret,
+		    TCPISN_SECRET_LENGTH);
+
+		tcpisn_set = TRUE;
+	}
+
+	/* Return the length of the node. */
+	return sizeof(secret);
+}
+
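For illustration, a minimal userland sketch of setting the secret through this node follows. It is not part of the patch: it assumes the NetBSD sysctlbyname(3) interface, and the hex string is a placeholder rather than a real secret.

    #include <sys/param.h>
    #include <sys/sysctl.h>
    #include <err.h>

    int
    main(void)
    {
            /* Placeholder value; a real secret should be 16 random bytes. */
            const char secret[] = "00112233445566778899aabbccddeeff";

            /*
             * The node is a hex-encoded string and expects the full string
             * including its null terminator.
             */
            if (sysctlbyname("net.inet.tcp.isn_secret", NULL, NULL,
                secret, sizeof(secret)) != 0)
                    err(1, "sysctlbyname");

            return 0;
    }

The command-line equivalent would presumably be "sysctl -w net.inet.tcp.isn_secret=<32 hex digits>" from a root shell; the node is root-only and hidden from listings, but it can still be addressed by name.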
+/*
+ * Hook to generate an Initial Sequence Number (ISN) for a new TCP connection.
+ */
+uint32_t
+lwip_hook_tcp_isn(const ip_addr_t * local_ip, uint16_t local_port,
+	const ip_addr_t * remote_ip, uint16_t remote_port)
+{
+	uint8_t output[SHA256_DIGEST_LENGTH] __aligned(4);
+	SHA256_CTX ctx;
+	clock_t realtime;
+	time_t boottime;
+	uint32_t isn;
+
+	if (!tcpisn_set) {
+		printf("LWIP: warning, no TCP ISN secret has been set\n");
+
+		tcpisn_set = TRUE;	/* print the warning only once */
+	}
+
+	if (IP_IS_V6(local_ip)) {
+		assert(IP_IS_V6(remote_ip));
+
+		memcpy(&tcpisn_input[0], &ip_2_ip6(local_ip)->addr, 16);
+		memcpy(&tcpisn_input[16], &ip_2_ip6(remote_ip)->addr, 16);
+	} else {
+		assert(IP_IS_V4(local_ip));
+		assert(IP_IS_V4(remote_ip));
+
+		/*
+		 * Store IPv4 addresses as IPv4-mapped IPv6 addresses, even
+		 * though lwIP will never give us an IPv4-mapped IPv6 address,
+		 * so as to ensure completely disjoint address spaces and thus
+		 * no potential abuse of IPv6 addresses in order to predict
+		 * ISNs for IPv4 connections.
+		 */
+		memset(&tcpisn_input[0], 0, 10);
+		tcpisn_input[10] = 0xff;
+		tcpisn_input[11] = 0xff;
+		memcpy(&tcpisn_input[12], &ip_2_ip4(local_ip)->addr, 4);
+		memset(&tcpisn_input[16], 0, 10);
+		tcpisn_input[26] = 0xff;
+		tcpisn_input[27] = 0xff;
+		memcpy(&tcpisn_input[28], &ip_2_ip4(remote_ip)->addr, 4);
+	}
+
+	tcpisn_input[32] = local_port >> 8;
+	tcpisn_input[33] = local_port & 0xff;
+	tcpisn_input[34] = remote_port >> 8;
+	tcpisn_input[35] = remote_port & 0xff;
+
+	/* The rest of the input (secret and padding) is already filled in. */
+
+	SHA256_Init(&ctx);	/* this call zeroes a buffer we don't use.. */
+	SHA256_Update(&ctx, tcpisn_input, sizeof(tcpisn_input));
+	SHA256_Final(output, &ctx);
+
+	/* Arbitrarily take the first 32 bits from the generated hash. */
+	memcpy(&isn, output, sizeof(isn));
+
+	/*
+	 * Add the current time in 4-microsecond units.  The time value should
+	 * be wall-clock accurate and stable even across system reboots and
+	 * downtime.  Do not precompute the boot time part: it may change.
+	 */
+	(void)getuptime(NULL, &realtime, &boottime);
+
+	isn += (uint32_t)boottime * 250000;
+	isn += (uint32_t)(((uint64_t)realtime * 250000) / sys_hz());
+
+	/* The result is the ISN to use for this connection. */
+	return isn;
+}
diff --git a/minix/net/lwip/tcpisn.h b/minix/net/lwip/tcpisn.h
new file mode 100644
index 000000000..b7a2b60ed
--- /dev/null
+++ b/minix/net/lwip/tcpisn.h
@@ -0,0 +1,20 @@
+#ifndef MINIX_NET_LWIP_TCPISN_H
+#define MINIX_NET_LWIP_TCPISN_H
+
+/*
+ * Length, in bytes, of the secret (random seed) that is used as part of the
+ * input to the hashing function that generates TCP Initial Sequence Numbers.
+ */
+#define TCPISN_SECRET_LENGTH	16
+
+/*
+ * Size of the hexadecimal-string representation of the secret, including
+ * trailing null terminator.
+ */
+#define TCPISN_SECRET_HEX_LENGTH	(TCPISN_SECRET_LENGTH * 2 + 1)
+
+void tcpisn_init(void);
+ssize_t tcpisn_secret(struct rmib_call * call, struct rmib_node * node,
+	struct rmib_oldp * oldp, struct rmib_newp * newp);
+
+#endif /* !MINIX_NET_LWIP_TCPISN_H */
diff --git a/minix/net/lwip/tcpsock.c b/minix/net/lwip/tcpsock.c
new file mode 100644
index 000000000..8266a05c3
--- /dev/null
+++ b/minix/net/lwip/tcpsock.c
@@ -0,0 +1,2793 @@
+/* LWIP service - tcpsock.c - TCP sockets */
+/*
+ * This module implements support for TCP sockets based on lwIP's core TCP PCB
+ * module, which is largely but not fully cooperative with exactly what we want
+ * to achieve, with the result that this module is rather complicated.
+ *
+ * Each socket has a send queue and a receive queue.
Both are using lwIP's own + * (pbuf) buffers, which largely come out of the main 512-byte buffer pool. + * The buffers on the send queue are allocated and freed by us--the latter only + * once they are no longer in use by lwIP as well. A bit counterintuitively, + * we deliberately use a smaller lwIP per-PCB TCP send buffer limit + * (TCP_SND_BUF) in the lwIP send configuration (lwipopts.h) in order to more + * easily trigger conditions where we cannot enqueue data (or the final FIN) + * right away. This way, we get to test the internal logic of this module a + * lot more easily. The small lwIP send queue size should not have any impact + * on performance, as our own per-socket send queues can be much larger and we + * enqueue more of that on the lwIP PCB as soon as we can in all cases. + * + * The receive queue consists of whatever buffers were given to us by lwIP, but + * since those may be many buffers with small amounts of data each, we perform + * fairly aggressive merging of consecutive buffers. The intended result is + * that we waste no more than 50% of memory within the receive queue. Merging + * requires memory copies, which makes it expensive, but we do not configure + * lwIP with enough buffers to make running out of buffers a non-issue, so this + * trade-off is necessary. Practical experience and measurements of the merge + * policy will have to show whether and how the current policy may be improved. + * + * As can be expected, the connection close semantics are by far the most + * complicated part of this module. We attempt to get rid of the lwIP PCB as + * soon as we can, letting lwIP take care of the TIME_WAIT state for example. + * However, there are various conditions that have to be met before we can + * forget about the PCB here--most importantly, that none of our sent data + * blocks are still referenced by lwIP because they have not yet been sent or + * acknowledged. We can only free the data blocks once lwIP is done with them. + * + * We do consider the TCP state of lwIP's PCB, in order to avoid duplicating + * full state tracking here. However, we do not look at a socket's TCP state + * while in a lwIP-generated event for that socket, because the state may not + * necessarily reflect the (correct or new) TCP state of the connection, nor + * may the PCB be available--this is the case for error events. For these + * reasons we use a few internal TCPF_ flags to perform partial state tracking. + * + * More generally, we tend to access lwIP PCB fields directly only when lwIP's + * own BSD API implementation does that too and there is no better alternative. + * One example of this is the check to see if our FIN was acknowledged, for + * SO_LINGER support. In terms of maintenance, our hope is that if lwIP's API + * changes later, we can change our code to imitate whatever lwIP's BSD API + * implementation does at that point. + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * Unfortunately, NetBSD and lwIP have different definitions of a few relevant + * preprocessor variables. Make sure we do not attempt to use the NetBSD one + * where it matters. We do need one of the NetBSD definitions though. + */ +static const unsigned int NETBSD_TF_NODELAY = TF_NODELAY; +#undef TF_NODELAY +#undef TCP_MSS + +#include "lwip.h" +#include "tcpisn.h" + +#include "lwip/tcp.h" +#include "lwip/priv/tcp_priv.h" /* for tcp_pcb_lists */ + +/* + * The number of TCP sockets (NR_TCPSOCK) is defined in the lwIP configuration. 
+ */
+
+/*
+ * We fully control the send buffer, so we can let its size be set to whatever
+ * we want.  The receive buffer is different: if it is smaller than the window
+ * size, we may have to refuse data that lwIP hands us, at which point more
+ * incoming data will cause lwIP to abort the TCP connection--even aside from
+ * performance issues.  Therefore, we must make sure the receive buffer is
+ * larger than the TCP window at all times.
+ */
+#define TCP_SNDBUF_MIN	1		/* minimum TCP send buffer size */
+#define TCP_SNDBUF_DEF	32768		/* default TCP send buffer size */
+#define TCP_SNDBUF_MAX	131072		/* maximum TCP send buffer size */
+#define TCP_RCVBUF_MIN	TCP_WND		/* minimum TCP receive buffer size */
+#define TCP_RCVBUF_DEF	MAX(TCP_WND, 32768)   /* default TCP recv buffer size */
+#define TCP_RCVBUF_MAX	MAX(TCP_WND, 131072)  /* maximum TCP recv buffer size */
+
+/*
+ * The total number of buffers that may be in use for TCP socket send queues.
+ * The goal is to allow at least some progress to be made on receiving from TCP
+ * sockets and on differently-typed sockets, at least as long as the LWIP
+ * service can manage to allocate the memory it wants.  For the case that it
+ * does not, we can only reactively kill off TCP sockets and/or free enqueued
+ * ethernet packets, neither of which is currently implemented (TODO).
+ */
+#define TCP_MAX_SENDBUFS	(mempool_max_buffers() * 3 / 4)
+
+/* Polling intervals, in 500-millisecond units. */
+#define TCP_POLL_REG_INTERVAL	10	/* interval for reattempting sends */
+#define TCP_POLL_CLOSE_INTERVAL	1	/* interval while closing connection */
+
+static struct tcpsock {
+	struct ipsock tcp_ipsock;		/* IP socket, MUST be first */
+	struct tcp_pcb *tcp_pcb;		/* lwIP TCP control block */
+	union pxfer_tcp_queue {			/* free/accept queue */
+		TAILQ_ENTRY(tcpsock) tq_next;	/* next in queue */
+		TAILQ_HEAD(, tcpsock) tq_head;	/* head of queue */
+	} tcp_queue;
+	struct tcpsock *tcp_listener;		/* listener if on accept q. */
+	struct {				/* send queue */
+		struct pbuf *ts_head;		/* first pbuf w/unacked data */
+		struct pbuf *ts_unsent;		/* first pbuf w/unsent data */
+		struct pbuf *ts_tail;		/* most recently added data */
+		size_t ts_len;			/* total sent + unsent */
+		unsigned short ts_head_off;	/* offset into head pbuf */
+		unsigned short ts_unsent_off;	/* offset into unsent pbuf */
+	} tcp_snd;
+	struct {				/* receive queue */
+		struct pbuf *tr_head;		/* first pbuf w/unrecvd data */
+		struct pbuf **tr_pre_tailp;	/* ptr-ptr to newest pbuf */
+		size_t tr_len;			/* bytes on receive queue */
+		unsigned short tr_head_off;	/* offset into head pbuf */
+		unsigned short tr_unacked;	/* current window reduction */
+	} tcp_rcv;
+} tcp_array[NR_TCPSOCK];
+
+static TAILQ_HEAD(, tcpsock) tcp_freelist;	/* list of free TCP sockets */
+
+static const struct sockevent_ops tcpsock_ops;
+
+static unsigned int tcpsock_sendbufs;		/* # send buffers in use */
+static unsigned int tcpsock_recvbufs;		/* # receive buffers in use */
+
+/* A bunch of macros that are just for convenience.
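Most of them simply map a TCP socket object onto its embedded IP socket or libsockevent socket object, or query and update state through those.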
*/ +#define tcpsock_get_id(tcp) (SOCKID_TCP | (sockid_t)((tcp) - tcp_array)) +#define tcpsock_get_ipsock(tcp) (&(tcp)->tcp_ipsock) +#define tcpsock_get_sock(tcp) (ipsock_get_sock(tcpsock_get_ipsock(tcp))) +#define tcpsock_get_sndbuf(tcp) (ipsock_get_sndbuf(tcpsock_get_ipsock(tcp))) +#define tcpsock_get_rcvbuf(tcp) (ipsock_get_rcvbuf(tcpsock_get_ipsock(tcp))) +#define tcpsock_is_ipv6(tcp) (ipsock_is_ipv6(tcpsock_get_ipsock(tcp))) +#define tcpsock_is_shutdown(tcp,fl) \ + (sockevent_is_shutdown(tcpsock_get_sock(tcp), fl)) +#define tcpsock_is_listening(tcp) \ + (sockevent_is_listening(tcpsock_get_sock(tcp))) +#define tcpsock_get_flags(tcp) (ipsock_get_flags(tcpsock_get_ipsock(tcp))) +#define tcpsock_set_flag(tcp,fl) \ + (ipsock_set_flag(tcpsock_get_ipsock(tcp), fl)) +#define tcpsock_clear_flag(tcp,fl) \ + (ipsock_clear_flag(tcpsock_get_ipsock(tcp), fl)) + +static ssize_t tcpsock_pcblist(struct rmib_call *, struct rmib_node *, + struct rmib_oldp *, struct rmib_newp *); + +/* The CTL_NET {PF_INET,PF_INET6} IPPROTO_TCP subtree. */ +/* TODO: add many more and make some of them writable.. */ +static struct rmib_node net_inet_tcp_table[] = { +/* 2*/ [TCPCTL_SENDSPACE] = RMIB_INT(RMIB_RO, TCP_SNDBUF_DEF, + "sendspace", + "Default TCP send buffer size"), +/* 3*/ [TCPCTL_RECVSPACE] = RMIB_INT(RMIB_RO, TCP_RCVBUF_DEF, + "recvspace", + "Default TCP receive buffer size"), +/*29*/ [TCPCTL_LOOPBACKCKSUM] = RMIB_FUNC(RMIB_RW | CTLTYPE_INT, sizeof(int), + loopif_cksum, "do_loopback_cksum", + "Perform TCP checksum on loopback"), +/*+0*/ [TCPCTL_MAXID] = RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0, + tcpsock_pcblist, "pcblist", + "TCP protocol control block list"), +/*+1*/ [TCPCTL_MAXID + 1] = RMIB_FUNC(RMIB_RW | CTLFLAG_PRIVATE | + CTLFLAG_HIDDEN | CTLTYPE_STRING, + TCPISN_SECRET_HEX_LENGTH, tcpisn_secret, + "isn_secret", + "TCP ISN secret (MINIX 3 specific)") +}; + +static struct rmib_node net_inet_tcp_node = + RMIB_NODE(RMIB_RO, net_inet_tcp_table, "tcp", "TCP related settings"); +static struct rmib_node net_inet6_tcp6_node = + RMIB_NODE(RMIB_RO, net_inet_tcp_table, "tcp6", "TCP related settings"); + +/* + * Initialize the TCP sockets module. + */ +void +tcpsock_init(void) +{ + unsigned int slot; + + /* Initialize the list of free TCP sockets. */ + TAILQ_INIT(&tcp_freelist); + + for (slot = 0; slot < __arraycount(tcp_array); slot++) + TAILQ_INSERT_TAIL(&tcp_freelist, &tcp_array[slot], + tcp_queue.tq_next); + + /* Initialize other variables. */ + tcpsock_sendbufs = 0; + + /* Register the net.inet.tcp and net.inet6.tcp6 RMIB subtrees. */ + mibtree_register_inet(PF_INET, IPPROTO_TCP, &net_inet_tcp_node); + mibtree_register_inet(PF_INET6, IPPROTO_TCP, &net_inet6_tcp6_node); +} + +/* + * Initialize the state of a TCP socket's send queue. + */ +static void +tcpsock_reset_send(struct tcpsock * tcp) +{ + + tcp->tcp_snd.ts_tail = NULL; + tcp->tcp_snd.ts_unsent = NULL; + tcp->tcp_snd.ts_head = NULL; + tcp->tcp_snd.ts_len = 0; + tcp->tcp_snd.ts_unsent_off = 0; + tcp->tcp_snd.ts_head_off = 0; +} + +/* + * Initialize the state of a TCP socket's receive queue. + */ +static void +tcpsock_reset_recv(struct tcpsock * tcp) +{ + + tcp->tcp_rcv.tr_pre_tailp = NULL; + tcp->tcp_rcv.tr_head = NULL; + tcp->tcp_rcv.tr_len = 0; + tcp->tcp_rcv.tr_head_off = 0; + tcp->tcp_rcv.tr_unacked = 0; +} + +/* + * Create a TCP socket. 
+ */ +sockid_t +tcpsock_socket(int domain, int protocol, struct sock ** sockp, + const struct sockevent_ops ** ops) +{ + struct tcpsock *tcp; + uint8_t ip_type; + + switch (protocol) { + case 0: + case IPPROTO_TCP: + break; + + default: + return EPROTONOSUPPORT; + } + + if (TAILQ_EMPTY(&tcp_freelist)) + return ENOBUFS; + + tcp = TAILQ_FIRST(&tcp_freelist); + + /* + * Initialize the structure. Do not memset it to zero, as it is still + * part of the linked free list. Initialization may still fail. When + * adding new fields, make sure to change tcpsock_clone() accordingly. + */ + + ip_type = ipsock_socket(tcpsock_get_ipsock(tcp), domain, + TCP_SNDBUF_DEF, TCP_RCVBUF_DEF, sockp); + + if ((tcp->tcp_pcb = tcp_new_ip_type(ip_type)) == NULL) + return ENOBUFS; + tcp_arg(tcp->tcp_pcb, tcp); + + tcp->tcp_listener = NULL; + + tcpsock_reset_send(tcp); + tcpsock_reset_recv(tcp); + + TAILQ_REMOVE(&tcp_freelist, tcp, tcp_queue.tq_next); + + *ops = &tcpsock_ops; + return tcpsock_get_id(tcp); +} + +/* + * Create a TCP socket for the TCP PCB 'pcb' which identifies a new connection + * incoming on listening socket 'listener'. The new socket is essentially a + * "clone" of the listening TCP socket, in that it should inherit any settings + * from the listening socket. The socket has not yet been accepted by userland + * so add it to the queue of connetions pending for the listening socket. On + * success, return OK. On failure, return a negative error code. + */ +static int +tcpsock_clone(struct tcpsock * listener, struct tcp_pcb * pcb) +{ + struct tcpsock *tcp; + + if (TAILQ_EMPTY(&tcp_freelist)) + return ENOBUFS; + + tcp = TAILQ_FIRST(&tcp_freelist); + + /* + * Initialize the structure. Do not memset it to zero, as it is still + * part of the linked free list. Initialization may still fail. Most + * settings should be inherited from the listening socket here, rather + * than being initialized to their default state. + */ + + ipsock_clone(tcpsock_get_ipsock(listener), tcpsock_get_ipsock(tcp), + tcpsock_get_id(tcp)); + + tcp->tcp_pcb = pcb; + tcp_arg(pcb, tcp); + + tcpsock_reset_send(tcp); + tcpsock_reset_recv(tcp); + + /* + * Remove the new socket from the free list, and add it to the queue of + * the listening socket--in this order, because the same next pointer + * is used for both. + */ + TAILQ_REMOVE(&tcp_freelist, tcp, tcp_queue.tq_next); + + TAILQ_INSERT_TAIL(&listener->tcp_queue.tq_head, tcp, + tcp_queue.tq_next); + tcp->tcp_listener = listener; + + return OK; +} + +/* + * Allocate a buffer from the pool, using the standard pool size. The returned + * buffer is a single element--never a chain. + */ +static struct pbuf * +tcpsock_alloc_buf(void) +{ + struct pbuf *pbuf; + + pbuf = pbuf_alloc(PBUF_RAW, MEMPOOL_BUFSIZE, PBUF_RAM); + + assert(pbuf == NULL || pbuf->len == pbuf->tot_len); + + return pbuf; +} + +/* + * Free the given buffer. Ensure that pbuf_free() will not attempt to free the + * next buffer(s) in the chain as well. This may be called for pbufs other + * than those allocated with tcpsock_alloc_buf(). + */ +static void +tcpsock_free_buf(struct pbuf * pbuf) +{ + + /* + * Resetting the length is currently not necessary, but better safe + * than sorry.. + */ + pbuf->len = pbuf->tot_len; + pbuf->next = NULL; + + pbuf_free(pbuf); +} + +/* + * Clear the send queue of a TCP socket. The caller must ensure that lwIP will + * no longer access any of data on the send queue. 
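In practice, that means the socket's PCB must already have been aborted or otherwise released, as the assert below checks.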
+ */ +static void +tcpsock_clear_send(struct tcpsock * tcp) +{ + struct pbuf *phead; + + assert(tcp->tcp_pcb == NULL); + + while ((phead = tcp->tcp_snd.ts_head) != NULL) { + tcp->tcp_snd.ts_head = phead->next; + + assert(tcpsock_sendbufs > 0); + tcpsock_sendbufs--; + + tcpsock_free_buf(phead); + } + + tcpsock_reset_send(tcp); +} + +/* + * Clear the receive queue of a TCP socket. If 'ack_data' is set, also + * acknowledge the previous contents of the receive queue to lwIP. + */ +static size_t +tcpsock_clear_recv(struct tcpsock * tcp, int ack_data) +{ + struct pbuf *phead; + size_t rlen; + + rlen = tcp->tcp_rcv.tr_len; + + while ((phead = tcp->tcp_rcv.tr_head) != NULL) { + tcp->tcp_rcv.tr_head = phead->next; + + assert(tcpsock_recvbufs > 0); + tcpsock_recvbufs--; + + tcpsock_free_buf(phead); + } + + /* + * From now on, we will basically be discarding incoming data as fast + * as possible, to keep the full window open at all times. + */ + if (ack_data && tcp->tcp_pcb != NULL && tcp->tcp_rcv.tr_unacked > 0) + tcp_recved(tcp->tcp_pcb, tcp->tcp_rcv.tr_unacked); + + tcpsock_reset_recv(tcp); + + return rlen; +} + +/* + * The TCP socket's PCB has been detached from the socket, typically because + * the connection was aborted, either by us or by lwIP. Either way, any TCP + * connection is gone. Clear the socket's send queue, remove the socket from + * a listening socket's queue, and if the socket itself is ready and allowed to + * be freed, free it now. The socket is ready to be freed if it was either on + * a listening queue or being closed already. The socket is allowed to be + * freed only if 'may_free' is TRUE. If the socket is not freed, its receive + * queue is left as is, as it may still have data to be received by userland. + */ +static int +tcpsock_cleanup(struct tcpsock * tcp, int may_free) +{ + int destroy; + + assert(tcp->tcp_pcb == NULL); + + /* + * Free any data on the send queue. This is safe to do right now, + * because the PCB has been aborted (or was already gone). We must be + * very careful about clearing the send queue in all other situations. + */ + tcpsock_clear_send(tcp); + + /* + * If this was a socket pending acceptance, remove it from the + * corresponding listener socket's queue, and free it. Otherwise, free + * the socket only if it suspended a graceful close operation. + */ + if (tcp->tcp_listener != NULL) { + TAILQ_REMOVE(&tcp->tcp_listener->tcp_queue.tq_head, tcp, + tcp_queue.tq_next); + tcp->tcp_listener = NULL; + + /* + * The listener socket's backlog count should be adjusted by + * lwIP whenever the PCB is freed up, so we need (and must) not + * attempt to do that here. + */ + + destroy = TRUE; + } else + destroy = sockevent_is_closing(tcpsock_get_sock(tcp)); + + /* + * Do not free the socket if 'may_free' is FALSE. That flag may be set + * if we are currently in the second tcpsock_close() call on the + * socket, in which case sockevent_is_closing() is TRUE but we must + * still not free the socket now: doing so would derail libsockevent. + */ + if (destroy && may_free) { + (void)tcpsock_clear_recv(tcp, FALSE /*ack_data*/); + + sockevent_raise(tcpsock_get_sock(tcp), SEV_CLOSE); + } + + return destroy; +} + +/* + * Abort the lwIP PCB for the given socket, using tcp_abort(). If the PCB is + * connected, this will cause the connection to be reset. The PCB, which must + * have still been present before the call, will be gone after the call. 
+ */ +static void +tcpsock_pcb_abort(struct tcpsock * tcp) +{ + + assert(tcp->tcp_pcb != NULL); + assert(!tcpsock_is_listening(tcp)); + + tcp_recv(tcp->tcp_pcb, NULL); + tcp_sent(tcp->tcp_pcb, NULL); + tcp_err(tcp->tcp_pcb, NULL); + tcp_poll(tcp->tcp_pcb, NULL, TCP_POLL_REG_INTERVAL); + + tcp_arg(tcp->tcp_pcb, NULL); + + tcp_abort(tcp->tcp_pcb); + + tcp->tcp_pcb = NULL; +} + +/* + * Close the lwIP PCB for the given socket, using tcp_close(). If the PCB is + * connected, its graceful close will be finished by lwIP in the background. + * The PCB, which must have still been present before the call, will be gone + * after the call. + */ +static void +tcpsock_pcb_close(struct tcpsock * tcp) +{ + err_t err; + + assert(tcp->tcp_pcb != NULL); + assert(tcp->tcp_snd.ts_len == 0); + + if (!tcpsock_is_listening(tcp)) { + tcp_recv(tcp->tcp_pcb, NULL); + tcp_sent(tcp->tcp_pcb, NULL); + tcp_err(tcp->tcp_pcb, NULL); + tcp_poll(tcp->tcp_pcb, NULL, TCP_POLL_REG_INTERVAL); + } + + tcp_arg(tcp->tcp_pcb, NULL); + + if ((err = tcp_close(tcp->tcp_pcb)) != ERR_OK) + panic("unexpected TCP close failure: %d", err); + + tcp->tcp_pcb = NULL; +} + +/* + * Return TRUE if all conditions are met for closing the TCP socket's PCB, or + * FALSE if they are not. Upon calling this function, the socket's PCB must + * still be around. + */ +static int +tcpsock_may_close(struct tcpsock * tcp) +{ + + assert(tcp->tcp_pcb != NULL); + + /* + * Regular closing of the PCB requires three conditions to be met: + * + * 1. all our data has been transmitted AND acknowledged, so that we do + * not risk corruption in case there are still unsent or unack'ed + * data buffers that may otherwise be recycled too soon; + * 2. we have sent our FIN to the peer; and, + * 3. we have received a FIN from the peer. + */ + return ((tcpsock_get_flags(tcp) & (TCPF_SENT_FIN | TCPF_RCVD_FIN)) == + (TCPF_SENT_FIN | TCPF_RCVD_FIN) && tcp->tcp_snd.ts_len == 0); +} + +/* + * The given socket is ready to be closed as per the tcpsock_may_close() rules. + * This implies that its send queue is already empty. Gracefully close the + * PCB. In addition, if the socket is being closed gracefully, meaning we + * suspended an earlier tcpsock_close() call (and as such already emptied the + * receive queue as well), then tell libsockevent that the close is finished, + * freeing the socket. Return TRUE if the socket has indeed been freed this + * way, or FALSE if the socket is still around. + */ +static int +tcpsock_finish_close(struct tcpsock * tcp) +{ + + assert(tcp->tcp_snd.ts_len == 0); + assert(tcp->tcp_listener == NULL); + + /* + * If we get here, we have already shut down the sending side of the + * PCB. Technically, we are interested only in shutting down the + * receiving side of the PCB here, so that lwIP may decide to recycle + * the socket later etcetera. We call tcp_close() because we do not + * want to rely on tcp_shutdown(RX) doing the exact same thing. + * However, we do rely on the fact that the PCB is not immediately + * destroyed by the tcp_close() call: otherwise we may have to return + * ERR_ABRT if this function is called from a lwIP-generated event. + */ + tcpsock_pcb_close(tcp); + + /* + * If we suspended an earlier tcpsock_close() call, we have to tell + * libsockevent that the close operation is now complete. 
+ */ + if (sockevent_is_closing(tcpsock_get_sock(tcp))) { + assert(tcp->tcp_rcv.tr_len == 0); + + sockevent_raise(tcpsock_get_sock(tcp), SEV_CLOSE); + + return TRUE; + } else + return FALSE; +} + +/* + * Attempt to start or resume enqueuing data and/or a FIN to send on the given + * TCP socket. Return TRUE if anything at all could be newly enqueued on the + * lwIP PCB, even if less than desired. In that case, the caller should try to + * send whatever was enqueued, and if applicable, check if the socket may now + * be closed (due to the FIN being enqueued). In particular, in any situation + * where the socket may be in the process of being closed, the caller must use + * tcpsock_may_close() if TRUE is returned. Return FALSE if nothing new could + * be enqueued, in which case no send attempt need to be made either. + */ +static int +tcpsock_pcb_enqueue(struct tcpsock * tcp) +{ + struct pbuf *punsent; + size_t space, chunk; + unsigned int flags; + err_t err; + int enqueued; + + assert(tcp->tcp_pcb != NULL); + + if (tcpsock_get_flags(tcp) & TCPF_FULL) + return FALSE; + + /* + * Attempt to enqueue more unsent data, if any, on the PCB's send + * queue. + */ + enqueued = FALSE; + + while (tcp->tcp_snd.ts_unsent != NULL) { + if ((space = tcp_sndbuf(tcp->tcp_pcb)) == 0) + break; + + /* + * We may maintain a non-NULL unsent pointer even when there is + * nothing more to send right now, because the tail buffer may + * be filled up further later on. + */ + punsent = tcp->tcp_snd.ts_unsent; + + assert(punsent->len >= tcp->tcp_snd.ts_unsent_off); + + chunk = (size_t)punsent->len - tcp->tcp_snd.ts_unsent_off; + if (chunk == 0) + break; + + if (chunk > space) + chunk = space; + + /* Try to enqueue more data for sending. */ + if (chunk < punsent->len || punsent->next != NULL) + flags = TCP_WRITE_FLAG_MORE; + else + flags = 0; + + err = tcp_write(tcp->tcp_pcb, (char *)punsent->payload + + tcp->tcp_snd.ts_unsent_off, chunk, flags); + + /* + * Since tcp_write() enqueues data only, it should only return + * out-of-memory errors; no fatal ones. In any case, stop. + */ + if (err != ERR_OK) { + assert(err == ERR_MEM); + + break; + } + + /* We have successfully enqueued data. */ + enqueued = TRUE; + + tcp->tcp_snd.ts_unsent_off += chunk; + + if (tcp->tcp_snd.ts_unsent_off < punsent->tot_len) { + assert(tcp->tcp_snd.ts_unsent_off < punsent->len || + punsent->next == NULL); + + break; + } + + tcp->tcp_snd.ts_unsent = punsent->next; + tcp->tcp_snd.ts_unsent_off = 0; + } + + /* + * If all pending data has been enqueued for sending, and we should + * shut down the sending end of the socket, try that now. + */ + if ((tcp->tcp_snd.ts_unsent == NULL || + tcp->tcp_snd.ts_unsent_off == tcp->tcp_snd.ts_unsent->len) && + tcpsock_is_shutdown(tcp, SFL_SHUT_WR) && + !(tcpsock_get_flags(tcp) & TCPF_SENT_FIN)) { + err = tcp_shutdown(tcp->tcp_pcb, 0 /*shut_rx*/, 1 /*shut_tx*/); + + if (err == ERR_OK) { + /* + * We have successfully enqueued a FIN. The caller is + * now responsible for checking whether the PCB and + * possibly even the socket object can now be freed. + */ + tcpsock_set_flag(tcp, TCPF_SENT_FIN); + + enqueued = TRUE; + } else { + assert(err == ERR_MEM); + + /* + * FIXME: the resolution for lwIP bug #47485 has taken + * away even more control over the closing process from + * us, making tracking sockets especially for SO_LINGER + * even harder. For now, we simply effectively undo + * the patch by clearing TF_CLOSEPEND if tcp_shutdown() + * returns ERR_MEM. This will not be sustainable in + * the long term, though. 
+ */ + tcp->tcp_pcb->flags &= ~TF_CLOSEPEND; + + tcpsock_set_flag(tcp, TCPF_FULL); + } + } + + return enqueued; +} + +/* + * Request lwIP to start sending any enqueued data and/or FIN on the TCP + * socket's lwIP PCB. On success, return OK. On failure, return a negative + * error code, after cleaning up the socket, freeing the PCB. If the socket + * was already being closed, also free the socket object in that case; the + * caller must then not touch the socket object anymore upon return. If the + * socket object is not freed, and if 'raise_error' is TRUE, raise the error + * on the socket object. + */ +static int +tcpsock_pcb_send(struct tcpsock * tcp, int raise_error) +{ + err_t err; + int r; + + assert(tcp->tcp_pcb != NULL); + + /* + * If we have enqueued something, ask lwIP to send TCP packets now. + * This may result in a fatal error, in which case we clean up the + * socket and return the error to the caller. Since cleaning up the + * socket may free the socket object, and the caller cannot tell + * whether that will happen or has happened, also possibly raise the + * error on the socket object if it is not gone. As such, callers that + * set 'raise_error' to FALSE must know for sure that the socket was + * not being closed, for example because the caller is processing a + * (send) call from userland. + */ + err = tcp_output(tcp->tcp_pcb); + + if (err != ERR_OK && err != ERR_MEM) { + tcpsock_pcb_abort(tcp); + + r = util_convert_err(err); + + if (!tcpsock_cleanup(tcp, TRUE /*may_free*/)) { + if (raise_error) + sockevent_set_error(tcpsock_get_sock(tcp), r); + } + /* Otherwise, do not touch the socket object anymore! */ + + return r; + } else + return OK; +} + +/* + * Callback from lwIP. The given number of data bytes have been acknowledged + * as received by the remote end. Dequeue and free data from the TCP socket's + * send queue as appropriate. + */ +static err_t +tcpsock_event_sent(void * arg, struct tcp_pcb * pcb __unused, uint16_t len) +{ + struct tcpsock *tcp = (struct tcpsock *)arg; + struct pbuf *phead; + size_t left; + + assert(tcp != NULL); + assert(pcb == tcp->tcp_pcb); + assert(len > 0); + + assert(tcp->tcp_snd.ts_len >= len); + assert(tcp->tcp_snd.ts_head != NULL); + + left = len; + + /* + * First see if we can free up whole buffers. Check against the head + * buffer's 'len' rather than 'tot_len', or we may end up leaving an + * empty buffer on the chain. + */ + while ((phead = tcp->tcp_snd.ts_head) != NULL && + left >= (size_t)phead->len - tcp->tcp_snd.ts_head_off) { + left -= (size_t)phead->len - tcp->tcp_snd.ts_head_off; + + tcp->tcp_snd.ts_head = phead->next; + tcp->tcp_snd.ts_head_off = 0; + + if (phead == tcp->tcp_snd.ts_unsent) { + assert(tcp->tcp_snd.ts_unsent_off == phead->len); + + tcp->tcp_snd.ts_unsent = phead->next; + tcp->tcp_snd.ts_unsent_off = 0; + } + + assert(tcpsock_sendbufs > 0); + tcpsock_sendbufs--; + + tcpsock_free_buf(phead); + } + + /* + * The rest of the given length is for less than the current head + * buffer. 
+ */ + if (left > 0) { + assert(tcp->tcp_snd.ts_head != NULL); + assert((size_t)tcp->tcp_snd.ts_head->len - + tcp->tcp_snd.ts_head_off > left); + + tcp->tcp_snd.ts_head_off += left; + } + + tcp->tcp_snd.ts_len -= (size_t)len; + + if (tcp->tcp_snd.ts_head == NULL) { + assert(tcp->tcp_snd.ts_len == 0); + assert(tcp->tcp_snd.ts_unsent == NULL); + tcp->tcp_snd.ts_tail = NULL; + } else + assert(tcp->tcp_snd.ts_len > 0); + + /* + * If we emptied the send queue, and we already managed to send a FIN + * earlier, we may now have met all requirements to close the socket's + * PCB. Otherwise, we may also be able to send more now, so try to + * resume sending. Since we are invoked from the "sent" event, + * tcp_output() will not actually process anything, and so we do not + * call it either. If we did, we would have to deal with errors here. + */ + if (tcpsock_may_close(tcp)) { + if (tcpsock_finish_close(tcp)) + return ERR_OK; + } else { + tcpsock_clear_flag(tcp, TCPF_FULL); + + /* + * If we now manage to enqueue a FIN, we may be ready to close + * the PCB after all. + */ + if (tcpsock_pcb_enqueue(tcp)) { + if (tcpsock_may_close(tcp) && + tcpsock_finish_close(tcp)) + return ERR_OK; + } + } + + /* The user may also be able to send more now. */ + sockevent_raise(tcpsock_get_sock(tcp), SEV_SEND); + + return ERR_OK; +} + +/* + * Check whether any (additional) data previously received on a TCP socket + * should be acknowledged, possibly allowing the remote end to send additional + * data as a result. + */ +static void +tcpsock_ack_recv(struct tcpsock * tcp) +{ + size_t rcvbuf, left, delta, ack; + + assert(tcp->tcp_pcb != NULL); + + /* + * We must make sure that at all times, we can still add an entire + * window's worth of data to the receive queue. If the amount of free + * space drops below that threshold, we stop acknowledging received + * data. The user may change the receive buffer size at all times; we + * update the window size lazily as appropriate. + */ + rcvbuf = tcpsock_get_rcvbuf(tcp); + + if (rcvbuf > tcp->tcp_rcv.tr_len && tcp->tcp_rcv.tr_unacked > 0) { + /* + * The number of bytes that lwIP can still give us at any time + * is represented as 'left'. The number of bytes that we still + * allow to be stored in the receive queue is represented as + * 'delta'. We must make sure that 'left' does not ever exceed + * 'delta' while acknowledging as many bytes as possible under + * that rule. + */ + left = TCP_WND - tcp->tcp_rcv.tr_unacked; + delta = rcvbuf - tcp->tcp_rcv.tr_len; + + if (left < delta) { + ack = delta - left; + + if (ack > tcp->tcp_rcv.tr_unacked) + ack = tcp->tcp_rcv.tr_unacked; + + tcp_recved(tcp->tcp_pcb, ack); + + tcp->tcp_rcv.tr_unacked -= ack; + + assert(tcp->tcp_rcv.tr_len + TCP_WND - + tcp->tcp_rcv.tr_unacked <= rcvbuf); + } + } +} + +/* + * Attempt to merge two consecutive underfilled buffers in the receive queue of + * a TCP socket, freeing up one of the two buffers as a result. The first + * (oldest) buffer is 'ptail', and the pointer to this buffer is stored at + * 'pnext'. The second (new) buffer is 'pbuf', which is already attached to + * the first buffer. The second buffer may be followed by additional buffers + * with even more new data. Return TRUE if buffers have been merged, in which + * case the pointer at 'pnext' may have changed, and no assumptions should be + * made about whether 'ptail' and 'pbuf' still exist in any form. Return FALSE + * if no merging was necessary or if no new buffer could be allocated. 
+ */ +static int +tcpsock_try_merge(struct pbuf **pnext, struct pbuf * ptail, struct pbuf * pbuf) +{ + struct pbuf *pnew; + + assert(*pnext == ptail); + assert(ptail->next == pbuf); + + /* + * Unfortunately, we cannot figure out what kind of pbuf we were given + * by the lower layers, so we cannot merge two buffers without first + * allocating a third. Once we have done that, though, we can easily + * merge more into that new buffer. For now we use the following + * policies: + * + * 1. if two consecutive lwIP-provided buffers are both used less than + * half the size of a full buffer, try to allocate a new buffer and + * copy both lwIP-provided buffers into that new buffer, freeing up + * the pair afterwards; + * 2. if the tail buffer on the chain is allocated by us and not yet + * full, and the next buffer's contents can be added to the tail + * buffer in their entirety, do just that. + * + * Obviously there is a trade-off between the performance overhead of + * copying and the resource overhead of keeping less-than-full buffers + * on the receive queue, but this policy should both keep actual memory + * usage to no more than twice the receive queue length and prevent + * excessive copying. The policy deliberately performs more aggressive + * merging into a buffer that we allocated ourselves. + */ + if (ptail->tot_len <= MEMPOOL_BUFSIZE / 2 && + pbuf->len <= MEMPOOL_BUFSIZE / 2) { + /* + * Case #1. + */ + assert(ptail->tot_len == ptail->len); + assert(pbuf->tot_len == pbuf->len); + + pnew = tcpsock_alloc_buf(); + if (pnew == NULL) + return FALSE; + + memcpy(pnew->payload, ptail->payload, ptail->len); + memcpy((char *)pnew->payload + ptail->len, pbuf->payload, + pbuf->len); + pnew->len = ptail->len + pbuf->len; + assert(pnew->len <= pnew->tot_len); + + pnew->next = pbuf->next; + /* For now, we need not inherit any flags from either pbuf. */ + + *pnext = pnew; + + /* One allocated, two about to be deallocated. */ + assert(tcpsock_recvbufs > 0); + tcpsock_recvbufs--; + + tcpsock_free_buf(ptail); + tcpsock_free_buf(pbuf); + + return TRUE; + } else if (ptail->tot_len - ptail->len >= pbuf->len) { + /* + * Case #2. + */ + memcpy((char *)ptail->payload + ptail->len, pbuf->payload, + pbuf->len); + + ptail->len += pbuf->len; + + ptail->next = pbuf->next; + + assert(tcpsock_recvbufs > 0); + tcpsock_recvbufs--; + + tcpsock_free_buf(pbuf); + + return TRUE; + } else + return FALSE; +} + +/* + * Callback from lwIP. New data or flags have been received on a TCP socket. + */ +static err_t +tcpsock_event_recv(void * arg, struct tcp_pcb * pcb __unused, + struct pbuf * pbuf, err_t err) +{ + struct tcpsock *tcp = (struct tcpsock *)arg; + struct pbuf *ptail, **pprevp; + size_t len; + + assert(tcp != NULL); + assert(pcb == tcp->tcp_pcb); + + /* + * lwIP should never provide anything other than ERR_OK in 'err', and + * it is not clear what we should do if it would. If lwIP ever changes + * in this regard, we will likely have to change this code accordingly. + */ + if (err != ERR_OK) + panic("TCP receive event with error: %d", err); + + /* If the given buffer is NULL, we have received a FIN. */ + if (pbuf == NULL) { + tcpsock_set_flag(tcp, TCPF_RCVD_FIN); + + /* Userland may now receive EOF. */ + if (!tcpsock_is_shutdown(tcp, SFL_SHUT_RD)) + sockevent_raise(tcpsock_get_sock(tcp), SEV_RECV); + + /* + * If we were in the process of closing the socket, and we + * receive a FIN before our FIN got acknowledged, we close the + * socket anyway, as described in tcpsock_close(). 
However, if + * there is still unacknowledged outgoing data or we did not + * even manage to send our FIN yet, hold off closing the socket + * for now. + */ + if (tcpsock_may_close(tcp)) + (void)tcpsock_finish_close(tcp); + + return ERR_OK; + } + + /* + * If the socket is being closed, receiving new data should cause a + * reset. + */ + if (sockevent_is_closing(tcpsock_get_sock(tcp))) { + tcpsock_pcb_abort(tcp); + + (void)tcpsock_cleanup(tcp, TRUE /*may_free*/); + /* Do not touch the socket object anymore! */ + + pbuf_free(pbuf); + + return ERR_ABRT; + } + + /* + * If the socket has already been shut down for reading, discard the + * incoming data and do nothing else. + */ + if (tcpsock_is_shutdown(tcp, SFL_SHUT_RD)) { + tcp_recved(tcp->tcp_pcb, pbuf->tot_len); + + pbuf_free(pbuf); + + return ERR_OK; + } + + /* + * We deliberately ignore the PBUF_FLAG_PUSH flag. This flag would + * enable the receive functionality to delay delivering "un-pushed" + * data to applications. The implementation of this scheme could track + * the amount of data up to and including the last-pushed segment using + * a "tr_push_len" field or so. Deciding when to deliver "un-pushed" + * data after all is a bit tricker though. As far as I can tell, the + * BSDs do not implement anything like that. Windows does, and this + * results in interaction problems with even more lightweight TCP/IP + * stacks that do not send the TCP PSH flag. Currently, there is no + * obvious benefit for us to support delaying data delivery like that. + * In addition, testing its implementation reliably would be difficult. + */ + + len = (size_t)pbuf->tot_len; + + /* + * Count the number of buffers that are now owned by us. The new total + * of buffers owned by us must not exceed the size of the memory pool. + * Any more would indicate an accounting error. Note that + * tcpsock_recvbufs is currently used for debugging only! + */ + tcpsock_recvbufs += pbuf_clen(pbuf); + assert(tcpsock_recvbufs < mempool_cur_buffers()); + + /* + * The pre-tail pointer points to whatever is pointing to the tail + * buffer. The latter pointer may be the 'tr_head' field in our + * tcpsock structure, or the 'next' field in the penultimate buffer, + * or NULL if there are currently no buffers on the receive queue. + */ + if ((pprevp = tcp->tcp_rcv.tr_pre_tailp) != NULL) { + ptail = *pprevp; + + assert(ptail != NULL); + assert(ptail->next == NULL); + assert(tcp->tcp_rcv.tr_head != NULL); + + ptail->next = pbuf; + pbuf->tot_len = pbuf->len; /* to help freeing on merges */ + + if (tcpsock_try_merge(pprevp, ptail, pbuf)) { + ptail = *pprevp; + pbuf = ptail->next; + } + + if (pbuf != NULL) + pprevp = &ptail->next; + } else { + assert(tcp->tcp_rcv.tr_head == NULL); + assert(tcp->tcp_rcv.tr_head_off == 0); + + tcp->tcp_rcv.tr_head = pbuf; + + pprevp = &tcp->tcp_rcv.tr_head; + } + + /* + * Chop up the chain into individual buffers. This is necessary as we + * overload 'tot_len' to mean "space available in the buffer", as we + * want for buffers allocated by us as part of buffer merges. Also get + * a pointer to the pointer to the new penultimate tail buffer. Due to + * merging, the chain may already be empty by now, though. 
+ */ + if (pbuf != NULL) { + for (; pbuf->next != NULL; pbuf = pbuf->next) { + pbuf->tot_len = pbuf->len; + + pprevp = &pbuf->next; + } + assert(pbuf->len == pbuf->tot_len); + } + + assert(*pprevp != NULL); + assert((*pprevp)->next == NULL); + tcp->tcp_rcv.tr_pre_tailp = pprevp; + + tcp->tcp_rcv.tr_len += len; + tcp->tcp_rcv.tr_unacked += len; + + assert(tcp->tcp_rcv.tr_unacked <= TCP_WND); + + /* + * Note that tr_len may now exceed the receive buffer size in the + * highly exceptional case that the user is adjusting the latter after + * the socket had already received data. + */ + + /* See if we can immediately acknowledge some or all of the data. */ + tcpsock_ack_recv(tcp); + + /* Also wake up any receivers now. */ + sockevent_raise(tcpsock_get_sock(tcp), SEV_RECV); + + return ERR_OK; +} + +/* + * Callback from lwIP. The PCB corresponding to the socket identified by 'arg' + * has been closed by lwIP, with the reason specified in 'err': either the + * connection has been aborted locally (ERR_ABRT), it has been reset by the + * remote end (ERR_RST), or it is closed due to state transitions (ERR_CLSD). + */ +static void +tcpsock_event_err(void * arg, err_t err) +{ + struct tcpsock *tcp = (struct tcpsock *)arg; + int r; + + assert(tcp != NULL); + assert(tcp->tcp_pcb != NULL); + assert(err != ERR_OK); + + /* The original PCB is now gone, or will be shortly. */ + tcp->tcp_pcb = NULL; + + /* + * Clean up the socket. As a result it may be freed, in which case we + * must not touch it anymore. No need to return ERR_ABRT from here, as + * the PCB has been aborted already. + */ + if (tcpsock_cleanup(tcp, TRUE /*may_free*/)) + return; + + if (err == ERR_CLSD) { + /* + * We may get here if the socket is shut down for writing and + * we already received a FIN from the remote side, thus putting + * the socket in LAST_ACK state, and we receive that last + * acknowledgment. There is nothing more we need to do. + * + * We will never get here in the other case that ERR_CLSD is + * raised, which is when the socket is reset because of + * unacknowledged data while closing: we handle the + * reset-on-ACK case ourselves in tcpsock_close(), and the + * socket is in closing state after that. + */ + assert(tcpsock_is_shutdown(tcp, SFL_SHUT_WR)); + assert(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN); + } else { + /* + * Anything else should be an error directly from lwIP; + * currently either ERR_ABRT and ERR_RST. Covert it to a + * regular error and set it on the socket. Doing so will also + * raise the appropriate events. + */ + /* + * Unfortunately, lwIP is not throwing accurate errors even + * when it can. We convert some errors to reflect more + * accurately the most likely cause. + * + * TODO: fix lwIP in this regard.. + */ + r = util_convert_err(err); + + if (tcpsock_get_flags(tcp) & TCPF_CONNECTING) { + switch (err) { + case ERR_ABRT: r = ETIMEDOUT; break; + case ERR_RST: r = ECONNREFUSED; break; + } + } + + sockevent_set_error(tcpsock_get_sock(tcp), r); + } +} + +/* + * Callback from lwIP. Perform regular checks on a TCP socket. This function + * is called one per five seconds on connected sockets, and twice per second on + * closing sockets. + */ +static err_t +tcpsock_event_poll(void * arg, struct tcp_pcb * pcb __unused) +{ + struct tcpsock *tcp = (struct tcpsock *)arg; + err_t err; + int r; + + assert(tcp != NULL); + assert(pcb == tcp->tcp_pcb); + + /* + * If we ended up running out of buffers earlier, try resuming any send + * requests now, both for enqueuing TCP data with lwIP and for user + * requests. 
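From user space, the effect of the error conversion in tcpsock_event_err() above is simply that a failed connection attempt reports ETIMEDOUT or ECONNREFUSED. A minimal sketch (standard BSD sockets API, not part of this diff) of observing such an asynchronous connect error through SO_ERROR:

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <arpa/inet.h>
    #include <errno.h>
    #include <fcntl.h>
    #include <poll.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int
    main(void)
    {
        struct sockaddr_in sin;
        struct pollfd pfd;
        socklen_t len;
        int fd, err;

        fd = socket(AF_INET, SOCK_STREAM, 0);
        fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_NONBLOCK);

        memset(&sin, 0, sizeof(sin));
        sin.sin_family = AF_INET;
        sin.sin_port = htons(9);            /* assume nothing listens here */
        inet_pton(AF_INET, "127.0.0.1", &sin.sin_addr);

        if (connect(fd, (struct sockaddr *)&sin, sizeof(sin)) == -1 &&
            errno != EINPROGRESS)
            perror("connect");

        pfd.fd = fd;
        pfd.events = POLLOUT;
        (void)poll(&pfd, 1, -1);

        /* The pending asynchronous error, e.g. ECONNREFUSED or ETIMEDOUT. */
        len = sizeof(err);
        getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
        printf("connect result: %s\n", strerror(err));

        close(fd);
        return 0;
    }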
+ */ + if (tcpsock_get_flags(tcp) & (TCPF_FULL | TCPF_OOM)) { + tcpsock_clear_flag(tcp, TCPF_FULL); + tcpsock_clear_flag(tcp, TCPF_OOM); + + /* See if we can enqueue more data with lwIP. */ + if (tcpsock_pcb_enqueue(tcp)) { + /* In some cases, we can now close the PCB. */ + if (tcpsock_may_close(tcp)) { + (void)tcpsock_finish_close(tcp); + /* + * The PCB is definitely gone here, and the + * entire socket object may be gone now too. + * Do not touch either anymore! + */ + + return ERR_OK; + } + + /* + * If actually sending the data fails, the PCB will be + * gone, and the socket object may be gone as well. Do + * not touch either anymore in that case! + */ + if (tcpsock_pcb_send(tcp, TRUE /*raise_error*/) != OK) + return ERR_ABRT; + } + + /* + * If we ran out of buffers earlier, it may be possible to take + * in more data from a user process now, even if we did not + * manage to enqueue any more pending data with lwIP. + */ + sockevent_raise(tcpsock_get_sock(tcp), SEV_SEND); + + assert(tcp->tcp_pcb != NULL); + } else if (tcp->tcp_snd.ts_unsent != NULL && + tcp->tcp_snd.ts_unsent_off < tcp->tcp_snd.ts_unsent->len) { + /* + * If the send buffer is full, we will no longer call + * tcp_output(), which means we may also miss out on fatal + * errors that would otherwise kill the connection (e.g., no + * route). As a result, the connection may erroneously + * continue to exist for a long time. To avoid this, we call + * tcp_output() every once in a while when there are still + * unsent data. + */ + err = tcp_output(tcp->tcp_pcb); + + if (err != ERR_OK && err != ERR_MEM) { + tcpsock_pcb_abort(tcp); + + if (!tcpsock_cleanup(tcp, TRUE /*may_free*/)) { + r = util_convert_err(err); + + sockevent_set_error(tcpsock_get_sock(tcp), r); + } + /* Otherwise do not touch the socket object anymore! */ + + return ERR_ABRT; + } + } + + /* + * If we are closing the socket, and we sent a FIN, see if the FIN got + * acknowledged. If so, finish closing the socket. Unfortunately, we + * can perform this check by polling only. TODO: change lwIP.. + */ + if (sockevent_is_closing(tcpsock_get_sock(tcp)) && + (tcpsock_get_flags(tcp) & TCPF_SENT_FIN) && + tcp->tcp_pcb->unsent == NULL && tcp->tcp_pcb->unacked == NULL) { + assert(tcp->tcp_snd.ts_len == 0); + + tcpsock_finish_close(tcp); + } + + return ERR_OK; +} + +/* + * Bind a TCP socket to a local address. + */ +static int +tcpsock_bind(struct sock * sock, const struct sockaddr * addr, + socklen_t addr_len, endpoint_t user_endpt) +{ + struct tcpsock *tcp = (struct tcpsock *)sock; + ip_addr_t ipaddr; + uint16_t port; + err_t err; + int r; + + if (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state != CLOSED) + return EINVAL; + + if ((r = ipsock_get_src_addr(tcpsock_get_ipsock(tcp), addr, addr_len, + user_endpt, &tcp->tcp_pcb->local_ip, tcp->tcp_pcb->local_port, + FALSE /*allow_mcast*/, &ipaddr, &port)) != OK) + return r; + + err = tcp_bind(tcp->tcp_pcb, &ipaddr, port); + + return util_convert_err(err); +} + +/* + * Callback from lwIP. A new connection 'pcb' has arrived on the listening + * socket identified by 'arg'. Note that 'pcb' may be NULL in the case that + * lwIP could not accept the connection itself. + */ +static err_t +tcpsock_event_accept(void * arg, struct tcp_pcb * pcb, err_t err) +{ + struct tcpsock *tcp = (struct tcpsock *)arg; + + assert(tcp != NULL); + assert(tcpsock_is_listening(tcp)); + + /* + * If the given PCB is NULL, then lwIP ran out of memory allocating a + * PCB for the new connection. There is nothing we can do with that + * information. 
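The regular and close-time polling rates that drive tcpsock_event_poll() above are set through lwIP's tcp_poll(), whose interval argument counts TCP coarse timer ticks of 500 ms each. Assuming that convention, the interval constants (defined elsewhere in this file; the values below are illustrative assumptions, not taken from this diff) would look roughly like:

    /*
     * Assumed values, shown for illustration only; the real definitions are
     * earlier in this file and may differ.
     */
    #define TCP_POLL_REG_INTERVAL   10      /* 10 x 500 ms = 5 s */
    #define TCP_POLL_CLOSE_INTERVAL 1       /* 1 x 500 ms = 0.5 s */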
Also check 'err' just to make sure. + */ + if (pcb == NULL || err != OK) + return ERR_OK; + + /* + * The TCP socket is the listening socket, but the PCB is for the + * incoming connection. + */ + if (tcpsock_clone(tcp, pcb) != OK) { + /* + * We could not allocate the resources necessary to accept the + * connection. Abort it immediately. + */ + tcp_abort(pcb); + + return ERR_ABRT; + } + + /* + * The connection has not yet been accepted, and thus should still be + * considered on the listen queue. + */ + tcp_backlog_delayed(pcb); + + /* Set the callback functions. */ + tcp_recv(pcb, tcpsock_event_recv); + tcp_sent(pcb, tcpsock_event_sent); + tcp_err(pcb, tcpsock_event_err); + tcp_poll(pcb, tcpsock_event_poll, TCP_POLL_REG_INTERVAL); + + sockevent_raise(tcpsock_get_sock(tcp), SEV_ACCEPT); + + return ERR_OK; +} + +/* + * Put a TCP socket in listening mode. + */ +static int +tcpsock_listen(struct sock * sock, int backlog) +{ + struct tcpsock *tcp = (struct tcpsock *)sock; + struct tcp_pcb *pcb; + err_t err; + + /* The maximum backlog value must not exceed its field size. */ + assert(SOMAXCONN <= UINT8_MAX); + + /* + * Allow only CLOSED sockets to enter listening mode. If the socket + * was already in listening mode, allow its backlog value to be + * updated, even if it was shut down already (making this a no-op). + */ + if (!tcpsock_is_listening(tcp) && + (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state != CLOSED)) + return EINVAL; + + /* + * If the socket was not already in listening mode, put it in that mode + * now. That involves switching PCBs as lwIP attempts to save memory + * by replacing the original PCB with a smaller one. If the socket was + * already in listening mode, simply update its backlog value--this has + * no effect on the sockets already in the backlog. + */ + if (!tcpsock_is_listening(tcp)) { + assert(tcp->tcp_pcb != NULL); + + /* + * If the socket has not been bound to a port yet, do that + * first. This does mean that the listen call may fail with + * side effects, but that is acceptable in this case. + */ + if (tcp->tcp_pcb->local_port == 0) { + err = tcp_bind(tcp->tcp_pcb, &tcp->tcp_pcb->local_ip, + 0 /*port*/); + + if (err != ERR_OK) + return util_convert_err(err); + } + + /* + * Clear the argument on the PCB that is about to be replaced, + * because if we do not, once the PCB is reused (which does not + * clear the argument), we might get weird events. Do this + * before the tcp_listen() call, because we should no longer + * access the old PCB afterwards (even if we can). + */ + tcp_arg(tcp->tcp_pcb, NULL); + + pcb = tcp_listen_with_backlog_and_err(tcp->tcp_pcb, backlog, + &err); + + if (pcb == NULL) { + tcp_arg(tcp->tcp_pcb, tcp); /* oops, undo. */ + + return util_convert_err(err); + } + + tcp_arg(pcb, tcp); + tcp->tcp_pcb = pcb; + + tcp_accept(pcb, tcpsock_event_accept); + + /* Initialize the queue head for sockets pending acceptance. */ + TAILQ_INIT(&tcp->tcp_queue.tq_head); + } else if (tcp->tcp_pcb != NULL) + tcp_backlog_set(tcp->tcp_pcb, backlog); + + return OK; +} + +/* + * Callback from lwIP. A socket connection attempt has succeeded. Note that + * failed socket events will trigger the tcpsock_event_err() callback instead. 
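As the backlog handling above implies, an application may adjust the backlog of a socket that is already listening simply by calling listen(2) on it again; only connections arriving after that call are affected. A minimal sketch (not part of this diff):

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <stdint.h>
    #include <string.h>

    int
    open_listener(uint16_t port)
    {
        struct sockaddr_in sin;
        int fd;

        fd = socket(AF_INET, SOCK_STREAM, 0);

        memset(&sin, 0, sizeof(sin));
        sin.sin_family = AF_INET;
        sin.sin_port = htons(port);
        sin.sin_addr.s_addr = htonl(INADDR_ANY);

        bind(fd, (struct sockaddr *)&sin, sizeof(sin));

        listen(fd, 5);          /* start with a small backlog */
        listen(fd, 128);        /* later: adjust the backlog on the fly */

        return fd;
    }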
+ */ +static err_t +tcpsock_event_connected(void * arg, struct tcp_pcb * pcb __unused, err_t err) +{ + struct tcpsock *tcp = (struct tcpsock *)arg; + + assert(tcp != NULL); + assert(pcb == tcp->tcp_pcb); + assert(tcpsock_get_flags(tcp) & TCPF_CONNECTING); + + /* + * If lwIP ever changes so that this callback is called for connect + * failures as well, then we need to change the code here accordingly. + */ + if (err != ERR_OK) + panic("TCP connected event with error: %d", err); + + tcpsock_clear_flag(tcp, TCPF_CONNECTING); + + sockevent_raise(tcpsock_get_sock(tcp), SEV_CONNECT | SEV_SEND); + + return ERR_OK; +} + +/* + * Connect a TCP socket to a remote address. + */ +static int +tcpsock_connect(struct sock * sock, const struct sockaddr * addr, + socklen_t addr_len, endpoint_t user_endpt) +{ + struct tcpsock *tcp = (struct tcpsock *)sock; + ip_addr_t dst_addr; + uint16_t dst_port; + err_t err; + int r; + + /* + * Listening sockets may not have a PCB, so we use higher-level flags + * to throw the correct error code for those instead. + */ + if (tcpsock_is_listening(tcp)) + return EOPNOTSUPP; + + /* + * If there is no longer any PCB, we obviously cannot perform the + * connection, but POSIX is not clear on which error to return. We + * copy NetBSD's. + */ + if (tcp->tcp_pcb == NULL) + return EINVAL; + + /* + * The only state from which a connection can be initiated, is CLOSED. + * Some of the other states require distinct error codes, though. + */ + switch (tcp->tcp_pcb->state) { + case CLOSED: + break; + case SYN_SENT: + return EALREADY; + case LISTEN: + assert(0); /* we just checked.. */ + default: + return EISCONN; + } + + /* + * Get the destination address, and attempt to start connecting. If + * the socket was not bound before, or it was bound to a port only, + * then lwIP will select a source address for us. We cannot do this + * ourselves even if we wanted to: it is impossible to re-bind a TCP + * PCB in the case it was previously bound to a port only. + */ + if ((r = ipsock_get_dst_addr(tcpsock_get_ipsock(tcp), addr, addr_len, + &tcp->tcp_pcb->local_ip, &dst_addr, &dst_port)) != OK) + return r; + + err = tcp_connect(tcp->tcp_pcb, &dst_addr, dst_port, + tcpsock_event_connected); + + /* + * Note that various tcp_connect() error cases will leave the PCB with + * a newly set local and remote IP address anyway. We should be + * careful not to rely on the addresses being as they were before. + */ + if (err != ERR_OK) + return util_convert_err(err); + + /* Set the other callback functions. */ + tcp_recv(tcp->tcp_pcb, tcpsock_event_recv); + tcp_sent(tcp->tcp_pcb, tcpsock_event_sent); + tcp_err(tcp->tcp_pcb, tcpsock_event_err); + tcp_poll(tcp->tcp_pcb, tcpsock_event_poll, TCP_POLL_REG_INTERVAL); + + /* + * Set a flag so that we can correct lwIP's error codes in case the + * connection fails. + */ + tcpsock_set_flag(tcp, TCPF_CONNECTING); + + return SUSPEND; +} + +/* + * Test whether any new connections are pending on a listening TCP socket. + */ +static int +tcpsock_test_accept(struct sock * sock) +{ + struct tcpsock *tcp = (struct tcpsock *)sock; + + /* Is this socket in listening mode at all? */ + if (!tcpsock_is_listening(tcp)) + return EINVAL; + + /* Are there any connections to accept right now? */ + if (!TAILQ_EMPTY(&tcp->tcp_queue.tq_head)) + return OK; + + /* If the socket has been shut down, we return ECONNABORTED. */ + if (tcp->tcp_pcb == NULL) + return ECONNABORTED; + + /* Otherwise, wait for a new connection first. 
*/ + return SUSPEND; +} + +/* + * Accept a connection on a listening TCP socket, creating a new TCP socket. + */ +static sockid_t +tcpsock_accept(struct sock * sock, struct sockaddr * addr, + socklen_t * addr_len, endpoint_t user_endpt __unused, + struct sock ** newsockp) +{ + struct tcpsock *listener = (struct tcpsock *)sock; + struct tcpsock *tcp; + int r; + + if ((r = tcpsock_test_accept(sock)) != OK) + return r; + /* Below, we must not assume that the listener has a PCB. */ + + tcp = TAILQ_FIRST(&listener->tcp_queue.tq_head); + assert(tcp->tcp_listener == listener); + assert(tcp->tcp_pcb != NULL); + + TAILQ_REMOVE(&listener->tcp_queue.tq_head, tcp, tcp_queue.tq_next); + tcp->tcp_listener = NULL; + + tcp_backlog_accepted(tcp->tcp_pcb); + + ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len, + &tcp->tcp_pcb->remote_ip, tcp->tcp_pcb->remote_port); + + /* + * Set 'newsockp' to NULL so that libsockevent knows we already cloned + * the socket, and it must not be reinitialized anymore. + */ + *newsockp = NULL; + return tcpsock_get_id(tcp); +} + +/* + * Perform preliminary checks on a send request. + */ +static int +tcpsock_pre_send(struct sock * sock, size_t len __unused, + socklen_t ctl_len __unused, const struct sockaddr * addr __unused, + socklen_t addr_len __unused, endpoint_t user_endpt __unused, int flags) +{ + + /* + * Reject calls with unknown flags. Since libsockevent strips out the + * flags it handles itself here, we only have to test for ones we can + * not handle. Currently, there are no send flags that we support. + */ + if (flags != 0) + return EOPNOTSUPP; + + return OK; +} + +/* + * Test whether the given number of data bytes can be sent on a TCP socket. + */ +static int +tcpsock_test_send(struct sock * sock, size_t min) +{ + struct tcpsock *tcp = (struct tcpsock *)sock; + size_t sndbuf; + + if (tcp->tcp_pcb == NULL) + return EPIPE; + + switch (tcp->tcp_pcb->state) { + case CLOSED: /* new */ + case LISTEN: /* listening */ + return ENOTCONN; + case SYN_SENT: /* connecting */ + case SYN_RCVD: /* simultaneous open, maybe someday? */ + return SUSPEND; + case ESTABLISHED: /* connected */ + case CLOSE_WAIT: /* closed remotely */ + break; + default: /* shut down locally */ + assert(tcpsock_is_shutdown(tcp, SFL_SHUT_WR)); + return EPIPE; + } + + sndbuf = tcpsock_get_sndbuf(tcp); + if (min > sndbuf) + min = sndbuf; + + if (tcp->tcp_snd.ts_len + min > sndbuf) + return SUSPEND; + else + return OK; +} + +/* + * Send data on a TCP socket. + */ +static int +tcpsock_send(struct sock * sock, const struct sockdriver_data * data, + size_t len, size_t * offp, const struct sockdriver_data * ctl __unused, + socklen_t ctl_len __unused, socklen_t * ctl_off __unused, + const struct sockaddr * addr __unused, socklen_t addr_len __unused, + endpoint_t user_endpt __unused, int flags __unused, size_t min) +{ + struct tcpsock *tcp = (struct tcpsock *)sock; + struct pbuf *ptail, *pfirst, *pnext, *plast; + size_t off, tail_off, chunk, left, sndbuf; + int r; + + if ((r = tcpsock_test_send(sock, min)) != OK) + return r; + + if (len == 0) + return OK; /* nothing to do */ + + sndbuf = tcpsock_get_sndbuf(tcp); + if (min > sndbuf) + min = sndbuf; + assert(min > 0); + + assert(sndbuf > tcp->tcp_snd.ts_len); + left = sndbuf - tcp->tcp_snd.ts_len; + if (left > len) + left = len; + + /* + * First see if we can fit any more data in the current tail buffer. 
+ * If so, we set 'ptail' to point to it and 'tail_off' to the previous + * length of the tail buffer, while optimistically extending it to + * include the new data. If not, we set them to NULL/0. + */ + if ((ptail = tcp->tcp_snd.ts_tail) != NULL && + ptail->len < ptail->tot_len) { + assert(ptail->len > 0); + tail_off = (size_t)ptail->len; + + /* + * Optimistically extend the head buffer to include whatever + * fits in it. This is needed for util_copy_data(). + */ + assert(ptail->tot_len > ptail->len); + off = (size_t)ptail->tot_len - (size_t)ptail->len; + if (off > left) + off = left; + ptail->len += off; + } else { + ptail = NULL; + tail_off = 0; + off = 0; + } + + /* + * Then, if there is more to send, allocate new buffers as needed. If + * we run out of memory, work with whatever we did manage to grab. + */ + pfirst = NULL; + plast = NULL; + while (off < left) { + if (tcpsock_sendbufs >= TCP_MAX_SENDBUFS || + (pnext = tcpsock_alloc_buf()) == NULL) { + /* + * Chances are that we will end up suspending this send + * request because of being out of buffers. We try to + * resume such requests from the polling function. + */ + tcpsock_set_flag(tcp, TCPF_OOM); + + break; + } + + tcpsock_sendbufs++; + + if (pfirst == NULL) + pfirst = pnext; + else + plast->next = pnext; + plast = pnext; + + chunk = (size_t)pnext->tot_len; + if (chunk > left - off) + chunk = left - off; + pnext->len = chunk; + off += chunk; + } + + /* + * Copy in the data and continue, unless we did not manage to find + * enough space to even meet the low send watermark, in which case we + * undo any allocation and suspend the call until later. + */ + if (off >= min) { + /* + * Optimistically attach the new buffers to the tail, also for + * util_copy_data(). We undo all this if the copy fails. + */ + if (ptail != NULL) { + ptail->next = pfirst; + + pnext = ptail; + } else + pnext = pfirst; + + assert(pnext != NULL); + + r = util_copy_data(data, off, *offp, pnext, tail_off, + TRUE /*copy_in*/); + } else + r = SUSPEND; + + if (r != OK) { + /* Undo the modifications made so far. */ + while (pfirst != NULL) { + pnext = pfirst->next; + + assert(tcpsock_sendbufs > 0); + tcpsock_sendbufs--; + + tcpsock_free_buf(pfirst); + + pfirst = pnext; + } + + if (ptail != NULL) { + ptail->next = NULL; + + ptail->len = tail_off; + } + + return r; + } + + /* Attach the new buffers, if any, to the buffer tail. */ + if (pfirst != NULL) { + if ((ptail = tcp->tcp_snd.ts_tail) != NULL) { + assert(ptail->len == ptail->tot_len); + + /* + * Due to our earlier optimistic modifications, this + * may or may not be redundant. + */ + ptail->next = pfirst; + } + + assert(plast != NULL); + tcp->tcp_snd.ts_tail = plast; + + if (tcp->tcp_snd.ts_head == NULL) { + tcp->tcp_snd.ts_head = pfirst; + assert(tcp->tcp_snd.ts_head_off == 0); + } + if (tcp->tcp_snd.ts_unsent == NULL) { + tcp->tcp_snd.ts_unsent = pfirst; + assert(tcp->tcp_snd.ts_unsent_off == 0); + } + } + + tcp->tcp_snd.ts_len += off; + + /* + * See if we can send any of the data we just enqueued. The socket is + * still open as we are still processing a call from userland on it; + * this saves us from having to deal with the cases that the following + * calls end up freeing the socket object. + */ + if (tcpsock_pcb_enqueue(tcp) && + (r = tcpsock_pcb_send(tcp, FALSE /*raise_error*/)) != OK) { + /* + * That did not go well. Return the error immediately if we + * had not made any progress earlier. Otherwise, return our + * partial progress and leave the error to be picked up later. 
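For a non-blocking socket, the partial-progress bookkeeping above surfaces to userland as short send(2) results, with any deferred error reported by a later call. A minimal sketch of the corresponding user-side handling (standard API, not part of this diff):

    #include <sys/types.h>
    #include <sys/socket.h>
    #include <errno.h>

    static ssize_t
    send_all(int fd, const char *buf, size_t len)
    {
        size_t off = 0;
        ssize_t r;

        while (off < len) {
            r = send(fd, buf + off, len - off, 0);

            if (r == -1) {
                if (errno == EWOULDBLOCK || errno == EINTR)
                    continue;       /* or wait for writability first */
                return -1;          /* deferred errors show up here */
            }

            off += (size_t)r;
        }

        return (ssize_t)off;
    }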
+ */ + if (*offp > 0) { + sockevent_set_error(tcpsock_get_sock(tcp), r); + + return OK; + } else + return r; + } + + *offp += off; + return (off < len) ? SUSPEND : OK; +} + +/* + * Perform preliminary checks on a receive request. + */ +static int +tcpsock_pre_recv(struct sock * sock __unused, endpoint_t user_endpt __unused, + int flags) +{ + + /* + * Reject calls with unknown flags. Since libsockevent strips out the + * flags it handles itself here, we only have to test for ones we can + * not handle. + */ + if ((flags & ~(MSG_PEEK | MSG_WAITALL)) != 0) + return EOPNOTSUPP; + + return OK; +} + +/* + * Return TRUE if receive calls may wait for more data to come in on the + * connection, or FALSE if we already know that that is not going to happen. + */ +static int +tcpsock_may_wait(struct tcpsock * tcp) +{ + + return (tcp->tcp_pcb != NULL && + !(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN)); +} + +/* + * Test whether data can be received on a TCP socket, and if so, how many bytes + * of data. + */ +static int +tcpsock_test_recv(struct sock * sock, size_t min, size_t * size) +{ + struct tcpsock *tcp = (struct tcpsock *)sock; + int may_wait; + + /* If there is and never was a connection, refuse the call at all. */ + if (tcp->tcp_pcb != NULL && (tcp->tcp_pcb->state == CLOSED || + tcp->tcp_pcb->state == LISTEN)) + return ENOTCONN; + + /* + * If we are certain that no more data will come in later, ignore the + * low receive watermark. Otherwise, bound it to the size of the + * receive buffer, or receive calls may block forever. + */ + if (!(may_wait = tcpsock_may_wait(tcp))) + min = 1; + else if (min > tcpsock_get_rcvbuf(tcp)) + min = tcpsock_get_rcvbuf(tcp); + + if (tcp->tcp_rcv.tr_len >= min) { + if (size != NULL) + *size = tcp->tcp_rcv.tr_len; + + return OK; + } + + return (may_wait) ? SUSPEND : SOCKEVENT_EOF; +} + +/* + * Receive data on a TCP socket. + */ +static int +tcpsock_recv(struct sock * sock, const struct sockdriver_data * data, + size_t len, size_t * offp, const struct sockdriver_data * ctl __unused, + socklen_t ctl_len __unused, socklen_t * ctl_off __unused, + struct sockaddr * addr __unused, socklen_t * addr_len __unused, + endpoint_t user_endpt __unused, int flags, size_t min, + int * rflags __unused) +{ + struct tcpsock *tcp = (struct tcpsock *)sock; + struct pbuf *ptail; + size_t off, left; + int r; + + /* See if we can receive at all, and if so, how much at most. */ + if ((r = tcpsock_test_recv(sock, min, NULL)) != OK) + return r; + + if (len == 0) + return OK; /* nothing to do */ + + off = tcp->tcp_rcv.tr_len; + if (off > len) + off = len; + + assert(tcp->tcp_rcv.tr_head != NULL); + assert(tcp->tcp_rcv.tr_head_off < tcp->tcp_rcv.tr_head->len); + + /* Copy out the data to the caller. */ + if ((r = util_copy_data(data, off, *offp, tcp->tcp_rcv.tr_head, + tcp->tcp_rcv.tr_head_off, FALSE /*copy_in*/)) != OK) + return r; + + /* Unless peeking, remove the data from the receive queue. */ + if (!(flags & MSG_PEEK)) { + left = off; + + /* Dequeue and free as many entire buffers as possible. 
*/ + while ((ptail = tcp->tcp_rcv.tr_head) != NULL && + left >= (size_t)ptail->len - tcp->tcp_rcv.tr_head_off) { + left -= (size_t)ptail->len - tcp->tcp_rcv.tr_head_off; + + tcp->tcp_rcv.tr_head = ptail->next; + tcp->tcp_rcv.tr_head_off = 0; + + if (tcp->tcp_rcv.tr_head == NULL) + tcp->tcp_rcv.tr_pre_tailp = NULL; + else if (tcp->tcp_rcv.tr_pre_tailp == &ptail->next) + tcp->tcp_rcv.tr_pre_tailp = + &tcp->tcp_rcv.tr_head; + + assert(tcpsock_recvbufs > 0); + tcpsock_recvbufs--; + + tcpsock_free_buf(ptail); + } + + /* + * If only part of the (new) head buffer is consumed, adjust + * the saved offset into that buffer. + */ + if (left > 0) { + assert(tcp->tcp_rcv.tr_head != NULL); + assert((size_t)tcp->tcp_rcv.tr_head->len - + tcp->tcp_rcv.tr_head_off > left); + + tcp->tcp_rcv.tr_head_off += left; + } + + tcp->tcp_rcv.tr_len -= off; + + if (tcp->tcp_rcv.tr_head != NULL) { + assert(tcp->tcp_rcv.tr_pre_tailp != NULL); + assert(tcp->tcp_rcv.tr_len > 0); + } else { + assert(tcp->tcp_rcv.tr_pre_tailp == NULL); + assert(tcp->tcp_rcv.tr_len == 0); + } + + /* + * The receive buffer has shrunk, so there may now be space to + * receive more data. + */ + if (tcp->tcp_pcb != NULL) + tcpsock_ack_recv(tcp); + } else + flags &= ~MSG_WAITALL; /* for the check below */ + + /* Advance the current copy position, and see if we are done. */ + *offp += off; + if ((flags & MSG_WAITALL) && off < len && tcpsock_may_wait(tcp)) + return SUSPEND; + else + return OK; +} + +/* + * Update the set of flag-type socket options on a TCP socket. + */ +static void +tcpsock_setsockmask(struct sock * sock, unsigned int mask) +{ + struct tcpsock *tcp = (struct tcpsock *)sock; + + if (tcp->tcp_pcb == NULL) + return; + + if (mask & SO_REUSEADDR) + ip_set_option(tcp->tcp_pcb, SOF_REUSEADDR); + else + ip_reset_option(tcp->tcp_pcb, SOF_REUSEADDR); + + if (mask & SO_KEEPALIVE) + ip_set_option(tcp->tcp_pcb, SOF_KEEPALIVE); + else + ip_reset_option(tcp->tcp_pcb, SOF_KEEPALIVE); +} + +/* + * Prepare a helper structure for IP-level option processing. + */ +static void +tcpsock_get_ipopts(struct tcpsock * tcp, struct ipopts * ipopts) +{ + + ipopts->local_ip = &tcp->tcp_pcb->local_ip; + ipopts->remote_ip = &tcp->tcp_pcb->remote_ip; + ipopts->tos = &tcp->tcp_pcb->tos; + ipopts->ttl = &tcp->tcp_pcb->ttl; + ipopts->sndmin = TCP_SNDBUF_MIN; + ipopts->sndmax = TCP_SNDBUF_MAX; + ipopts->rcvmin = TCP_RCVBUF_MIN; + ipopts->rcvmax = TCP_RCVBUF_MAX; +} + +/* + * Set socket options on a TCP socket. + */ +static int +tcpsock_setsockopt(struct sock * sock, int level, int name, + const struct sockdriver_data * data, socklen_t len) +{ + struct tcpsock *tcp = (struct tcpsock *)sock; + struct ipopts ipopts; + uint32_t uval; + int r, val; + + if (tcp->tcp_pcb == NULL) + return ECONNRESET; + + /* Handle TCP-level options. */ + switch (level) { + case IPPROTO_IPV6: + switch (name) { + case IPV6_RECVTCLASS: + if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), + len)) != OK) + return r; + + /* + * This option is not supported for TCP sockets; it + * would not even make sense. However, named(8) + * insists on trying to set it anyway. We accept the + * request but ignore the value, not even returning + * what was set through getsockopt(2). + */ + return OK; + + case IPV6_FAITH: + if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), + len)) != OK) + return r; + + /* + * This option is not supported at all, but to save + * ourselves from having to remember the current state + * for getsockopt(2), we also refuse to enable it. 
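In user-space terms, the flag handling above means that MSG_PEEK leaves data on the receive queue (and disables MSG_WAITALL for that call), while MSG_WAITALL keeps the call suspended until the requested amount has arrived or no more data can be expected. A small sketch (not part of this diff):

    #include <sys/types.h>
    #include <sys/socket.h>
    #include <arpa/inet.h>
    #include <stdint.h>

    /* Peek at a 4-byte length prefix, then receive the whole record. */
    static ssize_t
    recv_record(int fd, char *buf, size_t size)
    {
        uint32_t nlen;
        size_t reclen;
        ssize_t r;

        r = recv(fd, &nlen, sizeof(nlen), MSG_PEEK);
        if (r != (ssize_t)sizeof(nlen))
            return -1;          /* a real caller would retry short peeks */

        reclen = sizeof(nlen) + ntohl(nlen);
        if (reclen > size)
            return -1;

        /* This consumes both the prefix and the payload from the queue. */
        return recv(fd, buf, reclen, MSG_WAITALL);
    }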
+ */ + if (val != 0) + return EINVAL; + + return OK; + } + + break; + + case IPPROTO_TCP: + switch (name) { + case TCP_NODELAY: + /* + * lwIP's listening TCP PCBs do not have this field. + * If this ever becomes an issue, we can create our own + * shadow flag and do the inheritance ourselves. + */ + if (tcp->tcp_pcb->state == LISTEN) + return EINVAL; + + if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), + len)) != OK) + return r; + + if (val) + tcp_nagle_disable(tcp->tcp_pcb); + else + tcp_nagle_enable(tcp->tcp_pcb); + + return OK; + + case TCP_KEEPIDLE: + case TCP_KEEPINTVL: + /* + * lwIP's listening TCP PCBs do not have these fields. + */ + if (tcp->tcp_pcb->state == LISTEN) + return EINVAL; + + if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), + len)) != OK) + return r; + + if (val == 0) + return EINVAL; + + /* + * The given value is unsigned, but lwIP stores the + * value in milliseconds in a uint32_t field, so we + * have to limit large values to whatever fits in the + * field anyway. + */ + if (val < 0 || (uint32_t)val > UINT32_MAX / 1000) + uval = UINT32_MAX; + else + uval = (uint32_t)val * 1000; + + if (name == TCP_KEEPIDLE) + tcp->tcp_pcb->keep_idle = uval; + else + tcp->tcp_pcb->keep_intvl = uval; + + return OK; + + case TCP_KEEPCNT: + /* lwIP's listening TCP PCBs do not have this field. */ + if (tcp->tcp_pcb->state == LISTEN) + return EINVAL; + + if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), + len)) != OK) + return r; + + if (val == 0) + return EINVAL; + + tcp->tcp_pcb->keep_cnt = (uint32_t)val; + + return OK; + } + + return EOPNOTSUPP; + } + + /* Handle all other options at the IP level. */ + tcpsock_get_ipopts(tcp, &ipopts); + + return ipsock_setsockopt(tcpsock_get_ipsock(tcp), level, name, data, + len, &ipopts); +} + +/* + * Retrieve socket options on a TCP socket. + */ +static int +tcpsock_getsockopt(struct sock * sock, int level, int name, + const struct sockdriver_data * data, socklen_t * len) +{ + struct tcpsock *tcp = (struct tcpsock *)sock; + struct ipopts ipopts; + int val; + + if (tcp->tcp_pcb == NULL) + return ECONNRESET; + + /* Handle TCP-level options. */ + switch (level) { + case IPPROTO_IPV6: + switch (name) { + case IPV6_RECVTCLASS: + case IPV6_FAITH: + val = 0; + + return sockdriver_copyout_opt(data, &val, sizeof(val), + len); + } + + break; + + case IPPROTO_TCP: + switch (name) { + case TCP_NODELAY: + /* lwIP's listening TCP PCBs do not have this field. */ + if (tcp->tcp_pcb->state == LISTEN) + return EINVAL; + + val = tcp_nagle_disabled(tcp->tcp_pcb); + + return sockdriver_copyout_opt(data, &val, sizeof(val), + len); + + case TCP_MAXSEG: + /* lwIP's listening TCP PCBs do not have this field. */ + if (tcp->tcp_pcb->state == LISTEN) + return EINVAL; + + /* This option is read-only at this time. */ + val = tcp->tcp_pcb->mss; + + return sockdriver_copyout_opt(data, &val, sizeof(val), + len); + + case TCP_KEEPIDLE: + /* lwIP's listening TCP PCBs do not have this field. */ + if (tcp->tcp_pcb->state == LISTEN) + return EINVAL; + + val = (int)(tcp->tcp_pcb->keep_idle / 1000); + + return sockdriver_copyout_opt(data, &val, sizeof(val), + len); + + case TCP_KEEPINTVL: + /* lwIP's listening TCP PCBs do not have this field. */ + if (tcp->tcp_pcb->state == LISTEN) + return EINVAL; + + val = (int)(tcp->tcp_pcb->keep_intvl / 1000); + + return sockdriver_copyout_opt(data, &val, sizeof(val), + len); + + case TCP_KEEPCNT: + /* lwIP's listening TCP PCBs do not have this field. 
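From the application side, the options handled above are the standard TCP-level knobs; note that the keepalive times are specified in seconds and converted to lwIP's millisecond fields internally. A small sketch (not part of this diff):

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/tcp.h>

    static void
    tune_tcp_socket(int fd)
    {
        int one = 1, idle = 75, intvl = 10, cnt = 5;

        setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
        setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
        setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
        setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));

        /* Keepalive probing also requires the SO_KEEPALIVE socket option. */
        setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &one, sizeof(one));
    }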
*/ + if (tcp->tcp_pcb->state == LISTEN) + return EINVAL; + + val = (int)tcp->tcp_pcb->keep_cnt; + + return sockdriver_copyout_opt(data, &val, sizeof(val), + len); + } + + return EOPNOTSUPP; + } + + /* Handle all other options at the IP level. */ + tcpsock_get_ipopts(tcp, &ipopts); + + return ipsock_getsockopt(tcpsock_get_ipsock(tcp), level, name, data, + len, &ipopts); +} + +/* + * Retrieve the local socket address of a TCP socket. + */ +static int +tcpsock_getsockname(struct sock * sock, struct sockaddr * addr, + socklen_t * addr_len) +{ + struct tcpsock *tcp = (struct tcpsock *)sock; + + if (tcp->tcp_pcb == NULL) + return EINVAL; + + ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len, + &tcp->tcp_pcb->local_ip, tcp->tcp_pcb->local_port); + + return OK; +} + +/* + * Retrieve the remote socket address of a TCP socket. + */ +static int +tcpsock_getpeername(struct sock * sock, struct sockaddr * addr, + socklen_t * addr_len) +{ + struct tcpsock *tcp = (struct tcpsock *)sock; + + if (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state == CLOSED || + tcp->tcp_pcb->state == LISTEN || tcp->tcp_pcb->state == SYN_SENT) + return ENOTCONN; + + ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len, + &tcp->tcp_pcb->remote_ip, tcp->tcp_pcb->remote_port); + + return OK; +} + +/* + * Perform a TCP half-close on a TCP socket. This operation may not complete + * immediately due to memory conditions, in which case it will be completed at + * a later time. + */ +static void +tcpsock_send_fin(struct tcpsock * tcp) +{ + + sockevent_set_shutdown(tcpsock_get_sock(tcp), SFL_SHUT_WR); + + /* + * Attempt to send the FIN. If a fatal error occurs as a result, raise + * it as an asynchronous error, because this function's callers cannot + * do much with it. That happens to match the way these functions are + * used elsewhere. In any case, as a result, the PCB may be closed. + * However, we are never called from a situation where the socket is + * being closed here, so the socket object will not be freed either. + */ + if (tcpsock_pcb_enqueue(tcp)) { + assert(!sockevent_is_closing(tcpsock_get_sock(tcp))); + + if (tcpsock_may_close(tcp)) + tcpsock_finish_close(tcp); + else + (void)tcpsock_pcb_send(tcp, TRUE /*raise_error*/); + } +} + +/* + * Shut down a TCP socket for reading and/or writing. + */ +static int +tcpsock_shutdown(struct sock * sock, unsigned int mask) +{ + struct tcpsock *tcp = (struct tcpsock *)sock; + + /* + * If the PCB is gone, we want to allow shutdowns for reading but not + * writing: shutting down for writing affects the PCB, shutting down + * for reading does not. Also, if the PCB is in CLOSED state, we would + * not know how to deal with subsequent operations after a shutdown for + * writing, so forbid such calls altogether. + */ + if ((tcp->tcp_pcb == NULL || tcp->tcp_pcb->state == CLOSED) && + (mask & SFL_SHUT_WR)) + return ENOTCONN; + + /* + * Handle listening sockets as a special case. Shutting down a + * listening socket frees its PCB. Sockets pending on the accept queue + * may still be accepted, but after that, accept(2) will start + * returning ECONNABORTED. This feature allows multi-process server + * applications to shut down gracefully, supposedly.. + */ + if (tcpsock_is_listening(tcp)) { + if (tcp->tcp_pcb != NULL) + tcpsock_pcb_close(tcp); + + return OK; + } + + /* + * We control shutdown-for-reading locally, and intentially do not tell + * lwIP about it: if we do that and also shut down for writing, the PCB + * may disappear (now or eventually), which is not what we want. 
+ * Instead, we only tell lwIP to shut down for reading once we actually + * want to get rid of the PCB, using tcp_close(). In the meantime, if + * the socket is shut down for reading by the user, we simply discard + * received data as fast as we can--one out of a number of possible + * design choices there, and (reportedly) the one used by the BSDs. + */ + if (mask & SFL_SHUT_RD) + (void)tcpsock_clear_recv(tcp, TRUE /*ack_data*/); + + /* + * Shutting down for writing a connecting socket simply closes its PCB. + * Closing a PCB in SYN_SENT state simply deallocates it, so this can + * not fail. On the other hand, for connected sockets we want to send + * a FIN, which may fail due to memory shortage, in which case we have + * to try again later.. + */ + if (mask & SFL_SHUT_WR) { + if (tcp->tcp_pcb->state == SYN_SENT) + tcpsock_pcb_close(tcp); + else if (!tcpsock_is_shutdown(tcp, SFL_SHUT_WR)) + tcpsock_send_fin(tcp); + } + + return OK; +} + +/* + * Close a TCP socket. Complete the operation immediately if possible, or + * otherwise initiate the closing process and complete it later, notifying + * libsockevent about that as well. Depending on linger settings, this + * function may be called twice on the same socket: the first time with the + * 'force' flag cleared, and the second time with the 'force' flag set. + */ +static int +tcpsock_close(struct sock * sock, int force) +{ + struct tcpsock *tcp = (struct tcpsock *)sock; + struct tcpsock *queued; + size_t rlen; + + assert(tcp->tcp_listener == NULL); + + /* + * If this was a listening socket, so abort and clean up any and all + * connections on its listener queue. Note that the listening socket + * may or may not have a PCB at this point. + */ + if (tcpsock_is_listening(tcp)) { + while (!TAILQ_EMPTY(&tcp->tcp_queue.tq_head)) { + queued = TAILQ_FIRST(&tcp->tcp_queue.tq_head); + + tcpsock_pcb_abort(queued); + + (void)tcpsock_cleanup(queued, TRUE /*may_free*/); + } + } + + /* + * Clear the receive queue, and make sure that we no longer add new + * data to it. The latter is relevant only for the case that we end up + * returning SUSPEND below. Remember whether there were bytes left, + * because we should reset the connection if there were. + */ + rlen = tcpsock_clear_recv(tcp, FALSE /*ack_data*/); + + sockevent_set_shutdown(tcpsock_get_sock(tcp), SFL_SHUT_RD); + + /* + * If the socket is connected, perform a graceful shutdown, unless 1) + * we are asked to force-close the socket, or 2) if the local side has + * not consumed all data, as per RFC 1122 Sec.4.2.2.13. Normally lwIP + * would take care of the second point, but we may have data in our + * receive buffer of which lwIP is not aware. + * + * Implementing proper linger support is somewhat difficult with lwIP. + * In particular, we cannot reliably wait for our FIN to be ACK'ed by + * the other side in all cases: + * + * - the lwIP TCP transition from states CLOSING to TIME_WAIT does not + * trigger any event and once in the TIME_WAIT state, the poll event + * no longer triggers either; + * - the lwIP TCP transition from states FIN_WAIT_1 and FIN_WAIT_2 to + * TIME_WAIT will trigger a receive event, but it is not clear + * whether we can reliably check that our FIN was ACK'ed from there. + * + * That means we have to compromise. Instead of the proper approach, + * we complete our side of the close operation whenever: + * + * 1. all of or data was acknowledged, AND, + * 2. our FIN was sent, AND, + * 3a. our FIN was acknowledged, OR, + * 3b. we received a FIN from the other side. 
+ * + * With the addition of the rule 3b, we do not run into the above + * reliability problems, but we may return from SO_LINGER-blocked close + * calls too early and thus give callers a false impression of success. + * TODO: if lwIP ever gets improved on this point, the code in this + * module should be rewritten to make use of the improvements. + * + * The set of rules is basically the same as for closing the PCB early + * as per tcpsock_may_close(), except with the check for our FIN being + * acknowledged. Unfortunately only the FIN_WAIT_2, TIME_WAIT, and + * (reentered) CLOSED TCP states guarantee that there are no + * unacknowledged data segments anymore, so we may have to wait for + * reaching any one of these before we can actually finish closing the + * socket with tcp_close(). + * + * In addition, lwIP does not tell us when our FIN gets acknowledged, + * so we have to use polling and direct access to lwIP's PCB fields + * instead, just like lwIP's BSD API does. There is no other way. + * Also, we may not even be able to send the FIN right away, in which + * case we must defer that until later. + */ + if (tcp->tcp_pcb != NULL) { + switch (tcp->tcp_pcb->state) { + case CLOSE_WAIT: + case CLOSING: + case LAST_ACK: + assert(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN); + + /* FALLTHROUGH */ + case SYN_RCVD: + case ESTABLISHED: + case FIN_WAIT_1: + /* First check if we should abort the connection. */ + if (force || rlen > 0) + break; + + /* + * If we have not sent a FIN yet, try sending it now; + * if all other conditions are met for closing the + * socket, successful FIN transmission will complete + * the close. Otherwise, perform the close check + * explicitly. + */ + if (!tcpsock_is_shutdown(tcp, SFL_SHUT_WR)) + tcpsock_send_fin(tcp); + else if (tcpsock_may_close(tcp)) + tcpsock_pcb_close(tcp); + + /* + * If at this point the PCB is gone, we managed to + * close the connection immediately, and the socket has + * already been cleaned up by now. This may occur if + * there is no unacknowledged data and we already + * received a FIN earlier on. + */ + if (tcp->tcp_pcb == NULL) + return OK; + + /* + * Complete the close operation at a later time. + * Adjust the polling interval, so that we can detect + * completion of the close as quickly as possible. + */ + tcp_poll(tcp->tcp_pcb, tcpsock_event_poll, + TCP_POLL_CLOSE_INTERVAL); + + return SUSPEND; + + default: + /* + * The connection is either not yet established, or + * already in a state where we can close it right now. + */ + tcpsock_pcb_close(tcp); + } + } + + /* + * Abort the connection is the PCB is still around, and clean up the + * socket. We cannot let tcpsock_cleanup() free the socket object yet, + * because we are still in the callback from libsockevent, and the + * latter cannot handle the socket object being freed from here. + */ + if (tcp->tcp_pcb != NULL) + tcpsock_pcb_abort(tcp); + + (void)tcpsock_cleanup(tcp, FALSE /*may_free*/); + + return OK; +} + +/* + * Free up a closed TCP socket. + */ +static void +tcpsock_free(struct sock * sock) +{ + struct tcpsock *tcp = (struct tcpsock *)sock; + + assert(tcp->tcp_pcb == NULL); + assert(tcp->tcp_snd.ts_len == 0); + assert(tcp->tcp_snd.ts_head == NULL); + assert(tcp->tcp_rcv.tr_len == 0); + assert(tcp->tcp_rcv.tr_head == NULL); + + TAILQ_INSERT_HEAD(&tcp_freelist, tcp, tcp_queue.tq_next); +} + +/* This table maps TCP states from lwIP numbers to NetBSD numbers. 
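From the application's perspective, these close rules reward the usual graceful-shutdown pattern: half-close, drain the peer's remaining data so that nothing is left unconsumed (which would otherwise trigger a reset, per RFC 1122), and only then close. A minimal sketch (not part of this diff):

    #include <sys/socket.h>
    #include <unistd.h>

    static void
    graceful_close(int fd)
    {
        char buf[512];

        shutdown(fd, SHUT_WR);                  /* send our FIN */

        while (read(fd, buf, sizeof(buf)) > 0)
            ;                                   /* drain until EOF from peer */

        close(fd);
    }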
*/ +static const struct { + int tsm_tstate; + int tsm_sostate; +} tcpsock_statemap[] = { + [CLOSED] = { TCPS_CLOSED, SS_ISDISCONNECTED }, + [LISTEN] = { TCPS_LISTEN, 0 }, + [SYN_SENT] = { TCPS_SYN_SENT, SS_ISCONNECTING }, + [SYN_RCVD] = { TCPS_SYN_RECEIVED, SS_ISCONNECTING }, + [ESTABLISHED] = { TCPS_ESTABLISHED, SS_ISCONNECTED }, + [FIN_WAIT_1] = { TCPS_FIN_WAIT_1, SS_ISDISCONNECTING }, + [FIN_WAIT_2] = { TCPS_FIN_WAIT_2, SS_ISDISCONNECTING }, + [CLOSE_WAIT] = { TCPS_CLOSE_WAIT, SS_ISCONNECTED }, + [CLOSING] = { TCPS_CLOSING, SS_ISDISCONNECTING }, + [LAST_ACK] = { TCPS_LAST_ACK, SS_ISDISCONNECTING }, + [TIME_WAIT] = { TCPS_TIME_WAIT, SS_ISDISCONNECTED }, +}; + +/* + * Fill the given kinfo_pcb sysctl(7) structure with information about the TCP + * PCB identified by the given pointer. + */ +static void +tcpsock_get_info(struct kinfo_pcb * ki, const void * ptr) +{ + const struct tcp_pcb *pcb = (const struct tcp_pcb *)ptr; + struct tcpsock *tcp; + + /* + * Not all TCP PCBs have an associated tcpsock structure. We are + * careful enough clearing the callback argument for PCBs on any of the + * TCP lists that we can use that callback argument to determine + * whether there is an associated tcpsock structure, although with one + * exception: PCBs for incoming connections that have not yet been + * fully established (i.e., in SYN_RCVD state). These will have the + * callback argument of the listening socket (which itself may already + * have been deallocated at this point) but should not be considered as + * associated with the listening socket's tcpsock structure. + */ + if (pcb->callback_arg != NULL && pcb->state != SYN_RCVD) { + tcp = (struct tcpsock *)pcb->callback_arg; + assert(tcp >= tcp_array && + tcp < &tcp_array[__arraycount(tcp_array)]); + + /* TODO: change this so that sockstat(1) may work one day. */ + ki->ki_sockaddr = (uint64_t)(uintptr_t)tcpsock_get_sock(tcp); + } else { + /* No tcpsock. Could also be in TIME_WAIT state etc. */ + tcp = NULL; + + ki->ki_sostate = SS_NOFDREF; + } + + ki->ki_type = SOCK_STREAM; + + if ((unsigned int)pcb->state < __arraycount(tcpsock_statemap)) { + ki->ki_tstate = tcpsock_statemap[pcb->state].tsm_tstate; + /* TODO: this needs work, but does anything rely on it? */ + ki->ki_sostate |= tcpsock_statemap[pcb->state].tsm_sostate; + } + + /* Careful with the LISTEN state here (see below). */ + ipsock_get_info(ki, &pcb->local_ip, pcb->local_port, + &pcb->remote_ip, (pcb->state != LISTEN) ? pcb->remote_port : 0); + + /* + * The PCBs for listening sockets are actually smaller. Thus, for + * listening sockets, do not attempt to access any of the fields beyond + * those provided in the smaller structure. + */ + if (pcb->state == LISTEN) { + assert(tcp != NULL); + ki->ki_refs = + (uint64_t)(uintptr_t)TAILQ_FIRST(&tcp->tcp_queue.tq_head); + } else { + if (tcp_nagle_disabled(pcb)) + ki->ki_tflags |= NETBSD_TF_NODELAY; + + if (tcp != NULL) { + ki->ki_rcvq = tcp->tcp_rcv.tr_len; + ki->ki_sndq = tcp->tcp_snd.ts_len; + + if (tcp->tcp_listener != NULL) + ki->ki_nextref = (uint64_t)(uintptr_t) + TAILQ_NEXT(tcp, tcp_queue.tq_next); + } + } +} + +/* + * Given either NULL or a previously returned TCP PCB pointer, return the first + * or next TCP PCB pointer, or NULL if there are no more. The current + * implementation supports only one concurrent iteration at once. 
+ */ +static const void * +tcpsock_enum(const void * last) +{ + static struct { + unsigned int i; + const struct tcp_pcb *pcb; + } iter; + + if (last != NULL && (iter.pcb = iter.pcb->next) != NULL) + return (const void *)iter.pcb; + + for (iter.i = (last != NULL) ? iter.i + 1 : 0; + iter.i < __arraycount(tcp_pcb_lists); iter.i++) { + if ((iter.pcb = *tcp_pcb_lists[iter.i]) != NULL) + return (const void *)iter.pcb; + } + + return NULL; +} + +/* + * Obtain the list of TCP protocol control blocks, for sysctl(7). + */ +static ssize_t +tcpsock_pcblist(struct rmib_call * call, struct rmib_node * node __unused, + struct rmib_oldp * oldp, struct rmib_newp * newp __unused) +{ + + return util_pcblist(call, oldp, tcpsock_enum, tcpsock_get_info); +} + +static const struct sockevent_ops tcpsock_ops = { + .sop_bind = tcpsock_bind, + .sop_listen = tcpsock_listen, + .sop_connect = tcpsock_connect, + .sop_accept = tcpsock_accept, + .sop_test_accept = tcpsock_test_accept, + .sop_pre_send = tcpsock_pre_send, + .sop_send = tcpsock_send, + .sop_test_send = tcpsock_test_send, + .sop_pre_recv = tcpsock_pre_recv, + .sop_recv = tcpsock_recv, + .sop_test_recv = tcpsock_test_recv, + .sop_ioctl = ifconf_ioctl, + .sop_setsockmask = tcpsock_setsockmask, + .sop_setsockopt = tcpsock_setsockopt, + .sop_getsockopt = tcpsock_getsockopt, + .sop_getsockname = tcpsock_getsockname, + .sop_getpeername = tcpsock_getpeername, + .sop_shutdown = tcpsock_shutdown, + .sop_close = tcpsock_close, + .sop_free = tcpsock_free +}; diff --git a/minix/net/lwip/udpsock.c b/minix/net/lwip/udpsock.c new file mode 100644 index 000000000..eea8194fc --- /dev/null +++ b/minix/net/lwip/udpsock.c @@ -0,0 +1,997 @@ +/* LWIP service - udpsock.c - UDP sockets */ + +#include "lwip.h" +#include "ifaddr.h" +#include "pktsock.h" + +#include "lwip/udp.h" + +#include +#include +#include + +/* The number of UDP sockets. Inherited from the lwIP configuration. */ +#define NR_UDPSOCK MEMP_NUM_UDP_PCB + +/* + * Outgoing packets are not getting buffered, so the send buffer size simply + * determines the maximum size for sent packets. The send buffer maximum is + * therefore limited to the maximum size of a single packet (64K-1 bytes), + * which is already enforced by lwIP's 16-bit length parameter to pbuf_alloc(). + * + * The actual transmission may enforce a lower limit, though. The full packet + * size must not exceed the same 64K-1 limit, and that includes any headers + * that still have to be prepended to the given packet. The size of those + * headers depends on the socket type (IPv4/IPv6) and the IP_HDRINCL setting. 
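The pcblist handler above is meant to be consumed through sysctl(3), NetBSD style. A hedged sketch of querying the TCP PCB list size; the node name "net.inet.tcp.pcblist" is an assumption here, mirroring the "pcblist" entry in the UDP table below rather than something this diff shows directly:

    #include <sys/param.h>
    #include <sys/sysctl.h>
    #include <stdio.h>

    int
    main(void)
    {
        size_t len;

        /* Node name assumed; adjust to the actual registered MIB path. */
        if (sysctlbyname("net.inet.tcp.pcblist", NULL, &len, NULL, 0) == -1) {
            perror("sysctlbyname");
            return 1;
        }

        printf("TCP pcblist: %zu bytes of kinfo_pcb records\n", len);
        return 0;
    }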
+ */ +#define UDP_MAX_PAYLOAD (UINT16_MAX) + +#define UDP_SNDBUF_MIN 1 /* minimum UDP send buffer size */ +#define UDP_SNDBUF_DEF 8192 /* default UDP send buffer size */ +#define UDP_SNDBUF_MAX UDP_MAX_PAYLOAD /* maximum UDP send buffer size */ +#define UDP_RCVBUF_MIN MEMPOOL_BUFSIZE /* minimum UDP receive buffer size */ +#define UDP_RCVBUF_DEF 32768 /* default UDP receive buffer size */ +#define UDP_RCVBUF_MAX 65536 /* maximum UDP receive buffer size */ + +static struct udpsock { + struct pktsock udp_pktsock; /* pkt socket, MUST be first */ + struct udp_pcb *udp_pcb; /* lwIP UDP control block */ + SIMPLEQ_ENTRY(udpsock) udp_next; /* next in free list */ +} udp_array[NR_UDPSOCK]; + +static SIMPLEQ_HEAD(, udpsock) udp_freelist; /* list of free UDP sockets */ + +static const struct sockevent_ops udpsock_ops; + +#define udpsock_get_sock(udp) (ipsock_get_sock(udpsock_get_ipsock(udp))) +#define udpsock_get_ipsock(udp) (pktsock_get_ipsock(&(udp)->udp_pktsock)) +#define udpsock_is_ipv6(udp) (ipsock_is_ipv6(udpsock_get_ipsock(udp))) +#define udpsock_is_conn(udp) \ + (udp_flags((udp)->udp_pcb) & UDP_FLAGS_CONNECTED) + +static ssize_t udpsock_pcblist(struct rmib_call *, struct rmib_node *, + struct rmib_oldp *, struct rmib_newp *); + +/* The CTL_NET {PF_INET,PF_INET6} IPPROTO_UDP subtree. */ +/* TODO: add many more and make some of them writable.. */ +static struct rmib_node net_inet_udp_table[] = { +/* 1*/ [UDPCTL_CHECKSUM] = RMIB_INT(RMIB_RO, 1, "checksum", + "Compute UDP checksums"), +/* 2*/ [UDPCTL_SENDSPACE] = RMIB_INT(RMIB_RO, UDP_SNDBUF_DEF, + "sendspace", + "Default UDP send buffer size"), +/* 3*/ [UDPCTL_RECVSPACE] = RMIB_INT(RMIB_RO, UDP_RCVBUF_DEF, + "recvspace", + "Default UDP receive buffer size"), +/* 4*/ [UDPCTL_LOOPBACKCKSUM] = RMIB_FUNC(RMIB_RW | CTLTYPE_INT, sizeof(int), + loopif_cksum, "do_loopback_cksum", + "Perform UDP checksum on loopback"), +/*+0*/ [UDPCTL_MAXID] = RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0, + udpsock_pcblist, "pcblist", + "UDP protocol control block list"), +}; + +static struct rmib_node net_inet_udp_node = + RMIB_NODE(RMIB_RO, net_inet_udp_table, "udp", "UDPv4 related settings"); +static struct rmib_node net_inet6_udp6_node = + RMIB_NODE(RMIB_RO, net_inet_udp_table, "udp6", "UDPv6 related settings"); + +/* + * Initialize the UDP sockets module. + */ +void +udpsock_init(void) +{ + unsigned int slot; + + /* Initialize the list of free UDP sockets. */ + SIMPLEQ_INIT(&udp_freelist); + + for (slot = 0; slot < __arraycount(udp_array); slot++) + SIMPLEQ_INSERT_TAIL(&udp_freelist, &udp_array[slot], udp_next); + + /* Register the net.inet.udp and net.inet6.udp6 RMIB subtrees. */ + mibtree_register_inet(PF_INET, IPPROTO_UDP, &net_inet_udp_node); + mibtree_register_inet(PF_INET6, IPPROTO_UDP, &net_inet6_udp6_node); +} + +/* + * A packet has arrived on a UDP socket. We own the given packet buffer, and + * so we must free it if we do not want to keep it. + */ +static void +udpsock_input(void * arg, struct udp_pcb * pcb __unused, struct pbuf * pbuf, + const ip_addr_t * ipaddr, uint16_t port) +{ + struct udpsock *udp = (struct udpsock *)arg; + + /* All UDP input processing is handled by pktsock. */ + pktsock_input(&udp->udp_pktsock, pbuf, ipaddr, port); +} + +/* + * Create a UDP socket. 
+ */ +sockid_t +udpsock_socket(int domain, int protocol, struct sock ** sockp, + const struct sockevent_ops ** ops) +{ + struct udpsock *udp; + unsigned int flags; + uint8_t ip_type; + + switch (protocol) { + case 0: + case IPPROTO_UDP: + break; + + /* NetBSD does not support IPPROTO_UDPLITE, even though lwIP does. */ + default: + return EPROTONOSUPPORT; + } + + if (SIMPLEQ_EMPTY(&udp_freelist)) + return ENOBUFS; + + udp = SIMPLEQ_FIRST(&udp_freelist); + + ip_type = pktsock_socket(&udp->udp_pktsock, domain, UDP_SNDBUF_DEF, + UDP_RCVBUF_DEF, sockp); + + /* We should have enough PCBs so this call should not fail.. */ + if ((udp->udp_pcb = udp_new_ip_type(ip_type)) == NULL) + return ENOBUFS; + udp_recv(udp->udp_pcb, udpsock_input, (void *)udp); + + /* By default, the multicast TTL is 1 and looping is enabled. */ + udp_set_multicast_ttl(udp->udp_pcb, 1); + + flags = udp_flags(udp->udp_pcb); + udp_setflags(udp->udp_pcb, flags | UDP_FLAGS_MULTICAST_LOOP); + + SIMPLEQ_REMOVE_HEAD(&udp_freelist, udp_next); + + *ops = &udpsock_ops; + return SOCKID_UDP | (sockid_t)(udp - udp_array); +} + +/* + * Bind a UDP socket to a local address. + */ +static int +udpsock_bind(struct sock * sock, const struct sockaddr * addr, + socklen_t addr_len, endpoint_t user_endpt) +{ + struct udpsock *udp = (struct udpsock *)sock; + ip_addr_t ipaddr; + uint16_t port; + err_t err; + int r; + + if ((r = ipsock_get_src_addr(udpsock_get_ipsock(udp), addr, addr_len, + user_endpt, &udp->udp_pcb->local_ip, udp->udp_pcb->local_port, + TRUE /*allow_mcast*/, &ipaddr, &port)) != OK) + return r; + + err = udp_bind(udp->udp_pcb, &ipaddr, port); + + return util_convert_err(err); +} + +/* + * Connect a UDP socket to a remote address. + */ +static int +udpsock_connect(struct sock * sock, const struct sockaddr * addr, + socklen_t addr_len, endpoint_t user_endpt __unused) +{ + struct udpsock *udp = (struct udpsock *)sock; + struct ifdev *ifdev; + const ip_addr_t *src_addr; + ip_addr_t dst_addr; + uint16_t dst_port; + uint32_t ifindex, ifindex2; + err_t err; + int r; + + /* + * One may "unconnect" socket by providing an address with family + * AF_UNSPEC. Providing an :0 address does not achieve the same. + */ + if (addr_is_unspec(addr, addr_len)) { + udp_disconnect(udp->udp_pcb); + + return OK; + } + + if ((r = ipsock_get_dst_addr(udpsock_get_ipsock(udp), addr, + addr_len, &udp->udp_pcb->local_ip, &dst_addr, &dst_port)) != OK) + return r; + + /* + * Bind explicitly to a source address if the PCB is not bound to one + * yet. This is expected in the BSD socket API, but lwIP does not do + * it for us. + */ + if (ip_addr_isany(&udp->udp_pcb->local_ip)) { + /* Help the multicast case a bit, if possible. */ + ifdev = NULL; + + if (ip_addr_ismulticast(&dst_addr)) { + ifindex = pktsock_get_ifindex(&udp->udp_pktsock); + ifindex2 = udp_get_multicast_netif_index(udp->udp_pcb); + if (ifindex == 0) + ifindex = ifindex2; + + if (ifindex != 0) { + ifdev = ifdev_get_by_index(ifindex); + + if (ifdev == NULL) + return ENXIO; + } + } + + src_addr = ifaddr_select(&dst_addr, ifdev, NULL /*ifdevp*/); + + if (src_addr == NULL) + return EHOSTUNREACH; + + err = udp_bind(udp->udp_pcb, src_addr, + udp->udp_pcb->local_port); + + if (err != ERR_OK) + return util_convert_err(err); + } + + /* + * Connecting a UDP socket serves two main purposes: 1) the socket uses + * the address as destination when sending, and 2) the socket receives + * packets from only the connected address. 
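The AF_UNSPEC behavior noted above matches standard BSD practice: a connected UDP socket can be returned to the unconnected state by a connect(2) call whose address has family AF_UNSPEC. A minimal sketch (not part of this diff):

    #include <sys/socket.h>
    #include <string.h>

    static int
    udp_unconnect(int fd)
    {
        struct sockaddr sa;

        memset(&sa, 0, sizeof(sa));
        sa.sa_family = AF_UNSPEC;

        return connect(fd, &sa, sizeof(sa));
    }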
+ */ + err = udp_connect(udp->udp_pcb, &dst_addr, dst_port); + + if (err != ERR_OK) + return util_convert_err(err); + + return OK; +} + +/* + * Perform preliminary checks on a send request. + */ +static int +udpsock_pre_send(struct sock * sock, size_t len, socklen_t ctl_len __unused, + const struct sockaddr * addr, socklen_t addr_len __unused, + endpoint_t user_endpt __unused, int flags) +{ + struct udpsock *udp = (struct udpsock *)sock; + + if ((flags & ~MSG_DONTROUTE) != 0) + return EOPNOTSUPP; + + if (!udpsock_is_conn(udp) && addr == NULL) + return EDESTADDRREQ; + + /* + * This is only one part of the length check. The rest is done from + * udpsock_send(), once we have more information. + */ + if (len > ipsock_get_sndbuf(udpsock_get_ipsock(udp))) + return EMSGSIZE; + + return OK; +} + +/* + * Swap IP-level options between the UDP PCB and the packet options structure, + * for all options that have their flag set in the packet options structure. + * This function is called twice when sending a packet. The result is that the + * flagged options are overridden for only the packet being sent. + */ +static void +udpsock_swap_opt(struct udpsock * udp, struct pktopt * pkto) +{ + uint8_t tos, ttl, mcast_ttl; + + if (pkto->pkto_flags & PKTOF_TOS) { + tos = udp->udp_pcb->tos; + udp->udp_pcb->tos = pkto->pkto_tos; + pkto->pkto_tos = tos; + } + + if (pkto->pkto_flags & PKTOF_TTL) { + ttl = udp->udp_pcb->ttl; + mcast_ttl = udp_get_multicast_ttl(udp->udp_pcb); + udp->udp_pcb->ttl = pkto->pkto_ttl; + udp_set_multicast_ttl(udp->udp_pcb, pkto->pkto_mcast_ttl); + pkto->pkto_ttl = ttl; + pkto->pkto_mcast_ttl = mcast_ttl; + } +} + +/* + * Send a packet on a UDP socket. + */ +static int +udpsock_send(struct sock * sock, const struct sockdriver_data * data, + size_t len, size_t * off, const struct sockdriver_data * ctl, + socklen_t ctl_len, socklen_t * ctl_off __unused, + const struct sockaddr * addr, socklen_t addr_len, + endpoint_t user_endpt __unused, int flags, size_t min __unused) +{ + struct udpsock *udp = (struct udpsock *)sock; + struct pktopt pktopt; + struct pbuf *pbuf; + struct ifdev *ifdev; + struct netif *netif; + const ip_addr_t *src_addrp, *dst_addrp; + ip_addr_t src_addr, dst_addr; /* for storage only; not always used! */ + uint16_t dst_port; + uint32_t ifindex; + size_t hdrlen; + err_t err; + int r; + + /* Copy in and parse any packet options. */ + pktopt.pkto_flags = 0; + + if ((r = pktsock_get_ctl(&udp->udp_pktsock, ctl, ctl_len, + &pktopt)) != OK) + return r; + + /* + * The code below will both determine an outgoing interface and a + * source address for the packet. Even though lwIP could do this for + * us in some cases, there are other cases where we must do so + * ourselves, with as main reasons 1) the possibility that either or + * both have been provided through IPV6_PKTINFO, and 2) our intent to + * detect and stop zone violations for (combinations of) scoped IPv6 + * addresses. As a result, it is easier to simply take over the + * selection tasks lwIP in their entirety. + * + * Much of the same applies to rawsock_send() as well. Functional + * differences (e.g. IP_HDRINCL support) as well as the PCB accesses in + * the code make it hard to merge the two into a single pktsock copy. + * Please do keep the two in sync as much as possible. + */ + + /* + * Start by checking whether the source address and/or the outgoing + * interface are overridden using sticky and/or ancillary options. 
The + * call to pktsock_get_pktinfo(), if successful, will either set + * 'ifdev' to NULL, in which case there is no override, or it will set + * 'ifdev' to the outgoing interface to use, and (only) in that case + * also fill 'src_addr', with an address that may either be a locally + * owned unicast address or the unspecified ('any') address. If it is + * a unicast address, that is the source address to use for the packet. + * Otherwise, fall back to the address to which the socket is bound, + * which may also be the unspecified address or even a multicast + * address. In those case we will pick a source address further below. + */ + if ((r = pktsock_get_pktinfo(&udp->udp_pktsock, &pktopt, &ifdev, + &src_addr)) != OK) + return r; + + if (ifdev != NULL && !ip_addr_isany(&src_addr)) { + /* This is guaranteed to be a proper local unicast address. */ + src_addrp = &src_addr; + } else { + src_addrp = &udp->udp_pcb->local_ip; + + /* + * If the socket is bound to a multicast address, use the + * unspecified ('any') address as source address instead, until + * we select a real source address (further below). This + * substitution keeps the rest of the code a bit simpler. + */ + if (ip_addr_ismulticast(src_addrp)) + src_addrp = IP46_ADDR_ANY(IP_GET_TYPE(src_addrp)); + } + + /* + * Determine the destination address to use. If the socket is + * connected, always ignore any address provided in the send call. + */ + if (!udpsock_is_conn(udp)) { + assert(addr != NULL); /* already checked in pre_send */ + + if ((r = ipsock_get_dst_addr(udpsock_get_ipsock(udp), addr, + addr_len, src_addrp, &dst_addr, &dst_port)) != OK) + return r; + + dst_addrp = &dst_addr; + } else { + dst_addrp = &udp->udp_pcb->remote_ip; + dst_port = udp->udp_pcb->remote_port; + } + + /* + * If the destination is a multicast address, select the outgoing + * interface based on the multicast interface index, if one is set. + * This must be done here in order to allow the code further below to + * detect zone violations, because if we leave this selection to lwIP, + * it will not perform zone violation detection at all. Also note that + * this case must *not* override an interface index already specified + * using IPV6_PKTINFO, as per RFC 3542 Sec. 6.7. + */ + if (ifdev == NULL && ip_addr_ismulticast(dst_addrp)) { + ifindex = udp_get_multicast_netif_index(udp->udp_pcb); + + if (ifindex != NETIF_NO_INDEX) + ifdev = ifdev_get_by_index(ifindex); /* (may fail) */ + } + + /* + * If an interface has been determined already now, the send operation + * will bypass routing. In that case, we must perform our own checks + * on address zone violations, because those will not be made anywhere + * else. Subsequent steps below will never introduce violations. + */ + if (ifdev != NULL && IP_IS_V6(dst_addrp)) { + if (ifaddr_is_zone_mismatch(ip_2_ip6(dst_addrp), ifdev)) + return EHOSTUNREACH; + + if (IP_IS_V6(src_addrp) && + ifaddr_is_zone_mismatch(ip_2_ip6(src_addrp), ifdev)) + return EHOSTUNREACH; + } + + /* + * If we do not yet have an interface at this point, perform a route + * lookup to determine the outgoing interface. Unless MSG_DONTROUTE is + * set (which covers SO_DONTROUTE as well), in which case we look for a + * local subnet that matches the destination address. + */ + if (ifdev == NULL) { + if (!(flags & MSG_DONTROUTE)) { + /* + * ip_route() should never be called with an + * IPADDR_TYPE_ANY type address. This is a lwIP- + * internal requirement; while we override both routing + * functions, we do not deviate from it. 
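For reference, this is roughly how a userland application triggers the per-packet override that pktsock_get_pktinfo() parses, using the standard RFC 3542 IPV6_PKTINFO ancillary object. The sketch is illustrative only; send_with_pktinfo() is a hypothetical helper with minimal error handling:

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <string.h>

/* Send one datagram, pinning the outgoing interface and source address. */
static ssize_t
send_with_pktinfo(int fd, void *buf, size_t len, struct sockaddr_in6 *dst,
	unsigned int ifindex, const struct in6_addr *src)
{
	char control[CMSG_SPACE(sizeof(struct in6_pktinfo))];
	struct in6_pktinfo ipi;
	struct cmsghdr *cmsg;
	struct msghdr msg;
	struct iovec iov;

	memset(&ipi, 0, sizeof(ipi));
	ipi.ipi6_ifindex = ifindex;	/* 0 lets the stack pick the interface */
	ipi.ipi6_addr = *src;		/* in6addr_any lets the stack pick */

	iov.iov_base = buf;
	iov.iov_len = len;

	memset(&msg, 0, sizeof(msg));
	msg.msg_name = dst;
	msg.msg_namelen = sizeof(*dst);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = control;
	msg.msg_controllen = sizeof(control);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = IPPROTO_IPV6;
	cmsg->cmsg_type = IPV6_PKTINFO;
	cmsg->cmsg_len = CMSG_LEN(sizeof(ipi));
	memcpy(CMSG_DATA(cmsg), &ipi, sizeof(ipi));

	return sendmsg(fd, &msg, 0);
}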
+ */ + if (IP_IS_ANY_TYPE_VAL(*src_addrp)) + src_addrp = + IP46_ADDR_ANY(IP_GET_TYPE(dst_addrp)); + + /* Perform the route lookup. */ + if ((netif = ip_route(src_addrp, dst_addrp)) == NULL) + return EHOSTUNREACH; + + ifdev = netif_get_ifdev(netif); + } else { + if ((ifdev = ifaddr_map_by_subnet(dst_addrp)) == NULL) + return EHOSTUNREACH; + } + } + + /* + * At this point we have an outgoing interface. If we do not have a + * source address yet, pick one now. + */ + assert(ifdev != NULL); + + if (ip_addr_isany(src_addrp)) { + src_addrp = ifaddr_select(dst_addrp, ifdev, NULL /*ifdevp*/); + + if (src_addrp == NULL) + return EHOSTUNREACH; + } + + /* + * Now that we know the full conditions of what we are about to send, + * check whether the packet size leaves enough room for lwIP to prepend + * headers. If so, allocate a chain of pbufs for the packet. + */ + assert(len <= UDP_MAX_PAYLOAD); + + if (IP_IS_V6(dst_addrp)) + hdrlen = IP6_HLEN + UDP_HLEN; + else + hdrlen = IP_HLEN + UDP_HLEN; + + if (hdrlen + len > UDP_MAX_PAYLOAD) + return EMSGSIZE; + + if ((pbuf = pchain_alloc(PBUF_TRANSPORT, len)) == NULL) + return ENOBUFS; + + /* Copy in the packet data. */ + if ((r = pktsock_get_data(&udp->udp_pktsock, data, len, pbuf)) != OK) { + pbuf_free(pbuf); + + return r; + } + + /* + * Set broadcast/multicast flags for accounting purposes. Only the + * multicast flag is used for output accounting, but for loopback + * traffic, both flags are copied and used for input accounting and + * setting MSG_MCAST/MSG_BCAST. + */ + if (ip_addr_ismulticast(dst_addrp)) + pbuf->flags |= PBUF_FLAG_LLMCAST; + else if (ip_addr_isbroadcast(dst_addrp, ifdev_get_netif(ifdev))) + pbuf->flags |= PBUF_FLAG_LLBCAST; + + /* Send the packet. */ + udpsock_swap_opt(udp, &pktopt); + + assert(!ip_addr_isany(src_addrp)); + assert(!ip_addr_ismulticast(src_addrp)); + + err = udp_sendto_if_src(udp->udp_pcb, pbuf, dst_addrp, dst_port, + ifdev_get_netif(ifdev), src_addrp); + + udpsock_swap_opt(udp, &pktopt); + + /* Free the pbuf, as a copy has been made. */ + pbuf_free(pbuf); + + /* + * On success, make sure to return the size of the sent packet as well. + * As an aside: ctl_off need not be updated, as it is not returned. + */ + if ((r = util_convert_err(err)) == OK) + *off = len; + return r; +} + +/* + * Update the set of flag-type socket options on a UDP socket. + */ +static void +udpsock_setsockmask(struct sock * sock, unsigned int mask) +{ + struct udpsock *udp = (struct udpsock *)sock; + + if (mask & SO_REUSEADDR) + ip_set_option(udp->udp_pcb, SOF_REUSEADDR); + else + ip_reset_option(udp->udp_pcb, SOF_REUSEADDR); + + if (mask & SO_BROADCAST) + ip_set_option(udp->udp_pcb, SOF_BROADCAST); + else + ip_reset_option(udp->udp_pcb, SOF_BROADCAST); +} + +/* + * Prepare a helper structure for IP-level option processing. + */ +static void +udpsock_get_ipopts(struct udpsock * udp, struct ipopts * ipopts) +{ + + ipopts->local_ip = &udp->udp_pcb->local_ip; + ipopts->remote_ip = &udp->udp_pcb->remote_ip; + ipopts->tos = &udp->udp_pcb->tos; + ipopts->ttl = &udp->udp_pcb->ttl; + ipopts->sndmin = UDP_SNDBUF_MIN; + ipopts->sndmax = UDP_SNDBUF_MAX; + ipopts->rcvmin = UDP_RCVBUF_MIN; + ipopts->rcvmax = UDP_RCVBUF_MAX; +} + +/* + * Set socket options on a UDP socket. 
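udpsock_setsockmask() above translates the flag-type SO_REUSEADDR and SO_BROADCAST options directly into lwIP's SOF_ PCB options. From userland this is the ordinary setsockopt(2) call; a minimal sketch (enable_broadcast() is a hypothetical helper), shown here because SO_BROADCAST must be set before a socket may send to the broadcast destinations flagged with PBUF_FLAG_LLBCAST above:

#include <sys/socket.h>

/* Allow the UDP socket to send to broadcast addresses. */
static int
enable_broadcast(int fd)
{
	int on = 1;

	return setsockopt(fd, SOL_SOCKET, SO_BROADCAST, &on, sizeof(on));
}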
+ */ +static int +udpsock_setsockopt(struct sock * sock, int level, int name, + const struct sockdriver_data * data, socklen_t len) +{ + struct udpsock *udp = (struct udpsock *)sock; + struct ipopts ipopts; + ip_addr_t ipaddr; + struct in_addr in_addr; + struct ifdev *ifdev; + unsigned int flags; + uint32_t ifindex; + uint8_t byte; + int r, val; + + /* + * Unfortunately, we have to duplicate most of the multicast options + * rather than sharing them with rawsock at the pktsock level. The + * reason is that each of the PCBs have their own multicast abstraction + * functions and so we cannot merge the rest. Same for getsockopt. + */ + + switch (level) { + case IPPROTO_IP: + if (udpsock_is_ipv6(udp)) + break; + + switch (name) { + case IP_MULTICAST_IF: + pktsock_set_mcaware(&udp->udp_pktsock); + + if ((r = sockdriver_copyin_opt(data, &in_addr, + sizeof(in_addr), len)) != OK) + return r; + + ip_addr_set_ip4_u32(&ipaddr, in_addr.s_addr); + + if ((ifdev = ifaddr_map_by_addr(&ipaddr)) == NULL) + return EADDRNOTAVAIL; + + udp_set_multicast_netif_index(udp->udp_pcb, + ifdev_get_index(ifdev)); + + return OK; + + case IP_MULTICAST_LOOP: + pktsock_set_mcaware(&udp->udp_pktsock); + + if ((r = sockdriver_copyin_opt(data, &byte, + sizeof(byte), len)) != OK) + return r; + + flags = udp_flags(udp->udp_pcb); + + if (byte) + flags |= UDP_FLAGS_MULTICAST_LOOP; + else + flags &= ~UDP_FLAGS_MULTICAST_LOOP; + + udp_setflags(udp->udp_pcb, flags); + + return OK; + + case IP_MULTICAST_TTL: + pktsock_set_mcaware(&udp->udp_pktsock); + + if ((r = sockdriver_copyin_opt(data, &byte, + sizeof(byte), len)) != OK) + return r; + + udp_set_multicast_ttl(udp->udp_pcb, byte); + + return OK; + } + + break; + + case IPPROTO_IPV6: + if (!udpsock_is_ipv6(udp)) + break; + + switch (name) { + case IPV6_MULTICAST_IF: + pktsock_set_mcaware(&udp->udp_pktsock); + + if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), + len)) != OK) + return r; + + if (val != 0) { + ifindex = (uint32_t)val; + + ifdev = ifdev_get_by_index(ifindex); + + if (ifdev == NULL) + return ENXIO; + } else + ifindex = NETIF_NO_INDEX; + + udp_set_multicast_netif_index(udp->udp_pcb, ifindex); + + return OK; + + case IPV6_MULTICAST_LOOP: + pktsock_set_mcaware(&udp->udp_pktsock); + + if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), + len)) != OK) + return r; + + if (val < 0 || val > 1) + return EINVAL; + + flags = udp_flags(udp->udp_pcb); + + if (val) + flags |= UDP_FLAGS_MULTICAST_LOOP; + else + flags &= ~UDP_FLAGS_MULTICAST_LOOP; + + /* + * lwIP's IPv6 functionality does not actually check + * this flag at all yet. We set it in the hope that + * one day this will magically start working. + */ + udp_setflags(udp->udp_pcb, flags); + + return OK; + + case IPV6_MULTICAST_HOPS: + pktsock_set_mcaware(&udp->udp_pktsock); + + if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), + len)) != OK) + return r; + + if (val < -1 || val > UINT8_MAX) + return EINVAL; + + if (val == -1) + val = 1; + + udp_set_multicast_ttl(udp->udp_pcb, val); + + return OK; + } + + break; + } + + /* Handle all other options at the packet or IP level. */ + udpsock_get_ipopts(udp, &ipopts); + + return pktsock_setsockopt(&udp->udp_pktsock, level, name, data, len, + &ipopts); +} + +/* + * Retrieve socket options on a UDP socket. 
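The IPv4 multicast options handled above follow the classic BSD interface: IP_MULTICAST_IF takes a struct in_addr identifying the outgoing interface, while IP_MULTICAST_TTL and IP_MULTICAST_LOOP each take a single byte. A hedged userland sketch of the matching consumer side (setup_mcast_send() is hypothetical):

#include <sys/socket.h>
#include <netinet/in.h>

/* Configure a socket for multicast transmission via the given interface. */
static int
setup_mcast_send(int fd, struct in_addr ifaddr)
{
	unsigned char ttl = 16;		/* the default would be 1 */
	unsigned char loop = 0;		/* do not loop back our own packets */

	if (setsockopt(fd, IPPROTO_IP, IP_MULTICAST_IF, &ifaddr,
	    sizeof(ifaddr)) != 0)
		return -1;
	if (setsockopt(fd, IPPROTO_IP, IP_MULTICAST_TTL, &ttl,
	    sizeof(ttl)) != 0)
		return -1;
	return setsockopt(fd, IPPROTO_IP, IP_MULTICAST_LOOP, &loop,
	    sizeof(loop));
}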
+ */ +static int +udpsock_getsockopt(struct sock * sock, int level, int name, + const struct sockdriver_data * data, socklen_t * len) +{ + struct udpsock *udp = (struct udpsock *)sock; + struct ipopts ipopts; + const ip4_addr_t *ip4addr; + struct in_addr in_addr; + struct ifdev *ifdev; + unsigned int flags; + uint32_t ifindex; + uint8_t byte; + int val; + + switch (level) { + case IPPROTO_IP: + if (udpsock_is_ipv6(udp)) + break; + + switch (name) { + case IP_MULTICAST_IF: + ifindex = udp_get_multicast_netif_index(udp->udp_pcb); + + /* + * Map back from the interface index to the IPv4 + * address assigned to the corresponding interface. + * Should this not work out, return the 'any' address. + */ + if (ifindex != NETIF_NO_INDEX && + (ifdev = ifdev_get_by_index(ifindex)) != NULL) { + ip4addr = + netif_ip4_addr(ifdev_get_netif(ifdev)); + + in_addr.s_addr = ip4_addr_get_u32(ip4addr); + } else + in_addr.s_addr = PP_HTONL(INADDR_ANY); + + return sockdriver_copyout_opt(data, &in_addr, + sizeof(in_addr), len); + + case IP_MULTICAST_LOOP: + flags = udp_flags(udp->udp_pcb); + + byte = !!(flags & UDP_FLAGS_MULTICAST_LOOP); + + return sockdriver_copyout_opt(data, &byte, + sizeof(byte), len); + + case IP_MULTICAST_TTL: + byte = udp_get_multicast_ttl(udp->udp_pcb); + + return sockdriver_copyout_opt(data, &byte, + sizeof(byte), len); + } + + break; + + case IPPROTO_IPV6: + if (!udpsock_is_ipv6(udp)) + break; + + switch (name) { + case IPV6_MULTICAST_IF: + ifindex = udp_get_multicast_netif_index(udp->udp_pcb); + + val = (int)ifindex; + + return sockdriver_copyout_opt(data, &val, sizeof(val), + len); + + case IPV6_MULTICAST_LOOP: + flags = udp_flags(udp->udp_pcb); + + val = !!(flags & UDP_FLAGS_MULTICAST_LOOP); + + return sockdriver_copyout_opt(data, &val, sizeof(val), + len); + + case IPV6_MULTICAST_HOPS: + val = udp_get_multicast_ttl(udp->udp_pcb); + + return sockdriver_copyout_opt(data, &val, sizeof(val), + len); + } + + break; + } + + /* Handle all other options at the packet or IP level. */ + udpsock_get_ipopts(udp, &ipopts); + + return pktsock_getsockopt(&udp->udp_pktsock, level, name, data, len, + &ipopts); +} + +/* + * Retrieve the local socket address of a UDP socket. + */ +static int +udpsock_getsockname(struct sock * sock, struct sockaddr * addr, + socklen_t * addr_len) +{ + struct udpsock *udp = (struct udpsock *)sock; + + ipsock_put_addr(udpsock_get_ipsock(udp), addr, addr_len, + &udp->udp_pcb->local_ip, udp->udp_pcb->local_port); + + return OK; +} + +/* + * Retrieve the remote socket address of a UDP socket. + */ +static int +udpsock_getpeername(struct sock * sock, struct sockaddr * addr, + socklen_t * addr_len) +{ + struct udpsock *udp = (struct udpsock *)sock; + + if (!udpsock_is_conn(udp)) + return ENOTCONN; + + ipsock_put_addr(udpsock_get_ipsock(udp), addr, addr_len, + &udp->udp_pcb->remote_ip, udp->udp_pcb->remote_port); + + return OK; +} + +/* + * Shut down a UDP socket for reading and/or writing. + */ +static int +udpsock_shutdown(struct sock * sock, unsigned int mask) +{ + struct udpsock *udp = (struct udpsock *)sock; + + if (mask & SFL_SHUT_RD) + udp_recv(udp->udp_pcb, NULL, NULL); + + pktsock_shutdown(&udp->udp_pktsock, mask); + + return OK; +} + +/* + * Close a UDP socket. + */ +static int +udpsock_close(struct sock * sock, int force __unused) +{ + struct udpsock *udp = (struct udpsock *)sock; + + udp_recv(udp->udp_pcb, NULL, NULL); + + udp_remove(udp->udp_pcb); + udp->udp_pcb = NULL; + + pktsock_close(&udp->udp_pktsock); + + return OK; +} + +/* + * Free up a closed UDP socket. 
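udpsock_getsockname() simply reports whatever local address and port the PCB currently holds, so the usual userland pattern applies, for example to learn which ephemeral port was assigned after binding to port 0. A small illustrative sketch (get_local_port() is hypothetical, IPv4 only):

#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdint.h>
#include <string.h>

/* Store the local UDP port in host byte order; return 0 on success. */
static int
get_local_port(int fd, uint16_t *portp)
{
	struct sockaddr_in sin;
	socklen_t len = sizeof(sin);

	memset(&sin, 0, sizeof(sin));

	if (getsockname(fd, (struct sockaddr *)&sin, &len) != 0)
		return -1;

	*portp = ntohs(sin.sin_port);
	return 0;
}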
+ */ +static void +udpsock_free(struct sock * sock) +{ + struct udpsock *udp = (struct udpsock *)sock; + + assert(udp->udp_pcb == NULL); + + SIMPLEQ_INSERT_HEAD(&udp_freelist, udp, udp_next); +} + +/* + * Fill the given kinfo_pcb sysctl(7) structure with information about the UDP + * PCB identified by the given pointer. + */ +static void +udpsock_get_info(struct kinfo_pcb * ki, const void * ptr) +{ + const struct udp_pcb *pcb = (const struct udp_pcb *)ptr; + struct udpsock *udp; + + ki->ki_type = SOCK_DGRAM; + + /* + * All UDP sockets should be created by this module, but protect + * ourselves from the case that that is not true anyway. + */ + if (pcb->recv_arg != NULL) { + udp = (struct udpsock *)pcb->recv_arg; + + assert(udp >= udp_array && + udp < &udp_array[__arraycount(udp_array)]); + } else + udp = NULL; + + ipsock_get_info(ki, &pcb->local_ip, pcb->local_port, &pcb->remote_ip, + pcb->remote_port); + + if (udp != NULL) { + /* TODO: change this so that sockstat(1) may work one day. */ + ki->ki_sockaddr = (uint64_t)(uintptr_t)udpsock_get_sock(udp); + + ki->ki_rcvq = pktsock_get_recvlen(&udp->udp_pktsock); + } +} + +/* + * Given either NULL or a previously returned UDP PCB pointer, return the first + * or next UDP PCB pointer, or NULL if there are no more. Skip UDP PCBs that + * are not bound to an address, as there is no use reporting them. + */ +static const void * +udpsock_enum(const void * last) +{ + const struct udp_pcb *pcb; + + if (last != NULL) + pcb = (const void *)((const struct udp_pcb *)last)->next; + else + pcb = (const void *)udp_pcbs; + + while (pcb != NULL && pcb->local_port == 0) + pcb = pcb->next; + + return pcb; +} + +/* + * Obtain the list of UDP protocol control blocks, for sysctl(7). + */ +static ssize_t +udpsock_pcblist(struct rmib_call * call, struct rmib_node * node __unused, + struct rmib_oldp * oldp, struct rmib_newp * newp __unused) +{ + + return util_pcblist(call, oldp, udpsock_enum, udpsock_get_info); +} + +static const struct sockevent_ops udpsock_ops = { + .sop_bind = udpsock_bind, + .sop_connect = udpsock_connect, + .sop_pre_send = udpsock_pre_send, + .sop_send = udpsock_send, + .sop_pre_recv = pktsock_pre_recv, + .sop_recv = pktsock_recv, + .sop_test_recv = pktsock_test_recv, + .sop_ioctl = ifconf_ioctl, + .sop_setsockmask = udpsock_setsockmask, + .sop_setsockopt = udpsock_setsockopt, + .sop_getsockopt = udpsock_getsockopt, + .sop_getsockname = udpsock_getsockname, + .sop_getpeername = udpsock_getpeername, + .sop_shutdown = udpsock_shutdown, + .sop_close = udpsock_close, + .sop_free = udpsock_free +}; diff --git a/minix/net/lwip/util.c b/minix/net/lwip/util.c new file mode 100644 index 000000000..c83a94e2e --- /dev/null +++ b/minix/net/lwip/util.c @@ -0,0 +1,251 @@ +/* LWIP service - util.c - shared utility functions */ + +#include "lwip.h" + +#define US 1000000 /* number of microseconds per second */ + +/* + * Convert the given timeval structure to a number of clock ticks, checking + * whether the given structure is valid and whether the resulting number of + * ticks can be expressed as a (relative) clock ticks value. Upon success, + * return OK, with the number of clock ticks stored in 'ticksp'. Upon failure, + * return a negative error code that may be returned to userland directly. In + * that case, the contents of 'ticksp' are left unchanged. + * + * TODO: move this function into libsys and remove other redundant copies. 
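The udp_freelist handling above (take an entry in udpsock_socket(), return it in udpsock_free()) is one instance of the static pool pattern used throughout the service. A condensed, self-contained sketch of that pattern with the same <sys/queue.h> macros; all names here are illustrative, not taken from the patch:

#include <sys/queue.h>
#include <stddef.h>

#define NR_OBJ	4

struct obj {
	int o_data;
	SIMPLEQ_ENTRY(obj) o_next;	/* freelist linkage */
};

static struct obj obj_array[NR_OBJ];
static SIMPLEQ_HEAD(, obj) obj_freelist;

static void
obj_init(void)
{
	unsigned int i;

	SIMPLEQ_INIT(&obj_freelist);

	for (i = 0; i < NR_OBJ; i++)
		SIMPLEQ_INSERT_HEAD(&obj_freelist, &obj_array[i], o_next);
}

static struct obj *
obj_alloc(void)
{
	struct obj *obj;

	if (SIMPLEQ_EMPTY(&obj_freelist))
		return NULL;		/* the caller reports ENOBUFS */

	obj = SIMPLEQ_FIRST(&obj_freelist);
	SIMPLEQ_REMOVE_HEAD(&obj_freelist, o_next);

	return obj;
}

static void
obj_free(struct obj * obj)
{

	SIMPLEQ_INSERT_HEAD(&obj_freelist, obj, o_next);
}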
+ */ +int +util_timeval_to_ticks(const struct timeval * tv, clock_t * ticksp) +{ + clock_t ticks; + + if (tv->tv_sec < 0 || tv->tv_usec < 0 || tv->tv_usec >= US) + return EINVAL; + + if (tv->tv_sec >= TMRDIFF_MAX / sys_hz()) + return EDOM; + + ticks = tv->tv_sec * sys_hz() + (tv->tv_usec * sys_hz() + US - 1) / US; + assert(ticks <= TMRDIFF_MAX); + + *ticksp = ticks; + return OK; +} + +/* + * Convert the given number of clock ticks to a timeval structure. This + * function never fails. + */ +void +util_ticks_to_timeval(clock_t ticks, struct timeval * tv) +{ + + memset(tv, 0, sizeof(*tv)); + tv->tv_sec = ticks / sys_hz(); + tv->tv_usec = (ticks % sys_hz()) * US / sys_hz(); +} + +/* + * Copy data between a user process and a chain of buffers. If the 'copy_in' + * flag is set, the data will be copied in from the user process to the given + * chain of buffers; otherwise, the data will be copied out from the given + * buffer chain to the user process. The 'data' parameter is a sockdriver- + * supplied structure identifying the remote source or destination of the data. + * The 'len' parameter contains the number of bytes to copy, and 'off' contains + * the offset into the remote source or destination. 'pbuf' is a pointer to + * the buffer chain, and 'skip' is the number of bytes to skip in the first + * buffer on the chain. Return OK on success, or a negative error code if the + * copy operation failed. This function is packet queue friendly. + */ +int +util_copy_data(const struct sockdriver_data * data, size_t len, size_t off, + const struct pbuf * pbuf, size_t skip, int copy_in) +{ + iovec_t iov[SOCKDRIVER_IOV_MAX]; + unsigned int i; + size_t sub, chunk; + int r; + + while (len > 0) { + sub = 0; + + for (i = 0; len > 0 && i < __arraycount(iov); i++) { + assert(pbuf != NULL); + + chunk = (size_t)pbuf->len - skip; + if (chunk > len) + chunk = len; + + iov[i].iov_addr = (vir_bytes)pbuf->payload + skip; + iov[i].iov_size = chunk; + + sub += chunk; + len -= chunk; + + pbuf = pbuf->next; + skip = 0; + } + + if (copy_in) + r = sockdriver_vcopyin(data, off, iov, i); + else + r = sockdriver_vcopyout(data, off, iov, i); + if (r != OK) + return r; + + off += sub; + } + + return OK; +} + +/* + * Copy from a vector of (local) buffers to a single (local) buffer. Return + * the total number of copied bytes on success, or E2BIG if not all of the + * results could be stored in the given bfufer. + */ +ssize_t +util_coalesce(char * ptr, size_t max, const iovec_t * iov, unsigned int iovcnt) +{ + size_t off, size; + + for (off = 0; iovcnt > 0; iov++, iovcnt--) { + if ((size = iov->iov_size) > max) + return E2BIG; + + memcpy(&ptr[off], (void *)iov->iov_addr, size); + + off += size; + max -= size; + } + + return off; +} + +/* + * Return TRUE if the given endpoint has superuser privileges, FALSE otherwise. + */ +int +util_is_root(endpoint_t endpt) +{ + + return (getnuid(endpt) == ROOT_EUID); +} + +/* + * Convert a lwIP-provided error code (of type err_t) to a negative MINIX 3 + * error code. 
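To make the rounding in util_timeval_to_ticks() above concrete: the microsecond part uses a round-up division, so any nonzero remainder costs one extra tick rather than being truncated away. A tiny worked sketch, with a clock frequency picked purely for illustration:

#include <assert.h>

/*
 * With 100 ticks per second, { .tv_sec = 2, .tv_usec = 15000 } yields
 * 2 * 100 + (15000 * 100 + 999999) / 1000000 = 200 + 2 = 202 ticks,
 * i.e. 15 ms rounds up to 2 ticks instead of truncating to 1.
 */
static unsigned long
example_ticks(unsigned long sec, unsigned long usec, unsigned long hz)
{
	return sec * hz + (usec * hz + 1000000 - 1) / 1000000;
}

static void
example(void)
{
	assert(example_ticks(2, 15000, 100) == 202);
}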
+ */ +int +util_convert_err(err_t err) +{ + + switch (err) { + case ERR_OK: return OK; + case ERR_MEM: return ENOMEM; + case ERR_BUF: return ENOBUFS; + case ERR_TIMEOUT: return ETIMEDOUT; + case ERR_RTE: return EHOSTUNREACH; + case ERR_VAL: return EINVAL; + case ERR_USE: return EADDRINUSE; + case ERR_ALREADY: return EALREADY; + case ERR_ISCONN: return EISCONN; + case ERR_CONN: return ENOTCONN; + case ERR_IF: return ENETDOWN; + case ERR_ABRT: return ECONNABORTED; + case ERR_RST: return ECONNRESET; + case ERR_INPROGRESS: return EINPROGRESS; /* should not be thrown */ + case ERR_WOULDBLOCK: return EWOULDBLOCK; /* should not be thrown */ + case ERR_ARG: return EINVAL; + case ERR_CLSD: /* should be caught as separate case */ + default: /* should have a case here */ + printf("LWIP: unexpected error from lwIP: %d", err); + return EGENERIC; + } +} + +/* + * Obtain the list of protocol control blocks for a particular domain and + * protocol. The call may be used for requesting either IPv4 or IPv6 PCBs, + * based on the path used to get here. It is used for TCP, UDP, and RAW PCBs. + */ +ssize_t +util_pcblist(struct rmib_call * call, struct rmib_oldp * oldp, + const void *(*enum_proc)(const void *), + void (*get_info_proc)(struct kinfo_pcb *, const void *)) +{ + const void *pcb; + ip_addr_t local_ip; + struct kinfo_pcb ki; + ssize_t off; + int r, size, max, domain, protocol; + + if (call->call_namelen != 4) + return EINVAL; + + /* The first two added name fields are not used. */ + + size = call->call_name[2]; + if (size < 0 || (size_t)size > sizeof(ki)) + return EINVAL; + if (size == 0) + size = sizeof(ki); + max = call->call_name[3]; + + domain = call->call_oname[1]; + protocol = call->call_oname[2]; + + off = 0; + + for (pcb = enum_proc(NULL); pcb != NULL; pcb = enum_proc(pcb)) { + /* Filter on IPv4/IPv6. */ + memcpy(&local_ip, &((const struct ip_pcb *)pcb)->local_ip, + sizeof(local_ip)); + + /* + * lwIP does not support IPv6 sockets with IPv4-mapped IPv6 + * addresses, and requires that those be represented as IPv4 + * sockets instead. We perform the appropriate conversions to + * make that work in general, but here we only have the lwIP + * PCB to go on, and that PCB may not even have an associated + * sock data structure. As a result, we have to report IPv6 + * sockets with IPv4-mapped IPv6 addresses as IPv4 sockets + * here. There is little room for improvement until lwIP + * allows us to store a "this is really an IPv6 socket" flag in + * its PCBs. As documented in the ipsock module, a partial + * solution would for example cause TCP sockets to "jump" from + * the IPv6 listing to the IPv4 listing when entering TIME_WAIT + * state. The jumping already occurs now for sockets that are + * getting bound, but that is not as problematic. + */ + if ((domain == AF_INET) != IP_IS_V4(&local_ip)) + continue; + + if (rmib_inrange(oldp, off)) { + memset(&ki, 0, sizeof(ki)); + + ki.ki_pcbaddr = (uint64_t)(uintptr_t)pcb; + ki.ki_ppcbaddr = (uint64_t)(uintptr_t)pcb; + ki.ki_family = domain; + ki.ki_protocol = protocol; + + get_info_proc(&ki, pcb); + + if ((r = rmib_copyout(oldp, off, &ki, size)) < OK) + return r; + } + + off += size; + if (max > 0 && --max == 0) + break; + } + + /* + * Margin to limit the possible effects of the inherent race condition + * between receiving just the data size and receiving the actual data. 
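The margin applied right below exists because consumers retrieve the PCB list in two steps, a size probe followed by the actual copy, and sockets may be created in between. A hedged userland sketch of that consumer pattern, assuming the service publishes the table under the standard NetBSD node name "net.inet.udp.pcblist" and that struct kinfo_pcb comes from the NetBSD system headers (get_udp_pcbs() is a hypothetical helper):

#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <stdlib.h>

/* Fetch the UDP PCB table; the caller must free the result. */
static struct kinfo_pcb *
get_udp_pcbs(size_t *countp)
{
	struct kinfo_pcb *ki;
	size_t len;

	/* Step 1: ask for the required buffer size (includes the margin). */
	if (sysctlbyname("net.inet.udp.pcblist", NULL, &len, NULL, 0) != 0)
		return NULL;

	if ((ki = malloc(len)) == NULL)
		return NULL;

	/* Step 2: fetch the actual entries. */
	if (sysctlbyname("net.inet.udp.pcblist", ki, &len, NULL, 0) != 0) {
		free(ki);
		return NULL;
	}

	*countp = len / sizeof(*ki);
	return ki;
}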
+ */ + if (oldp == NULL) + off += PCB_SLOP * size; + + return off; +} diff --git a/minix/net/lwip/util.h b/minix/net/lwip/util.h new file mode 100644 index 000000000..b770580f0 --- /dev/null +++ b/minix/net/lwip/util.h @@ -0,0 +1,27 @@ +#ifndef MINIX_NET_LWIP_UTIL_H +#define MINIX_NET_LWIP_UTIL_H + +/* util.c */ +int util_timeval_to_ticks(const struct timeval * tv, clock_t * ticksp); +void util_ticks_to_timeval(clock_t ticks, struct timeval * tv); +int util_copy_data(const struct sockdriver_data * data, size_t len, size_t off, + const struct pbuf * pbuf, size_t skip, int copy_in); +ssize_t util_coalesce(char * buf, size_t max, const iovec_t * iov, + unsigned int iovcnt); +int util_convert_err(err_t err); +int util_is_root(endpoint_t user_endpt); +ssize_t util_pcblist(struct rmib_call * call, struct rmib_oldp * oldp, + const void *(*enum_proc)(const void *), + void (*get_info_proc)(struct kinfo_pcb *, const void *)); + +/* + * In our code, pbuf header adjustments should never fail. This wrapper checks + * that the pbuf_header() call succeeds, and panics otherwise. + */ +#define util_pbuf_header(pbuf,incr) \ + do { \ + if (pbuf_header((pbuf), (incr))) \ + panic("unexpected pbuf header adjustment failure"); \ + } while (0) + +#endif /* !MINIX_NET_LWIP_UTIL_H */ diff --git a/minix/servers/mib/minix.c b/minix/servers/mib/minix.c index 08a1739ef..36e2986e7 100644 --- a/minix/servers/mib/minix.c +++ b/minix/servers/mib/minix.c @@ -75,6 +75,7 @@ static struct mib_node mib_minix_table[] = { "mib", "MIB service information"), /* 2*/ [MINIX_PROC] = MIB_NODE(_P | _RO, mib_minix_proc_table, "proc", "Process information for ProcFS"), +/* 3*/ /* MINIX_LWIP is mounted through RMIB and thus not present here. */ }; /* diff --git a/minix/usr.bin/trace/ioctl/net.c b/minix/usr.bin/trace/ioctl/net.c index 1842d5e6e..adc0d3d62 100644 --- a/minix/usr.bin/trace/ioctl/net.c +++ b/minix/usr.bin/trace/ioctl/net.c @@ -17,12 +17,115 @@ #include #include +#include +#include +#include +#include + const char * net_ioctl_name(unsigned long req) { switch (req) { NAME(FIONREAD); + /* sys/sockio.h */ + NAME(SIOCSHIWAT); /* TODO: print argument */ + NAME(SIOCGHIWAT); /* TODO: print argument */ + NAME(SIOCSLOWAT); /* TODO: print argument */ + NAME(SIOCGLOWAT); /* TODO: print argument */ + NAME(SIOCSPGRP); /* TODO: print argument */ + NAME(SIOCGPGRP); /* TODO: print argument */ + NAME(SIOCADDRT); /* TODO: print argument */ + NAME(SIOCDELRT); /* TODO: print argument */ + NAME(SIOCSIFADDR); /* TODO: print argument */ + NAME(SIOCGIFADDR); /* TODO: print argument */ + NAME(SIOCSIFDSTADDR); /* TODO: print argument */ + NAME(SIOCGIFDSTADDR); /* TODO: print argument */ + NAME(SIOCSIFFLAGS); /* TODO: print argument */ + NAME(SIOCGIFFLAGS); /* TODO: print argument */ + NAME(SIOCGIFBRDADDR); /* TODO: print argument */ + NAME(SIOCSIFBRDADDR); /* TODO: print argument */ + NAME(SIOCGIFCONF); /* TODO: print argument */ + NAME(SIOCGIFNETMASK); /* TODO: print argument */ + NAME(SIOCSIFNETMASK); /* TODO: print argument */ + NAME(SIOCGIFMETRIC); /* TODO: print argument */ + NAME(SIOCSIFMETRIC); /* TODO: print argument */ + NAME(SIOCDIFADDR); /* TODO: print argument */ + NAME(SIOCAIFADDR); /* TODO: print argument */ + NAME(SIOCGIFALIAS); /* TODO: print argument */ + NAME(SIOCGIFAFLAG_IN); /* TODO: print argument */ + NAME(SIOCALIFADDR); /* TODO: print argument */ + NAME(SIOCGLIFADDR); /* TODO: print argument */ + NAME(SIOCDLIFADDR); /* TODO: print argument */ + NAME(SIOCSIFADDRPREF); /* TODO: print argument */ + NAME(SIOCGIFADDRPREF); /* TODO: 
print argument */ + NAME(SIOCADDMULTI); /* TODO: print argument */ + NAME(SIOCDELMULTI); /* TODO: print argument */ + NAME(SIOCSIFMEDIA); /* TODO: print argument */ + NAME(SIOCGIFMEDIA); /* TODO: print argument */ + NAME(SIOCSIFGENERIC); /* TODO: print argument */ + NAME(SIOCGIFGENERIC); /* TODO: print argument */ + NAME(SIOCSIFPHYADDR); /* TODO: print argument */ + NAME(SIOCGIFPSRCADDR); /* TODO: print argument */ + NAME(SIOCGIFPDSTADDR); /* TODO: print argument */ + NAME(SIOCDIFPHYADDR); /* TODO: print argument */ + NAME(SIOCSLIFPHYADDR); /* TODO: print argument */ + NAME(SIOCGLIFPHYADDR); /* TODO: print argument */ + NAME(SIOCSIFMTU); /* TODO: print argument */ + NAME(SIOCGIFMTU); /* TODO: print argument */ + NAME(SIOCSDRVSPEC); /* TODO: print argument */ + NAME(SIOCGDRVSPEC); /* TODO: print argument */ + NAME(SIOCIFCREATE); /* TODO: print argument */ + NAME(SIOCIFDESTROY); /* TODO: print argument */ + NAME(SIOCIFGCLONERS); /* TODO: print argument */ + NAME(SIOCGIFDLT); /* TODO: print argument */ + NAME(SIOCGIFCAP); /* TODO: print argument */ + NAME(SIOCSIFCAP); /* TODO: print argument */ + NAME(SIOCSVH); /* TODO: print argument */ + NAME(SIOCGVH); /* TODO: print argument */ + NAME(SIOCINITIFADDR); /* TODO: print argument */ + NAME(SIOCGIFDATA); /* TODO: print argument */ + NAME(SIOCZIFDATA); /* TODO: print argument */ + NAME(SIOCGLINKSTR); /* TODO: print argument */ + NAME(SIOCSLINKSTR); /* TODO: print argument */ + NAME(SIOCGETHERCAP); /* TODO: print argument */ + NAME(SIOCGIFINDEX); /* TODO: print argument */ + NAME(SIOCSETPFSYNC); /* TODO: print argument */ + NAME(SIOCGETPFSYNC); /* TODO: print argument */ + /* netinet6/in6_var.h */ + NAME(SIOCSIFADDR_IN6); /* TODO: print argument */ + NAME(SIOCGIFADDR_IN6); /* TODO: print argument */ + NAME(SIOCGIFDSTADDR_IN6); /* TODO: print argument */ + NAME(SIOCGIFNETMASK_IN6); /* TODO: print argument */ + NAME(SIOCDIFADDR_IN6); /* TODO: print argument */ + NAME(SIOCGIFPSRCADDR_IN6); /* TODO: print argument */ + NAME(SIOCGIFPDSTADDR_IN6); /* TODO: print argument */ + NAME(SIOCGIFAFLAG_IN6); /* TODO: print argument */ + NAME(SIOCGDRLST_IN6); /* TODO: print argument */ + NAME(SIOCSNDFLUSH_IN6); /* TODO: print argument */ + NAME(SIOCGNBRINFO_IN6); /* TODO: print argument */ + NAME(SIOCSRTRFLUSH_IN6); /* TODO: print argument */ + NAME(SIOCGIFSTAT_IN6); /* TODO: print argument */ + NAME(SIOCGIFSTAT_ICMP6); /* TODO: print argument */ + NAME(SIOCSDEFIFACE_IN6); /* TODO: print argument */ + NAME(SIOCGDEFIFACE_IN6); /* TODO: print argument */ + NAME(SIOCSIFINFO_FLAGS); /* TODO: print argument */ + NAME(SIOCSIFPREFIX_IN6); /* TODO: print argument */ + NAME(SIOCGIFPREFIX_IN6); /* TODO: print argument */ + NAME(SIOCDIFPREFIX_IN6); /* TODO: print argument */ + NAME(SIOCAIFPREFIX_IN6); /* TODO: print argument */ + NAME(SIOCCIFPREFIX_IN6); /* TODO: print argument */ + NAME(SIOCGIFALIFETIME_IN6); /* TODO: print argument */ + NAME(SIOCAIFADDR_IN6); /* TODO: print argument */ + NAME(SIOCGIFINFO_IN6); /* TODO: print argument */ + NAME(SIOCSIFINFO_IN6); /* TODO: print argument */ + NAME(SIOCSIFPHYADDR_IN6); /* TODO: print argument */ + NAME(SIOCAADDRCTL_POLICY); /* TODO: print argument */ + NAME(SIOCDADDRCTL_POLICY); /* TODO: print argument */ + /* net80211/ieee80211_ioctl.h */ + NAME(SIOCS80211NWID); /* TODO: print argument */ + NAME(SIOCG80211NWID); /* TODO: print argument */ + /* old MINIX inet ioctls */ NAME(NWIOSETHOPT); /* TODO: print argument */ NAME(NWIOGETHOPT); /* TODO: print argument */ NAME(NWIOGETHSTAT); /* TODO: print argument */
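For completeness: the NAME() entries added above rely on trace(1)'s existing stringification helper. The sketch below shows what a minimal equivalent of that dispatch looks like; it is a hypothetical reconstruction for illustration, and the actual macro in minix/usr.bin/trace may differ:

#include <sys/ioctl.h>
#include <sys/sockio.h>
#include <stddef.h>

/* Map an ioctl request code to its symbolic name, one case per NAME(). */
#define NAME(req)	case req: return #req

static const char *
ioctl_name_example(unsigned long req)
{
	switch (req) {
	NAME(FIONREAD);
	NAME(SIOCGIFFLAGS);
	default:
		return NULL;	/* unknown request */
	}
}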