Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F102113797
D36002.id109100.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
219 KB
Referenced Files
None
Subscribers
None
D36002.id109100.diff
View Options
Index: sys/compat/linux/linux.c
===================================================================
--- sys/compat/linux/linux.c
+++ sys/compat/linux/linux.c
@@ -47,6 +47,7 @@
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/if_types.h>
+#include <netlink/netlink.h>
#include <sys/un.h>
#include <netinet/in.h>
@@ -364,6 +365,8 @@
return (AF_IPX);
case LINUX_AF_APPLETALK:
return (AF_APPLETALK);
+ case LINUX_AF_NETLINK:
+ return (AF_NETLINK);
}
return (-1);
}
@@ -387,6 +390,8 @@
return (LINUX_AF_IPX);
case AF_APPLETALK:
return (LINUX_AF_APPLETALK);
+ case AF_NETLINK:
+ return (LINUX_AF_NETLINK);
}
return (-1);
}
@@ -514,6 +519,14 @@
}
}
+ if (bdom == AF_NETLINK) {
+ if (salen < sizeof(struct sockaddr_nl)) {
+ error = EINVAL;
+ goto out;
+ }
+ salen = sizeof(struct sockaddr_nl);
+ }
+
sa = (struct sockaddr *)kosa;
sa->sa_family = bdom;
sa->sa_len = salen;
Index: sys/compat/linux/linux_socket.c
===================================================================
--- sys/compat/linux/linux_socket.c
+++ sys/compat/linux/linux_socket.c
@@ -91,6 +91,8 @@
l_uint, struct msghdr *);
static int linux_set_socket_flags(int, int *);
+#define SOL_NETLINK 270
+
static int
linux_to_bsd_sockopt_level(int level)
{
@@ -2091,6 +2093,10 @@
case IPPROTO_TCP:
name = linux_to_bsd_tcp_sockopt(args->optname);
break;
+ case SOL_NETLINK:
+ level = SOL_SOCKET;
+ name = args->optname;
+ break;
default:
name = -1;
break;
Index: sys/kern/uipc_domain.c
===================================================================
--- sys/kern/uipc_domain.c
+++ sys/kern/uipc_domain.c
@@ -239,6 +239,29 @@
mtx_unlock(&dom_mtx);
}
+/*
+ * Unlinks the domain passed in data from the global domains list.
+ * Domains that do not carry DOMF_UNLOADABLE are left untouched.
+ */
+void
+domain_remove(void *data)
+{
+	struct domain *victim = data;
+	struct domain **linkp;
+
+	if ((victim->dom_flags & DOMF_UNLOADABLE) == 0)
+		return;
+
+	mtx_lock(&dom_mtx);
+	/* Walk the link pointers so that the list head needs no special case. */
+	for (linkp = &domains; *linkp != NULL; linkp = &(*linkp)->dom_next) {
+		if (*linkp == victim) {
+			*linkp = victim->dom_next;
+			break;
+		}
+	}
+	mtx_unlock(&dom_mtx);
+}
+
/* ARGSUSED*/
static void
domaininit(void *dummy)
Index: sys/modules/linux_common/Makefile
===================================================================
--- sys/modules/linux_common/Makefile
+++ sys/modules/linux_common/Makefile
@@ -16,6 +16,8 @@
EXPORT_SYMS+= linux_get_osname
EXPORT_SYMS+= linux_get_osrelease
EXPORT_SYMS+= linux_use_real_ifname
+EXPORT_SYMS+= linux_to_bsd_domain
+EXPORT_SYMS+= bsd_to_linux_domain
.if !defined(KERNBUILDDIR)
.warning Building Linuxulator outside of a kernel does not make sense
Index: sys/modules/netlink/Makefile
===================================================================
--- /dev/null
+++ sys/modules/netlink/Makefile
@@ -0,0 +1,7 @@
+.PATH: ${SRCTOP}/sys/netlink
+KMOD= netlink
+
+SRCS = netlink_module.c netlink_domain.c netlink_iface.c netlink_io.c \
+ netlink_message.c netlink_route.c netlink_nhop.c netlink_linux.c
+
+.include <bsd.kmod.mk>
Index: sys/net/route.h
===================================================================
--- sys/net/route.h
+++ sys/net/route.h
@@ -344,15 +344,17 @@
void *);
struct rt_addrinfo {
- int rti_addrs; /* Route RTF_ flags */
+ uint16_t rti_addrs; /* rti_info bitmask */
+ uint8_t rti_family; /* address family to operate on */
+ uint8_t rti_spare2;
int rti_flags; /* Route RTF_ flags */
struct sockaddr *rti_info[RTAX_MAX]; /* Sockaddr data */
struct ifaddr *rti_ifa; /* value of rt_ifa addr */
struct ifnet *rti_ifp; /* route interface */
rib_filter_f_t *rti_filter; /* filter function */
- void *rti_filterdata; /* filter parameters */
- u_long rti_mflags; /* metrics RTV_ flags */
- u_long rti_spare; /* Will be used for fib */
+ void *rti_filterdata; /* filter parameters */
+ uint32_t rti_mflags; /* metrics RTV_ flags */
+ uint32_t rti_fibnum; /* Will be used for fib */
struct rt_metrics *rti_rmx; /* Pointer to route metrics */
};
Index: sys/net/route/nhop.h
===================================================================
--- sys/net/route/nhop.h
+++ sys/net/route/nhop.h
@@ -199,6 +199,8 @@
void nhop_set_transmit_ifp(struct nhop_object *nh, struct ifnet *ifp);
uint32_t nhop_get_idx(const struct nhop_object *nh);
+uint32_t nhop_get_uidx(const struct nhop_object *nh);
+void nhop_set_uidx(struct nhop_object *nh, uint32_t uidx);
enum nhop_type nhop_get_type(const struct nhop_object *nh);
int nhop_get_rtflags(const struct nhop_object *nh);
struct vnet *nhop_get_vnet(const struct nhop_object *nh);
Index: sys/net/route/nhop_ctl.c
===================================================================
--- sys/net/route/nhop_ctl.c
+++ sys/net/route/nhop_ctl.c
@@ -774,6 +774,18 @@
return (nh->nh_priv->nh_idx);
}
+/* Returns the nh_uidx value kept in the nexthop's private data (nh_priv). */
+uint32_t
+nhop_get_uidx(const struct nhop_object *nh)
+{
+ return (nh->nh_priv->nh_uidx);
+}
+
+/* Stores uidx into the nh_uidx field of the nexthop's private data (nh_priv). */
+void
+nhop_set_uidx(struct nhop_object *nh, uint32_t uidx)
+{
+ nh->nh_priv->nh_uidx = uidx;
+}
+
enum nhop_type
nhop_get_type(const struct nhop_object *nh)
{
Index: sys/net/route/nhop_var.h
===================================================================
--- sys/net/route/nhop_var.h
+++ sys/net/route/nhop_var.h
@@ -79,6 +79,7 @@
uint16_t nh_type; /* nexthop type */
uint32_t rt_flags; /* routing flags for the control plane */
uint32_t nh_expire; /* path expiration time */
+ uint32_t nh_uidx; /* userland-provided index */
/* nhop lookup comparison end */
uint32_t nh_idx; /* nexthop index */
uint32_t nh_fibnum; /* nexthop fib */
Index: sys/net/route/route_ctl.h
===================================================================
--- sys/net/route/route_ctl.h
+++ sys/net/route/route_ctl.h
@@ -35,6 +35,8 @@
#ifndef _NET_ROUTE_ROUTE_CTL_H_
#define _NET_ROUTE_ROUTE_CTL_H_
+#include <sys/ck.h>
+
struct rib_cmd_info {
uint8_t rc_cmd; /* RTM_ADD|RTM_DEL|RTM_CHANGE */
uint8_t spare[3];
@@ -160,4 +162,31 @@
void rib_unsubscribe(struct rib_subscription *rs);
void rib_unsubscribe_locked(struct rib_subscription *rs);
+/* Event bridge */
+
+/* Types of events */
+#define NLBR_EVENT_ROUTE 1
+
+/* Event providers */
+#define NLBR_PROVIDER_KERNEL 1
+#define NLBR_PROVIDER_RTSOCK 2
+#define NLBR_PROVIDER_NETLINK 3
+
+struct rib_event_bridge;
+typedef void rib_event_bridge_cb_t(uint32_t event_type, uint32_t fibnum,
+ const struct rt_addrinfo *info, const struct rib_cmd_info *rc, void *arg);
+
+struct rib_event_bridge {
+ rib_event_bridge_cb_t *reb_cb; /* callback invoked for each delivered event */
+ void *reb_cb_arg; /* opaque argument passed back to reb_cb */
+ int reb_provider_id; /* events raised with this provider id are not delivered */
+ CK_STAILQ_ENTRY(rib_event_bridge) reb_link; /* linkage in the list of bridges */
+};
+void rib_bridge_generic_event(int provider_id, uint32_t event_type, uint32_t val1,
+ const void *ptr1, const void *ptr2);
+void rib_bridge_rt_event(int provider_id, uint32_t fibnum, const struct rt_addrinfo *info,
+ const struct rib_cmd_info *rc);
+void rib_bridge_link(struct rib_event_bridge *reb);
+void rib_bridge_unlink(struct rib_event_bridge *reb);
+
#endif
Index: sys/net/route/route_ctl.c
===================================================================
--- sys/net/route/route_ctl.c
+++ sys/net/route/route_ctl.c
@@ -60,7 +60,7 @@
#define DEBUG_MOD_NAME route_ctl
#define DEBUG_MAX_LEVEL LOG_DEBUG
#include <net/route/route_debug.h>
-_DECLARE_DEBUG(LOG_INFO);
+_DECLARE_DEBUG(LOG_DEBUG3);
/*
* This file contains control plane routing tables functions.
@@ -1601,3 +1601,65 @@
RIB_WUNLOCK(rnh);
NET_EPOCH_EXIT(et);
}
+
+
+CK_STAILQ_HEAD(rib_event_bridge_head, rib_event_bridge);
+static struct rib_event_bridge_head bridge_head;
+struct mtx bridge_lock;
+
+/* Initializes the bridge list head and bridge_lock; registered through the SYSINIT below. */
+static void
+rib_bridge_init(void)
+{
+ CK_STAILQ_INIT(&bridge_head);
+ mtx_init(&bridge_lock, "rib_event_bridge_lock", NULL, MTX_DEF);
+}
+SYSINIT(rib_bridge_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND, rib_bridge_init, NULL);
+
+
+/*
+ * Hands an event to the callback of every linked bridge, except the
+ * bridges whose provider id equals provider_id. Asserts that the caller
+ * runs inside the network epoch.
+ */
+void
+rib_bridge_generic_event(int provider_id, uint32_t event_type, uint32_t val1,
+    const void *ptr1, const void *ptr2)
+{
+	struct rib_event_bridge *bridge;
+
+	NET_EPOCH_ASSERT();
+
+	CK_STAILQ_FOREACH(bridge, &bridge_head, reb_link) {
+		RT_LOG(LOG_DEBUG, "HERE reb %p %d", bridge, bridge->reb_provider_id);
+		/* Never echo an event back to the provider that raised it. */
+		if (bridge->reb_provider_id == provider_id)
+			continue;
+		bridge->reb_cb(event_type, val1, ptr1, ptr2, bridge->reb_cb_arg);
+	}
+}
+
+/*
+ * Publishes a routing change as an NLBR_EVENT_ROUTE event through
+ * rib_bridge_generic_event(); fibnum is carried in the generic val1 slot,
+ * info and rc in the two pointer slots.
+ */
+void
+rib_bridge_rt_event(int provider_id, uint32_t fibnum, const struct rt_addrinfo *info,
+ const struct rib_cmd_info *rc)
+{
+#if DEBUG_MAX_LEVEL >= LOG_DEBUG3
+ char rtbuf[INET6_ADDRSTRLEN + 5];
+ FIB_LOG(LOG_DEBUG3, fibnum, rt_get_family(rc->rc_rt), "received cmd %s for %s",
+ rib_print_cmd(rc->rc_cmd), rt_print_buf(rc->rc_rt, rtbuf, sizeof(rtbuf)));
+#endif
+ RT_LOG(LOG_DEBUG, "HERE %u", fibnum);
+ rib_bridge_generic_event(provider_id, NLBR_EVENT_ROUTE, fibnum, info, rc);
+}
+
+
+/* Inserts reb at the head of the bridge list while holding bridge_lock. */
+void
+rib_bridge_link(struct rib_event_bridge *reb)
+{
+ mtx_lock(&bridge_lock);
+ CK_STAILQ_INSERT_HEAD(&bridge_head, reb, reb_link);
+ mtx_unlock(&bridge_lock);
+ RT_LOG(LOG_DEBUG, "link %p", reb);
+}
+
+/*
+ * Removes reb from the bridge list while holding bridge_lock.
+ * NOTE(review): rib_bridge_generic_event() walks the list without taking
+ * bridge_lock, so presumably the caller must wait for in-flight readers
+ * before freeing reb -- confirm against the callers.
+ */
+void
+rib_bridge_unlink(struct rib_event_bridge *reb)
+{
+ mtx_lock(&bridge_lock);
+ CK_STAILQ_REMOVE(&bridge_head, reb, rib_event_bridge, reb_link);
+ mtx_unlock(&bridge_lock);
+ RT_LOG(LOG_DEBUG, "unlink %p", reb);
+}
+
Index: sys/net/rtsock.c
===================================================================
--- sys/net/rtsock.c
+++ sys/net/rtsock.c
@@ -1126,6 +1126,7 @@
}
error = rib_action(fibnum, rtm->rtm_type, &info, &rc);
if (error == 0) {
+ rib_bridge_rt_event(NLBR_PROVIDER_RTSOCK, fibnum, &info, &rc);
#ifdef ROUTE_MPATH
if (NH_IS_NHGRP(rc.rc_nh_new) ||
(rc.rc_nh_old && NH_IS_NHGRP(rc.rc_nh_old))) {
@@ -1147,6 +1148,7 @@
case RTM_DELETE:
error = rib_action(fibnum, RTM_DELETE, &info, &rc);
if (error == 0) {
+ rib_bridge_rt_event(NLBR_PROVIDER_RTSOCK, fibnum, &info, &rc);
#ifdef ROUTE_MPATH
if (NH_IS_NHGRP(rc.rc_nh_old) ||
(rc.rc_nh_new && NH_IS_NHGRP(rc.rc_nh_new))) {
Index: sys/netlink/netlink.h
===================================================================
--- /dev/null
+++ sys/netlink/netlink.h
@@ -0,0 +1,233 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ * Copyright (c) 2022 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * This file contains structures and constants for RFC 3549 (Netlink)
+ * protocol. Some values have been taken from Linux implementation.
+ */
+
+#ifndef _NETLINK_LINUX_NETLINK_H_
+#define _NETLINK_LINUX_NETLINK_H_
+
+#ifndef _KERNEL
+#ifndef PF_NETLINK
+#define PF_NETLINK 38
+#endif
+#ifndef AF_NETLINK
+#define AF_NETLINK 38
+#endif
+#ifndef AF_MPLS
+#define AF_MPLS 39
+#endif
+#endif
+
+#include <sys/types.h>
+#include <sys/socket.h>
+
+struct sockaddr_nl {
+ uint8_t nl_len; /* total length */
+ sa_family_t nl_family; /* AF_NETLINK */
+ uint16_t nl_pad; /* zero */
+ uint32_t nl_pid; /* port ID */
+ uint32_t nl_groups; /* multicast groups mask */
+};
+
+#define SOL_NETLINK 270
+
+/* Currently supported socket options */
+#define NETLINK_ADD_MEMBERSHIP 1
+#define NETLINK_DROP_MEMBERSHIP 2
+#define NETLINK_PKTINFO 3 /* XXX: not supported */
+#define NETLINK_BROADCAST_ERROR 4 /* XXX: not supported */
+#define NETLINK_NO_ENOBUFS 5 /* XXX: not supported */
+#define NETLINK_RX_RING 6 /* XXX: not supported */
+#define NETLINK_TX_RING 7 /* XXX: not supported */
+#define NETLINK_LISTEN_ALL_NSID 8 /* XXX: not supported */
+
+#define NETLINK_LIST_MEMBERSHIPS 9
+#define NETLINK_CAP_ACK 10
+#define NETLINK_EXT_ACK 11
+#define NETLINK_GET_STRICT_CHK 12 /* XXX: not supported */
+
+
+/*
+ * RFC 3549, 2.3.2 Netlink Message Header
+ */
+struct nlmsghdr {
+ uint32_t nlmsg_len; /* Length of message including header */
+ uint16_t nlmsg_type; /* Message type identifier */
+ uint16_t nlmsg_flags; /* Flags (NLM_F_) */
+ uint32_t nlmsg_seq; /* Sequence number */
+ uint32_t nlmsg_pid; /* Sending process port ID */
+};
+
+/*
+ * RFC 3549, 2.3.2.2 The ACK Netlink Message
+ */
+struct nlmsgerr {
+ int error;
+ struct nlmsghdr msg;
+};
+
+/*
+ * RFC 3549, 2.3.2 standard flag bits (nlm_flags)
+ */
+#define NLM_F_REQUEST 0x01 /* It is request message. */
+#define NLM_F_MULTI 0x02 /* Multipart message, terminated by NLMSG_DONE */
+#define NLM_F_ACK 0x04 /* Reply with ack, with zero or error code */
+#define NLM_F_ECHO 0x08 /* Echo this request */
+#define NLM_F_DUMP_INTR 0x10 /* Dump was inconsistent due to sequence change */
+#define NLM_F_DUMP_FILTERED 0x20 /* Dump was filtered as requested */
+
+/*
+ * RFC 3549, 2.3.2 Additional flag bits for GET requests
+ */
+#define NLM_F_ROOT 0x100 /* Return the complete table */
+#define NLM_F_MATCH 0x200 /* Return all entries matching criteria */
+#define NLM_F_ATOMIC 0x400 /* Return an atomic snapshot */
+#define NLM_F_DUMP (NLM_F_ROOT | NLM_F_MATCH)
+
+/*
+ * RFC 3549, 2.3.2 Additional flag bits for NEW requests
+ */
+#define NLM_F_REPLACE 0x100 /* Replace existing matching config object */
+#define NLM_F_EXCL 0x200 /* Don't replace the object if exists */
+#define NLM_F_CREATE 0x400 /* Create if it does not exist */
+#define NLM_F_APPEND 0x800 /* Add to end of list */
+
+/* Modifiers to DELETE request */
+#define NLM_F_NONREC 0x100 /* Do not delete recursively */
+
+/* Flags for ACK message */
+#define NLM_F_CAPPED 0x100 /* request was capped */
+#define NLM_F_ACK_TLVS 0x200 /* extended ACK TLVs were included */
+
+/*
+ * RFC 3549, 2.3.2 standard message types (nlmsg_type).
+ */
+#define NLMSG_NOOP 0x1 /* Message is ignored. */
+#define NLMSG_ERROR 0x2 /* reply error code reporting */
+#define NLMSG_DONE 0x3 /* Message terminates a multipart message. */
+#define NLMSG_OVERRUN 0x4 /* overrun detected, data is lost */
+
+#define NLMSG_MIN_TYPE 0x10 /* < 0x10: reserved control messages */
+
+/*
+ * Definition of numbers assigned to the netlink subsystems.
+ */
+#define NETLINK_ROUTE 0 /* Routing/device hook */
+#define NETLINK_UNUSED 1 /* (not used) */
+#define NETLINK_USERSOCK 2 /* Reserved for user mode socket protocols */
+#define NETLINK_FIREWALL 3 /* (not used) */
+#define NETLINK_SOCK_DIAG 4 /* socket monitoring */
+#define NETLINK_NFLOG 5 /* netfilter/iptables ULOG */
+#define NETLINK_XFRM 6 /* ipsec */
+#define NETLINK_SELINUX 7 /* SELinux event notifications */
+#define NETLINK_ISCSI 8 /* Open-iSCSI */
+#define NETLINK_AUDIT 9 /* auditing */
+#define NETLINK_FIB_LOOKUP 10
+#define NETLINK_CONNECTOR 11
+#define NETLINK_NETFILTER 12 /* netfilter subsystem */
+#define NETLINK_IP6_FW 13 /* (not used) */
+#define NETLINK_DNRTMSG 14 /* DECnet routing messages (not used) */
+#define NETLINK_KOBJECT_UEVENT 15 /* Kernel messages to userspace */
+#define NETLINK_GENERIC 16
+
+
+#ifndef roundup2
+#define roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) /* if y is a power of two */
+#endif
+#define NL_ITEM_ALIGN_SIZE sizeof(uint32_t)
+#define NL_ITEM_ALIGN(_len) roundup2(_len, NL_ITEM_ALIGN_SIZE)
+/* Address located _off bytes after _ptr. */
+#define NL_ITEM_DATA(_ptr, _off) ((void *)((char *)(_ptr) + (_off)))
+#define NL_ITEM_DATA_CONST(_ptr, _off) ((const void *)((const char *)(_ptr) + (_off)))
+
+/*
+ * True when the remaining _len bytes hold a complete item: at least a
+ * header of _hlen bytes, and a declared length (_DLEN(_ptr)) that covers
+ * the header and does not run past the remaining data.
+ */
+#define NL_ITEM_OK(_ptr, _len, _hlen, _DLEN) \
+	((_len) >= (_hlen) && _DLEN(_ptr) >= (_hlen) && _DLEN(_ptr) <= (_len))
+/* Pointer to the item located _LEN_MACRO(_ptr) bytes after _ptr, same type as _ptr. */
+#define NL_ITEM_NEXT(_ptr, _LEN_MACRO) \
+	((__typeof(_ptr))NL_ITEM_DATA(_ptr, _LEN_MACRO(_ptr)))
+/* Consumes the current item: shrinks _len first, then advances _ptr. */
+#define NL_ITEM_ITER(_ptr, _len, _LEN_MACRO) \
+	((_len) -= _LEN_MACRO(_ptr), (_ptr) = NL_ITEM_NEXT(_ptr, _LEN_MACRO))
+
+
+#ifndef _KERNEL
+/* part of netlink(3) API */
+#define NLMSG_ALIGNTO NL_ITEM_ALIGN_SIZE
+#define NLMSG_ALIGN(_len) NL_ITEM_ALIGN(_len)
+#define NLMSG_HDRLEN ((int)sizeof(struct nlmsghdr))
+/* Message length for a payload of _len bytes, header included. */
+#define NLMSG_LENGTH(_len) ((_len) + NLMSG_HDRLEN)
+/* Same as NLMSG_LENGTH(), rounded up to the message alignment. */
+#define NLMSG_SPACE(_len) NLMSG_ALIGN(NLMSG_LENGTH(_len))
+/* Start of the payload: the first byte after the message header. */
+#define NLMSG_DATA(_hdr) NL_ITEM_DATA(_hdr, NLMSG_HDRLEN)
+#define _NLMSG_LEN(_hdr) ((int)(_hdr)->nlmsg_len)
+#define _NLMSG_ALIGNED_LEN(_hdr) NLMSG_ALIGN(_NLMSG_LEN(_hdr))
+#define NLMSG_NEXT(_hdr, _len) NL_ITEM_ITER(_hdr, _len, _NLMSG_ALIGNED_LEN)
+#define NLMSG_OK(_hdr, _len) NL_ITEM_OK(_hdr, _len, NLMSG_HDRLEN, _NLMSG_LEN)
+#define NLMSG_PAYLOAD(_hdr,_len) (_NLMSG_LEN(_hdr) - NLMSG_SPACE((_len)))
+
+#else
+#define NLMSG_ALIGNTO 4U
+#define NLMSG_ALIGN(len) (((len) + NLMSG_ALIGNTO - 1) & ~(NLMSG_ALIGNTO - 1))
+#define NLMSG_HDRLEN ((int)NLMSG_ALIGN(sizeof(struct nlmsghdr)))
+#endif
+
+/*
+ * Base netlink attribute TLV header.
+ */
+struct nlattr {
+ uint16_t nla_len; /* Total attribute length */
+ uint16_t nla_type; /* Attribute type */
+};
+
+/*
+ *
+ * nla_type field encoding:
+ *
+ * 0 1
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |N|O| Attribute type |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * N - attribute contains other attributes
+ * O - encoded in network byte order
+ * Note: N & O are mutually exclusive
+ *
+ * Note: attribute type value scope normally is per-message
+ * or per message group.
+ */
+
+#define NLA_F_NESTED (1 << 15)
+#define NLA_F_NET_BYTEORDER (1 << 14)
+#define NLA_TYPE_MASK ~(NLA_F_NESTED | NLA_F_NET_BYTEORDER)
+
+/* Compatibility macro */
+#ifndef _KERNEL
+#define NLA_ALIGNTO NL_ITEM_ALIGN_SIZE
+#define NLA_ALIGN(_len) NL_ITEM_ALIGN(_len)
+#define NLA_HDRLEN ((int)sizeof(struct nlattr))
+#endif
+
+#endif
Index: sys/netlink/netlink_ctl.h
===================================================================
--- /dev/null
+++ sys/netlink/netlink_ctl.h
@@ -0,0 +1,208 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ * Copyright (c) 2022 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+
+#ifndef _NETLINK_NETLINK_CTL_H_
+#define _NETLINK_NETLINK_CTL_H_
+
+#ifdef _KERNEL
+
+/*
+ * This file provides headers for the public KPI of the netlink
+ * subsystem
+ * */
+
+/*
+ * Messages and attributes (netlink_message.c)
+ */
+#define _roundup2(x, y) (((x)+((y)-1))&(~((y)-1)))
+
+#define NETLINK_ALIGN_SIZE sizeof(uint32_t)
+#define NETLINK_ALIGN(_len) _roundup2(_len, NETLINK_ALIGN_SIZE)
+
+#define NLA_ALIGN_SIZE sizeof(uint32_t)
+#define NLA_ALIGN(_len) _roundup2(_len, NLA_ALIGN_SIZE)
+
+/* Attribute that follows _attr, found at its aligned length. */
+#define NLA_NEXT(_attr) \
+	((struct nlattr *)((char *)(_attr) + NLA_ALIGN((_attr)->nla_len)))
+/*
+ * Iterates over the attributes stored in the _len bytes at _start.
+ * The attribute header must lie inside the buffer before nla_len is
+ * read, and the whole aligned attribute must fit as well.
+ * NOTE(review): an attribute with nla_len == 0 does not advance the
+ * iterator; the loop body is expected to reject nla_len values smaller
+ * than sizeof(struct nlattr) -- confirm against the callers.
+ */
+#define NLA_FOREACH(_attr, _start, _len) \
+	for (_attr = (_start); \
+	    ((char *)(_attr) + sizeof(struct nlattr) <= (char *)(_start) + (_len)) && \
+	    ((char *)NLA_NEXT(_attr) <= (char *)(_start) + (_len)); \
+	    _attr = NLA_NEXT(_attr))
+
+struct mbuf;
+struct nlmsg_state;
+typedef bool nlmsg_state_cb(struct nlmsg_state *ns, char *buf, int buflen);
+
+struct nlmsg_state {
+ int alloc_len;
+ int offset;
+ struct nlmsghdr *hdr;
+ char *data; // pointer to contig storage
+ union {
+ struct mbuf *_m;
+ char *_buf;
+ };
+ nlmsg_state_cb *cb;
+ void *arg;
+ int malloc_flag; // M_WAITOK | M_NOWAIT
+ uint8_t writer_type;
+ uint8_t writer_target;
+};
+#define NS_WRITER_TARGET_SOCKET 0
+#define NS_WRITER_TARGET_GROUP 1
+#define NS_WRITER_TARGET_CHAIN 2
+
+#define NS_WRITER_TYPE_MBUF 0
+#define NS_WRITER_TYPE_BUF 1
+#define NS_WRITER_TYPE_LBUF 2
+#define NS_WRITER_TYPE_MBUFC 3
+
+
+#define NLMSG_SMALL 128
+#define NLMSG_LARGE 2048
+
+struct nlpcb;
+bool nlmsg_get_socket_writer(int size, struct nlpcb *nlp, struct nlmsg_state *ns);
+bool nlmsg_get_group_writer(int size, uint32_t group_mask, struct nlmsg_state *ns);
+bool nlmsg_get_chain_writer(int size, struct mbuf **pm, struct nlmsg_state *ns);
+void nlmsg_free(struct nlmsg_state *ns);
+bool nlmsg_add(struct nlmsg_state *ns, uint32_t portid, uint32_t seq, uint16_t type,
+ uint16_t flags, uint32_t len);
+void *nlmsg_reserve_data_raw(struct nlmsg_state *ns, size_t sz);
+void nlmsg_end(struct nlmsg_state *ns);
+void nlmsg_abort(struct nlmsg_state *ns);
+bool nlmsg_flush(struct nlmsg_state *ns);
+
+#define nlmsg_data(_hdr) ((void *)((_hdr) + 1))
+
+#define nlmsg_reserve_object(_ns, _t) ((_t *)nlmsg_reserve_data_raw(_ns, NLA_ALIGN(sizeof(_t))))
+#define nlmsg_reserve_data(_ns, _sz, _t) ((_t *)nlmsg_reserve_data_raw(_ns, _sz))
+
+/* Attributes */
+bool nlattr_add_handle_oom(struct nlmsg_state *ns, int attr_type, int attr_len,
+ const void *data);
+
+/*
+ * Writes an attribute header and attr_len bytes of data at the current
+ * write offset and advances the offset by the aligned attribute size.
+ * No space check is made here; nlattr_add() performs it before calling.
+ * Always returns true.
+ */
+static inline bool
+nlattr_add_noerror(struct nlmsg_state *ns, int attr_type, int attr_len,
+    const void *data)
+{
+	int used_len = attr_len + sizeof(struct nlattr);
+	int required_len = NLA_ALIGN(used_len);
+	struct nlattr *nla = (struct nlattr *)(&ns->data[ns->offset]);
+
+	nla->nla_len = used_len;
+	nla->nla_type = attr_type;
+	if (attr_len > 0) {
+		memcpy((nla + 1), data, attr_len);
+	}
+	/* Clear the alignment padding so stale buffer bytes never reach the reader. */
+	if (required_len > used_len)
+		memset((char *)nla + used_len, 0, required_len - used_len);
+	ns->offset += required_len;
+	return (true);
+}
+
+/*
+ * Adds an attribute to the message. When the aligned attribute does not
+ * fit into the allocated space, nlattr_add_handle_oom() is called first;
+ * false is returned if that handler fails.
+ */
+static inline bool
+nlattr_add(struct nlmsg_state *ns, int attr_type, int attr_len, const void *data)
+{
+	int needed = NLA_ALIGN(attr_len + sizeof(struct nlattr));
+	bool fits = (ns->offset + needed <= ns->alloc_len);
+
+	if (__predict_false(!fits) &&
+	    !nlattr_add_handle_oom(ns, attr_type, attr_len, data))
+		return (false);
+
+	return (nlattr_add_noerror(ns, attr_type, attr_len, data));
+}
+
+static inline bool
+nlattr_add_u8(struct nlmsg_state *ns, int attrtype, uint8_t value)
+{
+ return (nlattr_add(ns, attrtype, sizeof(uint8_t), &value));
+}
+
+static inline bool
+nlattr_add_u16(struct nlmsg_state *ns, int attrtype, uint16_t value)
+{
+ return (nlattr_add(ns, attrtype, sizeof(uint16_t), &value));
+}
+
+static inline bool
+nlattr_add_u32(struct nlmsg_state *ns, int attrtype, uint32_t value)
+{
+ return (nlattr_add(ns, attrtype, sizeof(uint32_t), &value));
+}
+
+static inline bool
+nlattr_add_u64(struct nlmsg_state *ns, int attrtype, uint64_t value)
+{
+ return (nlattr_add(ns, attrtype, sizeof(uint64_t), &value));
+}
+
+static inline bool
+nlattr_add_s8(struct nlmsg_state *ns, int attrtype, int8_t value)
+{
+ return (nlattr_add(ns, attrtype, sizeof(int8_t), &value));
+}
+
+static inline bool
+nlattr_add_s16(struct nlmsg_state *ns, int attrtype, int16_t value)
+{
+ return (nlattr_add(ns, attrtype, sizeof(int16_t), &value));
+}
+
+static inline bool
+nlattr_add_s32(struct nlmsg_state *ns, int attrtype, int32_t value)
+{
+ return (nlattr_add(ns, attrtype, sizeof(int32_t), &value));
+}
+
+static inline bool
+nlattr_add_s64(struct nlmsg_state *ns, int attrtype, int64_t value)
+{
+ return (nlattr_add(ns, attrtype, sizeof(int64_t), &value));
+}
+
+static inline bool
+nlattr_add_flag(struct nlmsg_state *ns, int attrtype)
+{
+ return (nlattr_add(ns, attrtype, 0, NULL));
+}
+
+static inline bool
+nlattr_add_string(struct nlmsg_state *ns, int attrtype, const char *str)
+{
+ return (nlattr_add(ns, attrtype, strlen(str) + 1, str));
+}
+
+/* Protocol handlers */
+struct netlink_parse_tracker;
+typedef int (*nl_handler)(struct nlmsghdr *hdr, struct netlink_parse_tracker *npt);
+
+bool netlink_register_proto(int proto, nl_handler handle);
+bool netlink_unregister_proto(int proto);
+
+
+#endif
+
+
+#endif
Index: sys/netlink/netlink_debug.h
===================================================================
--- /dev/null
+++ sys/netlink/netlink_debug.h
@@ -0,0 +1,78 @@
+/*-
+ * Copyright (c) 2022
+ * Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NETLINK_NETLINK_DEBUG_H_
+#define _NETLINK_NETLINK_DEBUG_H_
+
+#include <net/route/route_debug.h>
+
+/*
+ * Generic debug
+ * [nl_domain] func_name: debug text
+ */
+#define NL_DEBUG RT_DEBUG
+
+/*
+ * Logging for events specific for particular process
+ * Example: [nl_domain] PID 4834 fdump_sa: unsupported family: 45
+ */
+/* Dispatches to the per-level NL_RAW_PID_LOG_<level> macro defined below. */
+#define NL_RAW_PID_LOG(_l, _pid, _fmt, ...) NL_RAW_PID_LOG_##_l(_l, _pid, _fmt, ## __VA_ARGS__)
+/* Emits the message, prefixed with the module name, PID and function name. */
+#define _NL_RAW_PID_LOG(_l, _pid, _fmt, ...) if (_DEBUG_PASS_MSG(_l)) { \
+ _output("[" DEBUG_PREFIX_NAME "] PID %u %s: " _fmt "\n", _pid, __func__, ##__VA_ARGS__); \
+}
+
+#if DEBUG_MAX_LEVEL>=LOG_DEBUG3
+#define NL_RAW_PID_LOG_LOG_DEBUG3 _NL_RAW_PID_LOG
+#else
+#define NL_RAW_PID_LOG_LOG_DEBUG3(_l, _pid, _fmt, ...)
+#endif
+#if DEBUG_MAX_LEVEL>=LOG_DEBUG2
+#define NL_RAW_PID_LOG_LOG_DEBUG2 _NL_RAW_PID_LOG
+#else
+#define NL_RAW_PID_LOG_LOG_DEBUG2(_l, _pid, _fmt, ...)
+#endif
+#if DEBUG_MAX_LEVEL>=LOG_DEBUG
+#define NL_RAW_PID_LOG_LOG_DEBUG _NL_RAW_PID_LOG
+#else
+#define NL_RAW_PID_LOG_LOG_DEBUG(_l, _pid, _fmt, ...)
+#endif
+#if DEBUG_MAX_LEVEL>=LOG_INFO
+#define NL_RAW_PID_LOG_LOG_INFO _NL_RAW_PID_LOG
+#else
+#define NL_RAW_PID_LOG_LOG_INFO(_l, _pid, _fmt, ...)
+#endif
+#define NL_RAW_PID_LOG_LOG_NOTICE _NL_RAW_PID_LOG
+#define NL_RAW_PID_LOG_LOG_ERR _NL_RAW_PID_LOG
+#define NL_RAW_PID_LOG_LOG_WARNING _NL_RAW_PID_LOG
+
+
+
+#endif
Index: sys/netlink/netlink_domain.c
===================================================================
--- /dev/null
+++ sys/netlink/netlink_domain.c
@@ -0,0 +1,526 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ * Copyright (c) 2022 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * This file contains socket and protocol bindings for netlink.
+ */
+
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/domain.h>
+#include <sys/mbuf.h>
+#include <sys/protosw.h>
+#include <sys/ck.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysent.h>
+#include <sys/syslog.h>
+
+#include <net/if.h>
+#include <net/netisr.h>
+
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_var.h>
+
+#define DEBUG_MOD_NAME nl_domain
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <net/route/route_debug.h>
+_DECLARE_DEBUG(LOG_DEBUG3);
+
+
+/*
+ * Buffer sizes passed to soreserve() when a netlink socket is attached.
+ * Both are writable at run time through the net.netlink sysctl node.
+ */
+static u_long nl_sendspace = NLSNDQ;
+SYSCTL_ULONG(_net_netlink, OID_AUTO, sendspace, CTLFLAG_RW, &nl_sendspace, 0,
+ "Default netlink socket send space");
+
+/* NOTE(review): the receive default is also taken from NLSNDQ - confirm this is intended. */
+static u_long nl_recvspace = NLSNDQ;
+SYSCTL_ULONG(_net_netlink, OID_AUTO, recvspace, CTLFLAG_RW, &nl_recvspace, 0,
+ "Default netlink socket receive space");
+
+/*
+ * Finds the nlpcb bound to @port_id by walking the per-VNET list of bound
+ * ports.  The callers in this file hold the CTL lock (read or write) around
+ * the lookup.
+ * Returns the matching nlpcb, or NULL when no socket owns the port.
+ */
+static struct nlpcb *
+nl_port_lookup(uint32_t port_id)
+{
+	struct nlpcb *cur, *found = NULL;
+
+	CK_LIST_FOREACH(cur, &V_nl_ctl->ctl_port_head, nl_port_next) {
+		if (cur->nl_port == port_id) {
+			found = cur;
+			break;
+		}
+	}
+	return (found);
+}
+
+/*
+ * Replaces the group mask of @nlp with @nl_groups, logging the old and new
+ * values.  Takes no lock itself; callers in this file hold the CTL write lock.
+ */
+static void
+nl_update_groups_locked(struct nlpcb *nlp, uint32_t nl_groups)
+{
+ /* Update group mask */
+ RT_LOG(LOG_DEBUG2, "socket %p, groups 0x%X -> 0x%X",
+ nlp->nl_socket, nlp->nl_groups, nl_groups);
+ nlp->nl_groups = nl_groups;
+}
+
+/*
+ * Picks a candidate port id for a socket that binds with nl_pid == 0.
+ *
+ * An application can open multiple netlink sockets.  Start with the current
+ * pid; if that is already taken, make up to 16 attempts at a random id in
+ * the [256k, 256k + 64k) range, which is intended to avoid clashing with
+ * pids.  If every attempt collides the current pid is returned anyway, and
+ * the caller's own lookup is what rejects it.
+ *
+ * Uses nl_port_lookup(), so the port list must not change during the call.
+ */
+static uint32_t
+nl_find_port(void)
+{
+	if (nl_port_lookup(curproc->p_pid) == NULL)
+		return (curproc->p_pid);
+	for (int i = 0; i < 16; i++) {
+		uint32_t nl_port = (arc4random() % 65536) + 65536 * 4;
+		if (nl_port_lookup(nl_port) == NULL)
+			return (nl_port);
+		/* No trailing newline: every other RT_LOG() call in this file omits it. */
+		RT_LOG(LOG_DEBUG3, "tried %u", nl_port);
+	}
+	return (curproc->p_pid);
+}
+
+/*
+ * Binds @nlp to the port and group mask requested in @snl.
+ *
+ * An unbound socket gets snl->nl_pid (or a port from nl_find_port() when
+ * that is 0, written back into @snl) and is linked into the bound-port list;
+ * EADDRINUSE is returned if the port already has an owner.  An already bound
+ * socket may only repeat its current port, otherwise EINVAL is returned.
+ * On success the group mask is replaced by snl->nl_groups.
+ */
+static int
+nl_bind_locked(struct nlpcb *nlp, struct sockaddr_nl *snl)
+{
+	if (!nlp->nl_active) {
+		/* First bind: choose a port if none was given, then claim it. */
+		if (snl->nl_pid == 0)
+			snl->nl_pid = nl_find_port();
+		if (nl_port_lookup(snl->nl_pid) != NULL)
+			return (EADDRINUSE);
+		nlp->nl_port = snl->nl_pid;
+		nlp->nl_active = true;
+		CK_LIST_INSERT_HEAD(&V_nl_ctl->ctl_port_head, nlp, nl_port_next);
+	} else if (nlp->nl_port != snl->nl_pid) {
+		RT_LOG(LOG_DEBUG,
+		    "bind() failed: program pid %d "
+		    "is different from provided pid %d",
+		    nlp->nl_port, snl->nl_pid);
+		return (EINVAL); // XXX: better error
+	}
+
+	nl_update_groups_locked(nlp, snl->nl_groups);
+	return (0);
+}
+
+/*
+ * pru_attach handler: allocates and initialises the nlpcb for a new netlink
+ * socket and links it into the per-VNET pcb list.
+ *
+ * Returns EAFNOSUPPORT while netlink_unloading is set, the error reported by
+ * nl_verify_proto() or soreserve(), or 0 on success.
+ */
+static int
+nl_pru_attach(struct socket *so, int proto, struct thread *td)
+{
+ struct nlpcb *nlp;
+ int error;
+
+ /* Remember whether the creating process runs under the Linux ABI. */
+ bool is_linux = SV_PROC_ABI(td->td_proc) == SV_ABI_LINUX;
+ RT_LOG(LOG_DEBUG2, "socket %p, PID %d%s: attaching socket to netlink proto %d",
+ so, curproc->p_pid, is_linux ? "(linux)" : "", proto);
+
+ if (__predict_false(netlink_unloading != 0))
+ return (EAFNOSUPPORT);
+
+ /* Create per-VNET state on first socket init */
+ if (V_nl_ctl == NULL)
+ vnet_nl_ctl_init();
+ KASSERT(V_nl_ctl != NULL, ("nl_attach: vnet_sock_init() failed"));
+ MPASS(sotonlpcb(so) == NULL);
+
+ error = nl_verify_proto(proto);
+ if (error != 0)
+ return (error);
+
+ /* The pcb is not yet visible to anyone, so a failed reservation just frees it. */
+ nlp = malloc(sizeof(struct nlpcb), M_PCB, M_WAITOK | M_ZERO);
+ error = soreserve(so, nl_sendspace, nl_recvspace);
+ if (error != 0) {
+ free(nlp, M_PCB);
+ return (error);
+ }
+ so->so_pcb = (void *)nlp;
+ nlp->nl_socket = so;
+ nlp->nl_proto = proto;
+ nlp->nl_process_id = curproc->p_pid;
+ nlp->nl_linux = is_linux;
+ NLP_LOCK_INIT(nlp);
+ refcount_init(&nlp->nl_refcount, 1);
+
+ /* Each socket gets its own taskqueue with one thread running nl_taskqueue_handler. */
+ nlp->nl_taskqueue = taskqueue_create("netlink_socket", M_WAITOK,
+ taskqueue_thread_enqueue, &nlp->nl_taskqueue);
+ TASK_INIT(&nlp->nl_task, 0, nl_taskqueue_handler, nlp);
+ taskqueue_start_threads(&nlp->nl_taskqueue, 1, PWAIT,
+ "netlink_socket (PID %u)", nlp->nl_process_id);
+
+ /* Publish the pcb on the list of all netlink sockets of this VNET. */
+ CTL_WLOCK();
+ CK_LIST_INSERT_HEAD(&V_nl_ctl->ctl_pcb_head, nlp, nl_next);
+ CTL_WUNLOCK();
+
+ /* The socket is marked connected right away, before any bind()/connect(). */
+ soisconnected(so);
+
+ return (0);
+}
+
+/* pru_abort handler: only marks the socket disconnected; the pcb is left untouched. */
+static void
+nl_pru_abort(struct socket *so)
+{
+ RT_LOG(LOG_DEBUG2, "socket %p, PID %d", so, curproc->p_pid);
+ MPASS(sotonlpcb(so) != NULL);
+ soisdisconnected(so);
+}
+
+/*
+ * pru_bind handler.  Rejects addresses whose nl_len is not exactly
+ * sizeof(struct sockaddr_nl) with EINVAL; otherwise performs the bind via
+ * nl_bind_locked() under the CTL write lock and the pcb lock and returns its
+ * result.
+ */
+static int
+nl_pru_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+	struct sockaddr_nl *addr = (struct sockaddr_nl *)nam;
+	struct nlpcb *pcb = sotonlpcb(so);
+	int rc;
+
+	RT_LOG(LOG_DEBUG2, "socket %p, PID %d", so, curproc->p_pid);
+	if (addr->nl_len != sizeof(struct sockaddr_nl)) {
+		RT_LOG(LOG_DEBUG, "socket %p, wrong sizeof(), ignoring bind()", so);
+		return (EINVAL);
+	}
+
+	CTL_WLOCK();
+	NLP_LOCK(pcb);
+	rc = nl_bind_locked(pcb, addr);
+	NLP_UNLOCK(pcb);
+	CTL_WUNLOCK();
+
+	RT_LOG(LOG_DEBUG2, "socket %p, bind() to %u, groups %u, error %d", so,
+	    addr->nl_pid, addr->nl_groups, rc);
+	return (rc);
+}
+
+
+/*
+ * Binds @nlp to @port_id while keeping its current group mask, by building a
+ * temporary sockaddr_nl and handing it to nl_bind_locked() under the CTL
+ * write lock and the pcb lock.  Returns the nl_bind_locked() result.
+ */
+static int
+nl_assign_port(struct nlpcb *nlp, uint32_t port_id)
+{
+	struct sockaddr_nl bind_req = {
+		.nl_pid = port_id,
+	};
+	int rc;
+
+	CTL_WLOCK();
+	NLP_LOCK(nlp);
+	/* The group mask is read under the locks so the bind does not alter it. */
+	bind_req.nl_groups = nlp->nl_groups;
+	rc = nl_bind_locked(nlp, &bind_req);
+	NLP_UNLOCK(nlp);
+	CTL_WUNLOCK();
+
+	RT_LOG(LOG_DEBUG2, "socket %p, port assign: %d, error: %d", nlp->nl_socket, port_id, rc);
+	return (rc);
+}
+
+/*
+ * nl_autobind_port binds an unused port id to @nlp.
+ * @nlp: pcb data for the netlink socket
+ * @candidate_id: first id to consider
+ *
+ * Tries up to 10 consecutive ids starting at @candidate_id.  An id is first
+ * probed under the CTL read lock and then claimed via nl_assign_port(); a
+ * claim that loses the race (EADDRINUSE) moves on to the next id.
+ * Returns 0 on success, EADDRINUSE when every candidate was taken, or the
+ * first other error reported by nl_assign_port().
+ */
+static int
+nl_autobind_port(struct nlpcb *nlp, uint32_t candidate_id)
+{
+	uint32_t port_id = candidate_id;
+	CTL_TRACKER;
+	bool exist;
+	/* Result when all candidates are occupied and no assignment is attempted. */
+	int error = EADDRINUSE;
+
+	for (int i = 0; i < 10; i++) {
+		RT_LOG(LOG_DEBUG3, "socket %p, trying to assign port %d", nlp->nl_socket, port_id);
+		CTL_RLOCK();
+		exist = nl_port_lookup(port_id) != NULL;
+		CTL_RUNLOCK();
+		if (!exist) {
+			error = nl_assign_port(nlp, port_id);
+			if (error != EADDRINUSE)
+				break;
+		}
+		port_id++;
+	}
+	RT_LOG(LOG_DEBUG2, "socket %p, autobind to %d, error: %d", nlp->nl_socket, port_id, error);
+	return (error);
+}
+
+/*
+ * pru_connect handler.  Rejects addresses whose nl_len is not exactly
+ * sizeof(struct sockaddr_nl) with EINVAL.  A socket that has not been bound
+ * yet is auto-bound, starting with the pid of the calling process; a failure
+ * there is returned to the caller.  The destination in @nam is only logged.
+ */
+static int
+nl_pru_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+	struct sockaddr_nl *snl = (struct sockaddr_nl *)nam;
+	struct nlpcb *nlp;
+	int error;
+
+	RT_LOG(LOG_DEBUG2, "socket %p, PID %d", so, curproc->p_pid);
+	if (snl->nl_len != sizeof(*snl)) {
+		RT_LOG(LOG_DEBUG, "socket %p, wrong sizeof(), ignoring connect()", so);
+		return (EINVAL);
+	}
+
+	nlp = sotonlpcb(so);
+	if (!nlp->nl_active) {
+		error = nl_autobind_port(nlp, td->td_proc->p_pid);
+		if (error != 0) {
+			RT_LOG(LOG_DEBUG, "socket %p, nl_autobind() failed: %d", so, error);
+			return (error);
+		}
+	}
+	/* XXX: Handle socket flags & multicast */
+	soisconnected(so);
+
+	RT_LOG(LOG_DEBUG2, "socket %p, connect to %u", so, snl->nl_pid);
+
+	return (0);
+}
+
+/*
+ * Final release of a pcb: destroys its lock and frees the memory.
+ * NOTE(review): the lock is destroyed while held - confirm the lock type
+ * behind NLP_LOCK permits that.
+ */
+static void
+destroy_socket(struct nlpcb *nlp)
+{
+ NLP_LOCK(nlp);
+ NLP_LOCK_DESTROY(nlp);
+ free(nlp, M_PCB);
+}
+
+/*
+ * epoch_call() callback: recovers the pcb that embeds @ctx as its
+ * nl_epoch_ctx member and releases it.
+ */
+static void
+destroy_socket_epoch(epoch_context_t ctx)
+{
+	destroy_socket(__containerof(ctx, struct nlpcb, nl_epoch_ctx));
+}
+
+
+/*
+ * pru_detach handler: tears down the pcb of @so.
+ *
+ * Order of operations: mark the pcb inactive, drain and free its taskqueue,
+ * unlink it from the port list (if it was bound) and from the pcb list,
+ * clear the socket<->pcb pointers, and finally defer the actual free through
+ * epoch_call() on net_epoch_preempt.
+ */
+static void
+nl_pru_detach(struct socket *so)
+{
+ MPASS(sotonlpcb(so) != NULL);
+ struct nlpcb *nlp;
+
+ RT_LOG(LOG_DEBUG2, "socket %p, PID %d", so, curproc->p_pid);
+ nlp = sotonlpcb(so);
+
+ /* Mark as inactive so no new work can be enqueued */
+ NLP_LOCK(nlp);
+ /* Remember the bound state: it decides below whether the port entry must be unlinked. */
+ bool was_active = nlp->nl_active;
+ nlp->nl_active = false;
+ NLP_UNLOCK(nlp);
+
+ /* Wait till all scheduled work has been completed */
+ taskqueue_drain_all(nlp->nl_taskqueue);
+ taskqueue_free(nlp->nl_taskqueue);
+
+ CTL_WLOCK();
+ NLP_LOCK(nlp);
+ if (was_active) {
+ CK_LIST_REMOVE(nlp, nl_port_next);
+ RT_LOG(LOG_DEBUG2, "socket %p, unlinking bound pid %u", so, nlp->nl_port);
+ }
+ CK_LIST_REMOVE(nlp, nl_next);
+ nlp->nl_socket = NULL;
+ NLP_UNLOCK(nlp);
+ CTL_WUNLOCK();
+
+ so->so_pcb = NULL;
+
+ RT_LOG(LOG_DEBUG2, "socket %p, detached", so);
+
+ /* Freeing is deferred to an epoch callback rather than done inline. */
+ epoch_call(net_epoch_preempt, destroy_socket_epoch, &nlp->nl_epoch_ctx);
+}
+
+/* pru_disconnect handler: performs no action and always returns ENOTCONN. */
+static int
+nl_pru_disconnect(struct socket *so)
+{
+ RT_LOG(LOG_DEBUG2, "socket %p, PID %d", so, curproc->p_pid);
+ MPASS(sotonlpcb(so) != NULL);
+ return (ENOTCONN);
+}
+
+/* pru_peeraddr handler: no peer address is reported; @nam is left untouched and ENOTCONN is returned. */
+static int
+nl_pru_peeraddr(struct socket *so, struct sockaddr **nam)
+{
+ RT_LOG(LOG_DEBUG2, "socket %p, PID %d", so, curproc->p_pid);
+ MPASS(sotonlpcb(so) != NULL);
+ return (ENOTCONN);
+}
+
+/* pru_shutdown handler: marks the socket as unable to send more data and returns 0. */
+static int
+nl_pru_shutdown(struct socket *so)
+{
+ RT_LOG(LOG_DEBUG2, "socket %p, PID %d", so, curproc->p_pid);
+ MPASS(sotonlpcb(so) != NULL);
+ socantsendmore(so);
+ return (0);
+}
+
+/*
+ * pru_sockaddr handler: returns a freshly allocated (M_SONAME), zeroed
+ * sockaddr_nl in *@nam carrying the length, the AF_NETLINK family and the
+ * port the socket is bound to.  Always returns 0.
+ */
+static int
+nl_pru_sockaddr(struct socket *so, struct sockaddr **nam)
+{
+	struct sockaddr_nl *addr;
+
+	addr = malloc(sizeof(*addr), M_SONAME, M_WAITOK | M_ZERO);
+	/* TODO: set other fields */
+	addr->nl_len = sizeof(*addr);
+	addr->nl_family = AF_NETLINK;
+	addr->nl_pid = sotonlpcb(so)->nl_port;
+
+	*nam = (struct sockaddr *)addr;
+	return (0);
+}
+
+/* pru_close handler: only marks the socket disconnected. */
+static void
+nl_pru_close(struct socket *so)
+{
+ RT_LOG(LOG_DEBUG2, "socket %p, PID %d", so, curproc->p_pid);
+ MPASS(sotonlpcb(so) != NULL);
+ soisdisconnected(so);
+}
+
+/*
+ * Hands an outgoing message from @so to the netlink receive path.
+ *
+ * Makes sure the first mbuf holds at least a full struct nlmsghdr, pulling
+ * data up when it does not.  Returns ENOBUFS when @m is NULL or the pull-up
+ * fails; otherwise passes the chain to nl_receive_async() and returns 0.
+ */
+static int
+nl_pru_output(struct mbuf *m, struct socket *so, ...)
+{
+
+	if (__predict_false(m == NULL))
+		return (ENOBUFS);
+	if (__predict_false(m->m_len < sizeof(struct nlmsghdr))) {
+		m = m_pullup(m, sizeof(struct nlmsghdr));
+		if (m == NULL)
+			return (ENOBUFS);
+	}
+	MPASS((m->m_flags & M_PKTHDR) != 0);
+
+	nl_receive_async(m, so);
+	return (0);
+}
+
+
+/*
+ * pru_send handler: forwards the message in @m to nl_pru_output() and
+ * returns its result.  @flags, @nam and @td are not used.
+ *
+ * Ancillary data is not interpreted, but the handler owns the @control chain
+ * once it has been called, so it is released here instead of being leaked.
+ */
+static int
+nl_pru_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
+    struct mbuf *control, struct thread *td)
+{
+	RT_LOG(LOG_DEBUG2, "sending message to kernel");
+	if (control != NULL)
+		m_freem(control);
+	return (nl_pru_output(m, so));
+}
+
+/*
+ * netlink usrreqs
+ * User-request handlers of the netlink protocol; referenced from netlinksw.
+ */
+static struct pr_usrreqs nl_usrreqs = {
+ .pru_abort = nl_pru_abort,
+ .pru_attach = nl_pru_attach,
+ .pru_bind = nl_pru_bind,
+ .pru_connect = nl_pru_connect,
+ .pru_detach = nl_pru_detach,
+ .pru_disconnect = nl_pru_disconnect,
+ .pru_peeraddr = nl_pru_peeraddr,
+ .pru_send = nl_pru_send,
+ .pru_shutdown = nl_pru_shutdown,
+ .pru_sockaddr = nl_pru_sockaddr,
+ .pru_close = nl_pru_close
+};
+
+/*
+ * Socket option handler for netlink sockets.
+ *
+ * SOPT_SET:
+ *   NETLINK_ADD_MEMBERSHIP / NETLINK_DROP_MEMBERSHIP - the int option value
+ *     is OR-ed into / masked out of the socket's group mask.
+ *   NETLINK_CAP_ACK / NETLINK_EXT_ACK - a non-zero int sets the matching
+ *     NLF_* flag in nl_flags, zero clears it.
+ * SOPT_GET:
+ *   NETLINK_LIST_MEMBERSHIPS - copies out the current group mask as an int.
+ *
+ * Returns 0 on success, the sooptcopyin()/sooptcopyout() error when the
+ * option value cannot be transferred, or ENOPROTOOPT for anything else.
+ */
+static int
+nl_ctloutput(struct socket *so, struct sockopt *sopt)
+{
+	struct nlpcb *nlp = sotonlpcb(so);
+	uint32_t flag, groups;
+	int optval, error = 0;
+	CTL_TRACKER;
+
+	RT_LOG(LOG_DEBUG2, "%ssockopt(%p, %d)", (sopt->sopt_dir) ? "set" : "get",
+	    so, sopt->sopt_name);
+
+	switch (sopt->sopt_dir) {
+	case SOPT_SET:
+		switch (sopt->sopt_name) {
+		case NETLINK_ADD_MEMBERSHIP:
+		case NETLINK_DROP_MEMBERSHIP:
+			/* Do not touch the mask with an uninitialised optval. */
+			error = sooptcopyin(sopt, &optval, sizeof(optval),
+			    sizeof(optval));
+			if (error != 0)
+				break;
+
+			CTL_WLOCK();
+			if (sopt->sopt_name == NETLINK_ADD_MEMBERSHIP)
+				groups = nlp->nl_groups | optval;
+			else
+				groups = nlp->nl_groups & ~optval;
+			nl_update_groups_locked(nlp, groups);
+			CTL_WUNLOCK();
+			break;
+		case NETLINK_CAP_ACK:
+		case NETLINK_EXT_ACK:
+			error = sooptcopyin(sopt, &optval, sizeof(optval),
+			    sizeof(optval));
+			if (error != 0)
+				break;
+
+			if (sopt->sopt_name == NETLINK_CAP_ACK)
+				flag = NLF_CAP_ACK;
+			else
+				flag = NLF_EXT_ACK;
+
+			CTL_WLOCK();
+			if (optval != 0)
+				nlp->nl_flags |= flag;
+			else
+				nlp->nl_flags &= ~flag;
+			CTL_WUNLOCK();
+			break;
+		default:
+			error = ENOPROTOOPT;
+		}
+		break;
+	case SOPT_GET:
+		switch (sopt->sopt_name) {
+		case NETLINK_LIST_MEMBERSHIPS:
+			CTL_RLOCK();
+			optval = nlp->nl_groups;
+			CTL_RUNLOCK();
+			error = sooptcopyout(sopt, &optval, sizeof(optval));
+			break;
+		default:
+			error = ENOPROTOOPT;
+		}
+		break;
+	default:
+		error = ENOPROTOOPT;
+	}
+
+	return (error);
+}
+
+/* Tentative definition: netlinksw needs the domain's address before it is initialised below. */
+static struct domain netlinkdomain;
+
+/* Protocol switch table: a single SOCK_RAW entry served by the handlers in this file. */
+static struct protosw netlinksw[] = {
+ {
+ .pr_type = SOCK_RAW,
+ .pr_domain = &netlinkdomain,
+ .pr_flags = PR_ATOMIC | PR_ADDR,
+ .pr_output = nl_pru_output,
+ .pr_ctloutput = nl_ctloutput,
+ .pr_usrreqs = &nl_usrreqs
+ },
+};
+
+/* The PF_NETLINK domain itself; flagged DOMF_UNLOADABLE. */
+static struct domain netlinkdomain = {
+ .dom_family = PF_NETLINK,
+ .dom_name = "netlink",
+ .dom_protosw = netlinksw,
+ .dom_flags = DOMF_UNLOADABLE,
+ /* One-past-the-end pointer of netlinksw. */
+ .dom_protoswNPROTOSW = &netlinksw[sizeof(netlinksw) / sizeof(netlinksw[0])]
+};
+
+DOMAIN_SET(netlink);
Index: sys/netlink/netlink_iface.c
===================================================================
--- /dev/null
+++ sys/netlink/netlink_iface.c
@@ -0,0 +1,532 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/rmlock.h>
+#include <sys/socket.h>
+#include <sys/syslog.h>
+#include <sys/socketvar.h>
+#include <sys/ck.h>
+
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <net/route.h>
+#include <net/route/nhop.h>
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_var.h>
+#include <netlink/netlink_route.h>
+
+#include <netinet6/scope6_var.h> /* scope deembedding */
+
+#define DEBUG_MOD_NAME nl_iface
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <net/route/route_debug.h>
+_DECLARE_DEBUG(LOG_DEBUG3);
+
+
+/*
+ * State carried through a dump request.  The handlers visible in this file
+ * use ns (message writer), hdr (template header for every reply message),
+ * family (address-family filter, 0 = no filter) and the count/dumped
+ * counters; so and rc.rc_cmd are only initialised by them.
+ */
+struct netlink_walkargs {
+ struct nlmsg_state ns;
+ struct rib_cmd_info rc;
+ struct nlmsghdr hdr;
+ struct nlpcb *so; /* requesting socket's pcb */
+ uint32_t fibnum;
+ int family;
+ int error;
+ int count; /* entries visited */
+ int dumped; /* entries successfully written */
+};
+
+/*
+ * Logs the name and numeric value of attribute `a` that could not be written
+ * and jumps to the enclosing function's `error` label, which must exist.
+ */
+#define FAIL_ATTR(a) {\
+ RT_LOG(LOG_DEBUG, "failed writing attribute %s (%d)", #a, a); \
+ goto error; \
+}
+
+/* Registration tags saved by rtnl_ifaces_init() so rtnl_ifaces_destroy() can deregister. */
+static eventhandler_tag ifdetach_event, ifattach_event, ifaddr_event;
+
+/* */
+
+/*
+ * RTM_GETLINK request
+ * sendto(3, {{len=32, type=RTM_GETLINK, flags=NLM_F_REQUEST|NLM_F_DUMP, seq=1641940952, pid=0},
+ * {ifi_family=AF_INET, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}}, 32, 0, NULL, 0) = 32
+ *
+ * Reply:
+ * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_ETHER, ifi_index=if_nametoindex("enp0s31f6"), ifi_flags=IFF_UP|IFF_BROADCAST|IFF_RUNNING|IFF_MULTICAST|IFF_LOWER_UP, ifi_change=0},
+{{nla_len=10, nla_type=IFLA_ADDRESS}, "\xfe\x54\x00\x52\x3e\x90"}
+
+[
+{{nla_len=14, nla_type=IFLA_IFNAME}, "enp0s31f6"},
+{{nla_len=8, nla_type=IFLA_TXQLEN}, 1000},
+{{nla_len=5, nla_type=IFLA_OPERSTATE}, 6},
+{{nla_len=5, nla_type=IFLA_LINKMODE}, 0},
+{{nla_len=8, nla_type=IFLA_MTU}, 1500},
+{{nla_len=8, nla_type=IFLA_MIN_MTU}, 68},
+ {{nla_len=8, nla_type=IFLA_MAX_MTU}, 9000},
+{{nla_len=8, nla_type=IFLA_GROUP}, 0},
+{{nla_len=8, nla_type=IFLA_PROMISCUITY}, 0},
+{{nla_len=8, nla_type=IFLA_NUM_TX_QUEUES}, 1},
+{{nla_len=8, nla_type=IFLA_GSO_MAX_SEGS}, 65535},
+{{nla_len=8, nla_type=IFLA_GSO_MAX_SIZE}, 65536},
+{{nla_len=8, nla_type=IFLA_NUM_RX_QUEUES}, 1},
+{{nla_len=5, nla_type=IFLA_CARRIER}, 1},
+{{nla_len=13, nla_type=IFLA_QDISC}, "fq_codel"},
+{{nla_len=8, nla_type=IFLA_CARRIER_CHANGES}, 2},
+{{nla_len=5, nla_type=IFLA_PROTO_DOWN}, 0},
+{{nla_len=8, nla_type=IFLA_CARRIER_UP_COUNT}, 1},
+{{nla_len=8, nla_type=IFLA_CARRIER_DOWN_COUNT}, 1},
+ */
+
+/* Returns the value reported as ifi_flags; currently if_flags is passed through unchanged. */
+static unsigned
+ifp_flags_to_netlink(const struct ifnet *ifp)
+{
+ return (ifp->if_flags);
+}
+
+/* Start of the link-level address inside a sockaddr_dl: it follows the name in sdl_data. */
+#define LLADDR_CONST(s) ((const void *)((s)->sdl_data + (s)->sdl_nlen))
+/*
+ * Appends the raw address held in @sa to the message as attribute @attr.
+ *
+ * AF_INET contributes the 4-byte address, AF_INET6 the 16-byte address as
+ * produced by in6_splitscope() (the scope id itself is discarded), AF_LINK
+ * the sdl_alen bytes of the link-level address.  A NULL @sa or an
+ * unsupported family is skipped and counts as success.
+ * Returns the nlattr_add() result when an attribute is written.
+ */
+static bool
+dump_sa(struct nlmsg_state *ns, int attr, const struct sockaddr *sa)
+{
+	const struct sockaddr_dl *sdl;
+	struct in6_addr addr6;
+	uint32_t scopeid;
+
+	if (sa == NULL)
+		return (true);
+
+	if (sa->sa_family == AF_INET) {
+		return (nlattr_add(ns, attr, sizeof(struct in_addr),
+		    &((const struct sockaddr_in *)sa)->sin_addr));
+	}
+	if (sa->sa_family == AF_INET6) {
+		in6_splitscope(&((const struct sockaddr_in6 *)sa)->sin6_addr, &addr6, &scopeid);
+		return (nlattr_add(ns, attr, sizeof(struct in6_addr), &addr6));
+	}
+	if (sa->sa_family == AF_LINK) {
+		sdl = (const struct sockaddr_dl *)sa;
+		return (nlattr_add(ns, attr, sdl->sdl_alen, LLADDR_CONST(sdl)));
+	}
+
+	RT_LOG(LOG_DEBUG, "unsupported family: %d, skipping", sa->sa_family);
+	return (true);
+}
+
+/*
+ * Writes one link message describing @ifp into @ns.
+ *
+ * The message header fields (pid, seq, type, flags) are taken from @hdr.
+ * The body is a struct ifinfomsg followed by the IFLA_IFNAME,
+ * IFLA_OPERSTATE, IFLA_CARRIER, IFLA_PROTO_DOWN, IFLA_LINKMODE,
+ * IFLA_ADDRESS (when the interface has a link address), IFLA_MTU and
+ * IFLA_PROMISCUITY attributes.
+ *
+ * Returns true on success.  Returns false when the message cannot be started
+ * or when an attribute does not fit; in the latter case the partially
+ * written message is aborted.
+ */
+static bool
+dump_iface(struct nlmsg_state *ns, struct ifnet *ifp, const struct nlmsghdr *hdr)
+{
+	struct ifinfomsg *ifinfo;
+
+	/* No message has been started on failure, so there is nothing to abort. */
+	if (!nlmsg_add(ns, hdr->nlmsg_pid, hdr->nlmsg_seq, hdr->nlmsg_type,
+	    hdr->nlmsg_flags, sizeof(struct ifinfomsg))) {
+		RT_LOG(LOG_DEBUG, "unable to dump interface %s state (ENOMEM)", if_name(ifp));
+		return (false);
+	}
+
+	ifinfo = nlmsg_reserve_object(ns, struct ifinfomsg);
+	ifinfo->ifi_family = AF_UNSPEC;
+	ifinfo->__ifi_pad = 0;
+	ifinfo->ifi_type = ifp->if_type; // ARPHDR
+	ifinfo->ifi_index = ifp->if_index;
+	ifinfo->ifi_flags = ifp_flags_to_netlink(ifp);
+	ifinfo->ifi_change = 0;
+
+	if (!nlattr_add_string(ns, IFLA_IFNAME, if_name(ifp)))
+		goto error;
+
+	/* The next four attributes are placeholders that always report 0. */
+	uint8_t val = 0; // XXX: operstate?
+	if (!nlattr_add_u8(ns, IFLA_OPERSTATE, val))
+		goto error;
+
+	/* XXX: carrier */
+	if (!nlattr_add_u8(ns, IFLA_CARRIER, val))
+		goto error;
+
+	if (!nlattr_add_u8(ns, IFLA_PROTO_DOWN, val))
+		goto error;
+
+	if (!nlattr_add_u8(ns, IFLA_LINKMODE, val))
+		goto error;
+
+	/* Link addr */
+	if ((ifp->if_addr != NULL)) {
+		if (!dump_sa(ns, IFLA_ADDRESS, ifp->if_addr->ifa_addr))
+			goto error;
+	}
+
+	if (!nlattr_add_u32(ns, IFLA_MTU, ifp->if_mtu))
+		goto error;
+/*
+	if (!nlattr_add_u32(ns, IFLA_MIN_MTU, 60))
+		goto error;
+
+	if (!nlattr_add_u32(ns, IFLA_MAX_MTU, 9000))
+		goto error;
+
+	if (!nlattr_add_u32(ns, IFLA_GROUP, 0))
+		goto error;
+*/
+	if (!nlattr_add_u32(ns, IFLA_PROMISCUITY, 0))
+		goto error;
+
+	nlmsg_end(ns);
+
+	return (true);
+
+error:
+	RT_LOG(LOG_DEBUG, "unable to dump interface %s state (ENOMEM)", if_name(ifp));
+	nlmsg_abort(ns);
+	return (false);
+}
+
+/*
+ * Handles a link dump request: writes one NL_RTM_NEWLINK message (flagged
+ * NLM_F_MULTI) per interface on V_ifnet to the requesting socket, followed
+ * by an NLMSG_DONE message whose int payload carries the dump result.
+ *
+ * @hdr supplies the pid, sequence number and flags echoed in the replies;
+ * @npt is not used.  Returns 0 on success, or ENOMEM when the writer cannot
+ * be obtained, an interface cannot be dumped, or the final message does not
+ * fit.
+ */
+int
+rtnl_handle_getlink(struct nlmsghdr *hdr, struct nlpcb *nlp, struct netlink_parse_tracker *npt)
+{
+	struct ifnet *ifp;
+	int error = 0;
+
+	struct netlink_walkargs wa = {
+		.so = nlp,
+		.rc.rc_cmd = NL_RTM_NEWLINK,
+		.hdr.nlmsg_pid = hdr->nlmsg_pid,
+		.hdr.nlmsg_seq = hdr->nlmsg_seq,
+		.hdr.nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI,
+		.hdr.nlmsg_type = NL_RTM_NEWLINK,
+	};
+
+	if (!nlmsg_get_socket_writer(NLMSG_LARGE, nlp, &wa.ns)) {
+		RT_LOG(LOG_DEBUG, "error allocating mbuf");
+		return (ENOMEM);
+	}
+
+	RT_LOG(LOG_DEBUG, "Start dump");
+
+	/* Stop at the first interface that cannot be written. */
+	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
+		wa.count++;
+		if (!dump_iface(&wa.ns, ifp, &wa.hdr)) {
+			error = ENOMEM;
+			break;
+		}
+		wa.dumped++;
+	}
+
+	RT_LOG(LOG_DEBUG, "End dump, iterated %d dumped %d", wa.count, wa.dumped);
+
+	if (!nlmsg_add(&wa.ns, hdr->nlmsg_pid, hdr->nlmsg_seq, NLMSG_DONE, 0, sizeof(int))) {
+		RT_LOG(LOG_DEBUG, "Unable to write message");
+		return (ENOMEM);
+	}
+	/* report operation result */
+	int *perror = nlmsg_reserve_object(&wa.ns, int);
+	RT_LOG(LOG_DEBUG2, "record error=%d at off %d (%p)", error, wa.ns.offset, perror);
+	*perror = error;
+	nlmsg_end(&wa.ns);
+	nlmsg_flush(&wa.ns);
+
+	return (error);
+}
+
+
+/*
+
+{ifa_family=AF_INET, ifa_prefixlen=8, ifa_flags=IFA_F_PERMANENT, ifa_scope=RT_SCOPE_HOST, ifa_index=if_nametoindex("lo")},
+ [
+ {{nla_len=8, nla_type=IFA_ADDRESS}, inet_addr("127.0.0.1")},
+ {{nla_len=8, nla_type=IFA_LOCAL}, inet_addr("127.0.0.1")},
+ {{nla_len=7, nla_type=IFA_LABEL}, "lo"},
+ {{nla_len=8, nla_type=IFA_FLAGS}, IFA_F_PERMANENT},
+ {{nla_len=20, nla_type=IFA_CACHEINFO}, {ifa_prefered=4294967295, ifa_valid=4294967295, cstamp=3619, tstamp=3619}}]},
+---
+
+{{len=72, type=RTM_NEWADDR, flags=NLM_F_MULTI, seq=1642191126, pid=566735},
+ {ifa_family=AF_INET6, ifa_prefixlen=96, ifa_flags=IFA_F_PERMANENT, ifa_scope=RT_SCOPE_UNIVERSE, ifa_index=if_nametoindex("virbr0")},
+ [
+ {{nla_len=20, nla_type=IFA_ADDRESS}, inet_pton(AF_INET6, "2a01:4f8:13a:70c:ffff::1")},
+ {{nla_len=20, nla_type=IFA_CACHEINFO}, {ifa_prefered=4294967295, ifa_valid=4294967295, cstamp=4283, tstamp=4283}},
+ {{nla_len=8, nla_type=IFA_FLAGS}, IFA_F_PERMANENT}]},
+*/
+
+/*
+ * Derives the scope reported for an interface address:
+ * RT_SCOPE_HOST for loopback addresses, RT_SCOPE_LINK for link-local ones
+ * and RT_SCOPE_UNIVERSE for everything else, including families other than
+ * AF_INET and AF_INET6.
+ */
+static uint8_t
+ifa_get_scope(const struct ifaddr *ifa)
+{
+	const struct sockaddr *sa;
+	uint8_t addr_scope = RT_SCOPE_UNIVERSE;
+
+	sa = ifa->ifa_addr;
+	switch (sa->sa_family) {
+	case AF_INET:
+		{
+			/*
+			 * sin_addr is kept in network byte order, while
+			 * IN_LOOPBACK()/IN_LINKLOCAL() compare against
+			 * host-order constants.
+			 */
+			uint32_t addr;
+			addr = ntohl(((const struct sockaddr_in *)sa)->sin_addr.s_addr);
+			if (IN_LOOPBACK(addr))
+				addr_scope = RT_SCOPE_HOST;
+			else if (IN_LINKLOCAL(addr))
+				addr_scope = RT_SCOPE_LINK;
+			break;
+		}
+	case AF_INET6:
+		{
+			const struct in6_addr *addr;
+			addr = &((const struct sockaddr_in6 *)sa)->sin6_addr;
+			if (IN6_IS_ADDR_LOOPBACK(addr))
+				addr_scope = RT_SCOPE_HOST;
+			else if (IN6_IS_ADDR_LINKLOCAL(addr))
+				addr_scope = RT_SCOPE_LINK;
+			break;
+		}
+	}
+
+	return (addr_scope);
+}
+
+/*
+ * Returns the number of bits set in the IPv6 mask @addr, which is the
+ * prefix length when the mask is contiguous.
+ */
+static uint8_t
+inet6_get_plen(const struct in6_addr *addr)
+{
+	uint8_t plen = 0;
+
+	for (int i = 0; i < 4; i++)
+		plen += bitcount32(addr->s6_addr32[i]);
+	return (plen);
+}
+
+/*
+ * Converts the netmask @sa into a prefix length by counting its set bits.
+ *
+ * Returns 0 for families other than AF_INET / AF_INET6 and for a NULL @sa:
+ * without a sockaddr the family, and hence the full host prefix length,
+ * cannot be determined here.
+ */
+static uint8_t
+get_sa_plen(const struct sockaddr *sa)
+{
+	/* Must be checked before sa_family is read. */
+	if (sa == NULL)
+		return (0);
+
+	switch (sa->sa_family) {
+	case AF_INET:
+		return (bitcount32(((const struct sockaddr_in *)sa)->sin_addr.s_addr));
+	case AF_INET6:
+		return (inet6_get_plen(&((const struct sockaddr_in6 *)sa)->sin6_addr));
+	}
+
+	return (0);
+}
+
+
+/*
+ * {'attrs': [('IFA_ADDRESS', '12.0.0.1'),
+ ('IFA_LOCAL', '12.0.0.1'),
+ ('IFA_LABEL', 'eth10'),
+ ('IFA_FLAGS', 128),
+ ('IFA_CACHEINFO', {'ifa_preferred': 4294967295, 'ifa_valid': 4294967295, 'cstamp': 63745746, 'tstamp': 63745746})],
+ */
+/*
+ * Writes one address message describing @ifa of @ifp into @ns.
+ *
+ * The message header fields (pid, seq, type, flags) are taken from @hdr.
+ * The body is a struct ifaddrmsg followed by IFA_ADDRESS (the destination
+ * address when it exists and has the same family, else the local address),
+ * IFA_LOCAL, IFA_LABEL (interface name) and IFA_FLAGS (always 0).
+ *
+ * Returns true on success.  Returns false when the message cannot be started
+ * or when an attribute does not fit; in the latter case the partially
+ * written message is aborted.
+ */
+static bool
+dump_iface_addr(struct nlmsg_state *ns, struct ifnet *ifp, struct ifaddr *ifa,
+    const struct nlmsghdr *hdr)
+{
+	struct ifaddrmsg *ifamsg;
+	struct sockaddr *sa = ifa->ifa_addr;
+
+	RT_LOG(LOG_DEBUG3, "dumping ifa %p type %s(%d) for interface %s",
+	    ifa, rib_print_family(sa->sa_family), sa->sa_family, if_name(ifp));
+
+	/* No message has been started on failure, so there is nothing to abort. */
+	if (!nlmsg_add(ns, hdr->nlmsg_pid, hdr->nlmsg_seq, hdr->nlmsg_type,
+	    hdr->nlmsg_flags, sizeof(struct ifaddrmsg))) {
+		RT_LOG(LOG_DEBUG, "Failed to dump ifa type %s(%d) for interface %s",
+		    rib_print_family(sa->sa_family), sa->sa_family, if_name(ifp));
+		return (false);
+	}
+
+	ifamsg = nlmsg_reserve_object(ns, struct ifaddrmsg);
+	ifamsg->ifa_family = sa->sa_family;
+	ifamsg->ifa_prefixlen = get_sa_plen(ifa->ifa_netmask);
+	ifamsg->ifa_flags = 0; // ifa_flags is useless
+	ifamsg->ifa_scope = ifa_get_scope(ifa);
+	ifamsg->ifa_index = ifp->if_index;
+
+	struct sockaddr *dst_sa = ifa->ifa_dstaddr;
+	if ((dst_sa == NULL) || (dst_sa->sa_family != sa->sa_family))
+		dst_sa = sa;
+	if (!dump_sa(ns, IFA_ADDRESS, dst_sa))
+		FAIL_ATTR(IFA_ADDRESS);
+	if (!dump_sa(ns, IFA_LOCAL, sa))
+		FAIL_ATTR(IFA_LOCAL);
+
+	if (!nlattr_add_string(ns, IFA_LABEL, if_name(ifp)))
+		FAIL_ATTR(IFA_LABEL);
+	uint32_t val = 0; // ifa->ifa_flags;
+	if (!nlattr_add_u32(ns, IFA_FLAGS, val))
+		FAIL_ATTR(IFA_FLAGS);
+
+	nlmsg_end(ns);
+	return (true);
+error:
+	RT_LOG(LOG_DEBUG, "Failed to dump ifa type %s(%d) for interface %s",
+	    rib_print_family(sa->sa_family), sa->sa_family, if_name(ifp));
+	nlmsg_abort(ns);
+	return (false);
+}
+
+/*
+ * Handles an address dump request: writes one NL_RTM_NEWADDR message
+ * (flagged NLM_F_MULTI) for every non-AF_LINK address of every interface on
+ * V_ifnet, followed by an NLMSG_DONE message whose int payload carries the
+ * dump result.
+ *
+ * @hdr supplies the pid, sequence number and flags echoed in the replies;
+ * @npt is not used.  Returns 0 on success, or ENOMEM when the writer cannot
+ * be obtained, an address cannot be dumped, or the final message does not
+ * fit.
+ */
+int
+rtnl_handle_getaddr(struct nlmsghdr *hdr, struct nlpcb *nlp, struct netlink_parse_tracker *npt)
+{
+	struct ifnet *ifp;
+	struct ifaddr *ifa;
+	int *result;
+	int error = 0;
+
+	struct netlink_walkargs wa = {
+		.so = nlp,
+		.rc.rc_cmd = NL_RTM_NEWADDR,
+		.hdr.nlmsg_pid = hdr->nlmsg_pid,
+		.hdr.nlmsg_seq = hdr->nlmsg_seq,
+		.hdr.nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI,
+		.hdr.nlmsg_type = NL_RTM_NEWADDR,
+	};
+
+	if (!nlmsg_get_socket_writer(NLMSG_LARGE, nlp, &wa.ns)) {
+		RT_LOG(LOG_DEBUG, "error allocating mbuf");
+		return (ENOMEM);
+	}
+
+	RT_LOG(LOG_DEBUG, "Start dump");
+
+	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
+		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+			const int af = ifa->ifa_addr->sa_family;
+
+			/* Skip link-level entries and families filtered out by the request. */
+			if (af == AF_LINK || (wa.family != 0 && wa.family != af))
+				continue;
+			wa.count++;
+			if (dump_iface_addr(&wa.ns, ifp, ifa, &wa.hdr)) {
+				wa.dumped++;
+				continue;
+			}
+			error = ENOMEM;
+			break;
+		}
+		/* A failed address ends the whole walk, not just this interface. */
+		if (error != 0)
+			break;
+	}
+
+	RT_LOG(LOG_DEBUG, "End dump, iterated %d dumped %d", wa.count, wa.dumped);
+
+	if (!nlmsg_add(&wa.ns, hdr->nlmsg_pid, hdr->nlmsg_seq, NLMSG_DONE, 0, sizeof(int))) {
+		RT_LOG(LOG_DEBUG, "Unable to write message");
+		return (ENOMEM);
+	}
+	result = nlmsg_reserve_object(&wa.ns, int);
+	RT_LOG(LOG_DEBUG2, "record error=%d at off %d (%p)", error, wa.ns.offset, result);
+	*result = error;
+	nlmsg_end(&wa.ns);
+	nlmsg_flush(&wa.ns);
+
+	return (error);
+}
+
+/*
+ * rt_addrmsg event handler: announces an address change to the matching
+ * multicast group (RTNLGRP_IPV4_IFADDR or RTNLGRP_IPV6_IFADDR).  The message
+ * type is NL_RTM_DELADDR when @cmd is RTM_DELETE and NL_RTM_NEWADDR
+ * otherwise.  Addresses of other families are logged and ignored.
+ */
+static void
+rtnl_handle_ifaddr(void *arg __unused, struct ifaddr *ifa, int cmd)
+{
+	struct nlmsghdr hdr = {
+		.nlmsg_type = (cmd == RTM_DELETE) ? NL_RTM_DELADDR : NL_RTM_NEWADDR,
+	};
+	struct nlmsg_state ns = {};
+	uint32_t group;
+
+	switch (ifa->ifa_addr->sa_family) {
+	case AF_INET:
+		group = RTNLGRP_IPV4_IFADDR;
+		break;
+	case AF_INET6:
+		group = RTNLGRP_IPV6_IFADDR;
+		break;
+	default:
+		RT_LOG(LOG_DEBUG2, "ifa notification for unknown AF: %d",
+		    ifa->ifa_addr->sa_family);
+		return;
+	}
+
+	if (!nlmsg_get_group_writer(NLMSG_LARGE, group, &ns)) {
+		RT_LOG(LOG_DEBUG, "error allocating mbuf");
+		return;
+	}
+
+	/* The writer is flushed whether or not the address could be dumped. */
+	dump_iface_addr(&ns, ifa->ifa_ifp, ifa, &hdr);
+	nlmsg_flush(&ns);
+}
+
+/*
+ * ifnet_arrival_event handler: sends an NL_RTM_NEWLINK message describing
+ * @ifp to the RTNLGRP_LINK group.  @arg is not used.
+ */
+static void
+rtnl_handle_ifattach(void *arg, struct ifnet *ifp)
+{
+ struct nlmsghdr hdr = { .nlmsg_type = NL_RTM_NEWLINK };
+ struct nlmsg_state ns = {};
+
+ if (!nlmsg_get_group_writer(NLMSG_LARGE, RTNLGRP_LINK, &ns)) {
+ RT_LOG(LOG_DEBUG, "error allocating mbuf");
+ return;
+ }
+ /* The dump result is ignored; the writer is flushed either way. */
+ dump_iface(&ns, ifp, &hdr);
+ nlmsg_flush(&ns);
+}
+
+/*
+ * ifnet_departure_event handler: sends an NL_RTM_DELLINK message describing
+ * @ifp to the RTNLGRP_LINK group.  @arg is not used.
+ */
+static void
+rtnl_handle_ifdetach(void *arg, struct ifnet *ifp)
+{
+ struct nlmsghdr hdr = { .nlmsg_type = NL_RTM_DELLINK };
+ struct nlmsg_state ns = {};
+
+ if (!nlmsg_get_group_writer(NLMSG_LARGE, RTNLGRP_LINK, &ns)) {
+ RT_LOG(LOG_DEBUG, "error allocating mbuf");
+ return;
+ }
+ /* The dump result is ignored; the writer is flushed either way. */
+ dump_iface(&ns, ifp, &hdr);
+ nlmsg_flush(&ns);
+}
+
+/*
+ * Registers the interface arrival, interface departure and address-change
+ * event handlers and stores their tags for rtnl_ifaces_destroy().
+ */
+void
+rtnl_ifaces_init(void)
+{
+ ifattach_event = EVENTHANDLER_REGISTER(
+ ifnet_arrival_event, rtnl_handle_ifattach, NULL,
+ EVENTHANDLER_PRI_ANY);
+ ifdetach_event = EVENTHANDLER_REGISTER(
+ ifnet_departure_event, rtnl_handle_ifdetach, NULL,
+ EVENTHANDLER_PRI_ANY);
+ ifaddr_event = EVENTHANDLER_REGISTER(
+ rt_addrmsg, rtnl_handle_ifaddr, NULL,
+ EVENTHANDLER_PRI_ANY);
+}
+
+/* Deregisters the three event handlers installed by rtnl_ifaces_init(). */
+void
+rtnl_ifaces_destroy(void)
+{
+ EVENTHANDLER_DEREGISTER(ifnet_arrival_event, ifattach_event);
+ EVENTHANDLER_DEREGISTER(ifnet_departure_event, ifdetach_event);
+ EVENTHANDLER_DEREGISTER(rt_addrmsg, ifaddr_event);
+}
Index: sys/netlink/netlink_io.c
===================================================================
--- /dev/null
+++ sys/netlink/netlink_io.c
@@ -0,0 +1,348 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ * Copyright (c) 2022 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/mbuf.h>
+#include <sys/ck.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/syslog.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/netisr.h>
+
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_var.h>
+
+#define DEBUG_MOD_NAME nl_io
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <net/route/route_debug.h>
+_DECLARE_DEBUG(LOG_DEBUG3);
+
+
+/*
+ * Source address attached to every message appended to a socket receive
+ * buffer by nl_send_one(): a netlink sockaddr with nl_pid 0.
+ */
+static struct sockaddr_nl _nl_empty_src = {
+ .nl_len = sizeof(struct sockaddr_nl),
+ .nl_family = PF_NETLINK,
+ .nl_pid = 0 /* comes from the kernel */
+};
+/* Generic sockaddr view of _nl_empty_src, as passed to sbappendaddr(). */
+static struct sockaddr *nl_empty_src = (struct sockaddr *)&_nl_empty_src;
+
+static int nl_receive(struct mbuf *m, struct nlpcb *nlp);
+
+/*
+ * Queues the mbuf chain @m on the netlink pcb of socket @so and, unless a
+ * run is already pending, schedules the per-pcb task that processes the
+ * queue (see nl_taskqueue_handler()).
+ *
+ * Returns 0 if the chain was queued, or EINVAL if the pcb is not active.
+ * @m is consumed in both cases: it is either linked into the queue or freed.
+ */
+int
+nl_receive_async(struct mbuf *m, struct socket *so)
+{
+	struct nlpcb *nlp = sotonlpcb(so);
+
+	m->m_nextpkt = NULL;
+
+	NLP_LOCK(nlp);
+
+	if ((__predict_false(!nlp->nl_active))) {
+		NLP_UNLOCK(nlp);
+		/*
+		 * Release the whole chain: m_free() would free only the
+		 * first mbuf and leak the remainder.
+		 */
+		m_freem(m);
+		return (EINVAL);
+	}
+
+	/* XXX: Implement queue limits */
+	if (nlp->nl_queue_head == NULL) {
+		nlp->nl_queue_head = m;
+		nlp->nl_queue_last = m;
+	} else {
+		/* Packets are linked through m_nextpkt, newest at the tail. */
+		nlp->nl_queue_last->m_nextpkt = m;
+		nlp->nl_queue_last = m;
+	}
+	nlp->nl_queue_length += m_length(m, NULL);
+	RT_LOG(LOG_DEBUG3, "enqueue, total len %ld", nlp->nl_queue_length);
+
+	/* nl_task_pending keeps at most one task enqueue outstanding. */
+	if (!nlp->nl_task_pending) {
+		nlp->nl_task_pending = true;
+		taskqueue_enqueue(nlp->nl_taskqueue, &nlp->nl_task);
+	}
+	NLP_UNLOCK(nlp);
+
+	return (0);
+}
+
+/*
+ * Detaches the whole pending packet queue from @nlp under the pcb lock,
+ * clears the pending-task flag, and then hands every queued packet to
+ * nl_receive() with the lock dropped.
+ */
+static void
+nl_process_received(struct nlpcb *nlp)
+{
+	struct mbuf *pkt, *pkt_next;
+
+	NLP_LOCK(nlp);
+	pkt = nlp->nl_queue_head;
+	nlp->nl_queue_head = NULL;
+	nlp->nl_queue_last = NULL;
+	nlp->nl_queue_length = 0;
+	nlp->nl_task_pending = false;
+	NLP_UNLOCK(nlp);
+
+	RT_LOG(LOG_DEBUG2, "taskqueue called");
+
+	for (; pkt != NULL; pkt = pkt_next) {
+		/* Unlink the packet before passing it on. */
+		pkt_next = pkt->m_nextpkt;
+		pkt->m_nextpkt = NULL;
+		nl_receive(pkt, nlp);
+	}
+}
+
+/*
+ * Task entry point for a netlink pcb (@_arg is the struct nlpcb).
+ * Processes the queued packets inside the socket's vnet context and
+ * within a network epoch section. @pending is unused.
+ */
+void
+nl_taskqueue_handler(void *_arg, int pending)
+{
+ struct nlpcb *nlp = (struct nlpcb *)_arg;
+ struct epoch_tracker et;
+
+ CURVNET_SET(nlp->nl_socket->so_vnet);
+ NET_EPOCH_ENTER(et);
+ nl_process_received(nlp);
+ NET_EPOCH_EXIT(et);
+ CURVNET_RESTORE();
+}
+
+/*
+ * Delivers the mbuf chain @m to the receive buffer of the socket owning
+ * @nlp. For Linux sockets the messages are first rewritten by
+ * mbufs_to_linux().
+ *
+ * Returns true if the data was appended to the socket buffer.
+ * @m is always consumed: on every failure path it is freed here (or by
+ * mbufs_to_linux()), so callers must not touch it afterwards.
+ */
+bool
+nl_send_one(struct mbuf *m, struct nlpcb *nlp)
+{
+#if DEBUG_MAX_LEVEL > LOG_DEBUG2
+	struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *);
+	RT_LOG(LOG_DEBUG2, "TX mbuf len %u msg type %d first hdrlen %u",
+	    m->m_len, hdr->nlmsg_type, hdr->nlmsg_len);
+#endif
+	if (nlp->nl_linux) {
+		m = mbufs_to_linux(m, nlp);
+		if (m == NULL)
+			return (false);
+	}
+	bool result = false;
+	NLP_LOCK(nlp);
+	if (nlp->nl_socket != NULL) {
+		struct socket *so = nlp->nl_socket;
+
+		if (sbappendaddr(&so->so_rcv, nl_empty_src, m, NULL) != 0) {
+			sorwakeup(so);
+			RT_LOG(LOG_DEBUG3, "TX done");
+			result = true;
+		} else {
+			soroverflow(so);
+			m_freem(m);
+			RT_LOG(LOG_DEBUG, "socket RX overflow for PID %u",
+			    nlp->nl_process_id);
+		}
+	} else {
+		/*
+		 * No socket to deliver to. Callers never free @m after
+		 * calling us, so drop it here instead of leaking it.
+		 */
+		m_freem(m);
+	}
+	NLP_UNLOCK(nlp);
+
+	return (result);
+}
+
+/*
+ * Broadcasts the mbuf chain @m to every pcb subscribed to at least one of
+ * the groups in @groups_mask.
+ *
+ * The pcb list is walked under the control read lock. Each matching pcb
+ * except the last receives a copy of @m; the last one receives @m itself,
+ * which saves one copy. If nothing matches, @m is freed.
+ */
+void
+nl_send_group(struct mbuf *m, uint32_t groups_mask)
+{
+ struct nlpcb *nlp_last = NULL;
+ struct nlpcb *nlp;
+ CTL_TRACKER;
+
+#if DEBUG_MAX_LEVEL > LOG_DEBUG2
+ struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *);
+ RT_LOG(LOG_DEBUG2, "MCAST mbuf len %u msg type %d len %u to groups 0x%X",
+ m->m_len, hdr->nlmsg_type, hdr->nlmsg_len, groups_mask);
+#endif
+
+ CTL_RLOCK();
+
+ CK_LIST_FOREACH(nlp, &V_nl_ctl->ctl_pcb_head, nl_next) {
+ if (nlp->nl_groups & groups_mask) {
+ /* Delivery to a match is deferred until the next match is found. */
+ if (nlp_last != NULL) {
+ struct mbuf *m_copy;
+ m_copy = m_copym(m, 0, M_COPYALL, M_NOWAIT);
+ if (m_copy != NULL)
+ nl_send_one(m_copy, nlp_last);
+ else {
+ /* Copy failed: nothing is delivered, the receiver is only woken up. */
+ NLP_LOCK(nlp_last);
+ if (nlp_last->nl_socket != NULL)
+ sorwakeup(nlp_last->nl_socket);
+ NLP_UNLOCK(nlp_last);
+ }
+ }
+ nlp_last = nlp;
+ }
+ }
+ /* The final match gets the original chain. */
+ if (nlp_last != NULL)
+ nl_send_one(m, nlp_last);
+ else
+ m_freem(m);
+
+ CTL_RUNLOCK();
+}
+
+/*
+ * Sends an NLMSG_ERROR reply (ack) for request @nlmsg to the socket @nlp.
+ *
+ * @error is stored in the reply. The reply echoes only the request header,
+ * unless @error is non-zero and NLF_CAP_ACK is not set on the socket, in
+ * which case the whole request is echoed back.
+ *
+ * Failures (writer allocation, header or payload reservation) are logged
+ * and the ack is dropped.
+ */
+void
+nlmsg_ack(struct nlpcb *nlp, int error, struct nlmsghdr *nlmsg)
+{
+	struct nlmsgerr *errmsg;
+	int payload_len;
+	uint32_t flags = nlp->nl_flags;
+	struct nlmsg_state ns = {};
+	bool cap_ack;
+
+	payload_len = sizeof(struct nlmsgerr);
+
+	/*
+	 * The only case when we send the full message in the
+	 * reply is when there is an error and NETLINK_CAP_ACK
+	 * is not set.
+	 */
+	cap_ack = (error == 0) || (flags & NLF_CAP_ACK);
+	if (!cap_ack)
+		payload_len += nlmsg->nlmsg_len - sizeof(struct nlmsghdr);
+
+	/*
+	 * TODO: handle NETLINK_F_EXT_ACK sockopt
+	 * TODO: handle cookies
+	 */
+
+	int sz = payload_len + sizeof(struct nlmsghdr);
+	if (!nlmsg_get_socket_writer(sz, nlp, &ns)) {
+		RT_LOG(LOG_NOTICE, "error allocating nlmsg(%d)", sz);
+		return;
+	}
+
+	RT_LOG(LOG_DEBUG, "type-%d;payload-%d;pid-%d;seq-%d", NLMSG_ERROR, payload_len,
+	    nlp->nl_port, nlmsg->nlmsg_seq);
+
+	if (!nlmsg_add(&ns, nlp->nl_port, nlmsg->nlmsg_seq, NLMSG_ERROR, 0,
+	    payload_len)) {
+		RT_LOG(LOG_NOTICE, "error adding ack header");
+		goto done;
+	}
+
+	/* The reservation can fail; do not write through a NULL pointer. */
+	errmsg = nlmsg_reserve_data(&ns, payload_len, struct nlmsgerr);
+	if (errmsg == NULL) {
+		RT_LOG(LOG_NOTICE, "error reserving ack payload(%d)", payload_len);
+		goto done;
+	}
+	errmsg->error = error;
+	/* In case of error copy the whole message, else just the header */
+	memcpy(&errmsg->msg, nlmsg, cap_ack ? sizeof(*nlmsg) : nlmsg->nlmsg_len);
+
+	nlmsg_end(&ns);
+done:
+	/* nlmsg_flush() sends completed messages only; an unfinished one is skipped. */
+	nlmsg_flush(&ns);
+}
+
+/*
+ * Validates and dispatches a single netlink message.
+ *
+ * @hdr points at the message inside the receive buffer and
+ * @remaining_length is the number of bytes left in that buffer starting
+ * at @hdr. Returns EINVAL if nlmsg_len exceeds the remaining data or is
+ * smaller than the header, and 0 otherwise. Handler errors are reported
+ * to the sender through nlmsg_ack(), not through the return value.
+ */
+static int
+nl_receive_message(struct nlmsghdr *hdr, int remaining_length,
+ struct nlpcb *nlp, struct netlink_parse_tracker *npt)
+{
+ nl_handler handler = nl_handlers[nlp->nl_proto];
+ int error = 0;
+
+ RT_LOG(LOG_DEBUG3, "msg len: %d type: %d", hdr->nlmsg_len, hdr->nlmsg_type);
+
+ if (__predict_false(hdr->nlmsg_len > remaining_length)) {
+ RT_LOG(LOG_DEBUG, "invalid message");
+ return (EINVAL);
+ } else if (__predict_false(hdr->nlmsg_len < sizeof(*hdr))) {
+ RT_LOG(LOG_DEBUG, "message too short: %d", hdr->nlmsg_len);
+ return (EINVAL);
+ }
+ /* Stamp each message with sender pid */
+ hdr->nlmsg_pid = nlp->nl_port;
+
+ /* Only requests with a type of at least NLMSG_MIN_TYPE reach the handler. */
+ if (hdr->nlmsg_flags & NLM_F_REQUEST && hdr->nlmsg_type >= NLMSG_MIN_TYPE) {
+ RT_LOG(LOG_DEBUG2, "handling message with msg type: %d",
+ hdr->nlmsg_type);
+
+ struct nlmsghdr *thdr = hdr;
+ /* Linux sockets get their request rewritten before handling. */
+ if (nlp->nl_linux)
+ thdr = nlmsg_from_linux(hdr, npt);
+ error = handler(thdr, npt);
+ RT_LOG(LOG_DEBUG2, "retcode: %d", error);
+ }
+ /* Ack when requested, or on any error other than EINTR. */
+ if ((hdr->nlmsg_flags & NLM_F_ACK) || (error != 0 && error != EINTR)) {
+ RT_LOG(LOG_DEBUG3, "ack");
+ nlmsg_ack(nlp, error, hdr);
+ RT_LOG(LOG_DEBUG3, "done");
+ }
+
+ return (0);
+}
+
+/*
+ * Processes an incoming packet, which can contain multiple netlink messages.
+ *
+ * The mbuf chain @m is copied into a zeroed linear buffer and freed. The
+ * space after the 8-byte-rounded data is handed to the parse tracker as
+ * scratch memory and is cleared before each message.
+ *
+ * Returns 0, ENOMEM if the buffer cannot be allocated, or the error from
+ * nl_receive_message() that stopped parsing. @m is always consumed.
+ */
+static int
+nl_receive(struct mbuf *m, struct nlpcb *nlp)
+{
+ int offset, buffer_length, error = 0;
+ struct nlmsghdr *hdr;
+ char *buffer;
+
+ RT_LOG(LOG_DEBUG, "RX netlink mbuf %p on %p", m, nlp->nl_socket);
+
+ int data_length = m_length(m, NULL);
+ buffer_length = roundup2(data_length, 8) + SCRATCH_BUFFER_SIZE;
+ /* Linux sockets get one more rounded data length of scratch space. */
+ if (nlp->nl_linux)
+ buffer_length += roundup2(data_length, 8);
+ buffer = malloc(buffer_length, M_NETLINK, M_NOWAIT | M_ZERO);
+ if (buffer == NULL) {
+ m_freem(m);
+ RT_LOG(LOG_DEBUG, "Unable to allocate %d bytes of memory",
+ buffer_length);
+ return (ENOMEM);
+ }
+ m_copydata(m, 0, data_length, buffer);
+ m_freem(m); // XXX: reuse for ack?
+
+ /* Scratch area starts right after the (rounded) message data. */
+ struct netlink_parse_tracker npt = {
+ .nlp = nlp,
+ .lb.base = &buffer[roundup2(data_length, 8)],
+ .lb.size = buffer_length - roundup2(data_length, 8),
+ };
+
+ for (offset = 0; offset + sizeof(struct nlmsghdr) <= data_length;) {
+ hdr = (struct nlmsghdr *)&buffer[offset];
+ /* Save length prior to calling handler */
+ int msglen = NLMSG_ALIGN(hdr->nlmsg_len);
+ RT_LOG(LOG_DEBUG2, "parsing offset %d/%d", offset, data_length);
+ /* Update parse state */
+ npt.hdr = hdr;
+ lb_clear(&npt.lb);
+ error = nl_receive_message(hdr, data_length - offset, nlp, &npt);
+ if (__predict_false(error != 0))
+ break;
+ offset += msglen;
+ }
+ RT_LOG(LOG_DEBUG2, "packet parsing done");
+
+ free(buffer, M_NETLINK);
+ return (error);
+}
Index: sys/netlink/netlink_linux.c
===================================================================
--- /dev/null
+++ sys/netlink/netlink_linux.c
@@ -0,0 +1,482 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/rmlock.h>
+#include <sys/socket.h>
+#include <sys/ck.h>
+
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <net/route.h>
+#include <net/route/nhop.h>
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_var.h>
+#include <netlink/netlink_route.h>
+
+#include <compat/linux/linux.h>
+#include <compat/linux/linux_common.h>
+
+#define DEBUG_MOD_NAME nl_linux
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <net/route/route_debug.h>
+_DECLARE_DEBUG(LOG_DEBUG3);
+
+/*
+ * Maps a Linux address family number to the native one.
+ * Only UNSPEC, UNIX, INET and INET6 are translated; anything else yields -1.
+ */
+static int
+_linux_to_bsd_domain(int domain)
+{
+	int bsd_domain;
+
+	switch (domain) {
+	case LINUX_AF_UNSPEC:
+		bsd_domain = AF_UNSPEC;
+		break;
+	case LINUX_AF_UNIX:
+		bsd_domain = AF_LOCAL;
+		break;
+	case LINUX_AF_INET:
+		bsd_domain = AF_INET;
+		break;
+	case LINUX_AF_INET6:
+		bsd_domain = AF_INET6;
+		break;
+	default:
+		bsd_domain = -1;
+		break;
+	}
+	return (bsd_domain);
+}
+
+/*
+ * Maps a native address family number to the Linux one.
+ * Only UNSPEC, LOCAL, INET and INET6 are translated; anything else yields -1.
+ */
+static int
+_bsd_to_linux_domain(int domain)
+{
+	int linux_domain;
+
+	switch (domain) {
+	case AF_UNSPEC:
+		linux_domain = LINUX_AF_UNSPEC;
+		break;
+	case AF_LOCAL:
+		linux_domain = LINUX_AF_UNIX;
+		break;
+	case AF_INET:
+		linux_domain = LINUX_AF_INET;
+		break;
+	case AF_INET6:
+		linux_domain = LINUX_AF_INET6;
+		break;
+	default:
+		linux_domain = -1;
+		break;
+	}
+	return (linux_domain);
+}
+
+/* Returns true if the data length of @rta, per NL_RTA_DATA_LEN(), equals @sz. */
+static bool
+valid_rta_size(const struct rtattr *rta, int sz)
+{
+ return (NL_RTA_DATA_LEN(rta) == sz);
+}
+
+/* Returns true if @rta carries exactly sizeof(uint32_t) bytes of data. */
+static bool
+valid_rta_u32(const struct rtattr *rta)
+{
+	const int expected_len = sizeof(uint32_t);
+
+	return (valid_rta_size(rta, expected_len));
+}
+
+/*
+ * Reads the data of @rta as a uint32_t. The attribute length is not
+ * checked here.
+ */
+static uint32_t
+nl_rta_get_uint32(const struct rtattr *rta)
+{
+	const uint32_t *valp = (const uint32_t *)NL_RTA_DATA_CONST(rta);
+
+	return (*valp);
+}
+
+/*
+ * Compiled out (#if 0): would rewrite the address family of an incoming
+ * Linux ifaddrmsg in place. Nothing in this file calls it.
+ */
+#if 0
+static struct nlmsghdr *
+rtnl_ifaddr_from_linux(struct nlmsghdr *hdr, struct netlink_parse_tracker *npt)
+{
+ /* Tweak address families and default fib only */
+ struct ifaddrmsg *ifamsg = (struct ifaddrmsg *)(hdr + 1);
+
+ ifamsg->ifa_family = _linux_to_bsd_domain(ifamsg->ifa_family);
+
+ return (hdr);
+}
+#endif
+
+/*
+ * Rewrites an incoming Linux route message in place: translates
+ * rtm_family, maps table 254 to 0 in the rtmsg header, and maps an
+ * NL_RTA_TABLE attribute value of 254 to 0.
+ * Returns @hdr (the message is modified, not copied); @npt is unused.
+ *
+ * NOTE(review): the message length is not checked against
+ * sizeof(struct rtmsg) here; confirm the caller or NLA_FOREACH copes
+ * with a negative attrs_len.
+ * NOTE(review): _linux_to_bsd_domain() returns -1 for unknown families
+ * and that value is stored into rtm_family as-is.
+ */
+static struct nlmsghdr *
+rtnl_route_from_linux(struct nlmsghdr *hdr, struct netlink_parse_tracker *npt)
+{
+ /* Tweak address families and default fib only */
+ struct rtmsg *rtm = (struct rtmsg *)(hdr + 1);
+ struct nlattr *nla, *nla_head;
+ int attrs_len;
+
+ rtm->rtm_family = _linux_to_bsd_domain(rtm->rtm_family);
+
+ if (rtm->rtm_table == 254)
+ rtm->rtm_table = 0;
+
+ /* Attributes follow the aligned rtmsg header. */
+ attrs_len = hdr->nlmsg_len - sizeof(struct nlmsghdr);
+ attrs_len -= NETLINK_ALIGN(sizeof(struct rtmsg));
+ nla_head = (struct nlattr *)((char *)rtm + NETLINK_ALIGN(sizeof(struct rtmsg)));
+
+ NLA_FOREACH(nla, nla_head, attrs_len) {
+ RT_LOG(LOG_DEBUG3, "GOT type %d len %d total %d",
+ nla->nla_type, nla->nla_len, attrs_len);
+ struct rtattr *rta = (struct rtattr *)nla;
+ /* Stop at an attribute shorter than its own header. */
+ if (rta->rta_len < sizeof(struct rtattr)) {
+ break;
+ }
+ switch (rta->rta_type) {
+ case NL_RTA_TABLE:
+ if (!valid_rta_u32(rta))
+ goto done;
+ /* The attribute overrides the table in the rtmsg header. */
+ rtm->rtm_table = 0;
+ uint32_t fibnum = nl_rta_get_uint32(rta);
+ RT_LOG(LOG_DEBUG, "GET RTABLE: %u", fibnum);
+ if (fibnum == 254) {
+ *((uint32_t *)NL_RTA_DATA(rta)) = 0;
+ }
+ break;
+ }
+ }
+
+done:
+ return (hdr);
+}
+
+/*
+ * Linux -> native conversion for NETLINK_ROUTE requests.
+ * Route get/new/delete messages go through rtnl_route_from_linux();
+ * every other message type is returned unchanged.
+ */
+static struct nlmsghdr *
+rtnl_from_linux(struct nlmsghdr *hdr, struct netlink_parse_tracker *npt)
+{
+	const bool is_route_msg =
+	    hdr->nlmsg_type == NL_RTM_GETROUTE ||
+	    hdr->nlmsg_type == NL_RTM_NEWROUTE ||
+	    hdr->nlmsg_type == NL_RTM_DELROUTE;
+
+	if (is_route_msg)
+		return (rtnl_route_from_linux(hdr, npt));
+
+	return (hdr);
+}
+
+/*
+ * Converts a request received on a Linux socket to the native form.
+ * Only NETLINK_ROUTE has a converter; messages of other protocols are
+ * returned unchanged.
+ */
+struct nlmsghdr *
+nlmsg_from_linux(struct nlmsghdr *hdr, struct netlink_parse_tracker *npt)
+{
+	if (npt->nlp->nl_proto == NETLINK_ROUTE)
+		return (rtnl_from_linux(hdr, npt));
+
+	return (hdr);
+}
+
+
+/************************************************************
+ * Kernel -> Linux
+ ************************************************************/
+
+/*
+ * Copies the message at @hdr into @ns unchanged.
+ * Reserves the aligned message length and copies nlmsg_len bytes.
+ * Returns false if the space cannot be reserved.
+ */
+static bool
+handle_default_out(struct nlmsghdr *hdr, struct nlmsg_state *ns)
+{
+	char *dst = nlmsg_reserve_data(ns, NLMSG_ALIGN(hdr->nlmsg_len), char);
+
+	if (dst == NULL)
+		return (false);
+
+	memcpy(dst, hdr, hdr->nlmsg_len);
+	return (true);
+}
+
+/*
+ * Starts a new message in @ns with the pid, sequence number, type and
+ * flags of @hdr, passing 0 as the last nlmsg_add() argument.
+ * Returns the result of nlmsg_add().
+ */
+static bool
+nlmsg_copy_header(struct nlmsghdr *hdr, struct nlmsg_state *ns)
+{
+	bool added = nlmsg_add(ns, hdr->nlmsg_pid, hdr->nlmsg_seq,
+	    hdr->nlmsg_type, hdr->nlmsg_flags, 0);
+
+	return (added);
+}
+
+/*
+ * Reserves @sz bytes in @ns and fills them with the family header that
+ * immediately follows @hdr in the source message.
+ *
+ * Returns a pointer to the copy inside the writer, or NULL if the space
+ * cannot be reserved. Callers must check the result before using it.
+ */
+static void *
+_nlmsg_copy_next_header(struct nlmsghdr *hdr, struct nlmsg_state *ns, int sz)
+{
+	void *next_hdr = nlmsg_reserve_data(ns, sz, void);
+
+	/* The reservation can fail; never memcpy() into NULL. */
+	if (next_hdr == NULL)
+		return (NULL);
+	memcpy(next_hdr, hdr + 1, NLMSG_ALIGN(sz));
+
+	return (next_hdr);
+}
+/* Typed wrapper: copies the family header of type _t that follows _hdr. */
+#define nlmsg_copy_next_header(_hdr, _ns, _t) \
+ ((_t *)(_nlmsg_copy_next_header(_hdr, _ns, sizeof(_t))))
+
+/*
+ * Appends an unchanged copy of attribute @nla_orig (nla_len bytes) to @ns.
+ * Returns false if the space cannot be reserved.
+ */
+static bool
+nlmsg_copy_nla(const struct nlattr *nla_orig, struct nlmsg_state *ns)
+{
+	struct nlattr *nla_copy;
+
+	nla_copy = nlmsg_reserve_data(ns, nla_orig->nla_len, struct nlattr);
+	if (nla_copy == NULL)
+		return (false);
+
+	memcpy(nla_copy, nla_orig, nla_orig->nla_len);
+	return (true);
+}
+
+/*
+ * Copies every attribute of the message at @hdr into @ns unchanged.
+ * @raw_hdrlen is the unaligned size of the family header that sits
+ * between the nlmsghdr and the first attribute.
+ * Returns false as soon as one attribute cannot be copied.
+ */
+static bool
+nlmsg_copy_all_nla(struct nlmsghdr *hdr, int raw_hdrlen, struct nlmsg_state *ns)
+{
+	int family_hdrlen = NETLINK_ALIGN(raw_hdrlen);
+	int total_attr_len = hdr->nlmsg_len - sizeof(struct nlmsghdr) - family_hdrlen;
+	struct nlattr *first = (struct nlattr *)((char *)(hdr + 1) + family_hdrlen);
+	struct nlattr *cur;
+
+	NLA_FOREACH(cur, first, total_attr_len) {
+		if (!nlmsg_copy_nla(cur, ns))
+			return (false);
+	}
+	return (true);
+}
+
+/*
+ * Translates native interface flags to the value reported to Linux
+ * clients.
+ *
+ * IFF_UP, IFF_BROADCAST, IFF_DEBUG, IFF_LOOPBACK, IFF_POINTOPOINT,
+ * IFF_NOARP, IFF_PROMISC and IFF_ALLMULTI are passed through with the
+ * same bit value. IFF_MULTICAST is reported as bit 12. All other flags
+ * (IFF_KNOWSEPOCH, IFF_DRV_RUNNING, IFF_DRV_OACTIVE, IFF_SIMPLEX,
+ * IFF_LINK0..2, IFF_CANTCONFIG, IFF_PPROMISC, IFF_MONITOR,
+ * IFF_STATICARP, IFF_STICKYARP, IFF_DYING, IFF_RENAMING, IFF_NOGROUP and
+ * anything unknown) are dropped.
+ */
+static unsigned int
+rtnl_if_flags_to_linux(unsigned int if_flags)
+{
+	/* Flags that keep their bit position. */
+	const unsigned int same_bits = IFF_UP | IFF_BROADCAST | IFF_DEBUG |
+	    IFF_LOOPBACK | IFF_POINTOPOINT | IFF_NOARP | IFF_PROMISC |
+	    IFF_ALLMULTI;
+	unsigned int result = if_flags & same_bits;
+
+	if ((if_flags & IFF_MULTICAST) != 0)
+		result |= 1 << 12;
+
+	return (result);
+}
+
+/*
+ * Rewrites an NL_RTM_NEWLINK message for a Linux client into @ns.
+ *
+ * Copies the netlink header and the ifinfomsg, translates the address
+ * family, the interface type (IFT_ETHER becomes 1) and the interface
+ * flags, copies all attributes unchanged, and appends IFLA_QDISC and
+ * IFLA_TXQLEN attributes. Returns false if any part cannot be written.
+ * @nlp is unused.
+ */
+static bool
+rtnl_newlink_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp,
+    struct nlmsg_state *ns)
+{
+	if (!nlmsg_copy_header(hdr, ns))
+		return (false);
+
+	struct ifinfomsg *ifinfo;
+	ifinfo = nlmsg_copy_next_header(hdr, ns, struct ifinfomsg);
+	/* Do not dereference a failed reservation. */
+	if (ifinfo == NULL)
+		return (false);
+
+	ifinfo->ifi_family = _bsd_to_linux_domain(ifinfo->ifi_family);
+	/* Convert interface type */
+	switch (ifinfo->ifi_type) {
+	case IFT_ETHER:
+		ifinfo->ifi_type = 1; // ARPHRD_ETHER
+		break;
+	}
+	ifinfo->ifi_flags = rtnl_if_flags_to_linux(ifinfo->ifi_flags);
+
+	/* Copy attributes unchanged */
+	if (!nlmsg_copy_all_nla(hdr, sizeof(struct ifinfomsg), ns))
+		return (false);
+
+	/* make ip(8) happy */
+	if (!nlattr_add_string(ns, IFLA_QDISC, "noqueue"))
+		return (false);
+
+	if (!nlattr_add_u32(ns, IFLA_TXQLEN, 1000))
+		return (false);
+
+	nlmsg_end(ns);
+	RT_LOG(LOG_DEBUG2, "done processing ns %p", ns);
+	return (true);
+}
+
+/*
+ * Rewrites an address message for a Linux client into @ns (called for
+ * both NL_RTM_NEWADDR and NL_RTM_DELADDR).
+ *
+ * Copies the netlink header and the ifaddrmsg, translates the address
+ * family and copies all attributes unchanged. Returns false if any part
+ * cannot be written. @nlp is unused.
+ */
+static bool
+rtnl_newaddr_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp,
+    struct nlmsg_state *ns)
+{
+	if (!nlmsg_copy_header(hdr, ns))
+		return (false);
+
+	struct ifaddrmsg *ifamsg;
+	ifamsg = nlmsg_copy_next_header(hdr, ns, struct ifaddrmsg);
+	/* Do not dereference a failed reservation. */
+	if (ifamsg == NULL)
+		return (false);
+
+	int old_family = ifamsg->ifa_family;
+	ifamsg->ifa_family = _bsd_to_linux_domain(ifamsg->ifa_family);
+	RT_LOG(LOG_DEBUG2, "CONVERT FAMILY %d -> %d", old_family, ifamsg->ifa_family);
+	/* XXX: fake ifa_flags? */
+
+	/* Copy attributes unchanged */
+	if (!nlmsg_copy_all_nla(hdr, sizeof(struct ifaddrmsg), ns))
+		return (false);
+
+	nlmsg_end(ns);
+	RT_LOG(LOG_DEBUG2, "done processing ns %p", ns);
+	return (true);
+}
+
+/*
+ * Rewrites an NL_RTM_NEWROUTE message for a Linux client into @ns.
+ *
+ * Copies the netlink header and the rtmsg, translates the address
+ * family, and copies the attributes. An NL_RTA_TABLE attribute is
+ * re-emitted with table 0 mapped to 254; all other attributes are copied
+ * unchanged. Returns false if any part cannot be written.
+ * @nlp is unused.
+ */
+static bool
+rtnl_newroute_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp,
+    struct nlmsg_state *ns)
+{
+	if (!nlmsg_copy_header(hdr, ns))
+		return (false);
+
+	struct rtmsg *rtm;
+	rtm = nlmsg_copy_next_header(hdr, ns, struct rtmsg);
+	/* Do not dereference a failed reservation. */
+	if (rtm == NULL)
+		return (false);
+	rtm->rtm_family = _bsd_to_linux_domain(rtm->rtm_family);
+
+	struct nlattr *nla;
+
+	int hdrlen = NETLINK_ALIGN(sizeof(struct rtmsg));
+	int attrs_len = hdr->nlmsg_len - sizeof(struct nlmsghdr) - hdrlen;
+	struct nlattr *nla_head = (struct nlattr *)((char *)(hdr + 1) + hdrlen);
+
+	NLA_FOREACH(nla, nla_head, attrs_len) {
+		struct rtattr *rta = (struct rtattr *)nla;
+
+		switch (rta->rta_type) {
+		case NL_RTA_TABLE:
+			{
+				uint32_t fibnum;
+				fibnum = nl_rta_get_uint32(rta);
+				if (fibnum == 0)
+					fibnum = 254;
+				RT_LOG(LOG_DEBUG3, "FIBNUM %u", fibnum);
+				if (!nlattr_add_u32(ns, NL_RTA_TABLE, fibnum))
+					return (false);
+			}
+			break;
+		default:
+			if (!nlmsg_copy_nla(nla, ns))
+				return (false);
+			break;
+		}
+	}
+
+	nlmsg_end(ns);
+	RT_LOG(LOG_DEBUG2, "done processing ns %p", ns);
+	return (true);
+}
+
+/*
+ * Native -> Linux conversion for one NETLINK_ROUTE message.
+ * Link, address and new-route messages have dedicated converters;
+ * everything else is copied verbatim by handle_default_out().
+ */
+static bool
+rtnl_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nlmsg_state *ns)
+{
+	bool converted;
+
+	RT_LOG(LOG_DEBUG2, "Got message type %d", hdr->nlmsg_type);
+
+	switch (hdr->nlmsg_type) {
+	case NL_RTM_NEWLINK:
+		converted = rtnl_newlink_to_linux(hdr, nlp, ns);
+		break;
+	case NL_RTM_NEWADDR:
+	case NL_RTM_DELADDR:
+		/* Add and delete share the same ifaddrmsg layout. */
+		converted = rtnl_newaddr_to_linux(hdr, nlp, ns);
+		break;
+	case NL_RTM_NEWROUTE:
+		converted = rtnl_newroute_to_linux(hdr, nlp, ns);
+		break;
+	default:
+		converted = handle_default_out(hdr, ns);
+		break;
+	}
+	return (converted);
+}
+
+/*
+ * Converts one outgoing message for a Linux client, selecting the
+ * converter by the socket's netlink protocol. Protocols other than
+ * NETLINK_ROUTE get a verbatim copy.
+ */
+static bool
+nlmsg_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nlmsg_state *ns)
+{
+	if (nlp->nl_proto == NETLINK_ROUTE)
+		return (rtnl_to_linux(hdr, nlp, ns));
+
+	return (handle_default_out(hdr, ns));
+}
+
+/*
+ * Converts the native netlink messages stored in @buf (@data_length
+ * bytes) to their Linux form and returns them as a new mbuf chain.
+ *
+ * Returns NULL if the chain writer cannot be set up or if any message
+ * fails to convert. @buf is not freed here; it stays owned by the caller.
+ *
+ * NOTE(review): on the conversion-failure path only the chain is freed;
+ * nlmsg_free() is not called for the writer's current buffer. Confirm
+ * this does not leak.
+ * NOTE(review): a message with nlmsg_len == 0 would not advance offset;
+ * the loop relies on the headers being correct.
+ */
+struct mbuf *
+nlmsgs_to_linux(char *buf, int data_length, struct nlpcb *nlp)
+{
+ RT_LOG(LOG_DEBUG, "LINUX: get %p size %d", buf, data_length);
+ struct nlmsg_state ns = {};
+
+ struct mbuf *m = NULL;
+ if (!nlmsg_get_chain_writer(data_length, &m, &ns)) {
+ RT_LOG(LOG_DEBUG, "unable to setup chain writer for size %d",
+ data_length);
+ return (NULL);
+ }
+
+ /* Assume correct headers. Buffer IS mutable */
+ int count = 0;
+ for (int offset = 0; offset + sizeof(struct nlmsghdr) <= data_length;) {
+ struct nlmsghdr *hdr = (struct nlmsghdr *)&buf[offset];
+ int msglen = NLMSG_ALIGN(hdr->nlmsg_len);
+ count++;
+
+ if (!nlmsg_to_linux(hdr, nlp, &ns)) {
+ RT_LOG(LOG_DEBUG, "failed to process msg type %d",
+ hdr->nlmsg_type);
+ m_freem(m);
+ return (NULL);
+ }
+ offset += msglen;
+ }
+ /* Push the remaining data into the chain, then release the writer. */
+ nlmsg_flush(&ns);
+ nlmsg_free(&ns);
+ RT_LOG(LOG_DEBUG2, "Processed %d messages, chain size %d", count, m ? m_length(m, NULL) : 0);
+
+ return (m);
+}
+
+/*
+ * Converts the native netlink messages in the mbuf chain @m to their
+ * Linux form.
+ *
+ * The chain is linearized into a temporary buffer, freed, and the
+ * messages are rewritten by nlmsgs_to_linux(). Returns the new chain,
+ * or NULL on allocation or conversion failure. @m is always consumed.
+ */
+struct mbuf *
+mbufs_to_linux(struct mbuf *m, struct nlpcb *nlp)
+{
+	/* XXX: easiest solution, not optimized for performance */
+	int data_length = m_length(m, NULL);
+	char *buf = malloc(data_length, M_NETLINK, M_NOWAIT);
+	if (buf == NULL) {
+		RT_LOG(LOG_INFO, "unable to allocate %d bytes, dropping message",
+		    data_length);
+		m_freem(m);
+		return (NULL);
+	}
+	m_copydata(m, 0, data_length, buf);
+	m_freem(m);
+
+	m = nlmsgs_to_linux(buf, data_length, nlp);
+	/* nlmsgs_to_linux() does not take ownership of the buffer. */
+	free(buf, M_NETLINK);
+
+	return (m);
+}
+
Index: sys/netlink/netlink_message.c
===================================================================
--- /dev/null
+++ sys/netlink/netlink_message.c
@@ -0,0 +1,582 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/mbuf.h>
+#include <sys/ck.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/syslog.h>
+
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_var.h>
+
+#define DEBUG_MOD_NAME nl_message
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <net/route/route_debug.h>
+_DECLARE_DEBUG(LOG_DEBUG3);
+
+
+/* Allocates the backing storage of a writer able to hold @size bytes. */
+typedef bool nlwriter_op_init(struct nlmsg_state *ns, int size, bool waitok);
+/*
+ * Flushes @buflen bytes of written data. @buf is the writer's backing
+ * object (a memory buffer or an mbuf, depending on the writer type).
+ * A call with @buflen == 0 only releases @buf.
+ */
+typedef bool nlwriter_op_write(struct nlmsg_state *ns, char *buf, int buflen);
+
+/*
+ * Per-writer-type operations: one allocator plus one flush callback for
+ * each writer target (socket, group, mbuf chain).
+ */
+struct nlwriter_ops {
+ nlwriter_op_init *init;
+ nlwriter_op_write *write_socket;
+ nlwriter_op_write *write_group;
+ nlwriter_op_write *write_chain;
+};
+
+/*
+ * NS_WRITER_TYPE_BUF
+ * Writes messages to a temporary memory buffer,
+ * flushing to the socket/group when the buffer size limit is reached.
+ *
+ * Allocates a zeroed @size byte buffer and resets the writer state.
+ * Returns false if the allocation fails.
+ */
+static bool
+nlmsg_get_ns_buf(struct nlmsg_state *ns, int size, bool waitok)
+{
+	const int how = waitok ? M_WAITOK : M_NOWAIT;
+
+	ns->_buf = malloc(size, M_NETLINK, how | M_ZERO);
+	if (__predict_false(ns->_buf == NULL))
+		return (false);
+
+	ns->writer_type = NS_WRITER_TYPE_BUF;
+	ns->malloc_flag = how;
+	ns->alloc_len = size;
+	ns->offset = 0;
+	ns->hdr = NULL;
+	ns->data = ns->_buf;
+	return (true);
+}
+
+/*
+ * Flush callback for NS_WRITER_TYPE_BUF writers targeting a socket.
+ *
+ * Copies @datalen bytes from @buf into a freshly allocated mbuf chain
+ * and sends it to the pcb stored in ns->arg. @buf is always freed.
+ * Returns false if the mbuf cannot be allocated or filled, or if
+ * nl_send_one() fails.
+ */
+static bool
+nlmsg_write_socket_buf(struct nlmsg_state *ns, char *buf, int datalen)
+{
+	RT_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, ns);
+	if (__predict_false(datalen == 0)) {
+		free(buf, M_NETLINK);
+		return (true);
+	}
+
+	struct mbuf *m = m_getm2(NULL, datalen, ns->malloc_flag, MT_DATA, M_PKTHDR);
+	if (__predict_false(m == NULL)) {
+		/* XXX: should we set sorcverr? */
+		free(buf, M_NETLINK);
+		return (false);
+	}
+	/* m_append() returns 0 on failure; do not send a truncated message. */
+	bool success = m_append(m, datalen, buf) != 0;
+	free(buf, M_NETLINK);
+
+	if (__predict_false(!success)) {
+		m_freem(m);
+		return (false);
+	}
+
+	return (nl_send_one(m, (struct nlpcb *)(ns->arg)));
+}
+
+/*
+ * Flush callback for NS_WRITER_TYPE_BUF writers targeting a group.
+ *
+ * Copies @datalen bytes from @buf into a freshly allocated mbuf chain
+ * and broadcasts it to the group mask stored in ns->arg. @buf is always
+ * freed. Returns false if the mbuf cannot be allocated or filled.
+ */
+static bool
+nlmsg_write_group_buf(struct nlmsg_state *ns, char *buf, int datalen)
+{
+	RT_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, ns->arg);
+	if (__predict_false(datalen == 0)) {
+		free(buf, M_NETLINK);
+		return (true);
+	}
+
+	struct mbuf *m = m_getm2(NULL, datalen, ns->malloc_flag, MT_DATA, M_PKTHDR);
+	if (__predict_false(m == NULL)) {
+		free(buf, M_NETLINK);
+		return (false);
+	}
+	bool success = m_append(m, datalen, buf) != 0;
+	free(buf, M_NETLINK);
+
+	if (!success) {
+		/* The chain is still ours on failure; release it. */
+		m_freem(m);
+		return (false);
+	}
+
+	nl_send_group(m, (uint32_t)(uintptr_t)(ns->arg));
+	return (true);
+}
+
+/*
+ * Flush callback for NS_WRITER_TYPE_BUF writers targeting an mbuf chain.
+ *
+ * Appends @datalen bytes from @buf to the chain whose head pointer is
+ * stored in ns->arg, allocating the chain on first use. @buf is always
+ * freed, as in the other buffer-based callbacks. Returns false if the
+ * mbuf cannot be allocated or the data cannot be appended; the chain
+ * itself is left to the owner of the head pointer.
+ */
+static bool
+nlmsg_write_chain_buf(struct nlmsg_state *ns, char *buf, int datalen)
+{
+	struct mbuf **m0 = (struct mbuf **)(ns->arg);
+	RT_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, ns->arg);
+
+	if (__predict_false(datalen == 0)) {
+		free(buf, M_NETLINK);
+		return (true);
+	}
+
+	if (*m0 == NULL) {
+		struct mbuf *m;
+
+		m = m_getm2(NULL, datalen, ns->malloc_flag, MT_DATA, M_PKTHDR);
+		if (__predict_false(m == NULL)) {
+			free(buf, M_NETLINK);
+			return (false);
+		}
+		*m0 = m;
+	}
+	bool success = m_append(*m0, datalen, buf) != 0;
+	/* The data has been copied (or dropped); the buffer is no longer needed. */
+	free(buf, M_NETLINK);
+
+	return (success);
+}
+
+
+/*
+ * NS_WRITER_TYPE_MBUF
+ * Writes messages directly into an allocated mbuf,
+ * flushing to the socket/group when the mbuf size limit is reached.
+ * This is the most efficient mechanism as it avoids double-copying.
+ *
+ * Allocates a single mbuf with a packet header via m_get2(), sized for
+ * up to @size bytes of data; nlmsg_get_buf() selects this writer only for
+ * size <= MCLBYTES. The usable length is the trailing space of the
+ * allocated mbuf, which may exceed @size.
+ * Returns false if the mbuf cannot be allocated.
+ */
+static bool
+nlmsg_get_ns_mbuf(struct nlmsg_state *ns, int size, bool waitok)
+{
+ struct mbuf *m;
+
+ int mflag = waitok ? M_WAITOK : M_NOWAIT;
+ m = m_get2(size, mflag, MT_DATA, M_PKTHDR);
+ if (__predict_false(m == NULL))
+ return (false);
+ ns->alloc_len = M_TRAILINGSPACE(m);
+ ns->offset = 0;
+ ns->hdr = NULL;
+ ns->_m = m;
+ ns->data = mtod(m, void *);
+ ns->writer_type = NS_WRITER_TYPE_MBUF;
+ ns->malloc_flag = mflag;
+ RT_LOG(LOG_DEBUG2, "alloc mbuf %p req_len %d alloc_len %d data_ptr %p",
+ m, size, ns->alloc_len, ns->data);
+ return (true);
+}
+
+/*
+ * Flush callback for NS_WRITER_TYPE_MBUF writers targeting a socket.
+ * @buf is really the writer's mbuf. With no data it is freed; otherwise
+ * its lengths are set to @datalen and it is sent to the pcb in ns->arg.
+ */
+static bool
+nlmsg_write_socket_mbuf(struct nlmsg_state *ns, char *buf, int datalen)
+{
+	struct mbuf *m = (struct mbuf *)buf;
+	RT_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, ns->arg);
+
+	if (__predict_true(datalen != 0)) {
+		m->m_pkthdr.len = datalen;
+		m->m_len = datalen;
+		return (nl_send_one(m, (struct nlpcb *)(ns->arg)));
+	}
+
+	m_freem(m);
+	return (true);
+}
+
+/*
+ * Flush callback for NS_WRITER_TYPE_MBUF writers targeting a group.
+ * @buf is really the writer's mbuf. With no data it is freed; otherwise
+ * its lengths are set to @datalen and it is broadcast to the group mask
+ * in ns->arg. Always returns true.
+ */
+static bool
+nlmsg_write_group_mbuf(struct nlmsg_state *ns, char *buf, int datalen)
+{
+	struct mbuf *m = (struct mbuf *)buf;
+	RT_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, ns->arg);
+
+	if (__predict_true(datalen != 0)) {
+		m->m_pkthdr.len = datalen;
+		m->m_len = datalen;
+		nl_send_group(m, (uint32_t)(uintptr_t)(ns->arg));
+	} else {
+		m_freem(m);
+	}
+	return (true);
+}
+
+/*
+ * Flush callback for NS_WRITER_TYPE_MBUF writers targeting an mbuf chain.
+ * @buf is really the writer's mbuf. With no data it is freed; otherwise
+ * it becomes the head of the chain pointed to by ns->arg, or is linked
+ * after the chain's last mbuf. Always returns true.
+ */
+static bool
+nlmsg_write_chain_mbuf(struct nlmsg_state *ns, char *buf, int datalen)
+{
+ struct mbuf *m_new = (struct mbuf *)buf;
+ struct mbuf **m0 = (struct mbuf **)(ns->arg);
+
+ RT_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, ns->arg);
+
+ if (__predict_false(datalen == 0)) {
+ m_freem(m_new);
+ return (true);
+ }
+
+ m_new->m_pkthdr.len = datalen;
+ m_new->m_len = datalen;
+
+ if (*m0 == NULL) {
+ *m0 = m_new;
+ } else {
+ struct mbuf *m_last;
+ /* Walk to the tail of the existing chain. */
+ for (m_last = *m0; m_last->m_next != NULL; m_last = m_last->m_next)
+ ;
+ m_last->m_next = m_new;
+ /* Keep the head's packet length in sync with the chain. */
+ (*m0)->m_pkthdr.len += datalen;
+ }
+
+ return (true);
+}
+
+/*
+ * NS_WRITER_TYPE_LBUF
+ * Writes messages to an allocated memory buffer,
+ * flushing to the socket/group when the buffer size limit is reached.
+ * Calls the Linux handler to rewrite messages before sending to the socket.
+ *
+ * A single zeroed allocation holds, in order: a struct linear_buffer
+ * header, @size bytes of message data (with @size rounded up to pointer
+ * size), and a scratch area of @size + SCRATCH_BUFFER_SIZE bytes that the
+ * linear_buffer header describes.
+ * Returns false if the allocation fails.
+ */
+static bool
+nlmsg_get_ns_lbuf(struct nlmsg_state *ns, int size, bool waitok)
+{
+ int mflag = waitok ? M_WAITOK : M_NOWAIT;
+ size = roundup2(size, sizeof(void *));
+ int add_size = sizeof(struct linear_buffer) + SCRATCH_BUFFER_SIZE;
+ char *buf = malloc(add_size + size * 2, M_NETLINK, mflag | M_ZERO);
+ if (__predict_false(buf == NULL))
+ return (false);
+
+ /* Fill buffer header first */
+ struct linear_buffer *lb = (struct linear_buffer *)buf;
+ /* Scratch area begins after the header and the message data. */
+ lb->base = &buf[sizeof(struct linear_buffer) + size];
+ lb->size = size + SCRATCH_BUFFER_SIZE;
+
+ ns->alloc_len = size;
+ ns->offset = 0;
+ ns->hdr = NULL;
+ ns->_buf = buf;
+ /* Message data starts right after the linear_buffer header. */
+ ns->data = (char *)(lb + 1);
+ ns->malloc_flag = mflag;
+ ns->writer_type = NS_WRITER_TYPE_LBUF;
+ return (true);
+}
+
+
+/*
+ * Flush callback for NS_WRITER_TYPE_LBUF writers targeting a socket.
+ * Converts the @datalen bytes of messages that follow the linear_buffer
+ * header in @buf to their Linux form and sends the resulting chain to the
+ * pcb in ns->arg. @buf is always freed. Returns false if the conversion
+ * or nl_send_one() fails.
+ */
+static bool
+nlmsg_write_socket_lbuf(struct nlmsg_state *ns, char *buf, int datalen)
+{
+ struct linear_buffer *lb = (struct linear_buffer *)buf;
+ char *data = (char *)(lb + 1);
+ struct nlpcb *nlp = (struct nlpcb *)(ns->arg);
+
+ if (__predict_false(datalen == 0)) {
+ free(buf, M_NETLINK);
+ return (true);
+ }
+
+ struct mbuf *m = nlmsgs_to_linux(data, datalen, nlp);
+ free(buf, M_NETLINK);
+
+ if (__predict_false(m == NULL)) {
+ /* XXX: should we set sorcverr? */
+ return (false);
+ }
+
+ return (nl_send_one(m, nlp));
+}
+
+/*
+ * Flush callback for NS_WRITER_TYPE_LBUF writers targeting a group.
+ * Shouldn't be called (maybe except Linux code originating message).
+ *
+ * Copies the @datalen bytes that follow the linear_buffer header in @buf
+ * into a new mbuf chain, without Linux conversion, and broadcasts it to
+ * the group mask in ns->arg. @buf is always freed. Returns false if the
+ * mbuf cannot be allocated or filled.
+ */
+static bool
+nlmsg_write_group_lbuf(struct nlmsg_state *ns,char *buf, int datalen)
+{
+	struct linear_buffer *lb = (struct linear_buffer *)buf;
+	char *data = (char *)(lb + 1);
+
+	if (__predict_false(datalen == 0)) {
+		free(buf, M_NETLINK);
+		return (true);
+	}
+
+	struct mbuf *m = m_getm2(NULL, datalen, ns->malloc_flag, MT_DATA, M_PKTHDR);
+	if (__predict_false(m == NULL)) {
+		free(buf, M_NETLINK);
+		return (false);
+	}
+	/* m_append() returns 0 on failure; do not broadcast a truncated message. */
+	bool success = m_append(m, datalen, data) != 0;
+	free(buf, M_NETLINK);
+
+	if (__predict_false(!success)) {
+		m_freem(m);
+		return (false);
+	}
+
+	nl_send_group(m, (uint32_t)(uintptr_t)(ns->arg));
+	return (true);
+}
+
+/*
+ * Operation tables, indexed by writer type (see nlmsg_set_callback() and
+ * nlmsg_get_buf_type()).
+ * The LBUF entry has no write_chain callback, so that slot is NULL.
+ */
+struct nlwriter_ops nlmsg_writers[] = {
+ /* NS_WRITER_TYPE_MBUF */
+ {
+ .init = nlmsg_get_ns_mbuf,
+ .write_socket = nlmsg_write_socket_mbuf,
+ .write_group = nlmsg_write_group_mbuf,
+ .write_chain = nlmsg_write_chain_mbuf,
+ },
+ /* NS_WRITER_TYPE_BUF */
+ {
+ .init = nlmsg_get_ns_buf,
+ .write_socket = nlmsg_write_socket_buf,
+ .write_group = nlmsg_write_group_buf,
+ .write_chain = nlmsg_write_chain_buf,
+ },
+ /* NS_WRITER_TYPE_LBUF */
+ {
+ .init = nlmsg_get_ns_lbuf,
+ .write_socket = nlmsg_write_socket_lbuf,
+ .write_group = nlmsg_write_group_lbuf,
+ },
+};
+
+/*
+ * Selects the flush callback of @ns from the operations of its writer
+ * type, according to its writer target. Panics on an unknown target.
+ */
+static void
+nlmsg_set_callback(struct nlmsg_state *ns)
+{
+	struct nlwriter_ops *ops = &nlmsg_writers[ns->writer_type];
+
+	if (ns->writer_target == NS_WRITER_TARGET_SOCKET)
+		ns->cb = ops->write_socket;
+	else if (ns->writer_target == NS_WRITER_TARGET_GROUP)
+		ns->cb = ops->write_group;
+	else if (ns->writer_target == NS_WRITER_TARGET_CHAIN)
+		ns->cb = ops->write_chain;
+	else
+		panic("not implemented");
+}
+
+/*
+ * Initializes 'ns' with storage of the given writer type by calling the
+ * .init handler of that type. 'type' must be a valid index into
+ * nlmsg_writers[] (asserted with MPASS). Returns the handler's result.
+ */
+static bool
+nlmsg_get_buf_type(struct nlmsg_state *ns, int size, int type, bool waitok)
+{
+	struct nlwriter_ops *ops;
+
+	MPASS(type + 1 <= sizeof(nlmsg_writers) / sizeof(nlmsg_writers[0]));
+	RT_LOG(LOG_DEBUG3, "Setting up ns %p size %d type %d", ns, size, type);
+
+	ops = &nlmsg_writers[type];
+	return (ops->init(ns, size, waitok));
+}
+
+/*
+ * Picks the writer type for a new buffer and initializes 'ns' with it:
+ * linear buffers when is_linux is set, otherwise mbuf storage for sizes
+ * up to MCLBYTES and plain buffers for anything larger.
+ */
+static bool
+nlmsg_get_buf(struct nlmsg_state *ns, int size, bool waitok, bool is_linux)
+{
+	int type = NS_WRITER_TYPE_LBUF;
+
+	if (!is_linux) {
+		type = __predict_true(size <= MCLBYTES) ?
+		    NS_WRITER_TYPE_MBUF : NS_WRITER_TYPE_BUF;
+	}
+
+	return (nlmsg_get_buf_type(ns, size, type, waitok));
+}
+
+/*
+ * Prepares 'ns' for writing up to 'size' bytes of messages destined for
+ * the socket 'nlp'. Storage is requested without waiting.
+ * Returns false if the storage could not be set up.
+ */
+bool
+nlmsg_get_socket_writer(int size, struct nlpcb *nlp, struct nlmsg_state *ns)
+{
+	bool ready = nlmsg_get_buf(ns, size, false, nlp->nl_linux);
+
+	if (ready) {
+		ns->arg = (void *)nlp;
+		ns->writer_target = NS_WRITER_TARGET_SOCKET;
+		nlmsg_set_callback(ns);
+	}
+	return (ready);
+}
+
+/*
+ * Prepares 'ns' for writing up to 'size' bytes of messages destined for
+ * the groups in 'group_mask'; the mask is stored in ns->arg.
+ * Storage is requested without waiting and is never a linear buffer.
+ * Returns false if the storage could not be set up.
+ */
+bool
+nlmsg_get_group_writer(int size, uint32_t group_mask, struct nlmsg_state *ns)
+{
+	bool ready = nlmsg_get_buf(ns, size, false, false);
+
+	if (ready) {
+		ns->arg = (void *)(uintptr_t)group_mask;
+		ns->writer_target = NS_WRITER_TARGET_GROUP;
+		nlmsg_set_callback(ns);
+	}
+	return (ready);
+}
+
+/*
+ * Prepares 'ns' for writing up to 'size' bytes of messages that are
+ * handed to the chain callback with 'pm' as its argument. On success
+ * *pm is reset to NULL. Returns false if the storage could not be set
+ * up, in which case *pm is left untouched.
+ */
+bool
+nlmsg_get_chain_writer(int size, struct mbuf **pm, struct nlmsg_state *ns)
+{
+	bool ready = nlmsg_get_buf(ns, size, false, false);
+
+	if (ready) {
+		*pm = NULL;
+		ns->arg = (void *)pm;
+		ns->writer_target = NS_WRITER_TARGET_CHAIN;
+		nlmsg_set_callback(ns);
+		RT_LOG(LOG_DEBUG3, "setup cb %p (need %p)", ns->cb, &nlmsg_write_chain_mbuf);
+	}
+	return (ready);
+}
+
+/*
+ * Releases the writer storage without sending anything by invoking the
+ * flush callback with a zero data length.
+ *
+ * The writer is reset afterwards, mirroring nlmsg_flush(), so that a
+ * later nlmsg_flush() or nlmsg_free() on the same state does not hand
+ * the already released storage to the callback again.
+ */
+void
+nlmsg_free(struct nlmsg_state *ns)
+{
+	ns->cb(ns, ns->_buf, 0);
+	ns->_buf = NULL;
+	ns->hdr = NULL;
+	ns->offset = 0;
+}
+
+/*
+ * Passes all completed messages to the writer callback and detaches the
+ * storage from 'ns'. A message that was started but not finished is left
+ * out. Returns the result of the callback.
+ */
+bool
+nlmsg_flush(struct nlmsg_state *ns)
+{
+	bool sent;
+
+	if (__predict_false(ns->hdr != NULL)) {
+		/* Last message has not been completed: send only what precedes it. */
+		int done_len = (char *)ns->hdr - ns->data;
+
+		ns->offset = done_len;
+		ns->hdr = NULL;
+	}
+
+	sent = ns->cb(ns, ns->_buf, ns->offset);
+	ns->_buf = NULL;
+
+	if (sent)
+		return (true);
+
+	{
+		RT_LOG(LOG_DEBUG, "ns %p offset %d: flush with %p() failed", ns, ns->offset, ns->cb);
+	}
+	return (false);
+}
+
+/*
+ * Switches the writer to freshly allocated storage when the current one
+ * has run out of space.
+ *
+ * Completed messages are flushed through the writer callback; a message
+ * that is still being built is copied to the start of the new storage.
+ * When completed data exists the new storage is at least as large as the
+ * old one (and at least MCLBYTES); otherwise it grows to MCLBYTES or to
+ * twice the old size.
+ *
+ * Returns false, leaving 'ns' untouched, if the new storage cannot be
+ * allocated. On success pointers into the old storage are invalid.
+ */
+static __noinline bool
+clear_storage(struct nlmsg_state *ns)
+{
+	struct nlmsg_state ns_new = {};
+	int completed_len, new_len;
+	RT_LOG(LOG_DEBUG2, "realloc storage: used %d/%d bytes", ns->offset, ns->alloc_len);
+
+	/* Calculate the new buffer size and allocate it */
+	completed_len = (ns->hdr != NULL) ? (char *)ns->hdr - ns->data : ns->offset;
+	if (completed_len > 0) {
+		/* We already ran out of space, use the largest effective size */
+		new_len = max(ns->alloc_len, MCLBYTES);
+	} else {
+		if (ns->alloc_len < MCLBYTES)
+			new_len = MCLBYTES;
+		else
+			new_len = ns->alloc_len * 2;
+	}
+	bool waitok = ns->malloc_flag == M_WAITOK;
+	bool is_linux = ns->writer_type == NS_WRITER_TYPE_LBUF;
+	if (!nlmsg_get_buf(&ns_new, new_len, waitok, is_linux))
+		return (false);
+
+	/* Update callback data */
+	ns_new.writer_target = ns->writer_target;
+	nlmsg_set_callback(&ns_new);
+	ns_new.arg = ns->arg;
+
+	/* Copy last (unfinished) header to the new storage */
+	int last_len = ns->offset - completed_len;
+	if (last_len > 0) {
+		memcpy(ns_new.data, ns->hdr, last_len);
+		ns_new.hdr = (struct nlmsghdr *)ns_new.data;
+		ns_new.offset = last_len;
+	}
+
+	RT_LOG(LOG_DEBUG2, "completed: %d bytes, copied: %d bytes", completed_len, last_len);
+
+	if (completed_len > 0) {
+		RT_LOG(LOG_DEBUG2, "Flushing completed %d bytes", completed_len);
+	}
+
+	/*
+	 * Always hand the old storage to the writer callback, even when
+	 * there is no completed data: nlmsg_flush() drops the unfinished
+	 * message (already copied above) and a zero-length flush is how the
+	 * storage gets released (the same contract nlmsg_free() relies on).
+	 * Skipping this would leak the old storage once 'ns' is overwritten.
+	 */
+	nlmsg_flush(ns);
+
+	/* Update state */
+	memcpy(ns, &ns_new, sizeof(struct nlmsg_state));
+	RT_LOG(LOG_DEBUG2, "switched mbuf: used %d/%d bytes", ns->offset, ns->alloc_len);
+
+	return (true);
+}
+
+/*
+ * Reserves 'sz' bytes (rounded up with NLMSG_ALIGN) at the current write
+ * position and returns a pointer to them, or NULL if enough storage
+ * cannot be obtained.
+ *
+ * Note it MAY invalidate any previous pointers fetched.
+ */
+void *
+nlmsg_reserve_data_raw(struct nlmsg_state *ns, size_t sz)
+{
+	/*
+	 * A single storage switch may still be too small for a large
+	 * request, so keep switching until the data fits instead of
+	 * writing past the end of the new storage.
+	 */
+	while (__predict_false(ns->offset + NETLINK_ALIGN(sz) > ns->alloc_len)) {
+		int room = ns->alloc_len - ns->offset;
+
+		if (!clear_storage(ns))
+			return (NULL);
+		/* Give up rather than spin if no space was gained. */
+		if (ns->alloc_len - ns->offset <= room)
+			return (NULL);
+	}
+
+	void *data_ptr = &ns->data[ns->offset];
+
+	RT_LOG(LOG_DEBUG3, "add data at offset %d, buf %p data_ptr %p",
+	    ns->offset, ns->data, data_ptr);
+
+	ns->offset += NLMSG_ALIGN(sz);
+
+	return (data_ptr);
+}
+
+/*
+ * Starts a new message: writes a netlink header at the current position,
+ * makes it the message under construction (ns->hdr) and advances the
+ * write offset past the header.
+ *
+ * 'len' is the expected payload size; it is used to make sure enough
+ * storage is available and as the initial nlmsg_len, which nlmsg_end()
+ * later replaces with the real length.
+ *
+ * Returns false if enough storage cannot be obtained. Switching storage
+ * may invalidate pointers fetched earlier.
+ */
+bool
+nlmsg_add(struct nlmsg_state *ns, uint32_t portid, uint32_t seq, uint16_t type,
+    uint16_t flags, uint32_t len)
+{
+	struct nlmsghdr *hdr;
+
+	/*
+	 * A single storage switch may still be too small for a large
+	 * message, so keep switching until it fits instead of writing past
+	 * the end of the new storage.
+	 */
+	while (__predict_false(ns->offset + NETLINK_ALIGN(len + sizeof(struct nlmsghdr)) > ns->alloc_len)) {
+		int room = ns->alloc_len - ns->offset;
+
+		if (!clear_storage(ns))
+			return (false);
+		/* Give up rather than spin if no space was gained. */
+		if (ns->alloc_len - ns->offset <= room)
+			return (false);
+	}
+
+	hdr = (struct nlmsghdr *)(&ns->data[ns->offset]);
+
+	hdr->nlmsg_len = len;
+	hdr->nlmsg_type = type;
+	hdr->nlmsg_flags = flags;
+	hdr->nlmsg_seq = seq;
+	hdr->nlmsg_pid = portid;
+
+	ns->hdr = hdr;
+	ns->offset += sizeof(struct nlmsghdr);
+
+	return (true);
+}
+
+/*
+ * Completes the message under construction: stores its final length
+ * (distance from its header to the current write position) in nlmsg_len
+ * and clears ns->hdr. A message must be in progress.
+ */
+void
+nlmsg_end(struct nlmsg_state *ns)
+{
+	struct nlmsghdr *cur = ns->hdr;
+	char *tail = ns->data + ns->offset;
+
+	cur->nlmsg_len = (uint32_t)(tail - (char *)cur);
+	ns->hdr = NULL;
+}
+
+/*
+ * Discards the message under construction, if any, by rewinding the
+ * write offset to the start of its header.
+ */
+void
+nlmsg_abort(struct nlmsg_state *ns)
+{
+	if (ns->hdr == NULL)
+		return;
+
+	ns->offset = (uint32_t)((char *)ns->hdr - ns->data);
+	ns->hdr = NULL;
+}
+
+/*
+ * Slow path for adding an attribute when the storage is full: switches
+ * to new storage until an attribute with 'attr_len' bytes of payload
+ * (header included, NLA_ALIGN-ed) fits.
+ *
+ * 'attr_type' and 'data' are not used here. Returns false if enough
+ * storage cannot be obtained. Switching storage may invalidate pointers
+ * fetched earlier.
+ */
+bool
+nlattr_add_handle_oom(struct nlmsg_state *ns, int attr_type, int attr_len,
+    const void *data)
+{
+	int required_len = NLA_ALIGN(attr_len + sizeof(struct nlattr));
+
+	RT_LOG(LOG_DEBUG3,
+	    "no space at offset %d (want %d), alloc_len %d, trying to reclaim",
+	    ns->offset, required_len, ns->alloc_len);
+
+	/*
+	 * One switch is not guaranteed to provide required_len bytes, so
+	 * repeat until the attribute fits; reporting success earlier would
+	 * let the caller write past the end of the storage.
+	 */
+	do {
+		int room = ns->alloc_len - ns->offset;
+
+		if (!clear_storage(ns))
+			return (false);
+		/* Give up rather than spin if no space was gained. */
+		if (ns->alloc_len - ns->offset <= room)
+			return (false);
+	} while (ns->offset + required_len > ns->alloc_len);
+
+	return (true);
+}
Index: sys/netlink/netlink_module.c
===================================================================
--- /dev/null
+++ sys/netlink/netlink_module.c
@@ -0,0 +1,214 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ * Copyright (c) 2022 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/mbuf.h>
+#include <sys/ck.h>
+#include <sys/syslog.h>
+
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_var.h>
+
+#include <machine/atomic.h>
+
+MALLOC_DEFINE(M_NETLINK, "netlink", "Memory used for netlink packets");
+
+#define DEBUG_MOD_NAME nl_mod
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <net/route/route_debug.h>
+_DECLARE_DEBUG(LOG_DEBUG3);
+
+
+SYSCTL_NODE(_net, OID_AUTO, netlink, CTLFLAG_RD, 0, "");
+
+/* Per-protocol message handlers, indexed by netlink protocol number. */
+nl_handler nl_handlers[NL_MAX_HANDLERS];
+
+/* List of the per-VNET control blocks; modified under NL_GLOBAL_LOCK(). */
+CK_LIST_HEAD(nl_control_head, nl_control);
+static struct nl_control_head vnets_head = CK_LIST_HEAD_INITIALIZER();
+
+/* Control block of the current VNET, NULL until vnet_nl_ctl_init() runs. */
+VNET_DEFINE(struct nl_control *, nl_ctl) = NULL;
+
+/* Global mutex; it is initialized by the MTX_SYSINIT below. */
+struct mtx nlsock_mtx;
+MTX_SYSINIT(nlsock, &nlsock_mtx, "nlsock for handlers or portid list lock", MTX_DEF);
+
+/* NOTE(review): NL_GLOBAL_LOCK_INIT() is not used anywhere in this file. */
+#define NL_GLOBAL_LOCK_INIT() mtx_init(&nlsock_mtx, "nlsock global mtx", NULL, MTX_DEF)
+#define NL_GLOBAL_LOCK() mtx_lock(&nlsock_mtx)
+#define NL_GLOBAL_UNLOCK() mtx_unlock(&nlsock_mtx)
+
+/* Set to 1 by the MOD_UNLOAD handler once unloading has been accepted. */
+int netlink_unloading = 0;
+
+/*
+ * Destroys the lock embedded in a control block and releases the block.
+ */
+static void
+free_nl_ctl(struct nl_control *ctl)
+{
+
+	rm_destroy(&ctl->ctl_lock);
+	free(ctl, M_NETLINK);
+}
+
+/*
+ * Creates the netlink control block for the current VNET and links it
+ * into the global list. The block is allocated before the global lock is
+ * taken; if a control block is already installed for this VNET, the
+ * newly allocated instance is dropped.
+ */
+void
+vnet_nl_ctl_init(void)
+{
+	struct nl_control *new_ctl;
+
+	new_ctl = malloc(sizeof(struct nl_control), M_NETLINK, M_WAITOK | M_ZERO);
+	rm_init(&new_ctl->ctl_lock, "netlink lock");
+	CK_LIST_INIT(&new_ctl->ctl_port_head);
+	CK_LIST_INIT(&new_ctl->ctl_pcb_head);
+
+	NL_GLOBAL_LOCK();
+
+	if (atomic_load_ptr(&V_nl_ctl) != NULL) {
+		RT_LOG(LOG_DEBUG, "per-VNET init clash, dropping this instance");
+		free_nl_ctl(new_ctl);
+		NL_GLOBAL_UNLOCK();
+		return;
+	}
+
+	atomic_store_ptr(&V_nl_ctl, new_ctl);
+	CK_LIST_INSERT_HEAD(&vnets_head, new_ctl, ctl_next);
+	RT_LOG(LOG_DEBUG2, "VNET %p init done, inserted %p into global list",
+	    curvnet, new_ctl);
+
+	NL_GLOBAL_UNLOCK();
+}
+
+/*
+ * VNET teardown hook: detaches the control block of the current VNET,
+ * unlinks it from the global list and frees it once the global lock has
+ * been dropped. Does nothing if the VNET has no control block.
+ */
+static void
+vnet_nl_ctl_destroy(const void *unused __unused)
+{
+	struct nl_control *old_ctl;
+
+	NL_GLOBAL_LOCK();
+	old_ctl = atomic_load_ptr(&V_nl_ctl);
+	atomic_store_ptr(&V_nl_ctl, NULL);
+	if (old_ctl == NULL) {
+		NL_GLOBAL_UNLOCK();
+		return;
+	}
+	RT_LOG(LOG_DEBUG2, "Removing %p from global list", old_ctl);
+	CK_LIST_REMOVE(old_ctl, ctl_next);
+	NL_GLOBAL_UNLOCK();
+
+	free_nl_ctl(old_ctl);
+}
+VNET_SYSUNINIT(vnet_nl_ctl_destroy, SI_SUB_PROTO_IF, SI_ORDER_ANY,
+    vnet_nl_ctl_destroy, NULL);
+
+/*
+ * Checks whether a netlink protocol number can be used.
+ * Returns EINVAL when it is outside [0, NL_MAX_HANDLERS),
+ * EPROTONOSUPPORT when no handler is registered for it, 0 otherwise.
+ */
+int
+nl_verify_proto(int proto)
+{
+	if (proto < 0 || proto >= NL_MAX_HANDLERS)
+		return (EINVAL);
+	if (nl_handlers[proto] == NULL)
+		return (EPROTONOSUPPORT);
+	return (0);
+}
+
+/*
+ * Installs 'handler' for netlink protocol 'proto' under the global lock.
+ * Returns false when 'proto' is out of range. Registering over an
+ * existing handler trips a KASSERT.
+ */
+bool
+netlink_register_proto(int proto, nl_handler handler)
+{
+	bool in_range = (proto >= 0) && (proto < NL_MAX_HANDLERS);
+
+	if (!in_range)
+		return (false);
+
+	NL_GLOBAL_LOCK();
+	KASSERT((nl_handlers[proto] == NULL), ("netlink handler %d is already set", proto));
+	nl_handlers[proto] = handler;
+	NL_GLOBAL_UNLOCK();
+
+	RT_LOG(LOG_DEBUG, "Registered netlink proto %d handler", proto);
+	return (true);
+}
+
+/*
+ * Removes the handler of netlink protocol 'proto' under the global lock.
+ * Returns false when 'proto' is out of range. Unregistering a protocol
+ * without a handler trips a KASSERT.
+ */
+bool
+netlink_unregister_proto(int proto)
+{
+	bool in_range = (proto >= 0) && (proto < NL_MAX_HANDLERS);
+
+	if (!in_range)
+		return (false);
+
+	NL_GLOBAL_LOCK();
+	KASSERT((nl_handlers[proto] != NULL), ("netlink handler %d is not set", proto));
+	nl_handlers[proto] = NULL;
+	NL_GLOBAL_UNLOCK();
+
+	RT_LOG(LOG_DEBUG, "Unregistered netlink proto %d handler", proto);
+	return (true);
+}
+
+
+
+/*
+ * Tells whether the module may be unloaded: walks the control blocks of
+ * all VNETs under the global lock and returns false as soon as one of
+ * them still has a non-empty socket (pcb) list.
+ */
+static bool
+can_unload(void)
+{
+	struct nl_control *ctl;
+	bool idle = true;
+
+	NL_GLOBAL_LOCK();
+
+	CK_LIST_FOREACH(ctl, &vnets_head, ctl_next) {
+		RT_LOG(LOG_DEBUG2, "Iterating VNET head %p", ctl);
+		if (CK_LIST_EMPTY(&ctl->ctl_pcb_head))
+			continue;
+		RT_LOG(LOG_NOTICE, "non-empty socket list in ctl %p", ctl);
+		idle = false;
+		break;
+	}
+
+	NL_GLOBAL_UNLOCK();
+
+	return (idle);
+}
+
+/*
+ * Module event handler. Loading only logs; unloading is refused with
+ * EBUSY while can_unload() reports open sockets, otherwise
+ * netlink_unloading is set. Any other event yields EOPNOTSUPP.
+ */
+static int
+netlink_modevent(module_t mod __unused, int what, void *priv __unused)
+{
+	if (what == MOD_LOAD) {
+		RT_LOG(LOG_NOTICE, "Loading");
+		return (0);
+	}
+
+	if (what != MOD_UNLOAD)
+		return (EOPNOTSUPP);
+
+	RT_LOG(LOG_NOTICE, "Unload called");
+	if (!can_unload())
+		return (EBUSY);
+
+	RT_LOG(LOG_WARNING, "unloading");
+	netlink_unloading = 1;
+	return (0);
+}
+static moduledata_t netlink_mod = { "netlink", netlink_modevent, NULL };
+
+DECLARE_MODULE(netlink, netlink_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
+MODULE_VERSION(netlink, 1);
Index: sys/netlink/netlink_nhop.c
===================================================================
--- /dev/null
+++ sys/netlink/netlink_nhop.c
@@ -0,0 +1,313 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/rmlock.h>
+#include <sys/socket.h>
+#include <sys/ck.h>
+
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <net/route.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_utils.h>
+
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_var.h>
+#include <netlink/netlink_route.h>
+
+#define DEBUG_MOD_NAME nl_nhop
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <net/route/route_debug.h>
+_DECLARE_DEBUG(LOG_DEBUG3);
+
+/*
+ * User-defined nexthop: maps a userland-provided index to up to three
+ * kernel nexthop objects, one per prefix flavour:
+ * idx -> { [0]: normal, [1]: host, [2]: default }
+ * Entries are chained through un_next.
+ */
+struct user_nhop {
+ uint32_t un_idx; /* Userland-provided index */
+ struct nhop_object * un_nhop[3]; /* Normal, host, default */
+ struct user_nhop * un_next; /* next object (see unhop_next()) */
+ struct epoch_context un_epoch_ctx; /* epoch ctl helper */
+};
+
+/* produce hash value for an object */
+#define unhop_hash_obj(_obj) (hash_unhop(_obj))
+/* compare two objects */
+#define unhop_cmp(_one, _two) (cmp_unhop(_one, _two))
+/* next object accessor */
+#define unhop_next(_obj) (_obj)->un_next
+
+CHT_SLIST_DEFINE(unhop, struct user_nhop);
+
+VNET_DEFINE_STATIC(struct unhop_head *, nl_nhop_head) = NULL;
+#define V_nl_nhop_head VNET(nl_nhop_head)
+
+static void consider_resize(uint32_t new_gr_buckets);
+static int clone_unhop(const struct nhop_object *nh_base, int nh_flags,
+ struct nhop_object **pnh);
+
+static int cmp_unhop(const struct user_nhop *a, const struct user_nhop *b);
+static unsigned int hash_unhop(const struct user_nhop *obj);
+
+/*
+ * Compare callback: returns 1 when both objects carry the same
+ * userland index and 0 otherwise.
+ */
+static int
+cmp_unhop(const struct user_nhop *a, const struct user_nhop *b)
+{
+	if (a->un_idx == b->un_idx)
+		return (1);
+	return (0);
+}
+
+/*
+ * Hash callback: the userland index itself serves as the hash value.
+ */
+static unsigned int
+hash_unhop(const struct user_nhop *obj)
+{
+	unsigned int hash_val = obj->un_idx;
+
+	return (hash_val);
+}
+
+/*
+ * Looks up the user nexthop with index 'uidx' and stores in *pnhop the
+ * kernel nexthop variant selected by the NHF_HOST / NHF_DEFAULT bits of
+ * 'nh_flags' (slot 1 for NHF_HOST, slot 2 for NHF_DEFAULT, slot 0
+ * otherwise). A missing variant is created from slot 0 with
+ * clone_unhop() and published with a compare-and-set, so that concurrent
+ * lookups agree on a single object.
+ *
+ * Returns 0 on success, ESRCH if the table does not exist yet or holds
+ * no such index, or the error returned by clone_unhop().
+ *
+ * NOTE(review): the previous comment said the object is returned
+ * "referenced"; no reference is taken in this function - confirm
+ * against the callers.
+ */
+static int
+find_unhop(uint32_t uidx, int nh_flags, struct nhop_object **pnhop)
+{
+	struct user_nhop key = { .un_idx = uidx }, *unhop;
+	struct nhop_object *nhop;
+	int error = 0, off = 0;
+	CTL_TRACKER;
+
+	nh_flags = nh_flags & (NHF_HOST | NHF_DEFAULT);
+
+	CTL_RLOCK();
+
+	/* The table is created lazily by init_unhops(). */
+	if (V_nl_nhop_head == NULL) {
+		error = ESRCH;
+		goto done;
+	}
+
+	CHT_SLIST_FIND_BYOBJ(V_nl_nhop_head, unhop, &key, unhop);
+	if (unhop == NULL) {
+		error = ESRCH;
+		goto done;
+	}
+
+	switch (nh_flags) {
+	case NHF_HOST:
+		off = 1;
+		break;
+	case NHF_DEFAULT:
+		off = 2;
+		break;
+	default:
+		break;
+	}
+
+	nhop = unhop->un_nhop[off];
+	if (nhop == NULL) {
+		/* Nexthop with the required flags does not exist yet. */
+		error = clone_unhop(unhop->un_nhop[0], nh_flags, &nhop);
+		if (error != 0)
+			goto done;
+
+		/*
+		 * Nexthops remain constant once set and get dereferenced
+		 * only when unhop is deleted.
+		 */
+		if (!atomic_cmpset_ptr((uintptr_t *)&unhop->un_nhop[off],
+		    (uintptr_t)NULL, (uintptr_t)nhop)) {
+			/* Lost the race: drop our clone, use the winner's. */
+			nhop_free_any(nhop);
+			nhop = atomic_load_ptr(&unhop->un_nhop[off]);
+		}
+	}
+	*pnhop = nhop;
+
+done:
+	/* Single unlock point; the lock must be released exactly once. */
+	CTL_RUNLOCK();
+	return (error);
+}
+
+/*
+ * Returns the routing table head selected by the fib number and the
+ * upper-level address family of the nexthop.
+ */
+static struct rib_head *
+nhop_get_rnh(const struct nhop_object *nh)
+{
+
+	return (rt_tables_get_rnh(nhop_get_fibnum(nh),
+	    nhop_get_upper_family(nh)));
+}
+
+#define MAX_STACK_NHOPS 4
+/*
+ * Creates the variant of 'nh_base' that carries the prefix-type flags
+ * 'nh_flags' and stores it in *pnh.
+ *
+ * A plain nexthop is copied, given the same user index and the new
+ * flags, and then passed to nhop_get_nhop(). For a nexthop group the
+ * matching variant of every member is looked up by user index with
+ * find_unhop() and a group is requested for the resulting list; member
+ * lists of up to MAX_STACK_NHOPS entries are built on the stack.
+ *
+ * Returns 0 on success, ENOMEM on allocation failure, EINVAL for a group
+ * without members, ESRCH when a member has no user index, or the error
+ * of the failing helper.
+ */
+static int
+clone_unhop(const struct nhop_object *nh_base, int nh_flags, struct nhop_object **pnh)
+{
+	const struct weightened_nhop *wn;
+	struct weightened_nhop *wn_new, wn_base[MAX_STACK_NHOPS];
+	uint32_t num_nhops;
+	int error = 0;
+
+	if (!NH_IS_NHGRP(nh_base)) {
+		struct nhop_object *nh;
+		nh = nhop_alloc(nhop_get_fibnum(nh_base),
+		    nhop_get_upper_family(nh_base));
+		if (nh == NULL)
+			return (ENOMEM);
+		nhop_copy(nh, nh_base);
+		nhop_set_uidx(nh, nhop_get_uidx(nh_base));
+		nhop_set_pxtype_flag(nh, nh_flags);
+		*pnh = nhop_get_nhop(nh, &error);
+		return (error);
+	}
+
+	const struct nhgrp_object *nhg_base = (const struct nhgrp_object *)nh_base;
+	wn = nhgrp_get_nhops(nhg_base, &num_nhops);
+
+	/*
+	 * With no members the loop below would not run, leaving nothing
+	 * valid in wn_new[0] for the table lookup.
+	 */
+	if (num_nhops == 0)
+		return (EINVAL);
+
+	if (num_nhops > MAX_STACK_NHOPS) {
+		wn_new = malloc(num_nhops * sizeof(struct weightened_nhop), M_TEMP, M_NOWAIT);
+		if (wn_new == NULL)
+			return (ENOMEM);
+	} else
+		wn_new = wn_base;
+
+	for (uint32_t i = 0; i < num_nhops; i++) {
+		uint32_t uidx = nhop_get_uidx(wn[i].nh);
+		if (uidx == 0) {
+			/* Member was not created through the user index. */
+			error = ESRCH;
+			break;
+		}
+		error = find_unhop(uidx, nh_flags, &wn_new[i].nh);
+		if (error != 0)
+			break;
+		wn_new[i].weight = wn[i].weight;
+	}
+
+	if (error == 0) {
+		struct rib_head *rh = nhop_get_rnh(wn_new[0].nh);
+		error = nhgrp_get_group(rh, wn_new, num_nhops,
+		    (struct nhgrp_object **)pnh);
+	}
+
+	if (wn_new != wn_base)
+		free(wn_new, M_TEMP);
+	return (error);
+}
+
+/*
+ * Epoch callback that finishes the destruction of a user nexthop:
+ * releases every kernel nexthop variant that was created and frees the
+ * object itself.
+ */
+static void
+destroy_unhop_epoch(epoch_context_t ctx)
+{
+	struct user_nhop *unhop;
+
+	unhop = __containerof(ctx, struct user_nhop, un_epoch_ctx);
+
+	for (int i = 0; i < 3; i++) {
+		/* Variants are created lazily, so a slot may still be empty. */
+		if (unhop->un_nhop[i] != NULL)
+			nhop_free_any(unhop->un_nhop[i]);
+	}
+	free(unhop, M_NETLINK);
+}
+
+
+/*
+ * Unlinks 'unhop' from the user nexthop table under the control write
+ * lock and schedules its destruction with epoch_call().
+ */
+static void
+delete_unhop(struct user_nhop *unhop)
+{
+	struct user_nhop *removed;
+
+	CTL_WLOCK();
+	CHT_SLIST_REMOVE(V_nl_nhop_head, unhop, unhop, removed);
+	CTL_WUNLOCK();
+
+	if (removed == NULL) {
+		RT_LOG(LOG_DEBUG, "unable to find unhop %u", unhop->un_idx);
+	}
+	/* The entry taken out of the table must be the one we were given. */
+	MPASS(unhop == removed);
+
+	epoch_call(net_epoch_preempt, destroy_unhop_epoch,
+	    &unhop->un_epoch_ctx);
+}
+
+
+/*
+ * Resizes the user nexthop hash table to 'new_gr_bucket' buckets; a
+ * value of 0 means no resize is wanted. The new bucket array is
+ * allocated without sleeping before the control write lock is taken;
+ * on allocation failure the table is left as is.
+ */
+static void
+consider_resize(uint32_t new_gr_bucket)
+{
+	void *gr_ptr = NULL;
+	size_t alloc_size;
+
+	if (new_gr_bucket == 0)
+		return;
+
+	alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_gr_bucket);
+	gr_ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO);
+	if (gr_ptr == NULL)
+		return;
+
+	CTL_WLOCK();
+	CHT_SLIST_RESIZE(V_nl_nhop_head, unhop, gr_ptr, new_gr_bucket);
+	CTL_WUNLOCK();
+
+	/* Release whatever the resize macro left in gr_ptr. */
+	if (gr_ptr != NULL)
+		free(gr_ptr, M_NETLINK);
+}
+
+/*
+ * Lazily creates the per-VNET user nexthop hash table with 16 buckets.
+ * Both allocations are done without sleeping, before the control write
+ * lock is taken; if another thread installed a table in the meantime,
+ * the freshly allocated one is released.
+ *
+ * Returns true when a table exists on return and false when memory
+ * could not be allocated.
+ */
+static bool __noinline
+init_unhops(void)
+{
+	uint32_t num_buckets = 16;
+	size_t alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets);
+	struct unhop_head *phead;
+	void *ptr;
+
+	phead = malloc(sizeof(struct unhop_head), M_NETLINK,
+	    M_NOWAIT | M_ZERO);
+	if (phead == NULL)
+		return (false);
+
+	ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO);
+	if (ptr == NULL) {
+		/* Do not leak the head when the bucket array is missing. */
+		free(phead, M_NETLINK);
+		return (false);
+	}
+	CHT_SLIST_INIT(phead, ptr, num_buckets);
+
+	CTL_WLOCK();
+	if (V_nl_nhop_head == NULL)
+		V_nl_nhop_head = phead;
+	else {
+		/* Lost the race: keep the table that is already installed. */
+		free(ptr, M_NETLINK);
+		free(phead, M_NETLINK);
+	}
+	CTL_WUNLOCK();
+
+	return (true);
+}
+
+
+/*
+ * Handler for new-nexthop requests. For now it only makes sure the user
+ * nexthop table exists; 'hdr', 'nlp' and 'npt' are not used yet.
+ * Returns ENOMEM if the table cannot be created, 0 otherwise.
+ */
+int
+rtnl_handle_newnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
+    struct netlink_parse_tracker *npt)
+{
+	if (__predict_true(V_nl_nhop_head != NULL))
+		return (0);
+
+	return (init_unhops() ? 0 : ENOMEM);
+}
+
+
+
+
Index: sys/netlink/netlink_route.h
===================================================================
--- /dev/null
+++ sys/netlink/netlink_route.h
@@ -0,0 +1,890 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#ifndef _NETLINK_NETLINK_ROUTE_H_
+#define _NETLINK_NETLINK_ROUTE_H_
+
+#include <sys/types.h>
+
+#include <net/if.h>
+#include <net/if_types.h>
+#include <net/if_var.h>
+
+/*
+ * Messages defined by the NETLINK_ROUTE subsystem.
+ *
+ * Values are laid out in groups of four: NEW at a multiple of four,
+ * followed by DEL (+1), GET (+2) and SET (+3) where they are defined.
+ * Every enumerator is also #defined to itself - presumably so that its
+ * presence can be tested with #ifdef.
+ */
+
+enum {
+ NL_RTM_BASE = 16,
+#define NL_RTM_BASE NL_RTM_BASE
+ NL_RTM_NEWLINK = 16,
+#define NL_RTM_NEWLINK NL_RTM_NEWLINK
+ NL_RTM_DELLINK,
+#define NL_RTM_DELLINK NL_RTM_DELLINK
+ NL_RTM_GETLINK,
+#define NL_RTM_GETLINK NL_RTM_GETLINK
+ NL_RTM_SETLINK,
+#define NL_RTM_SETLINK NL_RTM_SETLINK
+ NL_RTM_NEWADDR = 20,
+#define NL_RTM_NEWADDR NL_RTM_NEWADDR
+ NL_RTM_DELADDR,
+#define NL_RTM_DELADDR NL_RTM_DELADDR
+ NL_RTM_GETADDR,
+#define NL_RTM_GETADDR NL_RTM_GETADDR
+ NL_RTM_NEWROUTE = 24,
+#define NL_RTM_NEWROUTE NL_RTM_NEWROUTE
+ NL_RTM_DELROUTE,
+#define NL_RTM_DELROUTE NL_RTM_DELROUTE
+ NL_RTM_GETROUTE,
+#define NL_RTM_GETROUTE NL_RTM_GETROUTE
+ NL_RTM_NEWNEIGH = 28,
+#define NL_RTM_NEWNEIGH NL_RTM_NEWNEIGH
+ NL_RTM_DELNEIGH,
+#define NL_RTM_DELNEIGH NL_RTM_DELNEIGH
+ NL_RTM_GETNEIGH,
+#define NL_RTM_GETNEIGH NL_RTM_GETNEIGH
+ NL_RTM_NEWRULE = 32,
+#define NL_RTM_NEWRULE NL_RTM_NEWRULE
+ NL_RTM_DELRULE,
+#define NL_RTM_DELRULE NL_RTM_DELRULE
+ NL_RTM_GETRULE,
+#define NL_RTM_GETRULE NL_RTM_GETRULE
+ NL_RTM_NEWQDISC = 36,
+#define NL_RTM_NEWQDISC NL_RTM_NEWQDISC
+ NL_RTM_DELQDISC,
+#define NL_RTM_DELQDISC NL_RTM_DELQDISC
+ NL_RTM_GETQDISC,
+#define NL_RTM_GETQDISC NL_RTM_GETQDISC
+ NL_RTM_NEWTCLASS = 40,
+#define NL_RTM_NEWTCLASS NL_RTM_NEWTCLASS
+ NL_RTM_DELTCLASS,
+#define NL_RTM_DELTCLASS NL_RTM_DELTCLASS
+ NL_RTM_GETTCLASS,
+#define NL_RTM_GETTCLASS NL_RTM_GETTCLASS
+ NL_RTM_NEWTFILTER = 44,
+#define NL_RTM_NEWTFILTER NL_RTM_NEWTFILTER
+ NL_RTM_DELTFILTER,
+#define NL_RTM_DELTFILTER NL_RTM_DELTFILTER
+ NL_RTM_GETTFILTER,
+#define NL_RTM_GETTFILTER NL_RTM_GETTFILTER
+ NL_RTM_NEWACTION = 48,
+#define NL_RTM_NEWACTION NL_RTM_NEWACTION
+ NL_RTM_DELACTION,
+#define NL_RTM_DELACTION NL_RTM_DELACTION
+ NL_RTM_GETACTION,
+#define NL_RTM_GETACTION NL_RTM_GETACTION
+ NL_RTM_NEWPREFIX = 52,
+#define NL_RTM_NEWPREFIX NL_RTM_NEWPREFIX
+ NL_RTM_GETMULTICAST = 58,
+#define NL_RTM_GETMULTICAST NL_RTM_GETMULTICAST
+ NL_RTM_GETANYCAST = 62,
+#define NL_RTM_GETANYCAST NL_RTM_GETANYCAST
+ NL_RTM_NEWNEIGHTBL = 64,
+#define NL_RTM_NEWNEIGHTBL NL_RTM_NEWNEIGHTBL
+ NL_RTM_GETNEIGHTBL = 66,
+#define NL_RTM_GETNEIGHTBL NL_RTM_GETNEIGHTBL
+ NL_RTM_SETNEIGHTBL,
+#define NL_RTM_SETNEIGHTBL NL_RTM_SETNEIGHTBL
+ NL_RTM_NEWNDUSEROPT = 68,
+#define NL_RTM_NEWNDUSEROPT NL_RTM_NEWNDUSEROPT
+ NL_RTM_NEWADDRLABEL = 72,
+#define NL_RTM_NEWADDRLABEL NL_RTM_NEWADDRLABEL
+ NL_RTM_DELADDRLABEL,
+#define NL_RTM_DELADDRLABEL NL_RTM_DELADDRLABEL
+ NL_RTM_GETADDRLABEL,
+#define NL_RTM_GETADDRLABEL NL_RTM_GETADDRLABEL
+ NL_RTM_GETDCB = 78,
+#define NL_RTM_GETDCB NL_RTM_GETDCB
+ NL_RTM_SETDCB,
+#define NL_RTM_SETDCB NL_RTM_SETDCB
+ NL_RTM_NEWNETCONF = 80,
+#define NL_RTM_NEWNETCONF NL_RTM_NEWNETCONF
+ NL_RTM_GETNETCONF = 82,
+#define NL_RTM_GETNETCONF NL_RTM_GETNETCONF
+ NL_RTM_NEWMDB = 84,
+#define NL_RTM_NEWMDB NL_RTM_NEWMDB
+ NL_RTM_DELMDB = 85,
+#define NL_RTM_DELMDB NL_RTM_DELMDB
+ NL_RTM_GETMDB = 86,
+#define NL_RTM_GETMDB NL_RTM_GETMDB
+ NL_RTM_NEWNSID = 88,
+#define NL_RTM_NEWNSID NL_RTM_NEWNSID
+ NL_RTM_DELNSID = 89,
+#define NL_RTM_DELNSID NL_RTM_DELNSID
+ NL_RTM_GETNSID = 90,
+#define NL_RTM_GETNSID NL_RTM_GETNSID
+ NL_RTM_NEWSTATS = 92,
+#define NL_RTM_NEWSTATS NL_RTM_NEWSTATS
+ NL_RTM_GETSTATS = 94,
+#define NL_RTM_GETSTATS NL_RTM_GETSTATS
+ NL_RTM_NEWNEXTHOP = 104,
+#define NL_RTM_NEWNEXTHOP NL_RTM_NEWNEXTHOP
+ NL_RTM_DELNEXTHOP,
+#define NL_RTM_DELNEXTHOP NL_RTM_DELNEXTHOP
+ NL_RTM_GETNEXTHOP,
+#define NL_RTM_GETNEXTHOP NL_RTM_GETNEXTHOP
+ __NL_RTM_MAX,
+};
+/* Rounds __NL_RTM_MAX up to a multiple of four, minus one: the last slot of the final group. */
+#define NL_RTM_MAX (((__NL_RTM_MAX + 3) & ~3) - 1)
+
+#ifndef _KERNEL
+/*
+ * RTM_* namespace clashes with BSD rtsock namespace.
+ * Use NL_RTM_ prefix in the kernel and map it to RTM_
+ * for userland.
+ */
+#define RTM_BASE NL_RTM_BASE
+#define RTM_NEWLINK NL_RTM_NEWLINK
+#define RTM_DELLINK NL_RTM_DELLINK
+#define RTM_GETLINK NL_RTM_GETLINK
+#define RTM_SETLINK NL_RTM_SETLINK
+#define RTM_NEWADDR NL_RTM_NEWADDR
+#define RTM_DELADDR NL_RTM_DELADDR
+#define RTM_GETADDR NL_RTM_GETADDR
+#define RTM_NEWROUTE NL_RTM_NEWROUTE
+#define RTM_DELROUTE NL_RTM_DELROUTE
+#define RTM_GETROUTE NL_RTM_GETROUTE
+#define RTM_NEWNEXTHOP NL_RTM_NEWNEXTHOP
+#define RTM_DELNEXTHOP NL_RTM_DELNEXTHOP
+#define RTM_GETNEXTHOP NL_RTM_GETNEXTHOP
+#endif
+
+
+/*
+ * Route-related (RTM_<NEW|DEL|GET>ROUTE) message header and attributes.
+ */
+struct rtmsg {
+ unsigned char rtm_family; /* address family */
+ unsigned char rtm_dst_len; /* Prefix length */
+ unsigned char rtm_src_len; /* Source prefix length (not used) */
+ unsigned char rtm_tos; /* Type of service (not used) */
+ unsigned char rtm_table; /* rtable id */
+ unsigned char rtm_protocol; /* Routing protocol id (RTPROT_) */
+ unsigned char rtm_scope; /* Route distance (RT_SCOPE_) */
+ unsigned char rtm_type; /* Route type (RTN_) */
+ unsigned rtm_flags; /* Route flags (RTM_F_) */
+};
+
+/*
+ * RFC 3549, 3.1.1, route type (rtm_type field).
+ */
+enum {
+ RTN_UNSPEC,
+ RTN_UNICAST, /* Unicast route */
+ RTN_LOCAL, /* Accept locally (not supported) */
+ RTN_BROADCAST, /* Accept locally as broadcast, send as broadcast */
+ RTN_ANYCAST, /* Accept locally as broadcast, but send as unicast */
+ RTN_MULTICAST, /* Multicast route */
+ RTN_BLACKHOLE, /* Drop traffic towards destination */
+ RTN_UNREACHABLE,/* Destination is unreachable */
+ RTN_PROHIBIT, /* Administratively prohibited */
+ RTN_THROW, /* Not in this table (not supported) */
+ RTN_NAT, /* Translate this address (not supported) */
+ RTN_XRESOLVE, /* Use external resolver (not supported) */
+ __RTN_MAX,
+};
+#define RTN_MAX (__RTN_MAX - 1)
+
+/*
+ * RFC 3549, 3.1.1, protocol (Identifies what/who added the route).
+ * Values larger than RTPROT_STATIC(4) are not interpreted by the
+ * kernel, they are just for user information.
+ */
+
+#define RTPROT_UNSPEC 0
+#define RTPROT_REDIRECT 1 /* Route installed by ICMP redirect */
+#define RTPROT_KERNEL 2 /* Route installed by kernel */
+#define RTPROT_BOOT 3 /* Route installed during boot */
+#define RTPROT_STATIC 4 /* Route installed by administrator */
+
+#define RTPROT_GATED 8 /* Apparently, GateD */
+#define RTPROT_RA 9 /* RDISC/ND router advertisements */
+#define RTPROT_MRT 10 /* Merit MRT */
+#define RTPROT_ZEBRA 11 /* Zebra */
+#define RTPROT_BIRD 12 /* BIRD */
+#define RTPROT_DNROUTED 13 /* DECnet routing daemon */
+#define RTPROT_XORP 14 /* XORP */
+#define RTPROT_NTK 15 /* Netsukuku */
+#define RTPROT_DHCP 16 /* DHCP client */
+#define RTPROT_MROUTED 17 /* Multicast daemon */
+#define RTPROT_KEEPALIVED 18 /* Keepalived daemon */
+#define RTPROT_BABEL 42 /* Babel daemon */
+#define RTPROT_OPENR 99 /* Open Routing (Open/R) Routes */
+#define RTPROT_BGP 186 /* BGP Routes */
+#define RTPROT_ISIS 187 /* ISIS Routes */
+#define RTPROT_OSPF 188 /* OSPF Routes */
+#define RTPROT_RIP 189 /* RIP Routes */
+#define RTPROT_EIGRP 192 /* EIGRP Routes */
+
+/*
+ * RFC 3549 3.1.1 Route scope (valid distance to destination).
+ *
+ * The values between RT_SCOPE_UNIVERSE(0) and RT_SCOPE_SITE(200)
+ * are available to the user.
+*/
+enum rt_scope_t {
+ RT_SCOPE_UNIVERSE = 0,
+ /* User defined values */
+ RT_SCOPE_SITE = 200,
+ RT_SCOPE_LINK = 253,
+ RT_SCOPE_HOST = 254,
+ RT_SCOPE_NOWHERE = 255
+};
+
+/*
+ * RFC 3549 3.1.1 Route flags.
+ *
+*/
+#define RTM_F_NOTIFY 0x100 /* Notify user of route change */
+#define RTM_F_CLONED 0x200 /* This route is cloned (not used) */
+#define RTM_F_EQUALIZE 0x400 /* Multipath equalizer: NI */
+#define RTM_F_PREFIX 0x800 /* Prefix addresses */
+#define RTM_F_LOOKUP_TABLE 0x1000 /* set tableid to FIB lookup result */
+#define RTM_F_FIB_MATCH 0x2000 /* return full fib lookup match */
+#define RTM_F_OFFLOAD 0x4000 /* route is offloaded */
+#define RTM_F_TRAP 0x8000 /* route is trapping packets */
+#define RTM_F_OFFLOAD_FAILED 0x20000000 /* route offload failed */
+
+/*
+ * Compatibility handling helpers (userland only).
+ *
+ * NL_RTM_HDRLEN   size of the fixed 'struct rtmsg' header
+ * RTM_RTA(_rtm)   first attribute, located right after the rtmsg header
+ * RTM_PAYLOAD(_h) payload length of the netlink message past that header
+ */
+#ifndef _KERNEL
+#define NL_RTM_HDRLEN ((int)sizeof(struct rtmsg))
+#define RTM_RTA(_rtm) ((struct rtattr *)((char *)(_rtm) + NL_RTM_HDRLEN))
+#define RTM_PAYLOAD(_hdr) NLMSG_PAYLOAD((_hdr), NL_RTM_HDRLEN)
+#endif
+
+/*
+ * Routing table identifiers.
+ * Contrary to Linux implementation, FreeBSD route table numbering starts from 0.
+ * Thus, indicating "all tables" can be done by not including RTA_TABLE attribute
+ * and keeping rtm_table=0 (compatibility) or setting RTA_TABLE value to RT_TABLE_UNSPEC.
+ */
+#define RT_TABLE_MAIN 0 /* RT_DEFAULT_FIB */
+#define RT_TABLE_UNSPEC 0xFFFFFFFF /* RT_ALL_FIBS */
+
+enum rtattr_type_t {
+ NL_RTA_UNSPEC,
+ NL_RTA_DST,
+ NL_RTA_SRC,
+ NL_RTA_IIF,
+ NL_RTA_OIF,
+ NL_RTA_GATEWAY,
+ NL_RTA_PRIORITY,
+ NL_RTA_PREFSRC,
+ NL_RTA_METRICS,
+ NL_RTA_MULTIPATH,
+ NL_RTA_PROTOINFO, /* not used / deprecated */
+ NL_RTA_FLOW,
+ NL_RTA_CACHEINFO, /* not used */
+ NL_RTA_SESSION, /* not used / deprecated */
+ NL_RTA_MP_ALGO, /* not used / deprecated */
+ NL_RTA_TABLE,
+ NL_RTA_MARK, /* not used */
+ NL_RTA_MFC_STATS,
+ NL_RTA_VIA,
+ NL_RTA_NEWDST,
+ NL_RTA_PREF,
+ NL_RTA_ENCAP_TYPE,
+ NL_RTA_ENCAP,
+ NL_RTA_EXPIRES,
+ NL_RTA_PAD,
+ NL_RTA_UID,
+ NL_RTA_TTL_PROPAGATE,
+ NL_RTA_IP_PROTO,
+ NL_RTA_SPORT,
+ NL_RTA_DPORT,
+ NL_RTA_NH_ID,
+ __RTA_MAX
+};
+#define NL_RTA_MAX (__RTA_MAX - 1)
+
+#ifndef _KERNEL
+/*
+ * RTA_* space has clashes with rtsock namespace.
+ * Use NL_RTA_ prefix in the kernel and map to
+ * RTA_ for userland.
+ */
+#define RTA_UNSPEC NL_RTA_UNSPEC
+#define RTA_DST NL_RTA_DST
+#define RTA_SRC NL_RTA_SRC
+#define RTA_IIF NL_RTA_IIF
+#define RTA_OIF NL_RTA_OIF
+#define RTA_GATEWAY NL_RTA_GATEWAY
+#define RTA_PRIORITY NL_RTA_PRIORITY
+#define RTA_PREFSRC NL_RTA_PREFSRC
+#define RTA_METRICS NL_RTA_METRICS
+#define RTA_MULTIPATH NL_RTA_MULTIPATH
+#define RTA_PROTOINFO NL_RTA_PROTOINFO
+#define RTA_FLOW NL_RTA_FLOW
+#define RTA_CACHEINFO NL_RTA_CACHEINFO
+#define RTA_SESSION NL_RTA_SESSION
+#define RTA_MP_ALGO NL_RTA_MP_ALGO
+#define RTA_TABLE NL_RTA_TABLE
+#define RTA_MARK NL_RTA_MARK
+#define RTA_MFC_STATS NL_RTA_MFC_STATS
+#define RTA_VIA NL_RTA_VIA
+#define RTA_NEWDST NL_RTA_NEWDST
+#define RTA_PREF NL_RTA_PREF
+#define RTA_ENCAP_TYPE NL_RTA_ENCAP_TYPE
+#define RTA_ENCAP NL_RTA_ENCAP
+#define RTA_EXPIRES NL_RTA_EXPIRES
+#define RTA_PAD NL_RTA_PAD
+#define RTA_UID NL_RTA_UID
+#define RTA_TTL_PROPAGATE NL_RTA_TTL_PROPAGATE
+#define RTA_IP_PROTO NL_RTA_IP_PROTO
+#define RTA_SPORT NL_RTA_SPORT
+#define RTA_DPORT NL_RTA_DPORT
+#define RTA_NH_ID NL_RTA_NH_ID
+#define RTA_MAX NL_RTA_MAX
+#endif
+
+/*
+ * route attribute header
+ */
+struct rtattr {
+ unsigned short rta_len; /* length including this header (see NL_RTA_DATA_LEN) */
+ unsigned short rta_type; /* attribute type */
+};
+
+#define NL_RTA_ALIGN_SIZE NL_ITEM_ALIGN_SIZE
+#define NL_RTA_ALIGN NL_ITEM_ALIGN
+#define NL_RTA_HDRLEN ((int)sizeof(struct rtattr))
+#define NL_RTA_DATA_LEN(_rta) ((int)((_rta)->rta_len - NL_RTA_HDRLEN))
+#define NL_RTA_DATA(_rta) NL_ITEM_DATA(_rta, NL_RTA_HDRLEN)
+#define NL_RTA_DATA_CONST(_rta) NL_ITEM_DATA_CONST(_rta, NL_RTA_HDRLEN)
+
+/* Compatibility attribute handling helpers */
+#ifndef _KERNEL
+#define RTA_ALIGNTO NL_RTA_ALIGN_SIZE
+#define RTA_ALIGN(_len) NL_RTA_ALIGN(_len)
+#define _RTA_LEN(_rta) ((int)(_rta)->rta_len)
+#define _RTA_ALIGNED_LEN(_rta) RTA_ALIGN(_RTA_LEN(_rta))
+#define RTA_OK(_rta, _len) NL_ITEM_OK(_rta, _len, NL_RTA_HDRLEN, _RTA_LEN)
+#define RTA_NEXT(_rta, _len) NL_ITEM_ITER(_rta, _len, _RTA_ALIGNED_LEN)
+#define RTA_LENGTH(_len) (NL_RTA_HDRLEN + (_len))
+#define RTA_SPACE(_len) RTA_ALIGN(RTA_LENGTH(_len))
+#define RTA_DATA(_rta) NL_RTA_DATA(_rta)
+#define RTA_PAYLOAD(_rta) ((int)(_RTA_LEN(_rta) - NL_RTA_HDRLEN)) /* attribute length without its header */
+#endif
+
+/* RTA attribute headers */
+
+/*
+ * RTA_VIA payload: address family followed by the raw gateway address.
+ * The address is a C99 flexible array member; its length depends on
+ * rtvia_family.
+ */
+struct rtvia {
+	sa_family_t	rtvia_family;
+	uint8_t		rtvia_addr[];
+};
+
+
+/*
+ * RTA_METRICS is a nested attribute consisting of an array of 'struct rtattr'
+ * with the types defined below. Most of the values are uint32_t.
+ */
+ enum {
+ NL_RTAX_UNSPEC,
+#define NL_RTAX_UNSPEC NL_RTAX_UNSPEC
+ NL_RTAX_LOCK,
+#define NL_RTAX_LOCK NL_RTAX_LOCK
+ NL_RTAX_MTU,
+#define NL_RTAX_MTU NL_RTAX_MTU
+ NL_RTAX_WINDOW,
+#define NL_RTAX_WINDOW NL_RTAX_WINDOW
+ NL_RTAX_RTT,
+#define NL_RTAX_RTT NL_RTAX_RTT
+ NL_RTAX_RTTVAR,
+#define NL_RTAX_RTTVAR NL_RTAX_RTTVAR
+ NL_RTAX_SSTHRESH,
+#define NL_RTAX_SSTHRESH NL_RTAX_SSTHRESH
+ NL_RTAX_CWND,
+#define NL_RTAX_CWND NL_RTAX_CWND
+ NL_RTAX_ADVMSS,
+#define NL_RTAX_ADVMSS NL_RTAX_ADVMSS
+ NL_RTAX_REORDERING,
+#define NL_RTAX_REORDERING NL_RTAX_REORDERING
+ NL_RTAX_HOPLIMIT,
+#define NL_RTAX_HOPLIMIT NL_RTAX_HOPLIMIT
+ NL_RTAX_INITCWND,
+#define NL_RTAX_INITCWND NL_RTAX_INITCWND
+ NL_RTAX_FEATURES,
+#define NL_RTAX_FEATURES NL_RTAX_FEATURES
+ NL_RTAX_RTO_MIN,
+#define NL_RTAX_RTO_MIN NL_RTAX_RTO_MIN
+ NL_RTAX_INITRWND,
+#define NL_RTAX_INITRWND NL_RTAX_INITRWND
+ NL_RTAX_QUICKACK,
+#define NL_RTAX_QUICKACK NL_RTAX_QUICKACK
+ NL_RTAX_CC_ALGO,
+#define NL_RTAX_CC_ALGO NL_RTAX_CC_ALGO
+ NL_RTAX_FASTOPEN_NO_COOKIE,
+#define NL_RTAX_FASTOPEN_NO_COOKIE NL_RTAX_FASTOPEN_NO_COOKIE
+ __NL_RTAX_MAX
+};
+#define NL_RTAX_MAX (__NL_RTAX_MAX - 1)
+
+#define RTAX_FEATURE_ECN (1 << 0)
+#define RTAX_FEATURE_SACK (1 << 1)
+#define RTAX_FEATURE_TIMESTAMP (1 << 2)
+#define RTAX_FEATURE_ALLFRAG (1 << 3)
+
+#define RTAX_FEATURE_MASK \
+ (RTAX_FEATURE_ECN | RTAX_FEATURE_SACK | RTAX_FEATURE_TIMESTAMP | \
+ RTAX_FEATURE_ALLFRAG)
+
+#ifndef _KERNEL
+/*
+ * RTAX_* space clashes with rtsock namespace.
+ * Use NL_RTAX_ prefix in the kernel and map to
+ * RTAX_ for userland.
+ */
+#define RTAX_UNSPEC NL_RTAX_UNSPEC
+#define RTAX_LOCK NL_RTAX_LOCK
+#define RTAX_MTU NL_RTAX_MTU
+#define RTAX_WINDOW NL_RTAX_WINDOW
+#define RTAX_RTT NL_RTAX_RTT
+#define RTAX_RTTVAR NL_RTAX_RTTVAR
+#define RTAX_SSTHRESH NL_RTAX_SSTHRESH
+#define RTAX_CWND NL_RTAX_CWND
+#define RTAX_ADVMSS NL_RTAX_ADVMSS
+#define RTAX_REORDERING NL_RTAX_REORDERING
+#define RTAX_HOPLIMIT NL_RTAX_HOPLIMIT
+#define RTAX_INITCWND NL_RTAX_INITCWND
+#define RTAX_FEATURES NL_RTAX_FEATURES
+#define RTAX_RTO_MIN NL_RTAX_RTO_MIN
+#define RTAX_INITRWND NL_RTAX_INITRWND
+#define RTAX_QUICKACK NL_RTAX_QUICKACK
+#define RTAX_CC_ALGO NL_RTAX_CC_ALGO
+#define RTAX_FASTOPEN_NO_COOKIE NL_RTAX_FASTOPEN_NO_COOKIE
+#endif
+
+/*
+ * RTA_MULTIPATH consists of array of rtnexthop structures.
+ */
+struct rtnexthop {
+ unsigned short rtnh_len;
+ unsigned char rtnh_flags;
+ unsigned char rtnh_hops;
+ int rtnh_ifindex;
+};
+
+/* rtnh_flags */
+
+#define RTNH_F_DEAD 0x01 /* Nexthop is dead (used by multipath) */
+#define RTNH_F_PERVASIVE 0x02 /* Do recursive gateway lookup */
+#define RTNH_F_ONLINK 0x04 /* Gateway is forced on link */
+#define RTNH_F_OFFLOAD 0x08 /* Nexthop is offloaded */
+#define RTNH_F_LINKDOWN 0x10 /* carrier-down on nexthop */
+#define RTNH_F_UNRESOLVED 0x20 /* The entry is unresolved (ipmr) */
+#define RTNH_F_TRAP 0x40 /* Nexthop is trapping packets */
+
+#define RTNH_COMPARE_MASK (RTNH_F_DEAD | RTNH_F_LINKDOWN | \
+ RTNH_F_OFFLOAD | RTNH_F_TRAP)
+
+/* Macros to handle nexthops */
+
+#define RTNH_ALIGNTO NL_ITEM_ALIGN_SIZE
+#define RTNH_ALIGN(_len) NL_ITEM_ALIGN(_len)
+#define RTNH_HDRLEN ((int)sizeof(struct rtnexthop))
+#define _RTNH_LEN(_nh) ((int)(_nh)->rtnh_len)
+#define _RTNH_ALIGNED_LEN(_nh) RTNH_ALIGN(_RTNH_LEN(_nh))
+#define RTNH_OK(_nh, _len) NL_ITEM_OK(_nh, _len, RTNH_HDRLEN, _RTNH_LEN)
+//#define RTNH_NEXT(_nh) (struct rtnexthop *)NL_ITEM_DATA(_nh, RTNH_HDRLEN)
+//#define RTNH_NEXT(_nh) NL_ITEM_NEXT(_nh, _RTNH_ALIGNED_LEN(_nh))
+#define RTNH_LENGTH(_len) (RTNH_HDRLEN + (_len))
+#define RTNH_SPACE(_len) RTNH_ALIGN(RTNH_LENGTH(_len))
+#define RTNH_DATA(_nh) ((struct rtattr *)NL_ITEM_DATA(_nh, RTNH_HDRLEN))
+
+
+struct rtgenmsg {
+ unsigned char rtgen_family;
+};
+
+
+/*
+ * NEXTHOP-related (RTM_<NEW|DEL|GET>NEXTHOP) message header and attributes.
+ */
+
+struct nhmsg {
+ unsigned char nh_family;
+ unsigned char nh_scope; /* ignored on RX, filled by kernel */
+ unsigned char nh_protocol; /* Routing protocol that installed nh */
+ unsigned char resvd;
+ unsigned int nh_flags; /* RTNH_F_* flags */
+};
+
+/* entry in a nexthop group; NHA_GROUP carries an array of these */
+struct nexthop_grp {
+ uint32_t id; /* nexthop userland index */
+ uint8_t weight; /* weight of this nexthop */
+ uint8_t resvd1; /* reserved, going by the field name */
+ uint16_t resvd2; /* reserved, going by the field name */
+};
+
+enum {
+ NEXTHOP_GRP_TYPE_MPATH, /* default nexthop group */
+ NEXTHOP_GRP_TYPE_RES, /* resilient nexthop group */
+ __NEXTHOP_GRP_TYPE_MAX,
+};
+#define NEXTHOP_GRP_TYPE_MAX (__NEXTHOP_GRP_TYPE_MAX - 1)
+
+enum {
+ NHA_UNSPEC,
+ NHA_ID, /* u32; id for nexthop. id == 0 means auto-assign */
+
+ NHA_GROUP, /* array of nexthop_grp */
+ NHA_GROUP_TYPE, /* u16 one of NEXTHOP_GRP_TYPE */
+ /* if NHA_GROUP attribute is added, no other attributes can be set */
+
+ NHA_BLACKHOLE, /* flag; nexthop used to blackhole packets */
+ /* if NHA_BLACKHOLE is added, OIF, GATEWAY, ENCAP can not be set */
+
+ NHA_OIF, /* u32; nexthop device */
+ NHA_GATEWAY, /* be32 (IPv4) or in6_addr (IPv6) gw address */
+ NHA_ENCAP_TYPE, /* u16; lwt encap type */
+ NHA_ENCAP, /* lwt encap data */
+
+ /* NHA_OIF can be appended to dump request to return only
+ * nexthops using given device
+ */
+ NHA_GROUPS, /* flag; only return nexthop groups in dump */
+ NHA_MASTER, /* u32; only return nexthops with given master dev */
+
+ NHA_FDB, /* flag; nexthop belongs to a bridge fdb */
+ /* if NHA_FDB is added, OIF, BLACKHOLE, ENCAP cannot be set */
+
+ /* nested; resilient nexthop group attributes */
+ NHA_RES_GROUP,
+ /* nested; nexthop bucket attributes */
+ NHA_RES_BUCKET,
+
+ __NHA_MAX,
+};
+
+#define NHA_MAX (__NHA_MAX - 1)
+
+enum {
+ NHA_RES_GROUP_UNSPEC,
+ /* Pad attribute for 64-bit alignment. */
+ NHA_RES_GROUP_PAD = NHA_RES_GROUP_UNSPEC,
+
+ /* u16; number of nexthop buckets in a resilient nexthop group */
+ NHA_RES_GROUP_BUCKETS,
+ /* clock_t as u32; nexthop bucket idle timer (per-group) */
+ NHA_RES_GROUP_IDLE_TIMER,
+ /* clock_t as u32; nexthop unbalanced timer */
+ NHA_RES_GROUP_UNBALANCED_TIMER,
+ /* clock_t as u64; nexthop unbalanced time */
+ NHA_RES_GROUP_UNBALANCED_TIME,
+ __NHA_RES_GROUP_MAX,
+};
+#define NHA_RES_GROUP_MAX (__NHA_RES_GROUP_MAX - 1)
+
+
+/*****************************************************************
+ * Link layer specific messages.
+ ****/
+
+/* struct ifinfomsg
+ * passes link level specific information, not dependent
+ * on network protocol.
+ */
+
+struct ifinfomsg {
+ unsigned char ifi_family; /* Related XX */
+ unsigned char __ifi_pad;
+ unsigned short ifi_type; /* ARPHRD_* */
+ int ifi_index; /* Link index */
+ unsigned ifi_flags; /* IFF_* flags */
+ unsigned ifi_change; /* IFF_* change mask */
+};
+
+#ifndef _KERNEL
+/* Compatibility helpers */
+#define _IFINFO_HDRLEN ((int)sizeof(struct ifinfomsg))
+#define IFLA_RTA(_ifi) ((struct rtattr *)NL_ITEM_DATA(_ifi, _IFINFO_HDRLEN))
+#define IFLA_PAYLOAD(_ifi) NLMSG_PAYLOAD(_ifi, _IFINFO_HDRLEN)
+#endif
+
+enum {
+ IFLA_UNSPEC,
+ IFLA_ADDRESS,
+ IFLA_BROADCAST,
+ IFLA_IFNAME,
+ IFLA_MTU,
+ IFLA_LINK,
+ IFLA_QDISC,
+ IFLA_STATS,
+ IFLA_COST,
+#define IFLA_COST IFLA_COST
+ IFLA_PRIORITY,
+#define IFLA_PRIORITY IFLA_PRIORITY
+ IFLA_MASTER,
+#define IFLA_MASTER IFLA_MASTER
+ IFLA_WIRELESS, /* Wireless Extension event - see wireless.h */
+#define IFLA_WIRELESS IFLA_WIRELESS
+ IFLA_PROTINFO, /* Protocol specific information for a link */
+#define IFLA_PROTINFO IFLA_PROTINFO
+ IFLA_TXQLEN,
+#define IFLA_TXQLEN IFLA_TXQLEN
+ IFLA_MAP,
+#define IFLA_MAP IFLA_MAP
+ IFLA_WEIGHT,
+#define IFLA_WEIGHT IFLA_WEIGHT
+ IFLA_OPERSTATE,
+ IFLA_LINKMODE,
+ IFLA_LINKINFO,
+#define IFLA_LINKINFO IFLA_LINKINFO
+ IFLA_NET_NS_PID,
+ IFLA_IFALIAS,
+ IFLA_NUM_VF, /* Number of VFs if device is SR-IOV PF */
+ IFLA_VFINFO_LIST,
+ IFLA_STATS64,
+ IFLA_VF_PORTS,
+ IFLA_PORT_SELF,
+ IFLA_AF_SPEC,
+ IFLA_GROUP, /* Group the device belongs to */
+ IFLA_NET_NS_FD,
+ IFLA_EXT_MASK, /* Extended info mask, VFs, etc */
+ IFLA_PROMISCUITY, /* Promiscuity count: > 0 means acts PROMISC */
+#define IFLA_PROMISCUITY IFLA_PROMISCUITY
+ IFLA_NUM_TX_QUEUES,
+ IFLA_NUM_RX_QUEUES,
+ IFLA_CARRIER,
+ IFLA_PHYS_PORT_ID,
+ IFLA_CARRIER_CHANGES,
+ IFLA_PHYS_SWITCH_ID,
+ IFLA_LINK_NETNSID,
+ IFLA_PHYS_PORT_NAME,
+ IFLA_PROTO_DOWN,
+ IFLA_GSO_MAX_SEGS,
+ IFLA_GSO_MAX_SIZE,
+ IFLA_PAD,
+ IFLA_XDP,
+ IFLA_EVENT,
+ IFLA_NEW_NETNSID,
+ IFLA_IF_NETNSID,
+ IFLA_TARGET_NETNSID = IFLA_IF_NETNSID, /* new alias */
+ IFLA_CARRIER_UP_COUNT,
+ IFLA_CARRIER_DOWN_COUNT,
+ IFLA_NEW_IFINDEX,
+ IFLA_MIN_MTU,
+ IFLA_MAX_MTU,
+ IFLA_PROP_LIST,
+ IFLA_ALT_IFNAME, /* Alternative ifname */
+ IFLA_PERM_ADDRESS,
+ IFLA_PROTO_DOWN_REASON,
+ __IFLA_MAX };
+
+#define IFLA_MAX (__IFLA_MAX - 1)
+
+
+/********************************************************************
+ * prefix information
+ ****/
+
+struct prefixmsg {
+ unsigned char prefix_family;
+ unsigned char prefix_pad1;
+ unsigned short prefix_pad2;
+ int prefix_ifindex;
+ unsigned char prefix_type;
+ unsigned char prefix_len;
+ unsigned char prefix_flags;
+ unsigned char prefix_pad3;
+};
+
+enum { PREFIX_UNSPEC, PREFIX_ADDRESS, PREFIX_CACHEINFO, __PREFIX_MAX };
+
+#define PREFIX_MAX (__PREFIX_MAX - 1)
+
+struct prefix_cacheinfo {
+ uint32_t preferred_time;
+ uint32_t valid_time;
+};
+
+#ifndef _KERNEL
+/* RTnetlink multicast groups - backwards compatibility for userspace */
+#define RTMGRP_LINK 0x01
+#define RTMGRP_NOTIFY 0x02
+#define RTMGRP_NEIGH 0x04
+#define RTMGRP_TC 0x08
+
+#define RTMGRP_IPV4_IFADDR 0x10
+#define RTMGRP_IPV4_MROUTE 0x20
+#define RTMGRP_IPV4_ROUTE 0x40
+#define RTMGRP_IPV4_RULE 0x80
+
+#define RTMGRP_IPV6_IFADDR 0x100
+#define RTMGRP_IPV6_MROUTE 0x200
+#define RTMGRP_IPV6_ROUTE 0x400
+#define RTMGRP_IPV6_IFINFO 0x800
+
+#define RTMGRP_DECnet_IFADDR 0x1000
+#define RTMGRP_DECnet_ROUTE 0x4000
+
+#define RTMGRP_IPV6_PREFIX 0x20000
+#endif
+
+/* RTnetlink multicast groups */
+enum rtnetlink_groups {
+ RTNLGRP_NONE,
+#define RTNLGRP_NONE RTNLGRP_NONE
+ RTNLGRP_LINK,
+#define RTNLGRP_LINK RTNLGRP_LINK
+ RTNLGRP_NOTIFY,
+#define RTNLGRP_NOTIFY RTNLGRP_NOTIFY
+ RTNLGRP_NEIGH,
+#define RTNLGRP_NEIGH RTNLGRP_NEIGH
+ RTNLGRP_TC,
+#define RTNLGRP_TC RTNLGRP_TC
+ RTNLGRP_IPV4_IFADDR,
+#define RTNLGRP_IPV4_IFADDR RTNLGRP_IPV4_IFADDR
+ RTNLGRP_IPV4_MROUTE,
+#define RTNLGRP_IPV4_MROUTE RTNLGRP_IPV4_MROUTE
+ RTNLGRP_IPV4_ROUTE,
+#define RTNLGRP_IPV4_ROUTE RTNLGRP_IPV4_ROUTE
+ RTNLGRP_IPV4_RULE,
+#define RTNLGRP_IPV4_RULE RTNLGRP_IPV4_RULE
+ RTNLGRP_IPV6_IFADDR,
+#define RTNLGRP_IPV6_IFADDR RTNLGRP_IPV6_IFADDR
+ RTNLGRP_IPV6_MROUTE,
+#define RTNLGRP_IPV6_MROUTE RTNLGRP_IPV6_MROUTE
+ RTNLGRP_IPV6_ROUTE,
+#define RTNLGRP_IPV6_ROUTE RTNLGRP_IPV6_ROUTE
+ RTNLGRP_IPV6_IFINFO,
+#define RTNLGRP_IPV6_IFINFO RTNLGRP_IPV6_IFINFO
+ RTNLGRP_DECnet_IFADDR,
+#define RTNLGRP_DECnet_IFADDR RTNLGRP_DECnet_IFADDR
+ RTNLGRP_NOP2,
+ RTNLGRP_DECnet_ROUTE,
+#define RTNLGRP_DECnet_ROUTE RTNLGRP_DECnet_ROUTE
+ RTNLGRP_DECnet_RULE,
+#define RTNLGRP_DECnet_RULE RTNLGRP_DECnet_RULE
+ RTNLGRP_NOP4,
+ RTNLGRP_IPV6_PREFIX,
+#define RTNLGRP_IPV6_PREFIX RTNLGRP_IPV6_PREFIX
+ RTNLGRP_IPV6_RULE,
+#define RTNLGRP_IPV6_RULE RTNLGRP_IPV6_RULE
+ RTNLGRP_ND_USEROPT,
+#define RTNLGRP_ND_USEROPT RTNLGRP_ND_USEROPT
+ RTNLGRP_PHONET_IFADDR,
+#define RTNLGRP_PHONET_IFADDR RTNLGRP_PHONET_IFADDR
+ RTNLGRP_PHONET_ROUTE,
+#define RTNLGRP_PHONET_ROUTE RTNLGRP_PHONET_ROUTE
+ RTNLGRP_DCB,
+#define RTNLGRP_DCB RTNLGRP_DCB
+ RTNLGRP_IPV4_NETCONF,
+#define RTNLGRP_IPV4_NETCONF RTNLGRP_IPV4_NETCONF
+ RTNLGRP_IPV6_NETCONF,
+#define RTNLGRP_IPV6_NETCONF RTNLGRP_IPV6_NETCONF
+ RTNLGRP_MDB,
+#define RTNLGRP_MDB RTNLGRP_MDB
+ RTNLGRP_MPLS_ROUTE,
+#define RTNLGRP_MPLS_ROUTE RTNLGRP_MPLS_ROUTE
+ RTNLGRP_NSID,
+#define RTNLGRP_NSID RTNLGRP_NSID
+ RTNLGRP_MPLS_NETCONF,
+#define RTNLGRP_MPLS_NETCONF RTNLGRP_MPLS_NETCONF
+ RTNLGRP_IPV4_MROUTE_R,
+#define RTNLGRP_IPV4_MROUTE_R RTNLGRP_IPV4_MROUTE_R
+ RTNLGRP_IPV6_MROUTE_R,
+#define RTNLGRP_IPV6_MROUTE_R RTNLGRP_IPV6_MROUTE_R
+ RTNLGRP_NEXTHOP,
+#define RTNLGRP_NEXTHOP RTNLGRP_NEXTHOP
+ RTNLGRP_BRVLAN,
+#define RTNLGRP_BRVLAN RTNLGRP_BRVLAN
+ __RTNLGRP_MAX
+};
+#define RTNLGRP_MAX (__RTNLGRP_MAX - 1)
+
+// START OF IF_ADDR SECTION
+
+struct ifaddrmsg {
+ uint8_t ifa_family; /* Address family */
+ uint8_t ifa_prefixlen; /* Prefix length */
+ uint8_t ifa_flags; /* Address-specific flags */
+ uint8_t ifa_scope; /* Address scope */
+ uint32_t ifa_index; /* Link ifindex */
+};
+
+#ifndef _KERNEL
+#define _NL_IFA_HDRLEN ((int)sizeof(struct ifaddrmsg))
+#define IFA_RTA(_ifa) ((struct rtattr *)(NL_ITEM_DATA(_ifa, _NL_IFA_HDRLEN)))
+#define IFA_PAYLOAD(_hdr) NLMSG_PAYLOAD(_hdr, _NL_IFA_HDRLEN)
+#endif
+
+/*
+ * Important comment:
+ * IFA_ADDRESS is prefix address, rather than local interface address.
+ * It makes no difference for normally configured broadcast interfaces,
+ * but for point-to-point IFA_ADDRESS is DESTINATION address,
+ * local address is supplied in IFA_LOCAL attribute.
+ *
+ * IFA_FLAGS is a u32 attribute that extends the u8 field ifa_flags.
+ * If present, the value from struct ifaddrmsg will be ignored.
+ */
+enum {
+ IFA_UNSPEC,
+ IFA_ADDRESS,
+ IFA_LOCAL,
+ IFA_LABEL,
+ IFA_BROADCAST,
+ IFA_ANYCAST,
+ IFA_CACHEINFO,
+ IFA_MULTICAST,
+ IFA_FLAGS,
+ IFA_RT_PRIORITY, /* u32, priority/metric for prefix route */
+ IFA_TARGET_NETNSID,
+ __IFA_MAX,
+};
+#define IFA_MAX (__IFA_MAX - 1)
+
+/* ifa_flags */
+#define IFA_F_SECONDARY 0x01
+#define IFA_F_TEMPORARY IFA_F_SECONDARY
+#define IFA_F_NODAD 0x02
+#define IFA_F_OPTIMISTIC 0x04
+#define IFA_F_DADFAILED 0x08
+#define IFA_F_HOMEADDRESS 0x10
+#define IFA_F_DEPRECATED 0x20
+#define IFA_F_TENTATIVE 0x40
+#define IFA_F_PERMANENT 0x80
+#define IFA_F_MANAGETEMPADDR 0x100
+#define IFA_F_NOPREFIXROUTE 0x200
+#define IFA_F_MCAUTOJOIN 0x400
+#define IFA_F_STABLE_PRIVACY 0x800
+
+/* */
+
+
+struct ifa_cacheinfo {
+ uint32_t ifa_prefered;
+ uint32_t ifa_valid;
+ uint32_t cstamp; /* created timestamp, hundredths of seconds */
+ uint32_t tstamp; /* updated timestamp, hundredths of seconds */
+};
+
+// END OF IF_ADDR SECTION
+
+#endif
Index: sys/netlink/netlink_route.c
===================================================================
--- /dev/null
+++ sys/netlink/netlink_route.c
@@ -0,0 +1,968 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_route.h"
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/rmlock.h>
+#include <sys/socket.h>
+#include <sys/ck.h>
+
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <net/route.h>
+#include <net/route/nhop.h>
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_var.h>
+#include <netlink/netlink_route.h>
+
+#define DEBUG_MOD_NAME nl_route
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <net/route/route_debug.h>
+_DECLARE_DEBUG(LOG_DEBUG3);
+
+
+#if defined(INET6) || defined(INET)
+
+#endif
+
+/*
+ * Maps the nexthop flags to an rtnetlink route type.  A blackhole nexthop
+ * takes precedence over a rejecting one; everything else is reported as
+ * unicast.
+ */
+static unsigned char
+get_rtm_type(const struct nhop_object *nh)
+{
+	const int flags = nh->nh_flags;
+
+	/* Use the fact that nhg runtime flags are only NHF_MULTIPATH */
+	if ((flags & NHF_BLACKHOLE) != 0)
+		return (RTN_BLACKHOLE);
+	if ((flags & NHF_REJECT) != 0)
+		return (RTN_PROHIBIT);
+	return (RTN_UNICAST);
+}
+
+/*
+ * Derives the rtnetlink protocol id from the route flags of a nexthop.
+ * For a nexthop group the first member nexthop is consulted.
+ */
+static unsigned char
+get_rtm_protocol(const struct nhop_object *nh)
+{
+	const struct nhop_object *src = nh;
+
+	if (NH_IS_NHGRP(nh))
+		src = ((const struct nhgrp_object *)nh)->nhops[0];
+
+	const int rtflags = nhop_get_rtflags(src);
+
+	if ((rtflags & RTF_PROTO1) != 0)
+		return (RTPROT_ZEBRA);
+	if ((rtflags & RTF_STATIC) != 0)
+		return (RTPROT_STATIC);
+	return (RTPROT_KERNEL);
+}
+
+/*
+ * Translates an rtsock command (RTM_*) into the netlink message type used
+ * to report it.  Commands without a mapping yield 0.
+ */
+static int
+get_rtmsg_type_from_rtsock(int cmd)
+{
+	if (cmd == RTM_ADD || cmd == RTM_CHANGE || cmd == RTM_GET)
+		return (NL_RTM_NEWROUTE);
+	if (cmd == RTM_DELETE)
+		return (NL_RTM_DELROUTE);
+	return (0);
+}
+
+/*
+ * Copies the IPv4 address found at rta_data into a sockaddr_in obtained
+ * from npt_alloc_sockaddr().  Returns NULL and sets *perror to ENOBUFS when
+ * that allocation fails.
+ */
+static struct sockaddr *
+parse_rta_ip4(void *rta_data, struct netlink_parse_tracker *npt, int *perror)
+{
+	struct sockaddr_in *sa4;
+
+	sa4 = (struct sockaddr_in *)npt_alloc_sockaddr(npt, sizeof(struct sockaddr_in));
+	if (__predict_true(sa4 != NULL)) {
+		sa4->sin_len = sizeof(struct sockaddr_in);
+		sa4->sin_family = AF_INET;
+		memcpy(&sa4->sin_addr, rta_data, sizeof(struct in_addr));
+		return ((struct sockaddr *)sa4);
+	}
+	*perror = ENOBUFS;
+	return (NULL);
+}
+
+/*
+ * Builds an AF_INET sockaddr holding the netmask for prefix length plen.
+ * Returns NULL with *perror set to EINVAL when plen exceeds 32, or NULL
+ * with the error reported by parse_rta_ip4().
+ */
+static struct sockaddr *
+get_ip4_netmask(uint8_t plen, struct netlink_parse_tracker *npt, int *perror)
+{
+	struct in_addr mask;
+
+	if (__predict_false(plen > 32)) {
+		*perror = EINVAL;
+		return (NULL);
+	}
+
+	/*
+	 * Shift an unsigned constant: for plen == 1 a signed "1 << 31"
+	 * overflows int, which is undefined behaviour.
+	 */
+	mask.s_addr = htonl(plen ? ~((1U << (32 - plen)) - 1) : 0);
+	return (parse_rta_ip4(&mask, npt, perror));
+}
+
+/*
+ * Copies the IPv6 address found at rta_data into a sockaddr_in6 obtained
+ * from npt_alloc_sockaddr().  Returns NULL and sets *perror to ENOBUFS when
+ * that allocation fails.
+ */
+static struct sockaddr *
+parse_rta_ip6(void *rta_data, struct netlink_parse_tracker *npt, int *perror)
+{
+	struct sockaddr_in6 *sin6;
+
+	sin6 = (struct sockaddr_in6 *)npt_alloc_sockaddr(npt, sizeof(struct sockaddr_in6));
+	if (__predict_false(sin6 == NULL)) {
+		*perror = ENOBUFS;
+		return (NULL);
+	}
+	sin6->sin6_len = sizeof(struct sockaddr_in6);
+	sin6->sin6_family = AF_INET6;
+	/* Copy the whole 16-byte address, not just sizeof(struct in_addr). */
+	memcpy(&sin6->sin6_addr, rta_data, sizeof(struct in6_addr));
+	return ((struct sockaddr *)sin6);
+}
+
+/*
+ * Stores the netmask for an IPv6 prefix of 'mask' bits into *addr6.
+ * All 16 bytes are written, so the caller may pass uninitialized storage.
+ * 'mask' must not exceed 128; the caller is expected to check that.
+ */
+static void
+ipv6_writemask(struct in6_addr *addr6, uint8_t mask)
+{
+	uint32_t *cp;
+
+	/* Words past the prefix must read as zero, not as stack garbage. */
+	memset(addr6, 0, sizeof(*addr6));
+	for (cp = (uint32_t *)addr6; mask >= 32; mask -= 32)
+		*cp++ = 0xFFFFFFFF;
+	/* 1U: a signed "1 << 31" (mask == 1) would overflow int. */
+	if (mask > 0)
+		*cp = htonl(~((1U << (32 - mask)) - 1));
+}
+
+/*
+ * Rejects prefix lengths above 128 with EINVAL; otherwise hands the mask
+ * written by ipv6_writemask() to parse_rta_ip6() and returns its result.
+ */
+static struct sockaddr *
+get_ip6_netmask(uint8_t plen, struct netlink_parse_tracker *npt, int *perror)
+{
+	struct in6_addr netmask;
+
+	if (__predict_true(plen <= 128)) {
+		ipv6_writemask(&netmask, plen);
+		return (parse_rta_ip6(&netmask, npt, perror));
+	}
+	*perror = EINVAL;
+	return (NULL);
+}
+
+/*
+ * Parses an address attribute whose family is implied by its payload size:
+ * sizeof(struct in_addr) selects IPv4, sizeof(struct in6_addr) selects IPv6.
+ * Any other size is logged and fails with ENOTSUP.
+ */
+static struct sockaddr *
+parse_rta_ip(struct rtattr *rta, struct netlink_parse_tracker *npt, int *perror)
+{
+	void *payload = NL_RTA_DATA(rta);
+	int payload_len = NL_RTA_DATA_LEN(rta);
+
+	if (payload_len == sizeof(struct in_addr))
+		return (parse_rta_ip4(payload, npt, perror));
+	if (payload_len == sizeof(struct in6_addr))
+		return (parse_rta_ip6(payload, npt, perror));
+
+	RT_LOG(LOG_NOTICE, "unknown IP len: %d for rta type %d",
+	    payload_len, rta->rta_type);
+	*perror = ENOTSUP;
+	return (NULL);
+}
+
+/*
+ * Parses an NL_RTA_VIA attribute: a struct rtvia header carrying the
+ * address family, followed by the raw gateway address.
+ * Returns NULL with *perror set to EINVAL for a truncated attribute and
+ * to ENOTSUP for a family other than AF_INET/AF_INET6.
+ */
+static struct sockaddr *
+parse_rta_via(struct rtattr *rta, struct netlink_parse_tracker *npt, int *perror)
+{
+	struct rtvia *via = NL_RTA_DATA(rta);
+	int data_len = NL_RTA_DATA_LEN(rta);
+
+	/*
+	 * The whole comparison belongs inside __predict_false().  It is done
+	 * on ints so that a negative length cannot pass the check by being
+	 * converted to a huge unsigned value.
+	 */
+	if (__predict_false(data_len < (int)sizeof(struct rtvia))) {
+		*perror = EINVAL;
+		return (NULL);
+	}
+	data_len -= (int)offsetof(struct rtvia, rtvia_addr);
+
+	switch (via->rtvia_family) {
+	case AF_INET:
+		if (__predict_false(data_len < (int)sizeof(struct in_addr))) {
+			*perror = EINVAL;
+			return (NULL);
+		}
+		return (parse_rta_ip4(via->rtvia_addr, npt, perror));
+	case AF_INET6:
+		if (__predict_false(data_len < (int)sizeof(struct in6_addr))) {
+			*perror = EINVAL;
+			return (NULL);
+		}
+		return (parse_rta_ip6(via->rtvia_addr, npt, perror));
+	default:
+		*perror = ENOTSUP;
+		return (NULL);
+	}
+}
+
+/*
+ * Reads the payload of rta as a uint32_t.  On success *perror is cleared
+ * and the value is returned.  If the payload size differs from
+ * sizeof(uint32_t), *perror is set to EINVAL and 0 is returned.
+ */
+static uint32_t
+nl_rta_get_uint32(const struct rtattr *rta, int *perror)
+{
+	if (__predict_true(NL_RTA_DATA_LEN(rta) == sizeof(uint32_t))) {
+		const uint32_t *valp = (const uint32_t *)NL_RTA_DATA_CONST(rta);
+
+		*perror = 0;
+		return (*valp);
+	}
+
+	RT_LOG(LOG_DEBUG2, "nla type %d size(%u) is not uint32",
+	    rta->rta_type, NL_RTA_DATA_LEN(rta));
+	*perror = EINVAL;
+	return (0);
+}
+
+/*
+ * Resolves the interface index carried in rta through ifnet_byindex().
+ * Returns NULL when nl_rta_get_uint32() reports an error in *perror.
+ * Asserts that the caller is inside the network epoch.
+ */
+static struct ifnet *
+parse_rta_oif(const struct rtattr *rta, struct netlink_parse_tracker *npt, int *perror)
+{
+	uint32_t oif_index;
+
+	oif_index = nl_rta_get_uint32(rta, perror);
+	NET_EPOCH_ASSERT();
+
+	if (__predict_true(*perror == 0))
+		return (ifnet_byindex(oif_index));
+	return (NULL);
+}
+
+
+/*
+ * fibnum heuristics
+ *
+ * if (dump && rtm_table == 0 && !rta_table) RT_ALL_FIBS
+ * msg rtm_table RTA_TABLE result
+ * RTM_GETROUTE/dump 0 - RT_ALL_FIBS
+ * RTM_GETROUTE/dump 1 - 1
+ * RTM_GETROUTE/get 0 - 0
+ *
+ */
+
+/*
+ * Converts rtmsg message into rt_addrinfo.
+ *
+ * rtm points at the rtmsg header and len is the number of bytes available
+ * from there, attributes included.  The fib number is taken from rtm_table
+ * and may be overridden by an NL_RTA_TABLE attribute.  Unknown attribute
+ * types are logged and ignored; an attribute shorter than its own header
+ * stops the parsing without raising an error.
+ *
+ * Returns 0 on success or an errno value: EINVAL for an out-of-range fib
+ * number or when not all bits of required_mask ended up in
+ * info->rti_addrs, otherwise the error reported by the attribute parsers.
+ */
+static int
+parse_rtmsg_nlattr(struct rtmsg *rtm, int len, struct rt_addrinfo *info,
+    uint16_t required_mask, struct netlink_parse_tracker *npt)
+{
+	struct sockaddr *sa;
+	int error = 0;
+	struct nlattr *nla, *nla_head;
+
+	len -= NETLINK_ALIGN(sizeof(struct rtmsg));
+	nla_head = (struct nlattr *)((char *)rtm + NETLINK_ALIGN(sizeof(struct rtmsg)));
+
+	RT_LOG(LOG_DEBUG, "parse %p remaining_len %d", nla_head, len);
+
+	info->rti_fibnum = rtm->rtm_table;
+	info->rti_family = rtm->rtm_family;
+
+	/* Valid fib numbers are 0 .. V_rt_numfibs - 1. */
+	if (info->rti_fibnum >= V_rt_numfibs) {
+		RT_LOG(LOG_DEBUG, "incorrect fibnum: %u", info->rti_fibnum);
+		return (EINVAL);
+	}
+
+	NLA_FOREACH(nla, nla_head, len) {
+		struct rtattr *rta = (struct rtattr *)nla;
+		if (rta->rta_len < sizeof(struct rtattr)) {
+			RT_LOG(LOG_NOTICE, "invalid length for attribute %d, stopping processing",
+			    rta->rta_type);
+			break;
+		}
+		RT_LOG(LOG_DEBUG2, "parse rta %d len %d", rta->rta_type, rta->rta_len);
+
+		switch (rta->rta_type) {
+		case NL_RTA_DST:
+			sa = parse_rta_ip(rta, npt, &error);
+			if (sa != NULL) {
+				info->rti_info[RTAX_DST] = sa;
+				info->rti_addrs |= RTA_DST;
+				/* XXX: check DST af */
+			}
+			break;
+		case NL_RTA_GATEWAY:
+			sa = parse_rta_ip(rta, npt, &error);
+			if (sa != NULL) {
+				info->rti_info[RTAX_GATEWAY] = sa;
+				info->rti_addrs |= RTA_GATEWAY;
+				info->rti_flags |= RTF_GATEWAY;
+			}
+			break;
+		case NL_RTA_VIA:
+			sa = parse_rta_via(rta, npt, &error);
+			/* Only record the gateway when it was actually parsed. */
+			if (sa != NULL) {
+				info->rti_info[RTAX_GATEWAY] = sa;
+				info->rti_addrs |= RTA_GATEWAY;
+				info->rti_flags |= RTF_GATEWAY;
+			}
+			break;
+		case NL_RTA_OIF:
+			info->rti_ifp = parse_rta_oif(rta, npt, &error);
+			break;
+		case NL_RTA_TABLE:
+			info->rti_fibnum = nl_rta_get_uint32(rta, &error);
+			if (info->rti_fibnum >= V_rt_numfibs) {
+				RT_LOG(LOG_DEBUG, "incorrect fibnum: %u", info->rti_fibnum);
+				error = EINVAL;
+			}
+			break;
+		default:
+			RT_LOG(LOG_DEBUG, "unsupported rta_type %d", rta->rta_type);
+			break;
+		}
+		if (__predict_false(error != 0)) {
+			break;
+		}
+	}
+	/* XXX: IPv6 embedding */
+
+	if (error != 0)
+		return (error);
+
+	if ((info->rti_addrs & required_mask) != required_mask) {
+		RT_LOG(LOG_DEBUG, "required mask failed");
+		return (EINVAL);
+	}
+
+	return (0);
+}
+
+/*
+ * Completes info from the fixed rtmsg fields.  A full-length prefix
+ * (/32 for IPv4, /128 for IPv6) marks a host route; any other length is
+ * turned into a netmask sockaddr.  Other address families are left
+ * untouched.
+ *
+ * Returns 0 or the errno reported by the netmask helpers, which reject a
+ * prefix length exceeding the address width with EINVAL.
+ */
+static int
+finalize_rtmsg(struct rtmsg *rtm, int len, struct rt_addrinfo *info,
+    struct netlink_parse_tracker *npt)
+{
+	struct sockaddr *sa = NULL;
+	int error = 0;
+
+	switch (rtm->rtm_family) {
+	case AF_INET:
+		if (rtm->rtm_dst_len == 32) {
+			info->rti_flags |= RTF_HOST;
+			break;
+		}
+		/* Lengths above 32 are rejected by get_ip4_netmask(). */
+		sa = get_ip4_netmask(rtm->rtm_dst_len, npt, &error);
+		break;
+	case AF_INET6:
+		/* An IPv6 host route is /128, not /32. */
+		if (rtm->rtm_dst_len == 128) {
+			info->rti_flags |= RTF_HOST;
+			break;
+		}
+		/* Lengths above 128 are rejected by get_ip6_netmask(). */
+		sa = get_ip6_netmask(rtm->rtm_dst_len, npt, &error);
+		/* XXX: embed IPv6 addrs */
+		break;
+	}
+
+	/* Advertise a netmask only if one was actually built. */
+	if (sa != NULL) {
+		info->rti_info[RTAX_NETMASK] = sa;
+		info->rti_addrs |= RTA_NETMASK;
+	}
+
+	return (error);
+}
+
+/*
+ * Populates an addr_info struct from an rtmsg.
+ * Parses the nl_attributes and parses the netmask.
+ *
+ * Returns the rt_addrinfo allocated from npt, or NULL on failure, in which
+ * case npt->error holds the errno value.
+ */
+static struct rt_addrinfo *
+get_info_from_rtmsg(struct nlmsghdr *hdr, uint16_t required_mask,
+    struct netlink_parse_tracker *npt)
+{
+	struct rt_addrinfo *info;
+	struct rtmsg *rtm = (struct rtmsg *)nlmsg_data(hdr);
+	int len = hdr->nlmsg_len - NLMSG_HDRLEN;
+
+	info = npt_alloc(npt, sizeof(struct rt_addrinfo));
+	if (__predict_false(info == NULL)) {
+		/* Do not hand a NULL info to the parsers below. */
+		npt->error = ENOBUFS;
+		return (NULL);
+	}
+	/*
+	 * NOTE(review): the parsers OR bits into info->rti_addrs and
+	 * info->rti_flags, so npt_alloc() presumably returns zeroed
+	 * memory -- confirm.
+	 */
+
+	npt->error = parse_rtmsg_nlattr(rtm, len, info, required_mask, npt);
+	if (npt->error == 0)
+		npt->error = finalize_rtmsg(rtm, len, info, npt);
+
+	FIB_LOG(LOG_DEBUG2, info->rti_fibnum, info->rti_family, "errno=%d", npt->error);
+	if (npt->error == 0)
+		return (info);
+	return (NULL);
+}
+
+/*
+ * Picks the nexthop a rib command refers to: the old one for RTM_DELETE,
+ * the new one for every other command.
+ */
+static struct nhop_object *
+rc_get_nhop(const struct rib_cmd_info *rc)
+{
+	if (rc->rc_cmd == RTM_DELETE)
+		return (rc->rc_nh_old);
+	return (rc->rc_nh_new);
+}
+
+/*
+ * Appends the gateway of nh to the message being built in ns.
+ * An IPv4 gateway, and an IPv6 gateway of an IPv6 route, is written as
+ * NL_RTA_GATEWAY; an IPv6 gateway of an IPv4 route is written as
+ * NL_RTA_VIA.  AF_LINK nexthops (on-link prefixes) add nothing.
+ * Returns false if an attribute could not be added or the upper family is
+ * neither AF_INET nor AF_INET6.
+ */
+static bool
+dump_rc_nhop_gw(struct nlmsg_state *ns, struct nhop_object *nh)
+{
+	const int neigh_family = nhop_get_neigh_family(nh);
+
+	if (neigh_family == AF_INET) {
+		if (!nlattr_add(ns, NL_RTA_GATEWAY, 4, &nh->gw4_sa.sin_addr))
+			return (false);
+	} else if (neigh_family == AF_INET6) {
+		int upper_family = nhop_get_upper_family(nh);
+
+		if (upper_family == AF_INET6) {
+			if (!nlattr_add(ns, NL_RTA_GATEWAY, 16, &nh->gw6_sa.sin6_addr))
+				return (false);
+		} else if (upper_family == AF_INET) {
+			/* IPv4 over IPv6 */
+			char via_buf[20];
+			struct rtvia *via = (struct rtvia *)&via_buf[0];
+
+			via->rtvia_family = AF_INET6;
+			memcpy(via->rtvia_addr, &nh->gw6_sa.sin6_addr, 16);
+			if (!nlattr_add(ns, NL_RTA_VIA, 17, via))
+				return (false);
+		} else {
+			/* shouldn't happen */
+			return (false);
+		}
+	}
+	/* AF_LINK (onlink prefix) and any other family: nothing to add. */
+
+	return (true);
+}
+
+
+/*
+ * Appends the nexthop attributes of the route in rc to ns: the gateway
+ * (only for NHF_GATEWAY nexthops), the nexthop index as NL_RTA_NH_ID and
+ * the outgoing interface index as NL_RTA_OIF.
+ * Returns false when rc carries no nexthop or an attribute cannot be added.
+ */
+static bool
+dump_rc_nhop(struct nlmsg_state *ns, const struct rib_cmd_info *rc)
+{
+	struct nhop_object *nh;
+
+	nh = rc_get_nhop(rc);
+	/* XXX: can be raw */
+
+	if (nh == NULL)
+		return (false);
+
+	/*
+	 * IPv4 over IPv6
+	 * ('RTA_VIA', {'family': 10, 'addr': 'fe80::20c:29ff:fe67:2dd'}), ('RTA_OIF', 2),
+	 * IPv4 w/ gw
+	 * ('RTA_GATEWAY', '172.16.107.131'), ('RTA_OIF', 2)],
+	 * Direct route:
+	 * ('RTA_OIF', 2)
+	 */
+	/* A gateway that cannot be written fails the dump instead of being dropped. */
+	if ((nh->nh_flags & NHF_GATEWAY) != 0 && !dump_rc_nhop_gw(ns, nh))
+		return (false);
+
+	/* Add nhop id. XXX: Switch to user nhop id */
+	if (!nlattr_add_u32(ns, NL_RTA_NH_ID, nhop_get_idx(nh)))
+		return (false);
+
+	/* In any case, fill outgoing interface */
+	if (!nlattr_add_u32(ns, NL_RTA_OIF, nh->nh_ifp->if_index))
+		return (false);
+
+	return (true);
+}
+
+/*
+ * Dumps output from a rib command into an rtmsg
+ *
+ * Builds one route message in ns: the netlink header (pid, sequence number
+ * and flags copied from hdr), the rtmsg, and the NL_RTA_TABLE, NL_RTA_DST
+ * and nexthop attributes.  On failure the partially built message is
+ * aborted.
+ * Returns 0, ENOMEM if something could not be added to ns, or
+ * EAFNOSUPPORT for a route that is neither IPv4 nor IPv6.
+ */
+
+static int
+dump_rc(uint32_t fibnum, const struct nlmsghdr *hdr,
+    const struct rib_cmd_info *rc, struct nlmsg_state *ns)
+{
+	const struct nhop_object *nh = rc_get_nhop(rc);
+	struct rtmsg *rtm;
+	int error = 0;
+
+	NET_EPOCH_ASSERT();
+
+	int payload_len = sizeof(struct rtmsg);
+	int nlmsgtype = get_rtmsg_type_from_rtsock(rc->rc_cmd);
+	if (!nlmsg_add(ns, hdr->nlmsg_pid, hdr->nlmsg_seq, nlmsgtype,
+	    hdr->nlmsg_flags, payload_len))
+		goto enomem;
+
+	int family = rt_get_family(rc->rc_rt);
+	rtm = nlmsg_reserve_object(ns, struct rtmsg);
+	rtm->rtm_family = family;
+	rtm->rtm_dst_len = 0;
+	rtm->rtm_src_len = 0;
+	rtm->rtm_tos = 0;
+	/*
+	 * rtm_table is a single byte.  Always store a value so that a fib
+	 * number that does not fit never leaves the field unset; the full
+	 * number is carried by NL_RTA_TABLE below.
+	 */
+	rtm->rtm_table = (fibnum < 255) ? (unsigned char)fibnum : 0;
+	rtm->rtm_protocol = get_rtm_protocol(nh);
+	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
+	rtm->rtm_type = get_rtm_type(nh);
+	rtm->rtm_flags = 0;
+
+	if (!nlattr_add_u32(ns, NL_RTA_TABLE, fibnum))
+		goto enomem;
+
+	int plen = 0;
+	uint32_t scopeid = 0;
+	switch (family) {
+	case AF_INET:
+		{
+			struct in_addr addr;
+			rt_get_inet_prefix_plen(rc->rc_rt, &addr, &plen, &scopeid);
+			rtm->rtm_dst_len = plen;
+			if (!nlattr_add(ns, NL_RTA_DST, 4, &addr))
+				goto enomem;
+			break;
+		}
+	case AF_INET6:
+		{
+			struct in6_addr addr;
+			rt_get_inet6_prefix_plen(rc->rc_rt, &addr, &plen, &scopeid);
+			rtm->rtm_dst_len = plen;
+			if (!nlattr_add(ns, NL_RTA_DST, 16, &addr))
+				goto enomem;
+			break;
+		}
+	default:
+		FIB_LOG(LOG_NOTICE, fibnum, family, "unknown rt family");
+		error = EAFNOSUPPORT;
+		goto flush;
+	}
+
+	if (!dump_rc_nhop(ns, rc))
+		goto enomem;
+
+/*
+	struct nlattr *metrics_nla;
+	metrics_nla = nla_nest_start(m, NL_RTA_METRICS);
+	nlattr_add_u32(m, NL_RTAX_MTU, nh->nh_mtu);
+	nla_nest_end(m, metrics_nla);
+*/
+	nlmsg_end(ns);
+	return (0);
+enomem:
+	error = ENOMEM;
+flush:
+	nlmsg_abort(ns);
+	return (error);
+}
+
+/*
+ * Returns the rtnetlink route multicast group for an address family,
+ * or 0 for families other than AF_INET and AF_INET6.
+ */
+static int
+family_to_group(int family)
+{
+	if (family == AF_INET)
+		return (RTNLGRP_IPV4_ROUTE);
+	if (family == AF_INET6)
+		return (RTNLGRP_IPV6_ROUTE);
+	return (0);
+}
+
+
+/*
+ * Dumps the rib change described by rc into a group writer for the route
+ * group of the route's address family and flushes that writer.
+ * Does nothing when no group writer can be obtained.
+ */
+static void
+report_operation(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc,
+    struct nlpcb *nlp, struct nlmsghdr *hdr)
+{
+	struct nlmsg_state ns;
+	uint32_t group_id;
+
+	group_id = family_to_group(rt_get_family(rc->rc_rt));
+	if (!nlmsg_get_group_writer(NLMSG_SMALL, group_id, &ns))
+		return;
+
+	dump_rc(fibnum, hdr, rc, &ns);
+	nlmsg_flush(&ns);
+}
+
+
+/* State shared between a route table dump and its per-route callback. */
+struct netlink_walkargs {
+ struct nlmsg_state ns; /* writer state handed to dump_rc() */
+ struct rib_cmd_info rc; /* rc_rt / rc_nh_new are refilled for every route */
+ struct nlmsghdr hdr; /* header fields passed on to dump_rc() */
+ struct nlpcb *nlp; /* set from the nlp argument of handle_rtm_filter() */
+ uint32_t fibnum; /* fib being dumped, set by dump_rtable_family() */
+ int family; /* family being dumped, set by dump_rtable_family() */
+ int error; /* result of the last dump_rc(); non-zero stops dumping */
+ int count; /* routes visited in the current table */
+ int dumped; /* routes handed to dump_rc() in the current table */
+};
+
+#if DEBUG_MAX_LEVEL >= LOG_DEBUG3
+/*
+ * Logs data as space-separated hex bytes at LOG_DEBUG2.
+ * The output is truncated to what fits the local buffer, which is the
+ * first 42 bytes of input.
+ */
+static void
+print_hex(char *data, int len)
+{
+	char buffer[128], *ptr, *end;
+
+	ptr = &buffer[0];
+	end = ptr + sizeof(buffer);
+	*ptr = '\0';
+
+	/* Each byte takes three characters; keep room for the final NUL. */
+	for (int i = 0; i < len && end - ptr > 3; i++)
+		ptr += snprintf(ptr, end - ptr, "%02X ", (unsigned char)data[i]);
+	RT_LOG(LOG_DEBUG2, "DBG: %s", buffer);
+}
+#endif
+
+/*
+ * rib_walk() callback: dumps one route through dump_rc().
+ * Every visited route is counted.  Once dump_rc() has failed, the error is
+ * kept in wa->error and the remaining routes are skipped.
+ * Always returns 0.
+ */
+static int
+dump_rtentry(struct rtentry *rt, void *_arg)
+{
+	struct netlink_walkargs *wa = (struct netlink_walkargs *)_arg;
+
+	wa->count++;
+	if (wa->error == 0) {
+		wa->dumped++;
+
+		wa->rc.rc_rt = rt;
+		wa->rc.rc_nh_new = rt_get_raw_nhop(rt);
+		wa->error = dump_rc(wa->fibnum, &wa->hdr, &wa->rc, &wa->ns);
+
+#if DEBUG_MAX_LEVEL >= LOG_DEBUG2
+		char rtbuf[INET6_ADDRSTRLEN + 5];
+		FIB_LOG(LOG_DEBUG2, wa->fibnum, wa->family, "Dump %s, offset %u, error %d",
+		    rt_print_buf(rt, rtbuf, sizeof(rtbuf)), wa->ns.offset, wa->error);
+#endif
+	}
+
+	return (0);
+}
+
+/*
+ * Walks one routing table (fibnum/family) with dump_rtentry() as the
+ * callback, resetting the per-table counters in wa first and logging them
+ * afterwards.
+ */
+static void
+dump_rtable_one(struct netlink_walkargs *wa, uint32_t fibnum, int family)
+{
+	FIB_LOG(LOG_DEBUG, fibnum, family, "Start dump");
+
+	wa->dumped = 0;
+	wa->count = 0;
+	rib_walk(fibnum, family, false, dump_rtentry, wa);
+
+	FIB_LOG(LOG_DEBUG, fibnum, family, "End dump, iterated %d dumped %d",
+	    wa->count, wa->dumped);
+	RT_LOG(LOG_DEBUG2, "Current offset: %d", wa->ns.offset);
+}
+
+/*
+ * Records fibnum/family in wa and dumps that table, provided
+ * rt_tables_get_rnh() finds a rib head for it.
+ */
+static void
+dump_rtable_family(struct netlink_walkargs *wa, uint32_t fibnum, int family)
+{
+	wa->family = family;
+	wa->fibnum = fibnum;
+
+	if (rt_tables_get_rnh(fibnum, family) == NULL)
+		return;
+	dump_rtable_one(wa, fibnum, family);
+}
+
+/*
+ * Handles a non-dump RTM_GETROUTE request: looks up one route for
+ * info->rti_info[RTAX_DST] in fib 'fibnum' and writes it to socket 'nlp'.
+ *
+ * rnh_matchaddr() is used unless RTA_NETMASK is set in info->rti_addrs, in
+ * which case rnh_lookup() is called with the supplied netmask.
+ *
+ * Returns 0 on success, EINVAL when no destination was supplied,
+ * EAFNOSUPPORT when there is no table for the address family, ESRCH when no
+ * route or nexthop was found, ENOMEM when no writer could be obtained, or
+ * the error returned by dump_rc().
+ *
+ * Unlike the previous version, the writer is flushed after dump_rc() (as
+ * every other writer user in this file does) and the dump_rc() result is
+ * propagated instead of being dropped.
+ */
+static int
+handle_rtm_getroute(struct nlpcb *nlp, struct rt_addrinfo *info, uint32_t fibnum,
+    struct nlmsghdr *hdr)
+{
+	RIB_RLOCK_TRACKER;
+	struct rib_cmd_info rc = {};
+	struct nlmsg_state ns = {};
+	struct rib_head *rnh;
+	struct nhop_object *nh;
+	sa_family_t saf;
+	int error;
+
+	if (info->rti_info[RTAX_DST] == NULL) {
+		RT_LOG(LOG_WARNING, "No RTAX_DST supplied");
+		return (EINVAL);
+	}
+	saf = info->rti_info[RTAX_DST]->sa_family;
+
+	FIB_LOG(LOG_DEBUG, fibnum, saf, "getroute called");
+
+	rnh = rt_tables_get_rnh(fibnum, saf);
+	if (rnh == NULL)
+		return (EAFNOSUPPORT);
+
+	RIB_RLOCK(rnh);
+
+	if ((info->rti_addrs & RTA_NETMASK) == 0) {
+		rc.rc_rt = (struct rtentry *)rnh->rnh_matchaddr(
+		    info->rti_info[RTAX_DST], &rnh->head);
+	} else {
+		rc.rc_rt = (struct rtentry *)rnh->rnh_lookup(
+		    info->rti_info[RTAX_DST], info->rti_info[RTAX_NETMASK],
+		    &rnh->head);
+	}
+
+	if (rc.rc_rt == NULL) {
+		RIB_RUNLOCK(rnh);
+		return (ESRCH);
+	}
+
+	nh = rt_get_raw_nhop(rc.rc_rt);
+	if (nh == NULL) {
+		RIB_RUNLOCK(rnh);
+		return (ESRCH);
+	}
+	rc.rc_nh_new = nh;
+	rc.rc_nh_weight = rc.rc_rt->rt_weight;
+	rc.rc_cmd = RTM_GET;
+	RIB_RUNLOCK(rnh);
+
+#if DEBUG_MAX_LEVEL >= LOG_DEBUG
+	char rtbuf[INET6_ADDRSTRLEN + 5], nhbuf[INET6_ADDRSTRLEN + 5];
+	FIB_LOG(LOG_DEBUG, fibnum, saf, "getroute completed: got %s for %s",
+	    nhop_print_buf(rc.rc_nh_new, nhbuf, sizeof(nhbuf)),
+	    rt_print_buf(rc.rc_rt, rtbuf, sizeof(rtbuf)));
+#endif
+	if (!nlmsg_get_socket_writer(NLMSG_SMALL, nlp, &ns))
+		return (ENOMEM);
+
+	error = dump_rc(fibnum, hdr, &rc, &ns);
+	/* Hand the writer's contents over even on error, as handle_rtm_filter() does. */
+	nlmsg_flush(&ns);
+
+	return (error);
+}
+
+
+/*
+ * Handles an RTM_GETROUTE dump request: writes every route of fib 'fibnum'
+ * for 'family' (or for all families 1..AF_MAX-1 when family is AF_UNSPEC) to
+ * socket 'nlp' as a multipart reply, terminated by an NLMSG_DONE message
+ * carrying the int result.
+ *
+ * Returns 0 on success, ENOENT for an out-of-range fibnum, ENOMEM when the
+ * writer or the NLMSG_DONE message cannot be allocated, or the first error
+ * reported by dump_rc().
+ *
+ * The writer is now flushed on the NLMSG_DONE allocation failure path too;
+ * previously that path returned while still holding the writer state.
+ */
+static int
+handle_rtm_filter(struct nlpcb *nlp, struct rt_addrinfo *info, uint32_t fibnum, int family,
+    struct nlmsghdr *hdr)
+{
+	struct netlink_walkargs wa = {
+		.nlp = nlp,
+		.rc.rc_cmd = RTM_ADD,
+		.hdr.nlmsg_pid = hdr->nlmsg_pid,
+		.hdr.nlmsg_seq = hdr->nlmsg_seq,
+		.hdr.nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI,
+		.fibnum = fibnum,
+		.family = family,
+	};
+
+	if (fibnum >= V_rt_numfibs) {
+		FIB_LOG(LOG_DEBUG, fibnum, family, "incorrect fibnum");
+		return (ENOENT);
+	}
+
+	if (!nlmsg_get_socket_writer(NLMSG_LARGE, nlp, &wa.ns)) {
+		RT_LOG(LOG_DEBUG, "error allocating mbuf");
+		return (ENOMEM);
+	}
+
+	if (family != AF_UNSPEC) {
+		dump_rtable_family(&wa, fibnum, family);
+	} else {
+		/* AF_UNSPEC: walk every family, stopping at the first error. */
+		for (int i = 1; i < AF_MAX; i++) {
+			dump_rtable_family(&wa, fibnum, i);
+			if (wa.error != 0)
+				break;
+		}
+	}
+
+	if (wa.error == 0) {
+		if (nlmsg_add(&wa.ns, wa.hdr.nlmsg_pid, wa.hdr.nlmsg_seq,
+		    NLMSG_DONE, 0, sizeof(int))) {
+			/* Save operation result */
+			int *perror = nlmsg_reserve_object(&wa.ns, int);
+			RT_LOG(LOG_DEBUG2, "record error=%d at off %d (%p)", wa.error,
+			    wa.ns.offset, perror);
+			*perror = wa.error;
+			nlmsg_end(&wa.ns);
+		} else {
+			RT_LOG(LOG_DEBUG, "error allocating NLMSG_DONE message");
+			wa.error = ENOMEM;
+		}
+	}
+	/* Always release the writer, whatever the outcome. */
+	nlmsg_flush(&wa.ns);
+
+	return (wa.error);
+}
+
+
+/*
+ * NL_RTM_NEWROUTE handler: parses the request (destination and gateway are
+ * requested from get_info_from_rtmsg()), adds the route via rib_action() and,
+ * on success, reports the change through report_operation().
+ *
+ * Returns EINVAL when the request cannot be parsed, otherwise the
+ * rib_action() result.
+ */
+static int
+rtnl_handle_newroute(struct nlmsghdr *hdr, struct nlpcb *nlp,
+    struct netlink_parse_tracker *npt)
+{
+	struct rt_addrinfo *info;
+	struct rib_cmd_info rc;
+	int error;
+
+	info = get_info_from_rtmsg(hdr, RTA_DST|RTA_GATEWAY, npt);
+	if (info == NULL)
+		return (EINVAL);
+
+	error = rib_action(info->rti_fibnum, RTM_ADD, info, &rc);
+	if (error != 0)
+		return (error);
+
+	report_operation(info->rti_fibnum, info, &rc, nlp, hdr);
+	return (0);
+}
+
+/*
+ * NL_RTM_DELROUTE handler: parses the request (a destination is requested
+ * from get_info_from_rtmsg()), deletes the route via rib_action() and, on
+ * success, reports the change through report_operation().
+ *
+ * Returns EINVAL when the request cannot be parsed, otherwise the
+ * rib_action() result.
+ */
+static int
+rtnl_handle_delroute(struct nlmsghdr *hdr, struct nlpcb *nlp, struct netlink_parse_tracker *npt)
+{
+	struct rt_addrinfo *info;
+	struct rib_cmd_info rc;
+	int error;
+
+	info = get_info_from_rtmsg(hdr, RTA_DST, npt);
+	if (info == NULL)
+		return (EINVAL);
+
+	/* XX */
+
+	error = rib_action(info->rti_fibnum, RTM_DELETE, info, &rc);
+	if (error != 0)
+		return (error);
+
+	report_operation(info->rti_fibnum, info, &rc, nlp, hdr);
+	return (0);
+}
+
+/*
+ * NL_RTM_GETROUTE handler.  Requests carrying NLM_F_DUMP are served as a
+ * table dump by handle_rtm_filter(); all others are single-route lookups
+ * served by handle_rtm_getroute(), which need a destination.
+ *
+ * Returns EINVAL when the request cannot be parsed, otherwise the result of
+ * the selected handler.
+ */
+static int
+rtnl_handle_getroute(struct nlmsghdr *hdr, struct nlpcb *nlp, struct netlink_parse_tracker *npt)
+{
+	struct rt_addrinfo *info;
+
+	if ((hdr->nlmsg_flags & NLM_F_DUMP) != 0) {
+		info = get_info_from_rtmsg(hdr, 0, npt);
+		if (info == NULL)
+			return (EINVAL);
+		return (handle_rtm_filter(nlp, info, info->rti_fibnum,
+		    info->rti_family, hdr));
+	}
+
+	info = get_info_from_rtmsg(hdr, RTA_DST, npt);
+	if (info == NULL)
+		return (EINVAL);
+	return (handle_rtm_getroute(nlp, info, info->rti_fibnum, hdr));
+}
+
+/*
+ * Dispatches one NETLINK_ROUTE message to the handler for its nlmsg_type.
+ *
+ * Returns the handler's result, or EOPNOTSUPP for message types without a
+ * handler.
+ *
+ * The NL_RTM_NEWNEXTHOP case used to fall through into 'default', so its
+ * result was always replaced by EOPNOTSUPP; it now breaks like the others.
+ */
+static int
+rtnl_handle_message(struct nlmsghdr *hdr, struct nlpcb *nlp, struct netlink_parse_tracker *npt)
+{
+	int error = 0;
+
+	RT_LOG(LOG_DEBUG2, "received msg type %d (pid %u)", hdr->nlmsg_type,
+	    hdr->nlmsg_pid);
+	/* XXX: check min header length */
+	switch (hdr->nlmsg_type) {
+	case NL_RTM_NEWROUTE:
+		error = rtnl_handle_newroute(hdr, nlp, npt);
+		break;
+	case NL_RTM_DELROUTE:
+		error = rtnl_handle_delroute(hdr, nlp, npt);
+		break;
+	case NL_RTM_GETROUTE:
+		error = rtnl_handle_getroute(hdr, nlp, npt);
+		break;
+	case NL_RTM_GETLINK:
+		error = rtnl_handle_getlink(hdr, nlp, npt);
+		break;
+	case NL_RTM_GETADDR:
+		error = rtnl_handle_getaddr(hdr, nlp, npt);
+		break;
+	case NL_RTM_NEWNEXTHOP:
+		error = rtnl_handle_newnhop(hdr, nlp, npt);
+		break;
+	default:
+		RT_LOG(LOG_DEBUG, "msg type %d unsupported (pid %u)",
+		    hdr->nlmsg_type, hdr->nlmsg_pid);
+		error = EOPNOTSUPP;
+		break;
+	}
+
+	return (error);
+}
+
+/*
+ * NETLINK_ROUTE entry point, registered with netlink_register_proto().
+ * Runs the message dispatcher inside a network epoch section and returns
+ * its result.
+ */
+static int
+rtnl_receive_message(struct nlmsghdr *hdr, struct netlink_parse_tracker *npt)
+{
+	struct epoch_tracker et;
+	int ret;
+
+	NET_EPOCH_ENTER(et);
+	ret = rtnl_handle_message(hdr, npt->nlp, npt);
+	NET_EPOCH_EXIT(et);
+
+	return (ret);
+}
+
+/*
+ * Turns a RIB change notification into a netlink message and sends it to
+ * the group that family_to_group() returns for the route's address family.
+ * The netlink flags are derived from rc->rc_cmd.  'info' is not used.
+ */
+static void
+handle_route_event(uint32_t fibnum, const struct rt_addrinfo *info,
+    const struct rib_cmd_info *rc)
+{
+	struct nlmsg_state ns;
+	int family = rt_get_family(rc->rc_rt);
+	int nlm_flags;
+
+	/* XXX: check if there are active listeners first */
+
+	/* TODO: consider passing PID/type/seq */
+	if (rc->rc_cmd == RTM_ADD)
+		nlm_flags = NLM_F_EXCL | NLM_F_CREATE;
+	else if (rc->rc_cmd == RTM_CHANGE)
+		nlm_flags = NLM_F_REPLACE;
+	else
+		nlm_flags = 0;	/* RTM_DELETE and any other command */
+
+#if DEBUG_MAX_LEVEL >= LOG_DEBUG
+	char rtbuf[INET6_ADDRSTRLEN + 5];
+	FIB_LOG(LOG_DEBUG, fibnum, family, "received event %s for %s / nlm_flags=%X",
+	    rib_print_cmd(rc->rc_cmd), rt_print_buf(rc->rc_rt, rtbuf, sizeof(rtbuf)),
+	    nlm_flags);
+#endif
+	struct nlmsghdr hdr = {
+		.nlmsg_flags = nlm_flags,
+	};
+	uint32_t group_mask = family_to_group(family);
+
+	if (nlmsg_get_group_writer(NLMSG_SMALL, group_mask, &ns)) {
+		dump_rc(fibnum, &hdr, rc, &ns);
+		nlmsg_flush(&ns);
+		return;
+	}
+
+	RT_LOG(LOG_DEBUG, "error allocating mbuf");
+}
+
+/*
+ * RIB event bridge callback (installed through 'nlbridge').  Only
+ * NLBR_EVENT_ROUTE is acted upon; other event types are ignored.
+ */
+static void
+nlbridge_cb_func(uint32_t event_type, uint32_t fibnum,
+    const struct rt_addrinfo *info, const struct rib_cmd_info *rc, void *arg)
+{
+	RT_LOG(LOG_DEBUG2, "received bridge event %d", event_type);
+
+	if (event_type == NLBR_EVENT_ROUTE)
+		handle_route_event(fibnum, info, rc);
+}
+
+/*
+ * Bridge descriptor passed to rib_bridge_link()/rib_bridge_unlink();
+ * events are delivered to nlbridge_cb_func() with a NULL argument.
+ */
+static struct rib_event_bridge nlbridge = {
+ .reb_cb = nlbridge_cb_func,
+ .reb_cb_arg = NULL,
+ .reb_provider_id = NLBR_PROVIDER_NETLINK,
+};
+
+/*
+ * Initialization hook, run via the SYSINIT below: links the RIB event
+ * bridge, calls rtnl_ifaces_init() and registers rtnl_receive_message() as
+ * the handler for NETLINK_ROUTE.
+ */
+static void
+rtnl_load(void *u __unused)
+{
+ RT_LOG(LOG_ERR, "netlink support is in ALPHA stage");
+ RT_LOG(LOG_NOTICE, "rtnl loading");
+ rib_bridge_link(&nlbridge);
+ rtnl_ifaces_init();
+ netlink_register_proto(NETLINK_ROUTE, rtnl_receive_message);
+}
+SYSINIT(rtnl_load, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rtnl_load, NULL);
+
+/*
+ * Teardown hook, run via the SYSUNINIT below: unlinks the RIB event bridge,
+ * calls rtnl_ifaces_destroy() and then waits for the network epoch to drain.
+ *
+ * NOTE(review): rtnl_load() registers a NETLINK_ROUTE handler with
+ * netlink_register_proto(), but no matching unregistration is visible
+ * here -- confirm it is handled elsewhere.
+ */
+static void
+rtnl_unload(void *u __unused)
+{
+ rib_bridge_unlink(&nlbridge);
+ rtnl_ifaces_destroy();
+
+ /* Wait till all consumers read nlbridge data */
+ epoch_wait_preempt(net_epoch_preempt);
+}
+SYSUNINIT(rtnl_unload, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rtnl_unload, NULL);
Index: sys/netlink/netlink_var.h
===================================================================
--- /dev/null
+++ sys/netlink/netlink_var.h
@@ -0,0 +1,200 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ * Copyright (c) 2022 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#ifndef _NETLINK_NETLINK_VAR_H_
+#define _NETLINK_NETLINK_VAR_H_
+
+#include <sys/epoch.h>
+#include <sys/sysctl.h>
+#include <sys/taskqueue.h>
+#include <net/vnet.h>
+
+MALLOC_DECLARE(M_NETLINK);
+
+#define NLSNDQ 65536 /* Default socket sendspace */
+#define NLRCVQ 65536 /* Default socket recvspace */
+
+/*
+ * Netlink protocol control block.  It is stored in a socket's so_pcb
+ * pointer (see sotonlpcb() below).
+ */
+struct nlpcb {
+ struct socket *nl_socket;
+ uint32_t nl_port;
+ uint32_t nl_groups;
+ uint32_t nl_flags; /* presumably the NLF_* bits defined below -- TODO confirm */
+ uint32_t nl_process_id;
+ int nl_proto;
+ bool nl_active;
+ bool nl_task_pending;
+ bool nl_linux; /* true if running under compat */
+ struct mbuf *nl_queue_head;
+ struct mbuf *nl_queue_last;
+ int64_t nl_queue_length;
+ struct taskqueue *nl_taskqueue;
+ struct task nl_task;
+ CK_LIST_ENTRY(nlpcb) nl_next;
+ CK_LIST_ENTRY(nlpcb) nl_port_next;
+ volatile u_int nl_refcount;
+ struct mtx nl_lock; /* operated through the NLP_LOCK*() macros */
+ struct epoch_context nl_epoch_ctx;
+};
+/* Casts a socket's so_pcb pointer to the netlink control block type. */
+#define sotonlpcb(so) ((struct nlpcb *)(so)->so_pcb)
+
+#define NLP_LOCK_INIT(_nlp) mtx_init(&((_nlp)->nl_lock), "nlp mtx", NULL, MTX_DEF)
+#define NLP_LOCK_DESTROY(_nlp) mtx_destroy(&((_nlp)->nl_lock))
+#define NLP_LOCK(_nlp) mtx_lock(&((_nlp)->nl_lock))
+#define NLP_UNLOCK(_nlp) mtx_unlock(&((_nlp)->nl_lock))
+
+#define ALIGNED_NL_SZ(_data) roundup2((((struct nlmsghdr *)(_data))->nlmsg_len), 16)
+
+#define NLF_CAP_ACK 0x01 /* Do not send message body with errmsg */
+#define NLF_EXT_ACK 0x02 /* Allow including extended TLVs in ack */
+
+#define NETISR_NETLINK 15 // XXX hack, must be unused and < 16
+
+
+SYSCTL_DECL(_net_netlink);
+
+/*
+ * I/O state embedded in struct nl_control as ctl_io.
+ * NOTE(review): head/last/length look like an mbuf queue and its size,
+ * mirroring nl_queue_* in struct nlpcb -- confirm against the users.
+ */
+struct nl_io {
+ struct callout callout;
+ struct mbuf *head;
+ struct mbuf *last;
+ int64_t length;
+};
+
+
+/*
+ * Netlink state reached through the per-vnet pointer V_nl_ctl: two lists of
+ * struct nlpcb plus the rmlock that the CTL_*LOCK() macros operate on.
+ */
+struct nl_control {
+ CK_LIST_HEAD(nl_pid_head, nlpcb) ctl_port_head;
+ CK_LIST_HEAD(nlpcb_head, nlpcb) ctl_pcb_head;
+ CK_LIST_ENTRY(nl_control) ctl_next;
+ struct nl_io ctl_io;
+ struct rmlock ctl_lock; /* taken via CTL_RLOCK()/CTL_WLOCK() */
+};
+VNET_DECLARE(struct nl_control *, nl_ctl);
+#define V_nl_ctl VNET(nl_ctl)
+
+
+/* locking */
+#define CTL_TRACKER struct rm_priotracker nl_tracker
+#define CTL_RLOCK() rm_rlock(&V_nl_ctl->ctl_lock, &nl_tracker)
+#define CTL_RUNLOCK() rm_runlock(&V_nl_ctl->ctl_lock, &nl_tracker)
+
+#define CTL_WLOCK() rm_wlock(&V_nl_ctl->ctl_lock)
+#define CTL_WUNLOCK() rm_wunlock(&V_nl_ctl->ctl_lock)
+
+struct sockaddr_nl;
+struct sockaddr;
+struct nlmsghdr;
+
+/* Parsing state */
+
+struct linear_buffer {
+ char *base; /* Base allocated memory pointer */
+ uint32_t offset; /* Currently used offset */
+ uint32_t size; /* Total buffer size */
+};
+
+/*
+ * Carves 'len' bytes, rounded up to a multiple of sizeof(uint64_t), out of
+ * the linear buffer and advances its offset.
+ *
+ * Returns a pointer to the region, or NULL when 'len' is negative or the
+ * rounded size does not fit in the remaining space.
+ *
+ * The space check is done on the remaining size, so a negative or very large
+ * 'len' can no longer wrap the unsigned "offset + len" sum and move the
+ * offset backwards.
+ */
+static inline void *
+lb_alloc(struct linear_buffer *lb, int len)
+{
+	uint32_t need;
+	void *data;
+
+	if (len < 0 || lb->offset > lb->size)
+		return (NULL);
+	/* len <= INT_MAX, so the rounding cannot overflow 32 bits. */
+	need = roundup2((uint32_t)len, (uint32_t)sizeof(uint64_t));
+	if (need > lb->size - lb->offset)
+		return (NULL);
+	data = (void *)(lb->base + lb->offset);
+	lb->offset += need;
+	return (data);
+}
+
+/*
+ * Resets the linear buffer for reuse: rewinds the offset and zeroes the
+ * whole backing storage.
+ */
+static inline void
+lb_clear(struct linear_buffer *lb)
+{
+	lb->offset = 0;
+	memset(lb->base, 0, lb->size);
+}
+
+#define SCRATCH_BUFFER_SIZE 1024
+struct netlink_parse_tracker {
+ struct linear_buffer lb; /* Per-message scratch buffer */
+ struct nlpcb *nlp; /* Originator */
+ struct nlmsghdr *hdr; /* Current message being parsed */
+ int error; /* last operation error */
+};
+
+/*
+ * Allocates 'len' bytes from the tracker's per-message scratch buffer.
+ * Returns NULL when lb_alloc() cannot satisfy the request.
+ */
+static inline void *
+npt_alloc(struct netlink_parse_tracker *npt, int len)
+{
+	struct linear_buffer *scratch = &npt->lb;
+
+	return (lb_alloc(scratch, len));
+}
+/* Same as npt_alloc(), with the result typed as struct sockaddr *. */
+#define npt_alloc_sockaddr(_npt, _len) ((struct sockaddr *)(npt_alloc(_npt, _len)))
+
+/* netlink_netisr.c */
+void netlink_netisr_init(void);
+void netlink_netisr_destroy(void);
+void netlink_netisr_vnet_init(void);
+void netlink_netisr_vnet_destroy(void);
+int nl_send_msg(struct mbuf *m, uint32_t group_mask);
+void nl_msg_from_netlink(struct mbuf *m);
+
+extern struct netisr_handler nlsock_nh;
+
+/* netlink_io.c */
+void nl_taskqueue_handler(void *_arg, int pending);
+int nl_receive_async(struct mbuf *m, struct socket *so);
+void nl_process_receive_locked(struct nlpcb *nlp);
+
+/* netlink_iface.c */
+struct rt_addrinfo;
+int rtnl_handle_getlink(struct nlmsghdr *hdr, struct nlpcb *nlp,
+ struct netlink_parse_tracker *npt);
+int rtnl_handle_getaddr(struct nlmsghdr *hdr, struct nlpcb *nlp,
+ struct netlink_parse_tracker *npt);
+void rtnl_ifaces_init(void);
+void rtnl_ifaces_destroy(void);
+
+/* netlink_module.c */
+void vnet_nl_ctl_init(void);
+
+int nl_verify_proto(int proto);
+
+extern int netlink_unloading;
+
+#define NL_MAX_HANDLERS 100
+extern nl_handler nl_handlers[NL_MAX_HANDLERS];
+
+/* netlink_nhop.c */
+int rtnl_handle_newnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
+ struct netlink_parse_tracker *npt);
+
+/* netlink_io.c */
+void nl_send_group(struct mbuf *m, uint32_t group_mask);
+bool nl_send_one(struct mbuf *m, struct nlpcb *nlp);
+void nlmsg_ack(struct nlpcb *nlp, int error, struct nlmsghdr *nlmsg);
+
+/* Linux compat */
+struct nlmsghdr *nlmsg_from_linux(struct nlmsghdr *hdr,
+ struct netlink_parse_tracker *npt);
+struct mbuf *nlmsgs_to_linux(char *buf, int buflen, struct nlpcb *nlp);
+struct mbuf *mbufs_to_linux(struct mbuf *m, struct nlpcb *nlp);
+
+#endif
Index: sys/netlink/route/base.h
===================================================================
--- /dev/null
+++ sys/netlink/route/base.h
@@ -0,0 +1,262 @@
+
+
+#ifndef _NETLINK_ROUTE_ROUTE_BASE_H_
+#define _NETLINK_ROUTE_ROUTE_BASE_H_
+
+
+/*
+ * Messages defined by the NETLINK_ROUTE subsystem
+ */
+
+enum {
+ NL_RTM_BASE = 16,
+#define NL_RTM_BASE NL_RTM_BASE
+ NL_RTM_NEWLINK = 16,
+#define NL_RTM_NEWLINK NL_RTM_NEWLINK
+ NL_RTM_DELLINK,
+#define NL_RTM_DELLINK NL_RTM_DELLINK
+ NL_RTM_GETLINK,
+#define NL_RTM_GETLINK NL_RTM_GETLINK
+ NL_RTM_SETLINK,
+#define NL_RTM_SETLINK NL_RTM_SETLINK
+ NL_RTM_NEWADDR = 20,
+#define NL_RTM_NEWADDR NL_RTM_NEWADDR
+ NL_RTM_DELADDR,
+#define NL_RTM_DELADDR NL_RTM_DELADDR
+ NL_RTM_GETADDR,
+#define NL_RTM_GETADDR NL_RTM_GETADDR
+ NL_RTM_NEWROUTE = 24,
+#define NL_RTM_NEWROUTE NL_RTM_NEWROUTE
+ NL_RTM_DELROUTE,
+#define NL_RTM_DELROUTE NL_RTM_DELROUTE
+ NL_RTM_GETROUTE,
+#define NL_RTM_GETROUTE NL_RTM_GETROUTE
+ NL_RTM_NEWNEIGH = 28,
+#define NL_RTM_NEWNEIGH NL_RTM_NEWNEIGH
+ NL_RTM_DELNEIGH,
+#define NL_RTM_DELNEIGH NL_RTM_DELNEIGH
+ NL_RTM_GETNEIGH,
+#define NL_RTM_GETNEIGH NL_RTM_GETNEIGH
+ NL_RTM_NEWRULE = 32,
+#define NL_RTM_NEWRULE NL_RTM_NEWRULE
+ NL_RTM_DELRULE,
+#define NL_RTM_DELRULE NL_RTM_DELRULE
+ NL_RTM_GETRULE,
+#define NL_RTM_GETRULE NL_RTM_GETRULE
+ NL_RTM_NEWQDISC = 36,
+#define NL_RTM_NEWQDISC NL_RTM_NEWQDISC
+ NL_RTM_DELQDISC,
+#define NL_RTM_DELQDISC NL_RTM_DELQDISC
+ NL_RTM_GETQDISC,
+#define NL_RTM_GETQDISC NL_RTM_GETQDISC
+ NL_RTM_NEWTCLASS = 40,
+#define NL_RTM_NEWTCLASS NL_RTM_NEWTCLASS
+ NL_RTM_DELTCLASS,
+#define NL_RTM_DELTCLASS NL_RTM_DELTCLASS
+ NL_RTM_GETTCLASS,
+#define NL_RTM_GETTCLASS NL_RTM_GETTCLASS
+ NL_RTM_NEWTFILTER = 44,
+#define NL_RTM_NEWTFILTER NL_RTM_NEWTFILTER
+ NL_RTM_DELTFILTER,
+#define NL_RTM_DELTFILTER NL_RTM_DELTFILTER
+ NL_RTM_GETTFILTER,
+#define NL_RTM_GETTFILTER NL_RTM_GETTFILTER
+ NL_RTM_NEWACTION = 48,
+#define NL_RTM_NEWACTION NL_RTM_NEWACTION
+ NL_RTM_DELACTION,
+#define NL_RTM_DELACTION NL_RTM_DELACTION
+ NL_RTM_GETACTION,
+#define NL_RTM_GETACTION NL_RTM_GETACTION
+ NL_RTM_NEWPREFIX = 52,
+#define NL_RTM_NEWPREFIX NL_RTM_NEWPREFIX
+ NL_RTM_GETMULTICAST = 58,
+#define NL_RTM_GETMULTICAST NL_RTM_GETMULTICAST
+ NL_RTM_GETANYCAST = 62,
+#define NL_RTM_GETANYCAST NL_RTM_GETANYCAST
+ NL_RTM_NEWNEIGHTBL = 64,
+#define NL_RTM_NEWNEIGHTBL NL_RTM_NEWNEIGHTBL
+ NL_RTM_GETNEIGHTBL = 66,
+#define NL_RTM_GETNEIGHTBL NL_RTM_GETNEIGHTBL
+ NL_RTM_SETNEIGHTBL,
+#define NL_RTM_SETNEIGHTBL NL_RTM_SETNEIGHTBL
+ NL_RTM_NEWNDUSEROPT = 68,
+#define NL_RTM_NEWNDUSEROPT NL_RTM_NEWNDUSEROPT
+ NL_RTM_NEWADDRLABEL = 72,
+#define NL_RTM_NEWADDRLABEL NL_RTM_NEWADDRLABEL
+ NL_RTM_DELADDRLABEL,
+#define NL_RTM_DELADDRLABEL NL_RTM_DELADDRLABEL
+ NL_RTM_GETADDRLABEL,
+#define NL_RTM_GETADDRLABEL NL_RTM_GETADDRLABEL
+ NL_RTM_GETDCB = 78,
+#define NL_RTM_GETDCB NL_RTM_GETDCB
+ NL_RTM_SETDCB,
+#define NL_RTM_SETDCB NL_RTM_SETDCB
+ NL_RTM_NEWNETCONF = 80,
+#define NL_RTM_NEWNETCONF NL_RTM_NEWNETCONF
+ NL_RTM_GETNETCONF = 82,
+#define NL_RTM_GETNETCONF NL_RTM_GETNETCONF
+ NL_RTM_NEWMDB = 84,
+#define NL_RTM_NEWMDB NL_RTM_NEWMDB
+ NL_RTM_DELMDB = 85,
+#define NL_RTM_DELMDB NL_RTM_DELMDB
+ NL_RTM_GETMDB = 86,
+#define NL_RTM_GETMDB NL_RTM_GETMDB
+ NL_RTM_NEWNSID = 88,
+#define NL_RTM_NEWNSID NL_RTM_NEWNSID
+ NL_RTM_DELNSID = 89,
+#define NL_RTM_DELNSID NL_RTM_DELNSID
+ NL_RTM_GETNSID = 90,
+#define NL_RTM_GETNSID NL_RTM_GETNSID
+ NL_RTM_NEWSTATS = 92,
+#define NL_RTM_NEWSTATS NL_RTM_NEWSTATS
+ NL_RTM_GETSTATS = 94,
+#define NL_RTM_GETSTATS NL_RTM_GETSTATS
+ NL_RTM_NEWNEXTHOP = 104,
+#define NL_RTM_NEWNEXTHOP NL_RTM_NEWNEXTHOP
+ NL_RTM_DELNEXTHOP,
+#define NL_RTM_DELNEXTHOP NL_RTM_DELNEXTHOP
+ NL_RTM_GETNEXTHOP,
+#define NL_RTM_GETNEXTHOP NL_RTM_GETNEXTHOP
+ __NL_RTM_MAX,
+#define NL_RTM_MAX (((__NL_RTM_MAX + 3) & ~3) - 1)
+};
+
+#ifndef _KERNEL
+/*
+ * RTM_* namespace clashes with BSD rtsock namespace.
+ * Use NL_RTM_ prefix in the kernel and map it to RTM_
+ * for userland.
+ */
+#define RTM_BASE NL_RTM_BASE
+#define RTM_NEWLINK NL_RTM_NEWLINK
+#define RTM_DELLINK NL_RTM_DELLINK
+#define RTM_GETLINK NL_RTM_GETLINK
+#define RTM_SETLINK NL_RTM_SETLINK
+#define RTM_NEWADDR NL_RTM_NEWADDR
+#define RTM_DELADDR NL_RTM_DELADDR
+#define RTM_GETADDR NL_RTM_GETADDR
+#define RTM_NEWROUTE NL_RTM_NEWROUTE
+#define RTM_DELROUTE NL_RTM_DELROUTE
+#define RTM_GETROUTE NL_RTM_GETROUTE
+#endif
+
+
+/*
+ * route attribute header
+ */
+struct rtattr {
+ unsigned short rta_len;
+ unsigned short rta_type;
+};
+
+#ifndef _roundup2
+#define _roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) /* if y is powers of two */
+#endif
+#define NL_RTA_ALIGN_SIZE sizeof(uint32_t)
+#define NL_RTA_ALIGN(_len) _roundup2(_len, NL_RTA_ALIGN_SIZE)
+#define _NL_RTA_BASE_LEN NL_RTA_ALIGN(sizeof(struct rtattr))
+#define NL_RTA_DATA_LEN(_rta) ((int)((_rta)->rta_len - _NL_RTA_BASE_LEN))
+#define NL_RTA_DATA(_rta) ((void *)((char *)(_rta) + _NL_RTA_BASE_LEN))
+#define NL_RTA_DATA_CONST(_rta) ((const void *)((const char *)(_rta) + _NL_RTA_BASE_LEN))
+
+
+/* Compatibility attribute handling helpers */
+#ifndef _KERNEL
+#define RTA_ALIGNTO NL_RTA_ALIGN_SIZE
+#define RTA_ALIGN(_len) NL_RTA_ALIGN(_len)
+#define _RTA_ALIGNED_LEN(_rta) RTA_ALIGN((_rta)->rta_len)
+#define _RTA_OK(_rta, _len, _sz) \
+ (((_len) >= (_sz)) && ((_rta)->rta_len >= (_sz)) && ((_rta)->rta_len <= (_len)))
+#define RTA_OK(_rta, _len) _RTA_OK(_rta, _len, sizeof(struct rtattr))
+/*
+ * Subtracts the aligned length of the current attribute from _len and
+ * evaluates to a pointer to the next attribute.  Use it the Linux way:
+ * rta = RTA_NEXT(rta, len);
+ * (The previous definition referenced an undefined RTA_ALIGNED_LEN macro and
+ * could not be compiled.)
+ */
+#define RTA_NEXT(_rta, _len) \
+ ((_len) -= _RTA_ALIGNED_LEN(_rta), \
+ (struct rtattr *)((char *)(_rta) + _RTA_ALIGNED_LEN(_rta)))
+
+#define RTA_LENGTH(_len) (sizeof(struct rtattr) + (_len))
+#define RTA_SPACE(_len) RTA_ALIGN(RTA_LENGTH(_len))
+#define RTA_DATA(_rta) NL_RTA_DATA(_rta)
+#define RTA_PAYLOAD(_rta) ((int)((_rta)->rta_len) - sizeof(struct rtattr))
+
+#define RTM_RTA(_rtm) \
+ ((struct rtattr *)(((char *)(_rtm)) + NLMSG_ALIGN(sizeof(struct rtmsg))))
+#define RTM_PAYLOAD(_msg) NLMSG_PAYLOAD((_msg), sizeof(struct rtmsg))
+#endif
+
+
+enum rtattr_type_t {
+ NL_RTA_UNSPEC,
+ NL_RTA_DST,
+ NL_RTA_SRC,
+ NL_RTA_IIF,
+ NL_RTA_OIF,
+ NL_RTA_GATEWAY,
+ NL_RTA_PRIORITY,
+ NL_RTA_PREFSRC,
+ NL_RTA_METRICS,
+ NL_RTA_MULTIPATH,
+ NL_RTA_PROTOINFO, /* not used / deprecated */
+ NL_RTA_FLOW,
+ NL_RTA_CACHEINFO, /* not used */
+ NL_RTA_SESSION, /* not used / deprecated */
+ NL_RTA_MP_ALGO, /* not used / deprecated */
+ NL_RTA_TABLE,
+ NL_RTA_MARK, /* not used */
+ NL_RTA_MFC_STATS,
+ NL_RTA_VIA,
+ NL_RTA_NEWDST,
+ NL_RTA_PREF,
+ NL_RTA_ENCAP_TYPE,
+ NL_RTA_ENCAP,
+ NL_RTA_EXPIRES,
+ NL_RTA_PAD,
+ NL_RTA_UID,
+ NL_RTA_TTL_PROPAGATE,
+ NL_RTA_IP_PROTO,
+ NL_RTA_SPORT,
+ NL_RTA_DPORT,
+ NL_RTA_NH_ID,
+ __RTA_MAX
+};
+#define NL_RTA_MAX (__RTA_MAX - 1)
+
+#ifndef _KERNEL
+/*
+ * RTA_* space has clashes with rtsock namespace.
+ * Use NL_RTA_ prefix in the kernel and map to
+ * RTA_ for userland.
+ */
+#define RTA_UNSPEC NL_RTA_UNSPEC
+#define RTA_DST NL_RTA_DST
+#define RTA_SRC NL_RTA_SRC
+#define RTA_IIF NL_RTA_IIF
+#define RTA_OIF NL_RTA_OIF
+#define RTA_GATEWAY NL_RTA_GATEWAY
+#define RTA_PRIORITY NL_RTA_PRIORITY
+#define RTA_PREFSRC NL_RTA_PREFSRC
+#define RTA_METRICS NL_RTA_METRICS
+#define RTA_MULTIPATH NL_RTA_MULTIPATH
+#define RTA_PROTOINFO NL_RTA_PROTOINFO
+#define RTA_FLOW NL_RTA_FLOW
+#define RTA_CACHEINFO NL_RTA_CACHEINFO
+#define RTA_SESSION NL_RTA_SESSION
+#define RTA_MP_ALGO NL_RTA_MP_ALGO
+#define RTA_TABLE NL_RTA_TABLE
+#define RTA_MARK NL_RTA_MARK
+#define RTA_MFC_STATS NL_RTA_MFC_STATS
+#define RTA_VIA NL_RTA_VIA
+#define RTA_NEWDST NL_RTA_NEWDST
+#define RTA_PREF NL_RTA_PREF
+#define RTA_ENCAP_TYPE NL_RTA_ENCAP_TYPE
+#define RTA_ENCAP NL_RTA_ENCAP
+#define RTA_EXPIRES NL_RTA_EXPIRES
+#define RTA_PAD NL_RTA_PAD
+#define RTA_UID NL_RTA_UID
+#define RTA_TTL_PROPAGATE NL_RTA_TTL_PROPAGATE
+#define RTA_IP_PROTO NL_RTA_IP_PROTO
+#define RTA_SPORT NL_RTA_SPORT
+#define RTA_DPORT NL_RTA_DPORT
+#define RTA_NH_ID NL_RTA_NH_ID
+#define RTA_MAX NL_RTA_MAX
+#endif
+
+
+#endif
\ No newline at end of file
Index: sys/netlink/route/nhop.h
===================================================================
--- /dev/null
+++ sys/netlink/route/nhop.h
@@ -0,0 +1,27 @@
+#ifndef _NETLINK_ROUTE_NHOP_H_
+#define _NETLINK_ROUTE_NHOP_H_
+
+struct nhmsg {
+ unsigned char nh_family;
+ unsigned char nh_scope; /* return only */
+ unsigned char nh_protocol; /* Routing protocol that installed nh */
+ unsigned char resvd;
+ unsigned int nh_flags; /* RTNH_F flags */
+};
+
+/*
+ * Entry in a nexthop group.
+ *
+ * Uses the <stdint.h>-style fixed-width types, as the sibling route.h does,
+ * instead of the Linux-only __u32/__u8/__u16 typedefs.  Field sizes and
+ * layout are unchanged.
+ */
+struct nexthop_grp {
+	uint32_t	id;		/* nexthop userland index */
+	uint8_t		weight;		/* weight of this nexthop */
+	uint8_t		resvd1;
+	uint16_t	resvd2;
+};
+
+enum {
+ NEXTHOP_GRP_TYPE_MPATH, /* default nexthop group */
+ NEXTHOP_GRP_TYPE_RES, /* resilient nexthop group */
+ __NEXTHOP_GRP_TYPE_MAX,
+};
+#define NEXTHOP_GRP_TYPE_MAX (__NEXTHOP_GRP_TYPE_MAX - 1)
+
+#endif
\ No newline at end of file
Index: sys/netlink/route/route.h
===================================================================
--- /dev/null
+++ sys/netlink/route/route.h
@@ -0,0 +1,236 @@
+#ifndef _NETLINK_ROUTE_ROUTE_H_
+#define _NETLINK_ROUTE_ROUTE_H_
+
+/*
+ * Headers and attributes necessary for handling
+ * RTM_NEWROUTE|RTM_DELROUTE|RTM_GETROUTE
+ */
+
+/*
+ * Routing message header
+ */
+struct rtmsg {
+ unsigned char rtm_family; /* address family */
+ unsigned char rtm_dst_len; /* Prefix length */
+ unsigned char rtm_src_len; /* Source prefix length (not used) */
+ unsigned char rtm_tos; /* Type of service (not used) */
+ unsigned char rtm_table; /* rtable id */
+ unsigned char rtm_protocol; /* Routing protocol id (RTPROT_) */
+ unsigned char rtm_scope; /* Route distance (RT_SCOPE_) */
+ unsigned char rtm_type; /* Route type (RTN_) */
+ unsigned rtm_flags; /* Route flags (RTM_F_) */
+};
+
+/*
+ * RFC 3549, 3.1.1, route type (rtm_type field).
+ */
+enum {
+ RTN_UNSPEC,
+ RTN_UNICAST, /* Unicast route */
+ RTN_LOCAL, /* Accept locally (not supported) */
+ RTN_BROADCAST, /* Accept locally as broadcast, send as broadcast */
+ RTN_ANYCAST, /* Accept locally as broadcast, but send as unicast */
+ RTN_MULTICAST, /* Multicast route */
+ RTN_BLACKHOLE, /* Drop traffic towards destination */
+ RTN_UNREACHABLE,/* Destination is unreachable */
+ RTN_PROHIBIT, /* Administratively prohibited */
+ RTN_THROW, /* Not in this table (not supported) */
+ RTN_NAT, /* Translate this address (not supported) */
+ RTN_XRESOLVE, /* Use external resolver (not supported) */
+ __RTN_MAX,
+};
+#define RTN_MAX (__RTN_MAX - 1)
+
+/*
+ * RFC 3549, 3.1.1, protocol (Identifies what/who added the route).
+ * Values larger than RTPROT_STATIC(4) are not interpreted by the
+ * kernel, they are just for user information.
+ */
+
+#define RTPROT_UNSPEC 0
+#define RTPROT_REDIRECT 1 /* Route installed by ICMP redirect */
+#define RTPROT_KERNEL 2 /* Route installed by kernel */
+#define RTPROT_BOOT 3 /* Route installed during boot */
+#define RTPROT_STATIC 4 /* Route installed by administrator */
+
+#define RTPROT_GATED 8 /* Apparently, GateD */
+#define RTPROT_RA 9 /* RDISC/ND router advertisements */
+#define RTPROT_MRT 10 /* Merit MRT */
+#define RTPROT_ZEBRA 11 /* Zebra */
+#define RTPROT_BIRD 12 /* BIRD */
+#define RTPROT_DNROUTED 13 /* DECnet routing daemon */
+#define RTPROT_XORP 14 /* XORP */
+#define RTPROT_NTK 15 /* Netsukuku */
+#define RTPROT_DHCP 16 /* DHCP client */
+#define RTPROT_MROUTED 17 /* Multicast daemon */
+#define RTPROT_KEEPALIVED 18 /* Keepalived daemon */
+#define RTPROT_BABEL 42 /* Babel daemon */
+#define RTPROT_OPENR 99 /* Open Routing (Open/R) Routes */
+#define RTPROT_BGP 186 /* BGP Routes */
+#define RTPROT_ISIS 187 /* ISIS Routes */
+#define RTPROT_OSPF 188 /* OSPF Routes */
+#define RTPROT_RIP 189 /* RIP Routes */
+#define RTPROT_EIGRP 192 /* EIGRP Routes */
+
+/*
+ * RFC 3549 3.1.1 Route scope (valid distance to destination).
+ *
+ * The values between RT_SCOPE_UNIVERSE(0) and RT_SCOPE_SITE(200)
+ * are available to the user.
+*/
+enum rt_scope_t {
+ RT_SCOPE_UNIVERSE = 0,
+ /* User defined values */
+ RT_SCOPE_SITE = 200,
+ RT_SCOPE_LINK = 253,
+ RT_SCOPE_HOST = 254,
+ RT_SCOPE_NOWHERE = 255
+};
+
+/*
+ * RFC 3549 3.1.1 Route flags.
+*/
+#define RTM_F_NOTIFY 0x100 /* Notify user of route change */
+#define RTM_F_CLONED 0x200 /* This route is cloned (not used) */
+#define RTM_F_EQUALIZE 0x400 /* Multipath equalizer: NI */
+#define RTM_F_PREFIX 0x800 /* Prefix addresses */
+#define RTM_F_LOOKUP_TABLE 0x1000 /* set tableid to FIB lookup result */
+#define RTM_F_FIB_MATCH 0x2000 /* return full fib lookup match */
+#define RTM_F_OFFLOAD 0x4000 /* route is offloaded */
+#define RTM_F_TRAP 0x8000 /* route is trapping packets */
+#define RTM_F_OFFLOAD_FAILED 0x20000000 /* route offload failed */
+
+/*
+ * Routing table identifiers.
+ * FreeBSD route table numbering starts from 0.
+ */
+#define RT_TABLE_MAIN 0 /* RT_DEFAULT_FIB */
+#define RT_TABLE_UNSPEC 0xFFFFFFFF /* RT_ALL_FIBS */
+
+
+/*
+ * NL_RTA_VIA header: an address family followed by the raw address bytes.
+ * The trailing data is declared as a C99 flexible array member rather than
+ * a zero-length array (a GNU extension); sizeof(struct rtvia) is unchanged.
+ */
+struct rtvia {
+	sa_family_t	rtvia_family;
+	uint8_t		rtvia_addr[];
+};
+
+/*
+* NL_RTA_METRICS: attribute consisting of
+* array of struct rtattr with types of RTAX_*
+*/
+
+ enum {
+ NL_RTAX_UNSPEC,
+#define NL_RTAX_UNSPEC NL_RTAX_UNSPEC
+ NL_RTAX_LOCK,
+#define NL_RTAX_LOCK NL_RTAX_LOCK
+ NL_RTAX_MTU,
+#define NL_RTAX_MTU NL_RTAX_MTU
+ NL_RTAX_WINDOW,
+#define NL_RTAX_WINDOW NL_RTAX_WINDOW
+ NL_RTAX_RTT,
+#define NL_RTAX_RTT NL_RTAX_RTT
+ NL_RTAX_RTTVAR,
+#define NL_RTAX_RTTVAR NL_RTAX_RTTVAR
+ NL_RTAX_SSTHRESH,
+#define NL_RTAX_SSTHRESH NL_RTAX_SSTHRESH
+ NL_RTAX_CWND,
+#define NL_RTAX_CWND NL_RTAX_CWND
+ NL_RTAX_ADVMSS,
+#define NL_RTAX_ADVMSS NL_RTAX_ADVMSS
+ NL_RTAX_REORDERING,
+#define NL_RTAX_REORDERING NL_RTAX_REORDERING
+ NL_RTAX_HOPLIMIT,
+#define NL_RTAX_HOPLIMIT NL_RTAX_HOPLIMIT
+ NL_RTAX_INITCWND,
+#define NL_RTAX_INITCWND NL_RTAX_INITCWND
+ NL_RTAX_FEATURES,
+#define NL_RTAX_FEATURES NL_RTAX_FEATURES
+ NL_RTAX_RTO_MIN,
+#define NL_RTAX_RTO_MIN NL_RTAX_RTO_MIN
+ NL_RTAX_INITRWND,
+#define NL_RTAX_INITRWND NL_RTAX_INITRWND
+ NL_RTAX_QUICKACK,
+#define NL_RTAX_QUICKACK NL_RTAX_QUICKACK
+ NL_RTAX_CC_ALGO,
+#define NL_RTAX_CC_ALGO NL_RTAX_CC_ALGO
+ NL_RTAX_FASTOPEN_NO_COOKIE,
+#define NL_RTAX_FASTOPEN_NO_COOKIE NL_RTAX_FASTOPEN_NO_COOKIE
+ __NL_RTAX_MAX
+};
+#define NL_RTAX_MAX (__NL_RTAX_MAX - 1)
+
+#define RTAX_FEATURE_ECN (1 << 0)
+#define RTAX_FEATURE_SACK (1 << 1)
+#define RTAX_FEATURE_TIMESTAMP (1 << 2)
+#define RTAX_FEATURE_ALLFRAG (1 << 3)
+
+#define RTAX_FEATURE_MASK \
+ (RTAX_FEATURE_ECN | RTAX_FEATURE_SACK | RTAX_FEATURE_TIMESTAMP | \
+ RTAX_FEATURE_ALLFRAG)
+
+#ifndef _KERNEL
+/*
+ * RTAX_* space clashes with rtsock namespace.
+ * Use NL_RTAX_ prefix in the kernel and map to
+ * RTAX_ for userland.
+ */
+#define RTAX_UNSPEC NL_RTAX_UNSPEC
+#define RTAX_LOCK NL_RTAX_LOCK
+#define RTAX_MTU NL_RTAX_MTU
+#define RTAX_WINDOW NL_RTAX_WINDOW
+#define RTAX_RTT NL_RTAX_RTT
+#define RTAX_RTTVAR NL_RTAX_RTTVAR
+#define RTAX_SSTHRESH NL_RTAX_SSTHRESH
+#define RTAX_CWND NL_RTAX_CWND
+#define RTAX_ADVMSS NL_RTAX_ADVMSS
+#define RTAX_REORDERING NL_RTAX_REORDERING
+#define RTAX_HOPLIMIT NL_RTAX_HOPLIMIT
+#define RTAX_INITCWND NL_RTAX_INITCWND
+#define RTAX_FEATURES NL_RTAX_FEATURES
+#define RTAX_RTO_MIN NL_RTAX_RTO_MIN
+#define RTAX_INITRWND NL_RTAX_INITRWND
+#define RTAX_QUICKACK NL_RTAX_QUICKACK
+#define RTAX_CC_ALGO NL_RTAX_CC_ALGO
+#define RTAX_FASTOPEN_NO_COOKIE NL_RTAX_FASTOPEN_NO_COOKIE
+#endif
+
+
+/*
+ * Nexthop entry header, walked with the RTNH_*() macros below.
+ */
+struct rtnexthop {
+ unsigned short rtnh_len; /* entry length; checked by RTNH_OK(), stepped over by RTNH_NEXT() */
+ unsigned char rtnh_flags; /* RTNH_F_* flags listed below */
+ unsigned char rtnh_hops;
+ int rtnh_ifindex;
+};
+
+/* rtnh_flags */
+
+#define RTNH_F_DEAD 1 /* Nexthop is dead (used by multipath) */
+#define RTNH_F_PERVASIVE 2 /* Do recursive gateway lookup */
+#define RTNH_F_ONLINK 4 /* Gateway is forced on link */
+#define RTNH_F_OFFLOAD 8 /* Nexthop is offloaded */
+#define RTNH_F_LINKDOWN 16 /* carrier-down on nexthop */
+#define RTNH_F_UNRESOLVED 32 /* The entry is unresolved (ipmr) */
+#define RTNH_F_TRAP 64 /* Nexthop is trapping packets */
+
+#define RTNH_COMPARE_MASK (RTNH_F_DEAD | RTNH_F_LINKDOWN | \
+ RTNH_F_OFFLOAD | RTNH_F_TRAP)
+
+/* Macros to handle nexthops */
+
+#define RTNH_ALIGNTO 4
+#define RTNH_ALIGN(len) ( ((len)+RTNH_ALIGNTO-1) & ~(RTNH_ALIGNTO-1) )
+#define RTNH_OK(rtnh,len) ((rtnh)->rtnh_len >= sizeof(struct rtnexthop) && \
+ ((int)(rtnh)->rtnh_len) <= (len))
+#define RTNH_NEXT(rtnh) ((struct rtnexthop*)(((char*)(rtnh)) + RTNH_ALIGN((rtnh)->rtnh_len)))
+#define RTNH_LENGTH(len) (RTNH_ALIGN(sizeof(struct rtnexthop)) + (len))
+#define RTNH_SPACE(len) RTNH_ALIGN(RTNH_LENGTH(len))
+#define RTNH_DATA(rtnh) ((struct rtattr*)(((char*)(rtnh)) + RTNH_LENGTH(0)))
+
+
+struct rtgenmsg {
+ unsigned char rtgen_family;
+};
+
+
+#endif
\ No newline at end of file
Index: sys/sys/domain.h
===================================================================
--- sys/sys/domain.h
+++ sys/sys/domain.h
@@ -71,11 +71,13 @@
/* dom_flags */
#define DOMF_SUPPORTED 0x0001 /* System supports this domain. */
#define DOMF_INITED 0x0002 /* Initialized in the default vnet. */
+#define DOMF_UNLOADABLE 0x0004 /* Can be unloaded */
#ifdef _KERNEL
extern int domain_init_status;
extern struct domain *domains;
void domain_add(void *);
+void domain_remove(void *);
void domain_init(void *);
#ifdef VIMAGE
void vnet_domain_init(void *);
@@ -85,6 +87,8 @@
#define DOMAIN_SET(name) \
SYSINIT(domain_add_ ## name, SI_SUB_PROTO_DOMAIN, \
SI_ORDER_FIRST, domain_add, & name ## domain); \
+ SYSUNINIT(domain_remove_ ## name, SI_SUB_PROTO_DOMAIN, \
+ SI_ORDER_FIRST, domain_remove, & name ## domain); \
SYSINIT(domain_init_ ## name, SI_SUB_PROTO_DOMAIN, \
SI_ORDER_SECOND, domain_init, & name ## domain);
#endif /* _KERNEL */
Index: sys/sys/socket.h
===================================================================
--- sys/sys/socket.h
+++ sys/sys/socket.h
@@ -264,6 +264,7 @@
#define AF_ARP 35
#define AF_BLUETOOTH 36 /* Bluetooth sockets */
#define AF_IEEE80211 37 /* IEEE 802.11 protocol */
+#define AF_NETLINK 38 /* Netlink protocol */
#define AF_INET_SDP 40 /* OFED Socket Direct Protocol ipv4 */
#define AF_INET6_SDP 42 /* OFED Socket Direct Protocol ipv6 */
#define AF_HYPERV 43 /* HyperV sockets */
@@ -389,6 +390,7 @@
#define PF_ARP AF_ARP
#define PF_BLUETOOTH AF_BLUETOOTH
#define PF_IEEE80211 AF_IEEE80211
+#define PF_NETLINK AF_NETLINK
#define PF_INET_SDP AF_INET_SDP
#define PF_INET6_SDP AF_INET6_SDP
Index: tests/sys/net/routing/netlink.py
===================================================================
--- /dev/null
+++ tests/sys/net/routing/netlink.py
@@ -0,0 +1,1076 @@
+#!/usr/local/bin/python3
+
+from ctypes import *
+import socket
+import os
+import sys
+import unittest
+import struct
+
+from enum import Enum, auto
+
+from typing import List, Callable, Dict, NamedTuple, Optional
+
+
def roundup2(val: int, num: int) -> int:
    """Round ``val`` up to the next multiple of ``num``.

    A value that is already a multiple of ``num`` is returned unchanged.
    The bit trick used otherwise assumes ``num`` is a power of two.
    """
    already_aligned = (val % num) == 0
    return val if already_aligned else (val | (num - 1)) + 1
+
+
def align4(val: int) -> int:
    """Round ``val`` up to a multiple of four (unchanged if already aligned)."""
    if val % 4 == 0:
        return val
    # Set the two low bits, then step to the next multiple of four.
    return (val | 3) + 1
+
+
class SockaddrNl(Structure):
    """ctypes layout of a netlink socket address (12 bytes)."""

    _fields_ = [
        # 1-byte length + 1-byte family + 2-byte pad
        ("nl_len", c_ubyte), ("nl_family", c_ubyte), ("nl_pad", c_ushort),
        # two 32-bit words
        ("nl_pid", c_uint), ("nl_groups", c_uint),
    ]
+
+
class Nlmsghdr(Structure):
    """ctypes layout of the 16-byte header that starts every netlink message."""

    _fields_ = [
        ("nlmsg_len", c_uint),
        # type and flags are 16 bits each
        ("nlmsg_type", c_ushort), ("nlmsg_flags", c_ushort),
        ("nlmsg_seq", c_uint),
        ("nlmsg_pid", c_uint),
    ]
+
+
class Nlmsgerr(Structure):
    """ctypes layout of an error payload: an int code followed by a message header."""

    _fields_ = [("error", c_int), ("msg", Nlmsghdr)]
+
+
class RtattrType(Enum):
    """Route attribute type numbers (``RTA_*``), numbered consecutively from 0."""

    RTA_UNSPEC = 0
    RTA_DST = 1
    RTA_SRC = 2
    RTA_IIF = 3
    RTA_OIF = 4
    RTA_GATEWAY = 5
    RTA_PRIORITY = 6
    RTA_PREFSRC = 7
    RTA_METRICS = 8
    RTA_MULTIPATH = 9
    RTA_PROTOINFO = 10
    RTA_FLOW = 11
    RTA_CACHEINFO = 12
    RTA_SESSION = 13
    RTA_MP_ALGO = 14
    RTA_TABLE = 15
    RTA_MARK = 16
    RTA_MFC_STATS = 17
    RTA_VIA = 18
    RTA_NEWDST = 19
    RTA_PREF = 20
    RTA_ENCAP_TYPE = 21
    RTA_ENCAP = 22
    RTA_EXPIRES = 23
    RTA_PAD = 24
    RTA_UID = 25
    RTA_TTL_PROPAGATE = 26
    RTA_IP_PROTO = 27
    RTA_SPORT = 28
    RTA_DPORT = 29
    RTA_NH_ID = 30
+
+
class NlMsgType(Enum):
    """Generic netlink control message types, numbered 1 through 4."""

    NLMSG_NOOP = 1
    NLMSG_ERROR = auto()
    NLMSG_DONE = auto()
    NLMSG_OVERRUN = auto()
+
+
class NlRtMsgType(Enum):
    """rtnetlink message type numbers (``RTM_*``).

    Each object family occupies a block of four numbers in NEW/DEL/GET(/SET)
    order, so every value must be unique.  Duplicate values would silently
    turn later members into aliases of earlier ones and break the
    value-to-name lookups.
    """

    RTM_NEWLINK = 16
    RTM_DELLINK = 17
    RTM_GETLINK = 18
    RTM_SETLINK = 19
    RTM_NEWADDR = 20
    RTM_DELADDR = 21
    RTM_GETADDR = 22
    RTM_NEWROUTE = 24
    RTM_DELROUTE = 25
    RTM_GETROUTE = 26
    # Neighbour messages occupy 28..30, following the NEW/DEL/GET pattern.
    RTM_NEWNEIGH = 28
    RTM_DELNEIGH = 29
    RTM_GETNEIGH = 30
    RTM_NEWRULE = 32
    RTM_DELRULE = 33
    RTM_GETRULE = 34
    RTM_NEWQDISC = 36
    RTM_DELQDISC = 37
    RTM_GETQDISC = 38
    RTM_NEWTCLASS = 40
    RTM_DELTCLASS = 41
    RTM_GETTCLASS = 42
    RTM_NEWTFILTER = 44
    RTM_DELTFILTER = 45
    RTM_GETTFILTER = 46
    RTM_NEWACTION = 48
    RTM_DELACTION = 49
    RTM_GETACTION = 50
    RTM_NEWPREFIX = 52
    RTM_GETMULTICAST = 58
    RTM_GETANYCAST = 62
    RTM_NEWNEIGHTBL = 64
    RTM_GETNEIGHTBL = 66
    RTM_SETNEIGHTBL = 67
    RTM_NEWNDUSEROPT = 68
    RTM_NEWADDRLABEL = 72
    RTM_DELADDRLABEL = 73
    RTM_GETADDRLABEL = 74
    RTM_GETDCB = 78
    RTM_SETDCB = 79
    RTM_NEWNETCONF = 80
    RTM_GETNETCONF = 82
    RTM_NEWMDB = 84
    RTM_DELMDB = 85
    RTM_GETMDB = 86
    RTM_NEWNSID = 88
    RTM_DELNSID = 89
    RTM_GETNSID = 90
    RTM_NEWSTATS = 92
    RTM_GETSTATS = 94
+
+
class RtAttr(Structure):
    """ctypes layout of the 4-byte attribute header: 16-bit length, 16-bit type."""

    _fields_ = [("rta_len", c_ushort), ("rta_type", c_ushort)]
+
+
class RtMsgHdr(Structure):
    """ctypes layout of the route message header: eight bytes plus 32-bit flags."""

    _fields_ = [
        ("rtm_family", c_ubyte), ("rtm_dst_len", c_ubyte),
        ("rtm_src_len", c_ubyte), ("rtm_tos", c_ubyte),
        ("rtm_table", c_ubyte), ("rtm_protocol", c_ubyte),
        ("rtm_scope", c_ubyte), ("rtm_type", c_ubyte),
        ("rtm_flags", c_uint),
    ]
+
+
class RtMsgFlags(Enum):
    """Bit flags decoded from the ``rtm_flags`` field of a route message."""

    RTM_F_NOTIFY = 1 << 8
    RTM_F_CLONED = 1 << 9
    RTM_F_EQUALIZE = 1 << 10
    RTM_F_PREFIX = 1 << 11
    RTM_F_LOOKUP_TABLE = 1 << 12
    RTM_F_FIB_MATCH = 1 << 13
    RTM_F_OFFLOAD = 1 << 14
    RTM_F_TRAP = 1 << 15
    RTM_F_OFFLOAD_FAILED = 1 << 29
+
+
class AddressFamilyLinux(Enum):
    """Address family numbers used when not running on FreeBSD."""

    # INET/INET6 come from the socket module of the running interpreter.
    AF_INET = socket.AF_INET
    AF_INET6 = socket.AF_INET6
    AF_NETLINK = 16
+
+
class AddressFamilyBsd(Enum):
    """Address family numbers used when running on FreeBSD."""

    # INET/INET6 come from the socket module of the running interpreter.
    AF_INET = socket.AF_INET
    AF_INET6 = socket.AF_INET6
    AF_NETLINK = 38
+
+
class NlmBaseFlags(Enum):
    """Message header flag bits in the low byte of ``nlmsg_flags``."""

    NLM_F_REQUEST = 1 << 0
    NLM_F_MULTI = 1 << 1
    NLM_F_ACK = 1 << 2
    NLM_F_ECHO = 1 << 3
    NLM_F_DUMP_INTR = 1 << 4
    NLM_F_DUMP_FILTERED = 1 << 5
+
+# XXX: in python3.8 it is possible to
+# class NlmGetFlags(Enum, NlmBaseFlags):
+
+
class NlmGetFlags(Enum):
    """Flag bits decoded for ``RTM_GET*`` requests."""

    NLM_F_ROOT = 1 << 8
    NLM_F_MATCH = 1 << 9
    NLM_F_ATOMIC = 1 << 10
+
+
class NlmNewFlags(Enum):
    """Flag bits decoded for ``RTM_NEW*`` requests."""

    NLM_F_REPLACE = 1 << 8
    NLM_F_EXCL = 1 << 9
    NLM_F_CREATE = 1 << 10
    NLM_F_APPEND = 1 << 11
+
+
class NlmDeleteFlags(Enum):
    """Flag bits decoded for ``RTM_DEL*`` requests."""

    NLM_F_NONREC = 1 << 8
+
+
class NlmAckFlags(Enum):
    """Flag bits decoded for reply (``NLMSG_ERROR``) messages."""

    NLM_F_CAPPED = 1 << 8
    NLM_F_ACK_TLVS = 1 << 9
+
+
class RtScope(Enum):
    """Values decoded from the ``rtm_scope`` byte of a route message."""

    RT_SCOPE_UNIVERSE = 0x00
    RT_SCOPE_SITE = 0xC8
    RT_SCOPE_LINK = 0xFD
    RT_SCOPE_HOST = 0xFE
    RT_SCOPE_NOWHERE = 0xFF
+
+
class RtType(Enum):
    """Values decoded from the ``rtm_type`` byte, numbered consecutively from 0."""

    RTN_UNSPEC = 0
    RTN_UNICAST = 1
    RTN_LOCAL = 2
    RTN_BROADCAST = 3
    RTN_ANYCAST = 4
    RTN_MULTICAST = 5
    RTN_BLACKHOLE = 6
    RTN_UNREACHABLE = 7
    RTN_PROHIBIT = 8
    RTN_THROW = 9
    RTN_NAT = 10
    RTN_XRESOLVE = 11
+
+
class RtProto(Enum):
    """Values decoded from the ``rtm_protocol`` byte of a route message."""

    # Consecutive block 0..4
    RTPROT_UNSPEC = 0
    RTPROT_REDIRECT = auto()
    RTPROT_KERNEL = auto()
    RTPROT_BOOT = auto()
    RTPROT_STATIC = auto()
    # Consecutive block 8..18
    RTPROT_GATED = 8
    RTPROT_RA = auto()
    RTPROT_MRT = auto()
    RTPROT_ZEBRA = auto()
    RTPROT_BIRD = auto()
    RTPROT_DNROUTED = auto()
    RTPROT_XORP = auto()
    RTPROT_NTK = auto()
    RTPROT_DHCP = auto()
    RTPROT_MROUTED = auto()
    RTPROT_KEEPALIVED = auto()
    # Individually assigned numbers
    RTPROT_BABEL = 42
    RTPROT_OPENR = 99
    RTPROT_BGP = 186
    RTPROT_ISIS = 187
    RTPROT_OSPF = 188
    RTPROT_RIP = 189
    RTPROT_EIGRP = 192
+
+
class NlRtaxType(Enum):
    """Route metric type numbers (``RTAX_*``), numbered consecutively from 0."""

    RTAX_UNSPEC = 0
    RTAX_LOCK = 1
    RTAX_MTU = 2
    RTAX_WINDOW = 3
    RTAX_RTT = 4
    RTAX_RTTVAR = 5
    RTAX_SSTHRESH = 6
    RTAX_CWND = 7
    RTAX_ADVMSS = 8
    RTAX_REORDERING = 9
    RTAX_HOPLIMIT = 10
    RTAX_INITCWND = 11
    RTAX_FEATURES = 12
    RTAX_RTO_MIN = 13
    RTAX_INITRWND = 14
    RTAX_QUICKACK = 15
    RTAX_CC_ALGO = 16
    RTAX_FASTOPEN_NO_COOKIE = 17
+
+
class NlRtGroup(Enum):
    """rtnetlink group numbers (``RTNLGRP_*``), numbered consecutively from 0."""

    RTNLGRP_NONE = 0
    RTNLGRP_LINK = 1
    RTNLGRP_NOTIFY = 2
    RTNLGRP_NEIGH = 3
    RTNLGRP_TC = 4
    RTNLGRP_IPV4_IFADDR = 5
    RTNLGRP_IPV4_MROUTE = 6
    RTNLGRP_IPV4_ROUTE = 7
    RTNLGRP_IPV4_RULE = 8
    RTNLGRP_IPV6_IFADDR = 9
    RTNLGRP_IPV6_MROUTE = 10
    RTNLGRP_IPV6_ROUTE = 11
    RTNLGRP_IPV6_IFINFO = 12
    RTNLGRP_DECnet_IFADDR = 13
    RTNLGRP_NOP2 = 14
    RTNLGRP_DECnet_ROUTE = 15
    RTNLGRP_DECnet_RULE = 16
    RTNLGRP_NOP4 = 17
    RTNLGRP_IPV6_PREFIX = 18
    RTNLGRP_IPV6_RULE = 19
    RTNLGRP_ND_USEROPT = 20
    RTNLGRP_PHONET_IFADDR = 21
    RTNLGRP_PHONET_ROUTE = 22
    RTNLGRP_DCB = 23
    RTNLGRP_IPV4_NETCONF = 24
    RTNLGRP_IPV6_NETCONF = 25
    RTNLGRP_MDB = 26
    RTNLGRP_MPLS_ROUTE = 27
    RTNLGRP_NSID = 28
    RTNLGRP_MPLS_NETCONF = 29
    RTNLGRP_IPV4_MROUTE_R = 30
    RTNLGRP_IPV6_MROUTE_R = 31
    RTNLGRP_NEXTHOP = 32
    RTNLGRP_BRVLAN = 33
+
+
class IfinfoMsg(Structure):
    """ctypes layout of the 16-byte interface-info header used by link messages."""

    _fields_ = [
        ("ifi_family", c_ubyte), ("__ifi_pad", c_ubyte), ("ifi_type", c_ushort),
        ("ifi_index", c_int),
        ("ifi_flags", c_uint), ("ifi_change", c_uint),
    ]
+
+
class IflattrType(Enum):
    """Link attribute type numbers (``IFLA_*``), numbered consecutively from 0."""

    IFLA_UNSPEC = 0
    IFLA_ADDRESS = 1
    IFLA_BROADCAST = 2
    IFLA_IFNAME = 3
    IFLA_MTU = 4
    IFLA_LINK = 5
    IFLA_QDISC = 6
    IFLA_STATS = 7
    IFLA_COST = 8
    IFLA_PRIORITY = 9
    IFLA_MASTER = 10
    IFLA_WIRELESS = 11
    IFLA_PROTINFO = 12
    IFLA_TXQLEN = 13
    IFLA_MAP = 14
    IFLA_WEIGHT = 15
    IFLA_OPERSTATE = 16
    IFLA_LINKMODE = 17
    IFLA_LINKINFO = 18
    IFLA_NET_NS_PID = 19
    IFLA_IFALIAS = 20
    IFLA_NUM_VF = 21
    IFLA_VFINFO_LIST = 22
    IFLA_STATS64 = 23
    IFLA_VF_PORTS = 24
    IFLA_PORT_SELF = 25
    IFLA_AF_SPEC = 26
    IFLA_GROUP = 27
    IFLA_NET_NS_FD = 28
    IFLA_EXT_MASK = 29
    IFLA_PROMISCUITY = 30
    IFLA_NUM_TX_QUEUES = 31
    IFLA_NUM_RX_QUEUES = 32
    IFLA_CARRIER = 33
    IFLA_PHYS_PORT_ID = 34
    IFLA_CARRIER_CHANGES = 35
    IFLA_PHYS_SWITCH_ID = 36
    IFLA_LINK_NETNSID = 37
    IFLA_PHYS_PORT_NAME = 38
    IFLA_PROTO_DOWN = 39
    IFLA_GSO_MAX_SEGS = 40
    IFLA_GSO_MAX_SIZE = 41
    IFLA_PAD = 42
    IFLA_XDP = 43
    IFLA_EVENT = 44
    IFLA_NEW_NETNSID = 45
    IFLA_IF_NETNSID = 46
    IFLA_CARRIER_UP_COUNT = 47
    IFLA_CARRIER_DOWN_COUNT = 48
    IFLA_NEW_IFINDEX = 49
    IFLA_MIN_MTU = 50
    IFLA_MAX_MTU = 51
    IFLA_PROP_LIST = 52
    IFLA_ALT_IFNAME = 53
    IFLA_PERM_ADDRESS = 54
    IFLA_PROTO_DOWN_REASON = 55
+
+
class IfaddrMsg(Structure):
    """ctypes layout of the 8-byte interface-address header used by addr messages."""

    _fields_ = [
        ("ifa_family", c_ubyte), ("ifa_prefixlen", c_ubyte),
        ("ifa_flags", c_ubyte), ("ifa_scope", c_ubyte),
        ("ifa_index", c_uint),
    ]
+
+
class IfattrType(Enum):
    """Address attribute type numbers (``IFA_*``), numbered consecutively from 0."""

    IFA_UNSPEC = 0
    IFA_ADDRESS = 1
    IFA_LOCAL = 2
    IFA_LABEL = 3
    IFA_BROADCAST = 4
    IFA_ANYCAST = 5
    IFA_CACHEINFO = 6
    IFA_MULTICAST = 7
    IFA_FLAGS = 8
    IFA_RT_PRIORITY = 9
    IFA_TARGET_NETNSID = 10
+
+
class NlConst:
    """Plain namespace for the numeric constants used to open the socket."""

    AF_NETLINK = 38
    NETLINK_ROUTE = 0
+
+
class NlHelper:
    """Translate netlink numeric constants to symbolic names and back.

    Value-to-name tables are built on first use for each constant class and
    kept in ``self._pmap``.
    """

    def __init__(self):
        self._pmap = {}
        self._af_cls = self.get_af_cls()

    def get_af_cls(self):
        """Return the address-family enum for the platform we are running on."""
        if not sys.platform.startswith("freebsd"):
            return AddressFamilyLinux
        return AddressFamilyBsd

    def get_propmap(self, cls):
        """Return the cached ``{value: attribute name}`` table for ``cls``."""
        try:
            return self._pmap[cls]
        except KeyError:
            pass
        table = {
            getattr(cls, name).value: name
            for name in dir(cls)
            if not name.startswith("_")
        }
        self._pmap[cls] = table
        return table

    def get_name_propmap(self, cls):
        """Return a fresh ``{attribute name: value}`` table for ``cls`` (not cached)."""
        return {
            name: getattr(cls, name).value
            for name in dir(cls)
            if not name.startswith("_")
        }

    def get_attr_byval(self, cls, attr_val):
        """Return the attribute name of ``cls`` whose value is ``attr_val``, or None."""
        return self.get_propmap(cls).get(attr_val)

    def get_nlmsg_name(self, val):
        """Name a message type, trying rtnetlink types first, then generic ones."""
        for enum_cls in (NlRtMsgType, NlMsgType):
            name = self.get_attr_byval(enum_cls, val)
            if name is not None:
                return name
        return "msg#{}".format(val)

    def get_af_name(self, family):
        """Name an address family number, falling back to ``af#<number>``."""
        name = self.get_attr_byval(self._af_cls, family)
        return "af#{}".format(family) if name is None else name

    def get_af_value(self, family_str: str) -> int:
        """Return the number for a family name (None when the name is unknown)."""
        return self.get_name_propmap(self._af_cls).get(family_str)

    def get_rta_name(self, val):
        """Return the ``RTA_*`` name for ``val``, or None."""
        return self.get_attr_byval(RtattrType, val)

    def get_bitmask_map(self, cls, val):
        """Split ``val`` into its set bits, mapping each bit to a name or its hex form."""
        names = self.get_propmap(cls)
        decoded = {}
        bit = 1
        while val:
            if val & bit:
                decoded[bit] = names[bit] if bit in names else hex(bit)
                val -= bit
            bit <<= 1
        return decoded

    def get_bitmask_str(self, cls, val):
        """Return the comma-joined labels of the bits set in ``val``."""
        return ",".join(self.get_bitmask_map(cls, val).values())

    def get_nlm_flags_str(self, msg_str: str, reply: bool, val):
        """Render ``nlmsg_flags`` using the flag class matching the message kind.

        Replies use the ack flags; requests are matched on the message-name
        prefix, and anything else uses the base flags.  Only one flag class
        is consulted, so bits outside it are rendered in hex.
        """
        if reply:
            flag_cls = NlmAckFlags
        else:
            flag_cls = NlmBaseFlags
            by_prefix = (
                ("RTM_GET", NlmGetFlags),
                ("RTM_DEL", NlmDeleteFlags),
                ("RTM_NEW", NlmNewFlags),
            )
            for prefix, candidate in by_prefix:
                if msg_str.startswith(prefix):
                    flag_cls = candidate
                    break
        return self.get_bitmask_str(flag_cls, val)
+
+
class BaseRtAttr(object):
    """Generic netlink attribute: a 4-byte ``rta_len``/``rta_type`` header plus payload.

    Subclasses override ``_validate`` and ``_parse`` to check and decode the
    payload, and ``_print_attr_value`` to render it.

    :param parent: owning message; must provide ``helper`` and ``attr_enum``
    :param rta_type: raw 16-bit type field, including the two flag bits
    :param rta_len: attribute length as stated in the header
    :param data: raw attribute bytes (header included), or None
    """

    # The two top bits of the 16-bit type field are flags, the remaining
    # 14 bits carry the attribute type itself.
    _NESTED_FLAG = 1 << 15
    _NET_BYTEORDER_FLAG = 1 << 14
    _TYPE_MASK = 0x3fff

    def __init__(self, parent, rta_type, rta_len, data=None):
        self.parent = parent
        self.helper = parent.helper
        self.attr_enum = parent.attr_enum
        self.rta_type = rta_type & self._TYPE_MASK
        self.is_nested = rta_type & self._NESTED_FLAG
        self.network_byte_order = rta_type & self._NET_BYTEORDER_FLAG
        self.rta_len = rta_len
        self.rta_type_str = self.helper.get_attr_byval(self.attr_enum, self.rta_type)  # noqa: E501
        if data is not None:
            self._validate(data)
            self._parse(data)
        self._orig_data = data

    def print_attribute(self, prepend=""):
        """Print a one-line description of the attribute, prefixed by ``prepend``."""
        if self.rta_type_str:
            type_str = self.rta_type_str
        else:
            type_str = "rta#{}".format(self.rta_type)
        print("{}rta_len={} rta_type={}({}){}".format(prepend,
                                                      self.rta_len,
                                                      type_str,
                                                      self.rta_type,
                                                      self._print_attr_value())
              )

    def _print_attr_value(self):
        """Render the payload as a hex dump; empty when no raw data was supplied."""
        payload = (self._orig_data or b"")[4:]
        return " [" + " ".join(["{:02X}".format(b) for b in payload]) + "]"

    @classmethod
    def from_bytes(cls, parent, data):
        """Build an attribute from ``data``, which starts at the attribute header.

        :raises ValueError: if ``data`` is shorter than the header, or the
            stated ``rta_len`` is shorter than the header or longer than
            ``data``
        """
        hdr_len = sizeof(RtAttr)
        if len(data) < hdr_len:
            raise ValueError("length less than rtattr header")
        rta_hdr = RtAttr.from_buffer_copy(data)
        if rta_hdr.rta_len < hdr_len:
            raise ValueError("rta_len {} is shorter than the rtattr header".format(rta_hdr.rta_len))  # noqa: E501
        if rta_hdr.rta_len > len(data):
            raise ValueError("rta_len {} exceeds the {} bytes available".format(rta_hdr.rta_len, len(data)))  # noqa: E501
        self = cls(parent, rta_hdr.rta_type, rta_hdr.rta_len, data[:rta_hdr.rta_len])  # noqa: E501
        # XXX: nested
        return self

    def __bytes__(self):
        """Return the raw attribute bytes, zero-padded to a multiple of four."""
        raw = self._orig_data
        return raw + bytes(-len(raw) % 4)

    def _validate(self, data):
        """Hook: raise ValueError if ``data`` is malformed.  The base class accepts anything."""
        pass

    def _parse(self, data):
        """Hook: decode ``data`` into instance fields.  The base class decodes nothing."""
        pass
+
+
class RtAttrIp(BaseRtAttr):
    """Attribute whose payload is a raw IPv4 (4-byte) or IPv6 (16-byte) address."""

    def _validate(self, data):
        """Accept only payloads of exactly 4 or 16 bytes after the header."""
        if len(data) - 4 not in (4, 16):
            raise ValueError("Error validating attr {}: rta_len is not valid".format(  # noqa: E501
                self.rta_type_str))

    def _parse(self, data):
        """Set ``family`` from the payload length and ``addr`` to the text form."""
        if len(data) - 4 == 4:
            self.family = socket.AF_INET
            raw_addr = data[4:8]
        else:
            self.family = socket.AF_INET6
            raw_addr = data[4:20]
        self.addr = socket.inet_ntop(self.family, raw_addr)

    def _print_attr_value(self):
        return " addr={}".format(self.addr)
+
+
class RtAttrU32(BaseRtAttr):
    """Attribute whose payload is one native-endian unsigned 32-bit integer."""

    def _validate(self, data):
        """Require exactly 8 bytes: 4-byte header plus 4-byte value."""
        if len(data) == 8:
            return
        raise ValueError("Error validating attr {}: rta_len is not valid".format(  # noqa: E501
            self.rta_type_str))

    def _parse(self, data):
        """Store the decoded integer in ``self.value``."""
        (self.value,) = struct.unpack("@I", data[4:8])

    def _print_attr_value(self):
        return " value={}".format(self.value)
+
+
class RtAttrIfindex(RtAttrU32):
    """32-bit attribute holding an interface index, printed with its name if known."""

    def _print_attr_value(self):
        try:
            ifname = socket.if_indextoname(self.value)
        except OSError:
            # No interface with that index on this host: print the bare number.
            return " iface=if#{}".format(self.value)
        return " iface={}(#{})".format(ifname, self.value)
+
+
class RtAttrTable(RtAttrU32):
    """32-bit attribute printed as a routing table number."""

    def _print_attr_value(self):
        table = self.value
        return " rtable={}".format(table)
+
+
class RtAttrNhId(RtAttrU32):
    """32-bit attribute printed as a nexthop id."""

    def _print_attr_value(self):
        nh_id = self.value
        return " nh_id={}".format(nh_id)
+
+
class RtAttrVia(BaseRtAttr):
    """Attribute carrying an address family byte followed by a raw address.

    Layout after the 4-byte attribute header: one family byte, then 4
    (IPv4) or 16 (IPv6) address bytes.
    NOTE(review): the one-byte family matches a 1-byte ``sa_family_t``;
    confirm against the kernel's ``struct rtvia`` on the target platform.
    """

    def _validate(self, data):
        """Check family and length of the payload.

        :raises ValueError: on empty payload, unsupported family or a
            length that does not match the family
        """
        data_len = len(data) - 4
        if data_len == 0:
            raise ValueError("Error validating attr {}: empty data".format(self.rta_type_str))  # noqa: E501
        # The family is the first payload byte, right after the header.
        family = int(data[4])
        if family not in (socket.AF_INET, socket.AF_INET6):
            raise ValueError("Error validating attr {}: unsupported AF {}".format(  # noqa: E501
                self.rta_type_str, family))
        if family == socket.AF_INET:
            expected_len = 1 + 4
        else:
            expected_len = 1 + 16
        if data_len != expected_len:
            raise ValueError("Error validating attr {}: expected len {} got {}".format(  # noqa: E501
                self.rta_type_str, expected_len, data_len))

    def _parse(self, data):
        """Set ``family`` from the first payload byte and ``addr`` to the text form."""
        self.family = int(data[4])
        if self.family == socket.AF_INET:
            self.addr = socket.inet_ntop(self.family, data[5:9])
        else:
            self.addr = socket.inet_ntop(self.family, data[5:21])

    def _print_attr_value(self):
        return ", via={}".format(self.addr)
+
+
class RtAttrStr(BaseRtAttr):
    """Attribute whose payload is decoded as UTF-8 text."""

    def _validate(self, data):
        """Raise ValueError unless the payload decodes as UTF-8."""
        payload = data[4:]
        try:
            payload.decode("utf-8")
        except Exception:
            raise ValueError("wrong utf-8 string")

    def _parse(self, data):
        """Store the decoded payload in ``self.str``."""
        payload = data[4:]
        self.str = payload.decode("utf-8")

    def _print_attr_value(self):
        return " str=\"{}\"".format(self.str)
+
+
# Parser class for each route attribute name; names not listed here are
# handled by the caller's default.
rta_class_map = dict(
    RTA_DST=RtAttrIp,
    RTA_SRC=RtAttrIp,
    RTA_IIF=RtAttrIfindex,
    RTA_OIF=RtAttrIfindex,
    RTA_GATEWAY=RtAttrIp,
    RTA_TABLE=RtAttrTable,
    RTA_VIA=RtAttrVia,
    RTA_NH_ID=RtAttrNhId,
)
+
+
# Parser class for each link attribute name.
ifla_class_map = dict(IFLA_MTU=RtAttrU32)
+
# Parser class for each interface-address attribute name.
ifa_class_map = dict(
    IFA_ADDRESS=RtAttrIp,
    IFA_LOCAL=RtAttrIp,
    IFA_LABEL=RtAttrStr,
    IFA_BROADCAST=RtAttrIp,
    IFA_ANYCAST=RtAttrIp,
    IFA_FLAGS=RtAttrU32,
)
+
+
class BaseNetlinkMessage(object):
    """A netlink message reduced to its generic header.

    Used directly for message types without a dedicated parser and as the
    base class of the typed messages.

    :param helper: object used to turn numeric constants into names
    :param nlmsg_type: numeric message type stored in the header
    """

    def __init__(self, helper, nlmsg_type):
        self.nlmsg_type = nlmsg_type
        # A throwaway TestCase instance provides the assert* helpers below.
        self.ut = unittest.TestCase()
        self.rta_list = []
        self._orig_data = None
        self.helper = helper
        self.nl_hdr = Nlmsghdr(nlmsg_type=nlmsg_type)

    def assertEqual(self, a, b, msg=None):
        self.ut.assertEqual(a, b, msg)

    def assertNotEqual(self, a, b, msg=None):
        self.ut.assertNotEqual(a, b, msg)

    @staticmethod
    def parse_nl_header(data: bytes):
        """Decode the header at the start of ``data``.

        :returns: ``(header, header_size)``
        :raises ValueError: if ``data`` is shorter than the header
        """
        if len(data) < sizeof(Nlmsghdr):
            raise ValueError("length less than netlink message header")
        return Nlmsghdr.from_buffer_copy(data), sizeof(Nlmsghdr)

    def is_reply(self, hdr):
        """Return True when ``hdr`` has the NLMSG_ERROR type."""
        return hdr.nlmsg_type == NlMsgType.NLMSG_ERROR.value

    def print_nl_header(self, hdr, prepend=""):
        """Print the header fields on one line, prefixed by ``prepend``."""
        # len=44, type=RTM_DELROUTE, flags=NLM_F_REQUEST|NLM_F_ACK, seq=1641163704, pid=0  # noqa: E501
        is_reply = self.is_reply(hdr)
        msg_name = self.helper.get_nlmsg_name(hdr.nlmsg_type)
        print("{}len={}, type={}, flags={}(0x{:X}), seq={}, pid={}".format(
            prepend,
            hdr.nlmsg_len,
            msg_name,
            self.helper.get_nlm_flags_str(msg_name, is_reply, hdr.nlmsg_flags),  # noqa: E501
            hdr.nlmsg_flags,
            hdr.nlmsg_seq,
            hdr.nlmsg_pid
        ))

    @classmethod
    def from_bytes(cls, helper, data):
        """Build a message from raw bytes, keeping only the generic header.

        On a header parse failure the raw bytes are dumped and the
        ValueError is re-raised.
        """
        try:
            hdr, hdrlen = BaseNetlinkMessage.parse_nl_header(data)
            self = cls(helper, hdr.nlmsg_type)
            self._orig_data = data
            self.nl_hdr = hdr
        except ValueError as e:
            print("Failed to parse nl header: {}".format(e))
            # print_as_bytes needs a description; omitting it used to raise
            # TypeError and hide the original parse error.
            cls.print_as_bytes(data, "msg dump")
            raise
        return self

    def print_message(self):
        """Print the message; the base class prints only the header."""
        self.print_nl_header(self.nl_hdr)

    @staticmethod
    def print_as_bytes(data: bytes, descr: str = "msg dump"):
        """Print ``data`` as a hex dump, 16 bytes per line, titled ``descr``."""
        print("===vv {} (len:{:3d}) vv===".format(descr, len(data)))
        step = 16
        for off in range(0, len(data), step):
            for byte in data[off:off + step]:
                print(" {:02X}".format(byte), end="")
            print("")
        print("--------------------")
+
+
class NetlinkErrorMessage(BaseNetlinkMessage):
    """NLMSG_ERROR message: generic header followed by an ``Nlmsgerr`` payload.

    :param helper: object used to turn numeric constants into names
    :param nlmsg_type: numeric message type stored in the header
    :param error: initial error code; optional so that the inherited
        two-argument construction in ``from_bytes`` works
    """

    messages = [NlMsgType.NLMSG_ERROR.value]

    def __init__(self, helper, nlmsg_type, error=0):
        super().__init__(helper, nlmsg_type)
        self.err_hdr = Nlmsgerr(error=error)

    @classmethod
    def from_bytes(cls, helper, data):
        """Parse the generic header, then the error payload that follows it.

        :raises ValueError: if ``data`` is too short for either part
        """
        self = super().from_bytes(helper, data)
        payload = data[sizeof(Nlmsghdr):]
        if len(payload) < sizeof(Nlmsgerr):
            raise ValueError("length less than netlink error header")
        self.err_hdr = Nlmsgerr.from_buffer_copy(payload)
        return self

    def print_error_header(self, errhdr, prepend=""):
        """Print the error code and the embedded header on a single line."""
        print("{}error={}, ".format(prepend, errhdr.error), end="")
        # The prefix is already on this line, so do not repeat it.
        self.print_nl_header(errhdr.msg, "")

    def print_message(self, prepend=""):
        """Print the outer header, then the error header indented below it."""
        self.print_nl_header(self.nl_hdr, prepend)
        self.print_error_header(self.err_hdr, prepend + " ")
+
+
class BaseNetlinkRtMessage(BaseNetlinkMessage):
    """Message made of a generic header, a family-specific header and attributes.

    Subclasses set ``attr_class_map`` (attribute name -> parser class) and
    ``attr_enum`` (attribute numbering), and implement
    ``parse_base_header`` and ``print_base_header``.
    """

    attr_class_map = {}
    attr_enum = None

    def __init__(self, helper, nlm_type):
        super().__init__(helper, nlm_type)
        self.base_hdr = None

    def parse_rta_list(self, data: bytes):
        """Parse consecutive attributes from ``data``.

        :returns: ``(attributes, consumed)``, where ``consumed`` is the
            offset reached after the last attribute, 4-byte aligned
        :raises ValueError: on a truncated or malformed attribute header
        """
        attrs = []
        offset = 0
        hdr_len = sizeof(RtAttr)
        while offset < len(data):
            remaining = len(data) - offset
            if remaining < hdr_len:
                raise ValueError("only {} bytes remaining".format(remaining))
            rta_hdr = RtAttr.from_buffer_copy(data[offset:])
            # Reject bad lengths before using them, so that a bogus header
            # can neither stall the loop nor produce a truncated attribute.
            if rta_hdr.rta_len == 0:
                raise ValueError("empty rta len, {} bytes remaining".format(remaining))  # noqa: E501
            if rta_hdr.rta_len < hdr_len:
                raise ValueError("rta len {} is shorter than the attribute header".format(rta_hdr.rta_len))  # noqa: E501
            # Strip the two flag bits so flagged attributes still find
            # their parser class.
            rta_type = rta_hdr.rta_type & 0x3fff
            rta_type_str = self.helper.get_attr_byval(self.attr_enum, rta_type)
            cls = self.attr_class_map.get(rta_type_str, BaseRtAttr)
            rta = cls.from_bytes(self, data[offset:])
            offset += align4(rta.rta_len)
            attrs.append(rta)
        return attrs, offset

    @classmethod
    def from_bytes(cls, helper, data):
        """Build a message from raw bytes: header, base header, attributes.

        Each stage dumps the raw bytes and re-raises on ValueError.
        """
        try:
            hdr, hdrlen = BaseNetlinkMessage.parse_nl_header(data)
            self = cls(helper, hdr.nlmsg_type)
            self._orig_data = data
            self.nl_hdr = hdr
        except ValueError as e:
            print("Failed to parse nl header: {}".format(e))
            cls.print_as_bytes(data, "msg dump")
            raise

        offset = align4(hdrlen)
        try:
            base_hdr, hdrlen = self.parse_base_header(data[offset:])
            self.base_hdr = base_hdr
            offset += align4(hdrlen)
        except ValueError as e:
            print("Failed to parse nl rt header: {}".format(e))
            cls.print_as_bytes(data, "msg dump")
            raise

        orig_offset = offset
        try:
            rta_list, rta_len = self.parse_rta_list(data[offset:])
            offset += rta_len
            if offset != len(data):
                raise ValueError("{} bytes left at the end of the packet".format(len(data) - offset))  # noqa: E501
            self.rta_list = rta_list
        except ValueError as e:
            print("Failed to parse nl rta attributes at offset {}: {}".format(orig_offset, e))  # noqa: E501
            cls.print_as_bytes(data, "msg dump")
            cls.print_as_bytes(data[orig_offset:], "failed block")
            raise
        return self

    def __bytes__(self):
        """Serialize the message, updating ``nlmsg_len`` to the total size."""
        ret = bytes()
        for rta in self.rta_list:
            ret += bytes(rta)
        ret = bytes(self.base_hdr) + ret
        self.nl_hdr.nlmsg_len = len(ret) + sizeof(Nlmsghdr)
        return bytes(self.nl_hdr) + ret

    def print_message(self):
        """Print the generic header, the base header and every attribute."""
        self.print_nl_header(self.nl_hdr)
        self.print_base_header(self.base_hdr, " ")
        for rta in self.rta_list:
            rta.print_attribute(" ")
+
+
class NetlinkRtMessage(BaseNetlinkRtMessage):
    """Route message (NEW/DEL/GET ROUTE) with an ``RtMsgHdr`` base header."""

    messages = [
        t.value for t in (
            NlRtMsgType.RTM_NEWROUTE,
            NlRtMsgType.RTM_DELROUTE,
            NlRtMsgType.RTM_GETROUTE,
        )
    ]
    attr_class_map = rta_class_map
    attr_enum = RtattrType

    def __init__(self, helper, nlm_type):
        super().__init__(helper, nlm_type)
        self.base_hdr = RtMsgHdr()

    def parse_base_header(self, data):
        """Decode the route header; returns ``(header, header_size)``."""
        hdr_size = sizeof(RtMsgHdr)
        if len(data) < hdr_size:
            raise ValueError("length less than rtmsg header")
        return (RtMsgHdr.from_buffer_copy(data), hdr_size)

    def print_base_header(self, hdr, prepend=""):
        """Print the route header, showing names next to the raw numbers."""
        lookup = self.helper.get_attr_byval
        family = self.helper.get_af_name(hdr.rtm_family)
        protocol = lookup(RtProto, hdr.rtm_protocol)
        scope = lookup(RtScope, hdr.rtm_scope)
        route_type = lookup(RtType, hdr.rtm_type)
        flags = self.helper.get_bitmask_str(RtMsgFlags, hdr.rtm_flags)
        print("{}family={}, dst_len={}, src_len={}, tos={}, table={}, protocol={}({}), scope={}({}), type={}({}), flags={}({})".format(  # noqa: E501
            prepend, family,
            hdr.rtm_dst_len, hdr.rtm_src_len, hdr.rtm_tos, hdr.rtm_table,
            protocol, hdr.rtm_protocol,
            scope, hdr.rtm_scope,
            route_type, hdr.rtm_type,
            flags, hdr.rtm_flags))
+
+
class NetlinkIflaMessage(BaseNetlinkRtMessage):
    """Link message (NEW/DEL/GET LINK) with an ``IfinfoMsg`` base header."""

    messages = [
        t.value for t in (
            NlRtMsgType.RTM_NEWLINK,
            NlRtMsgType.RTM_DELLINK,
            NlRtMsgType.RTM_GETLINK,
        )
    ]
    attr_class_map = ifla_class_map
    attr_enum = IflattrType

    def __init__(self, helper, nlm_type):
        super().__init__(helper, nlm_type)
        self.base_hdr = IfinfoMsg()

    def parse_base_header(self, data):
        """Decode the interface-info header; returns ``(header, header_size)``."""
        hdr_size = sizeof(IfinfoMsg)
        if len(data) < hdr_size:
            raise ValueError("length less than IfinfoMsg header")
        return (IfinfoMsg.from_buffer_copy(data), hdr_size)

    def print_base_header(self, hdr, prepend=""):
        """Print the interface-info header fields on one line."""
        fields = (
            prepend,
            self.helper.get_af_name(hdr.ifi_family),
            hdr.ifi_type,
            hdr.ifi_index,
            hdr.ifi_flags,
            hdr.ifi_change,
        )
        print("{}family={}, ifi_type={}, ifi_index={}, ifi_flags={}, ifi_change={}".format(*fields))  # noqa: E501
+
+
class NetlinkIfaMessage(BaseNetlinkRtMessage):
    """Address message (NEW/DEL/GET ADDR) with an ``IfaddrMsg`` base header."""

    messages = [
        t.value for t in (
            NlRtMsgType.RTM_NEWADDR,
            NlRtMsgType.RTM_DELADDR,
            NlRtMsgType.RTM_GETADDR,
        )
    ]
    attr_class_map = ifa_class_map
    attr_enum = IfattrType

    def __init__(self, helper, nlm_type):
        super().__init__(helper, nlm_type)
        self.base_hdr = IfaddrMsg()

    def parse_base_header(self, data):
        """Decode the interface-address header; returns ``(header, header_size)``."""
        hdr_size = sizeof(IfaddrMsg)
        if len(data) < hdr_size:
            raise ValueError("length less than IfaddrMsg header")
        return (IfaddrMsg.from_buffer_copy(data), hdr_size)

    def print_base_header(self, hdr, prepend=""):
        """Print the interface-address header fields on one line."""
        fields = (
            prepend,
            self.helper.get_af_name(hdr.ifa_family),
            hdr.ifa_prefixlen,
            hdr.ifa_flags,
            hdr.ifa_scope,
            hdr.ifa_index,
        )
        print("{}family={}, ifa_prefixlen={}, ifa_flags={}, ifa_scope={}, ifa_index={}".format(*fields))  # noqa: E501
+
+
class Nlsock():
    """Thin wrapper around a NETLINK_ROUTE socket.

    Received bytes are buffered in ``self._data`` and split into messages
    by ``read_message``.

    :param helper: object used to resolve constants and names
    """

    def __init__(self, helper):
        self.helper = helper
        self.sock_fd = self._setup_netlink()
        self._data = bytes()
        self.rtm_seq = 1
        self.pid = os.getpid()
        self.msgmap = self.build_msgmap()
        # NOTE(review): these are group numbers, not bits, so OR-ing them
        # yields a different number -- confirm what the kernel expects here.
        self.set_groups(NlRtGroup.RTNLGRP_IPV4_ROUTE.value | NlRtGroup.RTNLGRP_IPV6_ROUTE.value)  # noqa: E501

    def build_msgmap(self):
        """Return a ``{message type number: message class}`` dispatch table."""
        classes = [
            NetlinkRtMessage,
            NetlinkIflaMessage,
            NetlinkIfaMessage,
            NetlinkErrorMessage,
        ]
        xmap = {}
        for cls in classes:
            for message in cls.messages:
                xmap[message] = cls
        return xmap

    def get_seq(self):
        """Return the current sequence number and advance the counter."""
        ret = self.rtm_seq
        self.rtm_seq += 1
        return ret

    def _setup_netlink(self) -> int:
        """Open the raw NETLINK_ROUTE socket and return the socket object."""
        family = self.helper.get_af_value("AF_NETLINK")
        s = socket.socket(family, socket.SOCK_RAW, NlConst.NETLINK_ROUTE)
        return s

    def set_groups(self, mask: int):
        """Pass ``mask`` to the socket via ``setsockopt(SOL_SOCKET, 1, mask)``."""
        self.sock_fd.setsockopt(socket.SOL_SOCKET, 1, mask)
        # An alternative based on bind() with a packed sockaddr was left
        # disabled by the author:
        # snl = SockaddrNl(nl_len = sizeof(SockaddrNl), nl_family=38,
        #                  nl_pid=self.pid, nl_groups=mask)
        # k = struct.pack("@BBHII", 12, 38, 0, self.pid, mask)
        # self.sock_fd.bind(k)

    def write_message(self, msg):
        """Print ``msg`` and write it to the socket; failures are only reported."""
        print("vvvvvvvv OUT vvvvvvvv")
        msg.print_message()
        msg_bytes = bytes(msg)
        try:
            os.write(self.sock_fd.fileno(), msg_bytes)
        except Exception as e:
            print("write({}) -> {}".format(len(msg_bytes), e))

    def parse_message(self, data: bytes):
        """Turn one raw message into the message object registered for its type."""
        if len(data) < sizeof(Nlmsghdr):
            raise Exception("Short read from nl: {} bytes".format(len(data)))
        hdr = Nlmsghdr.from_buffer_copy(data)
        nlmsg_type = hdr.nlmsg_type
        cls = self.msgmap.get(nlmsg_type)
        if not cls:
            cls = BaseNetlinkMessage
        return cls.from_bytes(self.helper, data)

    def write_data(self, data: bytes):
        """Send raw bytes on the socket."""
        self.sock_fd.send(data)

    def read_data(self):
        """Receive until the buffer holds at least one message header.

        :returns: the chunk returned by the last ``recv`` call
        :raises EOFError: if ``recv`` returns no data, which would otherwise
            spin forever
        """
        hdr_len = sizeof(Nlmsghdr)
        while True:
            data = self.sock_fd.recv(65535)
            if not data:
                raise EOFError("netlink socket returned no data")
            self._data += data
            if len(self._data) >= hdr_len:
                return data

    def read_message(self):
        """Return the next complete message from the buffer as a message object.

        Reads more data until ``nlmsg_len`` bytes are available, then
        removes those bytes from the buffer and parses them.
        """
        if len(self._data) < sizeof(Nlmsghdr):
            self.read_data()
        hdr = Nlmsghdr.from_buffer_copy(self._data)
        while (hdr.nlmsg_len > len(self._data)):
            self.read_data()
        raw_msg = self._data[:hdr.nlmsg_len]
        self._data = self._data[hdr.nlmsg_len:]
        return self.parse_message(raw_msg)

    def fill_msg_seq(self, msg):
        """Stamp ``msg`` with a fresh sequence number and this process id."""
        msg.nl_hdr.nlmsg_seq = self.get_seq()
        msg.nl_hdr.nlmsg_pid = self.pid

    def _send_dump_request(self, msg, family_field, family):
        """Finish a ROOT|MATCH request, print its parsed form and send it."""
        flags = NlmGetFlags.NLM_F_ROOT.value | NlmGetFlags.NLM_F_MATCH.value
        self.fill_msg_seq(msg)
        setattr(msg.base_hdr, family_field, family)
        msg.nl_hdr.nlmsg_flags = flags | NlmBaseFlags.NLM_F_REQUEST.value

        msg_bytes = bytes(msg)
        # Parse our own bytes back to show exactly what goes on the wire.
        x = self.parse_message(msg_bytes)
        x.print_message()
        print(msg_bytes)
        # Skip family for now
        self.write_data(msg_bytes)

    def request_ifaddrs(self, family):
        """Send an RTM_GETADDR dump request for ``family``."""
        msg = NetlinkIfaMessage(self.helper, NlRtMsgType.RTM_GETADDR.value)
        self._send_dump_request(msg, "ifa_family", family)

    def request_routes(self, family):
        """Send an RTM_GETROUTE dump request for ``family``."""
        msg = NetlinkRtMessage(self.helper, NlRtMsgType.RTM_GETROUTE.value)
        self._send_dump_request(msg, "rtm_family", family)
+
+
def main():
    """Request a route dump and print every message received, forever."""
    nl = Nlsock(NlHelper())
    # nl.request_ifaddrs(socket.AF_INET)
    nl.request_routes(0)
    while True:
        received = nl.read_message()
        print("")
        received.print_message()
+
+
+if __name__ == "__main__":
+ main()
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Fri, Nov 8, 7:05 PM (18 h, 10 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
14540999
Default Alt Text
D36002.id109100.diff (219 KB)
Attached To
Mode
D36002: netlink: add netlink support
Attached
Detach File
Event Timeline
Log In to Comment