Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F108311253
D36002.id109971.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
253 KB
Referenced Files
None
Subscribers
None
D36002.id109971.diff
View Options
diff --git a/sys/modules/netlink/Makefile b/sys/modules/netlink/Makefile
new file mode 100644
--- /dev/null
+++ b/sys/modules/netlink/Makefile
@@ -0,0 +1,16 @@
+.PATH: ${SRCTOP}/sys/netlink
+KMOD= netlink
+
+SRCS = netlink_module.c netlink_domain.c netlink_io.c netlink_helpers.c \
+ netlink_message.c netlink_route.c \
+ route/iface.c route/neigh.c route/nexthop.c route/route.c
+
+EXPORT_SYMS=
+EXPORT_SYMS+= nlmsg_get_chain_writer
+EXPORT_SYMS+= nlmsg_refill_buffer
+EXPORT_SYMS+= nlmsg_end
+EXPORT_SYMS+= nlmsg_flush
+
+# NOTE(review): the plain '=' assignment below replaces the symbol list built
+# above, so that list is never what bsd.kmod.mk sees -- confirm which is intended.
+EXPORT_SYMS= YES
+
+.include <bsd.kmod.mk>
diff --git a/sys/net/route.c b/sys/net/route.c
--- a/sys/net/route.c
+++ b/sys/net/route.c
@@ -77,6 +77,8 @@
VNET_PCPUSTAT_SYSUNINIT(rtstat);
#endif
+void *linux_netlink_p = NULL; /* Callback pointer for Linux translator functions */
+
EVENTHANDLER_LIST_DEFINE(rt_addrmsg);
static int rt_ifdelroute(const struct rtentry *rt, const struct nhop_object *,
diff --git a/sys/net/route/route_ctl.h b/sys/net/route/route_ctl.h
--- a/sys/net/route/route_ctl.h
+++ b/sys/net/route/route_ctl.h
@@ -35,6 +35,8 @@
#ifndef _NET_ROUTE_ROUTE_CTL_H_
#define _NET_ROUTE_ROUTE_CTL_H_
+#include <sys/ck.h>
+
struct rib_cmd_info {
uint8_t rc_cmd; /* RTM_ADD|RTM_DEL|RTM_CHANGE */
uint8_t spare[3];
@@ -189,4 +191,31 @@
void rib_notify(struct rib_head *rnh, enum rib_subscription_type type,
struct rib_cmd_info *rc);
+/* Event bridge */
+
+/* Types of events */
+#define NLBR_EVENT_ROUTE 1
+
+/* Event providers */
+#define NLBR_PROVIDER_KERNEL 1
+#define NLBR_PROVIDER_RTSOCK 2
+#define NLBR_PROVIDER_NETLINK 3
+
+struct rib_event_bridge;
+typedef void rib_event_bridge_cb_t(uint32_t event_type, uint32_t fibnum,
+ const struct rt_addrinfo *info, const struct rib_cmd_info *rc, void *arg);
+
+struct rib_event_bridge {
+ rib_event_bridge_cb_t *reb_cb;
+ void *reb_cb_arg;
+ int reb_provider_id;
+ CK_STAILQ_ENTRY(rib_event_bridge) reb_link;
+};
+void rib_bridge_generic_event(int provider_id, uint32_t event_type, uint32_t val1,
+ void *ptr1, void *ptr2);
+void rib_bridge_rt_event(int provider_id, uint32_t fibnum, struct rt_addrinfo *info,
+ struct rib_cmd_info *rc);
+void rib_bridge_link(struct rib_event_bridge *reb);
+void rib_bridge_unlink(struct rib_event_bridge *reb);
+
#endif
diff --git a/sys/net/route/route_ctl.c b/sys/net/route/route_ctl.c
--- a/sys/net/route/route_ctl.c
+++ b/sys/net/route/route_ctl.c
@@ -59,7 +59,7 @@
#define DEBUG_MOD_NAME route_ctl
#define DEBUG_MAX_LEVEL LOG_DEBUG
#include <net/route/route_debug.h>
-_DECLARE_DEBUG(LOG_INFO);
+_DECLARE_DEBUG(LOG_DEBUG);
/*
* This file contains control plane routing tables functions.
@@ -1548,3 +1548,62 @@
return ("unknown");
}
+/*
+ * Registry of event bridge subscribers.  Readers walk the list with
+ * CK_STAILQ_FOREACH; list modifications are serialized by bridge_lock.
+ */
+CK_STAILQ_HEAD(rib_event_bridge_head, rib_event_bridge);
+static struct rib_event_bridge_head bridge_head;
+static struct mtx bridge_lock;	/* kept file-local: not declared in route_ctl.h */
+
+/*
+ * Initializes the (empty) subscriber list and the mutex guarding its updates.
+ */
+static void
+rib_bridge_init(void)
+{
+	CK_STAILQ_INIT(&bridge_head);
+	mtx_init(&bridge_lock, "rib_event_bridge_lock", NULL, MTX_DEF);
+}
+SYSINIT(rib_bridge_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND, rib_bridge_init, NULL);
+
+
+/*
+ * Delivers an event to every registered bridge except those whose
+ * reb_provider_id equals provider_id, so the originator does not get
+ * its own event back.  Asserts that the caller is inside the network epoch.
+ *
+ * val1, ptr1 and ptr2 are handed to the callback unchanged;
+ * rib_bridge_rt_event() uses them for fibnum, rt_addrinfo and rib_cmd_info.
+ */
+void
+rib_bridge_generic_event(int provider_id, uint32_t event_type, uint32_t val1,
+ void *ptr1, void *ptr2)
+{
+ struct rib_event_bridge *reb;
+
+ NET_EPOCH_ASSERT();
+
+ CK_STAILQ_FOREACH(reb, &bridge_head, reb_link) {
+ RT_LOG(LOG_DEBUG3, "HERE reb %p %d", reb, reb->reb_provider_id);
+ if (reb->reb_provider_id != provider_id)
+ reb->reb_cb(event_type, val1, ptr1, ptr2, reb->reb_cb_arg);
+ }
+}
+
+/*
+ * Publishes a route change made by provider_id to the other registered
+ * bridges as an NLBR_EVENT_ROUTE event.
+ */
+void
+rib_bridge_rt_event(int provider_id, uint32_t fibnum, struct rt_addrinfo *info,
+    struct rib_cmd_info *rc)
+{
+	/*
+	 * The message below is logged at LOG_DEBUG3, so only declare the
+	 * print buffer when that level is compiled in; otherwise rtbuf
+	 * would be left declared but unused.
+	 */
+#if DEBUG_MAX_LEVEL >= LOG_DEBUG3
+	char rtbuf[INET6_ADDRSTRLEN + 5];
+	FIB_LOG(LOG_DEBUG3, fibnum, rt_get_family(rc->rc_rt), "received cmd %s for %s",
+	    rib_print_cmd(rc->rc_cmd), rt_print_buf(rc->rc_rt, rtbuf, sizeof(rtbuf)));
+#endif
+	rib_bridge_generic_event(provider_id, NLBR_EVENT_ROUTE, fibnum, info, rc);
+}
+
+
+/*
+ * Registers reb as an event subscriber by inserting it at the head of
+ * the bridge list while holding bridge_lock.
+ */
+void
+rib_bridge_link(struct rib_event_bridge *reb)
+{
+ mtx_lock(&bridge_lock);
+ CK_STAILQ_INSERT_HEAD(&bridge_head, reb, reb_link);
+ mtx_unlock(&bridge_lock);
+ RT_LOG(LOG_DEBUG, "link %p", reb);
+}
+
+/*
+ * Removes reb from the bridge list while holding bridge_lock.
+ * NOTE(review): rib_bridge_generic_event() walks the list without taking
+ * bridge_lock, so presumably the caller must let in-flight readers drain
+ * before freeing or reusing reb -- confirm.
+ */
+void
+rib_bridge_unlink(struct rib_event_bridge *reb)
+{
+ mtx_lock(&bridge_lock);
+ CK_STAILQ_REMOVE(&bridge_head, reb, rib_event_bridge, reb_link);
+ mtx_unlock(&bridge_lock);
+ RT_LOG(LOG_DEBUG, "unlink %p", reb);
+}
+
diff --git a/sys/net/rtsock.c b/sys/net/rtsock.c
--- a/sys/net/rtsock.c
+++ b/sys/net/rtsock.c
@@ -274,6 +274,53 @@
vnet_rts_uninit, 0);
#endif
+/*
+ * Emits an rtsock route message for a single route change.
+ * _cbdata carries the fib number packed into the pointer value.
+ */
+static void
+report_route_event(const struct rib_cmd_info *rc, void *_cbdata)
+{
+	struct nhop_object *nhop;
+
+	/* For RTM_DELETE report the old nexthop, otherwise the new one. */
+	if (rc->rc_cmd == RTM_DELETE)
+		nhop = rc->rc_nh_old;
+	else
+		nhop = rc->rc_nh_new;
+
+	rt_routemsg(rc->rc_cmd, rc->rc_rt, nhop, (uint32_t)(uintptr_t)_cbdata);
+}
+
+/*
+ * Reports a route change via rtsock.  When ROUTE_MPATH is defined, a
+ * change whose old or new nexthop is a nexthop group is passed to
+ * rib_decompose_notification() with report_route_event() as the callback;
+ * otherwise the change is reported directly.  The info argument is not
+ * used here.
+ */
+static void
+rts_handle_route_event(uint32_t fibnum, const struct rt_addrinfo *info,
+ const struct rib_cmd_info *rc)
+{
+#ifdef ROUTE_MPATH
+ if ((rc->rc_nh_new && NH_IS_NHGRP(rc->rc_nh_new)) ||
+ (rc->rc_nh_old && NH_IS_NHGRP(rc->rc_nh_old))) {
+ rib_decompose_notification(rc, report_route_event,
+ (void *)(uintptr_t)fibnum);
+ } else
+#endif
+ /* Without ROUTE_MPATH this call is unconditional; with it, the else branch. */
+ report_route_event(rc, (void *)(uintptr_t)fibnum);
+}
+
+/*
+ * Event bridge callback for rtsock: forwards route events to
+ * rts_handle_route_event() and ignores every other event type.
+ */
+static void
+rtsbridge_cb_func(uint32_t event_type, uint32_t fibnum,
+    const struct rt_addrinfo *info, const struct rib_cmd_info *rc, void *arg)
+{
+	if (event_type != NLBR_EVENT_ROUTE)
+		return;
+
+	rts_handle_route_event(fibnum, info, rc);
+}
+
+static struct rib_event_bridge rtsbridge = {
+ .reb_cb = rtsbridge_cb_func,
+ .reb_cb_arg = NULL,
+ .reb_provider_id = NLBR_PROVIDER_RTSOCK,
+};
+
+/*
+ * Registers the rtsock bridge so that events published by other
+ * providers are delivered to rtsbridge_cb_func().
+ */
+static void
+rtsock_init(void)
+{
+ rib_bridge_link(&rtsbridge);
+}
+SYSINIT(rtsock_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rtsock_init, NULL);
+
static void
rts_handle_ifnet_arrival(void *arg __unused, struct ifnet *ifp)
{
@@ -1074,6 +1121,7 @@
}
error = rib_action(fibnum, rtm->rtm_type, &info, &rc);
if (error == 0) {
+ rib_bridge_rt_event(NLBR_PROVIDER_RTSOCK, fibnum, &info, &rc);
#ifdef ROUTE_MPATH
if (NH_IS_NHGRP(rc.rc_nh_new) ||
(rc.rc_nh_old && NH_IS_NHGRP(rc.rc_nh_old))) {
@@ -1095,6 +1143,7 @@
case RTM_DELETE:
error = rib_action(fibnum, RTM_DELETE, &info, &rc);
if (error == 0) {
+ rib_bridge_rt_event(NLBR_PROVIDER_RTSOCK, fibnum, &info, &rc);
#ifdef ROUTE_MPATH
if (NH_IS_NHGRP(rc.rc_nh_old) ||
(rc.rc_nh_new && NH_IS_NHGRP(rc.rc_nh_new))) {
diff --git a/sys/netlink/netlink.h b/sys/netlink/netlink.h
new file mode 100644
--- /dev/null
+++ b/sys/netlink/netlink.h
@@ -0,0 +1,226 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ * Copyright (c) 2022 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * This file contains structures and constants for RFC 3549 (Netlink)
+ * protocol. Some values have been taken from Linux implementation.
+ */
+
+#ifndef _NETLINK_NETLINK_H_
+#define _NETLINK_NETLINK_H_
+
+#ifndef _KERNEL
+#ifndef AF_MPLS
+#define AF_MPLS 39
+#endif
+#endif
+
+#include <sys/types.h>
+#include <sys/socket.h>
+
+struct sockaddr_nl {
+ uint8_t nl_len; /* total length */
+ sa_family_t nl_family; /* AF_NETLINK */
+ uint16_t nl_pad; /* zero */
+ uint32_t nl_pid; /* port ID */
+ uint32_t nl_groups; /* multicast groups mask */
+};
+
+#define SOL_NETLINK 270
+
+/* Currently supported socket options */
+#define NETLINK_ADD_MEMBERSHIP 1
+#define NETLINK_DROP_MEMBERSHIP 2
+#define NETLINK_PKTINFO 3 /* XXX: not supported */
+#define NETLINK_BROADCAST_ERROR 4 /* XXX: not supported */
+#define NETLINK_NO_ENOBUFS 5 /* XXX: not supported */
+#define NETLINK_RX_RING 6 /* XXX: not supported */
+#define NETLINK_TX_RING 7 /* XXX: not supported */
+#define NETLINK_LISTEN_ALL_NSID 8 /* XXX: not supported */
+
+#define NETLINK_LIST_MEMBERSHIPS 9
+#define NETLINK_CAP_ACK 10
+#define NETLINK_EXT_ACK 11
+#define NETLINK_GET_STRICT_CHK 12 /* XXX: not supported */
+
+
+/*
+ * RFC 3549, 2.3.2 Netlink Message Header
+ */
+struct nlmsghdr {
+ uint32_t nlmsg_len; /* Length of message including header */
+ uint16_t nlmsg_type; /* Message type identifier */
+ uint16_t nlmsg_flags; /* Flags (NLM_F_) */
+ uint32_t nlmsg_seq; /* Sequence number */
+ uint32_t nlmsg_pid; /* Sending process port ID */
+};
+
+/*
+ * RFC 3549, 2.3.2.2 The ACK Netlink Message
+ */
+struct nlmsgerr {
+ int error;
+ struct nlmsghdr msg;
+};
+
+/*
+ * RFC 3549, 2.3.2 standard flag bits (nlmsg_flags)
+ */
+#define NLM_F_REQUEST 0x01 /* It is request message. */
+#define NLM_F_MULTI 0x02 /* Multipart message, terminated by NLMSG_DONE */
+#define NLM_F_ACK 0x04 /* Reply with ack, with zero or error code */
+#define NLM_F_ECHO 0x08 /* Echo this request */
+#define NLM_F_DUMP_INTR 0x10 /* Dump was inconsistent due to sequence change */
+#define NLM_F_DUMP_FILTERED 0x20 /* Dump was filtered as requested */
+
+/*
+ * RFC 3549, 2.3.2 Additional flag bits for GET requests
+ */
+#define NLM_F_ROOT 0x100 /* Return the complete table */
+#define NLM_F_MATCH 0x200 /* Return all entries matching criteria */
+#define NLM_F_ATOMIC 0x400 /* Return an atomic snapshot (ignored) */
+#define NLM_F_DUMP (NLM_F_ROOT | NLM_F_MATCH)
+
+/*
+ * RFC 3549, 2.3.2 Additional flag bits for NEW requests
+ */
+#define NLM_F_REPLACE 0x100 /* Replace existing matching config object */
+#define NLM_F_EXCL 0x200 /* Don't replace the object if exists */
+#define NLM_F_CREATE 0x400 /* Create if it does not exist */
+#define NLM_F_APPEND 0x800 /* Add to end of list */
+
+/* Modifiers to DELETE requests */
+#define NLM_F_NONREC 0x100 /* Do not delete recursively */
+
+/* Flags for ACK message */
+#define NLM_F_CAPPED 0x100 /* request was capped */
+#define NLM_F_ACK_TLVS 0x200 /* extended ACK TLVs were included */
+
+/*
+ * RFC 3549, 2.3.2 standard message types (nlmsg_type).
+ */
+#define NLMSG_NOOP 0x1 /* Message is ignored. */
+#define NLMSG_ERROR 0x2 /* reply error code reporting */
+#define NLMSG_DONE 0x3 /* Message terminates a multipart message. */
+#define NLMSG_OVERRUN 0x4 /* overrun detected, data is lost */
+
+#define NLMSG_MIN_TYPE 0x10 /* < 0x10: reserved control messages */
+
+/*
+ * Definition of numbers assigned to the netlink subsystems.
+ */
+#define NETLINK_ROUTE 0 /* Routing/device hook */
+#define NETLINK_UNUSED 1 /* not supported */
+#define NETLINK_USERSOCK 2 /* Reserved for user mode socket protocols (not supported) */
+#define NETLINK_FIREWALL 3 /* (not supported) */
+#define NETLINK_SOCK_DIAG 4 /* socket monitoring (not supported) */
+#define NETLINK_NFLOG 5 /* netfilter/iptables ULOG (not supported) */
+#define NETLINK_XFRM 6 /* ipsec (not supported) */
+#define NETLINK_SELINUX 7 /* SELinux event notifications (not supported) */
+#define NETLINK_ISCSI 8 /* Open-iSCSI (not supported) */
+#define NETLINK_AUDIT 9 /* auditing (not supported) */
+#define NETLINK_FIB_LOOKUP 10 /* not supported */
+#define NETLINK_CONNECTOR 11 /* not supported */
+#define NETLINK_NETFILTER 12 /* netfilter subsystem (not supported) */
+#define NETLINK_IP6_FW 13 /* not supported */
+#define NETLINK_DNRTMSG 14 /* DECnet routing messages (not supported ) */
+#define NETLINK_KOBJECT_UEVENT 15 /* Kernel messages to userspace (not supported) */
+#define NETLINK_GENERIC 16 /* not supported */
+
+
+#ifndef roundup2
+#define roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) /* if y is powers of two */
+#endif
+#define NL_ITEM_ALIGN_SIZE sizeof(uint32_t)
+#define NL_ITEM_ALIGN(_len) roundup2(_len, NL_ITEM_ALIGN_SIZE)
+/* Pointer to the byte located _off bytes past _ptr. */
+#define NL_ITEM_DATA(_ptr, _off) ((void *)((char *)(_ptr) + (_off)))
+#define NL_ITEM_DATA_CONST(_ptr, _off) ((const void *)((const char *)(_ptr) + (_off)))
+
+/*
+ * True when _len bytes can hold an item header of _hlen bytes and the
+ * item's own length (read via _LEN_M) is at least _hlen and at most _len.
+ */
+#define NL_ITEM_OK(_ptr, _len, _hlen, _LEN_M) \
+	((_len) >= (_hlen) && _LEN_M(_ptr) >= (_hlen) && _LEN_M(_ptr) <= (_len))
+/* Item that follows _ptr, i.e. _ptr advanced by _LEN_M(_ptr) bytes. */
+#define NL_ITEM_NEXT(_ptr, _LEN_M) ((typeof(_ptr))((char *)(_ptr) + _LEN_M(_ptr)))
+/* Subtracts the current item's length from _len and yields the next item. */
+#define NL_ITEM_ITER(_ptr, _len, _LEN_MACRO) \
+	((_len) -= _LEN_MACRO(_ptr), NL_ITEM_NEXT(_ptr, _LEN_MACRO))
+
+
+#ifndef _KERNEL
+/* part of netlink(3) API */
+#define NLMSG_ALIGNTO NL_ITEM_ALIGN_SIZE
+#define NLMSG_ALIGN(_len) NL_ITEM_ALIGN(_len)
+#define NLMSG_HDRLEN ((int)sizeof(struct nlmsghdr))
+#define NLMSG_LENGTH(_len) ((_len) + NLMSG_HDRLEN)
+#define NLMSG_SPACE(_len) NLMSG_ALIGN(NLMSG_LENGTH(_len)) /* aligned size of header plus _len payload bytes */
+#define NLMSG_DATA(_hdr) NL_ITEM_DATA(_hdr, NLMSG_HDRLEN)
+#define _NLMSG_LEN(_hdr) ((int)(_hdr)->nlmsg_len)
+#define _NLMSG_ALIGNED_LEN(_hdr) NLMSG_ALIGN(_NLMSG_LEN(_hdr))
+#define NLMSG_OK(_hdr, _len) NL_ITEM_OK(_hdr, _len, NLMSG_HDRLEN, _NLMSG_LEN)
+#define NLMSG_PAYLOAD(_hdr,_len) (_NLMSG_LEN(_hdr) - NLMSG_SPACE((_len)))
+#define NLMSG_NEXT(_hdr, _len) NL_ITEM_ITER(_hdr, _len, _NLMSG_ALIGNED_LEN)
+
+#else
+#define NLMSG_ALIGNTO 4U
+#define NLMSG_ALIGN(len) (((len) + NLMSG_ALIGNTO - 1) & ~(NLMSG_ALIGNTO - 1))
+#define NLMSG_HDRLEN ((int)NLMSG_ALIGN(sizeof(struct nlmsghdr)))
+#endif
+
+/*
+ * Base netlink attribute TLV header.
+ */
+struct nlattr {
+ uint16_t nla_len; /* Total attribute length */
+ uint16_t nla_type; /* Attribute type */
+};
+
+/*
+ *
+ * nla_type field encoding:
+ *
+ * 0 1
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |N|O| Attribute type |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * N - attribute contains other attributes
+ * O - encoded in network byte order
+ * Note: N & O are mutually exclusive
+ *
+ * Note: attribute type value scope normally is per-message
+ * or per message group.
+ */
+
+#define NLA_F_NESTED (1 << 15)
+#define NLA_F_NET_BYTEORDER (1 << 14)
+#define NLA_TYPE_MASK ~(NLA_F_NESTED | NLA_F_NET_BYTEORDER)
+
+#ifndef _KERNEL
+#define NLA_ALIGNTO NL_ITEM_ALIGN_SIZE
+#define NLA_ALIGN(_len) NL_ITEM_ALIGN(_len)
+#define NLA_HDRLEN ((int)sizeof(struct nlattr))
+#endif
+
+#endif
diff --git a/sys/netlink/netlink_ctl.h b/sys/netlink/netlink_ctl.h
new file mode 100644
--- /dev/null
+++ b/sys/netlink/netlink_ctl.h
@@ -0,0 +1,344 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ * Copyright (c) 2022 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _NETLINK_NETLINK_CTL_H_
+#define _NETLINK_NETLINK_CTL_H_
+
+/*
+ * This file provides headers for the public KPI of the netlink
+ * subsystem
+ */
+
+MALLOC_DECLARE(M_NETLINK);
+
+/*
+ * Macro for handling attribute TLVs
+ */
+#define _roundup2(x, y) (((x)+((y)-1))&(~((y)-1)))
+
+#define NETLINK_ALIGN_SIZE sizeof(uint32_t)
+#define NETLINK_ALIGN(_len) _roundup2(_len, NETLINK_ALIGN_SIZE)
+
+#define NLA_ALIGN_SIZE sizeof(uint32_t)
+#define NLA_ALIGN(_len) _roundup2(_len, NLA_ALIGN_SIZE)
+#define NLA_HDRLEN ((int)sizeof(struct nlattr))
+#define NLA_DATA_LEN(_nla) ((int)((_nla)->nla_len - NLA_HDRLEN))
+#define NLA_DATA(_nla) NL_ITEM_DATA(_nla, NLA_HDRLEN)
+#define NLA_DATA_CONST(_nla) NL_ITEM_DATA_CONST(_nla, NLA_HDRLEN)
+
+#define NLA_TYPE(_nla) ((_nla)->nla_type & 0x3FFF)
+
+/*
+ * Returns the attribute following _attr, i.e. _attr advanced by its
+ * aligned nla_len.  An attribute with nla_len == 0 yields _attr itself,
+ * so code iterating over untrusted data must reject short lengths.
+ */
+#define NLA_NEXT(_attr) ((struct nlattr *)((char *)(_attr) + NLA_ALIGN((_attr)->nla_len)))
+
+#define _NLA_END(_start, _len) ((char *)(_start) + (_len))
+/*
+ * Iterates _attr over the attributes stored in [_start, _start + _len).
+ * The loop stops before an attribute whose aligned length would run past
+ * the end of that range.
+ */
+#define NLA_FOREACH(_attr, _start, _len) \
+	for ((_attr) = (_start); \
+	    ((char *)(_attr) < _NLA_END(_start, _len)) && \
+	    ((char *)NLA_NEXT(_attr) <= _NLA_END(_start, _len)); \
+	    (_attr) = NLA_NEXT(_attr))
+
+/*
+ (NLA_ALIGN(_attr->nla_len) >= NLA_HDRLEN) && \
+*/
+
+struct mbuf;
+struct nlmsg_state;
+typedef bool nlmsg_state_cb(struct nlmsg_state *ns, void *buf, int buflen, int cnt);
+
+struct nlmsg_state {
+ int alloc_len; /* allocated buffer length */
+ int offset; /* offset from the start of the buffer */
+ struct nlmsghdr *hdr; /* Pointer to the currently-filled msg */
+ char *data; /* pointer to the contiguous storage */
+ void *_storage; /* Underlying storage pointer */
+ nlmsg_state_cb *cb; /* Callback to flush data */
+ void *arg; /* Callback argument */
+ int num_messages; /* Number of messages in the buffer */
+ int malloc_flag; /* M_WAITOK or M_NOWAIT */
+ uint8_t writer_type; /* NS_WRITER_TYPE_* */
+ uint8_t writer_target; /* NS_WRITER_TARGET_* */
+ bool ignore_limit; /* If true, ignores RCVBUF limit */
+};
+#define NS_WRITER_TARGET_SOCKET 0
+#define NS_WRITER_TARGET_GROUP 1
+#define NS_WRITER_TARGET_CHAIN 2
+
+#define NS_WRITER_TYPE_MBUF 0
+#define NS_WRITER_TYPE_BUF 1
+#define NS_WRITER_TYPE_LBUF 2
+#define NS_WRITER_TYPE_MBUFC 3
+
+
+#define NLMSG_SMALL 128
+#define NLMSG_LARGE 2048
+
+/* Message and attribute writing */
+
+struct nlpcb;
+bool nlmsg_get_socket_writer(int size, struct nlpcb *nlp, struct nlmsg_state *ns);
+bool nlmsg_get_group_writer(int size, uint32_t group_mask, struct nlmsg_state *ns);
+bool nlmsg_get_chain_writer(int size, struct mbuf **pm, struct nlmsg_state *ns);
+bool nlmsg_flush(struct nlmsg_state *ns);
+void nlmsg_ignore_limit(struct nlmsg_state *ns);
+
+bool nlmsg_refill_buffer(struct nlmsg_state *ns, int required_size);
+bool nlmsg_add(struct nlmsg_state *ns, uint32_t portid, uint32_t seq, uint16_t type,
+ uint16_t flags, uint32_t len);
+void nlmsg_end(struct nlmsg_state *ns);
+void nlmsg_abort(struct nlmsg_state *ns);
+
+bool nlmsg_end_dump(struct nlmsg_state *ns, int error, struct nlmsghdr *hdr);
+
+static inline bool
+nlmsg_reply(struct nlmsg_state *ns, const struct nlmsghdr *hdr, int payload_len)
+{
+ return (nlmsg_add(ns, hdr->nlmsg_pid, hdr->nlmsg_seq, hdr->nlmsg_type,
+ hdr->nlmsg_flags, payload_len));
+}
+
+#define nlmsg_data(_hdr) ((void *)((_hdr) + 1))
+
+/*
+ * KPI similar to mtodo():
+ * current (uncompleted) header is guaranteed to be contiguous,
+ * but can be reallocated, thus pointers may need to be readjusted.
+ */
+static inline int
+nlattr_save_offset(const struct nlmsg_state *ns)
+{
+ return (ns->offset - ((char *)ns->hdr - ns->data));
+}
+
+static inline void *
+_nlattr_restore_offset(const struct nlmsg_state *ns, int off)
+{
+ return ((void *)((char *)ns->hdr + off));
+}
+#define nlattr_restore_offset(_ns, _off, _t) ((_t *)_nlattr_restore_offset(_ns, _off))
+
+/*
+ * Reserves sz bytes, rounded up to the netlink alignment, at the current
+ * write offset, asking nlmsg_refill_buffer() for room when they do not fit.
+ * Returns a pointer to the reserved area, or NULL if the refill failed.
+ * The reserved bytes are not initialized here.
+ */
+static inline void *
+nlmsg_reserve_data_raw(struct nlmsg_state *ns, size_t sz)
+{
+	/* Use one aligned size for both the space check and the advance. */
+	sz = NETLINK_ALIGN(sz);
+
+	if (__predict_false(ns->offset + sz > ns->alloc_len)) {
+		if (!nlmsg_refill_buffer(ns, sz))
+			return (NULL);
+	}
+
+	/* Read data/offset only after a possible refill has run. */
+	void *data_ptr = &ns->data[ns->offset];
+	ns->offset += sz;
+
+	return (data_ptr);
+}
+#define nlmsg_reserve_object(_ns, _t) ((_t *)nlmsg_reserve_data_raw(_ns, NLA_ALIGN(sizeof(_t))))
+#define nlmsg_reserve_data(_ns, _sz, _t) ((_t *)nlmsg_reserve_data_raw(_ns, _sz))
+
+static inline void *
+_nlmsg_reserve_attr(struct nlmsg_state *ns, uint16_t nla_type, uint16_t sz)
+{
+ sz += sizeof(struct nlattr);
+
+ struct nlattr *nla = nlmsg_reserve_data(ns, sz, struct nlattr);
+ if (__predict_false(nla == NULL))
+ return (NULL);
+ nla->nla_type = nla_type;
+ nla->nla_len = sz;
+
+ return ((void *)(nla + 1));
+}
+#define nlmsg_reserve_attr(_ns, _at, _t) ((_t *)_nlmsg_reserve_attr(_ns, _at, NLA_ALIGN(sizeof(_t))))
+
+/*
+ * Appends an attribute of type attr_type carrying attr_len bytes copied
+ * from data (data is not read when attr_len is 0).
+ * Returns false when the length cannot be represented in the 16-bit
+ * nla_len field or when the buffer cannot be refilled; true otherwise.
+ */
+static inline bool
+nlattr_add(struct nlmsg_state *ns, int attr_type, int attr_len, const void *data)
+{
+	/* nla_len is uint16_t: refuse lengths that would be truncated. */
+	if (__predict_false(attr_len < 0 ||
+	    attr_len > 0xFFFF - (int)sizeof(struct nlattr)))
+		return (false);
+
+	int data_len = attr_len + sizeof(struct nlattr);
+	int required_len = NLA_ALIGN(data_len);
+
+	if (__predict_false(ns->offset + required_len > ns->alloc_len)) {
+		if (!nlmsg_refill_buffer(ns, required_len))
+			return (false);
+	}
+
+	struct nlattr *nla = (struct nlattr *)(&ns->data[ns->offset]);
+
+	nla->nla_len = data_len;
+	nla->nla_type = attr_type;
+	if (attr_len > 0)
+		memcpy((nla + 1), data, attr_len);
+	/* Zero the alignment padding so stale buffer bytes are never emitted. */
+	if (required_len > data_len)
+		memset((char *)nla + data_len, 0, required_len - data_len);
+	ns->offset += required_len;
+	return (true);
+}
+
+static inline bool
+nlattr_add_u8(struct nlmsg_state *ns, int attrtype, uint8_t value)
+{
+ return (nlattr_add(ns, attrtype, sizeof(uint8_t), &value));
+}
+
+static inline bool
+nlattr_add_u16(struct nlmsg_state *ns, int attrtype, uint16_t value)
+{
+ return (nlattr_add(ns, attrtype, sizeof(uint16_t), &value));
+}
+
+static inline bool
+nlattr_add_u32(struct nlmsg_state *ns, int attrtype, uint32_t value)
+{
+ return (nlattr_add(ns, attrtype, sizeof(uint32_t), &value));
+}
+
+static inline bool
+nlattr_add_u64(struct nlmsg_state *ns, int attrtype, uint64_t value)
+{
+ return (nlattr_add(ns, attrtype, sizeof(uint64_t), &value));
+}
+
+static inline bool
+nlattr_add_s8(struct nlmsg_state *ns, int attrtype, int8_t value)
+{
+ return (nlattr_add(ns, attrtype, sizeof(int8_t), &value));
+}
+
+static inline bool
+nlattr_add_s16(struct nlmsg_state *ns, int attrtype, int16_t value)
+{
+ return (nlattr_add(ns, attrtype, sizeof(int16_t), &value));
+}
+
+static inline bool
+nlattr_add_s32(struct nlmsg_state *ns, int attrtype, int32_t value)
+{
+ return (nlattr_add(ns, attrtype, sizeof(int32_t), &value));
+}
+
+static inline bool
+nlattr_add_s64(struct nlmsg_state *ns, int attrtype, int64_t value)
+{
+ return (nlattr_add(ns, attrtype, sizeof(int64_t), &value));
+}
+
+static inline bool
+nlattr_add_flag(struct nlmsg_state *ns, int attrtype)
+{
+ return (nlattr_add(ns, attrtype, 0, NULL));
+}
+
+static inline bool
+nlattr_add_string(struct nlmsg_state *ns, int attrtype, const char *str)
+{
+ return (nlattr_add(ns, attrtype, strlen(str) + 1, str));
+}
+
+/* Attribute reading */
+
+/* netlink_attr_helpers.c */
+struct netlink_parse_tracker;
+
+typedef int parse_attr_f(struct nlattr *attr, struct netlink_parse_tracker *npt,
+ void *target);
+struct nlattr_parser {
+ uint16_t type; /* Attribute type */
+ uint16_t off; /* field offset in the target structure */
+ parse_attr_f *cb; /* parser function to call */
+};
+
+int nl_parse_attrs_raw(struct nlattr *nla_head, int len, struct nlattr_parser *ps,
+ int pslen, struct netlink_parse_tracker *npt, void *target);
+int nl_parse_attrs(struct nlmsghdr *hdr, int hdrlen, struct nlattr_parser *ps,
+ int pslen, struct netlink_parse_tracker *npt, void *target);
+
+int nlattr_get_flag(struct nlattr *nla, struct netlink_parse_tracker *npt,
+ void *target);
+int nlattr_get_ip(struct nlattr *nla, struct netlink_parse_tracker *npt,
+ void *target);
+int nlattr_get_uint32(struct nlattr *nla, struct netlink_parse_tracker *npt,
+ void *target);
+int nlattr_get_ifindex(struct nlattr *nla, struct netlink_parse_tracker *npt,
+ void *target);
+int nlattr_get_ipvia(struct nlattr *nla, struct netlink_parse_tracker *npt,
+ void *target);
+int nlattr_get_string(struct nlattr *nla, struct netlink_parse_tracker *npt,
+ void *target);
+int nlattr_get_nla(struct nlattr *nla, struct netlink_parse_tracker *npt,
+ void *target);
+
+/* Parsing state */
+struct linear_buffer {
+ char *base; /* Base allocated memory pointer */
+ uint32_t offset; /* Currently used offset */
+ uint32_t size; /* Total buffer size */
+};
+
+/*
+ * Carves len bytes, rounded up to a multiple of sizeof(uint64_t), out of
+ * the linear buffer.  Returns NULL when len is negative or the rounded
+ * size does not fit in the remaining space; nothing is ever freed
+ * individually.
+ */
+static inline void *
+lb_alloc(struct linear_buffer *lb, int len)
+{
+	size_t need;
+
+	/* A negative length would otherwise wrap in the unsigned check below. */
+	if (len < 0)
+		return (NULL);
+	need = roundup2((size_t)len, sizeof(uint64_t));
+	/* offset never exceeds size, so the subtraction cannot wrap. */
+	if (need > lb->size - lb->offset)
+		return (NULL);
+	void *data = (void *)(lb->base + lb->offset);
+	lb->offset += need;
+	return (data);
+}
+
+static inline void
+lb_clear(struct linear_buffer *lb)
+{
+ memset(lb->base, 0, lb->size);
+ lb->offset = 0;
+}
+
+#define SCRATCH_BUFFER_SIZE 1024
+struct netlink_parse_tracker {
+ struct linear_buffer lb; /* Per-message scratch buffer */
+ struct nlpcb *nlp; /* Originator */
+ struct nlmsg_state *ns; /* Message writer to use */
+ int error; /* last operation error */
+};
+
+static inline void *
+npt_alloc(struct netlink_parse_tracker *npt, int len)
+{
+ return (lb_alloc(&npt->lb, len));
+}
+#define npt_alloc_sockaddr(_npt, _len) ((struct sockaddr *)(npt_alloc(_npt, _len)))
+
+
+
+/* Protocol handlers */
+typedef int (*nl_handler_f)(struct nlmsghdr *hdr, struct netlink_parse_tracker *npt);
+
+bool netlink_register_proto(int proto, const char *proto_name, nl_handler_f handler);
+bool netlink_unregister_proto(int proto);
+
+/* Generic */
+bool nl_has_listeners(int netlink_family, uint32_t groups_mask);
+bool nlp_has_priv(struct nlpcb *nlp, int priv);
+bool nlp_has_priv_route(struct nlpcb *nlp);
+
+/* Debug */
+uint32_t nlp_get_pid(const struct nlpcb *nlp);
+
+#endif
diff --git a/sys/netlink/netlink_debug.h b/sys/netlink/netlink_debug.h
new file mode 100644
--- /dev/null
+++ b/sys/netlink/netlink_debug.h
@@ -0,0 +1,80 @@
+/*-
+ * Copyright (c) 2022
+ * Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NETLINK_NETLINK_DEBUG_H_
+#define _NETLINK_NETLINK_DEBUG_H_
+
+#include <net/route/route_debug.h>
+
+/*
+ * Generic debug
+ * [nl_domain] func_name: debug text
+ */
+#define NL_DEBUG RT_DEBUG
+
+/*
+ * Logging for events specific for particular process
+ * Example: [nl_domain] PID 4834 fdump_sa: unsupported family: 45
+ */
+#define NL_RAW_PID_LOG(_l, _pid, _fmt, ...) NL_RAW_PID_LOG_##_l(_l, _pid, _fmt, ## __VA_ARGS__)
+#define _NL_RAW_PID_LOG(_l, _pid, _fmt, ...) if (_DEBUG_PASS_MSG(_l)) { \
+ _output("[" DEBUG_PREFIX_NAME "] PID %u %s: " _fmt "\n", _pid, __func__, ##__VA_ARGS__); \
+}
+
+#define NLP_LOG(_l, _nlp, _fmt, ...) NL_RAW_PID_LOG_##_l(_l, nlp_get_pid(_nlp), _fmt, ## __VA_ARGS__)
+
+#if DEBUG_MAX_LEVEL>=LOG_DEBUG3
+#define NL_RAW_PID_LOG_LOG_DEBUG3 _NL_RAW_PID_LOG
+#else
+#define NL_RAW_PID_LOG_LOG_DEBUG3(_l, _pid, _fmt, ...)
+#endif
+#if DEBUG_MAX_LEVEL>=LOG_DEBUG2
+#define NL_RAW_PID_LOG_LOG_DEBUG2 _NL_RAW_PID_LOG
+#else
+#define NL_RAW_PID_LOG_LOG_DEBUG2(_l, _pid, _fmt, ...)
+#endif
+#if DEBUG_MAX_LEVEL>=LOG_DEBUG
+#define NL_RAW_PID_LOG_LOG_DEBUG _NL_RAW_PID_LOG
+#else
+#define NL_RAW_PID_LOG_LOG_DEBUG(_l, _pid, _fmt, ...)
+#endif
+#if DEBUG_MAX_LEVEL>=LOG_INFO
+#define NL_RAW_PID_LOG_LOG_INFO _NL_RAW_PID_LOG
+#else
+#define NL_RAW_PID_LOG_LOG_INFO(_l, _pid, _fmt, ...)
+#endif
+#define NL_RAW_PID_LOG_LOG_NOTICE _NL_RAW_PID_LOG
+#define NL_RAW_PID_LOG_LOG_ERR _NL_RAW_PID_LOG
+#define NL_RAW_PID_LOG_LOG_WARNING _NL_RAW_PID_LOG
+
+
+
+#endif
diff --git a/sys/netlink/netlink_domain.c b/sys/netlink/netlink_domain.c
new file mode 100644
--- /dev/null
+++ b/sys/netlink/netlink_domain.c
@@ -0,0 +1,635 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ * Copyright (c) 2022 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * This file contains socket and protocol bindings for netlink.
+ */
+
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/domain.h>
+#include <sys/mbuf.h>
+#include <sys/protosw.h>
+#include <sys/ck.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysent.h>
+#include <sys/syslog.h>
+#include <sys/priv.h> /* priv_check */
+
+#include <net/if.h>
+#include <net/netisr.h>
+
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_var.h>
+
+#define DEBUG_MOD_NAME nl_domain
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_DEBUG);
+
+
+static u_long nl_sendspace = NLSNDQ; /* send buffer size passed to soreserve() in nl_pru_attach() */
+SYSCTL_ULONG(_net_netlink, OID_AUTO, sendspace, CTLFLAG_RW, &nl_sendspace, 0,
+ "Default netlink socket send space");
+
+static u_long nl_recvspace = NLSNDQ; /* NOTE(review): also initialised from NLSNDQ -- confirm intended */
+SYSCTL_ULONG(_net_netlink, OID_AUTO, recvspace, CTLFLAG_RW, &nl_recvspace, 0,
+ "Default netlink socket receive space");
+
+uint32_t
+nlp_get_pid(const struct nlpcb *nlp)
+{
+ return (nlp->nl_process_id); /* pid recorded from curproc when the socket was attached */
+}
+
+/*
+ * Finds the pcb bound to @port_id on the current VNET's port list; callers here hold the NLCTL lock.
+ * Returns the nlpcb pointer if present, else NULL.
+ */
+static struct nlpcb *
+nl_port_lookup(uint32_t port_id)
+{
+	struct nlpcb *cur;
+
+	CK_LIST_FOREACH(cur, &V_nl_ctl->ctl_port_head, nl_port_next) {
+		if (cur->nl_port == port_id)
+			break;
+	}
+	return (cur);	/* NULL when the walk ran off the end of the list */
+}
+
+static void
+nl_update_groups_locked(struct nlpcb *nlp, uint32_t nl_groups)
+{
+ /* Replace (not merge) the socket's group mask with @nl_groups */
+ RT_LOG(LOG_DEBUG2, "socket %p, groups 0x%X -> 0x%X",
+ nlp->nl_socket, nlp->nl_groups, nl_groups);
+ nlp->nl_groups = nl_groups;
+}
+
+/*
+ * Broadcasts message @m to every socket subscribed to at least one group
+ * in @groups_mask. @m is consumed: passed on for the last match or freed.
+ */
+void
+nl_send_group(struct mbuf *m, int num_messages, uint32_t groups_mask)
+{
+ struct nlpcb *nlp_last = NULL;
+ struct nlpcb *nlp;
+ NLCTL_TRACKER;
+
+#if DEBUG_MAX_LEVEL >= LOG_DEBUG2
+ struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *);
+ RT_LOG(LOG_DEBUG2, "MCAST mbuf len %u msg type %d len %u to groups 0x%X",
+ m->m_len, hdr->nlmsg_type, hdr->nlmsg_len, groups_mask);
+#endif
+
+ struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
+ if (__predict_false(ctl == NULL)) {
+ /*
+ * Can be the case when notification is sent within VNET
+ * which doesn't have any netlink sockets.
+ */
+ m_freem(m);
+ return;
+ }
+
+ NLCTL_RLOCK(ctl);
+
+ int io_flags = NL_IOF_UNTRANSLATED;
+ /* Delivery lags one match behind: earlier matches get copies, the last one gets @m */
+ CK_LIST_FOREACH(nlp, &ctl->ctl_pcb_head, nl_next) {
+ if (nlp->nl_groups & groups_mask) {
+ if (nlp_last != NULL) {
+ struct mbuf *m_copy;
+ m_copy = m_copym(m, 0, M_COPYALL, M_NOWAIT);
+ if (m_copy != NULL)
+ nl_send_one(m_copy, nlp_last, num_messages, io_flags);
+ else { /* copy failed: nothing is queued, the reader is only woken up */
+ NLP_LOCK(nlp_last);
+ if (nlp_last->nl_socket != NULL)
+ sorwakeup(nlp_last->nl_socket);
+ NLP_UNLOCK(nlp_last);
+ }
+ }
+ nlp_last = nlp;
+ }
+ }
+ if (nlp_last != NULL)
+ nl_send_one(m, nlp_last, num_messages, io_flags);
+ else
+ m_freem(m);
+
+ NLCTL_RUNLOCK(ctl);
+}
+
+bool
+nl_has_listeners(int netlink_family, uint32_t groups_mask)
+{
+ return (V_nl_ctl != NULL); /* both arguments are unused: only checks that per-VNET state exists */
+}
+
+bool
+nlp_has_priv(struct nlpcb *nlp, int priv)
+{
+	return (priv_check_cred(nlp->nl_cred, priv) == 0);	/* true iff granted: priv_check_cred() returns 0 on success */
+}
+
+bool
+nlp_has_priv_route(struct nlpcb *nlp)
+{
+ return (nlp_has_priv(nlp, PRIV_NET_ROUTE)); /* shorthand for the PRIV_NET_ROUTE check */
+}
+
+static uint32_t
+nl_find_port(void)
+{
+	/*
+	 * An app can open multiple netlink sockets.  Start with the current pid;
+	 * if taken, try random ports in [256k, 256k + 64k) to avoid clashing with pids.
+	 * After 16 misses fall back to the pid; nl_bind_locked() re-checks availability.
+	 */
+	if (nl_port_lookup(curproc->p_pid) == NULL)
+		return (curproc->p_pid);
+	for (int i = 0; i < 16; i++) {
+		uint32_t nl_port = (arc4random() % 65536) + 65536 * 4;
+		if (nl_port_lookup(nl_port) == NULL)
+			return (nl_port);
+		RT_LOG(LOG_DEBUG3, "tried %u", nl_port);
+	}
+	return (curproc->p_pid);
+}
+
+static int
+nl_bind_locked(struct nlpcb *nlp, struct sockaddr_nl *snl)
+{
+	if (!nlp->nl_bound) {
+		/* First bind: pick a port if none was requested, then claim it */
+		if (snl->nl_pid == 0)
+			snl->nl_pid = nl_find_port();
+		if (nl_port_lookup(snl->nl_pid) != NULL)
+			return (EADDRINUSE);
+		nlp->nl_port = snl->nl_pid;
+		nlp->nl_bound = true;
+		CK_LIST_INSERT_HEAD(&V_nl_ctl->ctl_port_head, nlp, nl_port_next);
+	} else if (nlp->nl_port != snl->nl_pid) {
+		/* An already bound socket may change its groups but not its port */
+		RT_LOG(LOG_DEBUG,
+		    "bind() failed: program pid %d "
+		    "is different from provided pid %d",
+		    nlp->nl_port, snl->nl_pid);
+		return (EINVAL); // XXX: better error
+	}
+	nl_update_groups_locked(nlp, snl->nl_groups);
+
+	return (0);
+}
+
+static int
+nl_pru_attach(struct socket *so, int proto, struct thread *td)
+{
+ struct nlpcb *nlp;
+ int error;
+ /* No new sockets are accepted while netlink_unloading is set */
+ if (__predict_false(netlink_unloading != 0))
+ return (EAFNOSUPPORT);
+
+ error = nl_verify_proto(proto);
+ if (error != 0)
+ return (error);
+
+ bool is_linux = SV_PROC_ABI(td->td_proc) == SV_ABI_LINUX;
+ RT_LOG(LOG_DEBUG, "socket %p, %sPID %d: attaching socket to %s",
+ so, is_linux ? "(linux) " : "", curproc->p_pid,
+ nl_get_proto_name(proto));
+
+ /* Create per-VNET state on first socket init */
+ struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
+ if (ctl == NULL)
+ ctl = vnet_nl_ctl_init();
+ KASSERT(V_nl_ctl != NULL, ("nl_attach: vnet_sock_init() failed"));
+
+ MPASS(sotonlpcb(so) == NULL);
+
+ nlp = malloc(sizeof(struct nlpcb), M_PCB, M_WAITOK | M_ZERO);
+ error = soreserve(so, nl_sendspace, nl_recvspace);
+ if (error != 0) {
+ free(nlp, M_PCB);
+ return (error);
+ }
+ so->so_pcb = (void *)nlp;
+ nlp->nl_socket = so;
+ /* Copy so_cred to avoid having socket_var.h in every header */
+ nlp->nl_cred = so->so_cred;
+ nlp->nl_proto = proto;
+ nlp->nl_process_id = curproc->p_pid;
+ nlp->nl_linux = is_linux;
+ nlp->nl_active = true;
+ NLP_LOCK_INIT(nlp);
+ refcount_init(&nlp->nl_refcount, 1);
+ /* Per-socket taskqueue served by one thread; its task runs nl_taskqueue_handler(nlp) */
+ nlp->nl_taskqueue = taskqueue_create("netlink_socket", M_WAITOK,
+ taskqueue_thread_enqueue, &nlp->nl_taskqueue);
+ TASK_INIT(&nlp->nl_task, 0, nl_taskqueue_handler, nlp);
+ taskqueue_start_threads(&nlp->nl_taskqueue, 1, PWAIT,
+ "netlink_socket (PID %u)", nlp->nl_process_id);
+ /* Publish the fully initialised pcb on the per-VNET pcb list */
+ NLCTL_WLOCK(ctl);
+ /* XXX: check ctl is still alive */
+ CK_LIST_INSERT_HEAD(&ctl->ctl_pcb_head, nlp, nl_next);
+ NLCTL_WUNLOCK(ctl);
+
+ soisconnected(so);
+
+ return (0);
+}
+
+static void
+nl_pru_abort(struct socket *so)
+{
+ RT_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
+ MPASS(sotonlpcb(so) != NULL);
+ soisdisconnected(so); /* only marks the socket disconnected; the pcb is released in nl_pru_detach() */
+}
+
+static int
+nl_pru_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
+ struct nlpcb *nlp = sotonlpcb(so);
+ struct sockaddr_nl *snl = (struct sockaddr_nl *)nam;
+ int error;
+
+ RT_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
+ if (snl->nl_len != sizeof(*snl)) {
+ RT_LOG(LOG_DEBUG, "socket %p, wrong sizeof(), ignoring bind()", so);
+ return (EINVAL);
+ }
+
+ /* nl_bind_locked() runs with the control write lock and the pcb lock held */
+ NLCTL_WLOCK(ctl);
+ NLP_LOCK(nlp);
+ error = nl_bind_locked(nlp, snl);
+ NLP_UNLOCK(nlp);
+ NLCTL_WUNLOCK(ctl);
+ RT_LOG(LOG_DEBUG2, "socket %p, bind() to %u, groups %u, error %d", so,
+ snl->nl_pid, snl->nl_groups, error);
+
+ return (error);
+}
+
+
+/* Binds @nlp to @port_id while keeping its current groups; returns the nl_bind_locked() result. */
+static int
+nl_assign_port(struct nlpcb *nlp, uint32_t port_id)
+{
+	struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
+	struct sockaddr_nl bind_addr = { .nl_pid = port_id };
+	int rc;
+
+	NLCTL_WLOCK(ctl);
+	NLP_LOCK(nlp);
+	/* Pass the socket's existing mask so that binding does not alter membership */
+	bind_addr.nl_groups = nlp->nl_groups;
+	rc = nl_bind_locked(nlp, &bind_addr);
+	NLP_UNLOCK(nlp);
+	NLCTL_WUNLOCK(ctl);
+
+	RT_LOG(LOG_DEBUG3, "socket %p, port assign: %d, error: %d", nlp->nl_socket, port_id, rc);
+	return (rc);
+}
+
+/*
+ * nl_autobind_port binds an unused portid to @nlp
+ * @nlp: pcb data for the netlink socket
+ * @candidate_id: first id to consider
+ * Returns 0 on success or an errno (EADDRINUSE if all 10 probed ids were taken).
+ */
+static int
+nl_autobind_port(struct nlpcb *nlp, uint32_t candidate_id)
+{
+	struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
+	uint32_t port_id = candidate_id;
+	NLCTL_TRACKER;
+	int error = EADDRINUSE;	/* result when every candidate is already bound */
+
+	for (int i = 0; i < 10; i++) {
+		RT_LOG(LOG_DEBUG3, "socket %p, trying to assign port %d", nlp->nl_socket, port_id);
+		NLCTL_RLOCK(ctl);
+		bool exist = nl_port_lookup(port_id) != NULL;
+		NLCTL_RUNLOCK(ctl);
+		if (!exist) {
+			error = nl_assign_port(nlp, port_id);
+			if (error != EADDRINUSE)
+				break;
+		}
+		port_id++;
+	}
+	RT_LOG(LOG_DEBUG3, "socket %p, autobind to %d, error: %d", nlp->nl_socket, port_id, error);
+	return (error);
+}
+
+/* pr_connect handler: validates @nam, auto-binds an unbound socket and marks it connected. */
+static int
+nl_pru_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+	struct sockaddr_nl *snl = (struct sockaddr_nl *)nam;
+	struct nlpcb *nlp = sotonlpcb(so);
+
+	RT_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
+	if (snl->nl_len != sizeof(*snl)) {
+		RT_LOG(LOG_DEBUG, "socket %p, wrong sizeof(), ignoring connect()", so);
+		return (EINVAL);
+	}
+
+	/* An unbound socket is auto-bound first, probing ports from the caller's pid */
+	if (!nlp->nl_bound) {
+		int error = nl_autobind_port(nlp, td->td_proc->p_pid);
+		if (error != 0) {
+			RT_LOG(LOG_DEBUG, "socket %p, nl_autobind() failed: %d", so, error);
+			return (error);
+		}
+	}
+	/* XXX: Handle socket flags & multicast */
+	soisconnected(so);
+
+	RT_LOG(LOG_DEBUG2, "socket %p, connect to %u", so, snl->nl_pid);
+	return (0);
+}
+
+static void
+destroy_nlpcb(struct nlpcb *nlp)
+{
+ NLP_LOCK(nlp);
+ nl_free_io(nlp);
+ NLP_LOCK_DESTROY(nlp); /* the lock is destroyed while still held; it is never unlocked */
+ free(nlp, M_PCB);
+}
+
+/* epoch_call() callback: recovers the pcb from its embedded epoch context and destroys it. */
+static void
+destroy_nlpcb_epoch(epoch_context_t ctx)
+{
+	struct nlpcb *pcb = __containerof(ctx, struct nlpcb, nl_epoch_ctx);
+
+	/* Scheduled by nl_pru_detach() via epoch_call() */
+	destroy_nlpcb(pcb);
+}
+
+
+static void
+nl_pru_detach(struct socket *so)
+{
+ struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
+ MPASS(sotonlpcb(so) != NULL);
+ struct nlpcb *nlp;
+
+ RT_LOG(LOG_DEBUG2, "socket %p, PID %d", so, curproc->p_pid);
+ nlp = sotonlpcb(so);
+
+ /* Mark as inactive so no new work can be enqueued */
+ NLP_LOCK(nlp);
+ bool was_bound = nlp->nl_bound;
+ nlp->nl_active = false;
+ NLP_UNLOCK(nlp);
+
+ /* Wait till all scheduled work has been completed */
+ taskqueue_drain_all(nlp->nl_taskqueue);
+ taskqueue_free(nlp->nl_taskqueue);
+ /* Unlink from the port list (if it was bound) and from the pcb list */
+ NLCTL_WLOCK(ctl);
+ NLP_LOCK(nlp);
+ if (was_bound) {
+ CK_LIST_REMOVE(nlp, nl_port_next);
+ RT_LOG(LOG_DEBUG3, "socket %p, unlinking bound pid %u", so, nlp->nl_port);
+ }
+ CK_LIST_REMOVE(nlp, nl_next);
+ nlp->nl_socket = NULL;
+ NLP_UNLOCK(nlp);
+ NLCTL_WUNLOCK(ctl);
+
+ so->so_pcb = NULL;
+
+ RT_LOG(LOG_DEBUG3, "socket %p, detached", so);
+
+ /* XXX: is delayed free needed? */
+ epoch_call(net_epoch_preempt, destroy_nlpcb_epoch, &nlp->nl_epoch_ctx);
+}
+
+static int
+nl_pru_disconnect(struct socket *so)
+{
+ RT_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
+ MPASS(sotonlpcb(so) != NULL);
+ return (ENOTCONN); /* unconditional: no socket state is changed */
+}
+
+static int
+nl_pru_peeraddr(struct socket *so, struct sockaddr **nam)
+{
+ RT_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
+ MPASS(sotonlpcb(so) != NULL);
+ return (ENOTCONN); /* unconditional: @nam is left untouched */
+}
+
+static int
+nl_pru_shutdown(struct socket *so)
+{
+ RT_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
+ MPASS(sotonlpcb(so) != NULL);
+ socantsendmore(so); /* only the sending side is shut down */
+ return (0);
+}
+
+static int
+nl_pru_sockaddr(struct socket *so, struct sockaddr **nam)
+{
+ struct sockaddr_nl *snl;
+ /* Hands back a zeroed M_SONAME allocation in *nam carrying the pcb's nl_port */
+ snl = malloc(sizeof(struct sockaddr_nl), M_SONAME, M_WAITOK | M_ZERO);
+ /* TODO: set other fields */
+ snl->nl_len = sizeof(struct sockaddr_nl);
+ snl->nl_family = AF_NETLINK;
+ snl->nl_pid = sotonlpcb(so)->nl_port;
+ *nam = (struct sockaddr *)snl;
+ return (0);
+}
+
+static void
+nl_pru_close(struct socket *so)
+{
+ RT_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
+ MPASS(sotonlpcb(so) != NULL);
+ soisdisconnected(so); /* same action as nl_pru_abort(): mark disconnected only */
+}
+
+static int
+nl_pru_output(struct mbuf *m, struct socket *so, ...)
+{
+ /* Needs an mbuf whose first segment holds a full nlmsghdr (pulled up if necessary) */
+ if (__predict_false(m == NULL ||
+ ((m->m_len < sizeof(struct nlmsghdr)) &&
+ (m = m_pullup(m, sizeof(struct nlmsghdr))) == NULL)))
+ return (ENOBUFS);
+ MPASS((m->m_flags & M_PKTHDR) != 0);
+
+ RT_LOG(LOG_DEBUG3, "sending message to kernel async processing");
+ nl_receive_async(m, so);
+ return (0);
+}
+
+
+static int
+nl_pru_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
+    struct mbuf *control, struct thread *td)
+{
+	RT_LOG(LOG_DEBUG2, "sending message to kernel");
+	/* Control data is unsupported: a non-empty control mbuf fails the send with EINVAL */
+	if (__predict_false(control != NULL)) {
+		int control_len = control->m_len;
+		m_freem(control);
+		if (control_len != 0) {
+			m_freem(m);	/* pr_send owns @m: free it on the error path too */
+			return (EINVAL);
+		}
+	}
+	return (nl_pru_output(m, so));
+}
+
+static int
+nl_pru_rcvd(struct socket *so, int flags)
+{
+ RT_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
+ MPASS(sotonlpcb(so) != NULL);
+ /* NOTE(review): presumably lets the I/O code resume output after the reader drained data -- confirm */
+ nl_on_transmit(sotonlpcb(so));
+
+ return (0);
+}
+
+/*
+ * pr_ctloutput: sets group membership and ack flags, gets the group mask; returns 0 or an errno.
+ */
+static int
+nl_ctloutput(struct socket *so, struct sockopt *sopt)
+{
+	struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
+	struct nlpcb *nlp = sotonlpcb(so);
+	uint32_t flag, groups;
+	int optval, error = 0;
+	NLCTL_TRACKER;
+
+	RT_LOG(LOG_DEBUG2, "%ssockopt(%p, %d)", (sopt->sopt_dir) ? "set" : "get",
+	    so, sopt->sopt_name);
+
+	switch (sopt->sopt_dir) {
+	case SOPT_SET:
+		switch (sopt->sopt_name) {
+		case NETLINK_ADD_MEMBERSHIP:
+		case NETLINK_DROP_MEMBERSHIP:
+			error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
+			if (error != 0)
+				break;	/* optval was not filled in */
+			NLCTL_WLOCK(ctl);
+			if (sopt->sopt_name == NETLINK_ADD_MEMBERSHIP)
+				groups = nlp->nl_groups | optval;
+			else
+				groups = nlp->nl_groups & ~optval;
+			nl_update_groups_locked(nlp, groups);
+			NLCTL_WUNLOCK(ctl);
+			break;
+		case NETLINK_CAP_ACK:
+		case NETLINK_EXT_ACK:
+			error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
+			if (error != 0)
+				break;	/* optval was not filled in */
+			flag = (sopt->sopt_name == NETLINK_CAP_ACK) ? NLF_CAP_ACK : NLF_EXT_ACK;
+
+			NLCTL_WLOCK(ctl);
+			if (optval != 0)
+				nlp->nl_flags |= flag;
+			else
+				nlp->nl_flags &= ~flag;
+			NLCTL_WUNLOCK(ctl);
+			break;
+		default:
+			error = ENOPROTOOPT;
+		}
+		break;
+	case SOPT_GET:
+		switch (sopt->sopt_name) {
+		case NETLINK_LIST_MEMBERSHIPS:
+			NLCTL_RLOCK(ctl);
+			optval = nlp->nl_groups;
+			NLCTL_RUNLOCK(ctl);
+			error = sooptcopyout(sopt, &optval, sizeof(optval));
+			break;
+		default:
+			error = ENOPROTOOPT;
+		}
+		break;
+	default:
+		error = ENOPROTOOPT;
+	}
+
+	return (error);
+}
+
+static struct domain netlinkdomain;
+
+static struct protosw netlinksw = {
+ .pr_type = SOCK_RAW,
+ .pr_flags = PR_ATOMIC | PR_ADDR | PR_WANTRCVD, /* NOTE(review): PR_WANTRCVD presumably is what gets nl_pru_rcvd called -- confirm */
+ .pr_ctloutput = nl_ctloutput,
+ .pr_abort = nl_pru_abort,
+ .pr_attach = nl_pru_attach,
+ .pr_bind = nl_pru_bind,
+ .pr_connect = nl_pru_connect,
+ .pr_detach = nl_pru_detach,
+ .pr_disconnect = nl_pru_disconnect,
+ .pr_peeraddr = nl_pru_peeraddr,
+ .pr_send = nl_pru_send,
+ .pr_rcvd = nl_pru_rcvd,
+ .pr_shutdown = nl_pru_shutdown,
+ .pr_sockaddr = nl_pru_sockaddr,
+ .pr_close = nl_pru_close
+};
+
+static struct domain netlinkdomain = {
+ .dom_family = PF_NETLINK,
+ .dom_name = "netlink",
+ .dom_flags = DOMF_UNLOADABLE,
+ .dom_nprotosw = 1, /* matches the single entry in dom_protosw below */
+ .dom_protosw = { &netlinksw },
+};
+
+DOMAIN_SET(netlink);
diff --git a/sys/netlink/netlink_helpers.c b/sys/netlink/netlink_helpers.c
new file mode 100644
--- /dev/null
+++ b/sys/netlink/netlink_helpers.c
@@ -0,0 +1,369 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/rmlock.h>
+#include <sys/socket.h>
+
+#include <net/if.h>
+#include <net/route.h>
+#include <net/route/nhop.h>
+
+#include <net/route/route_ctl.h>
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_var.h>
+#include <netlink/netlink_route.h>
+
+#define DEBUG_MOD_NAME nl_helpers
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_DEBUG);
+
+/*
+ * Sends an NLMSG_ERROR reply to @hdr on @nlp carrying @error (the same path is used when @error is 0)
+ */
+void
+nlmsg_ack(struct nlpcb *nlp, int error, struct nlmsghdr *hdr)
+{
+ struct nlmsgerr *errmsg;
+ int payload_len;
+ uint32_t flags = nlp->nl_flags;
+ struct nlmsg_state ns;
+ bool cap_ack;
+
+ payload_len = sizeof(struct nlmsgerr);
+
+ /*
+ * The only case when we send the full message in the
+ * reply is when there is an error and NETLINK_CAP_ACK
+ * is not set.
+ */
+ cap_ack = (error == 0) || (flags & NLF_CAP_ACK);
+ if (!cap_ack)
+ payload_len += hdr->nlmsg_len - sizeof(struct nlmsghdr);
+
+ /*
+ * TODO: handle NETLINK_F_EXT_ACK sockopt
+ * TODO: handle cookies
+ */
+
+ int sz = payload_len + sizeof(struct nlmsghdr);
+ if (!nlmsg_get_socket_writer(sz, nlp, &ns))
+ goto enomem;
+ nlmsg_ignore_limit(&ns);
+
+ RT_LOG(LOG_DEBUG3, "acknowledging message type %d seq %d",
+ hdr->nlmsg_type, hdr->nlmsg_seq);
+
+ if (!nlmsg_add(&ns, nlp->nl_port, hdr->nlmsg_seq, NLMSG_ERROR, 0, payload_len))
+ goto enomem;
+
+ errmsg = nlmsg_reserve_data(&ns, payload_len, struct nlmsgerr);
+ errmsg->error = error;
+ /* Copy only the header when capped (success or NLF_CAP_ACK), else the whole message */
+ memcpy(&errmsg->msg, hdr, cap_ack ? sizeof(*hdr) : hdr->nlmsg_len);
+
+ nlmsg_end(&ns);
+ nlmsg_flush(&ns);
+ return;
+enomem:
+ NLP_LOG(LOG_INFO, nlp, "error allocating ack data for message %d seq %u",
+ hdr->nlmsg_type, hdr->nlmsg_seq);
+}
+
+bool
+nlmsg_end_dump(struct nlmsg_state *ns, int error, struct nlmsghdr *hdr)
+{
+ if (!nlmsg_add(ns, hdr->nlmsg_pid, hdr->nlmsg_seq, NLMSG_DONE, 0, sizeof(int))) {
+ RT_LOG(LOG_DEBUG, "Error finalizing table dump");
+ return (false);
+ }
+ /* The NLMSG_DONE payload is a single int holding the operation result */
+ int *perror = nlmsg_reserve_object(ns, int);
+ RT_LOG(LOG_DEBUG2, "record error=%d at off %d (%p)", error,
+ ns->offset, perror);
+ *perror = error;
+ nlmsg_end(ns);
+
+ return (true);
+}
+
+
+/*
+ * Binary search of @ps (@pslen entries, assumed sorted by ascending type) for @key; NULL if absent or empty.
+ */
+static const struct nlattr_parser *
+search_states(const struct nlattr_parser *ps, int pslen, int key)
+{
+	int left_i = 0, right_i = pslen - 1;
+
+	if (pslen <= 0 || key < ps[0].type || key > ps[pslen - 1].type)
+		return (NULL);
+	while (left_i + 1 < right_i) {
+		int mid_i = (left_i + right_i) / 2;
+		if (key < ps[mid_i].type)
+			right_i = mid_i;
+		else if (key > ps[mid_i].type)
+			left_i = mid_i + 1;
+		else
+			return (&ps[mid_i]);
+	}
+	if (ps[left_i].type == key)
+		return (&ps[left_i]);
+	return (ps[right_i].type == key ? &ps[right_i] : NULL);
+}
+
+/*
+ * Walks the @len bytes of attributes starting at @nla_head and, for every
+ * attribute whose type has an entry in @ps, invokes that entry's callback.
+ * Attributes without a parser are ignored.
+ * Returns 0, EINVAL on a too-short attribute, or the first callback error.
+ */
+int
+nl_parse_attrs_raw(struct nlattr *nla_head, int len, struct nlattr_parser *ps, int pslen,
+    struct netlink_parse_tracker *npt, void *target)
+{
+	struct nlattr *nla;
+
+	RT_LOG(LOG_DEBUG3, "parse %p remaining_len %d", nla_head, len);
+	NLA_FOREACH(nla, nla_head, len) {
+		if (nla->nla_len < sizeof(struct nlattr)) {
+			RT_LOG(LOG_DEBUG, "Invalid attr len: %d", nla->nla_len);
+			return (EINVAL);
+		}
+
+		int nla_type = nla->nla_type & NLA_TYPE_MASK;
+		const struct nlattr_parser *s = search_states(ps, pslen, nla_type);
+		if (s == NULL)
+			continue;	/* Default policy is to ignore unknown attrs */
+
+		/* Each parser stores its result at its own offset inside @target */
+		void *ptr = (void *)((char *)target + s->off);
+		int error = s->cb(nla, npt, ptr);
+		if (error != 0)
+			return (error);
+	}
+
+	return (0);
+}
+
+/* Parses the attributes that follow the @hdrlen-byte (aligned) family header of message @hdr. */
+int
+nl_parse_attrs(struct nlmsghdr *hdr, int hdrlen, struct nlattr_parser *ps, int pslen,
+    struct netlink_parse_tracker *npt, void *target)
+{
+	int attr_off = NLMSG_HDRLEN + NETLINK_ALIGN(hdrlen);
+	struct nlattr *first = (struct nlattr *)((char *)hdr + attr_off);
+
+	return (nl_parse_attrs_raw(first, hdr->nlmsg_len - attr_off, ps, pslen, npt, target));
+}
+
+int
+nlattr_get_flag(struct nlattr *nla, struct netlink_parse_tracker *npt, void *target)
+{
+ if (__predict_false(NLA_DATA_LEN(nla) != 0)) { /* a flag attribute must carry no payload */
+ RT_LOG(LOG_DEBUG, "nla type %d size(%u) is not a flag",
+ nla->nla_type, NLA_DATA_LEN(nla));
+ return (EINVAL);
+ }
+
+ *((uint8_t *)target) = 1; /* mere presence of the attribute sets the uint8_t at @target */
+ return (0);
+}
+
+static struct sockaddr *
+parse_rta_ip4(void *rta_data, struct netlink_parse_tracker *npt, int *perror)
+{
+ struct sockaddr_in *sin;
+ /* Wraps the raw IPv4 address at @rta_data in a sockaddr_in obtained from npt_alloc_sockaddr() */
+ sin = (struct sockaddr_in *)npt_alloc_sockaddr(npt, sizeof(struct sockaddr_in));
+ if (__predict_false(sin == NULL)) {
+ *perror = ENOBUFS;
+ return (NULL);
+ }
+ sin->sin_len = sizeof(struct sockaddr_in);
+ sin->sin_family = AF_INET;
+ memcpy(&sin->sin_addr, rta_data, sizeof(struct in_addr));
+ return ((struct sockaddr *)sin);
+}
+
+/* Wraps the raw IPv6 address at @rta_data in a sockaddr_in6 from npt_alloc_sockaddr(); NULL + *perror on failure. */
+static struct sockaddr *
+parse_rta_ip6(void *rta_data, struct netlink_parse_tracker *npt, int *perror)
+{
+	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)npt_alloc_sockaddr(npt, sizeof(struct sockaddr_in6));
+
+	if (__predict_false(sin6 == NULL)) {
+		*perror = ENOBUFS;
+		return (NULL);
+	}
+	sin6->sin6_len = sizeof(struct sockaddr_in6);
+	sin6->sin6_family = AF_INET6;
+	memcpy(&sin6->sin6_addr, rta_data, sizeof(struct in6_addr));	/* the whole 16-byte address */
+	return ((struct sockaddr *)sin6);
+}
+
+/* Parses an IPv4 or IPv6 address attribute, choosing the family by payload length; NULL + *perror on failure. */
+static struct sockaddr *
+parse_rta_ip(struct rtattr *rta, struct netlink_parse_tracker *npt, int *perror)
+{
+	void *rta_data = NL_RTA_DATA(rta);
+	int rta_len = NL_RTA_DATA_LEN(rta);
+
+	if (rta_len == sizeof(struct in_addr))
+		return (parse_rta_ip4(rta_data, npt, perror));
+	if (rta_len == sizeof(struct in6_addr))
+		return (parse_rta_ip6(rta_data, npt, perror));
+
+	/* Payload is neither an in_addr nor an in6_addr */
+	RT_LOG(LOG_NOTICE, "unknown IP len: %d for rta type %d",
+	    rta_len, rta->rta_type);
+	*perror = ENOTSUP;
+	return (NULL);
+}
+
+int
+nlattr_get_ip(struct nlattr *nla, struct netlink_parse_tracker *npt, void *target)
+{
+ int error = 0;
+ /* *target receives the parsed sockaddr, or NULL together with a non-zero return value */
+ struct sockaddr *sa = parse_rta_ip((struct rtattr *)nla, npt, &error);
+
+ *((struct sockaddr **)target) = sa;
+ return (error);
+}
+
+/* Parses a struct rtvia payload into an IPv4/IPv6 sockaddr; NULL + *perror on failure. */
+static struct sockaddr *
+parse_rta_via(struct rtattr *rta, struct netlink_parse_tracker *npt, int *perror)
+{
+	struct rtvia *via = NL_RTA_DATA(rta);
+	int data_len = NL_RTA_DATA_LEN(rta);
+
+	if (__predict_false(data_len < sizeof(struct rtvia))) {
+		*perror = EINVAL;
+		return (NULL);
+	}
+	data_len -= offsetof(struct rtvia, rtvia_addr);	/* bytes available for the address */
+	switch (via->rtvia_family) {
+	case AF_INET:
+		if (__predict_false(data_len < sizeof(struct in_addr))) {
+			*perror = EINVAL;
+			return (NULL);
+		}
+		return (parse_rta_ip4(via->rtvia_addr, npt, perror));
+	case AF_INET6:
+		if (__predict_false(data_len < sizeof(struct in6_addr))) {
+			*perror = EINVAL;
+			return (NULL);
+		}
+		return (parse_rta_ip6(via->rtvia_addr, npt, perror));
+	default:
+		*perror = ENOTSUP;
+		return (NULL);
+	}
+}
+
+int
+nlattr_get_ipvia(struct nlattr *nla, struct netlink_parse_tracker *npt, void *target)
+{
+ int error = 0;
+ /* *target receives the parsed sockaddr, or NULL together with a non-zero return value */
+ struct sockaddr *sa = parse_rta_via((struct rtattr *)nla, npt, &error);
+
+ *((struct sockaddr **)target) = sa;
+ return (error);
+}
+
+
+int
+nlattr_get_uint32(struct nlattr *nla, struct netlink_parse_tracker *npt, void *target)
+{
+ if (__predict_false(NLA_DATA_LEN(nla) != sizeof(uint32_t))) { /* payload must be exactly 4 bytes */
+ RT_LOG(LOG_DEBUG, "nla type %d size(%u) is not uint32",
+ nla->nla_type, NLA_DATA_LEN(nla));
+ return (EINVAL);
+ }
+ *((uint32_t *)target) = *((const uint32_t *)NL_RTA_DATA_CONST(nla)); /* NOTE(review): RTA accessor on an nlattr; nlattr_get_ifindex uses NLA_DATA_CONST -- confirm equivalent */
+ return (0);
+}
+
+int
+nlattr_get_ifindex(struct nlattr *nla, struct netlink_parse_tracker *npt, void *target)
+{
+ if (__predict_false(NLA_DATA_LEN(nla) != sizeof(uint32_t))) {
+ RT_LOG(LOG_DEBUG, "nla type %d size(%u) is not uint32",
+ nla->nla_type, NLA_DATA_LEN(nla));
+ return (EINVAL);
+ }
+ uint32_t ifindex = *((const uint32_t *)NLA_DATA_CONST(nla));
+ /* The ifnet pointer is stored as returned by ifnet_byindex(); the caller must be in the net epoch */
+ NET_EPOCH_ASSERT();
+
+ struct ifnet *ifp = ifnet_byindex(ifindex);
+ if (__predict_false(ifp == NULL)) {
+ RT_LOG(LOG_DEBUG, "nla type %d: ifindex %u invalid",
+ nla->nla_type, ifindex);
+ return (ENOENT);
+ }
+ *((struct ifnet **)target) = ifp;
+ RT_LOG(LOG_DEBUG3, "nla type %d: ifindex %u -> %s", nla->nla_type,
+ ifindex, if_name(ifp));
+
+ return (0);
+}
+
+int
+nlattr_get_string(struct nlattr *nla, struct netlink_parse_tracker *npt, void *target)
+{
+ int maxlen = NLA_DATA_LEN(nla);
+ /* The payload must contain a NUL within its length; *target then points into the message (no copy) */
+ if (__predict_false(strnlen((char *)NLA_DATA(nla), maxlen) >= maxlen)) {
+ RT_LOG(LOG_DEBUG, "nla type %d size(%u) is not NULL-terminated",
+ nla->nla_type, maxlen);
+ return (EINVAL);
+ }
+
+ *((char **)target) = (char *)NLA_DATA(nla);
+ return (0);
+}
+
+int
+nlattr_get_nla(struct nlattr *nla, struct netlink_parse_tracker *npt, void *target)
+{
+ *((struct nlattr **)target) = nla; /* stores the attribute pointer itself, unparsed and unvalidated */
+ return (0);
+}
+
+
+
diff --git a/sys/netlink/netlink_io.c b/sys/netlink/netlink_io.c
new file mode 100644
--- /dev/null
+++ b/sys/netlink/netlink_io.c
@@ -0,0 +1,514 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ * Copyright (c) 2022 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/ck.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/syslog.h>
+
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_linux.h>
+#include <netlink/netlink_var.h>
+
+#define DEBUG_MOD_NAME nl_io
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_DEBUG);
+
+/*
+ * The logic below provides a p2p interface for receiving and
+ * sending netlink data between the kernel and userland.
+ */
+
+
+/*
+ * Source address passed to sbappendaddr*() for data delivered to a
+ * netlink socket by the code below: nl_pid is 0, i.e. the kernel.
+ */
+static struct sockaddr_nl _nl_empty_src = {
+ .nl_len = sizeof(struct sockaddr_nl),
+ .nl_family = PF_NETLINK,
+ .nl_pid = 0 /* comes from the kernel */
+};
+/* Generic sockaddr view of the address above */
+static struct sockaddr *nl_empty_src = (struct sockaddr *)&_nl_empty_src;
+
+/* Forward declaration: used by nl_process_received_one() before its definition */
+static struct mbuf *nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp);
+
+
+/*
+struct nl_io_queue {
+ struct mbuf *head;
+ struct mbuf *last;
+ int length;
+};
+*/
+
+/*
+ * Appends the packet list @m (linked via m_nextpkt) to the tail of @q,
+ * adding the length of every packet in the list to q->length.
+ */
+static void
+queue_push(struct nl_io_queue *q, struct mbuf *m)
+{
+	struct mbuf *tail = m;
+
+	/* Account each packet and stop on the last one of the list */
+	for (;;) {
+		q->length += m_length(tail, NULL);
+		if (tail->m_nextpkt == NULL)
+			break;
+		tail = tail->m_nextpkt;
+	}
+
+	if (q->last != NULL)
+		q->last->m_nextpkt = m;
+	else
+		q->head = m;
+	q->last = tail;
+}
+
+/*
+ * Inserts a single packet @m (its m_nextpkt must be NULL) at the head
+ * of @q and accounts its length.
+ */
+static void
+queue_push_head(struct nl_io_queue *q, struct mbuf *m)
+{
+	MPASS(m->m_nextpkt == NULL);
+
+	q->length += m_length(m, NULL);
+
+	if (q->last != NULL) {
+		/* Non-empty queue: link in front of the current head */
+		m->m_nextpkt = q->head;
+	} else {
+		/* Empty queue: @m is also the last element */
+		q->last = m;
+	}
+	q->head = m;
+}
+
+/*
+ * Detaches and returns the first packet of @q, or NULL if the queue
+ * has no packets. The returned packet has m_nextpkt cleared and its
+ * length is subtracted from q->length.
+ */
+static struct mbuf *
+queue_pop(struct nl_io_queue *q)
+{
+	struct mbuf *m = q->head;
+
+	if (m == NULL)
+		return (NULL);
+
+	q->head = m->m_nextpkt;
+	if (q->head == NULL)
+		q->last = NULL;
+	m->m_nextpkt = NULL;
+	q->length -= m_length(m, NULL);
+
+	return (m);
+}
+
+/*
+ * Returns the first packet of @q without removing it (NULL if none).
+ */
+static struct mbuf *
+queue_head(const struct nl_io_queue *q)
+{
+	struct mbuf *first = q->head;
+
+	return (first);
+}
+
+/*
+ * Returns true if @q accounts for zero bytes of data.
+ * Note that the check is based on q->length, not on q->head.
+ */
+static inline bool
+queue_empty(const struct nl_io_queue *q)
+{
+	bool has_data = (q->length != 0);
+
+	return (!has_data);
+}
+
+/*
+ * Frees every packet stored in @q and resets the queue to the empty
+ * state.
+ */
+static void
+queue_free(struct nl_io_queue *q)
+{
+	struct mbuf *m, *m_next;
+
+	for (m = q->head; m != NULL; m = m_next) {
+		/* Unlink before freeing so only this packet is released */
+		m_next = m->m_nextpkt;
+		m->m_nextpkt = NULL;
+		m_freem(m);
+	}
+
+	q->head = NULL;
+	q->last = NULL;
+	q->length = 0;
+}
+
+
+/*
+ * Enqueues the per-socket task unless it is already marked pending.
+ * nl_task_pending is cleared by nl_process_received_one().
+ */
+static void
+nl_schedule_taskqueue(struct nlpcb *nlp)
+{
+	if (nlp->nl_task_pending) {
+		RT_LOG(LOG_DEBUG3, "taskqueue schedule skipped");
+		return;
+	}
+
+	nlp->nl_task_pending = true;
+	taskqueue_enqueue(nlp->nl_taskqueue, &nlp->nl_task);
+	RT_LOG(LOG_DEBUG3, "taskqueue scheduled");
+}
+
+/*
+ * Accepts data @m written to the netlink socket @so for asynchronous
+ * processing: appends it to so->so_snd and schedules the socket task.
+ *
+ * @m is consumed in any case. Returns 0 on success or EINVAL if the
+ * socket is not active (the data is freed).
+ */
+int
+nl_receive_async(struct mbuf *m, struct socket *so)
+{
+	struct nlpcb *nlp = sotonlpcb(so);
+	int error = 0;
+
+	m->m_nextpkt = NULL;
+
+	/*
+	 * Measure the chain while we still own it: once appended to the
+	 * socket buffer the mbufs must not be touched anymore.
+	 */
+	unsigned int len = m_length(m, NULL);
+
+	NLP_LOCK(nlp);
+
+	if ((__predict_true(nlp->nl_active))) {
+		sbappend(&so->so_snd, m, 0);
+		RT_LOG(LOG_DEBUG3, "enqueue %u bytes", len);
+		nl_schedule_taskqueue(nlp);
+	} else {
+		RT_LOG(LOG_DEBUG, "ignoring %u bytes on non-active socket",
+		    len);
+		/* Free the whole chain, not just its first mbuf */
+		m_freem(m);
+		error = EINVAL;
+	}
+
+	NLP_UNLOCK(nlp);
+
+	return (error);
+}
+
+/*
+ * Tries to move packets from the internal TX overflow queue of @nlp to
+ * the receive buffer of its socket, waking up the reader if at least
+ * one packet was moved.
+ *
+ * Called by nl_process_received_one() with the NLP lock held.
+ * Returns true if the overflow queue is empty on return.
+ */
+static bool
+tx_check_locked(struct nlpcb *nlp)
+{
+ if (queue_empty(&nlp->tx_queue))
+ return (true);
+
+ /*
+ * Check if something can be moved from the internal TX queue
+ * to the socket queue.
+ */
+
+ bool appended = false;
+ struct sockbuf *sb = &nlp->nl_socket->so_rcv;
+ SOCKBUF_LOCK(sb);
+
+ while (true) {
+ /* Peek first; the packet is dequeued only after a successful append */
+ struct mbuf *m = queue_head(&nlp->tx_queue);
+ if (m && sbappendaddr_locked(sb, nl_empty_src, m, NULL) != 0) {
+ /* appended successfully */
+ queue_pop(&nlp->tx_queue);
+ appended = true;
+ } else
+ break;
+ }
+
+ SOCKBUF_UNLOCK(sb);
+
+ /* Wake up the reader after dropping the sockbuf lock */
+ if (appended)
+ sorwakeup(nlp->nl_socket);
+
+ return (queue_empty(&nlp->tx_queue));
+}
+
+/*
+ * Runs a single processing pass for @nlp:
+ * 1) clears nl_task_pending and tries to drain the TX overflow queue;
+ * 2) if the internal RX queue is empty, refills it with everything
+ *    currently stored in the socket send buffer;
+ * 3) processes the packets of the RX queue until it is empty or
+ *    nl_process_mbuf() hands back an unfinished packet.
+ *
+ * Returns true if the caller should run another pass right away.
+ */
+static bool
+nl_process_received_one(struct nlpcb *nlp)
+{
+ bool reschedule = false;
+
+ NLP_LOCK(nlp);
+ nlp->nl_task_pending = false;
+
+ if (!tx_check_locked(nlp)) {
+ /* TX overflow queue still not empty, ignore RX */
+ NLP_UNLOCK(nlp);
+ return (false);
+ }
+
+ if (queue_empty(&nlp->rx_queue)) {
+ /*
+ * Grab all data we have from the socket TX queue
+ * and store it in the internal queue, so it can be worked on
+ * w/o holding socket lock.
+ */
+ struct sockbuf *sb = &nlp->nl_socket->so_snd;
+
+ SOCKBUF_LOCK(sb);
+ unsigned int avail = sbavail(sb);
+ if (avail > 0) {
+ RT_LOG(LOG_DEBUG3, "grabbed %u bytes", avail);
+ queue_push(&nlp->rx_queue, sbcut_locked(sb, avail));
+ }
+ SOCKBUF_UNLOCK(sb);
+ } else {
+ /* Schedule another pass to read from the socket queue */
+ reschedule = true;
+ }
+
+ /* Remember the overflow queue high watermark to detect a new peak below */
+ int prev_hiwat = nlp->tx_queue.hiwat;
+ NLP_UNLOCK(nlp);
+
+ /* NOTE(review): rx_queue is accessed below without the NLP lock held - confirm this is intended */
+ while (!queue_empty(&nlp->rx_queue)) {
+ struct mbuf *m = queue_pop(&nlp->rx_queue);
+
+ m = nl_process_mbuf(m, nlp);
+ if (m != NULL) {
+ /* Unfinished packet: put the remainder back and stop this pass */
+ queue_push_head(&nlp->rx_queue, m);
+ reschedule = false;
+ break;
+ }
+ }
+ if (nlp->tx_queue.hiwat > prev_hiwat) {
+ NLP_LOG(LOG_DEBUG, nlp, "TX override peaked to %d", nlp->tx_queue.hiwat);
+
+ }
+
+ return (reschedule);
+}
+
+/*
+ * Task body: repeats processing passes for @nlp for as long as
+ * nl_process_received_one() asks for another one.
+ */
+static void
+nl_process_received(struct nlpcb *nlp)
+{
+	bool again;
+
+	RT_LOG(LOG_DEBUG3, "taskqueue called");
+
+	do {
+		again = nl_process_received_one(nlp);
+	} while (again);
+}
+
+/*
+ * Releases all packets held in the internal RX and TX queues of @nlp.
+ */
+void
+nl_free_io(struct nlpcb *nlp)
+{
+	struct nl_io_queue *rxq = &nlp->rx_queue;
+	struct nl_io_queue *txq = &nlp->tx_queue;
+
+	queue_free(rxq);
+	queue_free(txq);
+}
+
+/*
+ * Called after some data have been read from the socket.
+ *
+ * Logs and resets the dropped bytes/messages counters accumulated by
+ * nl_send_one() (if any) and schedules the socket task, so pending data
+ * can be processed.
+ */
+void
+nl_on_transmit(struct nlpcb *nlp)
+{
+	NLP_LOCK(nlp);
+
+	struct socket *so = nlp->nl_socket;
+	if (__predict_false(nlp->nl_dropped_bytes > 0 && so != NULL)) {
+		uint64_t dropped_bytes = nlp->nl_dropped_bytes;
+		uint64_t dropped_messages = nlp->nl_dropped_messages;
+		nlp->nl_dropped_bytes = 0;
+		nlp->nl_dropped_messages = 0;
+
+		struct sockbuf *sb = &so->so_rcv;
+		/*
+		 * uint64_t is not 'unsigned long' on 32-bit platforms:
+		 * print the counters as uintmax_t with %ju.
+		 */
+		NLP_LOG(LOG_DEBUG, nlp,
+		    "socket RX overflowed, %ju messages (%ju bytes) dropped. "
+		    "bytes: [%u/%u] mbufs: [%u/%u]",
+		    (uintmax_t)dropped_messages, (uintmax_t)dropped_bytes,
+		    sb->sb_ccc, sb->sb_hiwat, sb->sb_mbcnt, sb->sb_mbmax);
+		/* TODO: send netlink message */
+	}
+
+	nl_schedule_taskqueue(nlp);
+	NLP_UNLOCK(nlp);
+}
+
+/*
+ * Taskqueue entry point. @_arg is the struct nlpcb to work on;
+ * @pending is not used. Processing runs with the vnet of the socket
+ * set as the current one.
+ */
+void
+nl_taskqueue_handler(void *_arg, int pending)
+{
+	struct nlpcb *nlp = _arg;
+
+	CURVNET_SET(nlp->nl_socket->so_vnet);
+	nl_process_received(nlp);
+	CURVNET_RESTORE();
+}
+
+/*
+ * Stores @m in the TX overflow queue of @nlp, marks the socket as
+ * TX-blocked and updates the queue high watermark.
+ */
+static __noinline void
+queue_push_tx(struct nlpcb *nlp, struct mbuf *m)
+{
+	struct nl_io_queue *txq = &nlp->tx_queue;
+
+	queue_push(txq, m);
+	nlp->nl_tx_blocked = true;
+
+	if (txq->hiwat < txq->length)
+		txq->hiwat = txq->length;
+}
+
+/*
+ * Tries to send @m to the socket @nlp.
+ *
+ * @m: mbuf(s) to send. Consumed in any case.
+ * @nlp: socket to send to
+ * @num_messages: number of messages in @m
+ * @io_flags: combination of NL_IOF_* flags
+ *
+ * Returns true if the data was appended to the socket buffer or stored
+ * in the TX overflow queue (NL_IOF_IGNORE_LIMIT), false if it was
+ * dropped.
+ * If no queue overruns happened, wakes up the socket owner.
+ */
+bool
+nl_send_one(struct mbuf *m, struct nlpcb *nlp, int num_messages, int io_flags)
+{
+#if DEBUG_MAX_LEVEL >= LOG_DEBUG2
+	struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *);
+	NLP_LOG(LOG_DEBUG2, nlp, "TX mbuf len %u msgs %u msg type %d first hdrlen %u io_flags %X",
+	    m_length(m, NULL), num_messages, hdr->nlmsg_type, hdr->nlmsg_len, io_flags);
+#endif
+	bool untranslated = io_flags & NL_IOF_UNTRANSLATED;
+	bool ignore_limits = io_flags & NL_IOF_IGNORE_LIMIT;
+	bool result = true;
+
+	/* Linux sockets get their data rewritten by the Linux provider first */
+	if (__predict_false(nlp->nl_linux && linux_netlink_p != NULL && untranslated)) {
+		m = linux_netlink_p->mbufs_to_linux(nlp->nl_proto, m, nlp);
+		if (m == NULL)
+			return (false);
+	}
+
+	NLP_LOCK(nlp);
+
+	if (__predict_false(nlp->nl_socket == NULL)) {
+		NLP_UNLOCK(nlp);
+		m_freem(m);
+		return (false);
+	}
+
+	/*
+	 * Data is already waiting in the overflow queue: to keep ordering,
+	 * either queue behind it or drop.
+	 */
+	if (!queue_empty(&nlp->tx_queue)) {
+		if (ignore_limits) {
+			queue_push_tx(nlp, m);
+		} else {
+			/* @m may be a chain: free all of it, not only the first mbuf */
+			m_freem(m);
+			result = false;
+		}
+		NLP_UNLOCK(nlp);
+		return (result);
+	}
+
+	struct socket *so = nlp->nl_socket;
+	if (sbappendaddr(&so->so_rcv, nl_empty_src, m, NULL) != 0) {
+		sorwakeup(so);
+		NLP_LOG(LOG_DEBUG3, nlp, "appended data & woken up");
+	} else {
+		if (ignore_limits) {
+			queue_push_tx(nlp, m);
+		} else {
+			/*
+			 * Store dropped data so it can be reported
+			 * on the next read
+			 */
+			nlp->nl_dropped_bytes += m_length(m, NULL);
+			nlp->nl_dropped_messages += num_messages;
+			NLP_LOG(LOG_DEBUG2, nlp, "RX oveflow: %lu m (+%d), %lu b (+%d)",
+			    nlp->nl_dropped_messages, num_messages,
+			    nlp->nl_dropped_bytes, m_length(m, NULL));
+			soroverflow(so);
+			m_freem(m);
+			result = false;
+		}
+	}
+	NLP_UNLOCK(nlp);
+
+	return (result);
+}
+
+/*
+ * Validates and dispatches a single netlink message @hdr.
+ *
+ * @remaining_length is the number of bytes available starting at @hdr.
+ * Requests with a type of at least NLMSG_MIN_TYPE are passed to the
+ * handler registered for the socket protocol (after translation by the
+ * Linux provider for Linux sockets). An ack is generated if the sender
+ * asked for one (NLM_F_ACK) or the handler failed with anything but
+ * EINTR.
+ *
+ * Returns EINVAL if the message length is inconsistent, 0 otherwise;
+ * handler errors are only reported via the ack.
+ */
+static int
+nl_receive_message(struct nlmsghdr *hdr, int remaining_length,
+    struct nlpcb *nlp, struct netlink_parse_tracker *npt)
+{
+	nl_handler_f handler = nl_handlers[nlp->nl_proto].cb;
+	int error = 0;
+
+	RT_LOG(LOG_DEBUG2, "msg len: %d type: %d", hdr->nlmsg_len, hdr->nlmsg_type);
+
+	if (__predict_false(hdr->nlmsg_len > remaining_length)) {
+		RT_LOG(LOG_DEBUG, "invalid message");
+		return (EINVAL);
+	}
+	if (__predict_false(hdr->nlmsg_len < sizeof(*hdr))) {
+		RT_LOG(LOG_DEBUG, "message too short: %d", hdr->nlmsg_len);
+		return (EINVAL);
+	}
+
+	/* Stamp each message with sender pid */
+	hdr->nlmsg_pid = nlp->nl_port;
+
+	bool is_request = (hdr->nlmsg_flags & NLM_F_REQUEST) != 0;
+	if (is_request && hdr->nlmsg_type >= NLMSG_MIN_TYPE) {
+		struct nlmsghdr *thdr;
+
+		RT_LOG(LOG_DEBUG2, "handling message with msg type: %d",
+		    hdr->nlmsg_type);
+
+		if (nlp->nl_linux && linux_netlink_p != NULL)
+			thdr = linux_netlink_p->msg_from_linux(nlp->nl_proto, hdr, npt);
+		else
+			thdr = hdr;
+		error = handler(thdr, npt);
+		RT_LOG(LOG_DEBUG2, "retcode: %d", error);
+	}
+
+	bool ack_requested = (hdr->nlmsg_flags & NLM_F_ACK) != 0;
+	if (ack_requested || (error != 0 && error != EINTR)) {
+		RT_LOG(LOG_DEBUG3, "ack");
+		nlmsg_ack(nlp, error, hdr);
+		RT_LOG(LOG_DEBUG3, "done");
+	}
+
+	return (0);
+}
+
+/*
+ * Processes an incoming packet, which can contain multiple netlink messages
+ *
+ * The packet data is copied into a temporary contiguous buffer which is
+ * followed by a scratch area handed to the message handlers via the
+ * parse tracker.
+ *
+ * Returns NULL if the packet was fully consumed (and freed), or @m
+ * trimmed by the processed bytes if processing stopped because the
+ * socket became TX-blocked.
+ */
+static struct mbuf *
+nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp)
+{
+ int offset, buffer_length;
+ struct nlmsghdr *hdr;
+ char *buffer;
+ int error;
+
+ RT_LOG(LOG_DEBUG3, "RX netlink mbuf %p on %p", m, nlp->nl_socket);
+
+ /* TODO: alloc this buf once for nlp */
+ int data_length = m_length(m, NULL);
+ /* Layout: [packet copy, rounded up to 8][scratch space] */
+ buffer_length = roundup2(data_length, 8) + SCRATCH_BUFFER_SIZE;
+ /* Linux sockets get extra scratch space of the size of the packet */
+ if (nlp->nl_linux)
+ buffer_length += roundup2(data_length, 8);
+ buffer = malloc(buffer_length, M_NETLINK, M_NOWAIT | M_ZERO);
+ if (buffer == NULL) {
+ m_freem(m);
+ RT_LOG(LOG_DEBUG, "Unable to allocate %d bytes of memory",
+ buffer_length);
+ return (NULL);
+ }
+ m_copydata(m, 0, data_length, buffer);
+
+ /* The scratch area starts right after the (aligned) packet copy */
+ struct netlink_parse_tracker npt = {
+ .nlp = nlp,
+ .lb.base = &buffer[roundup2(data_length, 8)],
+ .lb.size = buffer_length - roundup2(data_length, 8),
+ };
+
+ for (offset = 0; offset + sizeof(struct nlmsghdr) <= data_length;) {
+ hdr = (struct nlmsghdr *)&buffer[offset];
+ /* Save length prior to calling handler */
+ int msglen = NLMSG_ALIGN(hdr->nlmsg_len);
+ RT_LOG(LOG_DEBUG3, "parsing offset %d/%d", offset, data_length);
+ /* Update parse state */
+ lb_clear(&npt.lb);
+ error = nl_receive_message(hdr, data_length - offset, nlp, &npt);
+ /* NOTE(review): offset is advanced even if the message length was rejected - confirm */
+ offset += msglen;
+ /* Stop on a malformed message or once replies no longer fit the socket */
+ if (__predict_false(error != 0 || nlp->nl_tx_blocked))
+ break;
+ }
+ RT_LOG(LOG_DEBUG3, "packet parsing done");
+ free(buffer, M_NETLINK);
+
+ if (nlp->nl_tx_blocked) {
+ NLP_LOCK(nlp);
+ nlp->nl_tx_blocked = false;
+ NLP_UNLOCK(nlp);
+ /* Drop the processed messages, hand the rest back to the caller */
+ m_adj(m, offset);
+ return (m);
+ } else {
+ m_freem(m);
+ return (NULL);
+ }
+}
diff --git a/sys/netlink/netlink_linux.h b/sys/netlink/netlink_linux.h
new file mode 100644
--- /dev/null
+++ b/sys/netlink/netlink_linux.h
@@ -0,0 +1,54 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _NETLINK_LINUX_VAR_H_
+#define _NETLINK_LINUX_VAR_H_
+
+/*
+ * The file contains headers for the bridge interface between
+ * linux[_common] module and the netlink module
+ */
+/* Forward declarations: only pointers to these are used below */
+struct nlpcb;
+struct netlink_parse_tracker;
+
+/*
+ * Rewrites the message(s) stored in mbuf @m for a Linux socket.
+ * Returns the resulting mbuf or NULL on failure.
+ */
+typedef struct mbuf *mbufs_to_linux_cb_t(int netlink_family, struct mbuf *m,
+ struct nlpcb *nlp);
+/*
+ * Rewrites @data_length bytes of message(s) stored in the contiguous
+ * buffer @buf for a Linux socket.
+ * Returns an mbuf with the result or NULL on failure.
+ */
+typedef struct mbuf *msgs_to_linux_cb_t(int netlink_family, char *buf, int data_length,
+ struct nlpcb *nlp);
+/*
+ * Translates a message received from a Linux socket and returns the
+ * header to pass to the message handler.
+ */
+typedef struct nlmsghdr *msg_from_linux_cb_t(int netlink_family, struct nlmsghdr *hdr,
+ struct netlink_parse_tracker *npt);
+
+/* Set of translation callbacks supplied by the Linux compatibility code */
+struct linux_netlink_provider {
+ mbufs_to_linux_cb_t *mbufs_to_linux;
+ msgs_to_linux_cb_t *msgs_to_linux;
+ msg_from_linux_cb_t *msg_from_linux;
+
+};
+
+/*
+ * Currently registered provider; the netlink code checks it for NULL
+ * before use.
+ * NOTE(review): sys/net/route.c defines this symbol as 'void *' -
+ * confirm that the definition and this declaration are meant to differ.
+ */
+extern struct linux_netlink_provider *linux_netlink_p;
+
+#endif
diff --git a/sys/netlink/netlink_message.c b/sys/netlink/netlink_message.c
new file mode 100644
--- /dev/null
+++ b/sys/netlink/netlink_message.c
@@ -0,0 +1,589 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/mbuf.h>
+#include <sys/ck.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/syslog.h>
+
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_linux.h>
+#include <netlink/netlink_var.h>
+
+#define DEBUG_MOD_NAME nl_message
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_DEBUG);
+
+/*
+ * The goal of this file is to provide a convenient message writing KPI on top of
+ * different storage methods (mbufs, uio, temporary memory chunks).
+ *
+ * The main KPI guarantee is that the (last) message always resides in a contiguous
+ * memory buffer, so one is able to update the header after writing the entire message.
+ *
+ * This guarantee comes with a side effect of potentially reallocating the underlying
+ * buffer, so one needs to update the desired pointers before using them if something
+ * was added to the message.
+ */
+
+
+/* Allocates storage of at least @size bytes and initializes @ns; true on success */
+typedef bool nlwriter_op_init(struct nlmsg_state *ns, int size, bool waitok);
+/*
+ * Flushes @buflen bytes (@cnt messages) held in the storage @buf to the
+ * writer target. Consumes @buf. Returns true on success.
+ */
+typedef bool nlwriter_op_write(struct nlmsg_state *ns, void *buf, int buflen, int cnt);
+
+/* Per-storage-type operations: one init method and one flush method per target */
+struct nlwriter_ops {
+ nlwriter_op_init *init;
+ nlwriter_op_write *write_socket;
+ nlwriter_op_write *write_group;
+ nlwriter_op_write *write_chain;
+};
+
+/*
+ * NS_WRITER_TYPE_BUF
+ * Messages are written to a temporary malloc'ed (zeroed) buffer, which
+ * is copied to mbuf(s) and sent to the socket/group when flushed.
+ *
+ * Allocates @size bytes and resets the writer state in @ns.
+ * Returns false if the allocation failed.
+ */
+static bool
+nlmsg_get_ns_buf(struct nlmsg_state *ns, int size, bool waitok)
+{
+	const int mflag = waitok ? M_WAITOK : M_NOWAIT;
+
+	ns->_storage = malloc(size, M_NETLINK, mflag | M_ZERO);
+	if (__predict_false(ns->_storage == NULL))
+		return (false);
+
+	ns->writer_type = NS_WRITER_TYPE_BUF;
+	ns->malloc_flag = mflag;
+	ns->data = ns->_storage;
+	ns->alloc_len = size;
+	ns->offset = 0;
+	ns->num_messages = 0;
+	ns->hdr = NULL;
+
+	return (true);
+}
+
+/*
+ * Flushes @datalen bytes (@cnt messages) from the malloc'ed buffer @buf
+ * to the socket stored in ns->arg. The data is copied to a newly
+ * allocated mbuf chain; @buf is freed in any case.
+ *
+ * Returns true if there was nothing to send or the data was accepted
+ * by nl_send_one(), false otherwise.
+ */
+static bool
+nlmsg_write_socket_buf(struct nlmsg_state *ns, void *buf, int datalen, int cnt)
+{
+	RT_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, ns);
+	if (__predict_false(datalen == 0)) {
+		free(buf, M_NETLINK);
+		return (true);
+	}
+
+	struct mbuf *m = m_getm2(NULL, datalen, ns->malloc_flag, MT_DATA, M_PKTHDR);
+	if (__predict_false(m == NULL)) {
+		/* XXX: should we set sorcverr? */
+		free(buf, M_NETLINK);
+		return (false);
+	}
+	/* m_append() returns 0 on failure: do not send a truncated message */
+	bool copied = (m_append(m, datalen, buf) != 0);
+	free(buf, M_NETLINK);
+	if (__predict_false(!copied)) {
+		m_freem(m);
+		return (false);
+	}
+
+	int io_flags = (ns->ignore_limit) ? NL_IOF_IGNORE_LIMIT : 0;
+	return (nl_send_one(m, (struct nlpcb *)(ns->arg), cnt, io_flags));
+}
+
+/*
+ * Flushes @datalen bytes (@cnt messages) from the malloc'ed buffer @buf
+ * to the group mask stored in ns->arg. The data is copied to a newly
+ * allocated mbuf chain; @buf is freed in any case.
+ *
+ * Returns false if the mbuf chain cannot be allocated or filled.
+ */
+static bool
+nlmsg_write_group_buf(struct nlmsg_state *ns, void *buf, int datalen, int cnt)
+{
+	RT_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, ns->arg);
+	if (__predict_false(datalen == 0)) {
+		free(buf, M_NETLINK);
+		return (true);
+	}
+
+	struct mbuf *m = m_getm2(NULL, datalen, ns->malloc_flag, MT_DATA, M_PKTHDR);
+	if (__predict_false(m == NULL)) {
+		free(buf, M_NETLINK);
+		return (false);
+	}
+	bool success = m_append(m, datalen, buf) != 0;
+	free(buf, M_NETLINK);
+
+	if (!success) {
+		/* Nobody else references the chain: release it to avoid a leak */
+		m_freem(m);
+		return (false);
+	}
+
+	nl_send_group(m, cnt, (uint32_t)(uintptr_t)(ns->arg));
+	return (true);
+}
+
+/*
+ * Flushes @datalen bytes from the malloc'ed buffer @buf by appending
+ * them to the mbuf chain pointed to by ns->arg (a struct mbuf **),
+ * allocating the chain first if it does not exist yet.
+ * @buf is freed in any case.
+ *
+ * Returns false if the chain cannot be allocated or extended. A chain
+ * already stored in *ns->arg is left there for the caller.
+ */
+static bool
+nlmsg_write_chain_buf(struct nlmsg_state *ns, void *buf, int datalen, int cnt)
+{
+	struct mbuf **m0 = (struct mbuf **)(ns->arg);
+	RT_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, ns->arg);
+
+	if (__predict_false(datalen == 0)) {
+		free(buf, M_NETLINK);
+		return (true);
+	}
+
+	if (*m0 == NULL) {
+		struct mbuf *m;
+
+		m = m_getm2(NULL, datalen, ns->malloc_flag, MT_DATA, M_PKTHDR);
+		if (__predict_false(m == NULL)) {
+			free(buf, M_NETLINK);
+			return (false);
+		}
+		*m0 = m;
+	}
+
+	/* The data is copied by m_append(), so @buf is released on both outcomes */
+	bool success = (m_append(*m0, datalen, buf) != 0);
+	free(buf, M_NETLINK);
+
+	return (success);
+}
+
+
+/*
+ * NS_WRITER_TYPE_MBUF
+ * Messages are written directly to the data area of an mbuf, which is
+ * handed to the socket/group when flushed. This avoids copying the
+ * data a second time.
+ *
+ * Requests a single packet-header mbuf for @size bytes from m_get2();
+ * the usable length is whatever trailing space the returned mbuf has.
+ * Returns false if m_get2() fails.
+ */
+static bool
+nlmsg_get_ns_mbuf(struct nlmsg_state *ns, int size, bool waitok)
+{
+	const int mflag = waitok ? M_WAITOK : M_NOWAIT;
+	struct mbuf *m = m_get2(size, mflag, MT_DATA, M_PKTHDR);
+
+	if (__predict_false(m == NULL))
+		return (false);
+
+	ns->writer_type = NS_WRITER_TYPE_MBUF;
+	ns->malloc_flag = mflag;
+	ns->_storage = (void *)m;
+	ns->data = mtod(m, void *);
+	ns->alloc_len = M_TRAILINGSPACE(m);
+	ns->offset = 0;
+	ns->num_messages = 0;
+	ns->hdr = NULL;
+
+	RT_LOG(LOG_DEBUG2, "alloc mbuf %p req_len %d alloc_len %d data_ptr %p",
+	    m, size, ns->alloc_len, ns->data);
+	return (true);
+}
+
+/*
+ * Flushes the mbuf @buf holding @datalen bytes (@cnt messages) to the
+ * socket stored in ns->arg. An empty mbuf is simply freed.
+ * Returns the result of nl_send_one(), or true if nothing was sent.
+ */
+static bool
+nlmsg_write_socket_mbuf(struct nlmsg_state *ns, void *buf, int datalen, int cnt)
+{
+	struct mbuf *m = buf;
+	struct nlpcb *nlp = (struct nlpcb *)(ns->arg);
+	int io_flags;
+
+	RT_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, ns->arg);
+
+	if (__predict_false(datalen == 0)) {
+		m_freem(m);
+		return (true);
+	}
+
+	/* Data was written directly to the mbuf: just fix up the lengths */
+	m->m_len = datalen;
+	m->m_pkthdr.len = datalen;
+
+	io_flags = (ns->ignore_limit) ? NL_IOF_IGNORE_LIMIT : 0;
+	return (nl_send_one(m, nlp, cnt, io_flags));
+}
+
+/*
+ * Flushes the mbuf @buf holding @datalen bytes (@cnt messages) to the
+ * group mask stored in ns->arg. An empty mbuf is simply freed.
+ * Always returns true.
+ */
+static bool
+nlmsg_write_group_mbuf(struct nlmsg_state *ns, void *buf, int datalen, int cnt)
+{
+	struct mbuf *m = buf;
+
+	RT_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, ns->arg);
+
+	if (__predict_true(datalen != 0)) {
+		/* Data was written directly to the mbuf: just fix up the lengths */
+		m->m_len = datalen;
+		m->m_pkthdr.len = datalen;
+		nl_send_group(m, cnt, (uint32_t)(uintptr_t)(ns->arg));
+	} else {
+		m_freem(m);
+	}
+
+	return (true);
+}
+
+/*
+ * Flushes the mbuf @buf holding @datalen bytes by linking it to the end
+ * of the chain pointed to by ns->arg (a struct mbuf **); it becomes the
+ * chain head if the chain is empty. The packet length stored in the
+ * head is updated accordingly. An empty mbuf is simply freed.
+ * Always returns true.
+ */
+static bool
+nlmsg_write_chain_mbuf(struct nlmsg_state *ns, void *buf, int datalen, int cnt)
+{
+	struct mbuf **phead = (struct mbuf **)(ns->arg);
+	struct mbuf *m_new = buf;
+	struct mbuf *head, *tail;
+
+	RT_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, ns->arg);
+
+	if (__predict_false(datalen == 0)) {
+		m_freem(m_new);
+		return (true);
+	}
+
+	m_new->m_len = datalen;
+	m_new->m_pkthdr.len = datalen;
+
+	head = *phead;
+	if (head == NULL) {
+		*phead = m_new;
+		return (true);
+	}
+
+	/* Find the last mbuf of the existing chain and link the new one */
+	tail = head;
+	while (tail->m_next != NULL)
+		tail = tail->m_next;
+	tail->m_next = m_new;
+	head->m_pkthdr.len += datalen;
+
+	return (true);
+}
+
+/*
+ * NS_WRITER_TYPE_LBUF
+ * Messages are written to a malloc'ed (zeroed) buffer; on flush the
+ * Linux handler rewrites them before they are sent to the socket.
+ *
+ * A single allocation is laid out as:
+ *   [struct linear_buffer][message area: size][scratch: size + SCRATCH_BUFFER_SIZE]
+ * where size is @size rounded up to the pointer size. The linear buffer
+ * header describes the scratch area.
+ * Returns false if the allocation failed.
+ */
+static bool
+nlmsg_get_ns_lbuf(struct nlmsg_state *ns, int size, bool waitok)
+{
+	const int mflag = waitok ? M_WAITOK : M_NOWAIT;
+	struct linear_buffer *lb;
+	char *buf;
+
+	size = roundup2(size, sizeof(void *));
+	int add_size = sizeof(struct linear_buffer) + SCRATCH_BUFFER_SIZE;
+
+	buf = malloc(add_size + size * 2, M_NETLINK, mflag | M_ZERO);
+	if (__predict_false(buf == NULL))
+		return (false);
+
+	/* The header lives at the very start of the allocation */
+	lb = (struct linear_buffer *)buf;
+	lb->base = &buf[sizeof(struct linear_buffer) + size];
+	lb->size = size + SCRATCH_BUFFER_SIZE;
+
+	ns->writer_type = NS_WRITER_TYPE_LBUF;
+	ns->malloc_flag = mflag;
+	ns->_storage = buf;
+	/* Message area starts right after the header */
+	ns->data = (char *)(lb + 1);
+	ns->alloc_len = size;
+	ns->offset = 0;
+	ns->num_messages = 0;
+	ns->hdr = NULL;
+
+	return (true);
+}
+
+
+/*
+ * Flushes @datalen bytes (@cnt messages) from the linear buffer storage
+ * @buf to the socket stored in ns->arg: the Linux provider converts the
+ * messages to an mbuf, which is then sent with nl_send_one().
+ * @buf is freed in any case.
+ *
+ * Returns false if no provider is registered, the conversion failed or
+ * nl_send_one() failed.
+ */
+static bool
+nlmsg_write_socket_lbuf(struct nlmsg_state *ns, void *buf, int datalen, int cnt)
+{
+	struct nlpcb *nlp = (struct nlpcb *)(ns->arg);
+	struct linear_buffer *lb = (struct linear_buffer *)buf;
+	/* Messages are stored right after the linear buffer header */
+	char *data = (char *)(lb + 1);
+	struct mbuf *m;
+
+	if (__predict_false(datalen == 0)) {
+		free(buf, M_NETLINK);
+		return (true);
+	}
+
+	m = (linux_netlink_p != NULL) ?
+	    linux_netlink_p->msgs_to_linux(nlp->nl_proto, data, datalen, nlp) :
+	    NULL;
+	free(buf, M_NETLINK);
+
+	if (__predict_false(m == NULL)) {
+		/* XXX: should we set sorcverr? */
+		return (false);
+	}
+
+	return (nl_send_one(m, nlp, cnt,
+	    (ns->ignore_limit) ? NL_IOF_IGNORE_LIMIT : 0));
+}
+
+/*
+ * Shouldn't be called (maybe except Linux code originating message).
+ *
+ * Flushes @datalen bytes (@cnt messages) from the linear buffer storage
+ * @buf to the group mask stored in ns->arg. The data is copied as-is
+ * (no Linux translation) to a newly allocated mbuf chain; @buf is freed
+ * in any case.
+ *
+ * Returns false if the mbuf chain cannot be allocated or filled.
+ */
+static bool
+nlmsg_write_group_lbuf(struct nlmsg_state *ns, void *buf, int datalen, int cnt)
+{
+	struct linear_buffer *lb = (struct linear_buffer *)buf;
+	/* Messages are stored right after the linear buffer header */
+	char *data = (char *)(lb + 1);
+
+	if (__predict_false(datalen == 0)) {
+		free(buf, M_NETLINK);
+		return (true);
+	}
+
+	struct mbuf *m = m_getm2(NULL, datalen, ns->malloc_flag, MT_DATA, M_PKTHDR);
+	if (__predict_false(m == NULL)) {
+		free(buf, M_NETLINK);
+		return (false);
+	}
+	/* m_append() returns 0 on failure: do not send a truncated message */
+	bool copied = (m_append(m, datalen, data) != 0);
+	free(buf, M_NETLINK);
+	if (__predict_false(!copied)) {
+		m_freem(m);
+		return (false);
+	}
+
+	nl_send_group(m, cnt, (uint32_t)(uintptr_t)(ns->arg));
+	return (true);
+}
+
+/*
+ * Writer operations table, indexed by ns->writer_type / the type passed
+ * to nlmsg_get_buf_type(). The order of the entries has to match the
+ * values of the NS_WRITER_TYPE_* constants.
+ * NOTE(review): presumably MBUF=0, BUF=1, LBUF=2 - verify against the
+ * header defining them.
+ */
+struct nlwriter_ops nlmsg_writers[] = {
+ /* NS_WRITER_TYPE_MBUF */
+ {
+ .init = nlmsg_get_ns_mbuf,
+ .write_socket = nlmsg_write_socket_mbuf,
+ .write_group = nlmsg_write_group_mbuf,
+ .write_chain = nlmsg_write_chain_mbuf,
+ },
+ /* NS_WRITER_TYPE_BUF */
+ {
+ .init = nlmsg_get_ns_buf,
+ .write_socket = nlmsg_write_socket_buf,
+ .write_group = nlmsg_write_group_buf,
+ .write_chain = nlmsg_write_chain_buf,
+ },
+ /* NS_WRITER_TYPE_LBUF */
+ {
+ .init = nlmsg_get_ns_lbuf,
+ .write_socket = nlmsg_write_socket_lbuf,
+ .write_group = nlmsg_write_group_lbuf,
+ /* no .write_chain: left NULL, chain writers never use this type */
+ },
+};
+
+/*
+ * Selects the flush callback of @ns from the operations of its writer
+ * type, based on ns->writer_target. Panics on an unknown target.
+ */
+static void
+nlmsg_set_callback(struct nlmsg_state *ns)
+{
+	const struct nlwriter_ops *ops = &nlmsg_writers[ns->writer_type];
+
+	if (ns->writer_target == NS_WRITER_TARGET_SOCKET)
+		ns->cb = ops->write_socket;
+	else if (ns->writer_target == NS_WRITER_TARGET_GROUP)
+		ns->cb = ops->write_group;
+	else if (ns->writer_target == NS_WRITER_TARGET_CHAIN)
+		ns->cb = ops->write_chain;
+	else
+		panic("not implemented");
+}
+
+/*
+ * Allocates storage of the writer type @type for @ns by calling the
+ * init method of that type. Returns the result of the init method.
+ */
+static bool
+nlmsg_get_buf_type(struct nlmsg_state *ns, int size, int type, bool waitok)
+{
+	struct nlwriter_ops *ops;
+
+	MPASS(type + 1 <= nitems(nlmsg_writers));
+	RT_LOG(LOG_DEBUG3, "Setting up ns %p size %d type %d", ns, size, type);
+
+	ops = &nlmsg_writers[type];
+	return (ops->init(ns, size, waitok));
+}
+
+/*
+ * Picks the writer type and allocates storage for @ns:
+ * Linux sockets always use the linear buffer writer, others use an
+ * mbuf if @size fits into MCLBYTES and a malloc'ed buffer otherwise.
+ */
+static bool
+nlmsg_get_buf(struct nlmsg_state *ns, int size, bool waitok, bool is_linux)
+{
+	int type = NS_WRITER_TYPE_LBUF;
+
+	if (!is_linux) {
+		type = __predict_true(size <= MCLBYTES) ?
+		    NS_WRITER_TYPE_MBUF : NS_WRITER_TYPE_BUF;
+	}
+
+	return (nlmsg_get_buf_type(ns, size, type, waitok));
+}
+
+/*
+ * Sets up @ns as a writer of at least @size bytes that flushes to the
+ * socket @nlp. Storage is allocated without sleeping.
+ * Returns false if the storage cannot be allocated.
+ */
+bool
+nlmsg_get_socket_writer(int size, struct nlpcb *nlp, struct nlmsg_state *ns)
+{
+	bool allocated = nlmsg_get_buf(ns, size, false, nlp->nl_linux);
+
+	if (allocated) {
+		ns->arg = (void *)nlp;
+		ns->writer_target = NS_WRITER_TARGET_SOCKET;
+		nlmsg_set_callback(ns);
+	}
+
+	return (allocated);
+}
+
+/*
+ * Sets up @ns as a writer of at least @size bytes that flushes to the
+ * groups given by @group_mask. Storage is allocated without sleeping.
+ * Returns false if the storage cannot be allocated.
+ */
+bool
+nlmsg_get_group_writer(int size, uint32_t group_mask, struct nlmsg_state *ns)
+{
+	bool allocated = nlmsg_get_buf(ns, size, false, false);
+
+	if (allocated) {
+		/* The mask itself is stored in the opaque argument pointer */
+		ns->arg = (void *)(uintptr_t)group_mask;
+		ns->writer_target = NS_WRITER_TARGET_GROUP;
+		nlmsg_set_callback(ns);
+	}
+
+	return (allocated);
+}
+
+/*
+ * Sets up @ns as a writer of at least @size bytes that flushes to the
+ * mbuf chain *@pm, which is reset to NULL. Storage is allocated without
+ * sleeping.
+ * Returns false if the storage cannot be allocated; *@pm is left
+ * untouched in that case.
+ */
+bool
+nlmsg_get_chain_writer(int size, struct mbuf **pm, struct nlmsg_state *ns)
+{
+	bool allocated = nlmsg_get_buf(ns, size, false, false);
+
+	if (allocated) {
+		*pm = NULL;
+		ns->arg = (void *)pm;
+		ns->writer_target = NS_WRITER_TARGET_CHAIN;
+		nlmsg_set_callback(ns);
+		RT_LOG(LOG_DEBUG3, "setup cb %p (need %p)", ns->cb, &nlmsg_write_chain_mbuf);
+	}
+
+	return (allocated);
+}
+
+/*
+ * Marks @ns so that its socket flushes are sent with
+ * NL_IOF_IGNORE_LIMIT.
+ */
+void
+nlmsg_ignore_limit(struct nlmsg_state *ns)
+{
+
+	ns->ignore_limit = true;
+}
+
+/*
+ * Passes all completed messages of @ns to the writer callback together
+ * with the underlying storage, which is no longer referenced by @ns
+ * afterwards. A message that was started but not finished is skipped.
+ * Returns the result of the callback.
+ */
+bool
+nlmsg_flush(struct nlmsg_state *ns)
+{
+	bool result;
+
+	if (__predict_false(ns->hdr != NULL)) {
+		/* Last message has not been completed: cut the data before it */
+		int completed_len = (char *)ns->hdr - ns->data;
+
+		ns->offset = completed_len;
+		ns->hdr = NULL;
+	}
+
+	result = ns->cb(ns, ns->_storage, ns->offset, ns->num_messages);
+	ns->_storage = NULL;
+
+	if (!result) {
+		RT_LOG(LOG_DEBUG, "ns %p offset %d: flush with %p() failed", ns, ns->offset, ns->cb);
+	}
+
+	return (result);
+}
+
+/*
+ * Flushes previous data and allocates new underlying storage
+ * sufficient for holding at least @required_len bytes in addition to
+ * the unfinished message (if any), which is copied to the new storage.
+ *
+ * On success @ns describes the new storage, so pointers into the old
+ * one are not valid anymore. On failure @ns is left untouched.
+ * Return true on success.
+ */
+bool
+nlmsg_refill_buffer(struct nlmsg_state *ns, int required_len)
+{
+	struct nlmsg_state ns_new = {};
+	int completed_len, last_len, new_len;
+
+	RT_LOG(LOG_DEBUG3, "no space at offset %d/%d (want %d), trying to reclaim",
+	    ns->offset, ns->alloc_len, required_len);
+
+	/* Calculate the new buffer size and allocate it */
+	completed_len = (ns->hdr != NULL) ? (char *)ns->hdr - ns->data : ns->offset;
+	/* Size of the unfinished message that has to be carried over */
+	last_len = ns->offset - completed_len;
+	if (completed_len > 0 && required_len < MCLBYTES) {
+		/* We already ran out of space, use the largest effective size */
+		new_len = max(ns->alloc_len, MCLBYTES);
+	} else {
+		if (ns->alloc_len < MCLBYTES)
+			new_len = MCLBYTES;
+		else
+			new_len = ns->alloc_len * 2;
+		while (new_len < required_len)
+			new_len *= 2;
+	}
+	/*
+	 * The unfinished message occupies the start of the new storage:
+	 * make sure @required_len bytes are still free after it.
+	 */
+	while (new_len < last_len + required_len)
+		new_len *= 2;
+
+	bool waitok = (ns->malloc_flag == M_WAITOK);
+	bool is_linux = (ns->writer_type == NS_WRITER_TYPE_LBUF);
+	if (!nlmsg_get_buf(&ns_new, new_len, waitok, is_linux))
+		return (false);
+	if (ns->ignore_limit)
+		nlmsg_ignore_limit(&ns_new);
+
+	/* Update callback data */
+	ns_new.writer_target = ns->writer_target;
+	nlmsg_set_callback(&ns_new);
+	ns_new.arg = ns->arg;
+
+	/* Copy last (unfinished) header to the new storage */
+	if (last_len > 0) {
+		memcpy(ns_new.data, ns->hdr, last_len);
+		ns_new.hdr = (struct nlmsghdr *)ns_new.data;
+		ns_new.offset = last_len;
+	}
+
+	RT_LOG(LOG_DEBUG2, "completed: %d bytes, copied: %d bytes", completed_len, last_len);
+
+	/* Flush completed headers */
+	if (completed_len > 0) {
+		RT_LOG(LOG_DEBUG2, "Flushing %u completed message(s) (%d bytes)",
+		    ns->num_messages, completed_len);
+		ns->offset -= last_len;
+		ns->hdr = NULL;
+		nlmsg_flush(ns);
+	}
+
+	/* Update state */
+	memcpy(ns, &ns_new, sizeof(struct nlmsg_state));
+	RT_LOG(LOG_DEBUG2, "switched mbuf: used %d/%d bytes", ns->offset, ns->alloc_len);
+
+	return (true);
+}
+
+/*
+ * Starts a new message in @ns: makes sure the aligned header plus @len
+ * bytes fit into the storage (refilling it if needed) and writes the
+ * netlink header at the current offset. nlmsg_len is set to @len here
+ * and is recalculated by nlmsg_end().
+ *
+ * No other message may be in progress. Returns false if the storage
+ * cannot be refilled.
+ */
+bool
+nlmsg_add(struct nlmsg_state *ns, uint32_t portid, uint32_t seq, uint16_t type,
+    uint16_t flags, uint32_t len)
+{
+	struct nlmsghdr *hdr;
+	int required_len;
+
+	MPASS(ns->hdr == NULL);
+
+	required_len = NETLINK_ALIGN(len + sizeof(struct nlmsghdr));
+	if (__predict_false(ns->offset + required_len > ns->alloc_len) &&
+	    !nlmsg_refill_buffer(ns, required_len))
+		return (false);
+
+	/* Storage may have been replaced above: take the pointer only now */
+	hdr = (struct nlmsghdr *)(&ns->data[ns->offset]);
+
+	hdr->nlmsg_pid = portid;
+	hdr->nlmsg_seq = seq;
+	hdr->nlmsg_type = type;
+	hdr->nlmsg_flags = flags;
+	hdr->nlmsg_len = len;
+
+	ns->offset += sizeof(struct nlmsghdr);
+	ns->hdr = hdr;
+
+	return (true);
+}
+
+/*
+ * Completes the message in progress: stores the number of bytes written
+ * since the start of its header in nlmsg_len and counts the message.
+ */
+void
+nlmsg_end(struct nlmsg_state *ns)
+{
+	MPASS(ns->hdr != NULL);
+
+	char *msg_start = (char *)ns->hdr;
+	char *msg_end = ns->data + ns->offset;
+
+	ns->hdr->nlmsg_len = (uint32_t)(msg_end - msg_start);
+	ns->num_messages++;
+	ns->hdr = NULL;
+}
+
+/*
+ * Discards the message in progress (if any) by rewinding the write
+ * offset to the start of its header.
+ */
+void
+nlmsg_abort(struct nlmsg_state *ns)
+{
+	if (ns->hdr == NULL)
+		return;
+
+	ns->offset = (uint32_t)((char *)ns->hdr - ns->data);
+	ns->hdr = NULL;
+}
+
diff --git a/sys/netlink/netlink_module.c b/sys/netlink/netlink_module.c
new file mode 100644
--- /dev/null
+++ b/sys/netlink/netlink_module.c
@@ -0,0 +1,226 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ * Copyright (c) 2022 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/ck.h>
+#include <sys/syslog.h>
+
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_var.h>
+
+#include <machine/atomic.h>
+
+MALLOC_DEFINE(M_NETLINK, "netlink", "Memory used for netlink packets");
+
+#define DEBUG_MOD_NAME nl_mod
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_DEBUG);
+
+SYSCTL_NODE(_net, OID_AUTO, netlink, CTLFLAG_RD, 0, "");
+
+#define NL_MAX_HANDLERS 20 /* size of the protocol handler table; valid ids are 0..19 */
+struct nl_proto_handler _nl_handlers[NL_MAX_HANDLERS]; /* backing storage, indexed by protocol id */
+struct nl_proto_handler *nl_handlers = _nl_handlers; /* exported view of the table above */
+
+CK_LIST_HEAD(nl_control_head, nl_control);
+static struct nl_control_head vnets_head = CK_LIST_HEAD_INITIALIZER(); /* all per-VNET nl_control blocks; changed under nl_global_mtx */
+
+VNET_DEFINE(struct nl_control *, nl_ctl) = NULL; /* NULL until vnet_nl_ctl_init() publishes a block */
+
+struct mtx nl_global_mtx; /* taken around handler (un)registration and vnets_head updates */
+MTX_SYSINIT(nl_global_mtx, &nl_global_mtx, "global netlink lock", MTX_DEF);
+
+#define NL_GLOBAL_LOCK() mtx_lock(&nl_global_mtx)
+#define NL_GLOBAL_UNLOCK() mtx_unlock(&nl_global_mtx)
+
+int netlink_unloading = 0; /* set to 1 by the MOD_UNLOAD handler once can_unload() succeeds */
+
+static void
+free_nl_ctl(struct nl_control *ctl)
+{
+ rm_destroy(&ctl->ctl_lock); /* lock was set up by rm_init() in vnet_nl_ctl_init() */
+ free(ctl, M_NETLINK); /* ctl must not be used after this */
+}
+
+struct nl_control *
+vnet_nl_ctl_init(void)
+{
+	struct nl_control *ctl, *existing;
+
+	/* Prepare a candidate control block before taking the global lock */
+	ctl = malloc(sizeof(struct nl_control), M_NETLINK, M_WAITOK | M_ZERO);
+	rm_init(&ctl->ctl_lock, "netlink lock");
+	CK_LIST_INIT(&ctl->ctl_port_head);
+	CK_LIST_INIT(&ctl->ctl_pcb_head);
+
+	NL_GLOBAL_LOCK();
+	existing = atomic_load_ptr(&V_nl_ctl);
+	if (existing != NULL) {
+		/* V_nl_ctl is already set: discard ours and hand back that one */
+		RT_LOG(LOG_DEBUG, "per-VNET init clash, dropping this instance");
+		free_nl_ctl(ctl);
+		ctl = existing;
+	} else {
+		atomic_store_ptr(&V_nl_ctl, ctl);
+		CK_LIST_INSERT_HEAD(&vnets_head, ctl, ctl_next);
+		RT_LOG(LOG_DEBUG2, "VNET %p init done, inserted %p into global list",
+		    curvnet, ctl);
+	}
+
+	NL_GLOBAL_UNLOCK();
+
+	return (ctl);
+}
+
+static void
+vnet_nl_ctl_destroy(const void *unused __unused)
+{
+	struct nl_control *ctl;
+
+	/* All processes and sockets are assumed to be gone by this point */
+	NL_GLOBAL_LOCK();
+	ctl = atomic_load_ptr(&V_nl_ctl);
+	atomic_store_ptr(&V_nl_ctl, NULL);
+	if (ctl == NULL) {
+		NL_GLOBAL_UNLOCK();
+		return;		/* V_nl_ctl was not set: nothing to tear down */
+	}
+	RT_LOG(LOG_DEBUG2, "Removing %p from global list", ctl);
+	CK_LIST_REMOVE(ctl, ctl_next);
+	NL_GLOBAL_UNLOCK();
+
+	free_nl_ctl(ctl);
+}
+VNET_SYSUNINIT(vnet_nl_ctl_destroy, SI_SUB_PROTO_IF, SI_ORDER_ANY,
+    vnet_nl_ctl_destroy, NULL);
+
+int
+nl_verify_proto(int proto)
+{
+	if (proto < 0 || proto >= NL_MAX_HANDLERS)
+		return (EINVAL);	/* outside the handler table */
+	if (nl_handlers[proto].cb == NULL)
+		return (EPROTONOSUPPORT);	/* no handler registered */
+	return (0);
+}
+
+const char *	/* name registered for @proto; NULL if none or @proto is out of range */
+nl_get_proto_name(int proto)
+{
+	return (proto < 0 || proto >= NL_MAX_HANDLERS ? NULL : nl_handlers[proto].proto_name);
+}
+
+bool
+netlink_register_proto(int proto, const char *proto_name, nl_handler_f handler)
+{
+ if ((proto < 0) || (proto >= NL_MAX_HANDLERS))
+ return (false); /* proto does not fit the handler table */
+ NL_GLOBAL_LOCK();
+ KASSERT((nl_handlers[proto].cb == NULL), ("netlink handler %d is already set", proto)); /* double registration is caught only by this assertion */
+ nl_handlers[proto].cb = handler;
+ nl_handlers[proto].proto_name = proto_name; /* pointer is stored, not copied */
+ NL_GLOBAL_UNLOCK();
+ RT_LOG(LOG_DEBUG, "Registered netlink %s(%d) handler", proto_name, proto);
+ return (true);
+}
+
+bool
+netlink_unregister_proto(int proto)
+{
+ if ((proto < 0) || (proto >= NL_MAX_HANDLERS))
+ return (false); /* proto does not fit the handler table */
+ NL_GLOBAL_LOCK();
+ KASSERT((nl_handlers[proto].cb != NULL), ("netlink handler %d is not set", proto)); /* unregistering an empty slot is caught only by this assertion */
+ nl_handlers[proto].cb = NULL; /* nl_verify_proto() now reports EPROTONOSUPPORT for this id */
+ nl_handlers[proto].proto_name = NULL;
+ NL_GLOBAL_UNLOCK();
+ RT_LOG(LOG_DEBUG, "Unregistered netlink proto %d handler", proto);
+ return (true);
+}
+
+static bool
+can_unload(void)
+{
+	struct nl_control *ctl;
+	bool busy = false;
+
+	/* Any VNET that still has entries on its pcb list blocks the unload */
+	NL_GLOBAL_LOCK();
+	CK_LIST_FOREACH(ctl, &vnets_head, ctl_next) {
+		RT_LOG(LOG_DEBUG2, "Iterating VNET head %p", ctl);
+		if (CK_LIST_EMPTY(&ctl->ctl_pcb_head))
+			continue;
+		RT_LOG(LOG_NOTICE, "non-empty socket list in ctl %p", ctl);
+		busy = true;
+		break;
+	}
+
+	NL_GLOBAL_UNLOCK();
+
+	return (!busy);
+}
+
+static int
+netlink_modevent(module_t mod __unused, int what, void *priv __unused)
+{
+	/*
+	 * Loading needs no work here.  Unloading is allowed only when
+	 * can_unload() approves; every other event is unsupported.
+	 */
+
+	switch (what) {
+	case MOD_LOAD:
+		RT_LOG(LOG_NOTICE, "Loading");
+		return (0);
+
+	case MOD_UNLOAD:
+		RT_LOG(LOG_NOTICE, "Unload called");
+		if (!can_unload())
+			return (EBUSY);
+		RT_LOG(LOG_WARNING, "unloading");
+		/* Record the approved unload in the global flag */
+		netlink_unloading = 1;
+		return (0);
+
+	default:
+		return (EOPNOTSUPP);
+	}
+}
+static moduledata_t netlink_mod = { "netlink", netlink_modevent, NULL };
+
+DECLARE_MODULE(netlink, netlink_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
+MODULE_VERSION(netlink, 1);
diff --git a/sys/netlink/netlink_route.h b/sys/netlink/netlink_route.h
new file mode 100644
--- /dev/null
+++ b/sys/netlink/netlink_route.h
@@ -0,0 +1,43 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#ifndef _NETLINK_NETLINK_ROUTE_H_
+#define _NETLINK_NETLINK_ROUTE_H_
+
+#include <sys/types.h>
+
+#include <net/if.h>
+#include <net/if_types.h>
+#include <net/if_var.h>
+
+#include <netlink/route/common.h>
+#include <netlink/route/ifaddrs.h>
+#include <netlink/route/interface.h>
+#include <netlink/route/neigh.h>
+#include <netlink/route/route.h>
+#include <netlink/route/nexthop.h>
+
+#endif
diff --git a/sys/netlink/netlink_route.c b/sys/netlink/netlink_route.c
new file mode 100644
--- /dev/null
+++ b/sys/netlink/netlink_route.c
@@ -0,0 +1,154 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/ck.h>
+
+#include <net/route.h>
+#include <net/route/route_ctl.h>
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_var.h>
+#include <netlink/netlink_route.h>
+#include <netlink/route/route_var.h>
+
+#define DEBUG_MOD_NAME nl_route_core
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_DEBUG);
+
+#define HANDLER_MAX_NUM (NL_RTM_MAX + 10) /* table size; message types at or above this are rejected */
+static struct rtnl_cmd_handler *rtnl_handler[HANDLER_MAX_NUM] = {}; /* indexed by nlmsg_type; NULL means no handler */
+
+bool
+rtnl_register_messages(struct rtnl_cmd_handler *handlers, int count)
+{
+ for (int i = 0; i < count; i++) { /* pass 1: validate every entry so a bad table registers nothing */
+ if (handlers[i].rtnl_cmd >= HANDLER_MAX_NUM)
+ return (false);
+ MPASS(rtnl_handler[handlers[i].rtnl_cmd] == NULL); /* slot must still be free */
+ }
+ for (int i = 0; i < count; i++) /* pass 2: install; stores pointers into the caller's array */
+ rtnl_handler[handlers[i].rtnl_cmd] = &handlers[i];
+ return (true);
+}
+
+/*
+ * NETLINK_ROUTE input handler: runs the callback registered for hdr->nlmsg_type and returns its error code.
+ */
+static int
+rtnl_handle_message(struct nlmsghdr *hdr, struct netlink_parse_tracker *npt)
+{
+ struct rtnl_cmd_handler *cmd;
+ struct epoch_tracker et;
+ struct nlpcb *nlp = npt->nlp;
+ int error = 0;
+
+ if (__predict_false(hdr->nlmsg_type >= HANDLER_MAX_NUM)) {
+ NLP_LOG(LOG_DEBUG, nlp, "invalid message type: %d", hdr->nlmsg_type);
+ return (ENOTSUP); /* type is outside the dispatch table */
+ }
+
+ cmd = rtnl_handler[hdr->nlmsg_type];
+ if (__predict_false(cmd == NULL)) {
+ RT_LOG(LOG_DEBUG, "invalid message type: %d", hdr->nlmsg_type);
+ return (ENOTSUP); /* no handler registered for this type */
+ }
+
+ NLP_LOG(LOG_DEBUG2, nlp, "received msg %s(%d) len %d", cmd->rtnl_cmd_name,
+ hdr->nlmsg_type, hdr->nlmsg_len);
+ /* XXX: check min header length if strict is set */
+
+ /*
+ * Set up the reply writer for this socket. TODO: init when reading mbuf batch.
+ */
+ struct nlmsg_state ns = {};
+ if (!nlmsg_get_socket_writer(NLMSG_SMALL, nlp, &ns)) {
+ RT_LOG(LOG_DEBUG, "error allocating socket writer");
+ return (ENOMEM);
+ }
+ nlmsg_ignore_limit(&ns);
+ npt->ns = &ns; /* NOTE(review): points at a stack object and is not reset before return - confirm callers never use it afterwards */
+
+ bool need_epoch = !(cmd->rtnl_flags & RTNL_F_NOEPOCH); /* handlers flagged RTNL_F_NOEPOCH run outside the net epoch */
+
+ if (need_epoch)
+ NET_EPOCH_ENTER(et);
+ error = cmd->rtnl_cb(hdr, nlp, npt);
+ if (need_epoch)
+ NET_EPOCH_EXIT(et);
+
+ nlmsg_flush(&ns); /* flushed whether or not the handler reported an error */
+
+ return (error);
+}
+
+static void nlbridge_cb_func(uint32_t event_type, uint32_t fibnum,
+    const struct rt_addrinfo *info, const struct rib_cmd_info *rc, void *arg)
+{
+	RT_LOG(LOG_DEBUG2, "received bridge event %d", event_type);
+
+	/* Only route events are forwarded; anything else is just logged */
+	if (event_type != NLBR_EVENT_ROUTE)
+		return;
+	rtnl_handle_route_event(fibnum, info, rc);
+}
+
+static struct rib_event_bridge nlbridge = { /* descriptor passed to rib_bridge_link()/rib_bridge_unlink() */
+ .reb_cb = nlbridge_cb_func,
+ .reb_cb_arg = NULL, /* nlbridge_cb_func() does not use its arg */
+ .reb_provider_id = NLBR_PROVIDER_NETLINK,
+};
+
+static void
+rtnl_load(void *u __unused)
+{
+ RT_LOG(LOG_ERR, "netlink support is in BETA stage"); /* NOTE(review): informational text logged at LOG_ERR - confirm intended */
+ RT_LOG(LOG_NOTICE, "rtnl loading");
+ rib_bridge_link(&nlbridge); /* presumably starts delivery of bridge events to nlbridge_cb_func() - confirm */
+ rtnl_neighs_init();
+ rtnl_ifaces_init();
+ rtnl_nexthops_init();
+ rtnl_routes_init();
+ netlink_register_proto(NETLINK_ROUTE, "NETLINK_ROUTE", rtnl_handle_message); /* return value (false on a bad proto id) is ignored */
+}
+SYSINIT(rtnl_load, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rtnl_load, NULL);
+
+static void
+rtnl_unload(void *u __unused)
+{
+ rib_bridge_unlink(&nlbridge);
+ rtnl_ifaces_destroy(); /* NOTE(review): rtnl_load() also inits neighs/nexthops/routes and registers NETLINK_ROUTE; no matching teardown is visible here - confirm */
+
+ /* Wait until every net epoch reader that may still see nlbridge has finished */
+ epoch_wait_preempt(net_epoch_preempt);
+}
+SYSUNINIT(rtnl_unload, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rtnl_unload, NULL);
diff --git a/sys/netlink/netlink_var.h b/sys/netlink/netlink_var.h
new file mode 100644
--- /dev/null
+++ b/sys/netlink/netlink_var.h
@@ -0,0 +1,148 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ * Copyright (c) 2022 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#ifndef _NETLINK_NETLINK_VAR_H_
+#define _NETLINK_NETLINK_VAR_H_
+
+#include <sys/epoch.h>
+#include <sys/sysctl.h>
+#include <sys/taskqueue.h>
+#include <net/vnet.h>
+
+#define NLSNDQ 65536 /* Default socket sendspace */
+#define NLRCVQ 65536 /* Default socket recvspace */
+
+struct ucred;
+
+struct nl_io_queue { /* mbuf queue; struct nlpcb embeds one as rx_queue and one as tx_queue */
+ struct mbuf *head; /* presumably the first mbuf of the queue - confirm */
+ struct mbuf *last; /* presumably the last mbuf of the queue - confirm */
+ int length; /* NOTE(review): unit (bytes vs. mbufs) is not visible here */
+ int hiwat; /* presumably the high-water mark for length - confirm */
+};
+
+struct nlpcb { /* netlink protocol control block; stored in so_pcb, see sotonlpcb() */
+ struct socket *nl_socket;
+ uint32_t nl_port;
+ uint32_t nl_groups;
+ uint32_t nl_flags; /* presumably NLF_* bits - confirm */
+ uint32_t nl_process_id;
+ int nl_proto; /* presumably an index into nl_handlers - confirm */
+ bool nl_active;
+ bool nl_bound;
+ bool nl_task_pending;
+ bool nl_tx_blocked; /* No new requests accepted */
+ bool nl_linux; /* true if running under compat */
+ struct nl_io_queue rx_queue;
+ struct nl_io_queue tx_queue;
+ struct taskqueue *nl_taskqueue;
+ struct task nl_task;
+ struct ucred *nl_cred; /* Copy of nl_socket->so_cred */
+ uint64_t nl_dropped_bytes;
+ uint64_t nl_dropped_messages;
+ CK_LIST_ENTRY(nlpcb) nl_next; /* presumably linkage on nl_control.ctl_pcb_head - confirm */
+ CK_LIST_ENTRY(nlpcb) nl_port_next; /* presumably linkage on nl_control.ctl_port_head - confirm */
+ volatile u_int nl_refcount;
+ struct mtx nl_lock; /* taken via NLP_LOCK()/NLP_UNLOCK() */
+ struct epoch_context nl_epoch_ctx;
+};
+#define sotonlpcb(so) ((struct nlpcb *)(so)->so_pcb) /* socket -> its nlpcb */
+
+#define NLP_LOCK_INIT(_nlp) mtx_init(&((_nlp)->nl_lock), "nlp mtx", NULL, MTX_DEF) /* per-nlpcb mutex helpers */
+#define NLP_LOCK_DESTROY(_nlp) mtx_destroy(&((_nlp)->nl_lock))
+#define NLP_LOCK(_nlp) mtx_lock(&((_nlp)->nl_lock))
+#define NLP_UNLOCK(_nlp) mtx_unlock(&((_nlp)->nl_lock))
+
+#define ALIGNED_NL_SZ(_data) roundup2((((struct nlmsghdr *)(_data))->nlmsg_len), 16) /* nlmsg_len of the header at _data, rounded up to a multiple of 16 */
+
+#define NLF_CAP_ACK 0x01 /* Do not send message body with errmsg */
+#define NLF_EXT_ACK 0x02 /* Allow including extended TLVs in ack */
+
+SYSCTL_DECL(_net_netlink);
+
+struct nl_io { /* embedded in struct nl_control as ctl_io */
+ struct callout callout;
+ struct mbuf *head; /* presumably the first queued mbuf - confirm */
+ struct mbuf *last; /* presumably the last queued mbuf - confirm */
+ int64_t length;
+};
+
+
+struct nl_control { /* per-VNET netlink state, published through V_nl_ctl */
+ CK_LIST_HEAD(nl_pid_head, nlpcb) ctl_port_head;
+ CK_LIST_HEAD(nlpcb_head, nlpcb) ctl_pcb_head; /* must be empty in every VNET before the module may unload */
+ CK_LIST_ENTRY(nl_control) ctl_next; /* linkage on the global vnets_head list */
+ struct nl_io ctl_io;
+ struct rmlock ctl_lock; /* rm lock taken via the NLCTL_* macros */
+};
+VNET_DECLARE(struct nl_control *, nl_ctl);
+#define V_nl_ctl VNET(nl_ctl)
+
+
+/* nl_control locking: the read-side macros need NLCTL_TRACKER declared in the calling scope */
+#define NLCTL_TRACKER struct rm_priotracker nl_tracker
+#define NLCTL_RLOCK(_ctl) rm_rlock(&((_ctl)->ctl_lock), &nl_tracker)
+#define NLCTL_RUNLOCK(_ctl) rm_runlock(&((_ctl)->ctl_lock), &nl_tracker)
+
+#define NLCTL_WLOCK(_ctl) rm_wlock(&((_ctl)->ctl_lock))
+#define NLCTL_WUNLOCK(_ctl) rm_wunlock(&((_ctl)->ctl_lock))
+
+struct sockaddr_nl;
+struct sockaddr;
+struct nlmsghdr;
+
+
+/* netlink_module.c */
+struct nl_control *vnet_nl_ctl_init(void);
+
+int nl_verify_proto(int proto);
+const char *nl_get_proto_name(int proto);
+
+extern int netlink_unloading;
+
+struct nl_proto_handler { /* one slot of the per-protocol handler table */
+ nl_handler_f cb; /* NULL marks the slot as unregistered */
+ const char *proto_name; /* pointer supplied at registration; not copied */
+};
+extern struct nl_proto_handler *nl_handlers; /* table of NL_MAX_HANDLERS slots defined in netlink_module.c */
+
+/* netlink_domain.c */
+void nl_send_group(struct mbuf *m, int cnt, uint32_t group_mask);
+
+/* netlink_io.c */
+#define NL_IOF_UNTRANSLATED 0x01
+#define NL_IOF_IGNORE_LIMIT 0x02
+bool nl_send_one(struct mbuf *m, struct nlpcb *nlp, int cnt, int io_flags);
+void nlmsg_ack(struct nlpcb *nlp, int error, struct nlmsghdr *nlmsg);
+void nl_on_transmit(struct nlpcb *nlp);
+void nl_free_io(struct nlpcb *nlp);
+
+void nl_taskqueue_handler(void *_arg, int pending);
+int nl_receive_async(struct mbuf *m, struct socket *so);
+void nl_process_receive_locked(struct nlpcb *nlp);
+
+#endif
diff --git a/sys/netlink/route/common.h b/sys/netlink/route/common.h
new file mode 100644
--- /dev/null
+++ b/sys/netlink/route/common.h
@@ -0,0 +1,243 @@
+/*
+ * Common defines for all parts of the netlink route family
+ */
+
+#ifndef _NETLINK_ROUTE_COMMON_H_
+#define _NETLINK_ROUTE_COMMON_H_
+
+/*
+ * Message types of the NETLINK_ROUTE family. Each enumerator is also defined as a macro of the same name.
+ */
+enum {
+ NL_RTM_BASE = 16,
+#define NL_RTM_BASE NL_RTM_BASE
+ NL_RTM_NEWLINK = 16,
+#define NL_RTM_NEWLINK NL_RTM_NEWLINK
+ NL_RTM_DELLINK,
+#define NL_RTM_DELLINK NL_RTM_DELLINK
+ NL_RTM_GETLINK,
+#define NL_RTM_GETLINK NL_RTM_GETLINK
+ NL_RTM_SETLINK,
+#define NL_RTM_SETLINK NL_RTM_SETLINK
+ NL_RTM_NEWADDR = 20,
+#define NL_RTM_NEWADDR NL_RTM_NEWADDR
+ NL_RTM_DELADDR,
+#define NL_RTM_DELADDR NL_RTM_DELADDR
+ NL_RTM_GETADDR,
+#define NL_RTM_GETADDR NL_RTM_GETADDR
+ NL_RTM_NEWROUTE = 24,
+#define NL_RTM_NEWROUTE NL_RTM_NEWROUTE
+ NL_RTM_DELROUTE,
+#define NL_RTM_DELROUTE NL_RTM_DELROUTE
+ NL_RTM_GETROUTE,
+#define NL_RTM_GETROUTE NL_RTM_GETROUTE
+ NL_RTM_NEWNEIGH = 28,
+#define NL_RTM_NEWNEIGH NL_RTM_NEWNEIGH
+ NL_RTM_DELNEIGH,
+#define NL_RTM_DELNEIGH NL_RTM_DELNEIGH
+ NL_RTM_GETNEIGH,
+#define NL_RTM_GETNEIGH NL_RTM_GETNEIGH
+ NL_RTM_NEWRULE = 32,
+#define NL_RTM_NEWRULE NL_RTM_NEWRULE
+ NL_RTM_DELRULE,
+#define NL_RTM_DELRULE NL_RTM_DELRULE
+ NL_RTM_GETRULE,
+#define NL_RTM_GETRULE NL_RTM_GETRULE
+ NL_RTM_NEWQDISC = 36,
+#define NL_RTM_NEWQDISC NL_RTM_NEWQDISC
+ NL_RTM_DELQDISC,
+#define NL_RTM_DELQDISC NL_RTM_DELQDISC
+ NL_RTM_GETQDISC,
+#define NL_RTM_GETQDISC NL_RTM_GETQDISC
+ NL_RTM_NEWTCLASS = 40,
+#define NL_RTM_NEWTCLASS NL_RTM_NEWTCLASS
+ NL_RTM_DELTCLASS,
+#define NL_RTM_DELTCLASS NL_RTM_DELTCLASS
+ NL_RTM_GETTCLASS,
+#define NL_RTM_GETTCLASS NL_RTM_GETTCLASS
+ NL_RTM_NEWTFILTER = 44,
+#define NL_RTM_NEWTFILTER NL_RTM_NEWTFILTER
+ NL_RTM_DELTFILTER,
+#define NL_RTM_DELTFILTER NL_RTM_DELTFILTER
+ NL_RTM_GETTFILTER,
+#define NL_RTM_GETTFILTER NL_RTM_GETTFILTER
+ NL_RTM_NEWACTION = 48,
+#define NL_RTM_NEWACTION NL_RTM_NEWACTION
+ NL_RTM_DELACTION,
+#define NL_RTM_DELACTION NL_RTM_DELACTION
+ NL_RTM_GETACTION,
+#define NL_RTM_GETACTION NL_RTM_GETACTION
+ NL_RTM_NEWPREFIX = 52,
+#define NL_RTM_NEWPREFIX NL_RTM_NEWPREFIX
+ NL_RTM_GETMULTICAST = 58,
+#define NL_RTM_GETMULTICAST NL_RTM_GETMULTICAST
+ NL_RTM_GETANYCAST = 62,
+#define NL_RTM_GETANYCAST NL_RTM_GETANYCAST
+ NL_RTM_NEWNEIGHTBL = 64,
+#define NL_RTM_NEWNEIGHTBL NL_RTM_NEWNEIGHTBL
+ NL_RTM_GETNEIGHTBL = 66,
+#define NL_RTM_GETNEIGHTBL NL_RTM_GETNEIGHTBL
+ NL_RTM_SETNEIGHTBL,
+#define NL_RTM_SETNEIGHTBL NL_RTM_SETNEIGHTBL
+ NL_RTM_NEWNDUSEROPT = 68,
+#define NL_RTM_NEWNDUSEROPT NL_RTM_NEWNDUSEROPT
+ NL_RTM_NEWADDRLABEL = 72,
+#define NL_RTM_NEWADDRLABEL NL_RTM_NEWADDRLABEL
+ NL_RTM_DELADDRLABEL,
+#define NL_RTM_DELADDRLABEL NL_RTM_DELADDRLABEL
+ NL_RTM_GETADDRLABEL,
+#define NL_RTM_GETADDRLABEL NL_RTM_GETADDRLABEL
+ NL_RTM_GETDCB = 78,
+#define NL_RTM_GETDCB NL_RTM_GETDCB
+ NL_RTM_SETDCB,
+#define NL_RTM_SETDCB NL_RTM_SETDCB
+ NL_RTM_NEWNETCONF = 80,
+#define NL_RTM_NEWNETCONF NL_RTM_NEWNETCONF
+ NL_RTM_GETNETCONF = 82,
+#define NL_RTM_GETNETCONF NL_RTM_GETNETCONF
+ NL_RTM_NEWMDB = 84,
+#define NL_RTM_NEWMDB NL_RTM_NEWMDB
+ NL_RTM_DELMDB = 85,
+#define NL_RTM_DELMDB NL_RTM_DELMDB
+ NL_RTM_GETMDB = 86,
+#define NL_RTM_GETMDB NL_RTM_GETMDB
+ NL_RTM_NEWNSID = 88,
+#define NL_RTM_NEWNSID NL_RTM_NEWNSID
+ NL_RTM_DELNSID = 89,
+#define NL_RTM_DELNSID NL_RTM_DELNSID
+ NL_RTM_GETNSID = 90,
+#define NL_RTM_GETNSID NL_RTM_GETNSID
+ NL_RTM_NEWSTATS = 92,
+#define NL_RTM_NEWSTATS NL_RTM_NEWSTATS
+ NL_RTM_GETSTATS = 94,
+#define NL_RTM_GETSTATS NL_RTM_GETSTATS
+ NL_RTM_NEWNEXTHOP = 104,
+#define NL_RTM_NEWNEXTHOP NL_RTM_NEWNEXTHOP
+ NL_RTM_DELNEXTHOP,
+#define NL_RTM_DELNEXTHOP NL_RTM_DELNEXTHOP
+ NL_RTM_GETNEXTHOP,
+#define NL_RTM_GETNEXTHOP NL_RTM_GETNEXTHOP
+ __NL_RTM_MAX,
+};
+#define NL_RTM_MAX (((__NL_RTM_MAX + 3) & ~3) - 1) /* __NL_RTM_MAX rounded up to a multiple of 4, minus one */
+
+#ifndef _KERNEL
+/*
+ * The RTM_* names clash with the BSD rtsock namespace, so the kernel uses
+ * the NL_RTM_ prefix. Userland gets RTM_ aliases, currently only for the
+ * base, link, address, route and nexthop messages.
+ */
+#define RTM_BASE NL_RTM_BASE
+#define RTM_NEWLINK NL_RTM_NEWLINK
+#define RTM_DELLINK NL_RTM_DELLINK
+#define RTM_GETLINK NL_RTM_GETLINK
+#define RTM_SETLINK NL_RTM_SETLINK
+#define RTM_NEWADDR NL_RTM_NEWADDR
+#define RTM_DELADDR NL_RTM_DELADDR
+#define RTM_GETADDR NL_RTM_GETADDR
+#define RTM_NEWROUTE NL_RTM_NEWROUTE
+#define RTM_DELROUTE NL_RTM_DELROUTE
+#define RTM_GETROUTE NL_RTM_GETROUTE
+#define RTM_NEWNEXTHOP NL_RTM_NEWNEXTHOP
+#define RTM_DELNEXTHOP NL_RTM_DELNEXTHOP
+#define RTM_GETNEXTHOP NL_RTM_GETNEXTHOP
+#endif
+
+#ifndef _KERNEL
+/* Legacy rtnetlink multicast group bitmasks for userspace; each RTMGRP_x equals 1 << (RTNLGRP_x - 1) */
+#define RTMGRP_LINK 0x01
+#define RTMGRP_NOTIFY 0x02
+#define RTMGRP_NEIGH 0x04
+#define RTMGRP_TC 0x08
+
+#define RTMGRP_IPV4_IFADDR 0x10
+#define RTMGRP_IPV4_MROUTE 0x20
+#define RTMGRP_IPV4_ROUTE 0x40
+#define RTMGRP_IPV4_RULE 0x80
+
+#define RTMGRP_IPV6_IFADDR 0x100
+#define RTMGRP_IPV6_MROUTE 0x200
+#define RTMGRP_IPV6_ROUTE 0x400
+#define RTMGRP_IPV6_IFINFO 0x800
+
+#define RTMGRP_DECnet_IFADDR 0x1000
+#define RTMGRP_DECnet_ROUTE 0x4000
+
+#define RTMGRP_IPV6_PREFIX 0x20000
+#endif
+
+/* rtnetlink multicast group numbers; RTNLGRP_NONE is 0 and each enumerator is also a macro of the same name */
+enum rtnetlink_groups {
+ RTNLGRP_NONE,
+#define RTNLGRP_NONE RTNLGRP_NONE
+ RTNLGRP_LINK,
+#define RTNLGRP_LINK RTNLGRP_LINK
+ RTNLGRP_NOTIFY,
+#define RTNLGRP_NOTIFY RTNLGRP_NOTIFY
+ RTNLGRP_NEIGH,
+#define RTNLGRP_NEIGH RTNLGRP_NEIGH
+ RTNLGRP_TC,
+#define RTNLGRP_TC RTNLGRP_TC
+ RTNLGRP_IPV4_IFADDR,
+#define RTNLGRP_IPV4_IFADDR RTNLGRP_IPV4_IFADDR
+ RTNLGRP_IPV4_MROUTE,
+#define RTNLGRP_IPV4_MROUTE RTNLGRP_IPV4_MROUTE
+ RTNLGRP_IPV4_ROUTE,
+#define RTNLGRP_IPV4_ROUTE RTNLGRP_IPV4_ROUTE
+ RTNLGRP_IPV4_RULE,
+#define RTNLGRP_IPV4_RULE RTNLGRP_IPV4_RULE
+ RTNLGRP_IPV6_IFADDR,
+#define RTNLGRP_IPV6_IFADDR RTNLGRP_IPV6_IFADDR
+ RTNLGRP_IPV6_MROUTE,
+#define RTNLGRP_IPV6_MROUTE RTNLGRP_IPV6_MROUTE
+ RTNLGRP_IPV6_ROUTE,
+#define RTNLGRP_IPV6_ROUTE RTNLGRP_IPV6_ROUTE
+ RTNLGRP_IPV6_IFINFO,
+#define RTNLGRP_IPV6_IFINFO RTNLGRP_IPV6_IFINFO
+ RTNLGRP_DECnet_IFADDR,
+#define RTNLGRP_DECnet_IFADDR RTNLGRP_DECnet_IFADDR
+ RTNLGRP_NOP2,
+ RTNLGRP_DECnet_ROUTE,
+#define RTNLGRP_DECnet_ROUTE RTNLGRP_DECnet_ROUTE
+ RTNLGRP_DECnet_RULE,
+#define RTNLGRP_DECnet_RULE RTNLGRP_DECnet_RULE
+ RTNLGRP_NOP4,
+ RTNLGRP_IPV6_PREFIX,
+#define RTNLGRP_IPV6_PREFIX RTNLGRP_IPV6_PREFIX
+ RTNLGRP_IPV6_RULE,
+#define RTNLGRP_IPV6_RULE RTNLGRP_IPV6_RULE
+ RTNLGRP_ND_USEROPT,
+#define RTNLGRP_ND_USEROPT RTNLGRP_ND_USEROPT
+ RTNLGRP_PHONET_IFADDR,
+#define RTNLGRP_PHONET_IFADDR RTNLGRP_PHONET_IFADDR
+ RTNLGRP_PHONET_ROUTE,
+#define RTNLGRP_PHONET_ROUTE RTNLGRP_PHONET_ROUTE
+ RTNLGRP_DCB,
+#define RTNLGRP_DCB RTNLGRP_DCB
+ RTNLGRP_IPV4_NETCONF,
+#define RTNLGRP_IPV4_NETCONF RTNLGRP_IPV4_NETCONF
+ RTNLGRP_IPV6_NETCONF,
+#define RTNLGRP_IPV6_NETCONF RTNLGRP_IPV6_NETCONF
+ RTNLGRP_MDB,
+#define RTNLGRP_MDB RTNLGRP_MDB
+ RTNLGRP_MPLS_ROUTE,
+#define RTNLGRP_MPLS_ROUTE RTNLGRP_MPLS_ROUTE
+ RTNLGRP_NSID,
+#define RTNLGRP_NSID RTNLGRP_NSID
+ RTNLGRP_MPLS_NETCONF,
+#define RTNLGRP_MPLS_NETCONF RTNLGRP_MPLS_NETCONF
+ RTNLGRP_IPV4_MROUTE_R,
+#define RTNLGRP_IPV4_MROUTE_R RTNLGRP_IPV4_MROUTE_R
+ RTNLGRP_IPV6_MROUTE_R,
+#define RTNLGRP_IPV6_MROUTE_R RTNLGRP_IPV6_MROUTE_R
+ RTNLGRP_NEXTHOP,
+#define RTNLGRP_NEXTHOP RTNLGRP_NEXTHOP
+ RTNLGRP_BRVLAN,
+#define RTNLGRP_BRVLAN RTNLGRP_BRVLAN
+ __RTNLGRP_MAX
+};
+#define RTNLGRP_MAX (__RTNLGRP_MAX - 1) /* highest assigned group number */
+
+
+#endif
+
diff --git a/sys/netlink/route/iface.c b/sys/netlink/route/iface.c
new file mode 100644
--- /dev/null
+++ b/sys/netlink/route/iface.c
@@ -0,0 +1,718 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/sockio.h>
+#include <sys/syslog.h>
+#include <sys/socketvar.h>
+
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <net/if_media.h>
+#include <net/route.h>
+#include <net/route/nhop.h>
+#include <net/route/route_ctl.h>
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_route.h>
+#include <netlink/route/route_var.h>
+
+#include <netinet6/scope6_var.h> /* scope deembedding */
+
+#define DEBUG_MOD_NAME nl_iface
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_DEBUG);
+
+
+/*
+ * State shared by the dump handlers in this file: the reply writer,
+ * the template header for generated messages and bookkeeping counters.
+ */
+struct netlink_walkargs {
+ struct nlmsg_state *ns; /* message writer used for the replies */
+ struct nlmsghdr hdr; /* template header handed to the dump helpers */
+ struct nlpcb *so; /* netlink pcb the request arrived on */
+ uint32_t fibnum; /* not referenced by the code in this file */
+ int family; /* address family filter; 0 means "do not filter" */
+ int error;
+ int count; /* number of objects iterated over */
+ int dumped; /* number of objects reported as dumped */
+};
+
+#define FAIL_ATTR(a) {\
+ RT_LOG(LOG_DEBUG, "failed writing attribute %s (%d)", #a, a); \
+ goto enomem; \
+}
+
+static eventhandler_tag ifdetach_event, ifattach_event, ifaddr_event;
+
+/* */
+
+/*
+ * RTM_GETLINK request
+ * sendto(3, {{len=32, type=RTM_GETLINK, flags=NLM_F_REQUEST|NLM_F_DUMP, seq=1641940952, pid=0},
+ * {ifi_family=AF_INET, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}}, 32, 0, NULL, 0) = 32
+ *
+ * Reply:
+ * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_ETHER, ifi_index=if_nametoindex("enp0s31f6"), ifi_flags=IFF_UP|IFF_BROADCAST|IFF_RUNNING|IFF_MULTICAST|IFF_LOWER_UP, ifi_change=0},
+{{nla_len=10, nla_type=IFLA_ADDRESS}, "\xfe\x54\x00\x52\x3e\x90"}
+
+[
+{{nla_len=14, nla_type=IFLA_IFNAME}, "enp0s31f6"},
+{{nla_len=8, nla_type=IFLA_TXQLEN}, 1000},
+{{nla_len=5, nla_type=IFLA_OPERSTATE}, 6},
+{{nla_len=5, nla_type=IFLA_LINKMODE}, 0},
+{{nla_len=8, nla_type=IFLA_MTU}, 1500},
+{{nla_len=8, nla_type=IFLA_MIN_MTU}, 68},
+ {{nla_len=8, nla_type=IFLA_MAX_MTU}, 9000},
+{{nla_len=8, nla_type=IFLA_GROUP}, 0},
+{{nla_len=8, nla_type=IFLA_PROMISCUITY}, 0},
+{{nla_len=8, nla_type=IFLA_NUM_TX_QUEUES}, 1},
+{{nla_len=8, nla_type=IFLA_GSO_MAX_SEGS}, 65535},
+{{nla_len=8, nla_type=IFLA_GSO_MAX_SIZE}, 65536},
+{{nla_len=8, nla_type=IFLA_NUM_RX_QUEUES}, 1},
+{{nla_len=5, nla_type=IFLA_CARRIER}, 1},
+{{nla_len=13, nla_type=IFLA_QDISC}, "fq_codel"},
+{{nla_len=8, nla_type=IFLA_CARRIER_CHANGES}, 2},
+{{nla_len=5, nla_type=IFLA_PROTO_DOWN}, 0},
+{{nla_len=8, nla_type=IFLA_CARRIER_UP_COUNT}, 1},
+{{nla_len=8, nla_type=IFLA_CARRIER_DOWN_COUNT}, 1},
+ */
+
+/* Link state values emitted as the IFLA_OPERSTATE and IFLA_CARRIER attributes */
+struct if_state {
+ uint8_t ifla_operstate; /* one of the IF_OPER_* values */
+ uint8_t ifla_carrier; /* 1 when carrier is present, 0 otherwise */
+};
+
+/*
+ * Derives operational state and carrier of an Ethernet interface from the
+ * result of the SIOCGIFMEDIA ioctl.
+ * @ifp: interface to query
+ * @pstate: state to update; left untouched when the ioctl fails or the
+ *  active media is not Ethernet
+ */
+static void
+get_operstate_ether(struct ifnet *ifp, struct if_state *pstate)
+{
+	struct ifmediareq media_req = {};
+	int rc;
+
+	rc = (*ifp->if_ioctl)(ifp, SIOCGIFMEDIA, (void *)&media_req);
+	if (rc != 0) {
+		RT_LOG(LOG_DEBUG, "error calling SIOCGIFMEDIA on %s: %d",
+		    if_name(ifp), rc);
+		return;
+	}
+
+	/* Only Ethernet media is interpreted here */
+	if (IFM_TYPE(media_req.ifm_active) != IFM_ETHER)
+		return;
+
+	/* No active link: report the interface as down, carrier stays as is */
+	if ((media_req.ifm_status & IFM_ACTIVE) == 0) {
+		pstate->ifla_operstate = IF_OPER_DOWN;
+		return;
+	}
+
+	pstate->ifla_carrier = 1;
+	/* Monitor-mode interfaces are reported as dormant rather than up */
+	pstate->ifla_operstate = (ifp->if_flags & IFF_MONITOR) ?
+	    IF_OPER_DORMANT : IF_OPER_UP;
+}
+
+/*
+ * Appends an IFLA_STATS64 attribute carrying the interface counters.
+ * @ns: message writer
+ * @ifp: interface whose counters are exported
+ *
+ * Returns false if the attribute space cannot be reserved, true otherwise.
+ */
+static bool
+get_stats(struct nlmsg_state *ns, struct ifnet *ifp)
+{
+	struct rtnl_link_stats64 *stats;
+
+	int nla_len = sizeof(struct nlattr) + sizeof(*stats);
+	struct nlattr *nla = nlmsg_reserve_data(ns, nla_len, struct nlattr);
+	if (nla == NULL)
+		return (false);
+	nla->nla_type = IFLA_STATS64;
+	nla->nla_len = nla_len;
+	stats = (struct rtnl_link_stats64 *)(nla + 1);
+
+	/*
+	 * Only a subset of the counters is filled in below.  Zero the whole
+	 * structure first so that the remaining fields are reported as 0
+	 * instead of whatever the reserved buffer happened to contain
+	 * (this code does not rely on the reserve helper clearing memory).
+	 */
+	memset(stats, 0, sizeof(*stats));
+
+	stats->rx_packets = ifp->if_get_counter(ifp, IFCOUNTER_IPACKETS);
+	stats->tx_packets = ifp->if_get_counter(ifp, IFCOUNTER_OPACKETS);
+
+	stats->rx_bytes = ifp->if_get_counter(ifp, IFCOUNTER_IBYTES);
+	stats->tx_bytes = ifp->if_get_counter(ifp, IFCOUNTER_OBYTES);
+	stats->rx_errors = ifp->if_get_counter(ifp, IFCOUNTER_IERRORS);
+	stats->tx_errors = ifp->if_get_counter(ifp, IFCOUNTER_OERRORS);
+	stats->rx_dropped = ifp->if_get_counter(ifp, IFCOUNTER_IQDROPS);
+	stats->tx_dropped = ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS);
+	stats->multicast = ifp->if_get_counter(ifp, IFCOUNTER_IMCASTS);
+	stats->rx_nohandler = ifp->if_get_counter(ifp, IFCOUNTER_NOPROTO);
+
+	return (true);
+}
+
+/*
+ * Fills @pstate with the operational state and carrier of @ifp.
+ * Defaults to IF_OPER_UNKNOWN / no carrier; only Ethernet and loopback
+ * interface types are examined further.
+ */
+static void
+get_operstate(struct ifnet *ifp, struct if_state *pstate)
+{
+	/* Defaults for every interface type not handled below */
+	pstate->ifla_operstate = IF_OPER_UNKNOWN;
+	pstate->ifla_carrier = 0; /* no carrier */
+
+	if (ifp->if_type == IFT_ETHER) {
+		get_operstate_ether(ifp, pstate);
+		return;
+	}
+
+	if (ifp->if_type != IFT_LOOP)
+		return;
+
+	/* Loopback: state follows the administrative IFF_UP flag */
+	if ((ifp->if_flags & IFF_UP) == 0) {
+		pstate->ifla_operstate = IF_OPER_DOWN;
+		return;
+	}
+	pstate->ifla_operstate = IF_OPER_UP;
+	pstate->ifla_carrier = 1;
+}
+
+/*
+ * Returns the flag word reported in ifi_flags: the union of the
+ * interface's if_flags and if_drv_flags bits.
+ */
+static unsigned
+ifp_flags_to_netlink(const struct ifnet *ifp)
+{
+	unsigned nl_flags = ifp->if_flags;
+
+	nl_flags |= ifp->if_drv_flags;
+	return (nl_flags);
+}
+
+#define LLADDR_CONST(s) ((const void *)((s)->sdl_data + (s)->sdl_nlen))
+/*
+ * Writes the address stored in @sa as the payload of attribute @attr.
+ * @ns: message writer
+ * @attr: attribute type to emit
+ * @sa: AF_INET, AF_INET6 or AF_LINK sockaddr; may be NULL
+ *
+ * A NULL @sa or an unsupported family is not an error: nothing is written
+ * and true is returned.  Otherwise returns the result of nlattr_add().
+ */
+static bool
+dump_sa(struct nlmsg_state *ns, int attr, const struct sockaddr *sa)
+{
+	const struct sockaddr_dl *sdl;
+	const void *payload;
+	struct in6_addr plain6;
+	uint32_t payload_len, zoneid;
+
+	if (sa == NULL)
+		return (true);
+
+	if (sa->sa_family == AF_INET) {
+		payload = &((const struct sockaddr_in *)sa)->sin_addr;
+		payload_len = sizeof(struct in_addr);
+	} else if (sa->sa_family == AF_INET6) {
+		/*
+		 * Only the address copy produced by in6_splitscope() is
+		 * exported; the scope value it returns is discarded.
+		 */
+		in6_splitscope(&((const struct sockaddr_in6 *)sa)->sin6_addr,
+		    &plain6, &zoneid);
+		payload = &plain6;
+		payload_len = sizeof(struct in6_addr);
+	} else if (sa->sa_family == AF_LINK) {
+		sdl = (const struct sockaddr_dl *)sa;
+		payload = LLADDR_CONST(sdl);
+		payload_len = sdl->sdl_alen;
+	} else {
+		RT_LOG(LOG_DEBUG, "unsupported family: %d, skipping", sa->sa_family);
+		return (true);
+	}
+
+	return (nlattr_add(ns, attr, payload_len, payload));
+}
+
+/*
+ * Dumps interface state, properties and metrics.
+ * @ns: message writer
+ * @ifp: target interface
+ * @hdr: template header
+ *
+ * Writes a single message consisting of a struct ifinfomsg header followed
+ * by the IFLA_IFNAME, IFLA_OPERSTATE, IFLA_CARRIER, IFLA_ADDRESS,
+ * IFLA_BROADCAST, IFLA_MTU, IFLA_STATS64 and IFLA_PROMISCUITY attributes
+ * (address attributes only when the interface has them).
+ *
+ * Returns true on success.  On any write failure the partially built
+ * message is discarded with nlmsg_abort() and false is returned.
+ *
+ * This function is called without epoch and MAY sleep.
+ */
+static bool
+dump_iface(struct nlmsg_state *ns, struct ifnet *ifp, const struct nlmsghdr *hdr)
+{
+ struct ifinfomsg *ifinfo;
+
+ RT_LOG(LOG_DEBUG3, "dumping interface %s data", if_name(ifp));
+
+ if (!nlmsg_reply(ns, hdr, sizeof(struct ifinfomsg)))
+ goto enomem;
+
+ ifinfo = nlmsg_reserve_object(ns, struct ifinfomsg);
+ if (ifinfo == NULL)
+ goto enomem;
+ ifinfo->ifi_family = AF_UNSPEC;
+ ifinfo->__ifi_pad = 0;
+ ifinfo->ifi_type = ifp->if_type; // NOTE(review): ifi_type is documented as ARPHRD_*, if_type holds IFT_* -- confirm mapping
+ ifinfo->ifi_index = ifp->if_index;
+ ifinfo->ifi_flags = ifp_flags_to_netlink(ifp);
+ ifinfo->ifi_change = 0;
+
+ if (!nlattr_add_string(ns, IFLA_IFNAME, if_name(ifp)))
+ goto enomem;
+
+ /* Zero-initialized; get_operstate() fills in operstate and carrier */
+ struct if_state ifs = {};
+ get_operstate(ifp, &ifs);
+
+ if (!nlattr_add_u8(ns, IFLA_OPERSTATE, ifs.ifla_operstate))
+ goto enomem;
+
+ if (!nlattr_add_u8(ns, IFLA_CARRIER, ifs.ifla_carrier))
+ goto enomem;
+
+/*
+ if (!nlattr_add_u8(ns, IFLA_PROTO_DOWN, val))
+ goto enomem;
+
+ if (!nlattr_add_u8(ns, IFLA_LINKMODE, val))
+ goto enomem;
+*/
+ /* Link-level address, taken from the interface's if_addr entry */
+ if ((ifp->if_addr != NULL)) {
+ if (!dump_sa(ns, IFLA_ADDRESS, ifp->if_addr->ifa_addr))
+ goto enomem;
+ }
+
+ /* Link-level broadcast address: if_addrlen raw bytes */
+ if ((ifp->if_broadcastaddr != NULL)) {
+ if (!nlattr_add(ns, IFLA_BROADCAST, ifp->if_addrlen,
+ ifp->if_broadcastaddr))
+ goto enomem;
+ }
+
+ if (!nlattr_add_u32(ns, IFLA_MTU, ifp->if_mtu))
+ goto enomem;
+/*
+ if (!nlattr_add_u32(ns, IFLA_MIN_MTU, 60))
+ goto enomem;
+
+ if (!nlattr_add_u32(ns, IFLA_MAX_MTU, 9000))
+ goto enomem;
+
+ if (!nlattr_add_u32(ns, IFLA_GROUP, 0))
+ goto enomem;
+*/
+ if (!get_stats(ns, ifp))
+ goto enomem;
+
+ /* Reported as 0 or 1 depending on IFF_PROMISC, not as a counter */
+ uint32_t val = (ifp->if_flags & IFF_PROMISC) != 0;
+ if (!nlattr_add_u32(ns, IFLA_PROMISCUITY, val))
+ goto enomem;
+
+ nlmsg_end(ns);
+
+ return (true);
+
+enomem:
+ /* Common failure exit: drop the partially written message */
+ RT_LOG(LOG_DEBUG, "unable to dump interface %s state (ENOMEM)", if_name(ifp));
+ nlmsg_abort(ns);
+ return (false);
+}
+
+/*
+ * Link selection criteria of an RTM_GETLINK request: two fields copied
+ * from the struct ifinfomsg header plus the attributes parsed via ps[].
+ */
+struct nl_parsed_link {
+ char *ifla_group; /* IFLA_GROUP, parsed as a string */
+ char *ifla_ifname; /* IFLA_IFNAME or IFLA_ALT_IFNAME */
+ unsigned short ifi_type; /* ifi_type from the request header; 0 = any */
+ int ifi_index; /* ifi_index from the request header; 0 = any */
+};
+/* Offset of a field inside struct nl_parsed_link, for the parser table */
+#define _OFF_S(_field) offsetof(struct nl_parsed_link, _field)
+
+/*
+ * Attribute parser table for RTM_GETLINK requests.
+ * IFLA_IFNAME and IFLA_ALT_IFNAME share the same destination field.
+ */
+static struct nlattr_parser ps[] = {
+ { .type = IFLA_IFNAME, .off = _OFF_S(ifla_ifname), .cb = nlattr_get_string },
+ { .type = IFLA_GROUP, .off = _OFF_S(ifla_group), .cb = nlattr_get_string },
+ { .type = IFLA_ALT_IFNAME, .off = _OFF_S(ifla_ifname), .cb = nlattr_get_string },
+};
+
+/*
+ * Checks whether interface @ifp satisfies the request filters in @attrs.
+ * A zero ifi_index / ifi_type or a NULL ifla_ifname means "no filter" for
+ * that criterion.
+ *
+ * Returns true when every specified criterion matches.
+ */
+static bool
+match_iface(struct nl_parsed_link *attrs, struct ifnet *ifp)
+{
+	if (attrs->ifi_index != 0 && attrs->ifi_index != ifp->if_index)
+		return (false);
+	/* Compare the requested type (not the index) with the interface type */
+	if (attrs->ifi_type != 0 && attrs->ifi_type != ifp->if_type)
+		return (false);
+	if (attrs->ifla_ifname != NULL &&
+	    strcmp(attrs->ifla_ifname, if_name(ifp)) != 0)
+		return (false);
+	/* TODO: add group match */
+
+	return (true);
+}
+
+/*
+ * RTM_GETLINK handler: reports the interfaces matching the request.
+ * @hdr: request message; its payload starts with struct ifinfomsg
+ * @nlp: netlink pcb the request arrived on
+ * @npt: parse tracker, also provides the reply writer
+ *
+ * A non-zero ifi_index selects a single interface and replies with that
+ * interface only.  Otherwise all interfaces are filtered through
+ * match_iface() and the result is sent as a multipart dump.
+ *
+ * Returns 0 on success, ESRCH if the explicitly requested interface does
+ * not exist or does not match, ENOMEM on allocation or write failure, or
+ * the error returned by the attribute parser.
+ */
+static int
+rtnl_handle_getlink(struct nlmsghdr *hdr, struct nlpcb *nlp, struct netlink_parse_tracker *npt)
+{
+	struct epoch_tracker et;
+	struct ifnet *ifp;
+	int error = 0;
+
+	struct ifinfomsg *ifm = (struct ifinfomsg *)nlmsg_data(hdr);
+
+	struct nl_parsed_link attrs = {
+		.ifi_type = ifm->ifi_type,
+		.ifi_index = ifm->ifi_index,
+	};
+	error = nl_parse_attrs(hdr, sizeof(*ifm), ps, sizeof(ps)/sizeof(ps[0]), npt, &attrs);
+	if (error != 0)
+		return (error);
+
+	struct netlink_walkargs wa = {
+		.so = nlp,
+		.ns = npt->ns,
+		.hdr.nlmsg_pid = hdr->nlmsg_pid,
+		.hdr.nlmsg_seq = hdr->nlmsg_seq,
+		.hdr.nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI,
+		.hdr.nlmsg_type = NL_RTM_NEWLINK,
+	};
+
+	/* Fast track for an interface w/ explicit index match */
+	if (attrs.ifi_index != 0) {
+		NET_EPOCH_ENTER(et);
+		ifp = ifnet_byindex_ref(attrs.ifi_index);
+		NET_EPOCH_EXIT(et);
+		if (ifp != NULL) {
+			if (match_iface(&attrs, ifp)) {
+				if (!dump_iface(wa.ns, ifp, &wa.hdr))
+					error = ENOMEM;
+			} else
+				error = ESRCH;
+			if_rele(ifp);
+		} else
+			error = ESRCH;
+		return (error);
+	}
+
+	/*
+	 * Fetching some link properties requires performing ioctls that may
+	 * block.  Address it by saving referenced pointers of the matching
+	 * links, exiting from epoch and going through the list one-by-one.
+	 */
+
+	RT_LOG(LOG_DEBUG2, "Start dump");
+
+	struct ifnet **match_array;
+	int offset = 0, base_count = 16; /* start with 128 bytes */
+
+	/* M_NOWAIT allocations may fail: never touch the array unchecked */
+	match_array = malloc(base_count * sizeof(*match_array), M_TEMP, M_NOWAIT);
+	if (match_array == NULL) {
+		error = ENOMEM;
+		goto done;
+	}
+
+	NET_EPOCH_ENTER(et);
+	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
+		wa.count++;
+		if (!match_iface(&attrs, ifp))
+			continue;
+		if (offset == base_count) {
+			/* Array is full: double it before storing this match */
+			struct ifnet **new_array;
+			size_t sz = base_count * sizeof(*match_array);
+
+			new_array = malloc(sz * 2, M_TEMP, M_NOWAIT);
+			if (new_array == NULL) {
+				error = ENOMEM;
+				break;
+			}
+			memcpy(new_array, match_array, sz);
+			free(match_array, M_TEMP);
+			match_array = new_array;
+			base_count *= 2;
+		}
+		/* Skip interfaces that can no longer be referenced */
+		if (!if_try_ref(ifp))
+			continue;
+		match_array[offset++] = ifp;
+		wa.dumped++;
+	}
+	NET_EPOCH_EXIT(et);
+
+	RT_LOG(LOG_DEBUG2, "Matched %d interface(s), dumping", wa.dumped);
+	for (int i = 0; error == 0 && i < offset; i++) {
+		if (!dump_iface(wa.ns, match_array[i], &wa.hdr))
+			error = ENOMEM;
+	}
+	/* Drop every reference taken above, even if the dump stopped early */
+	for (int i = 0; i < offset; i++)
+		if_rele(match_array[i]);
+	free(match_array, M_TEMP);
+
+done:
+	RT_LOG(LOG_DEBUG2, "End dump, iterated %d dumped %d", wa.count, wa.dumped);
+
+	if (!nlmsg_end_dump(wa.ns, error, &wa.hdr)) {
+		RT_LOG(LOG_DEBUG, "Unable to finalize the dump");
+		return (ENOMEM);
+	}
+
+	return (error);
+}
+
+
+/*
+
+{ifa_family=AF_INET, ifa_prefixlen=8, ifa_flags=IFA_F_PERMANENT, ifa_scope=RT_SCOPE_HOST, ifa_index=if_nametoindex("lo")},
+ [
+ {{nla_len=8, nla_type=IFA_ADDRESS}, inet_addr("127.0.0.1")},
+ {{nla_len=8, nla_type=IFA_LOCAL}, inet_addr("127.0.0.1")},
+ {{nla_len=7, nla_type=IFA_LABEL}, "lo"},
+ {{nla_len=8, nla_type=IFA_FLAGS}, IFA_F_PERMANENT},
+ {{nla_len=20, nla_type=IFA_CACHEINFO}, {ifa_prefered=4294967295, ifa_valid=4294967295, cstamp=3619, tstamp=3619}}]},
+---
+
+{{len=72, type=RTM_NEWADDR, flags=NLM_F_MULTI, seq=1642191126, pid=566735},
+ {ifa_family=AF_INET6, ifa_prefixlen=96, ifa_flags=IFA_F_PERMANENT, ifa_scope=RT_SCOPE_UNIVERSE, ifa_index=if_nametoindex("virbr0")},
+ [
+ {{nla_len=20, nla_type=IFA_ADDRESS}, inet_pton(AF_INET6, "2a01:4f8:13a:70c:ffff::1")},
+ {{nla_len=20, nla_type=IFA_CACHEINFO}, {ifa_prefered=4294967295, ifa_valid=4294967295, cstamp=4283, tstamp=4283}},
+ {{nla_len=8, nla_type=IFA_FLAGS}, IFA_F_PERMANENT}]},
+*/
+
+/*
+ * Returns the RT_SCOPE_* value reported for interface address @ifa:
+ * RT_SCOPE_HOST for loopback addresses, RT_SCOPE_LINK for link-local
+ * ones and RT_SCOPE_UNIVERSE for everything else (including families
+ * other than AF_INET / AF_INET6).
+ */
+static uint8_t
+ifa_get_scope(const struct ifaddr *ifa)
+{
+	const struct sockaddr *sa;
+	uint8_t addr_scope = RT_SCOPE_UNIVERSE;
+
+	sa = ifa->ifa_addr;
+	switch (sa->sa_family) {
+	case AF_INET:
+		{
+			/*
+			 * sin_addr is kept in network byte order, while the
+			 * IN_LOOPBACK() / IN_LINKLOCAL() classification
+			 * macros operate on host byte order values.
+			 */
+			in_addr_t addr;
+			addr = ntohl(((const struct sockaddr_in *)sa)->sin_addr.s_addr);
+			if (IN_LOOPBACK(addr))
+				addr_scope = RT_SCOPE_HOST;
+			else if (IN_LINKLOCAL(addr))
+				addr_scope = RT_SCOPE_LINK;
+			break;
+		}
+	case AF_INET6:
+		{
+			const struct in6_addr *addr;
+			addr = &((const struct sockaddr_in6 *)sa)->sin6_addr;
+			if (IN6_IS_ADDR_LOOPBACK(addr))
+				addr_scope = RT_SCOPE_HOST;
+			else if (IN6_IS_ADDR_LINKLOCAL(addr))
+				addr_scope = RT_SCOPE_LINK;
+			break;
+		}
+	}
+
+	return (addr_scope);
+}
+
+/*
+ * Returns the number of bits set in the IPv6 netmask @addr, i.e. the
+ * prefix length for a contiguous mask.
+ */
+static uint8_t
+inet6_get_plen(const struct in6_addr *addr)
+{
+	int bits = 0;
+
+	/* The 128-bit mask is processed as four 32-bit words */
+	for (int i = 0; i < 4; i++)
+		bits += bitcount32(addr->s6_addr32[i]);
+
+	return (bits);
+}
+
+/*
+ * Converts a netmask sockaddr into a prefix length by counting set bits.
+ * @sa: AF_INET or AF_INET6 netmask; may be NULL
+ *
+ * Returns 0 for a NULL @sa (the family, and thus the host prefix length,
+ * cannot be determined without it) and for unsupported families.
+ */
+static uint8_t
+get_sa_plen(const struct sockaddr *sa)
+{
+	const struct in6_addr *paddr6;
+	const struct in_addr *paddr;
+
+	/* Must be checked before sa_family is read */
+	if (sa == NULL)
+		return (0);
+
+	switch (sa->sa_family) {
+	case AF_INET:
+		paddr = &(((const struct sockaddr_in *)sa)->sin_addr);
+		return (bitcount32(paddr->s_addr));
+	case AF_INET6:
+		paddr6 = &(((const struct sockaddr_in6 *)sa)->sin6_addr);
+		return (inet6_get_plen(paddr6));
+	default:
+		return (0);
+	}
+}
+
+
+/*
+ * {'attrs': [('IFA_ADDRESS', '12.0.0.1'),
+ ('IFA_LOCAL', '12.0.0.1'),
+ ('IFA_LABEL', 'eth10'),
+ ('IFA_FLAGS', 128),
+ ('IFA_CACHEINFO', {'ifa_preferred': 4294967295, 'ifa_valid': 4294967295, 'cstamp': 63745746, 'tstamp': 63745746})],
+ */
+/*
+ * Writes one interface address message.
+ * @ns: message writer
+ * @ifp: interface the address belongs to
+ * @ifa: address to dump
+ * @hdr: template header
+ *
+ * The message consists of a struct ifaddrmsg header followed by the
+ * IFA_ADDRESS, IFA_LOCAL, IFA_LABEL and IFA_FLAGS attributes.
+ *
+ * Returns true on success.  On failure the partially built message is
+ * discarded with nlmsg_abort() and false is returned.
+ */
+static bool
+dump_iface_addr(struct nlmsg_state *ns, struct ifnet *ifp, struct ifaddr *ifa,
+ const struct nlmsghdr *hdr)
+{
+ struct ifaddrmsg *ifamsg;
+ struct sockaddr *sa = ifa->ifa_addr;
+
+ RT_LOG(LOG_DEBUG3, "dumping ifa %p type %s(%d) for interface %s",
+ ifa, rib_print_family(sa->sa_family), sa->sa_family, if_name(ifp));
+
+ if (!nlmsg_reply(ns, hdr, sizeof(struct ifaddrmsg)))
+ goto enomem;
+
+ ifamsg = nlmsg_reserve_object(ns, struct ifaddrmsg);
+ if (ifamsg == NULL)
+ goto enomem;
+ ifamsg->ifa_family = sa->sa_family;
+ ifamsg->ifa_prefixlen = get_sa_plen(ifa->ifa_netmask);
+ ifamsg->ifa_flags = 0; // ifa_flags is useless
+ ifamsg->ifa_scope = ifa_get_scope(ifa);
+ ifamsg->ifa_index = ifp->if_index;
+
+ /*
+  * IFA_ADDRESS carries the destination address when one of the same
+  * family is set, and the local address otherwise.
+  */
+ struct sockaddr *dst_sa = ifa->ifa_dstaddr;
+ if ((dst_sa == NULL) || (dst_sa->sa_family != sa->sa_family))
+ dst_sa = sa;
+ if (!dump_sa(ns, IFA_ADDRESS, dst_sa))
+ FAIL_ATTR(IFA_ADDRESS);
+ if (!dump_sa(ns, IFA_LOCAL, sa))
+ FAIL_ATTR(IFA_LOCAL);
+
+ /* The interface name is used as the address label */
+ if (!nlattr_add_string(ns, IFA_LABEL, if_name(ifp)))
+ FAIL_ATTR(IFA_LABEL);
+ /* No address flags are exported yet: the attribute is always 0 */
+ uint32_t val = 0; // ifa->ifa_flags;
+ if (!nlattr_add_u32(ns, IFA_FLAGS, val))
+ FAIL_ATTR(IFA_FLAGS);
+
+ nlmsg_end(ns);
+ return (true);
+enomem:
+ /* Common failure exit: drop the partially written message */
+ RT_LOG(LOG_DEBUG, "Failed to dump ifa type %s(%d) for interface %s",
+ rib_print_family(sa->sa_family), sa->sa_family, if_name(ifp));
+ nlmsg_abort(ns);
+ return (false);
+}
+
+/*
+ * RTM_GETADDR handler: dumps the addresses of all interfaces.
+ * @hdr: request message; its payload starts with struct ifaddrmsg
+ * @nlp: netlink pcb the request arrived on
+ * @npt: parse tracker, also provides the reply writer
+ *
+ * Link-level (AF_LINK) addresses are never reported.  A non-zero
+ * ifa_family in the request restricts the dump to that address family.
+ *
+ * Returns 0 on success or ENOMEM if a message could not be written.
+ */
+static int
+rtnl_handle_getaddr(struct nlmsghdr *hdr, struct nlpcb *nlp, struct netlink_parse_tracker *npt)
+{
+	struct ifaddr *ifa;
+	struct ifnet *ifp;
+	int error = 0;
+
+	/*
+	 * The handler table declares sizeof(struct ifaddrmsg) for this
+	 * message type, so the header is read the same way the RTM_GETLINK
+	 * handler reads its struct ifinfomsg.
+	 */
+	struct ifaddrmsg *ifam = (struct ifaddrmsg *)nlmsg_data(hdr);
+
+	struct netlink_walkargs wa = {
+		.so = nlp,
+		.ns = npt->ns,
+		/* Without this the family check below could never trigger */
+		.family = ifam->ifa_family,
+		.hdr.nlmsg_pid = hdr->nlmsg_pid,
+		.hdr.nlmsg_seq = hdr->nlmsg_seq,
+		.hdr.nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI,
+		.hdr.nlmsg_type = NL_RTM_NEWADDR,
+	};
+
+	RT_LOG(LOG_DEBUG2, "Start dump");
+
+	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
+		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+			if (wa.family != 0 && wa.family != ifa->ifa_addr->sa_family)
+				continue;
+			if (ifa->ifa_addr->sa_family == AF_LINK)
+				continue;
+			wa.count++;
+			if (!dump_iface_addr(wa.ns, ifp, ifa, &wa.hdr)) {
+				error = ENOMEM;
+				break;
+			}
+			wa.dumped++;
+		}
+		/* Propagate the inner-loop failure out of the outer loop */
+		if (error != 0)
+			break;
+	}
+
+	RT_LOG(LOG_DEBUG2, "End dump, iterated %d dumped %d", wa.count, wa.dumped);
+
+	if (!nlmsg_end_dump(wa.ns, error, &wa.hdr)) {
+		RT_LOG(LOG_DEBUG, "Unable to finalize the dump");
+		return (ENOMEM);
+	}
+
+	return (error);
+}
+
+/*
+ * rt_addrmsg event handler: notifies the listeners of the IPv4/IPv6
+ * address multicast group about an added or removed interface address.
+ * @ifa: address the event is about
+ * @cmd: RTM_DELETE produces a DELADDR message, anything else NEWADDR
+ */
+static void
+rtnl_handle_ifaddr(void *arg __unused, struct ifaddr *ifa, int cmd)
+{
+	struct nlmsg_state writer = {};
+	struct nlmsghdr msg_hdr = {};
+	uint32_t mcast_group;
+
+	if (ifa->ifa_addr->sa_family == AF_INET)
+		mcast_group = RTNLGRP_IPV4_IFADDR;
+	else if (ifa->ifa_addr->sa_family == AF_INET6)
+		mcast_group = RTNLGRP_IPV6_IFADDR;
+	else {
+		RT_LOG(LOG_DEBUG2, "ifa notification for unknown AF: %d",
+		    ifa->ifa_addr->sa_family);
+		return;
+	}
+
+	/* Nothing to build if nobody subscribed to the group */
+	if (!nl_has_listeners(NETLINK_ROUTE, mcast_group))
+		return;
+
+	if (!nlmsg_get_group_writer(NLMSG_LARGE, mcast_group, &writer)) {
+		RT_LOG(LOG_DEBUG, "error allocating group writer");
+		return;
+	}
+
+	if (cmd == RTM_DELETE)
+		msg_hdr.nlmsg_type = NL_RTM_DELADDR;
+	else
+		msg_hdr.nlmsg_type = NL_RTM_NEWADDR;
+
+	dump_iface_addr(&writer, ifa->ifa_ifp, ifa, &msg_hdr);
+	nlmsg_flush(&writer);
+}
+
+/*
+ * Sends a link message of type @msg_type describing @ifp to the
+ * RTNLGRP_LINK multicast group.  Does nothing when the group has no
+ * listeners or the group writer cannot be obtained.
+ */
+static void
+notify_link_event(struct ifnet *ifp, int msg_type)
+{
+	struct nlmsghdr hdr = { .nlmsg_type = msg_type };
+	struct nlmsg_state ns = {};
+
+	if (!nl_has_listeners(NETLINK_ROUTE, RTNLGRP_LINK))
+		return;
+
+	if (!nlmsg_get_group_writer(NLMSG_LARGE, RTNLGRP_LINK, &ns)) {
+		RT_LOG(LOG_DEBUG, "error allocating mbuf");
+		return;
+	}
+	dump_iface(&ns, ifp, &hdr);
+	nlmsg_flush(&ns);
+}
+
+/* ifnet_arrival_event handler: announces a newly attached interface */
+static void
+rtnl_handle_ifattach(void *arg __unused, struct ifnet *ifp)
+{
+	notify_link_event(ifp, NL_RTM_NEWLINK);
+}
+
+/* ifnet_departure_event handler: announces an interface being detached */
+static void
+rtnl_handle_ifdetach(void *arg __unused, struct ifnet *ifp)
+{
+	notify_link_event(ifp, NL_RTM_DELLINK);
+}
+
+/*
+ * Message handlers provided by this file.  Each entry lists the message
+ * type, its printable name, the handler, the size of the message-specific
+ * header and optional flags.  RTM_GETLINK is flagged RTNL_F_NOEPOCH; its
+ * handler enters the net epoch on its own where needed.
+ */
+static struct rtnl_cmd_handler cmd_handlers[] = {
+ { NL_RTM_GETLINK, "RTM_GETLINK", &rtnl_handle_getlink, sizeof(struct ifinfomsg), RTNL_F_NOEPOCH },
+ { NL_RTM_GETADDR, "RTM_GETADDR", &rtnl_handle_getaddr, sizeof(struct ifaddrmsg)},
+};
+
+/*
+ * Subscribes to interface arrival/departure and address events and
+ * registers the RTM_GETLINK / RTM_GETADDR message handlers.
+ * The event tags are kept in file-scope variables for later removal.
+ */
+void
+rtnl_ifaces_init(void)
+{
+ ifattach_event = EVENTHANDLER_REGISTER(
+ ifnet_arrival_event, rtnl_handle_ifattach, NULL,
+ EVENTHANDLER_PRI_ANY);
+ ifdetach_event = EVENTHANDLER_REGISTER(
+ ifnet_departure_event, rtnl_handle_ifdetach, NULL,
+ EVENTHANDLER_PRI_ANY);
+ ifaddr_event = EVENTHANDLER_REGISTER(
+ rt_addrmsg, rtnl_handle_ifaddr, NULL,
+ EVENTHANDLER_PRI_ANY);
+ rtnl_register_messages(cmd_handlers, RTNL_ARRAY_LEN(cmd_handlers));
+}
+
+/*
+ * Removes the event subscriptions made by rtnl_ifaces_init().
+ * NOTE(review): the message handlers registered in rtnl_ifaces_init() are
+ * not unregistered here -- confirm this is handled elsewhere.
+ */
+void
+rtnl_ifaces_destroy(void)
+{
+ EVENTHANDLER_DEREGISTER(ifnet_arrival_event, ifattach_event);
+ EVENTHANDLER_DEREGISTER(ifnet_departure_event, ifdetach_event);
+ EVENTHANDLER_DEREGISTER(rt_addrmsg, ifaddr_event);
+}
diff --git a/sys/netlink/route/ifaddrs.h b/sys/netlink/route/ifaddrs.h
new file mode 100644
--- /dev/null
+++ b/sys/netlink/route/ifaddrs.h
@@ -0,0 +1,74 @@
+/*
+ * Interface address-related (RTM_<NEW|DEL|GET>ADDR) message header and attributes.
+ */
+#ifndef _NETLINK_ROUTE_IFADDRS_H_
+#define _NETLINK_ROUTE_IFADDRS_H_
+
+/*
+ * Base header for all of the relevant messages.
+ * The 8-bit ifa_flags field is superseded by the u32 IFA_FLAGS attribute
+ * when that attribute is present.
+ */
+struct ifaddrmsg {
+ uint8_t ifa_family; /* Address family */
+ uint8_t ifa_prefixlen; /* Prefix length */
+ uint8_t ifa_flags; /* Address-specific flags */
+ uint8_t ifa_scope; /* Address scope */
+ uint32_t ifa_index; /* Link ifindex */
+};
+
+#ifndef _KERNEL
+#define _NL_IFA_HDRLEN ((int)sizeof(struct ifaddrmsg))
+#define IFA_RTA(_ifa) ((struct rtattr *)(NL_ITEM_DATA(_ifa, _NL_IFA_HDRLEN)))
+#define IFA_PAYLOAD(_hdr) NLMSG_PAYLOAD(_hdr, _NL_IFA_HDRLEN)
+#endif
+
+/*
+ * Important comment:
+ * IFA_ADDRESS is prefix address, rather than local interface address.
+ * It makes no difference for normally configured broadcast interfaces,
+ * but for point-to-point IFA_ADDRESS is DESTINATION address,
+ * local address is supplied in IFA_LOCAL attribute.
+ *
+ * IFA_FLAGS is a u32 attribute that extends the u8 field ifa_flags.
+ * If present, the value from struct ifaddrmsg will be ignored.
+ */
+/* Attribute types that may follow struct ifaddrmsg */
+enum {
+ IFA_UNSPEC,
+ IFA_ADDRESS, /* prefix address; destination address for point-to-point */
+ IFA_LOCAL, /* local interface address */
+ IFA_LABEL,
+ IFA_BROADCAST,
+ IFA_ANYCAST,
+ IFA_CACHEINFO, /* presumably struct ifa_cacheinfo -- TODO confirm */
+ IFA_MULTICAST,
+ IFA_FLAGS, /* u32, extends the u8 ifa_flags header field */
+ IFA_RT_PRIORITY, /* u32, priority/metric for prefix route */
+ IFA_TARGET_NETNSID,
+ __IFA_MAX,
+};
+/* Highest valid attribute type */
+#define IFA_MAX (__IFA_MAX - 1)
+
+/* ifa_flags */
+#define IFA_F_SECONDARY 0x01
+#define IFA_F_TEMPORARY IFA_F_SECONDARY
+#define IFA_F_NODAD 0x02
+#define IFA_F_OPTIMISTIC 0x04
+#define IFA_F_DADFAILED 0x08
+#define IFA_F_HOMEADDRESS 0x10
+#define IFA_F_DEPRECATED 0x20
+#define IFA_F_TENTATIVE 0x40
+#define IFA_F_PERMANENT 0x80
+#define IFA_F_MANAGETEMPADDR 0x100
+#define IFA_F_NOPREFIXROUTE 0x200
+#define IFA_F_MCAUTOJOIN 0x400
+#define IFA_F_STABLE_PRIVACY 0x800
+
+/* */
+
+
+/* Address lifetime and timestamp data (the field names keep the Linux spelling) */
+struct ifa_cacheinfo {
+ uint32_t ifa_prefered; /* presumably preferred lifetime -- TODO confirm units */
+ uint32_t ifa_valid; /* presumably valid lifetime -- TODO confirm units */
+ uint32_t cstamp; /* created timestamp, hundredths of seconds */
+ uint32_t tstamp; /* updated timestamp, hundredths of seconds */
+};
+
+
+#endif
diff --git a/sys/netlink/route/interface.h b/sys/netlink/route/interface.h
new file mode 100644
--- /dev/null
+++ b/sys/netlink/route/interface.h
@@ -0,0 +1,189 @@
+/*
+ * Interface-related (RTM_<NEW|DEL|GET|SET>LINK) message header and attributes.
+ */
+
+#ifndef _NETLINK_ROUTE_INTERFACE_H_
+#define _NETLINK_ROUTE_INTERFACE_H_
+
+/* Base header for all of the relevant messages */
+struct ifinfomsg {
+ unsigned char ifi_family; /* not used */
+ unsigned char __ifi_pad;
+ unsigned short ifi_type; /* ARPHRD_* */
+ int ifi_index; /* Interface index */
+ unsigned ifi_flags; /* IFF_* flags */
+ unsigned ifi_change; /* IFF_* change mask */
+};
+
+#ifndef _KERNEL
+/* Compatibility helpers (userland only) */
+#define _IFINFO_HDRLEN ((int)sizeof(struct ifinfomsg))
+#define IFLA_RTA(_ifi) ((struct rtattr *)NL_ITEM_DATA(_ifi, _IFINFO_HDRLEN))
+#define IFLA_PAYLOAD(_ifi) NLMSG_PAYLOAD(_ifi, _IFINFO_HDRLEN)
+#endif
+
+/*
+ * Attribute types that may follow struct ifinfomsg.
+ * Some enumerators are also defined as self-referencing macros so that
+ * their presence can be tested with #ifdef.
+ */
+enum {
+ IFLA_UNSPEC = 0,
+ IFLA_ADDRESS = 1, /* binary: Link-level address (MAC) */
+#define IFLA_ADDRESS IFLA_ADDRESS
+ IFLA_BROADCAST = 2, /* binary: link-level broadcast address */
+#define IFLA_BROADCAST IFLA_BROADCAST
+ IFLA_IFNAME = 3, /* string: Interface name */
+#define IFLA_IFNAME IFLA_IFNAME
+ IFLA_MTU = 4, /* u32: Current interface L3 mtu */
+#define IFLA_MTU IFLA_MTU
+ IFLA_LINK = 5, /* not supported */
+#define IFLA_LINK IFLA_LINK
+ IFLA_QDISC = 6, /* string: Queueing policy (not supported) */
+#define IFLA_QDISC IFLA_QDISC
+ IFLA_STATS = 7, /* Interface counters */
+#define IFLA_STATS IFLA_STATS
+ IFLA_COST = 8, /* not supported */
+#define IFLA_COST IFLA_COST
+ IFLA_PRIORITY = 9, /* not supported */
+#define IFLA_PRIORITY IFLA_PRIORITY
+ IFLA_MASTER = 10, /* u32: parent interface ifindex */
+#define IFLA_MASTER IFLA_MASTER
+ IFLA_WIRELESS = 11, /* not supported */
+#define IFLA_WIRELESS IFLA_WIRELESS
+ IFLA_PROTINFO = 12, /* protocol-specific data */
+#define IFLA_PROTINFO IFLA_PROTINFO
+ IFLA_TXQLEN = 13, /* u32: transmit queue length */
+#define IFLA_TXQLEN IFLA_TXQLEN
+ IFLA_MAP = 14, /* not supported */
+#define IFLA_MAP IFLA_MAP
+ IFLA_WEIGHT = 15, /* not supported */
+#define IFLA_WEIGHT IFLA_WEIGHT
+ IFLA_OPERSTATE = 16, /* u8: ifOperStatus per RFC 2863 */
+#define IFLA_OPERSTATE IFLA_OPERSTATE
+ IFLA_LINKMODE = 17, /* u8: ifmedia (not supported) */
+#define IFLA_LINKMODE IFLA_LINKMODE
+ IFLA_LINKINFO = 18, /* not supported */
+#define IFLA_LINKINFO IFLA_LINKINFO
+ IFLA_NET_NS_PID = 19, /* u32: vnet id (not supported) */
+#define IFLA_NET_NS_PID IFLA_NET_NS_PID
+ IFLA_IFALIAS = 20, /* not supported */
+#define IFLA_IFALIAS IFLA_IFALIAS
+ IFLA_NUM_VF = 21, /* not supported */
+#define IFLA_NUM_VF IFLA_NUM_VF
+ IFLA_VFINFO_LIST= 22, /* not supported */
+#define IFLA_VFINFO_LIST IFLA_VFINFO_LIST
+ IFLA_STATS64 = 23, /* rtnl_link_stats64: iface stats */
+#define IFLA_STATS64 IFLA_STATS64
+ IFLA_VF_PORTS,
+ IFLA_PORT_SELF,
+ IFLA_AF_SPEC,
+ IFLA_GROUP, /* Group the device belongs to */
+ IFLA_NET_NS_FD,
+ IFLA_EXT_MASK, /* Extended info mask, VFs, etc */
+ IFLA_PROMISCUITY, /* Promiscuity count: > 0 means acts PROMISC */
+#define IFLA_PROMISCUITY IFLA_PROMISCUITY
+ IFLA_NUM_TX_QUEUES,
+ IFLA_NUM_RX_QUEUES,
+ IFLA_CARRIER,
+ IFLA_PHYS_PORT_ID,
+ IFLA_CARRIER_CHANGES,
+ IFLA_PHYS_SWITCH_ID,
+ IFLA_LINK_NETNSID,
+ IFLA_PHYS_PORT_NAME,
+ IFLA_PROTO_DOWN,
+ IFLA_GSO_MAX_SEGS,
+ IFLA_GSO_MAX_SIZE,
+ IFLA_PAD,
+ IFLA_XDP,
+ IFLA_EVENT,
+ IFLA_NEW_NETNSID,
+ IFLA_IF_NETNSID,
+ IFLA_TARGET_NETNSID = IFLA_IF_NETNSID, /* new alias */
+ IFLA_CARRIER_UP_COUNT,
+ IFLA_CARRIER_DOWN_COUNT,
+ IFLA_NEW_IFINDEX,
+ IFLA_MIN_MTU,
+ IFLA_MAX_MTU,
+ IFLA_PROP_LIST,
+ IFLA_ALT_IFNAME, /* Alternative ifname */
+ IFLA_PERM_ADDRESS,
+ IFLA_PROTO_DOWN_REASON,
+ __IFLA_MAX
+};
+/* Highest valid attribute type */
+#define IFLA_MAX (__IFLA_MAX - 1)
+
+/*
+ * Attributes that can be used as filters:
+ * IFLA_IFNAME, IFLA_GROUP, IFLA_ALT_IFNAME
+ * Headers that can be used as filters:
+ * ifi_index, ifi_type
+ */
+
+/*
+ * IFLA_OPERSTATE.
+ * The values below represent the possible
+ * states of ifOperStatus defined by RFC 2863
+ */
+enum {
+ IF_OPER_UNKNOWN = 0, /* status can not be determined */
+ IF_OPER_NOTPRESENT = 1, /* some (hardware) component not present */
+ IF_OPER_DOWN = 2, /* down */
+ IF_OPER_LOWERLAYERDOWN = 3, /* some lower-level interface is down */
+ IF_OPER_TESTING = 4, /* in some test mode */
+ IF_OPER_DORMANT = 5, /* "up" but waiting for some condition (802.1X) */
+ IF_OPER_UP = 6, /* ready to pass packets */
+};
+
+/* IFLA_STATS: 32-bit interface counters */
+struct rtnl_link_stats {
+ uint32_t rx_packets; /* total RX packets (IFCOUNTER_IPACKETS) */
+ uint32_t tx_packets; /* total TX packets (IFCOUNTER_OPACKETS) */
+ uint32_t rx_bytes; /* total RX bytes (IFCOUNTER_IBYTES) */
+ uint32_t tx_bytes; /* total TX bytes (IFCOUNTER_OBYTES) */
+ uint32_t rx_errors; /* RX errors (IFCOUNTER_IERRORS) */
+ uint32_t tx_errors; /* TX errors (IFCOUNTER_OERRORS) */
+ uint32_t rx_dropped; /* RX drop (no space in ring/no bufs) (IFCOUNTER_IQDROPS) */
+ uint32_t tx_dropped; /* TX drop (IFCOUNTER_OQDROPS) */
+ uint32_t multicast; /* RX multicast packets (IFCOUNTER_IMCASTS) */
+ uint32_t collisions; /* not supported */
+ uint32_t rx_length_errors; /* not supported */
+ uint32_t rx_over_errors; /* not supported */
+ uint32_t rx_crc_errors; /* not supported */
+ uint32_t rx_frame_errors; /* not supported */
+ uint32_t rx_fifo_errors; /* not supported */
+ uint32_t rx_missed_errors; /* not supported */
+ uint32_t tx_aborted_errors; /* not supported */
+ uint32_t tx_carrier_errors; /* not supported */
+ uint32_t tx_fifo_errors; /* not supported */
+ uint32_t tx_heartbeat_errors; /* not supported */
+ uint32_t tx_window_errors; /* not supported */
+ uint32_t rx_compressed; /* not supported */
+ uint32_t tx_compressed; /* not supported */
+ uint32_t rx_nohandler; /* dropped due to no proto handler (IFCOUNTER_NOPROTO) */
+};
+
+/* IFLA_STATS64: 64-bit interface counters */
+struct rtnl_link_stats64 {
+ uint64_t rx_packets; /* total RX packets (IFCOUNTER_IPACKETS) */
+ uint64_t tx_packets; /* total TX packets (IFCOUNTER_OPACKETS) */
+ uint64_t rx_bytes; /* total RX bytes (IFCOUNTER_IBYTES) */
+ uint64_t tx_bytes; /* total TX bytes (IFCOUNTER_OBYTES) */
+ uint64_t rx_errors; /* RX errors (IFCOUNTER_IERRORS) */
+ uint64_t tx_errors; /* TX errors (IFCOUNTER_OERRORS) */
+ uint64_t rx_dropped; /* RX drop (no space in ring/no bufs) (IFCOUNTER_IQDROPS) */
+ uint64_t tx_dropped; /* TX drop (IFCOUNTER_OQDROPS) */
+ uint64_t multicast; /* RX multicast packets (IFCOUNTER_IMCASTS) */
+ uint64_t collisions; /* not supported */
+ uint64_t rx_length_errors; /* not supported */
+ uint64_t rx_over_errors; /* not supported */
+ uint64_t rx_crc_errors; /* not supported */
+ uint64_t rx_frame_errors; /* not supported */
+ uint64_t rx_fifo_errors; /* not supported */
+ uint64_t rx_missed_errors; /* not supported */
+ uint64_t tx_aborted_errors; /* not supported */
+ uint64_t tx_carrier_errors; /* not supported */
+ uint64_t tx_fifo_errors; /* not supported */
+ uint64_t tx_heartbeat_errors; /* not supported */
+ uint64_t tx_window_errors; /* not supported */
+ uint64_t rx_compressed; /* not supported */
+ uint64_t tx_compressed; /* not supported */
+ uint64_t rx_nohandler; /* dropped due to no proto handler (IFCOUNTER_NOPROTO) */
+};
+
+#endif
diff --git a/sys/netlink/route/neigh.h b/sys/netlink/route/neigh.h
new file mode 100644
--- /dev/null
+++ b/sys/netlink/route/neigh.h
@@ -0,0 +1,79 @@
+
+/*
+ * Neighbors-related (RTM_<NEW|DEL|GET>NEIGH) message header and attributes.
+ */
+
+#ifndef _NETLINK_ROUTE_NEIGH_H_
+#define _NETLINK_ROUTE_NEIGH_H_
+
+/* Base header for all of the relevant messages */
+struct ndmsg {
+ uint8_t ndm_family; /* presumably the address family -- TODO confirm */
+ uint8_t ndm_pad1;
+ uint16_t ndm_pad2;
+ int32_t ndm_ifindex; /* presumably the interface index -- TODO confirm */
+ uint16_t ndm_state; /* NUD_* */
+ uint8_t ndm_flags; /* NTF_* */
+ uint8_t ndm_type;
+};
+
+/* Attributes */
+enum {
+ NDA_UNSPEC,
+ NDA_DST, /* neigh l3 address */
+ NDA_LLADDR, /* neigh link-level address */
+ NDA_CACHEINFO, /* lifetime */
+ NDA_PROBES, /* XXX */
+ NDA_VLAN, /* upper 802.1Q tag */
+ NDA_PORT, /* not used */
+ NDA_VNI, /* not used */
+ NDA_IFINDEX, /* interface index */
+ NDA_MASTER, /* not used */
+ NDA_LINK_NETNSID, /* not used */
+ NDA_SRC_VNI, /* not used */
+ NDA_PROTOCOL, /* XXX */
+ NDA_NH_ID, /* not used */
+ NDA_FDB_EXT_ATTRS, /* not used */
+ NDA_FLAGS_EXT, /* ndm_flags */
+ NDA_NDM_STATE_MASK, /* XXX */
+ NDA_NDM_FLAGS_MASK, /* XXX */
+ __NDA_MAX
+};
+
+#define NDA_MAX (__NDA_MAX - 1)
+
+
+/* ndm_flags / NDA_FLAGS_EXT */
+#define NTF_USE 0x0001 /* XXX */
+#define NTF_SELF 0x0002 /* local station */
+#define NTF_MASTER 0x0004 /* XXX */
+#define NTF_PROXY 0x0008 /* proxy entry */
+#define NTF_EXT_LEARNED 0x0010 /* not used */
+#define NTF_OFFLOADED 0x0020 /* not used */
+#define NTF_STICKY 0x0040 /* permanent entry */
+#define NTF_ROUTER 0x0080 /* dst indicated itself as a router */
+/* start of NDA_FLAGS_EXT */
+#define NTF_EXT_MANAGED 0x0100 /* not used */
+
+/* ndm_state */
+#define NUD_INCOMPLETE 0x01 /* No lladdr, address resolution in progress */
+#define NUD_REACHABLE 0x02 /* reachable & recently resolved */
+#define NUD_STALE 0x04 /* has lladdr but it's stale */
+#define NUD_DELAY 0x08 /* has lladdr, is stale, probes delayed */
+#define NUD_PROBE 0x10 /* has lladdr, is stale, probes sent */
+#define NUD_FAILED 0x20 /* unused */
+
+/* Dummy states */
+#define NUD_NOARP 0x40 /* not used */
+#define NUD_PERMANENT 0x80 /* not flushed */
+#define NUD_NONE 0x00
+
+/* NDA_CACHEINFO */
+struct nda_cacheinfo {
+ uint32_t ndm_confirmed; /* seconds since ARP/ND was received from neigh */
+ uint32_t ndm_used; /* seconds since last used (not provided) */
+ uint32_t ndm_updated; /* seconds since state was updated last */
+ uint32_t ndm_refcnt; /* number of references held */
+};
+
+#endif
diff --git a/sys/netlink/route/neigh.c b/sys/netlink/route/neigh.c
new file mode 100644
--- /dev/null
+++ b/sys/netlink/route/neigh.c
@@ -0,0 +1,347 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/syslog.h>
+#include <sys/socketvar.h>
+
+#include <net/if.h>
+#include <net/if_llatbl.h>
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_route.h>
+#include <netlink/route/route_var.h>
+
+#include <netinet6/in6_var.h> /* nd6.h requires this */
+#include <netinet6/nd6.h> /* nd6 state machine */
+#include <netinet6/scope6_var.h> /* scope deembedding */
+
+#define DEBUG_MOD_NAME nl_neigh
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_DEBUG3);
+
+static int lle_families[] = { AF_INET, AF_INET6 };
+
+struct netlink_walkargs { /* state shared by the neighbour dump/get helpers */
+ struct nlmsg_state *ns; /* message writer the replies are appended to */
+ struct nlmsghdr hdr; /* header template passed to nlmsg_reply() */
+ struct nlpcb *so; /* set from the handler's nlp argument */
+ struct ifnet *ifp; /* interface whose table is currently being dumped */
+ int family; /* address family of the table currently being dumped */
+ int error; /* handed to nlmsg_end_dump() when the dump finishes */
+ int count; /* number of lltables visited */
+ int dumped; /* number of lltables for which dump_llt() reported success */
+};
+
+#define ENOMEM_IF_NULL(_v) if ((_v) == NULL) goto enomem
+
+/*
+ * Translate the link-layer entry state of @lle into a NUD_* value.
+ *
+ * For AF_INET, entries flagged LLE_STATIC or LLE_IFADDR are always reported
+ * as reachable; otherwise the raw ln_state values 0/1/2 are mapped.
+ * For AF_INET6 the ND6_LLINFO_* states are mapped one to one.
+ * Anything else yields NUD_NONE.
+ */
+static int
+lle_state_to_nl_state(int family, struct llentry *lle)
+{
+	const int ln_state = lle->ln_state;
+
+	if (family == AF_INET) {
+		if ((lle->la_flags & (LLE_STATIC | LLE_IFADDR)) != 0)
+			return (NUD_REACHABLE);
+		if (ln_state == 0)	/* ARP_LLINFO_INCOMPLETE */
+			return (NUD_INCOMPLETE);
+		if (ln_state == 1)	/* ARP_LLINFO_REACHABLE */
+			return (NUD_REACHABLE);
+		if (ln_state == 2)	/* ARP_LLINFO_VERIFY */
+			return (NUD_PROBE);
+		return (NUD_NONE);
+	}
+
+	if (family == AF_INET6) {
+		switch (ln_state) {
+		case ND6_LLINFO_INCOMPLETE:
+			return (NUD_INCOMPLETE);
+		case ND6_LLINFO_REACHABLE:
+			return (NUD_REACHABLE);
+		case ND6_LLINFO_STALE:
+			return (NUD_STALE);
+		case ND6_LLINFO_DELAY:
+			return (NUD_DELAY);
+		case ND6_LLINFO_PROBE:
+			return (NUD_PROBE);
+		default:
+			return (NUD_NONE);
+		}
+	}
+
+	return (NUD_NONE);
+}
+
+/*
+ * Build the NTF_* flag mask reported for @lle:
+ * LLE_IFADDR -> NTF_SELF, LLE_PUB -> NTF_PROXY, LLE_STATIC -> NTF_STICKY,
+ * and a non-zero ln_router -> NTF_ROUTER.
+ */
+static uint32_t
+lle_flags_to_nl_flags(const struct llentry *lle)
+{
+	uint32_t mask = 0;
+
+	mask |= (lle->la_flags & LLE_IFADDR) ? NTF_SELF : 0;
+	mask |= (lle->la_flags & LLE_PUB) ? NTF_PROXY : 0;
+	mask |= (lle->la_flags & LLE_STATIC) ? NTF_STICKY : 0;
+	mask |= (lle->ln_router != 0) ? NTF_ROUTER : 0;
+
+	return (mask);
+}
+
+/*
+ * Emit one neighbour message describing @lle into the writer in @arg.
+ *
+ * @llt: table the entry belongs to (unused here)
+ * @lle: entry to dump; read-locked for the duration of the call
+ * @arg: struct netlink_walkargs with writer, header template, ifp and family
+ *
+ * Returns 0 on success or ENOMEM (after aborting the partially written
+ * message) when the writer runs out of space.
+ */
+static int
+dump_lle(struct lltable *llt, struct llentry *lle, void *arg)
+{
+	struct netlink_walkargs *wa = (struct netlink_walkargs *)arg;
+	struct nlmsghdr *hdr = &wa->hdr;
+	struct nlmsg_state *ns = wa->ns;
+	struct nda_cacheinfo *cache;
+	struct ndmsg *ndm;
+	union {
+		struct in_addr in;
+		struct in6_addr in6;
+	} addr;
+
+	LLE_RLOCK(lle);
+
+#if DEBUG_MAX_LEVEL >= LOG_DEBUG2
+	{
+		char llebuf[NHOP_PRINT_BUFSIZE];
+		llentry_print_buf_lltable(lle, llebuf, sizeof(llebuf));
+		RT_LOG(LOG_DEBUG2, "dumping %s", llebuf);
+	}
+#endif
+
+	if (!nlmsg_reply(ns, hdr, sizeof(struct ndmsg)))
+		goto enomem;
+
+	ndm = nlmsg_reserve_object(ns, struct ndmsg);
+	ENOMEM_IF_NULL(ndm);
+	/*
+	 * Assign the whole structure so fields we do not fill are zero.
+	 * NOTE(review): whether the writer pre-zeroes reserved space is not
+	 * visible from here; this avoids relying on it.
+	 */
+	*ndm = (struct ndmsg){
+		.ndm_family = wa->family,
+		.ndm_ifindex = wa->ifp->if_index,
+		.ndm_state = lle_state_to_nl_state(wa->family, lle),
+		.ndm_flags = lle_flags_to_nl_flags(lle),
+	};
+
+	switch (wa->family) {
+#ifdef INET
+	case AF_INET:
+		addr.in = lle->r_l3addr.addr4;
+		if (!nlattr_add(ns, NDA_DST, 4, &addr))
+			goto enomem;
+		break;
+#endif
+#ifdef INET6
+	case AF_INET6:
+		/* Report the address without the embedded scope. */
+		addr.in6 = lle->r_l3addr.addr6;
+		in6_clearscope(&addr.in6);
+		if (!nlattr_add(ns, NDA_DST, 16, &addr))
+			goto enomem;
+		break;
+#endif
+	}
+
+	if (lle->r_flags & RLLE_VALID) {
+		/* Has L2 */
+		int addrlen = wa->ifp->if_addrlen;
+		if (!nlattr_add(ns, NDA_LLADDR, addrlen, lle->ll_addr))
+			goto enomem;
+	}
+
+	if (!nlattr_add_u32(ns, NDA_PROBES, lle->la_asked))
+		goto enomem;
+
+	cache = nlmsg_reserve_attr(ns, NDA_CACHEINFO, struct nda_cacheinfo);
+	ENOMEM_IF_NULL(cache);
+	/* TODO: provide confirmed/updated; they are reported as 0 for now */
+	*cache = (struct nda_cacheinfo){
+		.ndm_refcnt = lle->lle_refcnt,
+	};
+
+	LLE_RUNLOCK(lle);
+	nlmsg_end(ns);
+
+	return (0);
+
+enomem:
+	LLE_RUNLOCK(lle);
+	RT_LOG(LOG_DEBUG, "unable to dump lle state (ENOMEM)");
+	nlmsg_abort(ns);
+	return (ENOMEM);
+}
+
+/*
+ * Dump every entry of @llt through dump_lle().
+ *
+ * Returns true when the walk completed, false when the walk reported a
+ * non-zero result (dump_lle() returns ENOMEM when the writer is exhausted).
+ */
+static bool
+dump_llt(struct lltable *llt, struct netlink_walkargs *wa)
+{
+	return (lltable_foreach_lle(llt, dump_lle, wa) == 0);
+}
+
+/*
+ * Dump the link-layer tables of @ifp for every family listed in
+ * lle_families, restricted to @family when it is non-zero.
+ *
+ * Updates wa->ifp, wa->family and the count/dumped statistics.
+ * Returns 0, or ENOMEM as soon as dump_llt() reports failure.
+ */
+static int
+dump_llts_iface(struct netlink_walkargs *wa, struct ifnet *ifp, int family)
+{
+	wa->ifp = ifp;
+
+	for (size_t idx = 0; idx < sizeof(lle_families) / sizeof(lle_families[0]); idx++) {
+		const int fam = lle_families[idx];
+		struct lltable *llt = lltable_get(ifp, fam);
+
+		if (llt == NULL)
+			continue;
+		if (family != 0 && family != fam)
+			continue;
+
+		wa->count++;
+		wa->family = fam;
+		if (!dump_llt(llt, wa))
+			return (ENOMEM);
+		wa->dumped++;
+	}
+
+	return (0);
+}
+
+/*
+ * Dump neighbour tables as a multipart reply.
+ *
+ * @wa:     walk state; its header gets NLM_F_MULTI set
+ * @ifp:    interface to dump, or NULL to iterate over all of V_ifnet
+ * @family: address family filter, 0 for all
+ *
+ * The first error reported by dump_llts_iface() stops the iteration and is
+ * recorded in wa->error so nlmsg_end_dump() can report it.
+ * Returns 0, or ENOMEM if the terminating message cannot be added.
+ */
+static int
+dump_llts(struct netlink_walkargs *wa, struct ifnet *ifp, int family)
+{
+	RT_LOG(LOG_DEBUG, "Start dump ifp=%s family=%d", ifp ? if_name(ifp) : "NULL", family);
+
+	wa->hdr.nlmsg_flags |= NLM_F_MULTI;
+
+	if (ifp != NULL) {
+		wa->error = dump_llts_iface(wa, ifp, family);
+	} else {
+		CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
+			wa->error = dump_llts_iface(wa, ifp, family);
+			if (wa->error != 0)
+				break;
+		}
+	}
+
+	RT_LOG(LOG_DEBUG, "End dump, iterated %d dumped %d", wa->count, wa->dumped);
+
+	if (!nlmsg_end_dump(wa->ns, wa->error, &wa->hdr)) {
+		RT_LOG(LOG_DEBUG, "Unable to add new message");
+		return (ENOMEM);
+	}
+
+	return (0);
+}
+
+/*
+ * Look up the single neighbour entry for @dst on @ifp and dump it.
+ *
+ * For IPv6 link-local destinations the scope id of @ifp is embedded into
+ * @dst before the lookup (so @dst is modified).
+ * Returns ESRCH when the table or the entry does not exist, otherwise the
+ * result of dump_lle().
+ */
+static int
+get_lle(struct netlink_walkargs *wa, struct ifnet *ifp, int family, struct sockaddr *dst)
+{
+	struct lltable *llt;
+	struct llentry *lle;
+
+	llt = lltable_get(ifp, family);
+	if (llt == NULL)
+		return (ESRCH);
+
+#ifdef INET6
+	if (dst->sa_family == AF_INET6) {
+		struct in6_addr *addr6 = &((struct sockaddr_in6 *)dst)->sin6_addr;
+
+		if (IN6_IS_SCOPE_LINKLOCAL(addr6))
+			in6_set_unicast_scopeid(addr6, ifp->if_index);
+	}
+#endif
+	lle = lla_lookup(llt, LLE_UNLOCKED, dst);
+	if (lle == NULL)
+		return (ESRCH);
+
+	wa->family = family;
+	wa->ifp = ifp;
+
+	return (dump_lle(llt, lle, wa));
+}
+
+struct nl_parsed_neigh { /* parse target for RTM_GETNEIGH requests */
+ struct sockaddr *nda_dst; /* NDA_DST, parsed by nlattr_get_ip */
+ uint32_t nda_ifindex; /* NDA_IFINDEX; pre-seeded from ndm_ifindex by the handler */
+ uint8_t ndm_family; /* copied from the request's ndm_family */
+};
+#define _OFF_S(_field) offsetof(struct nl_parsed_neigh, _field)
+
+static struct nlattr_parser ps[] = {
+ { .type = NDA_DST, .off = _OFF_S(nda_dst), .cb = nlattr_get_ip },
+ { .type = NDA_IFINDEX, .off = _OFF_S(nda_ifindex), .cb = nlattr_get_uint32 },
+};
+
+/*
+ * RTM_GETNEIGH handler.
+ *
+ * Parses the request attributes, resolves the optional interface index and
+ * then either dumps all matching neighbour tables (no NDA_DST) or looks up
+ * and dumps the single entry named by NDA_DST (which requires an ifindex).
+ *
+ * Returns 0 on success, EINVAL for an unknown ifindex or NDA_DST without an
+ * interface, or the error from attribute parsing / dumping.
+ */
+static int
+rtnl_handle_getneigh(struct nlmsghdr *hdr, struct nlpcb *nlp, struct netlink_parse_tracker *npt)
+{
+	struct ifnet *ifp = NULL;
+	int error;
+
+	struct ndmsg *ndm = (struct ndmsg *)nlmsg_data(hdr);
+
+	/* The header's ifindex is the default; an NDA_IFINDEX attribute overrides it. */
+	struct nl_parsed_neigh attrs = {
+		.ndm_family = ndm->ndm_family,
+		.nda_ifindex = ndm->ndm_ifindex,
+	};
+	error = nl_parse_attrs(hdr, sizeof(*ndm), ps, sizeof(ps)/sizeof(ps[0]), npt, &attrs);
+	if (error != 0)
+		return (error);
+
+	if (attrs.nda_ifindex != 0) {
+		if ((ifp = ifnet_byindex(attrs.nda_ifindex)) == NULL) {
+			/* nda_ifindex is a uint32_t, so print it as unsigned */
+			RT_LOG(LOG_DEBUG, "unknown ifindex %u", attrs.nda_ifindex);
+			return (EINVAL);
+		}
+	}
+
+	if (attrs.nda_dst != NULL && ifp == NULL) {
+		RT_LOG(LOG_DEBUG, "has NDA_DST but no ifindex provided");
+		return (EINVAL);
+	}
+
+	struct netlink_walkargs wa = {
+		.so = nlp,
+		.ns = npt->ns,
+		.hdr.nlmsg_pid = hdr->nlmsg_pid,
+		.hdr.nlmsg_seq = hdr->nlmsg_seq,
+		.hdr.nlmsg_flags = hdr->nlmsg_flags,
+		.hdr.nlmsg_type = NL_RTM_NEWNEIGH,
+	};
+
+	if (attrs.nda_dst == NULL)
+		error = dump_llts(&wa, ifp, ndm->ndm_family);
+	else
+		error = get_lle(&wa, ifp, ndm->ndm_family, attrs.nda_dst);
+
+	return (error);
+}
+
+static struct rtnl_cmd_handler cmd_handlers[] = {
+ { NL_RTM_GETNEIGH, "RTM_GETNEIGH", &rtnl_handle_getneigh, sizeof(struct ndmsg)},
+};
+
+/*
+ * Register the neighbour-related rtnetlink message handlers.
+ */
+void
+rtnl_neighs_init(void)
+{
+	rtnl_register_messages(cmd_handlers, RTNL_ARRAY_LEN(cmd_handlers));
+}
+
diff --git a/sys/netlink/route/nexthop.h b/sys/netlink/route/nexthop.h
new file mode 100644
--- /dev/null
+++ b/sys/netlink/route/nexthop.h
@@ -0,0 +1,75 @@
+/*
+ * NEXTHOP-related (RTM_<NEW|DEL|GET>NEXTHOP) message header and attributes.
+ */
+
+#ifndef _NETLINK_ROUTE_NEXTHOP_H_
+#define _NETLINK_ROUTE_NEXTHOP_H_
+
+/* Base header for all of the relevant messages */
+struct nhmsg {
+ unsigned char nh_family; /* transport family */
+ unsigned char nh_scope; /* ignored on RX, filled by kernel */
+ unsigned char nh_protocol; /* Routing protocol that installed nh */
+ unsigned char resvd;
+ unsigned int nh_flags; /* RTNH_F_* flags from route.h */
+};
+
+enum {
+ NHA_UNSPEC,
+ NHA_ID, /* u32: nexthop userland index, auto-assigned if 0 */
+ NHA_GROUP, /* binary: array of struct nexthop_grp */
+ NHA_GROUP_TYPE, /* u16: set to NEXTHOP_GRP_TYPE */
+ NHA_BLACKHOLE, /* flag: nexthop used to blackhole packets */
+ NHA_OIF, /* u32: transmit ifindex */
+ NHA_GATEWAY, /* network: IPv4/IPv6 gateway addr */
+ NHA_ENCAP_TYPE, /* not supported */
+ NHA_ENCAP, /* not supported */
+ NHA_GROUPS, /* flag: match nexthop groups */
+ NHA_MASTER, /* not supported */
+ NHA_FDB, /* not supported */
+ NHA_RES_GROUP, /* not supported */
+ NHA_RES_BUCKET, /* not supported */
+ __NHA_MAX,
+};
+#define NHA_MAX (__NHA_MAX - 1)
+
+/*
+ * Attributes that can be used as filters:
+ * NHA_ID (nexthop or group), NHA_OIF, NHA_GROUPS,
+ */
+
+/*
+ * NHA_GROUP: array of the following structures.
+ * If attribute is set, the only other valid attributes are
+ * NHA_ID and NHA_GROUP_TYPE.
+ * NHA_RES_GROUP and NHA_RES_BUCKET are not supported yet
+ */
+struct nexthop_grp {
+ uint32_t id; /* nexthop userland index */
+ uint8_t weight; /* weight of this nexthop */
+ uint8_t resvd1; /* reserved; written as 0 by dump_nhgrp() */
+ uint16_t resvd2; /* reserved; written as 0 by dump_nhgrp() */
+};
+
+/* NHA_GROUP_TYPE: u16 */
+enum {
+ NEXTHOP_GRP_TYPE_MPATH, /* default nexthop group */
+ NEXTHOP_GRP_TYPE_RES, /* resilient nexthop group */
+ __NEXTHOP_GRP_TYPE_MAX,
+};
+#define NEXTHOP_GRP_TYPE_MAX (__NEXTHOP_GRP_TYPE_MAX - 1)
+
+
+/* NHA_RES_GROUP */
+enum {
+ NHA_RES_GROUP_UNSPEC,
+ NHA_RES_GROUP_PAD = NHA_RES_GROUP_UNSPEC,
+ NHA_RES_GROUP_BUCKETS,
+ NHA_RES_GROUP_IDLE_TIMER,
+ NHA_RES_GROUP_UNBALANCED_TIMER,
+ NHA_RES_GROUP_UNBALANCED_TIME,
+ __NHA_RES_GROUP_MAX,
+};
+#define NHA_RES_GROUP_MAX (__NHA_RES_GROUP_MAX - 1)
+
+#endif
diff --git a/sys/netlink/route/nexthop.c b/sys/netlink/route/nexthop.c
new file mode 100644
--- /dev/null
+++ b/sys/netlink/route/nexthop.c
@@ -0,0 +1,1007 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2022 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/rmlock.h>
+#include <sys/socket.h>
+#include <sys/ck.h>
+
+#include <net/if.h>
+#include <net/route.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_utils.h>
+
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <netinet6/scope6_var.h>
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_var.h>
+#include <netlink/netlink_route.h>
+#include <netlink/route/route_var.h>
+
+#define DEBUG_MOD_NAME nl_nhop
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_DEBUG3);
+
+/*
+ * This file contains the logic to maintain kernel nexthops and
+ * nexthop groups based on the data provided by the user.
+ *
+ * Kernel stores (nearly) all of the routing data in the nexthops,
+ * including the prefix-specific flags (NHF_HOST and NHF_DEFAULT).
+ *
+ * Netlink API provides higher-level abstraction for the user. Each
+ * user-created nexthop may map to multiple kernel nexthops.
+ *
+ * The following variations require separate kernel nexthop to be
+ * created:
+ * * prefix flags (NHF_HOST, NHF_DEFAULT)
+ * * using IPv6 gateway for IPv4 routes
+ * * different fibnum
+ *
+ * These kernel nexthops have the lifetime bound to the lifetime of
+ * the user_nhop object. They are not collected until user requests
+ * to delete the created user_nhop.
+ *
+ */
+struct user_nhop { /* one hash entry: a user-created nexthop/group or a per-fib/family clone */
+ uint32_t un_idx; /* Userland-provided index */
+ uint32_t un_fibfam; /* fibnum+af(as highest byte); 0 marks the master record */
+ uint8_t un_protocol; /* protocol that installed the record */
+ struct nhop_object *un_nhop; /* "production" nexthop */
+ struct nhop_object *un_nhop_src; /* nexthop to copy from; NULL is treated as a group */
+ struct weightened_nhop *un_nhgrp_src; /* nexthops for nhg */
+ uint32_t un_nhgrp_count; /* number of nexthops */
+ struct user_nhop *un_next; /* next item in hash chain */
+ struct user_nhop *un_nextchild; /* master -> children */
+ struct epoch_context un_epoch_ctx; /* epoch ctl helper */
+};
+
+/* produce hash value for an object */
+#define unhop_hash_obj(_obj) (hash_unhop(_obj))
+/* compare two objects */
+#define unhop_cmp(_one, _two) (cmp_unhop(_one, _two))
+/* next object accessor */
+#define unhop_next(_obj) (_obj)->un_next
+
+CHT_SLIST_DEFINE(unhop, struct user_nhop);
+
+struct unhop_ctl {
+ struct unhop_head un_head;
+ struct rmlock un_lock;
+};
+#define UN_LOCK_INIT(_ctl) rm_init(&(_ctl)->un_lock, "unhop_ctl")
+#define UN_TRACKER struct rm_priotracker un_tracker
+#define UN_RLOCK(_ctl) rm_rlock(&((_ctl)->un_lock), &un_tracker)
+#define UN_RUNLOCK(_ctl) rm_runlock(&((_ctl)->un_lock), &un_tracker)
+
+/*
+ * Write-lock helpers. They expand to a plain expression (no trailing ';')
+ * so that "UN_WLOCK(ctl);" is exactly one statement, like the read-lock
+ * helpers.
+ */
+#define UN_WLOCK(_ctl) rm_wlock(&(_ctl)->un_lock)
+#define UN_WUNLOCK(_ctl) rm_wunlock(&(_ctl)->un_lock)
+
+VNET_DEFINE_STATIC(struct unhop_ctl *, un_ctl) = NULL;
+#define V_un_ctl VNET(un_ctl)
+
+static void consider_resize(struct unhop_ctl *ctl, uint32_t new_size);
+static int cmp_unhop(const struct user_nhop *a, const struct user_nhop *b);
+static unsigned int hash_unhop(const struct user_nhop *obj);
+
+static void destroy_unhop(struct user_nhop *unhop);
+static struct nhop_object *clone_unhop(const struct user_nhop *unhop,
+ uint32_t fibnum, int family, int nh_flags);
+
+/*
+ * Hash callback: returns non-zero when both objects carry the same user
+ * index and the same fib/family key.
+ */
+static int
+cmp_unhop(const struct user_nhop *a, const struct user_nhop *b)
+{
+	if (a->un_idx != b->un_idx)
+		return (0);
+	return (a->un_fibfam == b->un_fibfam);
+}
+
+/*
+ * Hash callback: combine the user index with the fib/family key.
+ */
+static unsigned int
+hash_unhop(const struct user_nhop *obj)
+{
+	const uint32_t idx = obj->un_idx;
+	const uint32_t fibfam = obj->un_fibfam;
+
+	return (idx ^ fibfam);
+}
+
+#define UNHOP_IS_MASTER(_unhop) ((_unhop)->un_fibfam == 0)
+
+/*
+ * Factory interface for creating matching kernel nexthops/nexthop groups
+ *
+ * @fibnum: fibnum nexthop will be used in
+ * @family: upper family nexthop will be used in
+ * @uidx: userland nexthop index used to create the nexthop
+ * @nh_flags: desired nexthop prefix flags (only NHF_HOST/NHF_DEFAULT are used)
+ * @perror: pointer to store error to; always written before returning
+ *
+ * Returns referenced nexthop linked to @fibnum/@family rib on success.
+ * Returns NULL with *perror set to ESRCH (no such user nexthop), EINVAL
+ * (@family is 0) or ENOMEM (allocation/clone failure) otherwise.
+ */
+struct nhop_object *
+nl_find_nhop(uint32_t fibnum, int family, uint32_t uidx,
+    int nh_flags, int *perror)
+{
+	struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
+	UN_TRACKER;
+
+	if (__predict_false(ctl == NULL)) {
+		/* No user nexthop has ever been created in this vnet */
+		*perror = ESRCH;
+		return (NULL);
+	}
+
+	struct user_nhop key= {
+		.un_idx = uidx,
+		.un_fibfam = fibnum | ((uint32_t)family) << 24,
+	};
+	struct user_nhop *unhop;
+
+	nh_flags = nh_flags & (NHF_HOST | NHF_DEFAULT);
+
+	if (__predict_false(family == 0)) {
+		/* un_fibfam == 0 is reserved for the master record */
+		*perror = EINVAL;
+		return (NULL);
+	}
+
+	UN_RLOCK(ctl);
+	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
+	if (unhop != NULL) {
+		struct nhop_object *nh = unhop->un_nhop;
+		/* Release (not re-acquire) the read lock taken above */
+		UN_RUNLOCK(ctl);
+		*perror = 0;
+		nhop_ref_any(nh);
+		return (nh);
+	}
+
+	/*
+	 * Exact nexthop not found. Search for template nexthop to clone from.
+	 */
+	key.un_fibfam = 0;
+	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
+	if (unhop == NULL) {
+		UN_RUNLOCK(ctl);
+		*perror = ESRCH;
+		return (NULL);
+	}
+
+	UN_RUNLOCK(ctl);
+
+	/* Create entry to insert first */
+	struct user_nhop *un_new, *un_tmp;
+	un_new = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO);
+	if (un_new == NULL) {
+		*perror = ENOMEM;
+		return (NULL);
+	}
+	un_new->un_idx = uidx;
+	un_new->un_fibfam = fibnum | ((uint32_t)family) << 24;
+
+	/* Relying on epoch to protect unhop here */
+	un_new->un_nhop = clone_unhop(unhop, fibnum, family, nh_flags);
+	if (un_new->un_nhop == NULL) {
+		free(un_new, M_NETLINK);
+		*perror = ENOMEM;
+		return (NULL);
+	}
+
+	/* Insert back and report */
+	UN_WLOCK(ctl);
+
+	/* First, find template record once again */
+	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
+	if (unhop == NULL) {
+		/* Someone deleted the nexthop during the call */
+		UN_WUNLOCK(ctl);
+		*perror = ESRCH;
+		destroy_unhop(un_new);
+		return (NULL);
+	}
+
+	/* Second, check the direct match */
+	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, un_new, un_tmp);
+	struct nhop_object *nh;
+	if (un_tmp != NULL) {
+		/* Another thread already created the desired nexthop, use it */
+		nh = un_tmp->un_nhop;
+	} else {
+		/* Finally, insert the new nexthop and link it to the primary */
+		nh = un_new->un_nhop;
+		CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, un_new);
+		un_new->un_nextchild = unhop->un_nextchild;
+		unhop->un_nextchild = un_new;
+		un_new = NULL;
+		RT_LOG(LOG_DEBUG2, "linked cloned nexthop %p", nh);
+	}
+
+	UN_WUNLOCK(ctl);
+
+	/* Lost the race: drop our now-redundant clone */
+	if (un_new != NULL)
+		destroy_unhop(un_new);
+
+	*perror = 0;
+	nhop_ref_any(nh);
+	return (nh);
+}
+
+/*
+ * Look up the master record (un_fibfam == 0) for user index @uidx.
+ * The hash is searched under the read lock; the pointer is returned after
+ * the lock has been dropped. Returns NULL when no such record exists.
+ */
+static struct user_nhop *
+nl_find_base_unhop(struct unhop_ctl *ctl, uint32_t uidx)
+{
+	struct user_nhop *found = NULL;
+	struct user_nhop lookup_key = { .un_idx = uidx };
+	UN_TRACKER;
+
+	UN_RLOCK(ctl);
+	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &lookup_key, found);
+	UN_RUNLOCK(ctl);
+
+	return (found);
+}
+
+#define MAX_STACK_NHOPS 4
+/*
+ * Create the kernel object backing @unhop for the given @fibnum/@family.
+ *
+ * For a plain nexthop (un_nhop_src set) the source nexthop is copied,
+ * retargeted to @family and tagged with the user index and @nh_flags.
+ * For a group, every member is resolved through nl_find_nhop() and a
+ * nexthop group is requested for the resulting array.
+ *
+ * Returns the new object, or NULL on any failure.
+ */
+static struct nhop_object *
+clone_unhop(const struct user_nhop *unhop, uint32_t fibnum, int family, int nh_flags)
+{
+	const struct weightened_nhop *wn;
+	struct weightened_nhop *wn_new, wn_base[MAX_STACK_NHOPS];
+	struct nhop_object *nh = NULL;
+	uint32_t num_nhops, num_found;
+	int error = 0;
+
+	if (unhop->un_nhop_src != NULL) {
+#if DEBUG_MAX_LEVEL >= LOG_DEBUG2
+		{
+			char nhbuf[NHOP_PRINT_BUFSIZE];
+			nhop_print_buf_any(unhop->un_nhop_src, nhbuf, sizeof(nhbuf));
+			FIB_NH_LOG(LOG_DEBUG2, unhop->un_nhop_src,
+			    "cloning nhop %s -> %u.%u flags 0x%X", nhbuf, fibnum,
+			    family, nh_flags);
+		}
+#endif
+		nh = nhop_alloc(fibnum, AF_UNSPEC);
+		if (nh == NULL)
+			return (NULL);
+		nhop_copy(nh, unhop->un_nhop_src);
+		/* Check that nexthop gateway is compatible with the new family */
+		if (!nhop_set_upper_family(nh, family)) {
+			nhop_free(nh);
+			return (NULL);
+		}
+		nhop_set_uidx(nh, unhop->un_idx);
+		nhop_set_pxtype_flag(nh, nh_flags);
+		return (nhop_get_nhop(nh, &error));
+	}
+
+	wn = unhop->un_nhgrp_src;
+	num_nhops = unhop->un_nhgrp_count;
+
+	/* Nothing to build a group from */
+	if (num_nhops == 0)
+		return (NULL);
+
+	if (num_nhops > MAX_STACK_NHOPS) {
+		wn_new = malloc(num_nhops * sizeof(struct weightened_nhop), M_TEMP, M_NOWAIT);
+		if (wn_new == NULL)
+			return (NULL);
+	} else
+		wn_new = wn_base;
+
+	for (num_found = 0; num_found < num_nhops; num_found++) {
+		struct nhop_object *nh_member;
+		uint32_t uidx = nhop_get_uidx(wn[num_found].nh);
+
+		MPASS(uidx != 0);
+		nh_member = nl_find_nhop(fibnum, family, uidx, nh_flags, &error);
+		if (error != 0 || nh_member == NULL) {
+			/* Guard against a NULL return that left error untouched */
+			if (error == 0)
+				error = ESRCH;
+			break;
+		}
+		wn_new[num_found].nh = nh_member;
+		wn_new[num_found].weight = wn[num_found].weight;
+	}
+
+	if (error == 0) {
+		struct rib_head *rh = nhop_get_rh(wn_new[0].nh);
+		struct nhgrp_object *nhg = NULL;
+
+		/*
+		 * NOTE(review): whether nhgrp_get_group() takes over the
+		 * member references obtained above is not visible from here.
+		 */
+		error = nhgrp_get_group(rh, wn_new, num_nhops, unhop->un_idx, &nhg);
+		if (error == 0)
+			nh = (struct nhop_object *)nhg;
+	} else {
+		/* Drop the references nl_find_nhop() handed out so far */
+		for (uint32_t i = 0; i < num_found; i++)
+			nhop_free_any(wn_new[i].nh);
+	}
+
+	if (wn_new != wn_base)
+		free(wn_new, M_TEMP);
+	return (nh);
+}
+
+/*
+ * Release everything owned by @unhop and free it: the references on the
+ * production and source nexthops (when set) and the group member array
+ * allocated by newnhg().
+ */
+static void
+destroy_unhop(struct user_nhop *unhop)
+{
+	if (unhop->un_nhop != NULL)
+		nhop_free_any(unhop->un_nhop);
+	if (unhop->un_nhop_src != NULL)
+		nhop_free_any(unhop->un_nhop_src);
+	/* NULL for everything but group masters; free(9) ignores NULL */
+	free(unhop->un_nhgrp_src, M_NETLINK);
+	free(unhop, M_NETLINK);
+}
+
+/*
+ * Epoch callback: recover the user_nhop embedding @ctx and destroy it.
+ */
+static void
+destroy_unhop_epoch(epoch_context_t ctx)
+{
+	destroy_unhop(__containerof(ctx, struct user_nhop, un_epoch_ctx));
+}
+
+/*
+ * Pick a random unused user index in [65536 * 4, 65536 * 5).
+ * Makes at most 16 attempts under the read lock and returns 0 when all of
+ * them hit an existing master record.
+ */
+static uint32_t
+find_spare_uidx(struct unhop_ctl *ctl)
+{
+	struct user_nhop probe = {}, *hit;
+	uint32_t spare = 0;
+	UN_TRACKER;
+
+	UN_RLOCK(ctl);
+	/* This should return spare uid with 75% of 65k used in ~99/100 cases */
+	for (int tries = 16; tries > 0 && spare == 0; tries--) {
+		probe.un_idx = (arc4random() % 65536) + 65536 * 4;
+		CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &probe, hit);
+		if (hit == NULL)
+			spare = probe.un_idx;
+	}
+	UN_RUNLOCK(ctl);
+
+	return (spare);
+}
+
+
+/*
+ * Actual netlink code
+ */
+struct netlink_walkargs {
+ struct nlmsg_state *ns; /* NOTE(review): presumably the reply writer; not set by delete_unhop() */
+ struct nlmsghdr hdr; /* reply header template handed to dump_unhop() */
+ struct nlpcb *so;
+ int family;
+ int error;
+ int count;
+ int dumped;
+};
+#define ENOMEM_IF_NULL(_v) if ((_v) == NULL) goto enomem
+
+/*
+ * Write a nexthop-group message for @unhop: nhmsg header, NHA_ID,
+ * NHA_GROUP_TYPE (always MPATH) and an NHA_GROUP attribute holding one
+ * struct nexthop_grp per member.
+ *
+ * Returns true on success; on allocation failure the message is aborted
+ * and false is returned.
+ */
+static bool
+dump_nhgrp(const struct user_nhop *unhop, struct nlmsghdr *hdr,
+    struct nlmsg_state *ns)
+{
+	const struct weightened_nhop *members = unhop->un_nhgrp_src;
+	const uint32_t nmembers = unhop->un_nhgrp_count;
+	struct nexthop_grp *grp;
+	struct nhmsg *nhm;
+	struct nlattr *nla;
+	int attr_len;
+
+	if (!nlmsg_reply(ns, hdr, sizeof(struct nhmsg)))
+		goto enomem;
+
+	nhm = nlmsg_reserve_object(ns, struct nhmsg);
+	ENOMEM_IF_NULL(nhm);
+	nhm->nh_family = AF_UNSPEC;
+	nhm->nh_scope = 0;
+	nhm->nh_protocol = unhop->un_protocol;
+	nhm->nh_flags = 0;
+
+	if (!nlattr_add_u32(ns, NHA_ID, unhop->un_idx))
+		goto enomem;
+	if (!nlattr_add_u16(ns, NHA_GROUP_TYPE, NEXTHOP_GRP_TYPE_MPATH))
+		goto enomem;
+
+	/* TODO: a better API? */
+	attr_len = sizeof(struct nlattr);
+	attr_len += NETLINK_ALIGN(nmembers * sizeof(struct nexthop_grp));
+	nla = nlmsg_reserve_data(ns, attr_len, struct nlattr);
+	ENOMEM_IF_NULL(nla);
+	nla->nla_type = NHA_GROUP;
+	nla->nla_len = attr_len;
+
+	/* The member array starts right after the attribute header */
+	grp = (struct nexthop_grp *)(nla + 1);
+	for (uint32_t i = 0; i < nmembers; i++) {
+		grp[i].id = nhop_get_uidx(members[i].nh);
+		grp[i].weight = members[i].weight;
+		grp[i].resvd1 = 0;
+		grp[i].resvd2 = 0;
+	}
+
+	nlmsg_end(ns);
+	return (true);
+
+enomem:
+	RT_LOG(LOG_DEBUG, "error: unable to allocate attribute memory");
+	nlmsg_abort(ns);
+	return (false);
+}
+
+/*
+ * Write a nexthop message for the source nexthop of @unhop: nhmsg header
+ * and NHA_ID, then either NHA_BLACKHOLE alone or NHA_OIF plus (for
+ * AF_INET/AF_INET6 gateways) NHA_GATEWAY.
+ *
+ * Returns true on success; on allocation failure the message is aborted
+ * and false is returned.
+ */
+static bool
+dump_nhop(const struct user_nhop *unhop, struct nlmsghdr *hdr,
+    struct nlmsg_state *ns)
+{
+	struct nhop_object *nh = unhop->un_nhop_src;
+	struct nhmsg *nhm;
+
+	if (!nlmsg_reply(ns, hdr, sizeof(struct nhmsg)))
+		goto enomem;
+
+	nhm = nlmsg_reserve_object(ns, struct nhmsg);
+	ENOMEM_IF_NULL(nhm);
+	nhm->nh_family = nhop_get_neigh_family(nh);
+	nhm->nh_scope = 0; // XXX: what's that?
+	nhm->nh_protocol = unhop->un_protocol;
+	nhm->nh_flags = 0;
+
+	if (!nlattr_add_u32(ns, NHA_ID, unhop->un_idx))
+		goto enomem;
+
+	if ((nh->nh_flags & NHF_BLACKHOLE) != 0) {
+		/* Blackhole nexthops carry no interface/gateway attributes */
+		if (!nlattr_add_flag(ns, NHA_BLACKHOLE))
+			goto enomem;
+	} else {
+		if (!nlattr_add_u32(ns, NHA_OIF, nh->nh_ifp->if_index))
+			goto enomem;
+
+		switch (nh->gw_sa.sa_family) {
+#ifdef INET
+		case AF_INET:
+			if (!nlattr_add(ns, NHA_GATEWAY, 4, &nh->gw4_sa.sin_addr))
+				goto enomem;
+			break;
+#endif
+#ifdef INET6
+		case AF_INET6:
+			{
+				/* Report the gateway without the embedded scope */
+				struct in6_addr gw6 = nh->gw6_sa.sin6_addr;
+				in6_clearscope(&gw6);
+				if (!nlattr_add(ns, NHA_GATEWAY, 16, &gw6))
+					goto enomem;
+				break;
+			}
+#endif
+		}
+	}
+
+	nlmsg_end(ns);
+	return (true);
+
+enomem:
+	nlmsg_abort(ns);
+	return (false);
+}
+
+/*
+ * Dump @unhop as a nexthop group when it has no source nexthop, and as a
+ * plain nexthop otherwise. The helpers' results are not propagated.
+ */
+static void
+dump_unhop(const struct user_nhop *unhop, struct nlmsghdr *hdr,
+    struct nlmsg_state *ns)
+{
+	const bool is_group = (unhop->un_nhop_src == NULL);
+
+	if (is_group)
+		dump_nhgrp(unhop, hdr, ns);
+	else
+		dump_nhop(unhop, hdr, ns);
+}
+
+/*
+ * Delete the user nexthop @uidx together with all of its clones.
+ *
+ * The master record and every child are unlinked from the hash under the
+ * write lock, a NL_RTM_DELNEXTHOP notification is sent to RTNLGRP_NEXTHOP
+ * and the unlinked records are then freed via epoch callbacks.
+ *
+ * Returns 0 on success, ENOENT when @uidx is unknown, or ENOMEM when the
+ * notification writer could not be allocated. In the ENOMEM case the
+ * records have still been removed and are still freed.
+ */
+static int
+delete_unhop(struct unhop_ctl *ctl, struct nlmsghdr *hdr, uint32_t uidx)
+{
+	struct user_nhop *unhop_ret, *unhop_base, *unhop_chain;
+	struct user_nhop key = { .un_idx = uidx };
+	int error = 0;
+
+	UN_WLOCK(ctl);
+
+	CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop_base);
+
+	if (unhop_base != NULL) {
+		CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_base, unhop_ret);
+		/* Unlink all child nexthops as well, keeping the chain intact */
+		unhop_chain = unhop_base->un_nextchild;
+		while (unhop_chain != NULL) {
+			CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_chain,
+			    unhop_ret);
+			MPASS(unhop_chain == unhop_ret);
+			unhop_chain = unhop_chain->un_nextchild;
+		}
+	}
+
+	UN_WUNLOCK(ctl);
+
+	if (unhop_base == NULL) {
+		RT_LOG(LOG_DEBUG, "unable to find unhop %u", uidx);
+		return (ENOENT);
+	}
+
+	/* Report nexthop deletion */
+	struct netlink_walkargs wa = {
+		.hdr.nlmsg_pid = hdr->nlmsg_pid,
+		.hdr.nlmsg_seq = hdr->nlmsg_seq,
+		.hdr.nlmsg_flags = hdr->nlmsg_flags,
+		.hdr.nlmsg_type = NL_RTM_DELNEXTHOP,
+	};
+
+	struct nlmsg_state ns = {};
+	if (nlmsg_get_group_writer(NLMSG_SMALL, RTNLGRP_NEXTHOP, &ns)) {
+		dump_unhop(unhop_base, &wa.hdr, &ns);
+		nlmsg_flush(&ns);
+	} else {
+		RT_LOG(LOG_DEBUG, "error allocating message writer");
+		error = ENOMEM;
+	}
+
+	/*
+	 * The records are already out of the hash, so they must be freed
+	 * whether or not the notification could be sent.
+	 */
+	while (unhop_base != NULL) {
+		unhop_chain = unhop_base->un_nextchild;
+		epoch_call(net_epoch_preempt, destroy_unhop_epoch,
+		    &unhop_base->un_epoch_ctx);
+		unhop_base = unhop_chain;
+	}
+
+	return (error);
+}
+
+/*
+ * Resize the unhop hash to @new_size buckets.
+ * Does nothing when @new_size is 0 or the bucket array cannot be allocated.
+ */
+static void
+consider_resize(struct unhop_ctl *ctl, uint32_t new_size)
+{
+	void *buckets;
+
+	if (new_size == 0)
+		return;
+
+	buckets = malloc(CHT_SLIST_GET_RESIZE_SIZE(new_size), M_NETLINK,
+	    M_NOWAIT | M_ZERO);
+	if (buckets == NULL)
+		return;
+
+	RT_LOG(LOG_DEBUG, "resizing hash: %u -> %u", ctl->un_head.hash_size, new_size);
+	UN_WLOCK(ctl);
+	CHT_SLIST_RESIZE(&ctl->un_head, unhop, buckets, new_size);
+	UN_WUNLOCK(ctl);
+
+	/*
+	 * NOTE(review): presumably CHT_SLIST_RESIZE leaves the previous bucket
+	 * array in the variable it was given; whatever is there is released.
+	 */
+	if (buckets != NULL)
+		free(buckets, M_NETLINK);
+}
+
+/*
+ * Lazily allocate and publish the per-vnet unhop control structure.
+ *
+ * The structure is installed with a compare-and-set, so concurrent callers
+ * are safe: the loser tears down its own copy and uses the winner's.
+ * Returns false only when no control structure is installed afterwards
+ * (allocation failure).
+ */
+static bool __noinline
+vnet_init_unhops(void)
+{
+	uint32_t num_buckets = 16;
+	size_t alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets);
+
+	struct unhop_ctl *ctl = malloc(sizeof(struct unhop_ctl), M_NETLINK,
+	    M_NOWAIT | M_ZERO);
+	if (ctl == NULL)
+		return (false);
+
+	void *ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO);
+	if (ptr == NULL) {
+		free(ctl, M_NETLINK);
+		return (false);
+	}
+	CHT_SLIST_INIT(&ctl->un_head, ptr, num_buckets);
+	UN_LOCK_INIT(ctl);
+
+	if (!atomic_cmpset_ptr((uintptr_t *)&V_un_ctl, (uintptr_t)NULL, (uintptr_t)ctl)) {
+		/* Lost the race: the lock was initialised, so destroy it first */
+		rm_destroy(&ctl->un_lock);
+		free(ptr, M_NETLINK);
+		free(ctl, M_NETLINK);
+	}
+
+	if (atomic_load_ptr(&V_un_ctl) == NULL)
+		return (false);
+
+	RT_LOG(LOG_NOTICE, "UNHOPS init done");
+
+	return (true);
+}
+
+/*
+ * VNET teardown: unpublish the unhop control structure, wait for readers,
+ * destroy every remaining record and release the hash, the lock and the
+ * control structure itself. Does nothing if it was never initialised.
+ */
+static void
+vnet_destroy_unhops(const void *unused __unused)
+{
+	struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
+	struct user_nhop *unhop, *tmp;
+
+	if (ctl == NULL)
+		return;
+	V_un_ctl = NULL;
+
+	/* Wait till all unhop users finish their reads */
+	epoch_wait_preempt(net_epoch_preempt);
+
+	UN_WLOCK(ctl);
+	CHT_SLIST_FOREACH_SAFE(&ctl->un_head, unhop, unhop, tmp) {
+		destroy_unhop(unhop);
+	} CHT_SLIST_FOREACH_SAFE_END;
+	UN_WUNLOCK(ctl);
+
+	free(ctl->un_head.ptr, M_NETLINK);
+	/* Pair UN_LOCK_INIT() with rm_destroy() before the memory goes away */
+	rm_destroy(&ctl->un_lock);
+	free(ctl, M_NETLINK);
+}
+VNET_SYSUNINIT(vnet_destroy_unhops, SI_SUB_PROTO_IF, SI_ORDER_ANY,
+ vnet_destroy_unhops, NULL);
+
+/*
+ * Attribute parser callback for NHA_GROUP.
+ *
+ * Verifies that the payload is a non-empty whole number of
+ * struct nexthop_grp entries and stores the attribute pointer itself in
+ * @target (a struct nlattr **). Returns 0 or EINVAL.
+ */
+static int
+nlattr_get_nhg(struct nlattr *nla, struct netlink_parse_tracker *npt, void *target)
+{
+	const int entry_len = (int)sizeof(struct nexthop_grp);
+	int data_len = NLA_DATA_LEN(nla);
+
+	/* Signed arithmetic on purpose: a negative length must be rejected too */
+	if (data_len < entry_len || (data_len % entry_len) != 0) {
+		RT_LOG(LOG_DEBUG, "Invalid length for NHA_GROUP: %d", data_len);
+		return (EINVAL);
+	}
+
+	*((struct nlattr **)target) = nla;
+	return (0);
+}
+
+struct nl_parsed_nhop { /* parse target for nexthop request attributes */
+ uint32_t nha_id; /* NHA_ID */
+ uint8_t nha_blackhole; /* NHA_BLACKHOLE flag */
+ uint8_t nha_groups; /* NHA_GROUPS flag */
+ struct ifnet *nha_oif; /* NHA_OIF, parsed by nlattr_get_ifindex */
+ struct sockaddr *nha_gw; /* NHA_GATEWAY, parsed by nlattr_get_ip */
+ struct nlattr *nha_group; /* raw NHA_GROUP attribute, checked by nlattr_get_nhg */
+ int nh_family; /* nh_family from the request's nhmsg header */
+};
+#define _OFF_S(_field) offsetof(struct nl_parsed_nhop, _field)
+
+static struct nlattr_parser ps[] = {
+ { .type = NHA_ID, .off = _OFF_S(nha_id), .cb = nlattr_get_uint32 },
+ { .type = NHA_GROUP, .off = _OFF_S(nha_group), .cb = nlattr_get_nhg },
+ { .type = NHA_BLACKHOLE, .off = _OFF_S(nha_blackhole), .cb = nlattr_get_flag },
+ { .type = NHA_OIF, .off = _OFF_S(nha_oif), .cb = nlattr_get_ifindex },
+ { .type = NHA_GATEWAY, .off = _OFF_S(nha_gw), .cb = nlattr_get_ip },
+ { .type = NHA_GROUPS, .off = _OFF_S(nha_groups), .cb = nlattr_get_flag },
+};
+
+/*
+ * A nexthop may be a group member only if it has NHF_GATEWAY set.
+ */
+static bool
+eligible_nhg(const struct nhop_object *nh)
+{
+	return ((nh->nh_flags & NHF_GATEWAY) != 0);
+}
+
+/*
+ * Fill @unhop with the member list described by the NHA_GROUP attribute.
+ *
+ * Every referenced user index must name an existing, non-group,
+ * gateway nexthop. On success @unhop owns the allocated member array.
+ * Returns 0, ENOMEM, ESRCH (unknown index) or ENOTSUP (nested group or
+ * ineligible nexthop).
+ */
+static int
+newnhg(struct unhop_ctl *ctl, struct nl_parsed_nhop *attrs, struct user_nhop *unhop)
+{
+	struct nexthop_grp *grp = NLA_DATA(attrs->nha_group);
+	int count = NLA_DATA_LEN(attrs->nha_group) / sizeof(*grp);
+	struct weightened_nhop *wn;
+	int error;
+
+	wn = malloc(sizeof(*wn) * count, M_NETLINK, M_NOWAIT | M_ZERO);
+	if (wn == NULL)
+		return (ENOMEM);
+
+	for (int i = 0; i < count; i++) {
+		struct user_nhop *member = nl_find_base_unhop(ctl, grp[i].id);
+
+		if (member == NULL) {
+			RT_LOG(LOG_DEBUG, "unable to find uidx %u", grp[i].id);
+			error = ESRCH;
+			goto fail;
+		}
+		if (member->un_nhop_src == NULL) {
+			RT_LOG(LOG_DEBUG, "uidx %u is a group, nested group unsupported",
+			    grp[i].id);
+			error = ENOTSUP;
+			goto fail;
+		}
+		if (!eligible_nhg(member->un_nhop_src)) {
+			RT_LOG(LOG_DEBUG, "uidx %u nhop is not mpath-eligible",
+			    grp[i].id);
+			error = ENOTSUP;
+			goto fail;
+		}
+		/*
+		 * TODO: consider more rigid eligibility checks:
+		 * restrict nexthops with the same gateway
+		 */
+		wn[i].nh = member->un_nhop_src;
+		wn[i].weight = grp[i].weight;
+	}
+
+	unhop->un_nhgrp_src = wn;
+	unhop->un_nhgrp_count = count;
+	return (0);
+
+fail:
+	free(wn, M_NETLINK);
+	return (error);
+}
+
+/*
+ * Build the source nexthop for @unhop from the parsed attributes.
+ *
+ * Non-blackhole nexthops need both NHA_GATEWAY and NHA_OIF, and an
+ * interface address matching the gateway must exist on the interface.
+ * Returns 0 with unhop->un_nhop_src set, EINVAL for missing or unusable
+ * attributes, ENOMEM, or the error from nhop_get_unlinked().
+ */
+static int
+newnhop(struct nl_parsed_nhop *attrs, struct user_nhop *unhop)
+{
+	const bool blackhole = (attrs->nha_blackhole != 0);
+	struct ifaddr *ifa = NULL;
+	struct nhop_object *nh;
+	int error, family;
+
+	if (!blackhole) {
+		if (attrs->nha_gw == NULL) {
+			RT_LOG(LOG_DEBUG, "missing NHA_GATEWAY");
+			return (EINVAL);
+		}
+		if (attrs->nha_oif == NULL) {
+			RT_LOG(LOG_DEBUG, "missing NHA_OIF");
+			return (EINVAL);
+		}
+		ifa = ifaof_ifpforaddr(attrs->nha_gw, attrs->nha_oif);
+		if (ifa == NULL) {
+			RT_LOG(LOG_DEBUG, "Unable to determine default source IP");
+			return (EINVAL);
+		}
+	}
+
+	/* The gateway's family wins; fall back to the header's nh_family */
+	if (attrs->nha_gw != NULL)
+		family = attrs->nha_gw->sa_family;
+	else
+		family = attrs->nh_family;
+
+	nh = nhop_alloc(RT_DEFAULT_FIB, family);
+	if (nh == NULL) {
+		RT_LOG(LOG_DEBUG, "Unable to allocate nexthop");
+		return (ENOMEM);
+	}
+	nhop_set_uidx(nh, attrs->nha_id);
+
+	if (blackhole) {
+		nhop_set_blackhole(nh, NHF_BLACKHOLE);
+	} else {
+		nhop_set_gw(nh, attrs->nha_gw, true);
+		nhop_set_transmit_ifp(nh, attrs->nha_oif);
+		nhop_set_src(nh, ifa);
+	}
+
+	error = nhop_get_unlinked(nh);
+	if (error != 0) {
+		RT_LOG(LOG_DEBUG, "unable to finalize nexthop");
+		return (error);
+	}
+
+#if DEBUG_MAX_LEVEL >= LOG_DEBUG2
+	{
+		char nhbuf[NHOP_PRINT_BUFSIZE];
+		nhop_print_buf(nh, nhbuf, sizeof(nhbuf));
+		RT_LOG(LOG_DEBUG2, "Adding unhop %u: %s", attrs->nha_id, nhbuf);
+	}
+#endif
+	unhop->un_nhop_src = nh;
+	return (0);
+}
+
+/*
+ * Handles RTM_NEWNEXTHOP: creates a user-indexed nexthop or nexthop group.
+ *
+ * The caller must pass the nlp_has_priv_route() check. The per-VNET control
+ * structure is set up on first use via vnet_init_unhops(). When NHA_ID is 0
+ * a spare index is picked with find_spare_uidx(). The new object is built by
+ * newnhg() (NHA_GROUP present) or newnhop(), linked into the hash under the
+ * write lock and then reported to the RTNLGRP_NEXTHOP group.
+ *
+ * Returns 0 on success; otherwise ENOMEM, ENOSPC, EEXIST or the error
+ * returned by the privilege check, the attribute parser, newnhg() or
+ * newnhop().
+ */
+static int
+rtnl_handle_newnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
+ struct netlink_parse_tracker *npt)
+{
+ struct user_nhop *unhop;
+ int error;
+
+ error = nlp_has_priv_route(nlp);
+ if (error != 0)
+ return (error);
+
+ /* Lazily create the per-VNET nexthop control structure */
+ if ((__predict_false(V_un_ctl == NULL)) && (!vnet_init_unhops()))
+ return (ENOMEM);
+ struct unhop_ctl *ctl = V_un_ctl;
+
+ struct nhmsg *nhm = (struct nhmsg *)nlmsg_data(hdr);
+ struct nl_parsed_nhop attrs = { .nh_family = nhm->nh_family };
+ error = nl_parse_attrs(hdr, sizeof(*nhm), ps, sizeof(ps)/sizeof(ps[0]), npt, &attrs);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Get valid nha_id. Treat nha_id == 0 (auto-assignment) as a second-class
+ * citizen.
+ */
+ if (attrs.nha_id == 0) {
+ attrs.nha_id = find_spare_uidx(ctl);
+ if (attrs.nha_id == 0) {
+ RT_LOG(LOG_DEBUG, "Unable to get spare uidx");
+ return (ENOSPC);
+ }
+ }
+
+ RT_LOG(LOG_DEBUG, "IFINDEX %d", attrs.nha_oif ? attrs.nha_oif->if_index : 0);
+
+ unhop = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO);
+ if (unhop == NULL) {
+ RT_LOG(LOG_DEBUG, "Unable to allocate user_nhop");
+ return (ENOMEM);
+ }
+ unhop->un_idx = attrs.nha_id;
+ unhop->un_protocol = nhm->nh_protocol;
+
+ /* NHA_GROUP present: build a group, otherwise a single nexthop */
+ if (attrs.nha_group)
+ error = newnhg(ctl, &attrs, unhop);
+ else
+ error = newnhop(&attrs, unhop);
+
+ if (error != 0) {
+ free(unhop, M_NETLINK);
+ return (error);
+ }
+
+ UN_WLOCK(ctl);
+ /* Check if uidx already exists */
+ struct user_nhop *tmp = NULL;
+ CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, unhop, tmp);
+ if (tmp != NULL) {
+ UN_WUNLOCK(ctl);
+ RT_LOG(LOG_DEBUG, "nhop idx %u already exists", attrs.nha_id);
+ destroy_unhop(unhop);
+ return (EEXIST);
+ }
+ CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, unhop);
+ /* Sampled under the lock, consumed by consider_resize() below */
+ uint32_t num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->un_head);
+ UN_WUNLOCK(ctl);
+
+ /* Report addition of the new nexthop */
+ struct netlink_walkargs wa = {
+ .hdr.nlmsg_pid = hdr->nlmsg_pid,
+ .hdr.nlmsg_seq = hdr->nlmsg_seq,
+ .hdr.nlmsg_flags = hdr->nlmsg_flags,
+ .hdr.nlmsg_type = NL_RTM_NEWNEXTHOP,
+ };
+
+ /*
+ * NOTE(review): the object is already linked at this point, so the
+ * ENOMEM below is returned for an operation that has taken effect, and
+ * consider_resize() is skipped. Confirm this is intended.
+ */
+ struct nlmsg_state ns = {};
+ if (!nlmsg_get_group_writer(NLMSG_SMALL, RTNLGRP_NEXTHOP, &ns)) {
+ RT_LOG(LOG_DEBUG, "error allocating message writer");
+ return (ENOMEM);
+ }
+
+ dump_unhop(unhop, &wa.hdr, &ns);
+ nlmsg_flush(&ns);
+
+ consider_resize(ctl, num_buckets_new);
+
+ return (0);
+}
+
+/*
+ * Handles RTM_DELNEXTHOP: removes the user nexthop identified by NHA_ID.
+ *
+ * Returns the result of delete_unhop(), or an earlier failure: the error
+ * from nlp_has_priv_route(), ESRCH when no nexthop control structure
+ * exists in this VNET, the attribute parser error, or EINVAL when NHA_ID
+ * is absent or 0.
+ */
+static int
+rtnl_handle_delnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
+    struct netlink_parse_tracker *npt)
+{
+	struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
+	int error = nlp_has_priv_route(nlp);
+
+	if (error != 0)
+		return (error);
+	if (__predict_false(ctl == NULL))
+		return (ESRCH);
+
+	struct nhmsg *nhm = (struct nhmsg *)nlmsg_data(hdr);
+	struct nl_parsed_nhop attrs = {};
+
+	error = nl_parse_attrs(hdr, sizeof(*nhm), ps, sizeof(ps)/sizeof(ps[0]), npt, &attrs);
+	if (error != 0)
+		return (error);
+	if (attrs.nha_id == 0) {
+		RT_LOG(LOG_DEBUG, "NHA_ID not set");
+		return (EINVAL);
+	}
+
+	return (delete_unhop(ctl, hdr, attrs.nha_id));
+}
+
+/*
+ * Checks @unhop against the optional filters in @attrs: a non-zero nha_id
+ * must equal the nexthop index, a non-zero nha_groups accepts only groups,
+ * and a non-NULL nha_oif accepts only plain nexthops using that interface.
+ */
+static bool
+match_unhop(const struct nl_parsed_nhop *attrs, struct user_nhop *unhop)
+{
+	const bool id_ok = (attrs->nha_id == 0 ||
+	    unhop->un_idx == attrs->nha_id);
+	const bool group_ok = (attrs->nha_groups == 0 ||
+	    unhop->un_nhgrp_src != NULL);
+	const bool oif_ok = (attrs->nha_oif == NULL ||
+	    (unhop->un_nhop_src != NULL &&
+	    unhop->un_nhop_src->nh_ifp == attrs->nha_oif));
+
+	return (id_ok && group_ok && oif_ok);
+}
+
+/*
+ * Handles RTM_GETNEXTHOP.
+ *
+ * With a non-zero NHA_ID a single nexthop is looked up and dumped; ESRCH is
+ * returned when it does not exist. Otherwise every nexthop that satisfies
+ * UNHOP_IS_MASTER() and match_unhop() is dumped as an NLM_F_MULTI sequence
+ * terminated by nlmsg_end_dump().
+ *
+ * Returns 0 on success, ESRCH when there is no nexthop control structure
+ * or no matching index, the attribute parser error, or ENOMEM when the
+ * dump cannot be finalized.
+ */
+static int
+rtnl_handle_getnhop(struct nlmsghdr *hdr, struct nlpcb *nlp,
+ struct netlink_parse_tracker *npt)
+{
+ struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl);
+ struct user_nhop *unhop;
+ UN_TRACKER;
+ int error;
+
+ if (__predict_false(ctl == NULL))
+ return (ESRCH);
+
+ struct nhmsg *nhm = (struct nhmsg *)nlmsg_data(hdr);
+ struct nl_parsed_nhop attrs = {};
+ error = nl_parse_attrs(hdr, sizeof(*nhm), ps, sizeof(ps)/sizeof(ps[0]), npt, &attrs);
+ if (error != 0)
+ return (error);
+
+ /* Replies reuse the request pid/seq/flags and are typed as NEWNEXTHOP */
+ struct netlink_walkargs wa = {
+ .ns = npt->ns,
+ .hdr.nlmsg_pid = hdr->nlmsg_pid,
+ .hdr.nlmsg_seq = hdr->nlmsg_seq,
+ .hdr.nlmsg_flags = hdr->nlmsg_flags,
+ .hdr.nlmsg_type = NL_RTM_NEWNEXTHOP,
+ };
+
+ if (attrs.nha_id != 0) {
+ RT_LOG(LOG_DEBUG2, "searching for uidx %u", attrs.nha_id);
+ struct user_nhop key= { .un_idx = attrs.nha_id };
+ UN_RLOCK(ctl);
+ CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop);
+ UN_RUNLOCK(ctl);
+
+ /*
+ * NOTE(review): unhop is passed to dump_unhop() after the read
+ * lock has been dropped; confirm its lifetime is guaranteed here.
+ */
+ if (unhop == NULL)
+ return (ESRCH);
+ dump_unhop(unhop, &wa.hdr, wa.ns);
+ return (0);
+ }
+
+ /* No index given: dump every matching nexthop as a multipart reply */
+ UN_RLOCK(ctl);
+ wa.hdr.nlmsg_flags |= NLM_F_MULTI;
+ CHT_SLIST_FOREACH(&ctl->un_head, unhop, unhop) {
+ if (UNHOP_IS_MASTER(unhop) && match_unhop(&attrs, unhop))
+ dump_unhop(unhop, &wa.hdr, wa.ns);
+ } CHT_SLIST_FOREACH_END;
+ UN_RUNLOCK(ctl);
+
+ /* wa.error is not assigned anywhere in this function */
+ if (wa.error == 0) {
+ if (!nlmsg_end_dump(wa.ns, wa.error, &wa.hdr))
+ return (ENOMEM);
+ }
+ return (0);
+}
+
+/*
+ * Nexthop message handlers registered by rtnl_nexthops_init(). Each entry
+ * lists the message type, its printable name, the handler and the size of
+ * the fixed message header (struct nhmsg).
+ */
+static struct rtnl_cmd_handler cmd_handlers[] = {
+ { NL_RTM_GETNEXTHOP, "RTM_GETNEXTHOP", &rtnl_handle_getnhop, sizeof(struct nhmsg)},
+ { NL_RTM_DELNEXTHOP, "RTM_DELNEXTHOP", &rtnl_handle_delnhop, sizeof(struct nhmsg)},
+ { NL_RTM_NEWNEXTHOP, "RTM_NEWNEXTHOP", &rtnl_handle_newnhop, sizeof(struct nhmsg)},
+};
+
+/*
+ * Registers the RTM_{GET,DEL,NEW}NEXTHOP handlers from cmd_handlers.
+ */
+void
+rtnl_nexthops_init(void)
+{
+	rtnl_register_messages(cmd_handlers, RTNL_ARRAY_LEN(cmd_handlers));
+}
diff --git a/sys/netlink/route/route.h b/sys/netlink/route/route.h
new file mode 100644
--- /dev/null
+++ b/sys/netlink/route/route.h
@@ -0,0 +1,349 @@
+/*
+ * Route-related (RTM_<NEW|DEL|GET>ROUTE) message header and attributes.
+ */
+
+#ifndef _NETLINK_ROUTE_ROUTE_H_
+#define _NETLINK_ROUTE_ROUTE_H_
+
+/*
+ * Base header for all of the relevant (RTM_<NEW|DEL|GET>ROUTE) messages.
+ * rtm_table is a single byte; the full 32-bit table id can be carried in
+ * the NL_RTA_TABLE attribute.
+ */
+struct rtmsg {
+ unsigned char rtm_family; /* address family */
+ unsigned char rtm_dst_len; /* Prefix length */
+ unsigned char rtm_src_len; /* Source prefix length (not used) */
+ unsigned char rtm_tos; /* Type of service (not used) */
+ unsigned char rtm_table; /* rtable id */
+ unsigned char rtm_protocol; /* Routing protocol id (RTPROT_) */
+ unsigned char rtm_scope; /* Route distance (RT_SCOPE_) */
+ unsigned char rtm_type; /* Route type (RTN_) */
+ unsigned rtm_flags; /* Route flags (RTM_F_) */
+};
+
+/*
+ * RFC 3549, 3.1.1, route type (rtm_type field).
+ * __RTN_MAX is a sentinel that only marks the end of the list.
+ */
+enum {
+ RTN_UNSPEC,
+ RTN_UNICAST, /* Unicast route */
+ RTN_LOCAL, /* Accept locally (not supported) */
+ RTN_BROADCAST, /* Accept locally as broadcast, send as broadcast */
+ RTN_ANYCAST, /* Accept locally as broadcast, but send as unicast */
+ RTN_MULTICAST, /* Multicast route */
+ RTN_BLACKHOLE, /* Drop traffic towards destination */
+ RTN_UNREACHABLE, /* Destination is unreachable */
+ RTN_PROHIBIT, /* Administratively prohibited */
+ RTN_THROW, /* Not in this table (not supported) */
+ RTN_NAT, /* Translate this address (not supported) */
+ RTN_XRESOLVE, /* Use external resolver (not supported) */
+ __RTN_MAX,
+};
+/* Highest route type value defined above */
+#define RTN_MAX (__RTN_MAX - 1)
+
+/*
+ * RFC 3549, 3.1.1, protocol (Identifies what/who added the route).
+ * Values larger than RTPROT_STATIC(4) are not interpreted by the
+ * kernel, they are just for user information.
+ */
+#define RTPROT_UNSPEC 0
+#define RTPROT_REDIRECT 1 /* Route installed by ICMP redirect */
+#define RTPROT_KERNEL 2 /* Route installed by kernel */
+#define RTPROT_BOOT 3 /* Route installed during boot */
+#define RTPROT_STATIC 4 /* Route installed by administrator */
+
+#define RTPROT_GATED 8 /* Apparently, GateD */
+#define RTPROT_RA 9 /* RDISC/ND router advertisements */
+#define RTPROT_MRT 10 /* Merit MRT */
+#define RTPROT_ZEBRA 11 /* Zebra */
+#define RTPROT_BIRD 12 /* BIRD */
+#define RTPROT_DNROUTED 13 /* DECnet routing daemon */
+#define RTPROT_XORP 14 /* XORP */
+#define RTPROT_NTK 15 /* Netsukuku */
+#define RTPROT_DHCP 16 /* DHCP client */
+#define RTPROT_MROUTED 17 /* Multicast daemon */
+#define RTPROT_KEEPALIVED 18 /* Keepalived daemon */
+#define RTPROT_BABEL 42 /* Babel daemon */
+#define RTPROT_OPENR 99 /* Open Routing (Open/R) Routes */
+#define RTPROT_BGP 186 /* BGP Routes */
+#define RTPROT_ISIS 187 /* ISIS Routes */
+#define RTPROT_OSPF 188 /* OSPF Routes */
+#define RTPROT_RIP 189 /* RIP Routes */
+#define RTPROT_EIGRP 192 /* EIGRP Routes */
+
+/*
+ * RFC 3549 3.1.1 Route scope (valid distance to destination).
+ *
+ * The values between RT_SCOPE_UNIVERSE(0) and RT_SCOPE_SITE(200)
+ * are available to the user.
+ */
+enum rt_scope_t {
+ RT_SCOPE_UNIVERSE = 0,
+ /* User defined values */
+ RT_SCOPE_SITE = 200,
+ RT_SCOPE_LINK = 253,
+ RT_SCOPE_HOST = 254,
+ RT_SCOPE_NOWHERE = 255
+};
+
+/*
+ * RFC 3549 3.1.1 Route flags.
+ */
+#define RTM_F_NOTIFY 0x100 /* Notify user of route change */
+#define RTM_F_CLONED 0x200 /* This route is cloned (not used) */
+#define RTM_F_EQUALIZE 0x400 /* Multipath equalizer: NI */
+#define RTM_F_PREFIX 0x800 /* Prefix addresses */
+#define RTM_F_LOOKUP_TABLE 0x1000 /* set tableid to FIB lookup result */
+#define RTM_F_FIB_MATCH 0x2000 /* return full fib lookup match */
+#define RTM_F_OFFLOAD 0x4000 /* route is offloaded */
+#define RTM_F_TRAP 0x8000 /* route is trapping packets */
+#define RTM_F_OFFLOAD_FAILED 0x20000000 /* route offload failed */
+
+/* Compatibility handling helpers */
+#ifndef _KERNEL
+#define NL_RTM_HDRLEN ((int)sizeof(struct rtmsg))
+#define RTM_RTA(_rtm) ((struct rtattr *)((char *)(_rtm) + NL_RTM_HDRLEN))
+#define RTM_PAYLOAD(_hdr) NLMSG_PAYLOAD((_hdr), NL_RTM_HDRLEN)
+#endif
+
+/*
+ * Routing table identifiers.
+ * FreeBSD route table numbering starts from 0, where 0 is a valid default routing table.
+ * Indicating "all tables" via netlink can be done by not including RTA_TABLE attribute
+ * and keeping rtm_table=0 (compatibility) or setting RTA_TABLE value to RT_TABLE_UNSPEC.
+ */
+#define RT_TABLE_MAIN 0 /* RT_DEFAULT_FIB */
+#define RT_TABLE_UNSPEC 0xFFFFFFFF /* RT_ALL_FIBS */
+
+enum rtattr_type_t {
+ NL_RTA_UNSPEC,
+ NL_RTA_DST, /* network: IPv4/IPv6 destination */
+ NL_RTA_SRC,
+ NL_RTA_IIF, /* not supported */
+ NL_RTA_OIF, /* u32: transmit ifindex */
+ NL_RTA_GATEWAY, /* network: IPv4/IPv6 gateway */
+ NL_RTA_PRIORITY,
+ NL_RTA_PREFSRC,
+ NL_RTA_METRICS,
+ NL_RTA_MULTIPATH,
+ NL_RTA_PROTOINFO, /* not supported / deprecated */
+ NL_RTA_FLOW, /* not supported */
+ NL_RTA_CACHEINFO, /* not supported */
+ NL_RTA_SESSION, /* not supported / deprecated */
+ NL_RTA_MP_ALGO, /* not supported / deprecated */
+ NL_RTA_TABLE, /* u32: fibnum */
+ NL_RTA_MARK, /* not used */
+ NL_RTA_MFC_STATS,
+ NL_RTA_VIA, /* network: af+ gw address */
+ NL_RTA_NEWDST,
+ NL_RTA_PREF,
+ NL_RTA_ENCAP_TYPE,
+ NL_RTA_ENCAP,
+ NL_RTA_EXPIRES,
+ NL_RTA_PAD,
+ NL_RTA_UID,
+ NL_RTA_TTL_PROPAGATE,
+ NL_RTA_IP_PROTO,
+ NL_RTA_SPORT,
+ NL_RTA_DPORT,
+ NL_RTA_NH_ID, /* u32: nexthop/nexthop group index */
+ __RTA_MAX
+};
+#define NL_RTA_MAX (__RTA_MAX - 1)
+
+/*
+ * Attributes that can be used as filters:
+ *
+ */
+
+#ifndef _KERNEL
+/*
+ * RTA_* space has clashes with rtsock namespace.
+ * Use NL_RTA_ prefix in the kernel and map to
+ * RTA_ for userland.
+ */
+#define RTA_UNSPEC NL_RTA_UNSPEC
+#define RTA_DST NL_RTA_DST
+#define RTA_SRC NL_RTA_SRC
+#define RTA_IIF NL_RTA_IIF
+#define RTA_OIF NL_RTA_OIF
+#define RTA_GATEWAY NL_RTA_GATEWAY
+#define RTA_PRIORITY NL_RTA_PRIORITY
+#define RTA_PREFSRC NL_RTA_PREFSRC
+#define RTA_METRICS NL_RTA_METRICS
+#define RTA_MULTIPATH NL_RTA_MULTIPATH
+#define RTA_PROTOINFO NL_RTA_PROTOINFO
+#define RTA_FLOW NL_RTA_FLOW
+#define RTA_CACHEINFO NL_RTA_CACHEINFO
+#define RTA_SESSION NL_RTA_SESSION
+#define RTA_MP_ALGO NL_RTA_MP_ALGO
+#define RTA_TABLE NL_RTA_TABLE
+#define RTA_MARK NL_RTA_MARK
+#define RTA_MFC_STATS NL_RTA_MFC_STATS
+#define RTA_VIA NL_RTA_VIA
+#define RTA_NEWDST NL_RTA_NEWDST
+#define RTA_PREF NL_RTA_PREF
+#define RTA_ENCAP_TYPE NL_RTA_ENCAP_TYPE
+#define RTA_ENCAP NL_RTA_ENCAP
+#define RTA_EXPIRES NL_RTA_EXPIRES
+#define RTA_PAD NL_RTA_PAD
+#define RTA_UID NL_RTA_UID
+#define RTA_TTL_PROPAGATE NL_RTA_TTL_PROPAGATE
+#define RTA_IP_PROTO NL_RTA_IP_PROTO
+#define RTA_SPORT NL_RTA_SPORT
+#define RTA_DPORT NL_RTA_DPORT
+#define RTA_NH_ID NL_RTA_NH_ID
+#define RTA_MAX NL_RTA_MAX
+#endif
+
+/* route attribute header */
+struct rtattr {
+ unsigned short rta_len; /* attribute length, header included */
+ unsigned short rta_type; /* attribute type */
+};
+
+#define NL_RTA_ALIGN_SIZE NL_ITEM_ALIGN_SIZE
+#define NL_RTA_ALIGN NL_ITEM_ALIGN
+#define NL_RTA_HDRLEN ((int)sizeof(struct rtattr))
+#define NL_RTA_DATA_LEN(_rta) ((int)((_rta)->rta_len - NL_RTA_HDRLEN))
+#define NL_RTA_DATA(_rta) NL_ITEM_DATA(_rta, NL_RTA_HDRLEN)
+#define NL_RTA_DATA_CONST(_rta) NL_ITEM_DATA_CONST(_rta, NL_RTA_HDRLEN)
+
+/* Compatibility attribute handling helpers */
+#ifndef _KERNEL
+#define RTA_ALIGNTO NL_RTA_ALIGN_SIZE
+#define RTA_ALIGN(_len) NL_RTA_ALIGN(_len)
+#define _RTA_LEN(_rta) ((int)(_rta)->rta_len)
+#define _RTA_ALIGNED_LEN(_rta) RTA_ALIGN(_RTA_LEN(_rta))
+#define RTA_OK(_rta, _len) NL_ITEM_OK(_rta, _len, NL_RTA_HDRLEN, _RTA_LEN)
+#define RTA_NEXT(_rta, _len) NL_ITEM_ITER(_rta, _len, _RTA_ALIGNED_LEN)
+#define RTA_LENGTH(_len) (NL_RTA_HDRLEN + (_len))
+#define RTA_SPACE(_len) RTA_ALIGN(RTA_LENGTH(_len))
+#define RTA_DATA(_rta) NL_RTA_DATA(_rta)
+#define RTA_PAYLOAD(_rta) ((int)(_RTA_LEN(_rta) - NL_RTA_HDRLEN))
+#endif
+
+/* RTA attribute headers */
+
+/*
+ * RTA_VIA payload: the gateway address family followed by the raw address
+ * bytes of that family (e.g. 16 bytes for AF_INET6).
+ * The address is a C99 flexible array member, so sizeof(struct rtvia)
+ * covers the family field only.
+ */
+struct rtvia {
+	sa_family_t	rtvia_family;
+	uint8_t		rtvia_addr[];
+};
+
+/*
+ * RTA_METRICS is a nested attribute, consisting of an array of 'struct rtattr'
+ * with the types defined below. Most of the values are uint32_t.
+ *
+ * Every enumerator is also defined as a macro expanding to itself,
+ * presumably so that its presence can be tested with #ifdef.
+ */
+ enum {
+ NL_RTAX_UNSPEC,
+#define NL_RTAX_UNSPEC NL_RTAX_UNSPEC
+ NL_RTAX_LOCK,
+#define NL_RTAX_LOCK NL_RTAX_LOCK
+ NL_RTAX_MTU,
+#define NL_RTAX_MTU NL_RTAX_MTU
+ NL_RTAX_WINDOW,
+#define NL_RTAX_WINDOW NL_RTAX_WINDOW
+ NL_RTAX_RTT,
+#define NL_RTAX_RTT NL_RTAX_RTT
+ NL_RTAX_RTTVAR,
+#define NL_RTAX_RTTVAR NL_RTAX_RTTVAR
+ NL_RTAX_SSTHRESH,
+#define NL_RTAX_SSTHRESH NL_RTAX_SSTHRESH
+ NL_RTAX_CWND,
+#define NL_RTAX_CWND NL_RTAX_CWND
+ NL_RTAX_ADVMSS,
+#define NL_RTAX_ADVMSS NL_RTAX_ADVMSS
+ NL_RTAX_REORDERING,
+#define NL_RTAX_REORDERING NL_RTAX_REORDERING
+ NL_RTAX_HOPLIMIT,
+#define NL_RTAX_HOPLIMIT NL_RTAX_HOPLIMIT
+ NL_RTAX_INITCWND,
+#define NL_RTAX_INITCWND NL_RTAX_INITCWND
+ NL_RTAX_FEATURES,
+#define NL_RTAX_FEATURES NL_RTAX_FEATURES
+ NL_RTAX_RTO_MIN,
+#define NL_RTAX_RTO_MIN NL_RTAX_RTO_MIN
+ NL_RTAX_INITRWND,
+#define NL_RTAX_INITRWND NL_RTAX_INITRWND
+ NL_RTAX_QUICKACK,
+#define NL_RTAX_QUICKACK NL_RTAX_QUICKACK
+ NL_RTAX_CC_ALGO,
+#define NL_RTAX_CC_ALGO NL_RTAX_CC_ALGO
+ NL_RTAX_FASTOPEN_NO_COOKIE,
+#define NL_RTAX_FASTOPEN_NO_COOKIE NL_RTAX_FASTOPEN_NO_COOKIE
+ __NL_RTAX_MAX
+};
+/* Highest metric type value defined above */
+#define NL_RTAX_MAX (__NL_RTAX_MAX - 1)
+
+#define RTAX_FEATURE_ECN (1 << 0)
+#define RTAX_FEATURE_SACK (1 << 1)
+#define RTAX_FEATURE_TIMESTAMP (1 << 2)
+#define RTAX_FEATURE_ALLFRAG (1 << 3)
+
+#define RTAX_FEATURE_MASK \
+ (RTAX_FEATURE_ECN | RTAX_FEATURE_SACK | RTAX_FEATURE_TIMESTAMP | \
+ RTAX_FEATURE_ALLFRAG)
+
+#ifndef _KERNEL
+
+/*
+ * RTAX_* space clashes with rtsock namespace.
+ * Use NL_RTAX_ prefix in the kernel and map to
+ * RTAX_ for userland.
+ */
+#define RTAX_UNSPEC NL_RTAX_UNSPEC
+#define RTAX_LOCK NL_RTAX_LOCK
+#define RTAX_MTU NL_RTAX_MTU
+#define RTAX_WINDOW NL_RTAX_WINDOW
+#define RTAX_RTT NL_RTAX_RTT
+#define RTAX_RTTVAR NL_RTAX_RTTVAR
+#define RTAX_SSTHRESH NL_RTAX_SSTHRESH
+#define RTAX_CWND NL_RTAX_CWND
+#define RTAX_ADVMSS NL_RTAX_ADVMSS
+#define RTAX_REORDERING NL_RTAX_REORDERING
+#define RTAX_HOPLIMIT NL_RTAX_HOPLIMIT
+#define RTAX_INITCWND NL_RTAX_INITCWND
+#define RTAX_FEATURES NL_RTAX_FEATURES
+#define RTAX_RTO_MIN NL_RTAX_RTO_MIN
+#define RTAX_INITRWND NL_RTAX_INITRWND
+#define RTAX_QUICKACK NL_RTAX_QUICKACK
+#define RTAX_CC_ALGO NL_RTAX_CC_ALGO
+#define RTAX_FASTOPEN_NO_COOKIE NL_RTAX_FASTOPEN_NO_COOKIE
+#endif
+
+/*
+ * RTA_MULTIPATH consists of an array of rtnexthop structures.
+ * Each structure may be followed by nested attributes (see RTNH_DATA()).
+ */
+struct rtnexthop {
+ unsigned short rtnh_len; /* entry length (see RTNH_OK()/RTNH_NEXT()) */
+ unsigned char rtnh_flags; /* RTNH_F_ flags */
+ unsigned char rtnh_hops; /* nexthop weight */
+ int rtnh_ifindex; /* transmit interface index */
+};
+
+/* rtnh_flags */
+#define RTNH_F_DEAD 0x01 /* Nexthop is dead (used by multipath) */
+#define RTNH_F_PERVASIVE 0x02 /* Do recursive gateway lookup */
+#define RTNH_F_ONLINK 0x04 /* Gateway is forced on link */
+#define RTNH_F_OFFLOAD 0x08 /* Nexthop is offloaded */
+#define RTNH_F_LINKDOWN 0x10 /* carrier-down on nexthop */
+#define RTNH_F_UNRESOLVED 0x20 /* The entry is unresolved (ipmr) */
+#define RTNH_F_TRAP 0x40 /* Nexthop is trapping packets */
+
+#define RTNH_COMPARE_MASK (RTNH_F_DEAD | RTNH_F_LINKDOWN | \
+ RTNH_F_OFFLOAD | RTNH_F_TRAP)
+
+/* Macros to handle nexthops */
+#define RTNH_ALIGNTO NL_ITEM_ALIGN_SIZE
+#define RTNH_ALIGN(_len) NL_ITEM_ALIGN(_len)
+#define RTNH_HDRLEN ((int)sizeof(struct rtnexthop))
+#define _RTNH_LEN(_nh) ((int)(_nh)->rtnh_len)
+#define _RTNH_ALIGNED_LEN(_nh) RTNH_ALIGN(_RTNH_LEN(_nh))
+#define RTNH_OK(_nh, _len) NL_ITEM_OK(_nh, _len, RTNH_HDRLEN, _RTNH_LEN)
+#define RTNH_NEXT(_nh) ((struct rtnexthop *)((char *)(_nh) + _RTNH_ALIGNED_LEN(_nh)))
+#define RTNH_LENGTH(_len) (RTNH_HDRLEN + (_len))
+#define RTNH_SPACE(_len) RTNH_ALIGN(RTNH_LENGTH(_len))
+#define RTNH_DATA(_nh) ((struct rtattr *)NL_ITEM_DATA(_nh, RTNH_HDRLEN))
+
+
+/* Generic message header that carries only an address family */
+struct rtgenmsg {
+ unsigned char rtgen_family;
+};
+
+#endif
diff --git a/sys/netlink/route/route.c b/sys/netlink/route/route.c
new file mode 100644
--- /dev/null
+++ b/sys/netlink/route/route.c
@@ -0,0 +1,822 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ * Copyright (c) 2022 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_route.h"
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/rmlock.h>
+#include <sys/socket.h>
+
+#include <net/if.h>
+#include <net/route.h>
+#include <net/route/nhop.h>
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_route.h>
+#include <netlink/route/route_var.h>
+
+#define DEBUG_MOD_NAME nl_route
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_DEBUG);
+
+/*
+ * Maps nexthop flags to the netlink route type: RTN_BLACKHOLE for
+ * NHF_BLACKHOLE, RTN_PROHIBIT for NHF_REJECT, RTN_UNICAST otherwise.
+ */
+static unsigned char
+get_rtm_type(const struct nhop_object *nh)
+{
+	/* Use the fact that nhg runtime flags are only NHF_MULTIPATH */
+	const int flags = nh->nh_flags;
+	unsigned char rtm_type = RTN_UNICAST;
+
+	if ((flags & NHF_BLACKHOLE) != 0)
+		rtm_type = RTN_BLACKHOLE;
+	else if ((flags & NHF_REJECT) != 0)
+		rtm_type = RTN_PROHIBIT;
+
+	return (rtm_type);
+}
+
+/*
+ * Derives the netlink routing protocol id from the route flags of @nh:
+ * RTF_PROTO1 maps to RTPROT_ZEBRA, RTF_STATIC to RTPROT_STATIC, anything
+ * else to RTPROT_KERNEL. For a nexthop group the first member is used.
+ */
+static unsigned char
+nl_get_rtm_protocol(const struct nhop_object *nh)
+{
+	const struct nhop_object *src = nh;
+
+	if (NH_IS_NHGRP(nh))
+		src = ((const struct nhgrp_object *)nh)->nhops[0];
+
+	const int rt_flags = nhop_get_rtflags(src);
+
+	if ((rt_flags & RTF_PROTO1) != 0)
+		return (RTPROT_ZEBRA);
+	if ((rt_flags & RTF_STATIC) != 0)
+		return (RTPROT_STATIC);
+	return (RTPROT_KERNEL);
+}
+
+/*
+ * Translates an rtsock command into the netlink message type used to
+ * report it: RTM_ADD, RTM_CHANGE and RTM_GET map to NL_RTM_NEWROUTE,
+ * RTM_DELETE maps to NL_RTM_DELROUTE. Any other command yields 0.
+ */
+static int
+get_rtmsg_type_from_rtsock(int cmd)
+{
+	int nl_type = 0;
+
+	switch (cmd) {
+	case RTM_ADD:
+	case RTM_CHANGE:
+	case RTM_GET:
+		nl_type = NL_RTM_NEWROUTE;
+		break;
+	case RTM_DELETE:
+		nl_type = NL_RTM_DELROUTE;
+		break;
+	default:
+		break;
+	}
+
+	return (nl_type);
+}
+
+/*
+ * fibnum heuristics
+ *
+ * if (dump && rtm_table == 0 && !rta_table) RT_ALL_FIBS
+ * msg rtm_table RTA_TABLE result
+ * RTM_GETROUTE/dump 0 - RT_ALL_FIBS
+ * RTM_GETROUTE/dump 1 - 1
+ * RTM_GETROUTE/get 0 - 0
+ *
+ */
+
+/*
+ * Picks the nexthop that describes the change @rc: the old nexthop for
+ * RTM_DELETE, the new one for every other command.
+ */
+static struct nhop_object *
+rc_get_nhop(const struct rib_cmd_info *rc)
+{
+	if (rc->rc_cmd == RTM_DELETE)
+		return (rc->rc_nh_old);
+
+	return (rc->rc_nh_new);
+}
+
+/*
+ * Adds the gateway attribute of @nh to the message being built in @ns:
+ *  - IPv4 gateway: NL_RTA_GATEWAY with a 4-byte address;
+ *  - IPv6 gateway of an IPv6 route: NL_RTA_GATEWAY with a 16-byte address;
+ *  - IPv6 gateway of an IPv4 route: NL_RTA_VIA (family + 16-byte address).
+ * Nothing is added for any other neighbour family (e.g. AF_LINK).
+ *
+ * Returns false when the attribute cannot be added or when an IPv6
+ * neighbour is combined with an unexpected upper family.
+ */
+static bool
+dump_rc_nhop_gw(struct nlmsg_state *ns, const struct nhop_object *nh)
+{
+	const int neigh_family = nhop_get_neigh_family(nh);
+
+	if (neigh_family == AF_INET)
+		return (nlattr_add(ns, NL_RTA_GATEWAY, 4, &nh->gw4_sa.sin_addr));
+
+	/* AF_LINK is an onlink prefix: there is no gateway to report */
+	if (neigh_family != AF_INET6)
+		return (true);
+
+	switch (nhop_get_upper_family(nh)) {
+	case AF_INET6:
+		return (nlattr_add(ns, NL_RTA_GATEWAY, 16, &nh->gw6_sa.sin6_addr));
+	case AF_INET: {
+		/* IPv4 over IPv6 */
+		char buf[20];
+		struct rtvia *via = (struct rtvia *)&buf[0];
+
+		via->rtvia_family = AF_INET6;
+		memcpy(via->rtvia_addr, &nh->gw6_sa.sin6_addr, 16);
+		return (nlattr_add(ns, NL_RTA_VIA, 17, via));
+	}
+	default:
+		/* shouldn't happen */
+		return (false);
+	}
+}
+
+/*
+ * Dumps the nexthop group @nhg as a nested NL_RTA_MULTIPATH attribute,
+ * preceded by NL_RTA_NH_ID when the group has a non-zero user index.
+ * Every member is written as a struct rtnexthop followed by its gateway
+ * attribute.
+ *
+ * Returns false as soon as one of the writer calls fails.
+ */
+static bool
+dump_rc_nhg(struct nlmsg_state *ns, const struct nhgrp_object *nhg)
+{
+	const uint32_t uidx = nhgrp_get_uidx(nhg);
+
+	if (uidx != 0 && !nlattr_add_u32(ns, NL_RTA_NH_ID, uidx))
+		return (false);
+
+	uint32_t num_nhops;
+	const struct weightened_nhop *wn = nhgrp_get_nhops(nhg, &num_nhops);
+
+	/* Emit the NL_RTA_MULTIPATH header now, patch its length at the end */
+	const int mpath_off = nlattr_save_offset(ns);
+	if (!nlattr_add_flag(ns, NL_RTA_MULTIPATH))
+		return (false);
+
+	for (uint32_t i = 0; i < num_nhops; i++) {
+		const int entry_off = nlattr_save_offset(ns);
+		struct rtnexthop *rtnh;
+
+		rtnh = nlmsg_reserve_object(ns, struct rtnexthop);
+		if (rtnh == NULL)
+			return (false);
+		rtnh->rtnh_flags = 0;
+		rtnh->rtnh_ifindex = wn[i].nh->nh_ifp->if_index;
+		rtnh->rtnh_hops = wn[i].weight;
+		if (!dump_rc_nhop_gw(ns, wn[i].nh))
+			return (false);
+		/* Look the entry up again by offset before writing its length */
+		rtnh = nlattr_restore_offset(ns, entry_off, struct rtnexthop);
+		/*
+		 * nlattr_add() allocates 4-byte aligned storage, no need to
+		 * align the length here.
+		 */
+		rtnh->rtnh_len = nlattr_save_offset(ns) - entry_off;
+	}
+
+	struct nlattr *nla = nlattr_restore_offset(ns, mpath_off, struct nlattr);
+	nla->nla_len = nlattr_save_offset(ns) - mpath_off;
+
+	return (true);
+}
+
+/*
+ * Dumps the nexthop-related attributes of @nh into @ns. Nexthop groups are
+ * delegated to dump_rc_nhg(). For a plain nexthop the gateway (if any),
+ * NL_RTA_NH_ID (if a user index is set), NL_RTA_METRICS with the MTU (if
+ * RTF_FIXEDMTU is set) and NL_RTA_OIF are written.
+ *
+ * Returns false when any attribute cannot be added, including the gateway
+ * attribute.
+ */
+static bool
+dump_rc_nhop(struct nlmsg_state *ns, const struct nhop_object *nh)
+{
+	if (NH_IS_NHGRP(nh))
+		return (dump_rc_nhg(ns, (const struct nhgrp_object *)nh));
+
+	/*
+	 * IPv4 over IPv6
+	 *    ('RTA_VIA', {'family': 10, 'addr': 'fe80::20c:29ff:fe67:2dd'}), ('RTA_OIF', 2),
+	 * IPv4 w/ gw
+	 *    ('RTA_GATEWAY', '172.16.107.131'), ('RTA_OIF', 2)],
+	 * Direct route:
+	 *    ('RTA_OIF', 2)
+	 */
+	if ((nh->nh_flags & NHF_GATEWAY) && !dump_rc_nhop_gw(ns, nh))
+		return (false);
+
+	uint32_t uidx = nhop_get_uidx(nh);
+	if (uidx != 0) {
+		if (!nlattr_add_u32(ns, NL_RTA_NH_ID, uidx))
+			return (false);
+	}
+
+	if (nhop_get_rtflags(nh) & RTF_FIXEDMTU) {
+		/* Nested attribute: NL_RTA_METRICS { NL_RTAX_MTU: u32 } */
+		int nla_len = sizeof(struct nlattr) * 2 + sizeof(uint32_t);
+		struct nlattr *nla = nlmsg_reserve_data(ns, nla_len, struct nlattr);
+		if (nla == NULL)
+			return (false);
+		nla->nla_type = NL_RTA_METRICS;
+		nla->nla_len = nla_len;
+		nla++;
+		nla->nla_type = NL_RTAX_MTU;
+		nla->nla_len = sizeof(struct nlattr) + sizeof(uint32_t);
+		*((uint32_t *)(nla + 1)) = nh->nh_mtu;
+	}
+
+	/* In any case, fill outgoing interface */
+	if (!nlattr_add_u32(ns, NL_RTA_OIF, nh->nh_ifp->if_index))
+		return (false);
+
+	return (true);
+}
+
+/*
+ * Writes one route message describing prefix @rt with nexthop data @rnd
+ * into the writer @ns, using @hdr as the template for the reply header.
+ *
+ * The message consists of struct rtmsg followed by NL_RTA_TABLE,
+ * NL_RTA_DST and the nexthop attributes produced by dump_rc_nhop().
+ * Must be called within the network epoch.
+ *
+ * Returns 0 on success, EAFNOSUPPORT when the prefix is neither IPv4 nor
+ * IPv6, or ENOMEM when a part of the message cannot be added. On failure
+ * the message is aborted with nlmsg_abort().
+ */
+static int
+dump_px(uint32_t fibnum, const struct nlmsghdr *hdr,
+    const struct rtentry *rt, struct route_nhop_data *rnd,
+    struct nlmsg_state *ns)
+{
+	struct rtmsg *rtm;
+	int error = 0;
+
+	NET_EPOCH_ASSERT();
+
+	int payload_len = sizeof(struct rtmsg);
+	if (!nlmsg_reply(ns, hdr, payload_len))
+		goto enomem;
+
+	int family = rt_get_family(rt);
+	int rtm_off = nlattr_save_offset(ns);
+	rtm = nlmsg_reserve_object(ns, struct rtmsg);
+	if (rtm == NULL)
+		goto enomem;
+	rtm->rtm_family = family;
+	rtm->rtm_dst_len = 0;
+	rtm->rtm_src_len = 0;
+	rtm->rtm_tos = 0;
+	/*
+	 * rtm_table is only 8 bits wide. Always give it a defined value;
+	 * the full fib number is carried in NL_RTA_TABLE below.
+	 */
+	rtm->rtm_table = (fibnum < 255) ? (unsigned char)fibnum : 0;
+	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
+	if (!NH_IS_NHGRP(rnd->rnd_nhop)) {
+		rtm->rtm_protocol = nl_get_rtm_protocol(rnd->rnd_nhop);
+		rtm->rtm_type = get_rtm_type(rnd->rnd_nhop);
+	} else {
+		rtm->rtm_protocol = RTPROT_UNSPEC; /* TODO: protocol from nhg? */
+		rtm->rtm_type = RTN_UNICAST;
+	}
+	rtm->rtm_flags = 0;
+
+	if (!nlattr_add_u32(ns, NL_RTA_TABLE, fibnum))
+		goto enomem;
+
+	int plen = 0;
+	uint32_t scopeid = 0;
+	switch (family) {
+	case AF_INET:
+		{
+			struct in_addr addr;
+			rt_get_inet_prefix_plen(rt, &addr, &plen, &scopeid);
+			if (!nlattr_add(ns, NL_RTA_DST, 4, &addr))
+				goto enomem;
+			break;
+		}
+	case AF_INET6:
+		{
+			struct in6_addr addr;
+			rt_get_inet6_prefix_plen(rt, &addr, &plen, &scopeid);
+			if (!nlattr_add(ns, NL_RTA_DST, 16, &addr))
+				goto enomem;
+			break;
+		}
+	default:
+		FIB_LOG(LOG_NOTICE, fibnum, family, "unknown rt family");
+		error = EAFNOSUPPORT;
+		goto flush;
+	}
+
+	if (plen > 0) {
+		/* Look the header up again by offset before updating it */
+		rtm = nlattr_restore_offset(ns, rtm_off, struct rtmsg);
+		rtm->rtm_dst_len = plen;
+	}
+
+	if (!dump_rc_nhop(ns, rnd->rnd_nhop))
+		goto enomem;
+
+	nlmsg_end(ns);
+	return (0);
+enomem:
+	error = ENOMEM;
+flush:
+	nlmsg_abort(ns);
+	return (error);
+}
+
+/*
+ * Returns the route notification group for @family:
+ * RTNLGRP_IPV4_ROUTE for AF_INET, RTNLGRP_IPV6_ROUTE for AF_INET6 and
+ * 0 for any other family.
+ */
+static int
+family_to_group(int family)
+{
+	if (family == AF_INET)
+		return (RTNLGRP_IPV4_ROUTE);
+	if (family == AF_INET6)
+		return (RTNLGRP_IPV6_ROUTE);
+
+	return (0);
+}
+
+
+/*
+ * Reports the routing table change @rc in fib @fibnum: the route is dumped
+ * to the netlink group matching its address family (when a group writer
+ * can be obtained) and the event is then passed to rib_bridge_rt_event()
+ * with NLBR_PROVIDER_NETLINK as the provider.
+ */
+static void
+report_operation(uint32_t fibnum, struct rib_cmd_info *rc,
+    struct nlpcb *nlp, struct nlmsghdr *hdr)
+{
+	struct nlmsg_state ns;
+	const int family = rt_get_family(rc->rc_rt);
+	const uint32_t group_mask = family_to_group(family);
+
+	if (nlmsg_get_group_writer(NLMSG_SMALL, group_mask, &ns)) {
+		struct route_nhop_data rnd = {
+			.rnd_nhop = rc_get_nhop(rc),
+			.rnd_weight = rc->rc_nh_weight,
+		};
+
+		dump_px(fibnum, hdr, rc->rc_rt, &rnd, &ns);
+		nlmsg_flush(&ns);
+	}
+
+	/* The bridge is notified whether or not the netlink dump happened */
+	rib_bridge_rt_event(NLBR_PROVIDER_NETLINK, fibnum, NULL, rc);
+}
+
+/*
+ * Parsed representation of a route message. The fields are filled by the
+ * attribute parser tables below, addressed by their offsets (_OFF_S).
+ */
+struct nl_parsed_route {
+ struct sockaddr *rta_dst; /* NL_RTA_DST */
+ struct sockaddr *rta_gw; /* NL_RTA_GATEWAY or NL_RTA_VIA */
+ struct ifnet *rta_oif; /* NL_RTA_OIF */
+ struct nlattr *rta_metrics; /* NL_RTA_METRICS (nested) */
+ uint32_t rta_table; /* NL_RTA_TABLE */
+ uint32_t rta_nh_id; /* NL_RTA_NH_ID */
+ uint32_t rtax_mtu; /* NL_RTAX_MTU (nested in NL_RTA_METRICS) */
+ uint8_t rtm_family; /* presumably the family from struct rtmsg - confirm */
+};
+/* Offset of a field inside struct nl_parsed_route */
+#define _OFF_S(_field) offsetof(struct nl_parsed_route, _field)
+
+/*
+ * Parsers for the top-level route attributes. Both NL_RTA_GATEWAY and
+ * NL_RTA_VIA store their result in rta_gw.
+ */
+static struct nlattr_parser ps[] = {
+ { .type = NL_RTA_DST, .off = _OFF_S(rta_dst), .cb = nlattr_get_ip },
+ { .type = NL_RTA_OIF, .off = _OFF_S(rta_oif), .cb = nlattr_get_ifindex },
+ { .type = NL_RTA_GATEWAY, .off = _OFF_S(rta_gw), .cb = nlattr_get_ip },
+ { .type = NL_RTA_METRICS, .off = _OFF_S(rta_metrics), .cb = nlattr_get_nla },
+ { .type = NL_RTA_TABLE, .off = _OFF_S(rta_table), .cb = nlattr_get_uint32 },
+ { .type = NL_RTA_VIA, .off = _OFF_S(rta_gw), .cb = nlattr_get_ipvia },
+ { .type = NL_RTA_NH_ID, .off = _OFF_S(rta_nh_id), .cb = nlattr_get_uint32 },
+};
+
+/* Parsers for the attributes nested inside NL_RTA_METRICS */
+static struct nlattr_parser psm[] = {
+ { .type = NL_RTAX_MTU, .off = _OFF_S(rtax_mtu), .cb = nlattr_get_uint32 },
+};
+
+/*
+ * State shared between the route dump entry points and the per-route
+ * callback dump_rtentry().
+ */
+struct netlink_walkargs {
+ struct nlmsg_state *ns; /* message writer */
+ struct route_nhop_data rnd; /* scratch nexthop data of the current route */
+ struct nlmsghdr hdr; /* header template for the replies */
+ struct nlpcb *nlp;
+ uint32_t fibnum; /* fib being dumped */
+ int family; /* address family being dumped */
+ int error; /* result of the last dump_px() call */
+ int count; /* routes visited in the current table */
+ int dumped; /* routes passed to dump_px() in the current table */
+ int dumped_tables; /* number of tables walked */
+};
+
+/*
+ * rib_walk() callback: dumps the route @rt into the writer stored in the
+ * struct netlink_walkargs passed as @_arg.
+ *
+ * Once an error has been recorded in wa->error the remaining routes are
+ * only counted, not dumped. Always returns 0.
+ */
+static int
+dump_rtentry(struct rtentry *rt, void *_arg)
+{
+ struct netlink_walkargs *wa = (struct netlink_walkargs *)_arg;
+ int error;
+
+ wa->count++;
+ /* A previous dump failed: skip the rest of the table */
+ if (wa->error != 0)
+ return (0);
+ wa->dumped++;
+
+ rt_get_rnd(rt, &wa->rnd);
+
+ error = dump_px(wa->fibnum, &wa->hdr, rt, &wa->rnd, wa->ns);
+
+ IF_DEBUG_LEVEL(LOG_DEBUG3) {
+ char rtbuf[INET6_ADDRSTRLEN + 5];
+ FIB_LOG(LOG_DEBUG3, wa->fibnum, wa->family,
+ "Dump %s, offset %u, error %d",
+ rt_print_buf(rt, rtbuf, sizeof(rtbuf)),
+ wa->ns->offset, error);
+ }
+ wa->error = error;
+
+ return (0);
+}
+
+/*
+ * Dumps every route of the table identified by @fibnum and @family via
+ * rib_walk() with dump_rtentry() as the callback. The per-table counters
+ * in @wa are reset first and wa->dumped_tables is incremented afterwards.
+ */
+static void
+dump_rtable_one(struct netlink_walkargs *wa, uint32_t fibnum, int family)
+{
+ FIB_LOG(LOG_DEBUG2, fibnum, family, "Start dump");
+ wa->count = 0;
+ wa->dumped = 0;
+
+ rib_walk(fibnum, family, false, dump_rtentry, wa);
+
+ wa->dumped_tables++;
+
+ FIB_LOG(LOG_DEBUG2, fibnum, family, "End dump, iterated %d dumped %d",
+ wa->count, wa->dumped);
+ RT_LOG(LOG_DEBUG2, "Current offset: %d", wa->ns->offset);
+}
+
+/*
+ * Dumps the routing tables of fib @fibnum: a single table when @family is
+ * set, or every existing table (stopping at the first error) when @family
+ * is AF_UNSPEC. Tables that do not exist are skipped.
+ *
+ * Returns wa->error.
+ */
+static int
+dump_rtable_fib(struct netlink_walkargs *wa, uint32_t fibnum, int family)
+{
+	wa->fibnum = fibnum;
+
+	if (family != AF_UNSPEC) {
+		if (rt_tables_get_rnh(fibnum, family) != NULL) {
+			wa->family = family;
+			dump_rtable_one(wa, fibnum, family);
+		}
+		return (wa->error);
+	}
+
+	for (int af = 0; af < AF_MAX; af++) {
+		if (rt_tables_get_rnh(fibnum, af) == NULL)
+			continue;
+		wa->family = af;
+		dump_rtable_one(wa, fibnum, af);
+		if (wa->error != 0)
+			break;
+	}
+
+	return (wa->error);
+}
+
+
+/*
+ * Handles a non-dump RTM_GETROUTE request: looks up the route matching
+ * @attrs->rta_dst in the table selected by @attrs->rta_table and
+ * @attrs->rtm_family and writes it to @ns as an NL_RTM_NEWROUTE message.
+ * For a multipath route a single nexthop is chosen with
+ * nhop_select_func().
+ *
+ * Returns 0 on success, EINVAL when RTA_DST is missing, EAFNOSUPPORT when
+ * the table does not exist, ESRCH when no route matches, or the error
+ * returned by dump_px() when the reply cannot be written.
+ *
+ * NOTE(review): @rt is used after the rib read lock is dropped; this
+ * presumably relies on the network epoch (dump_px() asserts it) - confirm.
+ */
+static int
+handle_rtm_getroute(struct nlpcb *nlp, struct nl_parsed_route *attrs,
+    struct nlmsghdr *hdr, struct nlmsg_state *ns)
+{
+	RIB_RLOCK_TRACKER;
+	struct rib_head *rnh;
+	struct rtentry *rt;
+	uint32_t fibnum = attrs->rta_table;
+	sa_family_t family = attrs->rtm_family;
+
+	if (attrs->rta_dst == NULL) {
+		RT_LOG(LOG_DEBUG, "No RTA_DST supplied");
+		return (EINVAL);
+	}
+
+	FIB_LOG(LOG_DEBUG, fibnum, family, "getroute called");
+
+	rnh = rt_tables_get_rnh(fibnum, family);
+	if (rnh == NULL)
+		return (EAFNOSUPPORT);
+
+	RIB_RLOCK(rnh);
+
+	rt = (struct rtentry *)rnh->rnh_matchaddr(attrs->rta_dst, &rnh->head);
+	if (rt == NULL) {
+		RIB_RUNLOCK(rnh);
+		return (ESRCH);
+	}
+
+	struct route_nhop_data rnd;
+	rt_get_rnd(rt, &rnd);
+	rnd.rnd_nhop = nhop_select_func(rnd.rnd_nhop, 0);
+
+	RIB_RUNLOCK(rnh);
+
+#if DEBUG_MAX_LEVEL >= LOG_DEBUG2
+	{
+		char rtbuf[INET6_ADDRSTRLEN + 5], nhbuf[INET6_ADDRSTRLEN + 5];
+		FIB_LOG(LOG_DEBUG2, fibnum, family, "getroute completed: got %s for %s",
+		    nhop_print_buf_any(rnd.rnd_nhop, nhbuf, sizeof(nhbuf)),
+		    rt_print_buf(rt, rtbuf, sizeof(rtbuf)));
+	}
+#endif
+	hdr->nlmsg_type = NL_RTM_NEWROUTE;
+
+	/* Let the caller know when the reply could not be written */
+	return (dump_px(fibnum, hdr, rt, &rnd, ns));
+}
+
+/*
+ * Handles an RTM_GETROUTE dump request: writes the routes of fib @fibnum
+ * (or of every fib when @fibnum is RT_TABLE_UNSPEC) for @family to @ns as
+ * an NLM_F_MULTI sequence of NL_RTM_NEWROUTE messages and terminates it
+ * with nlmsg_end_dump().
+ *
+ * Returns 0 on success, the first dump error, ESRCH when no table matched
+ * the requested fib/family, or ENOMEM when the dump cannot be finalized.
+ */
+static int
+handle_rtm_dump(struct nlpcb *nlp, uint32_t fibnum, int family,
+    struct nlmsghdr *hdr, struct nlmsg_state *ns)
+{
+	struct netlink_walkargs wa = {
+		.nlp = nlp,
+		.ns = ns,
+		.hdr.nlmsg_pid = hdr->nlmsg_pid,
+		.hdr.nlmsg_seq = hdr->nlmsg_seq,
+		.hdr.nlmsg_type = NL_RTM_NEWROUTE,
+		.hdr.nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI,
+	};
+
+	if (fibnum == RT_TABLE_UNSPEC) {
+		/* Walk each fib in turn, not the RT_TABLE_UNSPEC placeholder */
+		for (uint32_t i = 0; i < V_rt_numfibs; i++) {
+			dump_rtable_fib(&wa, i, family);
+			if (wa.error != 0)
+				break;
+		}
+	} else
+		dump_rtable_fib(&wa, fibnum, family);
+
+	if (wa.error == 0 && wa.dumped_tables == 0) {
+		FIB_LOG(LOG_DEBUG, fibnum, family, "incorrect fibnum/family");
+		wa.error = ESRCH;
+		/* How do we propagate it? */
+	}
+
+	if (!nlmsg_end_dump(wa.ns, wa.error, &wa.hdr)) {
+		RT_LOG(LOG_DEBUG, "Unable to finalize the dump");
+		return (ENOMEM);
+	}
+
+	return (wa.error);
+}
+
+/*
+ * Fills in whatever is still missing in a freshly built nexthop (gateway,
+ * transmit interface, source address) and passes it to nhop_get_nhop().
+ *
+ * Returns the result of nhop_get_nhop(), or NULL with *perror = EINVAL when
+ * the interface or interface address cannot be determined.
+ *
+ * NOTE(review): the NULL paths do not release @nh -- confirm who owns it.
+ */
+static struct nhop_object *
+finalize_nhop(struct nhop_object *nh, int *perror)
+{
+	struct ifaddr *ifa;
+
+	if (nh->gw_sa.sa_family != 0) {
+		/* Gateway given: derive the interface from it when absent. */
+		if (nh->nh_ifp == NULL) {
+			ifa = ifa_ifwithnet(&nh->gw_sa, 1, nhop_get_fibnum(nh));
+			if (ifa == NULL) {
+				RT_LOG(LOG_DEBUG, "Unable to determine ifp, skipping");
+				*perror = EINVAL;
+				return (NULL);
+			}
+			nhop_set_transmit_ifp(nh, ifa->ifa_ifp);
+		}
+	} else if (nh->nh_ifp != NULL) {
+		/* No gateway but an interface: treat as a direct route. */
+		nhop_set_direct_gw(nh, nh->nh_ifp);
+	} else {
+		RT_LOG(LOG_DEBUG, "empty gateway and interface, skipping");
+		*perror = EINVAL;
+		return (NULL);
+	}
+
+	/* From here on both the gateway and nh_ifp are set. */
+	if (nh->nh_ifa == NULL) {
+		ifa = ifaof_ifpforaddr(&nh->gw_sa, nh->nh_ifp);
+		if (ifa == NULL) {
+			RT_LOG(LOG_DEBUG, "Unable to determine ifa, skipping");
+			*perror = EINVAL;
+			return (NULL);
+		}
+		nhop_set_src(nh, ifa);
+	}
+
+	return (nhop_get_nhop(nh, perror));
+}
+
+/*
+ * Derives the prefix-related nexthop flag from the rtmsg header:
+ * NHF_HOST for a full-length prefix (/32 for AF_INET, /128 for AF_INET6),
+ * NHF_DEFAULT for a zero-length prefix, 0 otherwise or for other families.
+ */
+static int
+get_pxflag(const struct rtmsg *rtm)
+{
+	int pxflag = 0;
+
+	switch (rtm->rtm_family) {
+	case AF_INET:
+		if (rtm->rtm_dst_len == 32)
+			pxflag = NHF_HOST;
+		else if (rtm->rtm_dst_len == 0)
+			pxflag = NHF_DEFAULT;
+		break;
+	case AF_INET6:
+		/* An IPv6 host route is a /128, not a /32. */
+		if (rtm->rtm_dst_len == 128)
+			pxflag = NHF_HOST;
+		else if (rtm->rtm_dst_len == 0)
+			pxflag = NHF_DEFAULT;
+		break;
+	default:
+		break;
+	}
+
+	return (pxflag);
+}
+
+/*
+ * Translates the NLM_F_{REPLACE,EXCL,CREATE,APPEND} request flags into the
+ * corresponding RTM_F_* flags; every other bit is ignored.
+ */
+static int
+get_rtm_flags(int nlm_flags)
+{
+	int flags = 0;
+
+	if ((nlm_flags & NLM_F_REPLACE) != 0)
+		flags |= RTM_F_REPLACE;
+	if ((nlm_flags & NLM_F_EXCL) != 0)
+		flags |= RTM_F_EXCL;
+	if ((nlm_flags & NLM_F_CREATE) != 0)
+		flags |= RTM_F_CREATE;
+	if ((nlm_flags & NLM_F_APPEND) != 0)
+		flags |= RTM_F_APPEND;
+
+	return (flags);
+}
+
+/*
+ * RTM_NEWROUTE handler.
+ *
+ * After the privilege check the message attributes are parsed into a
+ * struct nl_parsed_route.  The nexthop is either looked up by
+ * attrs.rta_nh_id or allocated and filled from attrs.rta_gw, attrs.rta_oif
+ * and attrs.rtax_mtu.  The request NLM_F_* flags are translated by
+ * get_rtm_flags() and the route is added with rib_add_route_px(); on
+ * success the change is reported through report_operation().
+ *
+ * Returns 0 on success or an errno value: the privilege or parse error,
+ * EINVAL for a missing rta_dst, ENOMEM if the nexthop cannot be allocated,
+ * or the nexthop/rib error.
+ */
+static int
+rtnl_handle_newroute(struct nlmsghdr *hdr, struct nlpcb *nlp,
+ struct netlink_parse_tracker *npt)
+{
+ struct rib_cmd_info rc = {};
+ struct nhop_object *nh = NULL;
+ int error;
+
+ error = nlp_has_priv_route(nlp);
+ if (error != 0)
+ return (error);
+
+ struct rtmsg *rtm = (struct rtmsg *)nlmsg_data(hdr);
+
+ /*
+  * NOTE(review): unlike the DELROUTE/GETROUTE handlers, attrs is not
+  * seeded with .rtm_family here -- confirm the parsers do not need it.
+  */
+ struct nl_parsed_route attrs = {};
+ error = nl_parse_attrs(hdr, sizeof(*rtm), ps, sizeof(ps)/sizeof(ps[0]), npt, &attrs);
+ if (error != 0)
+ return (error);
+ if (attrs.rta_metrics != NULL) {
+ /*
+  * The metrics attribute wraps further attributes: parse its payload
+  * (everything past the nlattr header) with the psm parsers into the
+  * same attrs structure.
+  */
+ int data_len = attrs.rta_metrics->nla_len - sizeof(struct nlattr);
+ error = nl_parse_attrs_raw(attrs.rta_metrics + 1, data_len,
+ psm, sizeof(psm)/sizeof(psm[0]), npt, &attrs);
+ if (error != 0)
+ return (error);
+ }
+
+ /* Check if we have enough data */
+ if (attrs.rta_dst == NULL) {
+ RT_LOG(LOG_DEBUG, "missing RTA_DST");
+ return (EINVAL);
+ }
+
+ if (attrs.rta_nh_id != 0) {
+ /* Referenced uindex */
+ int pxflag = get_pxflag(rtm);
+ nh = nl_find_nhop(attrs.rta_table, rtm->rtm_family, attrs.rta_nh_id,
+ pxflag, &error);
+ if (error != 0)
+ return (error);
+ } else {
+ /* No nexthop id: build a nexthop from the individual attributes. */
+ nh = nhop_alloc(attrs.rta_table, rtm->rtm_family);
+ if (nh == NULL)
+ return (ENOMEM);
+ if (attrs.rta_gw != NULL)
+ nhop_set_gw(nh, attrs.rta_gw, true);
+ if (attrs.rta_oif != NULL)
+ nhop_set_transmit_ifp(nh, attrs.rta_oif);
+ if (attrs.rtax_mtu != 0)
+ nhop_set_mtu(nh, attrs.rtax_mtu, true);
+ /* NOTE(review): confirm who releases the allocated nhop if this fails. */
+ nh = finalize_nhop(nh, &error);
+ if (error != 0) {
+ RT_LOG(LOG_DEBUG, "Error finalising nexthop");
+ return (error);
+ }
+ }
+
+ /* Nexthop groups get weight 0, plain nexthops the default weight. */
+ int weight = NH_IS_NHGRP(nh) ? 0 : RT_DEFAULT_WEIGHT;
+ struct route_nhop_data rnd = { .rnd_nhop = nh, .rnd_weight = weight };
+ int rtm_flags = get_rtm_flags(hdr->nlmsg_flags);
+
+ error = rib_add_route_px(attrs.rta_table, attrs.rta_dst, rtm->rtm_dst_len,
+ &rnd, rtm_flags, &rc);
+ if (error == 0)
+ report_operation(attrs.rta_table, &rc, nlp, hdr);
+ return (error);
+}
+
+/*
+ * Path filter used when deleting a route: @_data is the parsed request
+ * (struct nl_parsed_route).  Returns 1 when the nexthop satisfies every
+ * constraint present in the request (gateway and/or output interface),
+ * 0 otherwise.  A request without constraints matches any path.
+ */
+static int
+path_match_func(const struct rtentry *rt, const struct nhop_object *nh, void *_data)
+{
+	struct nl_parsed_route *wanted = _data;
+
+	if (wanted->rta_gw != NULL && !rib_match_gw(rt, nh, wanted->rta_gw))
+		return (0);
+
+	return (wanted->rta_oif == NULL || wanted->rta_oif == nh->nh_ifp);
+}
+
+/*
+ * RTM_DELROUTE handler: after the privilege check, parses the request and
+ * removes the prefix rta_dst/rtm_dst_len from table rta_table, restricted
+ * to the paths accepted by path_match_func().  A successful removal is
+ * reported through report_operation().
+ *
+ * Returns 0 on success, ESRCH when rta_dst is missing, otherwise the
+ * privilege, parse or rib error.
+ */
+static int
+rtnl_handle_delroute(struct nlmsghdr *hdr, struct nlpcb *nlp,
+    struct netlink_parse_tracker *npt)
+{
+	struct rib_cmd_info rc;
+	int error;
+
+	if ((error = nlp_has_priv_route(nlp)) != 0)
+		return (error);
+
+	struct rtmsg *rtm = (struct rtmsg *)nlmsg_data(hdr);
+	struct nl_parsed_route attrs = { .rtm_family = rtm->rtm_family };
+
+	error = nl_parse_attrs(hdr, sizeof(*rtm), ps, RTNL_ARRAY_LEN(ps), npt,
+	    &attrs);
+	if (error != 0)
+		return (error);
+
+	if (attrs.rta_dst == NULL) {
+		RT_LOG(LOG_DEBUG, "RTA_DST is not set");
+		return (ESRCH);
+	}
+
+	error = rib_del_route_px(attrs.rta_table, attrs.rta_dst,
+	    rtm->rtm_dst_len, path_match_func, &attrs, 0, &rc);
+	if (error != 0)
+		return (error);
+
+	report_operation(attrs.rta_table, &rc, nlp, hdr);
+	return (0);
+}
+
+/*
+ * RTM_GETROUTE handler: parses the request and dispatches either to the
+ * table dump (any NLM_F_DUMP bit set) or to the single-route lookup.
+ * Returns the parse error or whatever the selected handler returns.
+ */
+static int
+rtnl_handle_getroute(struct nlmsghdr *hdr, struct nlpcb *nlp, struct netlink_parse_tracker *npt)
+{
+	struct rtmsg *rtm = (struct rtmsg *)nlmsg_data(hdr);
+	struct nl_parsed_route attrs = { .rtm_family = rtm->rtm_family };
+	int error;
+
+	error = nl_parse_attrs(hdr, sizeof(*rtm), ps, RTNL_ARRAY_LEN(ps), npt,
+	    &attrs);
+	if (error != 0)
+		return (error);
+
+	if ((hdr->nlmsg_flags & NLM_F_DUMP) != 0)
+		return (handle_rtm_dump(nlp, attrs.rta_table, rtm->rtm_family,
+		    hdr, npt->ns));
+
+	return (handle_rtm_getroute(nlp, &attrs, hdr, npt->ns));
+}
+
+/*
+ * Translates a rib change notification into a netlink route message.
+ *
+ * The message type comes from get_rtmsg_type_from_rtsock(rc->rc_cmd), the
+ * flags from the switch below.  The message is written with dump_px()
+ * through a group writer obtained for the mask returned by
+ * family_to_group(), then flushed.
+ *
+ * @fibnum: fib the change applies to
+ * @info:   not used by this function
+ * @rc:     change description (command, route, nexthop, weight)
+ */
+void
+rtnl_handle_route_event(uint32_t fibnum, const struct rt_addrinfo *info,
+ const struct rib_cmd_info *rc)
+{
+ int family, nlm_flags = 0;
+
+ struct nlmsg_state ns;
+
+ family = rt_get_family(rc->rc_rt);
+
+ /* XXX: check if there are active listeners first */
+
+ /* TODO: consider passing PID/type/seq */
+ /* Commands not listed below keep nlm_flags at 0. */
+ switch (rc->rc_cmd) {
+ case RTM_ADD:
+ nlm_flags = NLM_F_EXCL | NLM_F_CREATE;
+ break;
+ case RTM_CHANGE:
+ nlm_flags = NLM_F_REPLACE;
+ break;
+ case RTM_DELETE:
+ nlm_flags = 0;
+ break;
+ }
+#if DEBUG_MAX_LEVEL >= LOG_DEBUG2
+ {
+ char rtbuf[INET6_ADDRSTRLEN + 5];
+ FIB_LOG(LOG_DEBUG2, fibnum, family,
+ "received event %s for %s / nlm_flags=%X",
+ rib_print_cmd(rc->rc_cmd),
+ rt_print_buf(rc->rc_rt, rtbuf, sizeof(rtbuf)),
+ nlm_flags);
+ }
+#endif
+ /* pid and seq stay zero: the header is built here, not copied from a request. */
+ struct nlmsghdr hdr = {
+ .nlmsg_flags = nlm_flags,
+ .nlmsg_type = get_rtmsg_type_from_rtsock(rc->rc_cmd),
+ };
+
+ struct route_nhop_data rnd = {
+ .rnd_nhop = rc_get_nhop(rc),
+ .rnd_weight = rc->rc_nh_weight,
+ };
+
+ uint32_t group_mask = family_to_group(family);
+
+ /* Without a writer the event is dropped; only a debug message is logged. */
+ if (!nlmsg_get_group_writer(NLMSG_SMALL, group_mask, &ns)) {
+ RT_LOG(LOG_DEBUG, "error allocating mbuf");
+ return;
+ }
+
+ dump_px(fibnum, &hdr, rc->rc_rt, &rnd, &ns);
+ nlmsg_flush(&ns);
+}
+
+/*
+ * Route message handlers registered by rtnl_routes_init().  Each entry
+ * gives the message type, its printable name, the callback and
+ * sizeof(struct rtmsg) as rtnl_min_size; rtnl_flags is left
+ * zero-initialized.
+ */
+static struct rtnl_cmd_handler cmd_handlers[] = {
+ { NL_RTM_GETROUTE, "RTM_GETROUTE", &rtnl_handle_getroute, sizeof(struct rtmsg)},
+ { NL_RTM_DELROUTE, "RTM_DELROUTE", &rtnl_handle_delroute, sizeof(struct rtmsg)},
+ { NL_RTM_NEWROUTE, "RTM_NEWROUTE", &rtnl_handle_newroute, sizeof(struct rtmsg)},
+};
+
+/*
+ * Registers the RTM_{GET,DEL,NEW}ROUTE handlers listed in cmd_handlers.
+ * Declared as rtnl_routes_init(void) in route_var.h; the definition uses
+ * the same prototype form.
+ */
+void
+rtnl_routes_init(void)
+{
+	rtnl_register_messages(cmd_handlers, RTNL_ARRAY_LEN(cmd_handlers));
+}
diff --git a/sys/netlink/route/route_var.h b/sys/netlink/route/route_var.h
new file mode 100644
--- /dev/null
+++ b/sys/netlink/route/route_var.h
@@ -0,0 +1,77 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ * Copyright (c) 2022 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * This file contains definitions shared among the NETLINK_ROUTE family
+ */
+
+#ifndef _NETLINK_ROUTE_ROUTE_VAR_H_
+#define _NETLINK_ROUTE_ROUTE_VAR_H_
+
+struct nlmsghdr;
+struct nlpcb;
+struct netlink_parse_tracker;
+
+typedef int rtnl_msg_cb_f(struct nlmsghdr *hdr, struct nlpcb *nlp,
+ struct netlink_parse_tracker *npt);
+
+/*
+ * Describes one rtnetlink message handler, as passed to
+ * rtnl_register_messages().
+ */
+struct rtnl_cmd_handler {
+ int rtnl_cmd; /* netlink message type handled by this entry */
+ const char *rtnl_cmd_name; /* printable name of the message type */
+ rtnl_msg_cb_f *rtnl_cb; /* callback invoked for the message */
+ int rtnl_min_size; /* presumably the minimum accepted payload size -- confirm */
+ int rtnl_flags; /* presumably a mask of RTNL_F_* values -- confirm */
+};
+#define RTNL_ARRAY_LEN(_a) (sizeof(_a) / sizeof((_a)[0]))
+
+#define RTNL_F_NOEPOCH 0x01
+#define RTNL_F_NOWRITER 0x02
+
+bool rtnl_register_messages(struct rtnl_cmd_handler *handlers, int count);
+
+/* route.c */
+struct rt_addrinfo;
+struct rib_cmd_info;
+void rtnl_handle_route_event(uint32_t fibnum, const struct rt_addrinfo *info,
+ const struct rib_cmd_info *rc);
+void rtnl_routes_init(void);
+
+/* neigh.c */
+void rtnl_neighs_init(void);
+
+/* iface.c */
+void rtnl_ifaces_init(void);
+void rtnl_ifaces_destroy(void);
+
+/* nexthop.c */
+void rtnl_nexthops_init(void);
+struct nhop_object *nl_find_nhop(uint32_t fibnum, int family,
+ uint32_t uidx, int nh_flags, int *perror);
+
+
+#endif
diff --git a/tests/sys/net/routing/netlink.py b/tests/sys/net/routing/netlink.py
new file mode 100644
--- /dev/null
+++ b/tests/sys/net/routing/netlink.py
@@ -0,0 +1,1090 @@
+#!/usr/local/bin/python3
+
+from ctypes import *
+import socket
+import os
+import sys
+import unittest
+import struct
+
+from enum import Enum, auto
+
+from typing import List, Callable, Dict, NamedTuple, Optional
+
+
+def roundup2(val: int, num: int) -> int:
+ if val % num:
+ return (val | (num - 1)) + 1
+ else:
+ return val
+
+
+def align4(val: int) -> int:
+ return roundup2(val, 4)
+
+
+class SockaddrNl(Structure):
+ """ctypes layout of a netlink socket address (length, family, pad, pid, groups)."""
+ _fields_ = [
+ ("nl_len", c_ubyte),
+ ("nl_family", c_ubyte),
+ ("nl_pad", c_ushort),
+ ("nl_pid", c_uint),
+ ("nl_groups", c_uint),
+ ]
+
+
+class Nlmsghdr(Structure):
+ """ctypes layout of the netlink message header (len, type, flags, seq, pid)."""
+ _fields_ = [
+ ("nlmsg_len", c_uint),
+ ("nlmsg_type", c_ushort),
+ ("nlmsg_flags", c_ushort),
+ ("nlmsg_seq", c_uint),
+ ("nlmsg_pid", c_uint),
+ ]
+
+
+class Nlmsgerr(Structure):
+ """ctypes layout of a netlink error payload: an error code followed by an Nlmsghdr."""
+ _fields_ = [
+ ("error", c_int),
+ ("msg", Nlmsghdr),
+ ]
+
+
+class RtattrType(Enum):
+ """Route attribute types (RTA_*), numbered consecutively from RTA_UNSPEC = 0."""
+ RTA_UNSPEC = 0
+ RTA_DST = auto()
+ RTA_SRC = auto()
+ RTA_IIF = auto()
+ RTA_OIF = auto()
+ RTA_GATEWAY = auto()
+ RTA_PRIORITY = auto()
+ RTA_PREFSRC = auto()
+ RTA_METRICS = auto()
+ RTA_MULTIPATH = auto()
+ RTA_PROTOINFO = auto()
+ RTA_FLOW = auto()
+ RTA_CACHEINFO = auto()
+ RTA_SESSION = auto()
+ RTA_MP_ALGO = auto()
+ RTA_TABLE = auto()
+ RTA_MARK = auto()
+ RTA_MFC_STATS = auto()
+ RTA_VIA = auto()
+ RTA_NEWDST = auto()
+ RTA_PREF = auto()
+ RTA_ENCAP_TYPE = auto()
+ RTA_ENCAP = auto()
+ RTA_EXPIRES = auto()
+ RTA_PAD = auto()
+ RTA_UID = auto()
+ RTA_TTL_PROPAGATE = auto()
+ RTA_IP_PROTO = auto()
+ RTA_SPORT = auto()
+ RTA_DPORT = auto()
+ RTA_NH_ID = auto()
+
+
+class NlMsgType(Enum):
+ """Generic netlink control message types (NLMSG_*)."""
+ NLMSG_NOOP = 1
+ NLMSG_ERROR = 2
+ NLMSG_DONE = 3
+ NLMSG_OVERRUN = 4
+
+
+class NlRtMsgType(Enum):
+ RTM_NEWLINK = 16
+ RTM_DELLINK = 17
+ RTM_GETLINK = 18
+ RTM_SETLINK = 19
+ RTM_NEWADDR = 20
+ RTM_DELADDR = 21
+ RTM_GETADDR = 22
+ RTM_NEWROUTE = 24
+ RTM_DELROUTE = 25
+ RTM_GETROUTE = 26
+ RTM_NEWNEIGH = 28
+ RTM_DELNEIGH = 27
+ RTM_GETNEIGH = 28
+ RTM_NEWRULE = 32
+ RTM_DELRULE = 33
+ RTM_GETRULE = 34
+ RTM_NEWQDISC = 36
+ RTM_DELQDISC = 37
+ RTM_GETQDISC = 38
+ RTM_NEWTCLASS = 40
+ RTM_DELTCLASS = 41
+ RTM_GETTCLASS = 42
+ RTM_NEWTFILTER = 44
+ RTM_DELTFILTER = 45
+ RTM_GETTFILTER = 46
+ RTM_NEWACTION = 48
+ RTM_DELACTION = 49
+ RTM_GETACTION = 50
+ RTM_NEWPREFIX = 52
+ RTM_GETMULTICAST = 58
+ RTM_GETANYCAST = 62
+ RTM_NEWNEIGHTBL = 64
+ RTM_GETNEIGHTBL = 66
+ RTM_SETNEIGHTBL = 67
+ RTM_NEWNDUSEROPT = 68
+ RTM_NEWADDRLABEL = 72
+ RTM_DELADDRLABEL = 73
+ RTM_GETADDRLABEL = 74
+ RTM_GETDCB = 78
+ RTM_SETDCB = 79
+ RTM_NEWNETCONF = 80
+ RTM_GETNETCONF = 82
+ RTM_NEWMDB = 84
+ RTM_DELMDB = 85
+ RTM_GETMDB = 86
+ RTM_NEWNSID = 88
+ RTM_DELNSID = 89
+ RTM_GETNSID = 90
+ RTM_NEWSTATS = 92
+ RTM_GETSTATS = 94
+
+
+class RtAttr(Structure):
+ """ctypes layout of the 4-byte attribute header: 16-bit length, 16-bit type."""
+ _fields_ = [
+ ("rta_len", c_ushort),
+ ("rta_type", c_ushort),
+ ]
+
+
+class RtMsgHdr(Structure):
+ """ctypes layout of the route message base header (eight byte fields plus 32-bit flags)."""
+ _fields_ = [
+ ("rtm_family", c_ubyte),
+ ("rtm_dst_len", c_ubyte),
+ ("rtm_src_len", c_ubyte),
+ ("rtm_tos", c_ubyte),
+ ("rtm_table", c_ubyte),
+ ("rtm_protocol", c_ubyte),
+ ("rtm_scope", c_ubyte),
+ ("rtm_type", c_ubyte),
+ ("rtm_flags", c_uint),
+ ]
+
+
+class RtMsgFlags(Enum):
+ """Bit values for RtMsgHdr.rtm_flags (RTM_F_*); decoded via NlHelper.get_bitmask_str()."""
+ RTM_F_NOTIFY = 0x100
+ RTM_F_CLONED = 0x200
+ RTM_F_EQUALIZE = 0x400
+ RTM_F_PREFIX = 0x800
+ RTM_F_LOOKUP_TABLE = 0x1000
+ RTM_F_FIB_MATCH = 0x2000
+ RTM_F_OFFLOAD = 0x4000
+ RTM_F_TRAP = 0x8000
+ RTM_F_OFFLOAD_FAILED = 0x20000000
+
+
+class AddressFamilyLinux(Enum):
+ """Address family numbers used when not running on FreeBSD (see NlHelper.get_af_cls)."""
+ AF_INET = socket.AF_INET
+ AF_INET6 = socket.AF_INET6
+ AF_NETLINK = 16
+
+
+class AddressFamilyBsd(Enum):
+ """Address family numbers used when sys.platform starts with "freebsd"."""
+ AF_INET = socket.AF_INET
+ AF_INET6 = socket.AF_INET6
+ AF_NETLINK = 38
+
+
+class NlmBaseFlags(Enum):
+ """Low-byte nlmsg_flags bits (NLM_F_*); used for message types without a GET/DEL/NEW prefix."""
+ NLM_F_REQUEST = 0x01
+ NLM_F_MULTI = 0x02
+ NLM_F_ACK = 0x04
+ NLM_F_ECHO = 0x08
+ NLM_F_DUMP_INTR = 0x10
+ NLM_F_DUMP_FILTERED = 0x20
+
+# XXX: in python3.8 it is possible to
+# class NlmGetFlags(Enum, NlmBaseFlags):
+
+
+class NlmGetFlags(Enum):
+ """nlmsg_flags bits decoded for RTM_GET* messages."""
+ NLM_F_ROOT = 0x100
+ NLM_F_MATCH = 0x200
+ NLM_F_ATOMIC = 0x400
+
+
+class NlmNewFlags(Enum):
+ """nlmsg_flags bits decoded for RTM_NEW* messages."""
+ NLM_F_REPLACE = 0x100
+ NLM_F_EXCL = 0x200
+ NLM_F_CREATE = 0x400
+ NLM_F_APPEND = 0x800
+
+
+class NlmDeleteFlags(Enum):
+ """nlmsg_flags bits decoded for RTM_DEL* messages."""
+ NLM_F_NONREC = 0x100
+
+
+class NlmAckFlags(Enum):
+ """nlmsg_flags bits decoded for replies (NLMSG_ERROR messages)."""
+ NLM_F_CAPPED = 0x100
+ NLM_F_ACK_TLVS = 0x200
+
+
+class RtScope(Enum):
+ """Names for RtMsgHdr.rtm_scope values (RT_SCOPE_*)."""
+ RT_SCOPE_UNIVERSE = 0
+ RT_SCOPE_SITE = 200
+ RT_SCOPE_LINK = 253
+ RT_SCOPE_HOST = 254
+ RT_SCOPE_NOWHERE = 255
+
+
+class RtType(Enum):
+ RTN_UNSPEC = 0
+ RTN_UNICAST = auto()
+ RTN_LOCAL = auto()
+ RTN_BROADCAST = auto()
+ RTN_ANYCAST = auto()
+ RTN_MULTICAST = auto()
+ RTN_BLACKHOLE = auto()
+ RTN_UNREACHABLE = auto()
+ RTN_PROHIBIT = auto()
+ RTN_THROW = auto()
+ RTN_NAT = auto()
+ RTN_XRESOLVE = auto()
+
+
+class RtProto(Enum):
+ """Names for RtMsgHdr.rtm_protocol values (RTPROT_*)."""
+ RTPROT_UNSPEC = 0
+ RTPROT_REDIRECT = 1
+ RTPROT_KERNEL = 2
+ RTPROT_BOOT = 3
+ RTPROT_STATIC = 4
+ RTPROT_GATED = 8
+ RTPROT_RA = 9
+ RTPROT_MRT = 10
+ RTPROT_ZEBRA = 11
+ RTPROT_BIRD = 12
+ RTPROT_DNROUTED = 13
+ RTPROT_XORP = 14
+ RTPROT_NTK = 15
+ RTPROT_DHCP = 16
+ RTPROT_MROUTED = 17
+ RTPROT_KEEPALIVED = 18
+ RTPROT_BABEL = 42
+ RTPROT_OPENR = 99
+ RTPROT_BGP = 186
+ RTPROT_ISIS = 187
+ RTPROT_OSPF = 188
+ RTPROT_RIP = 189
+ RTPROT_EIGRP = 192
+
+
+class NlRtaxType(Enum):
+ RTAX_UNSPEC = 0
+ RTAX_LOCK = auto()
+ RTAX_MTU = auto()
+ RTAX_WINDOW = auto()
+ RTAX_RTT = auto()
+ RTAX_RTTVAR = auto()
+ RTAX_SSTHRESH = auto()
+ RTAX_CWND = auto()
+ RTAX_ADVMSS = auto()
+ RTAX_REORDERING = auto()
+ RTAX_HOPLIMIT = auto()
+ RTAX_INITCWND = auto()
+ RTAX_FEATURES = auto()
+ RTAX_RTO_MIN = auto()
+ RTAX_INITRWND = auto()
+ RTAX_QUICKACK = auto()
+ RTAX_CC_ALGO = auto()
+ RTAX_FASTOPEN_NO_COOKIE = auto()
+
+
+class NlRtGroup(Enum):
+ """rtnetlink group identifiers (RTNLGRP_*), numbered consecutively from RTNLGRP_NONE = 0.
+
+ NOTE(review): these are sequential numbers, not single-bit masks --
+ confirm callers convert them before OR-ing several groups together.
+ """
+ RTNLGRP_NONE = 0
+ RTNLGRP_LINK = auto()
+ RTNLGRP_NOTIFY = auto()
+ RTNLGRP_NEIGH = auto()
+ RTNLGRP_TC = auto()
+ RTNLGRP_IPV4_IFADDR = auto()
+ RTNLGRP_IPV4_MROUTE = auto()
+ RTNLGRP_IPV4_ROUTE = auto()
+ RTNLGRP_IPV4_RULE = auto()
+ RTNLGRP_IPV6_IFADDR = auto()
+ RTNLGRP_IPV6_MROUTE = auto()
+ RTNLGRP_IPV6_ROUTE = auto()
+ RTNLGRP_IPV6_IFINFO = auto()
+ RTNLGRP_DECnet_IFADDR = auto()
+ RTNLGRP_NOP2 = auto()
+ RTNLGRP_DECnet_ROUTE = auto()
+ RTNLGRP_DECnet_RULE = auto()
+ RTNLGRP_NOP4 = auto()
+ RTNLGRP_IPV6_PREFIX = auto()
+ RTNLGRP_IPV6_RULE = auto()
+ RTNLGRP_ND_USEROPT = auto()
+ RTNLGRP_PHONET_IFADDR = auto()
+ RTNLGRP_PHONET_ROUTE = auto()
+ RTNLGRP_DCB = auto()
+ RTNLGRP_IPV4_NETCONF = auto()
+ RTNLGRP_IPV6_NETCONF = auto()
+ RTNLGRP_MDB = auto()
+ RTNLGRP_MPLS_ROUTE = auto()
+ RTNLGRP_NSID = auto()
+ RTNLGRP_MPLS_NETCONF = auto()
+ RTNLGRP_IPV4_MROUTE_R = auto()
+ RTNLGRP_IPV6_MROUTE_R = auto()
+ RTNLGRP_NEXTHOP = auto()
+ RTNLGRP_BRVLAN = auto()
+
+
+class IfinfoMsg(Structure):
+ """ctypes layout of the base header of link (RTM_*LINK) messages."""
+ _fields_ = [
+ ("ifi_family", c_ubyte),
+ ("__ifi_pad", c_ubyte),
+ ("ifi_type", c_ushort),
+ ("ifi_index", c_int),
+ ("ifi_flags", c_uint),
+ ("ifi_change", c_uint),
+ ]
+
+
+class IflattrType(Enum):
+ """Link attribute types (IFLA_*), numbered consecutively from IFLA_UNSPEC = 0."""
+ IFLA_UNSPEC = 0
+ IFLA_ADDRESS = auto()
+ IFLA_BROADCAST = auto()
+ IFLA_IFNAME = auto()
+ IFLA_MTU = auto()
+ IFLA_LINK = auto()
+ IFLA_QDISC = auto()
+ IFLA_STATS = auto()
+ IFLA_COST = auto()
+ IFLA_PRIORITY = auto()
+ IFLA_MASTER = auto()
+ IFLA_WIRELESS = auto()
+ IFLA_PROTINFO = auto()
+ IFLA_TXQLEN = auto()
+ IFLA_MAP = auto()
+ IFLA_WEIGHT = auto()
+ IFLA_OPERSTATE = auto()
+ IFLA_LINKMODE = auto()
+ IFLA_LINKINFO = auto()
+ IFLA_NET_NS_PID = auto()
+ IFLA_IFALIAS = auto()
+ IFLA_NUM_VF = auto()
+ IFLA_VFINFO_LIST = auto()
+ IFLA_STATS64 = auto()
+ IFLA_VF_PORTS = auto()
+ IFLA_PORT_SELF = auto()
+ IFLA_AF_SPEC = auto()
+ IFLA_GROUP = auto()
+ IFLA_NET_NS_FD = auto()
+ IFLA_EXT_MASK = auto()
+ IFLA_PROMISCUITY = auto()
+ IFLA_NUM_TX_QUEUES = auto()
+ IFLA_NUM_RX_QUEUES = auto()
+ IFLA_CARRIER = auto()
+ IFLA_PHYS_PORT_ID = auto()
+ IFLA_CARRIER_CHANGES = auto()
+ IFLA_PHYS_SWITCH_ID = auto()
+ IFLA_LINK_NETNSID = auto()
+ IFLA_PHYS_PORT_NAME = auto()
+ IFLA_PROTO_DOWN = auto()
+ IFLA_GSO_MAX_SEGS = auto()
+ IFLA_GSO_MAX_SIZE = auto()
+ IFLA_PAD = auto()
+ IFLA_XDP = auto()
+ IFLA_EVENT = auto()
+ IFLA_NEW_NETNSID = auto()
+ IFLA_IF_NETNSID = auto()
+ IFLA_CARRIER_UP_COUNT = auto()
+ IFLA_CARRIER_DOWN_COUNT = auto()
+ IFLA_NEW_IFINDEX = auto()
+ IFLA_MIN_MTU = auto()
+ IFLA_MAX_MTU = auto()
+ IFLA_PROP_LIST = auto()
+ IFLA_ALT_IFNAME = auto()
+ IFLA_PERM_ADDRESS = auto()
+ IFLA_PROTO_DOWN_REASON = auto()
+
+
+class IfaddrMsg(Structure):
+ """ctypes layout of the base header of address (RTM_*ADDR) messages."""
+ _fields_ = [
+ ("ifa_family", c_ubyte),
+ ("ifa_prefixlen", c_ubyte),
+ ("ifa_flags", c_ubyte),
+ ("ifa_scope", c_ubyte),
+ ("ifa_index", c_uint),
+ ]
+
+
+class IfattrType(Enum):
+ IFA_UNSPEC = 0
+ IFA_ADDRESS = auto()
+ IFA_LOCAL = auto()
+ IFA_LABEL = auto()
+ IFA_BROADCAST = auto()
+ IFA_ANYCAST = auto()
+ IFA_CACHEINFO = auto()
+ IFA_MULTICAST = auto()
+ IFA_FLAGS = auto()
+ IFA_RT_PRIORITY = auto()
+ IFA_TARGET_NETNSID = auto()
+
+
+class NlConst():
+ """Plain constants for opening a netlink socket; AF_NETLINK matches AddressFamilyBsd."""
+ AF_NETLINK = 38
+ NETLINK_ROUTE = 0
+
+
+class NlHelper():
+ def __init__(self):
+ self._pmap = {}
+ self._af_cls = self.get_af_cls()
+
+ def get_af_cls(self):
+ if sys.platform.startswith("freebsd"):
+ cls = AddressFamilyBsd
+ else:
+ cls = AddressFamilyLinux
+ return cls
+
+ def get_propmap(self, cls):
+ if cls not in self._pmap:
+ ret = {}
+ for prop in dir(cls):
+ if not prop.startswith("_"):
+ ret[getattr(cls, prop).value] = prop
+ self._pmap[cls] = ret
+ return self._pmap[cls]
+
+ def get_name_propmap(self, cls):
+ ret = {}
+ for prop in dir(cls):
+ if not prop.startswith("_"):
+ ret[prop] = getattr(cls, prop).value
+ return ret
+
+ def get_attr_byval(self, cls, attr_val):
+ propmap = self.get_propmap(cls)
+ return propmap.get(attr_val)
+
+ def get_nlmsg_name(self, val):
+ for cls in [NlRtMsgType, NlMsgType]:
+ v = self.get_attr_byval(cls, val)
+ if v is not None:
+ return v
+ return "msg#{}".format(val)
+
+ def get_af_name(self, family):
+ v = self.get_attr_byval(self._af_cls, family)
+ if v is not None:
+ return v
+ return "af#{}".format(family)
+
+ def get_af_value(self, family_str: str) -> int:
+ propmap = self.get_name_propmap(self._af_cls)
+ return propmap.get(family_str)
+
+ def get_rta_name(self, val):
+ return self.get_attr_byval(RtattrType, val)
+
+ def get_bitmask_map(self, cls, val):
+ propmap = self.get_propmap(cls)
+ v = 1
+ ret = {}
+ while val:
+ if v & val:
+ if v in propmap:
+ ret[v] = propmap[v]
+ else:
+ ret[v] = hex(v)
+ val -= v
+ v *= 2
+ return ret
+
+ def get_bitmask_str(self, cls, val):
+ bmap = self.get_bitmask_map(cls, val)
+ return ",".join([v for k, v in bmap.items()])
+
+ def get_nlm_flags_str(self, msg_str: str, reply: bool, val):
+ if reply:
+ return self.get_bitmask_str(NlmAckFlags, val)
+ if msg_str.startswith("RTM_GET"):
+ return self.get_bitmask_str(NlmGetFlags, val)
+ elif msg_str.startswith("RTM_DEL"):
+ return self.get_bitmask_str(NlmDeleteFlags, val)
+ elif msg_str.startswith("RTM_NEW"):
+ return self.get_bitmask_str(NlmNewFlags, val)
+ else:
+ return self.get_bitmask_str(NlmBaseFlags, val)
+
+
+class BaseRtAttr(object):
+ def __init__(self, parent, rta_type, rta_len, data=None):
+ self.parent = parent
+ self.helper = parent.helper
+ self.attr_enum = parent.attr_enum
+ self.rta_type = rta_type & 0x3f
+ self.is_nested = rta_type & (1 << 15)
+ self.network_byte_order = rta_type & (1 << 14)
+ self.rta_len = rta_len
+ self.rta_type_str = self.helper.get_attr_byval(self.attr_enum, self.rta_type) # noqa: E501
+ if data is not None:
+ self._validate(data)
+ self._parse(data)
+ self._orig_data = data
+
+ def print_attribute(self, prepend=""):
+ if self.rta_type_str:
+ type_str = self.rta_type_str
+ else:
+ type_str = "rta#{}".format(self.rta_type)
+ print("{}rta_len={} rta_type={}({}){}".format(prepend,
+ self.rta_len,
+ type_str,
+ self.rta_type,
+ self._print_attr_value())
+ )
+
+ def _print_attr_value(self):
+ return " [" + " ".join(["{:02X}".format(b) for b in self._orig_data[4:]]) + "]" # noqa: E501
+
+ @classmethod
+ def from_bytes(cls, parent, data):
+ if len(data) < sizeof(RtAttr):
+ raise ValueError("length less than rtattr header")
+ rta_hdr = RtAttr.from_buffer_copy(data)
+ self = cls(parent, rta_hdr.rta_type, rta_hdr.rta_len, data[:rta_hdr.rta_len]) # noqa: E501
+ # XXX: nested
+ return self
+
+ def __bytes__(self):
+ ret = self._orig_data
+ if align4(len(ret)) != len(ret):
+ ret += bytes(align4(len(ret)) - len(ret))
+ return ret
+
+ def _validate(self, data):
+ pass
+
+ def _parse(self, data):
+ pass
+
+
+class RtAttrIp(BaseRtAttr):
+ def _validate(self, data):
+ data_len = len(data) - 4
+ if data_len != 4 and data_len != 16:
+ raise ValueError("Error validating attr {}: rta_len is not valid".format( # noqa: E501
+ self.rta_type_str))
+
+ def _parse(self, data):
+ data_len = len(data) - 4
+ if data_len == 4:
+ self.family = socket.AF_INET
+ self.addr = socket.inet_ntop(self.family, data[4:8])
+ else:
+ self.family = socket.AF_INET6
+ self.addr = socket.inet_ntop(self.family, data[4:20])
+
+ def _print_attr_value(self):
+ return " addr={}".format(self.addr)
+
+
+class RtAttrU32(BaseRtAttr):
+ def _validate(self, data):
+ if len(data) != 8:
+ raise ValueError("Error validating attr {}: rta_len is not valid".format( # noqa: E501
+ self.rta_type_str))
+
+ def _parse(self, data):
+ self.value = struct.unpack("@I", data[4:8])[0]
+
+ def _print_attr_value(self):
+ return " value={}".format(self.value)
+
+
+class RtAttrIfindex(RtAttrU32):
+ def _print_attr_value(self):
+ try:
+ ifname = socket.if_indextoname(self.value)
+ return " iface={}(#{})".format(ifname, self.value)
+ except OSError as e:
+ pass
+ return " iface=if#{}".format(self.value)
+
+
+class RtAttrTable(RtAttrU32):
+ def _print_attr_value(self):
+ return " rtable={}".format(self.value)
+
+
+class RtAttrNhId(RtAttrU32):
+ def _print_attr_value(self):
+ return " nh_id={}".format(self.value)
+
+
+class RtAttrVia(BaseRtAttr):
+ def _validate(self, data):
+ data_len = len(data) - 4
+ if data_len == 0:
+ raise ValueError("Error validating attr {}: empty data".format(self.rta_type_str)) # noqa: E501
+ family = int(data_len[0])
+ if family not in (socket.AF_INET, socket.AF_INET6):
+ raise ValueError("Error validating attr {}: unsupported AF {}".format( # noqa: E501
+ self.rta_type_str, family))
+ if family == socket.AF_INET:
+ expected_len = 1 + 4
+ else:
+ expected_len = 1 + 16
+ if data_len != expected_len:
+ raise ValueError("Error validating attr {}: expected len {} got {}".format( # noqa: E501
+ self.rta_type_str, expected_len, data_len))
+
+ def _parse(self, data):
+ data_len = len(data) - 4
+ self.family = int(data_len[0])
+ if self.family == socket.AF_INET:
+ self.addr = socket.inet_ntop(self.family, data[5:9])
+ else:
+ self.addr = socket.inet_ntop(self.family, data[5:21])
+
+ def _print_attr_value(self):
+ return ", via={}".format(self.addr)
+
+
+class RtAttrStr(BaseRtAttr):
+ def _validate(self, data):
+ try:
+ s = data[4:].decode("utf-8")
+ except Exception as e:
+ raise ValueError("wrong utf-8 string")
+
+ def _parse(self, data):
+ self.str = data[4:].decode("utf-8")
+
+ def _print_attr_value(self):
+ return " str=\"{}\"".format(self.str)
+
+
+# Decoder class per route attribute name; names missing here are handled
+# by BaseRtAttr (see BaseNetlinkRtMessage.parse_rta_list).
+rta_class_map = {
+ "RTA_DST": RtAttrIp,
+ "RTA_SRC": RtAttrIp,
+ "RTA_IIF": RtAttrIfindex,
+ "RTA_OIF": RtAttrIfindex,
+ "RTA_GATEWAY": RtAttrIp,
+ "RTA_TABLE": RtAttrTable,
+ "RTA_VIA": RtAttrVia,
+ "RTA_NH_ID": RtAttrNhId,
+}
+
+
+# Decoder class per link attribute name; only IFLA_MTU has a typed decoder.
+ifla_class_map = {
+ "IFLA_MTU": RtAttrU32,
+}
+
+# Decoder class per address attribute name.
+ifa_class_map = {
+ "IFA_ADDRESS": RtAttrIp,
+ "IFA_LOCAL": RtAttrIp,
+ "IFA_LABEL": RtAttrStr,
+ "IFA_BROADCAST": RtAttrIp,
+ "IFA_ANYCAST": RtAttrIp,
+ "IFA_FLAGS": RtAttrU32,
+}
+
+
+class BaseNetlinkMessage(object):
+ def __init__(self, helper, nlmsg_type):
+ self.nlmsg_type = nlmsg_type
+ self.ut = unittest.TestCase()
+ self.rta_list = []
+ self._orig_data = None
+ self.helper = helper
+ self.nl_hdr = Nlmsghdr(nlmsg_type=nlmsg_type)
+
+ def assertEqual(self, a, b, msg=None):
+ self.ut.assertEqual(a, b, msg)
+
+ def assertNotEqual(self, a, b, msg=None):
+ self.ut.assertNotEqual(a, b, msg)
+
+ @staticmethod
+ def parse_nl_header(data: bytes):
+ if len(data) < sizeof(Nlmsghdr):
+ raise ValueError("length less than netlink message header")
+ return Nlmsghdr.from_buffer_copy(data), sizeof(Nlmsghdr)
+
+ def is_reply(self, hdr):
+ return hdr.nlmsg_type == NlMsgType.NLMSG_ERROR.value
+
+ def print_nl_header(self, hdr, prepend=""):
+ # len=44, type=RTM_DELROUTE, flags=NLM_F_REQUEST|NLM_F_ACK, seq=1641163704, pid=0 # noqa: E501
+ is_reply = self.is_reply(hdr)
+ msg_name = self.helper.get_nlmsg_name(hdr.nlmsg_type)
+ print("{}len={}, type={}, flags={}(0x{:X}), seq={}, pid={}".format(
+ prepend,
+ hdr.nlmsg_len,
+ msg_name,
+ self.helper.get_nlm_flags_str(msg_name, is_reply, hdr.nlmsg_flags), # noqa: E501
+ hdr.nlmsg_flags,
+ hdr.nlmsg_seq,
+ hdr.nlmsg_pid
+ ))
+
+ @classmethod
+ def from_bytes(cls, helper, data):
+ try:
+ hdr, hdrlen = BaseNetlinkMessage.parse_nl_header(data)
+ self = cls(helper, hdr.nlmsg_type)
+ self._orig_data = data
+ self.nl_hdr = hdr
+ except ValueError as e:
+ print("Failed to parse nl header: {}".format(e))
+ cls.print_as_bytes(data)
+ raise
+ return self
+
+ def print_message(self):
+ self.print_nl_header(self.nl_hdr)
+
+ @staticmethod
+ def print_as_bytes(data: bytes, descr: str):
+ print("===vv {} (len:{:3d}) vv===".format(descr, len(data)))
+ off = 0
+ step = 16
+ while off < len(data):
+ for i in range(step):
+ if off + i < len(data):
+ print(" {:02X}".format(data[off + i]), end="")
+ print("")
+ off += step
+ print("--------------------")
+
+
+class NetlinkErrorMessage(BaseNetlinkMessage):
+ messages = [NlMsgType.NLMSG_ERROR.value]
+
+ def __init__(self, helper, nlmsg_type, error):
+ super().__init__(helper, nlmsg_type)
+ self.err_hdr = Nlmsgerr()
+
+ def print_error_header(self, errhdr, prepend=""):
+ print("{}error={}, ".format(prepend), end="")
+ self.print_nl_header(errhdr.msg, prepend)
+
+ def print_message(self, prepend=""):
+ self.print_nl_header(self.nl_nhr, prepend)
+ self.print_error_header(self.err_hdr, prepend + " ")
+
+
+class BaseNetlinkRtMessage(BaseNetlinkMessage):
+ attr_class_map = {}
+ attr_enum = None
+
+ def __init__(self, helper, nlm_type):
+ super().__init__(helper, nlm_type)
+ self.base_hdr = None
+
+ def parse_rta_list(self, data: bytes) -> List[BaseRtAttr]:
+ ret = []
+ offset = 0
+ while offset < len(data):
+ # print("OFFSET={}".format(offset))
+ if offset + 4 > len(data):
+ raise ValueError("only {} bytes remaining".format(len(data) - offset)) # noqa: E501
+ rta_hdr = RtAttr.from_buffer_copy(data[offset:])
+ rta_type_str = self.helper.get_attr_byval(self.attr_enum, rta_hdr.rta_type) # noqa: E501
+ cls = self.attr_class_map.get(rta_type_str, BaseRtAttr)
+ rta = cls.from_bytes(self, data[offset:])
+ offset += align4(rta.rta_len)
+ if rta.rta_len == 0:
+ raise ValueError("empty rta len, {} bytes remaining".format(len(data) - offset)) # noqa: E501
+ ret.append(rta)
+ return ret, offset
+
+ @classmethod
+ def from_bytes(cls, helper, data):
+ try:
+ hdr, hdrlen = BaseNetlinkMessage.parse_nl_header(data)
+ self = cls(helper, hdr.nlmsg_type)
+ self._orig_data = data
+ self.nl_hdr = hdr
+ except ValueError as e:
+ print("Failed to parse nl header: {}".format(e))
+ cls.print_as_bytes(data)
+ raise
+
+ offset = align4(hdrlen)
+ try:
+ base_hdr, hdrlen = self.parse_base_header(data[offset:])
+ self.base_hdr = base_hdr
+ offset += align4(hdrlen)
+ except ValueError as e:
+ print("Failed to parse nl rt header: {}".format(e))
+ cls.print_as_bytes(data)
+ raise
+
+ orig_offset = offset
+ try:
+ rta_list, rta_len = self.parse_rta_list(data[offset:])
+ offset += rta_len
+ if offset != len(data):
+ raise ValueError("{} bytes left at the end of the packet".format(len(data) - offset)) # noqa: E501
+ self.rta_list = rta_list
+ except ValueError as e:
+ print("Failed to parse nl rta attributes at offset {}: {}".format(orig_offset, e)) # noqa: E501
+ cls.print_as_bytes(data, "msg dump")
+ cls.print_as_bytes(data[orig_offset:], "failed block")
+ raise
+ return self
+
+ def __bytes__(self):
+ ret = bytes()
+ for rta in self.rta_list:
+ ret += bytes(rta)
+ ret = bytes(self.base_hdr) + ret
+ self.nl_hdr.nlmsg_len = len(ret) + sizeof(Nlmsghdr)
+ return bytes(self.nl_hdr) + ret
+
+ def print_message(self):
+ self.print_nl_header(self.nl_hdr)
+ self.print_base_header(self.base_hdr, " ")
+ for rta in self.rta_list:
+ rta.print_attribute(" ")
+
+
+class NetlinkRtMessage(BaseNetlinkRtMessage):
+ messages = [
+ NlRtMsgType.RTM_NEWROUTE.value,
+ NlRtMsgType.RTM_DELROUTE.value,
+ NlRtMsgType.RTM_GETROUTE.value,
+ ]
+ attr_class_map = rta_class_map
+ attr_enum = RtattrType
+
+ def __init__(self, helper, nlm_type):
+ super().__init__(helper, nlm_type)
+ self.base_hdr = RtMsgHdr()
+
+ def parse_base_header(self, data):
+ if len(data) < sizeof(RtMsgHdr):
+ raise ValueError("length less than rtmsg header")
+ rtm_hdr = RtMsgHdr.from_buffer_copy(data)
+ return (rtm_hdr, sizeof(RtMsgHdr))
+
+ def print_base_header(self, hdr, prepend=""):
+ family = self.helper.get_af_name(hdr.rtm_family)
+ print("{}family={}, dst_len={}, src_len={}, tos={}, table={}, protocol={}({}), scope={}({}), type={}({}), flags={}({})".format( # noqa: E501
+ prepend,
+ family,
+ hdr.rtm_dst_len,
+ hdr.rtm_src_len,
+ hdr.rtm_tos,
+ hdr.rtm_table,
+ self.helper.get_attr_byval(RtProto, hdr.rtm_protocol),
+ hdr.rtm_protocol,
+ self.helper.get_attr_byval(RtScope, hdr.rtm_scope),
+ hdr.rtm_scope,
+ self.helper.get_attr_byval(RtType, hdr.rtm_type),
+ hdr.rtm_type,
+ self.helper.get_bitmask_str(RtMsgFlags, hdr.rtm_flags),
+ hdr.rtm_flags))
+
+
+class NetlinkIflaMessage(BaseNetlinkRtMessage):
+ messages = [
+ NlRtMsgType.RTM_NEWLINK.value,
+ NlRtMsgType.RTM_DELLINK.value,
+ NlRtMsgType.RTM_GETLINK.value,
+ ]
+ attr_class_map = ifla_class_map
+ attr_enum = IflattrType
+
+ def __init__(self, helper, nlm_type):
+ super().__init__(helper, nlm_type)
+ self.base_hdr = IfinfoMsg()
+
+ def parse_base_header(self, data):
+ if len(data) < sizeof(IfinfoMsg):
+ raise ValueError("length less than IfinfoMsg header")
+ rtm_hdr = IfinfoMsg.from_buffer_copy(data)
+ return (rtm_hdr, sizeof(IfinfoMsg))
+
+ def print_base_header(self, hdr, prepend=""):
+ family = self.helper.get_af_name(hdr.ifi_family)
+ print("{}family={}, ifi_type={}, ifi_index={}, ifi_flags={}, ifi_change={}".format( # noqa: E501
+ prepend,
+ family,
+ hdr.ifi_type,
+ hdr.ifi_index,
+ hdr.ifi_flags,
+ hdr.ifi_change))
+
+
+class NetlinkIfaMessage(BaseNetlinkRtMessage):
+ messages = [
+ NlRtMsgType.RTM_NEWADDR.value,
+ NlRtMsgType.RTM_DELADDR.value,
+ NlRtMsgType.RTM_GETADDR.value,
+ ]
+ attr_class_map = ifa_class_map
+ attr_enum = IfattrType
+
+ def __init__(self, helper, nlm_type):
+ super().__init__(helper, nlm_type)
+ self.base_hdr = IfaddrMsg()
+
+ def parse_base_header(self, data):
+ if len(data) < sizeof(IfaddrMsg):
+ raise ValueError("length less than IfaddrMsg header")
+ rtm_hdr = IfaddrMsg.from_buffer_copy(data)
+ return (rtm_hdr, sizeof(IfaddrMsg))
+
+ def print_base_header(self, hdr, prepend=""):
+ family = self.helper.get_af_name(hdr.ifa_family)
+ print("{}family={}, ifa_prefixlen={}, ifa_flags={}, ifa_scope={}, ifa_index={}".format( # noqa: E501
+ prepend,
+ family,
+ hdr.ifa_prefixlen,
+ hdr.ifa_flags,
+ hdr.ifa_scope,
+ hdr.ifa_index))
+
+
+class Nlsock():
+ def __init__(self, helper):
+ self.helper = helper
+ self.sock_fd = self._setup_netlink()
+ self._data = bytes()
+ self.rtm_seq = 1
+ self.pid = os.getpid()
+ self.msgmap = self.build_msgmap()
+ self.set_groups(NlRtGroup.RTNLGRP_IPV4_ROUTE.value | NlRtGroup.RTNLGRP_IPV6_ROUTE.value) # noqa: E501
+
+ def build_msgmap(self):
+ classes = [NetlinkRtMessage, NetlinkIfaMessage, NetlinkErrorMessage]
+ xmap = {}
+ for cls in classes:
+ for message in cls.messages:
+ xmap[message] = cls
+ return xmap
+
+ def get_seq(self):
+ ret = self.rtm_seq
+ self.rtm_seq += 1
+ return ret
+
+ def _setup_netlink(self) -> int:
+ family = self.helper.get_af_value("AF_NETLINK")
+ s = socket.socket(family, socket.SOCK_RAW, NlConst.NETLINK_ROUTE)
+ return s
+
+ def set_groups(self, mask: int):
+ self.sock_fd.setsockopt(socket.SOL_SOCKET, 1, mask)
+ # snl = SockaddrNl(nl_len = sizeof(SockaddrNl), nl_family=38,
+ # nl_pid=self.pid, nl_groups=mask)
+ # xbuffer = create_string_buffer(sizeof(SockaddrNl))
+ # memmove(xbuffer, addressof(snl), sizeof(SockaddrNl))
+ # k = struct.pack("@BBHII", 12, 38, 0, self.pid, mask)
+ # self.sock_fd.bind(k)
+
+ def write_message(self, msg):
+ print("vvvvvvvv OUT vvvvvvvv")
+ msg.print_message()
+ msg_bytes = bytes(msg)
+ try:
+ ret = os.write(self.sock_fd.fileno(), bytes(msg))
+ except Exception as e:
+ print("write({}) -> {}".format(len(msg_bytes), e))
+
+ def parse_message(self, data: bytes):
+ if len(data) < sizeof(Nlmsghdr):
+ raise Exception("Short read from nl: {} bytes".format(len(data)))
+ hdr = Nlmsghdr.from_buffer_copy(data)
+ nlmsg_type = hdr.nlmsg_type
+ cls = self.msgmap.get(nlmsg_type)
+ if not cls:
+ cls = BaseNetlinkMessage
+ return cls.from_bytes(self.helper, data)
+
+ def write_data(self, data: bytes):
+ self.sock_fd.send(data)
+
+ def read_data(self):
+ while True:
+ data = self.sock_fd.recv(65535)
+ self._data += data
+ if len(self._data) >= sizeof(Nlmsghdr):
+ break
+ if seq is None:
+ break
+ hdr = Nlmsghdr.from_buffer_copy(data)
+ if hdr.nlmsg_pid == self.pid and hdr.nlmsg_seq == seq:
+ break
+ return data
+
+ def read_message(self) -> bytes:
+ if len(self._data) < sizeof(Nlmsghdr):
+ self.read_data()
+ hdr = Nlmsghdr.from_buffer_copy(self._data)
+ while (hdr.nlmsg_len > len(self._data)):
+ self.read_data()
+ raw_msg = self._data[:hdr.nlmsg_len]
+ self._data = self._data[hdr.nlmsg_len:]
+ return self.parse_message(raw_msg)
+
+ def fill_msg_seq(self, msg):
+ msg.nl_hdr.nlmsg_seq = self.get_seq()
+ msg.nl_hdr.nlmsg_pid = self.pid
+
+ def request_ifaces(self):
+ msg = NetlinkIfaMessage(self.helper, NlRtMsgType.RTM_GETLINK.value)
+ flags = NlmGetFlags.NLM_F_ROOT.value | NlmGetFlags.NLM_F_MATCH.value
+ self.fill_msg_seq(msg)
+ msg.nl_hdr.nlmsg_flags = flags | NlmBaseFlags.NLM_F_REQUEST.value
+
+ msg_bytes = bytes(msg)
+ x = self.parse_message(msg_bytes)
+ x.print_message()
+ print(msg_bytes)
+ # Skip family for now
+ self.write_data(msg_bytes)
+
+ def request_ifaddrs(self, family):
+ msg = NetlinkIfaMessage(self.helper, NlRtMsgType.RTM_GETADDR.value)
+ flags = NlmGetFlags.NLM_F_ROOT.value | NlmGetFlags.NLM_F_MATCH.value
+ self.fill_msg_seq(msg)
+ msg.base_hdr.ifa_family = family
+ msg.nl_hdr.nlmsg_flags = flags | NlmBaseFlags.NLM_F_REQUEST.value
+
+ msg_bytes = bytes(msg)
+ x = self.parse_message(msg_bytes)
+ x.print_message()
+ print(msg_bytes)
+ # Skip family for now
+ self.write_data(msg_bytes)
+
+ def request_routes(self, family):
+ msg = NetlinkRtMessage(self.helper, NlRtMsgType.RTM_GETROUTE.value)
+ flags = NlmGetFlags.NLM_F_ROOT.value | NlmGetFlags.NLM_F_MATCH.value
+ self.fill_msg_seq(msg)
+ msg.base_hdr.rtm_family = family
+ msg.nl_hdr.nlmsg_flags = flags | NlmBaseFlags.NLM_F_REQUEST.value
+
+ msg_bytes = bytes(msg)
+ x = self.parse_message(msg_bytes)
+ x.print_message()
+ print(msg_bytes)
+ # Skip family for now
+ self.write_data(msg_bytes)
+
+
+def main():
+ helper = NlHelper()
+ nl = Nlsock(helper)
+ # nl.request_ifaddrs(socket.AF_INET)
+ #nl.request_routes(0)
+ nl.request_ifaces()
+ while True:
+ msg = nl.read_message()
+ print("")
+ msg.print_message()
+
+ pass
+
+
+if __name__ == "__main__":
+ main()
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Fri, Jan 24, 6:45 PM (11 h, 18 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
16097131
Default Alt Text
D36002.id109971.diff (253 KB)
Attached To
Mode
D36002: netlink: add netlink support
Attached
Detach File
Event Timeline
Log In to Comment