Page MenuHomeFreeBSD

D24232.id70449.diff
No OneTemporary

D24232.id70449.diff

Index: etc/mtree/BSD.include.dist
===================================================================
--- etc/mtree/BSD.include.dist
+++ etc/mtree/BSD.include.dist
@@ -208,6 +208,8 @@
net
altq
..
+ route
+ ..
..
net80211
..
Index: include/Makefile
===================================================================
--- include/Makefile
+++ include/Makefile
@@ -53,6 +53,7 @@
geom/mirror geom/mountver geom/multipath geom/nop \
geom/raid geom/raid3 geom/shsec geom/stripe geom/virstor \
net/altq \
+ net/route \
netgraph/atm netgraph/netflow \
netinet/cc \
netinet/netdump \
Index: lib/libc/gen/sysctl.3
===================================================================
--- lib/libc/gen/sysctl.3
+++ lib/libc/gen/sysctl.3
@@ -563,6 +563,7 @@
.It Dv NET_RT_IFLIST Ta 0 or if_index Ta None
.It Dv NET_RT_IFMALIST Ta 0 or if_index Ta None
.It Dv NET_RT_IFLISTL Ta 0 or if_index Ta None
+.It Dv NET_RT_NHOPS Ta None Ta fib number
.El
.Pp
The
@@ -583,6 +584,9 @@
.Va struct if_msghdrl
and
.Va struct ifa_msghdrl .
+.Pp
+.Dv NET_RT_NHOPS
+returns all nexthops for the specified address family in the given fib.
.It Li PF_INET
Get or set various global information about the IPv4
(Internet Protocol version 4).
Index: sys/conf/files
===================================================================
--- sys/conf/files
+++ sys/conf/files
@@ -4091,6 +4091,11 @@
net/raw_usrreq.c standard
net/route.c standard
net/route_temporal.c standard
+net/route/nhop.c standard
+net/route/nhop_ctl.c standard
+net/route/nhop_utils.c standard
+net/route/route_ctl.c standard
+net/route/route_helpers.c standard
net/rss_config.c optional inet rss | inet6 rss
net/rtsock.c standard
net/slcompress.c optional netgraph_vjc | sppp | \
Index: sys/net/radix_mpath.h
===================================================================
--- sys/net/radix_mpath.h
+++ sys/net/radix_mpath.h
@@ -56,9 +56,26 @@
struct sockaddr *);
void rtalloc_mpath_fib(struct route *, u_int32_t, u_int);
struct rtentry *rt_mpath_select(struct rtentry *, uint32_t);
+struct rtentry *rt_mpath_selectrte(struct rtentry *, uint32_t);
int rt_mpath_deldup(struct rtentry *, struct rtentry *);
int rn4_mpath_inithead(void **, int, u_int);
int rn6_mpath_inithead(void **, int, u_int);
+
+/*
+ * Returns the entry that follows @rt on the radix dupedkey chain, provided
+ * it carries the same rn_mask pointer as @rt.  Returns NULL when the chain
+ * ends here or the following entry has a different mask.
+ */
+static inline struct rtentry *
+rt_mpath_next(struct rtentry *rt)
+{
+	struct radix_node *cur, *sibling;
+
+	cur = (struct radix_node *)rt;
+	sibling = cur->rn_dupedkey;
+
+	if (sibling == NULL || sibling->rn_mask != cur->rn_mask)
+		return (NULL);
+
+	return (struct rtentry *)sibling;
+}
#endif
Index: sys/net/radix_mpath.c
===================================================================
--- sys/net/radix_mpath.c
+++ sys/net/radix_mpath.c
@@ -211,7 +211,7 @@
return (0);
}
-static struct rtentry *
+struct rtentry *
rt_mpath_selectrte(struct rtentry *rte, uint32_t hash)
{
struct radix_node *rn0, *rn;
Index: sys/net/route.h
===================================================================
--- sys/net/route.h
+++ sys/net/route.h
@@ -90,7 +90,8 @@
u_long rmx_rttvar; /* estimated rtt variance */
u_long rmx_pksent; /* packets sent using this route */
u_long rmx_weight; /* route weight */
- u_long rmx_filler[3]; /* will be used for T/TCP later */
+ u_long rmx_nhidx; /* route nexhop index */
+ u_long rmx_filler[2]; /* will be used for T/TCP later */
};
/*
@@ -150,6 +151,7 @@
struct sockaddr *rt_gateway; /* value */
struct ifnet *rt_ifp; /* the answer: interface to use */
struct ifaddr *rt_ifa; /* the answer: interface address to use */
+ struct nhop_object *rt_nhop; /* nexthop data */
int rt_flags; /* up/down?, host/net */
int rt_refcnt; /* # held references */
u_int rt_fibnum; /* which FIB */
@@ -215,9 +217,13 @@
#define NHF_HOST 0x0400 /* RTF_HOST */
/* Nexthop request flags */
+#define NHR_NONE 0x00 /* empty flags field */
#define NHR_IFAIF 0x01 /* Return ifa_ifp interface */
#define NHR_REF 0x02 /* For future use */
+/* uRPF */
+#define NHR_NODEFAULT 0x04 /* do not consider default route */
+
/* Control plane route request flags */
#define NHR_COPY 0x100 /* Copy rte data */
@@ -245,6 +251,8 @@
uint64_t rts_newgateway; /* routes modified by redirects */
uint64_t rts_unreach; /* lookups which failed */
uint64_t rts_wildcard; /* lookups satisfied by a wildcard */
+ uint64_t rts_nh_idx_alloc_failure; /* nexthop index alloc failure*/
+ uint64_t rts_nh_alloc_failure; /* nexthop allocation failure*/
};
/*
@@ -507,6 +515,8 @@
struct sockaddr *gateway, struct sockaddr *author, struct ifnet *ifp,
int flags, int expire_sec);
+/* New API */
+void rib_walk(int af, u_int fibnum, rt_walktree_f_t *wa_f, void *arg);
#endif
#endif
Index: sys/net/route.c
===================================================================
--- sys/net/route.c
+++ sys/net/route.c
@@ -62,6 +62,8 @@
#include <net/if_dl.h>
#include <net/route.h>
#include <net/route_var.h>
+#include <net/route/nhop.h>
+#include <net/route/shared.h>
#include <net/vnet.h>
#ifdef RADIX_MPATH
@@ -108,10 +110,7 @@
SYSCTL_UINT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RWTUN | CTLFLAG_VNET,
&VNET_NAME(rt_add_addr_allfibs), 0, "");
-VNET_PCPUSTAT_DEFINE_STATIC(struct rtstat, rtstat);
-#define RTSTAT_ADD(name, val) \
- VNET_PCPUSTAT_ADD(struct rtstat, rtstat, name, (val))
-#define RTSTAT_INC(name) RTSTAT_ADD(name, 1)
+VNET_PCPUSTAT_DEFINE(struct rtstat, rtstat);
VNET_PCPUSTAT_SYSINIT(rtstat);
#ifdef VIMAGE
@@ -240,6 +239,7 @@
rt_numfibs = RT_MAXFIBS;
if (rt_numfibs == 0)
rt_numfibs = 1;
+ nhops_init();
}
SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, NULL);
@@ -377,6 +377,8 @@
/* Init locks */
RIB_LOCK_INIT(rh);
+ nhops_init_rib(rh);
+
/* Finally, set base callbacks */
rh->rnh_addaddr = rn_addroute;
rh->rnh_deladdr = rn_delete;
@@ -408,6 +410,8 @@
rn_walktree(&rh->rmhead.head, rt_freeentry, &rh->rmhead.head);
+ nhops_destroy_rib(rh);
+
/* Assume table is already empty */
RIB_LOCK_DESTROY(rh);
free(rh, M_RTABLE);
@@ -586,6 +590,9 @@
*/
R_Free(rt_key(rt));
+ /* Unreference nexthop */
+ nhop_free(rt->rt_nhop);
+
/*
* and the rtentry itself of course
*/
@@ -1400,6 +1407,7 @@
RIB_WLOCK(rnh);
rnh->rnh_walktree(&rnh->head, if_updatemtu_cb, &ifmtu);
RIB_WUNLOCK(rnh);
+ nhops_update_ifmtu(rnh, ifp, ifmtu.mtu);
}
}
}
@@ -1544,6 +1552,7 @@
rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt,
u_int fibnum)
{
+ struct epoch_tracker et;
const struct sockaddr *dst;
struct rib_head *rnh;
int error;
@@ -1592,9 +1601,11 @@
error = add_route(rnh, info, ret_nrt);
break;
case RTM_CHANGE:
+ NET_EPOCH_ENTER(et);
RIB_WLOCK(rnh);
error = change_route(rnh, info, ret_nrt);
RIB_WUNLOCK(rnh);
+ NET_EPOCH_EXIT(et);
break;
default:
error = EOPNOTSUPP;
@@ -1609,9 +1620,11 @@
{
struct sockaddr *dst, *ndst, *gateway, *netmask;
struct rtentry *rt, *rt_old;
+ struct nhop_object *nh;
struct radix_node *rn;
struct ifaddr *ifa;
int error, flags;
+ struct epoch_tracker et;
dst = info->rti_info[RTAX_DST];
gateway = info->rti_info[RTAX_GATEWAY];
@@ -1631,18 +1644,30 @@
} else {
ifa_ref(info->rti_ifa);
}
+
+ NET_EPOCH_ENTER(et);
+ error = nhop_create_from_info(rnh, info, &nh);
+ NET_EPOCH_EXIT(et);
+ if (error != 0) {
+ ifa_free(info->rti_ifa);
+ return (error);
+ }
+
rt = uma_zalloc(V_rtzone, M_NOWAIT);
if (rt == NULL) {
ifa_free(info->rti_ifa);
+ nhop_free(nh);
return (ENOBUFS);
}
rt->rt_flags = RTF_UP | flags;
rt->rt_fibnum = rnh->rib_fibnum;
+ rt->rt_nhop = nh;
/*
* Add the gateway. Possibly re-malloc-ing the storage for it.
*/
if ((error = rt_setgate(rt, dst, gateway)) != 0) {
ifa_free(info->rti_ifa);
+ nhop_free(nh);
uma_zfree(V_rtzone, rt);
return (error);
}
@@ -1682,6 +1707,7 @@
ifa_free(rt->rt_ifa);
R_Free(rt_key(rt));
+ nhop_free(nh);
uma_zfree(V_rtzone, rt);
return (EEXIST);
}
@@ -1723,6 +1749,7 @@
if (rn == NULL) {
ifa_free(rt->rt_ifa);
R_Free(rt_key(rt));
+ nhop_free(nh);
uma_zfree(V_rtzone, rt);
return (EEXIST);
}
@@ -1802,6 +1829,7 @@
int error = 0;
int free_ifa = 0;
int family, mtu;
+ struct nhop_object *nh;
struct if_mtuinfo ifmtu;
RIB_WLOCK_ASSERT(rnh);
@@ -1824,6 +1852,7 @@
}
#endif
+ nh = NULL;
RT_LOCK(rt);
rt_setmetrics(info, rt);
@@ -1852,6 +1881,10 @@
goto bad;
}
+ error = nhop_create_from_nhop(rnh, rt->rt_nhop, info, &nh);
+ if (error != 0)
+ goto bad;
+
/* Check if outgoing interface has changed */
if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa &&
rt->rt_ifa != NULL) {
@@ -1897,6 +1930,11 @@
}
}
+ /* Update nexthop */
+ nhop_free(rt->rt_nhop);
+ rt->rt_nhop = nh;
+ nh = NULL;
+
/*
* This route change may have modified the route's gateway. In that
* case, any inpcbs that have cached this route need to invalidate their
@@ -1910,6 +1948,8 @@
}
bad:
RT_UNLOCK(rt);
+ if (nh != NULL)
+ nhop_free(nh);
if (free_ifa != 0) {
ifa_free(info->rti_ifa);
info->rti_ifa = NULL;
Index: sys/net/route/nhop.h
===================================================================
--- /dev/null
+++ sys/net/route/nhop.h
@@ -0,0 +1,229 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * This header file contains public definitions for the nexthop routing subsystem.
+ */
+
+#ifndef _NET_ROUTE_NHOP_H_
+#define _NET_ROUTE_NHOP_H_
+
+#include <netinet/in.h> /* sockaddr_in && sockaddr_in6 */
+
+#include <sys/counter.h>
+
+/*
+ * Nexthop types.
+ * Declared outside of the _KERNEL section, so the values are visible to
+ * userland consumers of this header as well.
+ */
+enum nhop_type {
+ NH_TYPE_IPV4_ETHER_RSLV = 1, /* IPv4 ethernet without GW */
+ NH_TYPE_IPV4_ETHER_NHOP = 2, /* IPv4 with pre-calculated ethernet encap */
+ NH_TYPE_IPV6_ETHER_RSLV = 3, /* IPv6 ethernet, without GW */
+ NH_TYPE_IPV6_ETHER_NHOP = 4 /* IPv6 with pre-calculated ethernet encap */
+};
+
+#ifdef _KERNEL
+
+/*
+ * Define a shorter version of the AF_LINK sockaddr.
+ *
+ * Currently the only use case of an AF_LINK gateway is storing the
+ * interface index of the interface of the source IPv6 address.
+ * This is used by the IPv6 code for connections over the loopback
+ * interface.
+ *
+ * The structure below copies 'struct sockaddr_dl', reducing the
+ * size of the sdl_data buffer, as it is not used. This change
+ * allows storing AF_LINK gateways in the nhop gateway itself,
+ * simplifying control plane handling.
+ */
+struct sockaddr_dl_short {
+ u_char sdl_len; /* Total length of sockaddr */
+ u_char sdl_family; /* AF_LINK */
+ u_short sdl_index; /* if != 0, system given index for interface */
+ u_char sdl_type; /* interface type */
+ u_char sdl_nlen; /* interface name length, no trailing 0 reqd. */
+ u_char sdl_alen; /* link level address length */
+ u_char sdl_slen; /* link layer selector length */
+ char sdl_data[8]; /* unused */
+};
+
+#define NHOP_RELATED_FLAGS \
+ (RTF_GATEWAY | RTF_HOST | RTF_REJECT | RTF_BLACKHOLE | \
+ RTF_FIXEDMTU | RTF_LOCAL | RTF_BROADCAST | RTF_MULTICAST)
+
+struct nh_control;
+struct nhop_priv;
+
+/*
+ * Struct 'nhop_object' field description:
+ *
+ * nh_flags: NHF_ flags used in the dataplane code. NHF_GATEWAY or NHF_BLACKHOLE
+ * can be examples of such flags.
+ * nh_mtu: ready-to-use nexthop mtu. Already accounts for the link-level header,
+ * interface MTU and protocol-specific limitations.
+ * nh_prepend_len: link-level prepend length. Currently unused.
+ * nh_ifp: logical transmit interface. The one from which if_transmit() will be
+ * called. Guaranteed to be non-NULL.
+ * nh_aifp: ifnet of the source address. Same as nh_ifp except IPv6 loopback
+ * routes. See the example below.
+ * nh_ifa: interface address to use. Guaranteed to be non-NULL.
+ * nh_pksent: counter(9) reflecting the number of packets transmitted.
+ *
+ * gw_: storage suitable to hold AF_INET, AF_INET6 or AF_LINK gateway. More
+ * details are available in the examples below.
+ *
+ * Examples:
+ *
+ * Direct routes (routes w/o gateway):
+ * NHF_GATEWAY is NOT set.
+ * nh_ifp denotes the logical transmit interface.
+ * nh_aifp is the same as nh_ifp
+ * gw_sa contains AF_LINK sa with nh_aifp ifindex (compat)
+ * Loopback routes:
+ * NHF_GATEWAY is NOT set.
+ * nh_ifp points to the loopback interface (lo0).
+ * nh_aifp points to the interface where the destination address belongs to.
+ * This is useful in IPv6 link-local-over-loopback communications.
+ * gw_sa contains AF_LINK sa with nh_aifp ifindex (compat)
+ * GW routes:
+ * NHF_GATEWAY is set.
+ * nh_ifp denotes the logical transmit interface.
+ * nh_aifp is the same as nh_ifp
+ * gw_sa contains L3 address (either AF_INET or AF_INET6).
+ *
+ *
+ * Note: struct nhop_object fields are ordered in a way that
+ * supports memcmp-based comparisons.
+ *
+ */
+/*
+ * Byte offset of nh_pksent within struct nhop_object, i.e. the length of
+ * the leading fields (flags, mtu, gateway, ifp, ifa, aifp).
+ * NOTE(review): going by the field-ordering note above, this appears to
+ * bound the memcmp-comparable prefix - confirm against the users.
+ */
+#define NHOP_END_CMP (__offsetof(struct nhop_object, nh_pksent))
+
+struct nhop_object {
+ uint16_t nh_flags; /* nhop flags */
+ uint16_t nh_mtu; /* nexthop mtu */
+ /* Gateway storage: all members below alias the same bytes. */
+ union {
+ struct sockaddr_in gw4_sa; /* GW accessor as IPv4 */
+ struct sockaddr_in6 gw6_sa; /* GW accessor as IPv6 */
+ struct sockaddr gw_sa;
+ struct sockaddr_dl_short gwl_sa; /* AF_LINK gw (compat) */
+ char gw_buf[28];
+ };
+ struct ifnet *nh_ifp; /* Logical egress interface. Always != NULL */
+ struct ifaddr *nh_ifa; /* interface address to use. Always != NULL */
+ struct ifnet *nh_aifp; /* ifnet of the source address. Always != NULL */
+ counter_u64_t nh_pksent; /* packets sent using this nhop */
+ /* 32 bytes + 4xPTR == 64(amd64) / 48(i386) */
+ uint8_t nh_prepend_len; /* length of prepend data */
+ uint8_t spare[3];
+ uint32_t spare1; /* alignment */
+ char nh_prepend[48]; /* L2 prepend */
+ struct nhop_priv *nh_priv; /* control plane data */
+ /* -- 128 bytes -- */
+};
+
+/*
+ * Nexthop validity.
+ *
+ * Currently we verify whether link is up or not on every packet, which can be
+ * quite costly.
+ * TODO: subscribe for the interface notifications and update the nexthops
+ * with NHF_INVALID flag.
+ */
+
+/* True when the nexthop's transmit interface passes RT_LINK_IS_UP(). */
+#define NH_IS_VALID(_nh) RT_LINK_IS_UP((_nh)->nh_ifp)
+/* Non-zero when NHF_MULTIPATH is set in the nexthop flags. */
+#define NH_IS_MULTIPATH(_nh) ((_nh)->nh_flags & NHF_MULTIPATH)
+
+/*
+ * Gateway sockaddr stored in the route's nexthop. gw4_sa is a member of the
+ * anonymous gateway union, so its address is the address of the gateway
+ * storage regardless of the actual address family.
+ */
+#define RT_GATEWAY(_rt) ((struct sockaddr *)&(_rt)->rt_nhop->gw4_sa)
+#define RT_GATEWAY_CONST(_rt) ((const struct sockaddr *)&(_rt)->rt_nhop->gw4_sa)
+
+/*
+ * Calls nhop_free() on _nh and then sets the caller's pointer to NULL.
+ * _nh is expanded twice and is assigned to, so it must be a side-effect
+ * free lvalue.
+ */
+#define NH_FREE(_nh) do { \
+ nhop_free(_nh); \
+ /* guard against invalid refs */ \
+ _nh = NULL; \
+} while (0)
+
+
+void nhop_free(struct nhop_object *nh);
+
+struct sysctl_req;
+struct sockaddr_dl;
+struct rib_head;
+
+uint32_t nhop_get_idx(const struct nhop_object *nh);
+enum nhop_type nhop_get_type(const struct nhop_object *nh);
+int nhop_get_rtflags(const struct nhop_object *nh);
+
+int nhops_dump_sysctl(struct rib_head *rh, struct sysctl_req *w);
+
+#endif /* _KERNEL */
+
+/* Kernel <> userland structures */
+
+/* Structure usage and layout are described in dump_nhop_entry() */
+struct nhop_external {
+ uint32_t nh_len; /* length of the datastructure */
+ uint32_t nh_idx; /* Nexthop index */
+ uint32_t nh_fib; /* Fib nexthop is attached to */
+ uint32_t ifindex; /* transmit interface ifindex */
+ uint32_t aifindex; /* address ifindex */
+ uint8_t prepend_len; /* length of the prepend */
+ uint8_t nh_family; /* address family */
+ uint16_t nh_type; /* nexthop type */
+ uint16_t nh_mtu; /* nexthop mtu */
+
+ uint16_t nh_flags; /* nhop flags */
+ struct in_addr nh_addr; /* GW/DST IPv4 address */
+ struct in_addr nh_src; /* default source IPv4 address */
+ uint64_t nh_pksent; /* presumably packets sent via this nhop - confirm */
+ /* control plane */
+ /* lookup key: address, family, type */
+ char nh_prepend[64]; /* L2 prepend */
+ uint64_t nh_refcount; /* number of references */
+};
+
+/*
+ * Header locating the sockaddrs that belong to an exported nexthop.
+ * NOTE(review): what the offsets are relative to is not visible here;
+ * presumably described in dump_nhop_entry() - confirm.
+ */
+struct nhop_addrs {
+ uint32_t na_len; /* length of the datastructure */
+ uint16_t gw_sa_off; /* offset of gateway SA */
+ uint16_t src_sa_off; /* offset of src address SA */
+};
+
+/*
+ * NOTE(review): presumably one member of an exported multipath group,
+ * pairing a nexthop index with its weight - no user is visible here, confirm.
+ */
+struct mpath_nhop_external {
+ uint32_t nh_idx;
+ uint32_t nh_weight;
+};
+
+/*
+ * NOTE(review): presumably the exported description of a multipath group
+ * (index, reference count, nexthop count, group size) - no user is visible
+ * here, confirm before relying on the field meanings.
+ */
+struct mpath_external {
+ uint32_t mp_idx;
+ uint32_t mp_refcount;
+ uint32_t mp_nh_count;
+ uint32_t mp_group_size;
+};
+
+
+#endif
+
+
Index: sys/net/route/nhop.c
===================================================================
--- /dev/null
+++ sys/net/route/nhop.c
@@ -0,0 +1,388 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_route.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/kernel.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/route.h>
+#include <net/route_var.h>
+#include <net/route/nhop_utils.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_var.h>
+#include <net/route/shared.h>
+#include <net/vnet.h>
+
+/*
+ * This file contains data structures management logic for the nexthop ("nhop")
+ * route subsystem.
+ *
+ * Nexthops in the original sense are the objects containing all the necessary
+ * information to forward the packet to the selected destination.
+ * In particular, nexthop is defined by a combination of
+ * ifp, ifa, aifp, mtu, gw addr(if set), nh_type, nh_family, mask of rt_flags and
+ * NHF_DEFAULT
+ *
+ * All nexthops are stored in the resizable hash table.
+ * Additionally, each nexthop gets assigned its unique index (nexthop index)
+ * so userland programs can interact with the nexthops easier. Index allocation
+ * is backed by the bitmask array.
+ */
+
+static MALLOC_DEFINE(M_NHOP, "nhops", "nexthops data");
+
+
+/* Hash management functions */
+
+/*
+ * Creates the nexthop control structure for @rh: a 16-bucket nexthop hash,
+ * a 1024-item nexthop index bitmask and the nhops lock, then cross-links
+ * the control structure and the rib head.
+ *
+ * Always returns 0.
+ */
+int
+nhops_init_rib(struct rib_head *rh)
+{
+	struct nh_control *ctl;
+	void *hash_mem, *idx_mem;
+	uint32_t num_buckets, num_items;
+	size_t hash_size;
+
+	ctl = malloc(sizeof(struct nh_control), M_NHOP, M_WAITOK | M_ZERO);
+
+	/*
+	 * Nexthop hash: 16 buckets (128 bytes) to begin with, which is
+	 * enough for most setups.
+	 */
+	num_buckets = 16;
+	hash_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets);
+	hash_mem = malloc(hash_size, M_NHOP, M_WAITOK | M_ZERO);
+	CHT_SLIST_INIT(&ctl->nh_head, hash_mem, num_buckets);
+
+	/* Nexthop index bitmask: 128 bytes worth of bits. */
+	num_items = 128 * 8;
+	idx_mem = malloc(bitmask_get_size(num_items), M_NHOP,
+	    M_WAITOK | M_ZERO);
+	bitmask_init(&ctl->nh_idx_head, idx_mem, num_items);
+
+	NHOPS_LOCK_INIT(ctl);
+
+	/* Cross-link rib head and nexthop control data. */
+	rh->nh_control = ctl;
+	ctl->ctl_rh = rh;
+
+	DPRINTF("NHOPS init for fib %u af %u: ctl %p rh %p", rh->rib_fibnum,
+	    rh->rib_family, ctl, rh);
+
+	return (0);
+}
+
+/*
+ * Releases everything owned by @ctl: the nhops lock, the hash bucket
+ * array, the index bitmask storage and finally @ctl itself.
+ */
+static void
+destroy_ctl(struct nh_control *ctl)
+{
+	void *hash_mem, *idx_mem;
+
+	hash_mem = ctl->nh_head.ptr;
+	idx_mem = ctl->nh_idx_head.idx;
+
+	NHOPS_LOCK_DESTROY(ctl);
+	free(hash_mem, M_NHOP);
+	free(idx_mem, M_NHOP);
+	free(ctl, M_NHOP);
+}
+
+/*
+ * Deferred-destruction callback handed to epoch_call(): recovers the
+ * nh_control that embeds @ctx and destroys it.
+ */
+static void
+destroy_ctl_epoch(epoch_context_t ctx)
+{
+
+	destroy_ctl(__containerof(ctx, struct nh_control, ctl_epoch_ctx));
+}
+
+/*
+ * Detaches the nexthop machinery from @rh: marks every nexthop still in
+ * the hash as unlinked and schedules the control structure for destruction
+ * through the network epoch.
+ */
+void
+nhops_destroy_rib(struct rib_head *rh)
+{
+	struct nh_control *ctl = rh->nh_control;
+	struct nhop_priv *priv;
+
+	/*
+	 * rt_table_destroy() is expected to have removed all routes by now.
+	 * Still, the TCP stack or other consumers may hold referenced
+	 * nexthop pointers. Once such a reference drops to zero,
+	 * nhop_free() would try to unlink the record from the
+	 * datastructures, most likely ending in a panic.
+	 *
+	 * Prevent that by explicitly marking the remaining nexthops as
+	 * unlinked: drop the reference held in the dedicated nh_linked
+	 * counter. See the nhop_free() comments for details.
+	 */
+	NHOPS_WLOCK(ctl);
+	CHT_SLIST_FOREACH(&ctl->nh_head, nhops, priv) {
+		DPRINTF("Marking nhop %u unlinked", priv->nh_idx);
+		refcount_release(&priv->nh_linked);
+	} CHT_SLIST_FOREACH_END;
+	NHOPS_WUNLOCK(ctl);
+
+	/*
+	 * Defer the actual teardown to the end of the current epoch so
+	 * that nhop_free() can still dereference the nh_control pointer.
+	 */
+	epoch_call(net_epoch_preempt, destroy_ctl_epoch, &ctl->ctl_epoch_ctx);
+}
+
+/*
+ * Nexthop hash calculation:
+ *
+ * Nexthops distribution:
+ * 2 "mandatory" nexthops per interface ("interface route", "loopback").
+ * For direct peering: 1 nexthop for the peering router per ifp/af.
+ * For IX-like peering: tens to hundreds of neighbor nexthops per ifp/af.
+ * IGP control plane & broadcast segment: tens of nexthops per ifp/af.
+ *
+ * Each fib/af combination has its own hash table.
+ * With that in mind, hash nexthops by the combination of the interface
+ * and GW IP address.
+ *
+ * To optimize hash calculation, ignore higher bytes of ifindex, as they
+ * give very little entropy.
+ * Similarly, use lower 4 bytes of IPv6 address to distinguish between the
+ * neighbors.
+ */
+/* Hash key as filled in by hash_priv(); unset fields stay zero. */
+struct _hash_data {
+ uint16_t ifindex; /* low 16 bits of the nh_ifp if_index */
+ uint8_t family; /* gateway sockaddr family */
+ uint8_t nh_type; /* low 8 bits of the nexthop type */
+ uint32_t gw_addr; /* IPv4 gateway or last 4 bytes of the IPv6 one */
+};
+
+/*
+ * Hashes @len bytes starting at @h: for every byte the accumulator is
+ * multiplied by 33 and then XORed with the byte. Returns 0 for a
+ * non-positive @len.
+ */
+static unsigned
+djb_hash(const unsigned char *h, const int len)
+{
+	const unsigned char *p = h;
+	unsigned int acc = 0;
+	int left;
+
+	for (left = len; left > 0; left--)
+		acc = (acc * 33) ^ *p++;
+
+	return (acc);
+}
+
+/*
+ * Computes the hash of the nexthop described by @priv.
+ *
+ * The key is built from the low 16 bits of the transmit interface index,
+ * the gateway address family, the low 8 bits of the nexthop type and,
+ * for AF_INET/AF_INET6 gateways, 4 bytes of the gateway address (the
+ * whole IPv4 address or the last 32 bits of the IPv6 one). The key is
+ * zeroed first so that unset fields hash consistently.
+ */
+static uint32_t
+hash_priv(const struct nhop_priv *priv)
+{
+	const struct nhop_object *nh = priv->nh;
+	struct _hash_data key;
+
+	memset(&key, 0, sizeof(key));
+	key.ifindex = nh->nh_ifp->if_index & 0xFFFF;
+	key.family = nh->gw_sa.sa_family;
+	key.nh_type = priv->nh_type & 0xFF;
+
+	switch (nh->gw_sa.sa_family) {
+	case AF_INET6:
+		memcpy(&key.gw_addr, &nh->gw6_sa.sin6_addr.s6_addr32[3], 4);
+		break;
+	case AF_INET:
+		memcpy(&key.gw_addr, &nh->gw4_sa.sin_addr, 4);
+		break;
+	default:
+		/* other families contribute no gateway bytes */
+		break;
+	}
+
+	return (uint32_t)(djb_hash((const unsigned char *)&key, sizeof(key)));
+}
+
+/*
+ * Resizes the nexthop hash and/or the nexthop index bitmask of @ctl.
+ *
+ * @new_nh_buckets: requested number of hash buckets, 0 to keep the hash.
+ * @new_idx_items: requested number of bitmask items, 0 to keep the bitmask.
+ *
+ * The new storage is allocated with M_NOWAIT before the nhops write lock
+ * is taken; a failed allocation simply skips the corresponding resize.
+ * Every buffer that does not end up installed in @ctl is freed before
+ * returning.
+ */
+static void
+consider_resize(struct nh_control *ctl, uint32_t new_nh_buckets, uint32_t new_idx_items)
+{
+	void *nh_ptr, *nh_idx_ptr;
+	void *old_idx_ptr;
+	size_t alloc_size;
+
+	nh_ptr = NULL;
+	if (new_nh_buckets != 0) {
+		alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_nh_buckets);
+		nh_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO);
+	}
+
+	nh_idx_ptr = NULL;
+	if (new_idx_items != 0) {
+		alloc_size = bitmask_get_size(new_idx_items);
+		nh_idx_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO);
+	}
+
+	if (nh_ptr == NULL && nh_idx_ptr == NULL) {
+		/* Either resize is not required or allocations have failed. */
+		return;
+	}
+
+	DPRINTF("going to resize: nh:[ptr:%p sz:%u] idx:[ptr:%p sz:%u]", nh_ptr,
+	    new_nh_buckets, nh_idx_ptr, new_idx_items);
+
+	old_idx_ptr = NULL;
+
+	NHOPS_WLOCK(ctl);
+	if (nh_ptr != NULL) {
+		/*
+		 * NOTE(review): nh_ptr is freed below, so the macro is
+		 * expected to hand the replaced bucket array back in
+		 * nh_ptr - confirm in nhop_utils.h.
+		 */
+		CHT_SLIST_RESIZE(&ctl->nh_head, nhops, nh_ptr, new_nh_buckets);
+	}
+	if (nh_idx_ptr != NULL) {
+		if (bitmask_copy(&ctl->nh_idx_head, nh_idx_ptr, new_idx_items) == 0)
+			bitmask_swap(&ctl->nh_idx_head, nh_idx_ptr, new_idx_items, &old_idx_ptr);
+		else {
+			/*
+			 * The copy failed, so the new buffer was never
+			 * installed. Queue it for release instead of
+			 * leaking it.
+			 */
+			old_idx_ptr = nh_idx_ptr;
+		}
+	}
+	NHOPS_WUNLOCK(ctl);
+
+	/* Release the storage that is no longer (or was never) in use. */
+	if (nh_ptr != NULL)
+		free(nh_ptr, M_NHOP);
+	if (old_idx_ptr != NULL)
+		free(old_idx_ptr, M_NHOP);
+}
+
+/*
+ * Links nexthop @nh_priv into the nexthop hash table of @ctl and allocates
+ * a nexthop index for it.
+ *
+ * Resize hints for the hash and the index bitmask are sampled under the
+ * write lock and acted upon via consider_resize() after it is dropped,
+ * regardless of whether the index allocation succeeded.
+ *
+ * Returns the allocated index or 0 on failure.
+ */
+int
+link_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv)
+{
+	uint32_t want_buckets, want_items;
+	uint16_t new_idx;
+	int error;
+
+	KASSERT((nh_priv->nh_idx == 0), ("nhop index is already allocated"));
+
+	NHOPS_WLOCK(ctl);
+
+	/*
+	 * Both helpers below return either the new size or 0 when no
+	 * resize is needed.
+	 */
+	want_buckets = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->nh_head);
+	want_items = bitmask_get_resize_items(&ctl->nh_idx_head);
+
+	error = bitmask_alloc_idx(&ctl->nh_idx_head, &new_idx);
+	if (error == 0) {
+		nh_priv->nh_idx = new_idx;
+		nh_priv->nh_control = ctl;
+
+		CHT_SLIST_INSERT_HEAD(&ctl->nh_head, nhops, nh_priv);
+	}
+
+	NHOPS_WUNLOCK(ctl);
+
+	if (error != 0) {
+		DPRINTF("Unable to allocate nhop index");
+		RTSTAT_INC(rts_nh_idx_alloc_failure);
+		consider_resize(ctl, want_buckets, want_items);
+		return (0);
+	}
+
+	DPRINTF("Linked nhop priv %p to %d, hash %u, ctl %p", nh_priv, new_idx,
+	    hash_priv(nh_priv), ctl);
+	consider_resize(ctl, want_buckets, want_items);
+
+	return (new_idx);
+}
+
+/*
+ * Removes the nexthop matching @nh_priv_del from the hash of @ctl and
+ * releases its nexthop index. Afterwards the hash and the index bitmask
+ * are given a chance to resize.
+ *
+ * Returns the nexthop that was removed or NULL if none was found.
+ */
+struct nhop_priv *
+unlink_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv_del)
+{
+	struct nhop_priv *found;
+	uint32_t want_buckets, want_items;
+	int freed_idx = 0;
+
+	NHOPS_WLOCK(ctl);
+	CHT_SLIST_REMOVE_BYOBJ(&ctl->nh_head, nhops, nh_priv_del, found);
+
+	if (found != NULL) {
+		/* Take the index away from the nexthop and release it. */
+		freed_idx = found->nh_idx;
+		found->nh_idx = 0;
+
+		KASSERT((freed_idx != 0), ("bogus nhop index 0"));
+		if ((bitmask_free_idx(&ctl->nh_idx_head, freed_idx)) != 0) {
+			DPRINTF("Unable to remove index %d from fib %u af %d",
+			    freed_idx, ctl->ctl_rh->rib_fibnum,
+			    ctl->ctl_rh->rib_family);
+		}
+	}
+
+	/* Sample the resize hints while still holding the lock. */
+	want_buckets = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->nh_head);
+	want_items = bitmask_get_resize_items(&ctl->nh_idx_head);
+
+	NHOPS_WUNLOCK(ctl);
+
+	if (found != NULL)
+		DPRINTF("Unlinked nhop %p priv idx %d", found, freed_idx);
+	else
+		DPRINTF("Unable to unlink nhop priv %p from hash, hash %u ctl %p",
+		    nh_priv_del, hash_priv(nh_priv_del), ctl);
+
+	consider_resize(ctl, want_buckets, want_items);
+
+	return (found);
+}
+
+/*
+ * Looks up the nexthop matching the data specified in @nh_priv in the
+ * hash of @ctl and takes a reference on it.
+ *
+ * Returns the referenced nexthop, or NULL when nothing matches or the
+ * match already has a zero reference count.
+ */
+struct nhop_priv *
+find_nhop(struct nh_control *ctl, const struct nhop_priv *nh_priv)
+{
+	struct nhop_priv *match;
+
+	NHOPS_RLOCK(ctl);
+	CHT_SLIST_FIND_BYOBJ(&ctl->nh_head, nhops, nh_priv, match);
+	/* A refcount of 0 means the nhop is being deleted: report a miss. */
+	if (match != NULL &&
+	    refcount_acquire_if_not_zero(&match->nh_refcnt) == 0)
+		match = NULL;
+	NHOPS_RUNLOCK(ctl);
+
+	return (match);
+}
+
Index: sys/net/route/nhop_ctl.c
===================================================================
--- /dev/null
+++ sys/net/route/nhop_ctl.c
@@ -0,0 +1,827 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_route.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/kernel.h>
+#include <sys/epoch.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_dl.h>
+#include <net/route.h>
+#include <net/route_var.h>
+#include <net/route/nhop_utils.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_var.h>
+#include <net/route/shared.h>
+#include <net/vnet.h>
+
+/*
+ * This file contains core functionality for the nexthop ("nhop") route subsystem.
+ * The business logic needed to create nexthop objects is implemented here.
+ *
+ * Nexthops in the original sense are the objects containing all the necessary
+ * information to forward the packet to the selected destination.
+ * In particular, nexthop is defined by a combination of
+ * ifp, ifa, aifp, mtu, gw addr(if set), nh_type, nh_family, mask of rt_flags and
+ * NHF_DEFAULT
+ *
+ * Additionally, each nexthop gets assigned its unique index (nexthop index).
+ * It serves two purposes: the first is to let userland programs reference
+ * nexthops by their index. The second is to allow lookup algorithms to
+ * store an index instead of a pointer (2 bytes vs 8) as a lookup result.
+ * All nexthops are stored in the resizable hash table.
+ *
+ * Basically, this file revolves around supporting 3 functions:
+ * 1) nhop_create_from_info / nhop_create_from_nhop, which contain all
+ * business logic on filling the nexthop fields based on the provided request.
+ * 2) get_nhop(), which returns a usable referenced nexthop.
+ *
+ * Conventions:
+ * 1) non-exported functions start with verb
+ * 2) exported function starts with the subsystem prefix: "nhop"
+ */
+
+static int dump_nhop_entry(struct rib_head *rh, struct nhop_object *nh, struct sysctl_req *w);
+
+static struct nhop_priv *alloc_nhop_structure(void);
+static int get_nhop(struct rib_head *rnh, struct rt_addrinfo *info,
+ struct nhop_priv **pnh_priv);
+static int finalize_nhop(struct nh_control *ctl, struct rt_addrinfo *info,
+ struct nhop_priv *nh_priv);
+static struct ifnet *get_aifp(const struct nhop_object *nh, int reference);
+static void fill_sdl_from_ifp(struct sockaddr_dl_short *sdl, const struct ifnet *ifp);
+
+static void destroy_nhop_epoch(epoch_context_t ctx);
+static void destroy_nhop(struct nhop_priv *nh_priv);
+
+static void print_nhop(const char *prefix, const struct nhop_object *nh);
+
+_Static_assert(__offsetof(struct nhop_object, nh_ifp) == 32,
+ "nhop_object: wrong nh_ifp offset");
+_Static_assert(sizeof(struct nhop_object) <= 128,
+ "nhop_object: size exceeds 128 bytes");
+
+static uma_zone_t nhops_zone; /* Global zone for each and every nexthop */
+
+
+/*
+ * Sizes of the public and private nexthop parts, each rounded up to a
+ * multiple of 2 * CACHE_LINE_SIZE.
+ */
+#define NHOP_OBJECT_ALIGNED_SIZE roundup2(sizeof(struct nhop_object), \
+ 2 * CACHE_LINE_SIZE)
+#define NHOP_PRIV_ALIGNED_SIZE roundup2(sizeof(struct nhop_priv), \
+ 2 * CACHE_LINE_SIZE)
+
+/*
+ * Creates the global UMA zone used for nexthop allocations. Each zone item
+ * is sized to hold one aligned nhop_object plus one aligned nhop_priv.
+ */
+void
+nhops_init(void)
+{
+
+ nhops_zone = uma_zcreate("routing nhops",
+ NHOP_OBJECT_ALIGNED_SIZE + NHOP_PRIV_ALIGNED_SIZE,
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+}
+
+/*
+ * Returns the interface of the source address used by the route.
+ *
+ * Normally this is the transmit interface (nh_ifp). When the transmit
+ * interface is a loopback one and the gateway is an AF_LINK sockaddr,
+ * the interface is looked up by the index stored in that sockaddr
+ * instead, so that we know the interface associated with the
+ * destination address rather than the loopback. This is needed to
+ * support link-local IPv6 loopback communications.
+ *
+ * If @reference is non-zero, the returned ifp is referenced.
+ * Falls back to nh_ifp if the index lookup yields nothing.
+ */
+static struct ifnet *
+get_aifp(const struct nhop_object *nh, int reference)
+{
+	struct ifnet *ifp;
+	int via_loopback;
+
+	via_loopback = (nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0 &&
+	    nh->gw_sa.sa_family == AF_LINK;
+
+	if (via_loopback) {
+		ifp = reference ? ifnet_byindex_ref(nh->gwl_sa.sdl_index) :
+		    ifnet_byindex(nh->gwl_sa.sdl_index);
+		if (ifp != NULL)
+			return (ifp);
+		DPRINTF("unable to get aifp for %s index %d",
+			if_name(nh->nh_ifp), nh->gwl_sa.sdl_index);
+	}
+
+	/* Default case: address interface is the transmit interface */
+	ifp = nh->nh_ifp;
+	if (reference)
+		if_ref(ifp);
+
+	return (ifp);
+}
+
+/*
+ * Compares two nexthops: the first NHOP_END_CMP bytes of the public
+ * objects plus the private type and family fields.
+ *
+ * Returns 1 if they are equal, 0 otherwise.
+ */
+int
+cmp_priv(const struct nhop_priv *_one, const struct nhop_priv *_two)
+{
+
+	return (memcmp(_one->nh, _two->nh, NHOP_END_CMP) == 0 &&
+	    _one->nh_type == _two->nh_type &&
+	    _one->nh_family == _two->nh_family);
+}
+
+/*
+ * Applies the MTU from @info to @nh, but only if the request carries
+ * RTV_MTU. RTF_FIXEDMTU is set for a non-zero MTU and cleared for zero.
+ */
+static void
+set_nhop_mtu_from_info(struct nhop_object *nh, const struct rt_addrinfo *info)
+{
+
+	if ((info->rti_mflags & RTV_MTU) == 0)
+		return;
+
+	if (info->rti_rmx->rmx_mtu != 0) {
+		/* MTU was explicitly provided by user: keep it. */
+		nh->nh_priv->rt_flags |= RTF_FIXEDMTU;
+	} else {
+		/* User explicitly sets MTU to 0: assume rollback to default. */
+		nh->nh_priv->rt_flags &= ~RTF_FIXEDMTU;
+	}
+	nh->nh_mtu = info->rti_rmx->rmx_mtu;
+}
+
+/*
+ * Fills in a shortened link-level sockaddr suitable to be stored inside the
+ * nexthop gateway buffer. Only family, length, interface index and interface
+ * type are written; other fields of @sdl are left untouched.
+ */
+static void
+fill_sdl_from_ifp(struct sockaddr_dl_short *sdl, const struct ifnet *ifp)
+{
+
+ sdl->sdl_family = AF_LINK;
+ sdl->sdl_len = sizeof(struct sockaddr_dl_short);
+ sdl->sdl_index = ifp->if_index;
+ sdl->sdl_type = ifp->if_type;
+}
+
+/*
+ * Stores the gateway of @nh based on @info.
+ *
+ * Returns 0 on success or ENOMEM if the supplied gateway sockaddr is
+ * larger than struct sockaddr_in6.
+ */
+static int
+set_nhop_gw_from_info(struct nhop_object *nh, struct rt_addrinfo *info)
+{
+	struct sockaddr *gw_sa;
+
+	if ((info->rti_flags & RTF_GATEWAY) == 0) {
+		/*
+		 * Interface route. Currently the route.c code adds
+		 * sa of type AF_LINK, which is 56 bytes long. The only
+		 * meaningful data there is the interface index. It is
+		 * used in the IPv6 loopback output, where we need to
+		 * preserve the original interface to maintain proper
+		 * scoping. Despite the fact that nexthop code stores the
+		 * original interface in a separate field (nh_aifp), write
+		 * an AF_LINK compatible sa with shorter total length.
+		 */
+		fill_sdl_from_ifp(&nh->gwl_sa, nh->nh_ifp);
+		return (0);
+	}
+
+	gw_sa = info->rti_info[RTAX_GATEWAY];
+	if (gw_sa->sa_len > sizeof(struct sockaddr_in6)) {
+		DPRINTF("nhop SA size too big: AF %d len %u",
+			gw_sa->sa_family, gw_sa->sa_len);
+		return (ENOMEM);
+	}
+	memcpy(&nh->gw_sa, gw_sa, gw_sa->sa_len);
+
+	return (0);
+}
+
+/*
+ * Fills the nexthop behind @nh_priv with data from the request @info:
+ * route flags, address family, datapath flags, MTU, ifp/ifa and gateway.
+ *
+ * Returns 0 on success or the error from set_nhop_gw_from_info().
+ */
+static int
+fill_nhop_from_info(struct nhop_priv *nh_priv, struct rt_addrinfo *info)
+{
+ int error, rt_flags;
+ struct nhop_object *nh;
+
+ nh = nh_priv->nh;
+
+ /* Keep only the flags allowed by NHOP_RT_FLAG_MASK */
+ rt_flags = info->rti_flags & NHOP_RT_FLAG_MASK;
+
+ nh->nh_priv->rt_flags = rt_flags;
+ nh_priv->nh_family = info->rti_info[RTAX_DST]->sa_family;
+ nh_priv->nh_type = 0; /* hook responsibility to set nhop type */
+
+ nh->nh_flags = fib_rte_to_nh_flags(rt_flags);
+ set_nhop_mtu_from_info(nh, info);
+ nh->nh_ifp = info->rti_ifa->ifa_ifp;
+ nh->nh_ifa = info->rti_ifa;
+ /*
+ * NOTE(review): get_aifp() inspects the gateway sockaddr, which is only
+ * filled in below; presumably it is still zeroed here -- confirm that
+ * the unreferenced aifp computed at this point is the intended one.
+ */
+ nh->nh_aifp = get_aifp(nh, 0);
+
+ if ((error = set_nhop_gw_from_info(nh, info)) != 0)
+ return (error);
+
+ /*
+ * Note some of the remaining data is set by the
+ * per-address-family pre-add hook.
+ */
+
+ return (0);
+}
+
+/*
+ * Creates a new nexthop based on the information in @info.
+ *
+ * Must be called within the network epoch.
+ *
+ * Returns:
+ * 0 on success, filling @nh_ret with the desired nexthop object ptr
+ * ENOMEM if the nexthop structure cannot be allocated
+ * errno from filling or linking the nexthop otherwise
+ */
+int
+nhop_create_from_info(struct rib_head *rnh, struct rt_addrinfo *info,
+    struct nhop_object **nh_ret)
+{
+	struct nhop_priv *nh_priv;
+	int error;
+
+	NET_EPOCH_ASSERT();
+
+	/* The allocation is M_NOWAIT and may fail: do not dereference NULL */
+	nh_priv = alloc_nhop_structure();
+	if (nh_priv == NULL)
+		return (ENOMEM);
+
+	error = fill_nhop_from_info(nh_priv, info);
+	if (error != 0) {
+		uma_zfree(nhops_zone, nh_priv->nh);
+		return (error);
+	}
+
+	/* get_nhop() frees the temporary nexthop on failure */
+	error = get_nhop(rnh, info, &nh_priv);
+	if (error == 0)
+		*nh_ret = nh_priv->nh;
+
+	return (error);
+}
+
+/*
+ * Gets linked nhop using the provided @pnh_priv nexthop data.
+ * If linked nhop is found, returns it, freeing the provided one.
+ * If there is no such nexthop, attaches the remaining data to the
+ * provided nexthop and links it.
+ *
+ * Returns 0 on success, storing referenced nexthop in @pnh_priv.
+ * Otherwise, errno is returned and the provided nexthop is freed.
+ */
+static int
+get_nhop(struct rib_head *rnh, struct rt_addrinfo *info,
+    struct nhop_priv **pnh_priv)
+{
+	const struct sockaddr *dst, *netmask;
+	struct nhop_priv *nh_priv, *tmp_priv;
+	int error;
+
+	nh_priv = *pnh_priv;
+
+	/* Give the protocols chance to augment the request data */
+	dst = info->rti_info[RTAX_DST];
+	netmask = info->rti_info[RTAX_NETMASK];
+
+	error = rnh->rnh_preadd(rnh->rib_fibnum, dst, netmask, nh_priv->nh);
+	if (error != 0) {
+		uma_zfree(nhops_zone, nh_priv->nh);
+		return (error);
+	}
+
+	/* Prefer an already linked equal nexthop over the provided one */
+	tmp_priv = find_nhop(rnh->nh_control, nh_priv);
+	if (tmp_priv != NULL) {
+		uma_zfree(nhops_zone, nh_priv->nh);
+		*pnh_priv = tmp_priv;
+		return (0);
+	}
+
+	/*
+	 * Existing nexthop not found, need to create new one.
+	 * Note: multiple simultaneous get_nhop() requests
+	 * can result in multiple equal nexthops existing in the
+	 * nexthop table. This is not a problem until the
+	 * relative number of such nexthops is significant, which
+	 * is extremely unlikely.
+	 *
+	 * finalize_nhop() frees the nexthop itself on failure.
+	 */
+	return (finalize_nhop(rnh->nh_control, info, nh_priv));
+}
+
+/*
+ * Update @nh with data supplied in @info.
+ * This is a helper function to support route changes.
+ *
+ * It limits the changes that can be done to the route to the following:
+ * 1) all combination of gateway changes (gw, interface, blackhole/reject)
+ * 2) route flags (FLAG[123],STATIC,BLACKHOLE,REJECT)
+ * 3) route MTU
+ *
+ * Returns:
+ * 0 on success
+ * errno from set_nhop_gw_from_info() if the new gateway cannot be stored
+ */
+static int
+alter_nhop_from_info(struct nhop_object *nh, struct rt_addrinfo *info)
+{
+ struct sockaddr *info_gw;
+ int error;
+
+ /* Update MTU if set in the request */
+ set_nhop_mtu_from_info(nh, info);
+
+ /* XXX: allow only one of BLACKHOLE,REJECT,GATEWAY */
+
+ /* Allow some flags (FLAG1,STATIC,BLACKHOLE,REJECT) to be toggled on change. */
+ nh->nh_priv->rt_flags &= ~RTF_FMASK;
+ nh->nh_priv->rt_flags |= info->rti_flags & RTF_FMASK;
+
+ /* Consider gateway change: only when the request carries a gateway */
+ info_gw = info->rti_info[RTAX_GATEWAY];
+ if (info_gw != NULL) {
+ error = set_nhop_gw_from_info(nh, info);
+ if (error != 0)
+ return (error);
+ /* Update RTF_GATEWAY flag status */
+ nh->nh_priv->rt_flags &= ~RTF_GATEWAY;
+ nh->nh_priv->rt_flags |= (RTF_GATEWAY & info->rti_flags);
+ }
+ /* Update datapath flags */
+ nh->nh_flags = fib_rte_to_nh_flags(nh->nh_priv->rt_flags);
+
+ /* ifa/ifp are replaced only if provided in the request */
+ if (info->rti_ifa != NULL)
+ nh->nh_ifa = info->rti_ifa;
+ if (info->rti_ifp != NULL)
+ nh->nh_ifp = info->rti_ifp;
+ /* Recalculate (unreferenced) aifp using the updated ifp and gateway */
+ nh->nh_aifp = get_aifp(nh, 0);
+
+ return (0);
+}
+
+/*
+ * Creates new nexthop based on @nh_orig and augmentation data from @info.
+ * Helper function used in the route changes, please see
+ * alter_nhop_from_info() comments for more details.
+ *
+ * Must be called within the network epoch.
+ *
+ * Returns:
+ * 0 on success, filling @pnh with the desired nexthop object
+ * ENOMEM if the nexthop structure cannot be allocated
+ * errno from altering or linking the nexthop otherwise
+ */
+int
+nhop_create_from_nhop(struct rib_head *rnh, const struct nhop_object *nh_orig,
+    struct rt_addrinfo *info, struct nhop_object **pnh)
+{
+	struct nhop_priv *nh_priv;
+	struct nhop_object *nh;
+	int error;
+
+	NET_EPOCH_ASSERT();
+
+	/* The allocation is M_NOWAIT and may fail: do not dereference NULL */
+	nh_priv = alloc_nhop_structure();
+	if (nh_priv == NULL)
+		return (ENOMEM);
+	nh = nh_priv->nh;
+
+	/* Start with copying data from original nexthop */
+	nh_priv->nh_family = nh_orig->nh_priv->nh_family;
+	nh_priv->rt_flags = nh_orig->nh_priv->rt_flags;
+	nh_priv->nh_type = nh_orig->nh_priv->nh_type;
+
+	nh->nh_ifp = nh_orig->nh_ifp;
+	nh->nh_ifa = nh_orig->nh_ifa;
+	nh->nh_aifp = nh_orig->nh_aifp;
+	nh->nh_mtu = nh_orig->nh_mtu;
+	nh->nh_flags = nh_orig->nh_flags;
+	memcpy(&nh->gw_sa, &nh_orig->gw_sa, nh_orig->gw_sa.sa_len);
+
+	/* Apply the requested changes on top of the copy */
+	error = alter_nhop_from_info(nh, info);
+	if (error != 0) {
+		uma_zfree(nhops_zone, nh_priv->nh);
+		return (error);
+	}
+
+	/* get_nhop() frees the temporary nexthop on failure */
+	error = get_nhop(rnh, info, &nh_priv);
+	if (error == 0)
+		*pnh = nh_priv->nh;
+
+	return (error);
+}
+
+/*
+ * Allocates zeroed memory for public/private nexthop structures as a
+ * single zone item: the private part is placed NHOP_OBJECT_ALIGNED_SIZE
+ * bytes after the start of the public one, and both are cross-linked.
+ *
+ * The allocation does not sleep (M_NOWAIT).
+ *
+ * Returns pointer to nhop_priv or NULL if the allocation failed.
+ */
+static struct nhop_priv *
+alloc_nhop_structure(void)
+{
+	struct nhop_object *nh;
+	struct nhop_priv *nh_priv;
+
+	nh = (struct nhop_object *)uma_zalloc(nhops_zone, M_NOWAIT | M_ZERO);
+	if (nh == NULL)
+		return (NULL);
+	nh_priv = (struct nhop_priv *)((char *)nh + NHOP_OBJECT_ALIGNED_SIZE);
+
+	nh->nh_priv = nh_priv;
+	nh_priv->nh = nh;
+
+	return (nh_priv);
+}
+
+/*
+ * Allocates/references the remaining bits of nexthop data and links
+ * it to the hash table.
+ * Returns 0 if successful,
+ * errno otherwise. @nh_priv is freed in case of error.
+ */
+static int
+finalize_nhop(struct nh_control *ctl, struct rt_addrinfo *info,
+ struct nhop_priv *nh_priv)
+{
+ struct nhop_object *nh;
+
+ nh = nh_priv->nh;
+
+ /* Allocate per-cpu packet counter */
+ nh->nh_pksent = counter_u64_alloc(M_NOWAIT);
+ if (nh->nh_pksent == NULL) {
+ /* Nothing has been referenced yet, so only the memory is freed */
+ uma_zfree(nhops_zone, nh);
+ RTSTAT_INC(rts_nh_alloc_failure);
+ DPRINTF("nh_alloc_finalize failed");
+ return (ENOMEM);
+ }
+
+ /* Reference external objects and calculate (referenced) ifa */
+ if_ref(nh->nh_ifp);
+ ifa_ref(nh->nh_ifa);
+ nh->nh_aifp = get_aifp(nh, 1);
+ DPRINTF("AIFP: %p nh_ifp %p", nh->nh_aifp, nh->nh_ifp);
+
+ /* The initial reference is handed to the caller */
+ refcount_init(&nh_priv->nh_refcnt, 1);
+
+ /* Please see nhop_free() comments on the initial value */
+ refcount_init(&nh_priv->nh_linked, 2);
+
+ print_nhop("FINALIZE", nh);
+
+ /* link_nhop() returning 0 is treated as failure */
+ if (link_nhop(ctl, nh_priv) == 0) {
+
+ /*
+ * Adding nexthop to the datastructures
+ * failed. Call destructor w/o waiting for
+ * the epoch end, as nexthop is not used
+ * and return.
+ */
+ DPRINTF("link_nhop failed!");
+ destroy_nhop(nh_priv);
+
+ return (ENOBUFS);
+ }
+
+ return (0);
+}
+
+/*
+ * Formats @sa into @buf (at most @buflen bytes) for debugging output:
+ * a textual address for AF_INET/AF_INET6, "if#<index>" for AF_LINK and
+ * "af:<family>" for anything else.
+ */
+static void
+print_nhop_sa(char *buf, size_t buflen, const struct sockaddr *sa)
+{
+
+	switch (sa->sa_family) {
+	case AF_INET:
+		inet_ntop(AF_INET,
+		    &((const struct sockaddr_in *)sa)->sin_addr, buf, buflen);
+		break;
+	case AF_INET6:
+		inet_ntop(AF_INET6,
+		    &((const struct sockaddr_in6 *)sa)->sin6_addr, buf, buflen);
+		break;
+	case AF_LINK:
+		snprintf(buf, buflen, "if#%d",
+		    ((const struct sockaddr_dl *)sa)->sdl_index);
+		break;
+	default:
+		snprintf(buf, buflen, "af:%d", sa->sa_family);
+		break;
+	}
+}
+
+/*
+ * Emits a single debug line (via DPRINTF) describing @nh, tagged with @prefix.
+ * Both the source (ifa) address and the gateway are rendered as text.
+ */
+static void
+print_nhop(const char *prefix, const struct nhop_object *nh)
+{
+ char src_buf[INET6_ADDRSTRLEN], addr_buf[INET6_ADDRSTRLEN];
+
+ print_nhop_sa(src_buf, sizeof(src_buf), nh->nh_ifa->ifa_addr);
+ print_nhop_sa(addr_buf, sizeof(addr_buf), &nh->gw_sa);
+
+ DPRINTF("%s nhop priv %p: AF %d ifp %p %s addr %s src %p %s aifp %p %s mtu %d nh_flags %X",
+ prefix, nh->nh_priv, nh->nh_priv->nh_family, nh->nh_ifp,
+ if_name(nh->nh_ifp), addr_buf, nh->nh_ifa, src_buf, nh->nh_aifp,
+ if_name(nh->nh_aifp), nh->nh_mtu, nh->nh_flags);
+}
+
+/*
+ * Releases everything held by the nexthop: the ifp, aifp and ifa
+ * references and the packet counter, then returns the memory to the zone.
+ * Called directly when linking fails and from the epoch callback otherwise.
+ */
+static void
+destroy_nhop(struct nhop_priv *nh_priv)
+{
+ struct nhop_object *nh;
+
+ nh = nh_priv->nh;
+
+ print_nhop("DEL", nh);
+
+ if_rele(nh->nh_ifp);
+ if_rele(nh->nh_aifp);
+ ifa_free(nh->nh_ifa);
+ counter_u64_free(nh->nh_pksent);
+
+ /* The zone item starts at the public object, which also holds nh_priv */
+ uma_zfree(nhops_zone, nh);
+}
+
+/*
+ * Epoch callback: recovers the nexthop from its embedded epoch context
+ * and destroys it.
+ */
+static void
+destroy_nhop_epoch(epoch_context_t ctx)
+{
+
+	destroy_nhop(__containerof(ctx, struct nhop_priv, nh_epoch_ctx));
+}
+
+/*
+ * Tries to acquire a reference on @nh.
+ * Returns the result of refcount_acquire_if_not_zero() on its refcount.
+ */
+int
+nhop_ref_object(struct nhop_object *nh)
+{
+
+ return (refcount_acquire_if_not_zero(&nh->nh_priv->nh_refcnt));
+}
+
+/*
+ * Drops a reference on @nh. When the last reference goes away, the nexthop
+ * is unlinked from the hash (if still linked) and its destruction is
+ * scheduled via epoch_call().
+ */
+void
+nhop_free(struct nhop_object *nh)
+{
+ struct nh_control *ctl;
+ struct nhop_priv *nh_priv = nh->nh_priv;
+ struct epoch_tracker et;
+
+ /* Not the last reference: nothing else to do */
+ if (!refcount_release(&nh_priv->nh_refcnt))
+ return;
+
+ /*
+ * There are only 2 places, where nh_linked can be decreased:
+ * rib destroy (nhops_destroy_rib) and this function.
+ * nh_linked can never be increased.
+ *
+ * Hence, use initial value of 2 to make use of
+ * refcount_release_if_not_last().
+ *
+ * There can be two scenarios when calling this function:
+ *
+ * 1) nh_linked value is 2. This means that either
+ * nhops_destroy_rib() has not been called OR it is running,
+ * but we are guaranteed that nh_control won't be freed in
+ * this epoch. Hence, nexthop can be safely unlinked.
+ *
+ * 2) nh_linked value is 1. In that case, nhops_destroy_rib()
+ * has been called and nhop unlink can be skipped.
+ */
+
+ NET_EPOCH_ENTER(et);
+ if (refcount_release_if_not_last(&nh_priv->nh_linked)) {
+ ctl = nh_priv->nh_control;
+ if (unlink_nhop(ctl, nh_priv) == NULL) {
+ /* Do not try to reclaim */
+ DPRINTF("Failed to unlink nexhop %p", nh_priv);
+ NET_EPOCH_EXIT(et);
+ return;
+ }
+ }
+ NET_EPOCH_EXIT(et);
+
+ /* Defer the actual destruction to the epoch callback */
+ epoch_call(net_epoch_preempt, destroy_nhop_epoch,
+ &nh_priv->nh_epoch_ctx);
+}
+
+/*
+ * Currently a plain wrapper around nhop_ref_object().
+ */
+int
+nhop_ref_any(struct nhop_object *nh)
+{
+
+ return (nhop_ref_object(nh));
+}
+
+/*
+ * Currently a plain wrapper around nhop_free().
+ */
+void
+nhop_free_any(struct nhop_object *nh)
+{
+
+ nhop_free(nh);
+}
+
+
+/* Helper functions */
+
+/* Returns the nexthop index stored in the private part of @nh. */
+uint32_t
+nhop_get_idx(const struct nhop_object *nh)
+{
+
+ return (nh->nh_priv->nh_idx);
+}
+
+/* Returns the nexthop type stored in the private part of @nh. */
+enum nhop_type
+nhop_get_type(const struct nhop_object *nh)
+{
+
+ return (nh->nh_priv->nh_type);
+}
+
+/* Sets the nexthop type in the private part of @nh to @nh_type. */
+void
+nhop_set_type(struct nhop_object *nh, enum nhop_type nh_type)
+{
+
+ nh->nh_priv->nh_type = nh_type;
+}
+
+/* Returns the route flags (rt_flags) stored in the private part of @nh. */
+int
+nhop_get_rtflags(const struct nhop_object *nh)
+{
+
+ return (nh->nh_priv->rt_flags);
+}
+
+/*
+ * Overwrites the route flags in the private part of @nh with @rt_flags.
+ * The datapath flags (nh_flags) are not recalculated here.
+ */
+void
+nhop_set_rtflags(struct nhop_object *nh, int rt_flags)
+{
+
+ nh->nh_priv->rt_flags = rt_flags;
+}
+
+/*
+ * Walks all nexthops of @rh under the write lock and applies the new
+ * interface MTU @mtu to those transmitting via @ifp.
+ * A nexthop is updated if its MTU is not fixed (RTF_FIXEDMTU unset) or
+ * if its current MTU is larger than @mtu.
+ */
+void
+nhops_update_ifmtu(struct rib_head *rh, struct ifnet *ifp, uint32_t mtu)
+{
+ struct nh_control *ctl;
+ struct nhop_priv *nh_priv;
+ struct nhop_object *nh;
+
+ ctl = rh->nh_control;
+
+ NHOPS_WLOCK(ctl);
+ CHT_SLIST_FOREACH(&ctl->nh_head, nhops, nh_priv) {
+ nh = nh_priv->nh;
+ if (nh->nh_ifp == ifp) {
+ if ((nh_priv->rt_flags & RTF_FIXEDMTU) == 0 ||
+ nh->nh_mtu > mtu) {
+ /* Update MTU directly */
+ nh->nh_mtu = mtu;
+ }
+ }
+ } CHT_SLIST_FOREACH_END;
+ NHOPS_WUNLOCK(ctl);
+
+}
+
+/*
+ * Dumps a single entry to sysctl buffer.
+ *
+ * Layout:
+ * rt_msghdr - generic RTM header to allow users to skip non-understood messages
+ * nhop_external - nexthop description structure (with length)
+ * nhop_addrs - structure encapsulating GW/SRC sockaddrs
+ *
+ * The fixed-size part above is followed by the gateway sockaddr and then
+ * the source sockaddr; their offsets are recorded in nhop_addrs.
+ *
+ * Returns 0 on success or the first error returned by SYSCTL_OUT().
+ */
+static int
+dump_nhop_entry(struct rib_head *rh, struct nhop_object *nh, struct sysctl_req *w)
+{
+ struct {
+ struct rt_msghdr rtm;
+ struct nhop_external nhe;
+ struct nhop_addrs na;
+ } arpc;
+ struct nhop_external *pnhe;
+ struct sockaddr *gw_sa, *src_sa;
+ struct sockaddr_storage ss;
+ size_t addrs_len;
+ int error;
+
+ //DPRINTF("Dumping: head %p nh %p flags %X req %p\n", rh, nh, nh->nh_flags, w);
+
+ /* Zero the whole record first, including any padding */
+ memset(&arpc, 0, sizeof(arpc));
+
+ /* rtm_msglen is extended by the sockaddr lengths further below */
+ arpc.rtm.rtm_msglen = sizeof(arpc);
+ arpc.rtm.rtm_version = RTM_VERSION;
+ arpc.rtm.rtm_type = RTM_GET;
+ //arpc.rtm.rtm_flags = RTF_UP;
+ arpc.rtm.rtm_flags = nh->nh_priv->rt_flags;
+
+ /* nhop_external */
+ pnhe = &arpc.nhe;
+ pnhe->nh_len = sizeof(struct nhop_external);
+ pnhe->nh_idx = nh->nh_priv->nh_idx;
+ pnhe->nh_fib = rh->rib_fibnum;
+ pnhe->ifindex = nh->nh_ifp->if_index;
+ pnhe->aifindex = nh->nh_aifp->if_index;
+ pnhe->nh_family = nh->nh_priv->nh_family;
+ pnhe->nh_type = nh->nh_priv->nh_type;
+ pnhe->nh_mtu = nh->nh_mtu;
+ pnhe->nh_flags = nh->nh_flags;
+
+ memcpy(pnhe->nh_prepend, nh->nh_prepend, sizeof(nh->nh_prepend));
+ pnhe->prepend_len = nh->nh_prepend_len;
+ pnhe->nh_refcount = nh->nh_priv->nh_refcnt;
+ pnhe->nh_pksent = counter_u64_fetch(nh->nh_pksent);
+
+ /* sockaddr container: gateway goes right after the nhop_addrs header */
+ addrs_len = sizeof(struct nhop_addrs);
+ arpc.na.gw_sa_off = addrs_len;
+ gw_sa = (struct sockaddr *)&nh->gw4_sa;
+ addrs_len += gw_sa->sa_len;
+
+ src_sa = nh->nh_ifa->ifa_addr;
+ if (src_sa->sa_family == AF_LINK) {
+ /* Shorten structure */
+ memset(&ss, 0, sizeof(struct sockaddr_storage));
+ fill_sdl_from_ifp((struct sockaddr_dl_short *)&ss,
+ nh->nh_ifa->ifa_ifp);
+ src_sa = (struct sockaddr *)&ss;
+ }
+ /* Source sockaddr follows the gateway one */
+ arpc.na.src_sa_off = addrs_len;
+ addrs_len += src_sa->sa_len;
+
+ /* Write total container length */
+ arpc.na.na_len = addrs_len;
+
+ /* sizeof(arpc) already covers the nhop_addrs header itself */
+ arpc.rtm.rtm_msglen += arpc.na.na_len - sizeof(struct nhop_addrs);
+
+ /* Emit header, then gateway, then source; stop at the first error */
+ error = SYSCTL_OUT(w, &arpc, sizeof(arpc));
+ if (error == 0)
+ error = SYSCTL_OUT(w, gw_sa, gw_sa->sa_len);
+ if (error == 0)
+ error = SYSCTL_OUT(w, src_sa, src_sa->sa_len);
+
+ return (error);
+}
+
+/*
+ * Dumps all nexthops of @rh into the sysctl request @w, one
+ * dump_nhop_entry() record per nexthop, holding the read lock throughout.
+ *
+ * Returns 0 on success or the first error from dump_nhop_entry().
+ */
+int
+nhops_dump_sysctl(struct rib_head *rh, struct sysctl_req *w)
+{
+ struct nh_control *ctl;
+ struct nhop_priv *nh_priv;
+ int error;
+
+ ctl = rh->nh_control;
+
+ NHOPS_RLOCK(ctl);
+ DPRINTF("NHDUMP: count=%u", ctl->nh_head.items_count);
+ CHT_SLIST_FOREACH(&ctl->nh_head, nhops, nh_priv) {
+ error = dump_nhop_entry(rh, nh_priv->nh, w);
+ if (error != 0) {
+ /* Drop the lock before bailing out of the iteration */
+ NHOPS_RUNLOCK(ctl);
+ return (error);
+ }
+ } CHT_SLIST_FOREACH_END;
+ NHOPS_RUNLOCK(ctl);
+
+ return (0);
+}
+
Index: sys/net/route/nhop_utils.h
===================================================================
--- /dev/null
+++ sys/net/route/nhop_utils.h
@@ -0,0 +1,200 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NET_ROUTE_NHOP_UTILS_H_
+#define _NET_ROUTE_NHOP_UTILS_H_
+
+/*
+ * Chained hash table.
+ * Generic head layout; heads declared via CHT_SLIST_DEFINE() have the same
+ * fields in the same order and are cast to this type by the resize helpers.
+ */
+struct _cht_head {
+ uint32_t hash_size; /* number of buckets */
+ uint32_t items_count; /* number of items currently linked */
+ void **ptr; /* array of bucket heads */
+};
+
+/*
+ * Calculates the desired number of buckets for the table.
+ *
+ * Returns twice the current size if there are more than half as many
+ * items as buckets (and the table has fewer than 65536 buckets), half the
+ * current size if there are fewer than a quarter as many items as buckets
+ * (and the table has more than 16 buckets), or 0 if no resize is needed.
+ */
+static inline uint32_t
+_cht_get_resize_size(const struct _cht_head *head)
+{
+	const uint32_t buckets = head->hash_size;
+	const uint32_t items = head->items_count;
+
+	if (items * 2 > buckets && buckets < 65536)
+		return (buckets * 2);
+	if (items * 4 < buckets && buckets > 16)
+		return (buckets / 2);
+
+	return (0);
+}
+
+/*
+ * Returns non-zero if _cht_get_resize_size() suggests a new table size.
+ */
+static inline int
+_cht_need_resize(const struct _cht_head *head)
+{
+
+ return (_cht_get_resize_size(head) > 0);
+}
+
+
+#ifndef typeof
+#define typeof __typeof
+#endif
+
+/*
+ * Resize helpers for typed heads: the head is cast to the generic
+ * struct _cht_head. CHT_SLIST_GET_RESIZE_SIZE() converts a bucket count
+ * into the size in bytes of the bucket pointer array.
+ */
+#define CHT_SLIST_NEED_RESIZE(_head) \
+ _cht_need_resize((const struct _cht_head *)(_head))
+#define CHT_SLIST_GET_RESIZE_BUCKETS(_head) \
+ _cht_get_resize_size((const struct _cht_head *)(_head))
+#define CHT_SLIST_GET_RESIZE_SIZE(_buckets) ((_buckets) * sizeof(void *))
+
+/*
+ * Declares struct <_HNAME>_head: a hash table head whose buckets point to
+ * items of type _ITEM_TYPE. Field order matches struct _cht_head.
+ */
+#define CHT_SLIST_DEFINE(_HNAME, _ITEM_TYPE) \
+struct _HNAME##_head { \
+ uint32_t hash_size; \
+ uint32_t items_count; \
+ _ITEM_TYPE **ptr; \
+}
+
+#define CHT_SLIST_INIT(_head, _ptr, _num_buckets) \
+ (_head)->hash_size = _num_buckets; \
+ (_head)->items_count = 0; \
+ (_head)->ptr = _ptr;
+
+/*
+ * Default hash method for constant-size keys.
+ * The bucket index is the hash masked by (hash_size - 1). The whole
+ * expansion is parenthesized so the macros can be used inside larger
+ * expressions without operator precedence surprises.
+ */
+
+#define CHT_GET_BUCK(_head, _PX, _key) (_PX##_hash_key(_key) & ((_head)->hash_size - 1))
+#define CHT_GET_BUCK_OBJ(_head, _PX, _obj) (_PX##_hash_obj(_obj) & ((_head)->hash_size - 1))
+
+/* First item of bucket @idx; usable as an lvalue */
+#define CHT_FIRST(_head, idx) _CHT_FIRST((_head)->ptr, idx)
+#define _CHT_FIRST(_ptr, idx) (_ptr)[idx]
+
+/*
+ * Looks up an item by @_key: walks the bucket selected by <_PX>_hash_key()
+ * and stops at the first item for which <_PX>_cmp(_key, item) is true.
+ * @_ret is set to that item or NULL.
+ */
+#define CHT_SLIST_FIND(_head, _PX, _key, _ret) do { \
+ uint32_t _buck = CHT_GET_BUCK(_head, _PX, _key); \
+ _ret = CHT_FIRST(_head, _buck); \
+ for ( ; _ret != NULL; _ret = _PX##_next(_ret)) { \
+ if (_PX##_cmp(_key, (_ret))) \
+ break; \
+ } \
+} while(0)
+
+/*
+ * Same as CHT_SLIST_FIND(), but the lookup is driven by an object:
+ * the bucket is selected by <_PX>_hash_obj() and items are matched
+ * with <_PX>_cmp(_obj, item). @_ret is set to the match or NULL.
+ */
+#define CHT_SLIST_FIND_BYOBJ(_head, _PX, _obj, _ret) do { \
+ uint32_t _buck = CHT_GET_BUCK_OBJ(_head, _PX, _obj); \
+ _ret = CHT_FIRST(_head, _buck); \
+ for ( ; _ret != NULL; _ret = _PX##_next(_ret)) { \
+ if (_PX##_cmp(_obj, _ret)) \
+ break; \
+ } \
+} while(0)
+
+/*
+ * Inserts @_obj at the head of its bucket and bumps items_count.
+ * No check for an already present equal item is made.
+ */
+#define CHT_SLIST_INSERT_HEAD(_head, _PX, _obj) do { \
+ uint32_t _buck = CHT_GET_BUCK_OBJ(_head, _PX, _obj); \
+ _PX##_next(_obj) = CHT_FIRST(_head, _buck); \
+ CHT_FIRST(_head, _buck) = _obj; \
+ (_head)->items_count++; \
+} while(0)
+
+/*
+ * Removes the first item matching @_key (per <_PX>_cmp) from its bucket.
+ * @_ret is set to the removed item, or NULL if nothing matched;
+ * items_count is decremented only when an item was removed.
+ * _tmp tracks the predecessor of @_ret within the bucket chain.
+ */
+#define CHT_SLIST_REMOVE(_head, _PX, _key, _ret) do { \
+ typeof(*(_head)->ptr) _tmp; \
+ uint32_t _buck = CHT_GET_BUCK(_head, _PX, _key); \
+ _ret = CHT_FIRST(_head, _buck); \
+ _tmp = NULL; \
+ for ( ; _ret != NULL; _tmp = _ret, _ret = _PX##_next(_ret)) { \
+ if (_PX##_cmp(_key, _ret)) \
+ break; \
+ } \
+ if (_ret != NULL) { \
+ if (_tmp == NULL) \
+ CHT_FIRST(_head, _buck) = _PX##_next(_ret); \
+ else \
+ _PX##_next(_tmp) = _PX##_next(_ret); \
+ (_head)->items_count--; \
+ } \
+} while(0)
+
+/*
+ * Object-driven variant of CHT_SLIST_REMOVE(): the bucket is selected by
+ * <_PX>_hash_obj() and the first item for which <_PX>_cmp(_obj, item) is
+ * true gets removed. Note that the match is decided by the compare hook,
+ * so @_ret is whatever item that hook considers equal to @_obj.
+ * @_ret is set to the removed item or NULL.
+ */
+#define CHT_SLIST_REMOVE_BYOBJ(_head, _PX, _obj, _ret) do { \
+ typeof(*(_head)->ptr) _tmp; \
+ uint32_t _buck = CHT_GET_BUCK_OBJ(_head, _PX, _obj); \
+ _ret = CHT_FIRST(_head, _buck); \
+ _tmp = NULL; \
+ for ( ; _ret != NULL; _tmp = _ret, _ret = _PX##_next(_ret)) { \
+ if (_PX##_cmp(_obj, _ret)) \
+ break; \
+ } \
+ if (_ret != NULL) { \
+ if (_tmp == NULL) \
+ CHT_FIRST(_head, _buck) = _PX##_next(_ret); \
+ else \
+ _PX##_next(_tmp) = _PX##_next(_ret); \
+ (_head)->items_count--; \
+ } \
+} while(0)
+
+
+/*
+ * Iterates over every item in every bucket, assigning each to @_x.
+ * The macro opens a brace for the outer (per-bucket) loop, so the loop body
+ * must be followed by CHT_SLIST_FOREACH_END to close it.
+ */
+#define CHT_SLIST_FOREACH(_head, _PX, _x) \
+ for (uint32_t _i = 0; _i < (_head)->hash_size; _i++) { \
+ for (_x = CHT_FIRST(_head, _i); _x; _x = _PX##_next(_x))
+
+#define CHT_SLIST_FOREACH_END }
+
+/*
+ * Rehashes all items into the new bucket array @_new_void_ptr holding
+ * @_new_hsize buckets and installs it into @_head.
+ * On completion @_new_void_ptr is overwritten with the OLD bucket array;
+ * it must therefore be a modifiable lvalue.
+ * The macro declares locals in the enclosing scope, so it can be used only
+ * once per scope.
+ * Macro arguments are parenthesized where they are used in expressions.
+ */
+#define CHT_SLIST_RESIZE(_head, _PX, _new_void_ptr, _new_hsize) \
+ uint32_t _new_idx; \
+ typeof((_head)->ptr) _new_ptr = (void *)(_new_void_ptr); \
+ typeof(*(_head)->ptr) _x, _y; \
+ for (uint32_t _old_idx = 0; _old_idx < (_head)->hash_size; _old_idx++) {\
+ _x = CHT_FIRST(_head, _old_idx); \
+ _y = _x; \
+ while (_y != NULL) { \
+ _y = _PX##_next(_x); \
+ _new_idx = _PX##_hash_obj(_x) & ((_new_hsize) - 1);\
+ _PX##_next(_x) = _CHT_FIRST(_new_ptr, _new_idx);\
+ _CHT_FIRST(_new_ptr, _new_idx) = _x; \
+ _x = _y; \
+ } \
+ } \
+ (_head)->hash_size = (_new_hsize); \
+ _new_void_ptr = (void *)(_head)->ptr; \
+ (_head)->ptr = _new_ptr;
+
+/* bitmasks */
+
+/* Head of an index bitmask stored as an array of u_long blocks. */
+struct bitmask_head {
+ uint16_t free_off; /* index of the first potentially free block */
+ uint16_t blocks; /* number of 4/8-byte blocks in the index */
+ uint32_t items_count; /* total number of items */
+ u_long *idx; /* the bitmask blocks themselves */
+};
+
+size_t bitmask_get_size(uint32_t items);
+uint32_t bitmask_get_resize_items(const struct bitmask_head *nh);
+int bitmask_should_resize(const struct bitmask_head *bh);
+void bitmask_swap(struct bitmask_head *bh, void *new_idx, uint32_t new_items, void **pidx);
+void bitmask_init(struct bitmask_head *bh, void *idx, uint32_t num_items);
+int bitmask_copy(const struct bitmask_head *bi, void *new_idx, uint32_t new_items);
+int bitmask_alloc_idx(struct bitmask_head *bi, uint16_t *pidx);
+int bitmask_free_idx(struct bitmask_head *bi, uint16_t idx);
+
+#endif
+
Index: sys/net/route/nhop_utils.c
===================================================================
--- /dev/null
+++ sys/net/route/nhop_utils.c
@@ -0,0 +1,219 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_route.h"
+#include "opt_mpath.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+
+#include <net/route/nhop_utils.h>
+
+#define BLOCK_ITEMS (8 * sizeof(u_long)) /* Number of items for ffsl() */
+
+#define _BLOCKS_TO_SZ(_blocks) ((size_t)(_blocks) * sizeof(u_long))
+#define _BLOCKS_TO_ITEMS(_blocks) ((uint32_t)(_blocks) * BLOCK_ITEMS)
+#define _ITEMS_TO_BLOCKS(_items) ((_items) / BLOCK_ITEMS)
+
+
+static void _bitmask_init_idx(void *index, uint32_t items);
+
+/*
+ * Prepares @bh to track @num_items indexes backed by the storage at @idx.
+ * A non-NULL @idx is first reset to the "everything free" state.
+ */
+void
+bitmask_init(struct bitmask_head *bh, void *idx, uint32_t num_items)
+{
+	u_long *storage = idx;
+
+	if (storage != NULL)
+		_bitmask_init_idx(storage, num_items);
+
+	/* Start from a clean head: counters and free offset are zeroed. */
+	memset(bh, 0, sizeof(*bh));
+	bh->idx = storage;
+	bh->blocks = _ITEMS_TO_BLOCKS(num_items);
+}
+
+/*
+ * Returns the item capacity the bitmask should grow to, or 0 if no resize
+ * is needed.  Doubling is requested once more than half of the current
+ * capacity is in use, as long as fewer than 65536 items are allocated.
+ */
+uint32_t
+bitmask_get_resize_items(const struct bitmask_head *bh)
+{
+	size_t capacity = _BLOCKS_TO_ITEMS(bh->blocks);
+
+	if (bh->items_count >= 65536)
+		return (0);
+	if (bh->items_count * 2 <= capacity)
+		return (0);
+
+	return (capacity * 2);
+}
+
+/*
+ * Returns non-zero if bitmask_get_resize_items() reports a new size for @bh,
+ * zero otherwise.
+ */
+int
+bitmask_should_resize(const struct bitmask_head *bh)
+{
+
+ return (bitmask_get_resize_items(bh) != 0);
+}
+
+#if 0
+uint32_t
+_bitmask_get_blocks(uint32_t items)
+{
+
+ return (items / BLOCK_ITEMS);
+}
+#endif
+
+/*
+ * Returns the number of bytes needed to store a bitmask of @items bits.
+ * @items has to be a multiple of BLOCK_ITEMS, so that the index consists
+ * of whole u_long blocks; this is asserted.
+ */
+size_t
+bitmask_get_size(uint32_t items)
+{
+	/*
+	 * Test with #ifdef: _KERNEL may be defined without a value, in which
+	 * case "#if _KERNEL" does not preprocess.
+	 */
+#ifdef _KERNEL
+	KASSERT((items % BLOCK_ITEMS) == 0,
+	    ("bitmask size needs to be a multiple of %zu", BLOCK_ITEMS));
+#else
+	assert((items % BLOCK_ITEMS) == 0);
+#endif
+
+	return (items / 8);
+}
+
+/*
+ * Resets raw index storage covering @items bits: every bit is set (free)
+ * except bit 0, which is cleared so that index 0 is never handed out.
+ */
+static void
+_bitmask_init_idx(void *_idx, uint32_t items)
+{
+	u_long *first_block = _idx;
+
+	/* Mark all as free */
+	memset(_idx, 0xFF, bitmask_get_size(items));
+	/* Always skip index 0 */
+	first_block[0] &= ~(u_long)1;
+}
+
+
+/*
+ * Initializes @new_idx as a bitmask for @new_items items and, if it has
+ * more blocks than @bi, copies the current allocation state into it.
+ *
+ * Returns 0 if the state was copied, 1 if @new_items does not provide more
+ * blocks than @bi currently has (nothing is copied in that case).
+ *
+ * _try_merge api to allow shrinking?
+ */
+int
+bitmask_copy(const struct bitmask_head *bi, void *new_idx, uint32_t new_items)
+{
+	/*
+	 * @new_items is an item count and bi->blocks a block count: convert
+	 * items to blocks before comparing, otherwise the copy below may
+	 * write past the end of @new_idx.
+	 */
+	uint32_t new_blocks = _ITEMS_TO_BLOCKS(new_items);
+
+	_bitmask_init_idx(new_idx, new_items);
+
+	if (bi->blocks >= new_blocks) {
+		/* XXX: ensure all other blocks are non-zero */
+		return (1);
+	}
+
+	/* extend current blocks */
+	if (bi->blocks > 0)
+		memcpy(new_idx, bi->idx, _BLOCKS_TO_SZ(bi->blocks));
+
+	return (0);
+}
+
+/*
+ * Installs @new_idx (sized for @new_items items) as the index storage of
+ * @bh.  The previous storage pointer is handed back through @pidx when
+ * @pidx is non-NULL.
+ */
+void
+bitmask_swap(struct bitmask_head *bh, void *new_idx, uint32_t new_items, void **pidx)
+{
+	void *prev_idx = bh->idx;
+
+	bh->blocks = _ITEMS_TO_BLOCKS(new_items);
+	bh->idx = new_idx;
+
+	if (pidx == NULL)
+		return;
+	*pidx = prev_idx;
+}
+
+/*
+ * Allocates a new index in the given instance and stores it in @pidx.
+ * The search starts at block @bi->free_off and takes the lowest set (free)
+ * bit of the first block that has one.
+ * Returns 0 on success, 1 if no free bit was found.
+ */
+int
+bitmask_alloc_idx(struct bitmask_head *bi, uint16_t *pidx)
+{
+ u_long *mask;
+ int i, off, v;
+
+ off = bi->free_off;
+ mask = &bi->idx[off];
+
+ for (i = off; i < bi->blocks; i++, mask++) {
+ /* ffsl() yields the 1-based position of the lowest set bit, 0 if none */
+ if ((v = ffsl(*mask)) == 0)
+ continue;
+
+ /* Mark as busy */
+ *mask &= ~ ((u_long)1 << (v - 1));
+
+ /* Blocks scanned before @i had no free bits: resume from here next time */
+ bi->free_off = i;
+
+ v = BLOCK_ITEMS * i + v - 1;
+
+ /* NOTE(review): @pidx is 16 bit wide, a larger index would be truncated */
+ *pidx = v;
+ bi->items_count++;
+ return (0);
+ }
+
+ return (1);
+}
+
+/*
+ * Returns index @idx to the given set.
+ * Returns 0 on success, 1 if @idx is index 0, lies outside of the bitmask
+ * or is already marked as free.
+ */
+int
+bitmask_free_idx(struct bitmask_head *bi, uint16_t idx)
+{
+	u_long bit;
+	int block;
+
+	if (idx == 0)
+		return (1);
+
+	block = idx / BLOCK_ITEMS;
+	if (block >= bi->blocks)
+		return (1);
+
+	bit = (u_long)1 << (idx % BLOCK_ITEMS);
+
+	/* A set bit means the index is free already */
+	if ((bi->idx[block] & bit) != 0)
+		return (1);
+
+	/* Mark as free */
+	bi->idx[block] |= bit;
+	bi->items_count--;
+
+	/* Lower the search start if the freed bit lives in an earlier block */
+	if (block < bi->free_off)
+		bi->free_off = block;
+
+	return (0);
+}
+
Index: sys/net/route/nhop_var.h
===================================================================
--- /dev/null
+++ sys/net/route/nhop_var.h
@@ -0,0 +1,96 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * This header file contains private definitions for nexthop routing.
+ *
+ * Header is not intended to be included by the code external to the
+ * routing subsystem.
+ */
+
+#ifndef _NET_ROUTE_NHOP_VAR_H_
+#define _NET_ROUTE_NHOP_VAR_H_
+
+/* define nhop hash table */
+struct nhop_priv;
+CHT_SLIST_DEFINE(nhops, struct nhop_priv);
+/* produce hash value for an object */
+#define nhops_hash_obj(_obj) hash_priv(_obj)
+/* compare two objects */
+#define nhops_cmp(_one, _two) cmp_priv(_one, _two)
+/* next object accessor */
+#define nhops_next(_obj) (_obj)->nh_next
+
+
+struct nh_control {
+ struct nhops_head nh_head; /* hash table head */
+ struct bitmask_head nh_idx_head; /* nhop index head */
+ struct rwlock ctl_lock; /* overall ctl lock */
+ struct rib_head *ctl_rh; /* pointer back to rnh */
+ struct epoch_context ctl_epoch_ctx; /* epoch ctl helper */
+};
+
+#define NHOPS_WLOCK(ctl) rw_wlock(&(ctl)->ctl_lock)
+#define NHOPS_RLOCK(ctl) rw_rlock(&(ctl)->ctl_lock)
+#define NHOPS_WUNLOCK(ctl) rw_wunlock(&(ctl)->ctl_lock)
+#define NHOPS_RUNLOCK(ctl) rw_runlock(&(ctl)->ctl_lock)
+#define NHOPS_LOCK_INIT(ctl) rw_init(&(ctl)->ctl_lock, "nhop_ctl")
+#define NHOPS_LOCK_DESTROY(ctl) rw_destroy(&(ctl)->ctl_lock)
+#define NHOPS_WLOCK_ASSERT(ctl) rw_assert(&(ctl)->ctl_lock, RA_WLOCKED)
+
+
+/* Control plane-only nhop data */
+struct nhop_object;
+struct nhop_priv {
+ uint32_t nh_idx; /* nexthop index */
+ uint8_t nh_family; /* address family of the lookup */
+ uint16_t nh_type; /* nexthop type */
+ void *cb_func; /* function handling additional rewrite caps */
+ u_int nh_refcnt; /* number of references, refcount(9) */
+ u_int nh_linked; /* refcount(9), == 2 if linked to the list */
+ int rt_flags; /* routing flags for the control plane */
+ struct nhop_object *nh; /* backreference to the dataplane nhop */
+ struct nh_control *nh_control; /* backreference to the rnh */
+ struct nhop_priv *nh_next; /* hash table membership */
+ struct epoch_context nh_epoch_ctx; /* epoch data for nhop */
+};
+
+#define NH_IS_PINNED(_nh) ((_nh)->nh_priv->rt_flags & RTF_PINNED)
+
+/* nhop.c */
+struct nhop_priv *find_nhop(struct nh_control *ctl,
+ const struct nhop_priv *nh_priv);
+int link_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv);
+struct nhop_priv *unlink_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv);
+
+/* nhop_ctl.c */
+int cmp_priv(const struct nhop_priv *_one, const struct nhop_priv *_two);
+
+#endif
+
Index: sys/net/route/route_ctl.c
===================================================================
--- /dev/null
+++ sys/net/route/route_ctl.c
@@ -0,0 +1,65 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_dl.h>
+#include <net/vnet.h>
+#include <net/route.h>
+#include <net/route_var.h>
+#include <net/route/nhop_utils.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_var.h>
+#include <net/route/shared.h>
+#include <netinet/in.h>
+
+#include <vm/uma.h>
+
+
+/*
+ * This file contains control plane routing tables functions.
+ *
+ * All functions assume they are called in net epoch.
+ */
+
+
Index: sys/net/route/route_helpers.c
===================================================================
--- /dev/null
+++ sys/net/route/route_helpers.c
@@ -0,0 +1,83 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_route.h"
+
+#include <sys/param.h>
+#include <sys/jail.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/sysproto.h>
+#include <sys/proc.h>
+#include <sys/domain.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_dl.h>
+#include <net/route.h>
+#include <net/route_var.h>
+#include <net/route/nhop_utils.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_var.h>
+#include <net/route/shared.h>
+#include <net/vnet.h>
+
+/*
+ * RIB helper functions.
+ */
+
+/*
+ * Walks the routing table selected by @af and @fibnum, invoking @wa_f
+ * with @arg for each entry.  Nothing is done if the table does not exist.
+ *
+ * The table read lock is held for the whole traversal.
+ */
+void
+rib_walk(int af, u_int fibnum, rt_walktree_f_t *wa_f, void *arg)
+{
+	RIB_RLOCK_TRACKER;
+	struct rib_head *rnh = rt_tables_get_rnh(fibnum, af);
+
+	if (rnh != NULL) {
+		RIB_RLOCK(rnh);
+		rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f, arg);
+		RIB_RUNLOCK(rnh);
+	}
+}
+
Index: sys/net/route/shared.h
===================================================================
--- /dev/null
+++ sys/net/route/shared.h
@@ -0,0 +1,68 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Contains various definitions shared between the parts of a routing subsystem.
+ *
+ * Header is not intended to be included by the code external to the
+ * routing subsystem.
+ */
+
+#ifndef _NET_ROUTE_SHARED_H_
+#define _NET_ROUTE_SHARED_H_
+
+#ifdef RTDEBUG
+#define DPRINTF(_fmt, ...) printf("%s: " _fmt "\n", __func__ , ## __VA_ARGS__)
+#else
+#define DPRINTF(_fmt, ...)
+#endif
+
+struct rib_head;
+
+/* Nexthops */
+void nhops_init(void);
+int nhops_init_rib(struct rib_head *rh);
+void nhops_destroy_rib(struct rib_head *rh);
+int nhop_ref_object(struct nhop_object *nh);
+int nhop_ref_any(struct nhop_object *nh);
+void nhop_free_any(struct nhop_object *nh);
+
+void nhop_set_type(struct nhop_object *nh, enum nhop_type nh_type);
+void nhop_set_rtflags(struct nhop_object *nh, int rt_flags);
+
+int nhop_create_from_info(struct rib_head *rnh, struct rt_addrinfo *info,
+ struct nhop_object **nh_ret);
+int nhop_create_from_nhop(struct rib_head *rnh, const struct nhop_object *nh_orig,
+ struct rt_addrinfo *info, struct nhop_object **pnh_priv);
+
+void nhops_update_ifmtu(struct rib_head *rh, struct ifnet *ifp, uint32_t mtu);
+int nhops_dump_sysctl(struct rib_head *rh, struct sysctl_req *w);
+
+#endif
+
Index: sys/net/route_var.h
===================================================================
--- sys/net/route_var.h
+++ sys/net/route_var.h
@@ -32,6 +32,10 @@
#ifndef _NET_ROUTE_VAR_H_
#define _NET_ROUTE_VAR_H_
+struct nh_control;
+typedef int rnh_preadd_entry_f_t(u_int fibnum, const struct sockaddr *addr,
+ const struct sockaddr *mask, struct nhop_object *nh);
+
struct rib_head {
struct radix_head head;
rn_matchaddr_f_t *rnh_matchaddr; /* longest match for sockaddr */
@@ -41,6 +45,7 @@
rn_walktree_t *rnh_walktree; /* traverse tree */
rn_walktree_from_t *rnh_walktree_from; /* traverse tree below a */
rn_close_t *rnh_close; /*do something when the last ref drops*/
+ rnh_preadd_entry_f_t *rnh_preadd; /* hook to alter record prior to insertion */
rt_gen_t rnh_gen; /* generation counter */
int rnh_multipath; /* multipath capable ? */
struct radix_node rnh_nodes[3]; /* empty tree for common case */
@@ -51,6 +56,7 @@
u_int rib_fibnum; /* fib number */
struct callout expire_callout; /* Callout for expiring dynamic routes */
time_t next_expire; /* Next expire run ts */
+ struct nh_control *nh_control; /* nexthop subsystem data */
};
#define RIB_RLOCK_TRACKER struct rm_priotracker _rib_tracker
@@ -90,6 +96,44 @@
struct rib_head *rt_tables_get_rnh(int fib, int family);
void rt_mpath_init_rnh(struct rib_head *rnh);
+VNET_PCPUSTAT_DECLARE(struct rtstat, rtstat);
+#define RTSTAT_ADD(name, val) \
+ VNET_PCPUSTAT_ADD(struct rtstat, rtstat, name, (val))
+#define RTSTAT_INC(name) RTSTAT_ADD(name, 1)
+
+/*
+ * With the split between the routing entry and the nexthop,
+ * rt_flags has to be split between these 2 entries. As rtentry
+ * mostly contains prefix data and is thought to be generic enough
+ * so one can transparently change the nexthop pointer w/o requiring
+ * any other rtentry changes, most of rt_flags shifts to the particular nexthop.
+ *
+ *
+ * RTF_UP: rtentry, as an indication that it is linked.
+ * RTF_HOST: rtentry, nhop. The latter indication is needed for the datapath
+ * RTF_DYNAMIC: nhop, to make rtentry generic.
+ * RTF_MODIFIED: nhop, to make rtentry generic. (legacy)
+ * -- "native" path (nhop) properties:
+ * RTF_GATEWAY, RTF_STATIC, RTF_PROTO1, RTF_PROTO2, RTF_PROTO3, RTF_FIXEDMTU,
+ * RTF_PINNED, RTF_REJECT, RTF_BLACKHOLE, RTF_BROADCAST
+ */
+
+/* Nexthop rt flags mask */
+#define NHOP_RT_FLAG_MASK (RTF_GATEWAY | RTF_HOST | RTF_REJECT | RTF_DYNAMIC | \
+ RTF_MODIFIED | RTF_STATIC | RTF_BLACKHOLE | RTF_PROTO1 | RTF_PROTO2 | \
+ RTF_PROTO3 | RTF_FIXEDMTU | RTF_PINNED | RTF_BROADCAST)
+
+/* rtentry rt flag mask */
+#define RTE_RT_FLAG_MASK (RTF_UP | RTF_HOST)
+
+/* Nexthop selection */
+#define _NH2MP(_nh) ((struct nhgrp_object *)(_nh))
+#define _SELECT_NHOP(_nh, _flowid) \
+ (_NH2MP(_nh))->nhops[(_flowid) % (_NH2MP(_nh))->mp_size]
+#define _RT_SELECT_NHOP(_nh, _flowid) \
+ ((!NH_IS_MULTIPATH(_nh)) ? (_nh) : _SELECT_NHOP(_nh, _flowid))
+#define RT_SELECT_NHOP(_rt, _flowid) _RT_SELECT_NHOP((_rt)->rt_nhop, _flowid)
+
/* rte<>nhop translation */
static inline uint16_t
fib_rte_to_nh_flags(int rt_flags)
Index: sys/net/rtsock.c
===================================================================
--- sys/net/rtsock.c
+++ sys/net/rtsock.c
@@ -77,6 +77,7 @@
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#endif
+#include <net/route/nhop.h>
#ifdef COMPAT_FREEBSD32
#include <sys/mount.h>
@@ -1076,6 +1077,7 @@
out->rmx_mtu = rt->rt_mtu;
out->rmx_weight = rt->rt_weight;
out->rmx_pksent = counter_u64_fetch(rt->rt_pksent);
+ out->rmx_nhidx = nhop_get_idx(rt->rt_nhop);
/* Kernel -> userland timebase conversion. */
out->rmx_expire = rt->rt_expire ?
rt->rt_expire - time_uptime + time_second : 0;
@@ -2025,7 +2027,7 @@
namelen--;
if (req->newptr)
return (EPERM);
- if (name[1] == NET_RT_DUMP) {
+ if (name[1] == NET_RT_DUMP || name[1] == NET_RT_NHOP) {
if (namelen == 3)
fib = req->td->td_proc->p_fibnum;
else if (namelen == 4)
@@ -2092,7 +2094,25 @@
error = EAFNOSUPPORT;
}
break;
-
+	case NET_RT_NHOP:
+		/* Allow dumping one specific af/fib at a time */
+		if (namelen < 4) {
+			error = EINVAL;
+			break;
+		}
+		fib = name[3];
+		/* Valid fib numbers are 0 .. rt_numfibs - 1 */
+		if (fib < 0 || fib >= rt_numfibs) {
+			error = EINVAL;
+			break;
+		}
+		rnh = rt_tables_get_rnh(fib, af);
+		if (rnh == NULL) {
+			error = EAFNOSUPPORT;
+			break;
+		}
+		if (w.w_op == NET_RT_NHOP)
+			error = nhops_dump_sysctl(rnh, w.w_req);
+		break;
case NET_RT_IFLIST:
case NET_RT_IFLISTL:
error = sysctl_iflist(af, &w);
Index: sys/netinet/in_fib.h
===================================================================
--- sys/netinet/in_fib.h
+++ sys/netinet/in_fib.h
@@ -58,5 +58,9 @@
uint32_t flowid, struct nhop4_extended *pnh4);
void fib4_free_nh_ext(uint32_t fibnum, struct nhop4_extended *pnh4);
+struct nhop_object *fib4_lookup(uint32_t fibnum, struct in_addr dst,
+ uint32_t scopeid, uint32_t flags, uint32_t flowid);
+int fib4_check_urpf(uint32_t fibnum, struct in_addr dst, uint32_t scopeid,
+ uint32_t flags, const struct ifnet *src_if);
#endif
Index: sys/netinet/in_fib.c
===================================================================
--- sys/netinet/in_fib.c
+++ sys/netinet/in_fib.c
@@ -49,6 +49,8 @@
#include <net/if_dl.h>
#include <net/route.h>
#include <net/route_var.h>
+#include <net/route/nhop.h>
+#include <net/route/shared.h>
#include <net/vnet.h>
#ifdef RADIX_MPATH
@@ -60,59 +62,49 @@
#include <netinet/in_fib.h>
#ifdef INET
-static void fib4_rte_to_nh_basic(struct rtentry *rte, struct in_addr dst,
+static void fib4_rte_to_nh_basic(struct nhop_object *nh, struct in_addr dst,
uint32_t flags, struct nhop4_basic *pnh4);
-static void fib4_rte_to_nh_extended(struct rtentry *rte, struct in_addr dst,
+static void fib4_rte_to_nh_extended(struct nhop_object *nh, struct in_addr dst,
uint32_t flags, struct nhop4_extended *pnh4);
#define RNTORT(p) ((struct rtentry *)(p))
static void
-fib4_rte_to_nh_basic(struct rtentry *rte, struct in_addr dst,
+fib4_rte_to_nh_basic(struct nhop_object *nh, struct in_addr dst,
uint32_t flags, struct nhop4_basic *pnh4)
{
- struct sockaddr_in *gw;
if ((flags & NHR_IFAIF) != 0)
- pnh4->nh_ifp = rte->rt_ifa->ifa_ifp;
+ pnh4->nh_ifp = nh->nh_ifa->ifa_ifp;
else
- pnh4->nh_ifp = rte->rt_ifp;
- pnh4->nh_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu);
- if (rte->rt_flags & RTF_GATEWAY) {
- gw = (struct sockaddr_in *)rte->rt_gateway;
- pnh4->nh_addr = gw->sin_addr;
- } else
+ pnh4->nh_ifp = nh->nh_ifp;
+ pnh4->nh_mtu = nh->nh_mtu;
+ if (nh->nh_flags & NHF_GATEWAY)
+ pnh4->nh_addr = nh->gw4_sa.sin_addr;
+ else
pnh4->nh_addr = dst;
/* Set flags */
- pnh4->nh_flags = fib_rte_to_nh_flags(rte->rt_flags);
- gw = (struct sockaddr_in *)rt_key(rte);
- if (gw->sin_addr.s_addr == 0)
- pnh4->nh_flags |= NHF_DEFAULT;
+ pnh4->nh_flags = nh->nh_flags;
/* TODO: Handle RTF_BROADCAST here */
}
static void
-fib4_rte_to_nh_extended(struct rtentry *rte, struct in_addr dst,
+fib4_rte_to_nh_extended(struct nhop_object *nh, struct in_addr dst,
uint32_t flags, struct nhop4_extended *pnh4)
{
- struct sockaddr_in *gw;
if ((flags & NHR_IFAIF) != 0)
- pnh4->nh_ifp = rte->rt_ifa->ifa_ifp;
+ pnh4->nh_ifp = nh->nh_ifa->ifa_ifp;
else
- pnh4->nh_ifp = rte->rt_ifp;
- pnh4->nh_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu);
- if (rte->rt_flags & RTF_GATEWAY) {
- gw = (struct sockaddr_in *)rte->rt_gateway;
- pnh4->nh_addr = gw->sin_addr;
- } else
+ pnh4->nh_ifp = nh->nh_ifp;
+ pnh4->nh_mtu = nh->nh_mtu;
+ if (nh->nh_flags & NHF_GATEWAY)
+ pnh4->nh_addr = nh->gw4_sa.sin_addr;
+ else
pnh4->nh_addr = dst;
/* Set flags */
- pnh4->nh_flags = fib_rte_to_nh_flags(rte->rt_flags);
- gw = (struct sockaddr_in *)rt_key(rte);
- if (gw->sin_addr.s_addr == 0)
- pnh4->nh_flags |= NHF_DEFAULT;
- pnh4->nh_ia = ifatoia(rte->rt_ifa);
+ pnh4->nh_flags = nh->nh_flags;
+ pnh4->nh_ia = ifatoia(nh->nh_ifa);
pnh4->nh_src = IA_SIN(pnh4->nh_ia)->sin_addr;
}
@@ -135,7 +127,7 @@
struct rib_head *rh;
struct radix_node *rn;
struct sockaddr_in sin;
- struct rtentry *rte;
+ struct nhop_object *nh;
KASSERT((fibnum < rt_numfibs), ("fib4_lookup_nh_basic: bad fibnum"));
rh = rt_tables_get_rnh(fibnum, AF_INET);
@@ -150,10 +142,10 @@
RIB_RLOCK(rh);
rn = rh->rnh_matchaddr((void *)&sin, &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- rte = RNTORT(rn);
+ nh = RNTORT(rn)->rt_nhop;
/* Ensure route & ifp is UP */
- if (RT_LINK_IS_UP(rte->rt_ifp)) {
- fib4_rte_to_nh_basic(rte, dst, flags, pnh4);
+ if (RT_LINK_IS_UP(nh->nh_ifp)) {
+ fib4_rte_to_nh_basic(nh, dst, flags, pnh4);
RIB_RUNLOCK(rh);
return (0);
@@ -185,6 +177,7 @@
struct radix_node *rn;
struct sockaddr_in sin;
struct rtentry *rte;
+ struct nhop_object *nh;
KASSERT((fibnum < rt_numfibs), ("fib4_lookup_nh_ext: bad fibnum"));
rh = rt_tables_get_rnh(fibnum, AF_INET);
@@ -207,9 +200,10 @@
return (ENOENT);
}
#endif
+ nh = rte->rt_nhop;
/* Ensure route & ifp is UP */
- if (RT_LINK_IS_UP(rte->rt_ifp)) {
- fib4_rte_to_nh_extended(rte, dst, flags, pnh4);
+ if (RT_LINK_IS_UP(nh->nh_ifp)) {
+ fib4_rte_to_nh_extended(nh, dst, flags, pnh4);
if ((flags & NHR_REF) != 0) {
/* TODO: lwref on egress ifp's ? */
}
@@ -227,6 +221,140 @@
fib4_free_nh_ext(uint32_t fibnum, struct nhop4_extended *pnh4)
{
+}
+
+/*
+ * Looks up path in fib @fibnum specified by @dst.
+ * Returns path nexthop on success. Nexthop is safe to use
+ * within the current network epoch. If longer lifetime is required,
+ * one needs to pass NHR_REF as a flag. This will return referenced
+ * nexthop.
+ *
+ * @scopeid is not used by the lookup; @flowid only takes part in the path
+ * selection when RADIX_MPATH is defined.
+ * Returns NULL if the table does not exist. If no route matches or the
+ * link check on the nexthop interface fails, NULL is returned and the
+ * rts_unreach statistic is incremented.
+ */
+struct nhop_object *
+fib4_lookup(uint32_t fibnum, struct in_addr dst, uint32_t scopeid,
+ uint32_t flags, uint32_t flowid)
+{
+ RIB_RLOCK_TRACKER;
+ struct rib_head *rh;
+ struct radix_node *rn;
+ struct rtentry *rt;
+ struct nhop_object *nh;
+
+ KASSERT((fibnum < rt_numfibs), ("fib4_lookup: bad fibnum"));
+ rh = rt_tables_get_rnh(fibnum, AF_INET);
+ if (rh == NULL)
+ return (NULL);
+
+ /* Prepare lookup key */
+ struct sockaddr_in sin4;
+ memset(&sin4, 0, sizeof(sin4));
+ sin4.sin_family = AF_INET;
+ sin4.sin_len = sizeof(struct sockaddr_in);
+ sin4.sin_addr = dst;
+
+ nh = NULL;
+ RIB_RLOCK(rh);
+ rn = rh->rnh_matchaddr((void *)&sin4, &rh->head);
+ /* Nodes flagged RNF_ROOT are not treated as a match */
+ if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
+ rt = RNTORT(rn);
+#ifdef RADIX_MPATH
+ /* Several paths for the prefix: pick one based on @flowid */
+ if (rt_mpath_next(rt) != NULL)
+ rt = rt_mpath_selectrte(rt, flowid);
+#endif
+ nh = rt->rt_nhop;
+ /* Ensure route & ifp is UP */
+ if (RT_LINK_IS_UP(nh->nh_ifp)) {
+ /* NOTE(review): the int result of nhop_ref_object() is ignored */
+ if (flags & NHR_REF)
+ nhop_ref_object(nh);
+ RIB_RUNLOCK(rh);
+ return (nh);
+ }
+ }
+ RIB_RUNLOCK(rh);
+
+ RTSTAT_INC(rts_unreach);
+ return (NULL);
+}
+
+/*
+ * Checks a single nexthop @nh against the reverse path conditions.
+ * With a non-NULL @src_if the nexthop matches only if its nh_aifp is that
+ * interface.  With a NULL @src_if every nexthop matches, except for default
+ * route nexthops when NHR_NODEFAULT is set in @flags.
+ *
+ * Returns 1 on match, 0 otherwise.
+ */
+inline static int
+check_urpf(const struct nhop_object *nh, uint32_t flags,
+    const struct ifnet *src_if)
+{
+
+	if (src_if != NULL)
+		return (nh->nh_aifp == src_if);
+
+	/* No interface constraint: only a banned default route can fail */
+	if ((flags & NHR_NODEFAULT) != 0 && (nh->nh_flags & NHF_DEFAULT) != 0)
+		return (0);
+
+	return (1);
+}
+
+#ifdef RADIX_MPATH
+/*
+ * Runs check_urpf() on the nexthop of @rt and of every following entry
+ * returned by rt_mpath_next().
+ * Returns 1 as soon as one of them matches, 0 if none does.
+ */
+inline static int
+check_urpf_mpath(struct rtentry *rt, uint32_t flags,
+    const struct ifnet *src_if)
+{
+
+	for (; rt != NULL; rt = rt_mpath_next(rt)) {
+		if (check_urpf(rt->rt_nhop, flags, src_if) != 0)
+			return (1);
+	}
+
+	return (0);
+}
+#endif
+
+/*
+ * Performs reverse path forwarding lookup for @dst in fib @fibnum.
+ * If @src_if is non-NULL, verifies that at least 1 path goes via
+ * this interface.
+ * If @src_if is NULL, verifies that route exist; in this case, if @flags
+ * contains NHR_NODEFAULT, the default route is not considered.
+ * @scopeid is not used by the lookup.
+ *
+ * Returns 1 if route matching conditions is found, 0 otherwise.
+ */
+int
+fib4_check_urpf(uint32_t fibnum, struct in_addr dst, uint32_t scopeid,
+ uint32_t flags, const struct ifnet *src_if)
+{
+ RIB_RLOCK_TRACKER;
+ struct rib_head *rh;
+ struct radix_node *rn;
+ struct rtentry *rt;
+ int ret;
+
+ KASSERT((fibnum < rt_numfibs), ("fib4_check_urpf: bad fibnum"));
+ rh = rt_tables_get_rnh(fibnum, AF_INET);
+ if (rh == NULL)
+ return (0);
+
+ /* Prepare lookup key */
+ /* NOTE(review): sin_family stays 0 here, fib4_lookup() sets AF_INET */
+ struct sockaddr_in sin4;
+ memset(&sin4, 0, sizeof(sin4));
+ sin4.sin_len = sizeof(struct sockaddr_in);
+ sin4.sin_addr = dst;
+
+ RIB_RLOCK(rh);
+ rn = rh->rnh_matchaddr((void *)&sin4, &rh->head);
+ /* Nodes flagged RNF_ROOT are not treated as a match */
+ if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
+ rt = RNTORT(rn);
+#ifdef RADIX_MPATH
+ /* Any of the paths may satisfy the check */
+ ret = check_urpf_mpath(rt, flags, src_if);
+#else
+ ret = check_urpf(rt->rt_nhop, flags, src_if);
+#endif
+ RIB_RUNLOCK(rh);
+ return (ret);
+ }
+ RIB_RUNLOCK(rh);
+
+ return (0);
}
#endif
Index: sys/netinet/in_rmx.c
===================================================================
--- sys/netinet/in_rmx.c
+++ sys/netinet/in_rmx.c
@@ -43,6 +43,8 @@
#include <net/if_var.h>
#include <net/route.h>
#include <net/route_var.h>
+#include <net/route/nhop.h>
+#include <net/route/shared.h>
#include <net/vnet.h>
#include <netinet/in.h>
@@ -56,6 +58,67 @@
extern int in_detachhead(void **head, int off);
#endif
+/*
+ * Adjusts nexthop @nh of an IPv4 route to @addr/@mask before the route is
+ * inserted (this function is installed as the rnh_preadd hook in this file):
+ * - a host route to a broadcast address gets RTF_BROADCAST/NHF_BROADCAST,
+ * - the nexthop MTU is inherited from, or limited to, the interface MTU,
+ * - the nexthop of a default route is flagged with NHF_DEFAULT,
+ * - the basic IPv4 nexthop type is assigned if no type has been set yet.
+ *
+ * @fibnum is not used.  @mask may be NULL.  Always returns 0.
+ */
+static int
+rib4_preadd(u_int fibnum, const struct sockaddr *addr, const struct sockaddr *mask,
+    struct nhop_object *nh)
+{
+	const struct sockaddr_in *addr4 = (const struct sockaddr_in *)addr;
+	const struct sockaddr_in *mask4 = (const struct sockaddr_in *)mask;
+	uint16_t nh_type;
+	int rt_flags;
+
+	/* XXX: RTF_LOCAL && RTF_MULTICAST */
+
+	rt_flags = nhop_get_rtflags(nh);
+
+	if (rt_flags & RTF_HOST) {
+
+		/*
+		 * Backward compatibility:
+		 * if the destination is broadcast,
+		 * mark route as broadcast.
+		 * This behavior was useful when route cloning
+		 * was in place, so there was an explicit cloned
+		 * route for every broadcasted address.
+		 * Currently (2020-04) there is no kernel machinery
+		 * to do route cloning, though someone might explicitly
+		 * add these routes to support some cases with active-active
+		 * load balancing. Given that, retain this support.
+		 */
+		if (in_broadcast(addr4->sin_addr, nh->nh_ifp)) {
+			rt_flags |= RTF_BROADCAST;
+			nhop_set_rtflags(nh, rt_flags);
+			nh->nh_flags |= NHF_BROADCAST;
+		}
+	}
+
+	/*
+	 * Check route MTU:
+	 * inherit interface MTU if not set or
+	 * check if MTU is too large.
+	 */
+	if (nh->nh_mtu == 0 || nh->nh_mtu > nh->nh_ifp->if_mtu)
+		nh->nh_mtu = nh->nh_ifp->if_mtu;
+
+	/*
+	 * Ensure that default route nhop has special flag.
+	 * A route without RTF_HOST is not guaranteed to carry a netmask,
+	 * so do not dereference a missing one.
+	 */
+	if ((rt_flags & RTF_HOST) == 0 && mask4 != NULL &&
+	    mask4->sin_addr.s_addr == 0)
+		nh->nh_flags |= NHF_DEFAULT;
+
+	/* Set nhop type to basic per-AF nhop */
+	if (nhop_get_type(nh) == 0) {
+		if (nh->nh_flags & NHF_GATEWAY)
+			nh_type = NH_TYPE_IPV4_ETHER_NHOP;
+		else
+			nh_type = NH_TYPE_IPV4_ETHER_RSLV;
+
+		nhop_set_type(nh, nh_type);
+	}
+
+	return (0);
+}
+
/*
* Do what we need to do when inserting a route.
*/
@@ -126,6 +189,7 @@
if (rh == NULL)
return (0);
+ rh->rnh_preadd = rib4_preadd;
rh->rnh_addaddr = in_addroute;
#ifdef RADIX_MPATH
rt_mpath_init_rnh(rh);
Index: sys/netinet6/in6_fib.h
===================================================================
--- sys/netinet6/in6_fib.h
+++ sys/netinet6/in6_fib.h
@@ -58,5 +58,11 @@
uint32_t scopeid, uint32_t flags, uint32_t flowid,
struct nhop6_extended *pnh6);
void fib6_free_nh_ext(uint32_t fibnum, struct nhop6_extended *pnh6);
+
+struct nhop_object *fib6_lookup(uint32_t fibnum,
+ const struct in6_addr *dst6, uint32_t scopeid, uint32_t flags,
+ uint32_t flowid);
+int fib6_check_urpf(uint32_t fibnum, const struct in6_addr *dst6,
+ uint32_t scopeid, uint32_t flags, const struct ifnet *src_if);
#endif
Index: sys/netinet6/in6_fib.c
===================================================================
--- sys/netinet6/in6_fib.c
+++ sys/netinet6/in6_fib.c
@@ -50,6 +50,8 @@
#include <net/if_dl.h>
#include <net/route.h>
#include <net/route_var.h>
+#include <net/route/nhop.h>
+#include <net/route/shared.h>
#include <net/vnet.h>
#ifdef RADIX_MPATH
@@ -68,94 +70,63 @@
#include <net/if_types.h>
#ifdef INET6
-static void fib6_rte_to_nh_extended(struct rtentry *rte,
+static void fib6_rte_to_nh_extended(const struct nhop_object *nh,
const struct in6_addr *dst, uint32_t flags, struct nhop6_extended *pnh6);
-static void fib6_rte_to_nh_basic(struct rtentry *rte, const struct in6_addr *dst,
+static void fib6_rte_to_nh_basic(const struct nhop_object *nh, const struct in6_addr *dst,
uint32_t flags, struct nhop6_basic *pnh6);
-static struct ifnet *fib6_get_ifaifp(struct rtentry *rte);
#define RNTORT(p) ((struct rtentry *)(p))
#define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa))
CHK_STRUCT_ROUTE_COMPAT(struct route_in6, ro_dst);
-/*
- * Gets real interface for the @rte.
- * Returns rt_ifp for !IFF_LOOPBACK routers.
- * Extracts "real" address interface from interface address
- * loopback routes.
- */
-static struct ifnet *
-fib6_get_ifaifp(struct rtentry *rte)
-{
- struct ifnet *ifp;
- struct sockaddr_dl *sdl;
- ifp = rte->rt_ifp;
- if ((ifp->if_flags & IFF_LOOPBACK) &&
- rte->rt_gateway->sa_family == AF_LINK) {
- sdl = (struct sockaddr_dl *)rte->rt_gateway;
- return (ifnet_byindex(sdl->sdl_index));
- }
- return (ifp);
-}
-
static void
-fib6_rte_to_nh_basic(struct rtentry *rte, const struct in6_addr *dst,
+fib6_rte_to_nh_basic(const struct nhop_object *nh, const struct in6_addr *dst,
uint32_t flags, struct nhop6_basic *pnh6)
{
- struct sockaddr_in6 *gw;
/* Do explicit nexthop zero unless we're copying it */
memset(pnh6, 0, sizeof(*pnh6));
if ((flags & NHR_IFAIF) != 0)
- pnh6->nh_ifp = fib6_get_ifaifp(rte);
+ pnh6->nh_ifp = nh->nh_aifp;
else
- pnh6->nh_ifp = rte->rt_ifp;
+ pnh6->nh_ifp = nh->nh_ifp;
- pnh6->nh_mtu = min(rte->rt_mtu, IN6_LINKMTU(rte->rt_ifp));
- if (rte->rt_flags & RTF_GATEWAY) {
+ pnh6->nh_mtu = nh->nh_mtu;
+ if (nh->nh_flags & NHF_GATEWAY) {
/* Return address with embedded scope. */
- gw = (struct sockaddr_in6 *)rte->rt_gateway;
- pnh6->nh_addr = gw->sin6_addr;
+ pnh6->nh_addr = nh->gw6_sa.sin6_addr;
} else
pnh6->nh_addr = *dst;
/* Set flags */
- pnh6->nh_flags = fib_rte_to_nh_flags(rte->rt_flags);
- gw = (struct sockaddr_in6 *)rt_key(rte);
- if (IN6_IS_ADDR_UNSPECIFIED(&gw->sin6_addr))
- pnh6->nh_flags |= NHF_DEFAULT;
+ pnh6->nh_flags = nh->nh_flags;
}
static void
-fib6_rte_to_nh_extended(struct rtentry *rte, const struct in6_addr *dst,
+fib6_rte_to_nh_extended(const struct nhop_object *nh, const struct in6_addr *dst,
uint32_t flags, struct nhop6_extended *pnh6)
{
- struct sockaddr_in6 *gw;
/* Do explicit nexthop zero unless we're copying it */
memset(pnh6, 0, sizeof(*pnh6));
if ((flags & NHR_IFAIF) != 0)
- pnh6->nh_ifp = fib6_get_ifaifp(rte);
+ pnh6->nh_ifp = nh->nh_aifp;
else
- pnh6->nh_ifp = rte->rt_ifp;
+ pnh6->nh_ifp = nh->nh_ifp;
- pnh6->nh_mtu = min(rte->rt_mtu, IN6_LINKMTU(rte->rt_ifp));
- if (rte->rt_flags & RTF_GATEWAY) {
+ pnh6->nh_mtu = nh->nh_mtu;
+ if (nh->nh_flags & NHF_GATEWAY) {
/* Return address with embedded scope. */
- gw = (struct sockaddr_in6 *)rte->rt_gateway;
- pnh6->nh_addr = gw->sin6_addr;
+ pnh6->nh_addr = nh->gw6_sa.sin6_addr;
} else
pnh6->nh_addr = *dst;
/* Set flags */
- pnh6->nh_flags = fib_rte_to_nh_flags(rte->rt_flags);
- gw = (struct sockaddr_in6 *)rt_key(rte);
- if (IN6_IS_ADDR_UNSPECIFIED(&gw->sin6_addr))
- pnh6->nh_flags |= NHF_DEFAULT;
- pnh6->nh_ia = ifatoia6(rte->rt_ifa);
+ pnh6->nh_flags = nh->nh_flags;
+ pnh6->nh_ia = ifatoia6(nh->nh_ifa);
}
/*
@@ -180,7 +151,7 @@
struct rib_head *rh;
struct radix_node *rn;
struct sockaddr_in6 sin6;
- struct rtentry *rte;
+ struct nhop_object *nh;
KASSERT((fibnum < rt_numfibs), ("fib6_lookup_nh_basic: bad fibnum"));
rh = rt_tables_get_rnh(fibnum, AF_INET6);
@@ -198,10 +169,10 @@
RIB_RLOCK(rh);
rn = rh->rnh_matchaddr((void *)&sin6, &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- rte = RNTORT(rn);
+ nh = RNTORT(rn)->rt_nhop;
/* Ensure route & ifp is UP */
- if (RT_LINK_IS_UP(rte->rt_ifp)) {
- fib6_rte_to_nh_basic(rte, &sin6.sin6_addr, flags, pnh6);
+ if (RT_LINK_IS_UP(nh->nh_ifp)) {
+ fib6_rte_to_nh_basic(nh, &sin6.sin6_addr, flags, pnh6);
RIB_RUNLOCK(rh);
return (0);
}
@@ -231,6 +202,7 @@
struct radix_node *rn;
struct sockaddr_in6 sin6;
struct rtentry *rte;
+ struct nhop_object *nh;
KASSERT((fibnum < rt_numfibs), ("fib6_lookup_nh_ext: bad fibnum"));
rh = rt_tables_get_rnh(fibnum, AF_INET6);
@@ -256,9 +228,10 @@
return (ENOENT);
}
#endif
+ nh = rte->rt_nhop;
/* Ensure route & ifp is UP */
- if (RT_LINK_IS_UP(rte->rt_ifp)) {
- fib6_rte_to_nh_extended(rte, &sin6.sin6_addr, flags,
+ if (RT_LINK_IS_UP(nh->nh_ifp)) {
+ fib6_rte_to_nh_extended(nh, &sin6.sin6_addr, flags,
pnh6);
if ((flags & NHR_REF) != 0) {
/* TODO: Do lwref on egress ifp's */
@@ -277,6 +250,146 @@
fib6_free_nh_ext(uint32_t fibnum, struct nhop6_extended *pnh6)
{
+}
+
+/*
+ * Finds the nexthop used to reach @dst6 in fib @fibnum.
+ * @dst6 is expected with its scope de-embedded; for link-local
+ * destinations the low 16 bits of @scopeid are embedded into the key.
+ * With RADIX_MPATH, @flowid selects one of the multipath routes.
+ *
+ * Returns the nexthop, or NULL if the fib has no table, no route matches
+ * or the route's interface link is down.  The result is safe to use
+ * within the current network epoch; pass NHR_REF in @flags to get a
+ * referenced nexthop when a longer lifetime is required.
+ */
+struct nhop_object *
+fib6_lookup(uint32_t fibnum, const struct in6_addr *dst6,
+    uint32_t scopeid, uint32_t flags, uint32_t flowid)
+{
+	RIB_RLOCK_TRACKER;
+	struct sockaddr_in6 key;
+	struct rib_head *rnh;
+	struct radix_node *node;
+	struct rtentry *rte;
+	struct nhop_object *found;
+
+	KASSERT((fibnum < rt_numfibs), ("fib6_lookup: bad fibnum"));
+	rnh = rt_tables_get_rnh(fibnum, AF_INET6);
+	if (rnh == NULL)
+		return (NULL);
+
+	/* TODO: radix changes */
+	/* Build the sockaddr-shaped lookup key */
+	memset(&key, 0, sizeof(key));
+	key.sin6_len = sizeof(struct sockaddr_in6);
+	key.sin6_addr = *dst6;
+
+	/* The scope id is trusted as-is and embedded directly */
+	if (IN6_IS_SCOPE_LINKLOCAL(dst6))
+		key.sin6_addr.s6_addr16[1] = htons(scopeid & 0xffff);
+
+	found = NULL;
+	RIB_RLOCK(rnh);
+	node = rnh->rnh_matchaddr((void *)&key, &rnh->head);
+	if (node != NULL && (node->rn_flags & RNF_ROOT) == 0) {
+		rte = RNTORT(node);
+#ifdef RADIX_MPATH
+		if (rt_mpath_next(rte) != NULL)
+			rte = rt_mpath_selectrte(rte, flowid);
+#endif
+		/* Only hand out nexthops whose interface link is up */
+		if (RT_LINK_IS_UP(rte->rt_nhop->nh_ifp)) {
+			found = rte->rt_nhop;
+			if (flags & NHR_REF)
+				nhop_ref_object(found);
+		}
+	}
+	RIB_RUNLOCK(rnh);
+
+	if (found == NULL)
+		RTSTAT_INC(rts_unreach);
+	return (found);
+}
+
+/*
+ * Tests a single nexthop @nh against the reverse-path conditions.
+ * With @src_if set, the nexthop address interface must be @src_if.
+ * Without it, any nexthop passes, except a default-route nexthop
+ * when @flags has NHR_NODEFAULT.
+ *
+ * Returns 1 on match, 0 otherwise.
+ */
+inline static int
+check_urpf(const struct nhop_object *nh, uint32_t flags,
+    const struct ifnet *src_if)
+{
+
+	/* Strict mode: the path has to go via the given interface */
+	if (src_if != NULL)
+		return (nh->nh_aifp == src_if ? 1 : 0);
+
+	/* Loose mode: reject only excluded default routes */
+	if ((flags & NHR_NODEFAULT) != 0 && (nh->nh_flags & NHF_DEFAULT) != 0)
+		return (0);
+
+	return (1);
+}
+
+#ifdef RADIX_MPATH
+/*
+ * Multipath variant of check_urpf(): walks the chain starting at @rt
+ * via rt_mpath_next() and returns 1 as soon as one nexthop passes,
+ * 0 if none does.
+ */
+inline static int
+check_urpf_mpath(struct rtentry *rt, uint32_t flags,
+    const struct ifnet *src_if)
+{
+
+	for (; rt != NULL; rt = rt_mpath_next(rt)) {
+		if (check_urpf(rt->rt_nhop, flags, src_if) != 0)
+			return (1);
+	}
+
+	return (0);
+}
+#endif
+
+/*
+ * Performs reverse path forwarding lookup for @dst6 in fib @fibnum.
+ * Assumes scope is deembedded and provided in @scopeid.
+ * If @src_if is non-zero, verifies that at least 1 path goes via
+ * this interface.
+ * If @src_if is zero, verifies that route exist.
+ * If @flags contains NHR_NODEFAULT, do not consider default route.
+ *
+ * Returns 1 if route matching conditions is found, 0 otherwise
+ * (including the case when the fib has no AF_INET6 table).
+ */
+int
+fib6_check_urpf(uint32_t fibnum, const struct in6_addr *dst6,
+    uint32_t scopeid, uint32_t flags, const struct ifnet *src_if)
+{
+	RIB_RLOCK_TRACKER;
+	struct rib_head *rh;
+	struct radix_node *rn;
+	struct rtentry *rt;
+	struct sockaddr_in6 sin6;
+	int ret;
+
+	KASSERT((fibnum < rt_numfibs), ("fib6_check_urpf: bad fibnum"));
+	rh = rt_tables_get_rnh(fibnum, AF_INET6);
+	if (rh == NULL)
+		return (0);
+
+	/*
+	 * Prepare lookup key.
+	 * Every other lookup in this file hands rnh_matchaddr() a
+	 * sockaddr_in6 with sin6_len set, so wrap the address the same
+	 * way instead of passing a bare in6_addr.
+	 */
+	memset(&sin6, 0, sizeof(sin6));
+	sin6.sin6_len = sizeof(struct sockaddr_in6);
+	sin6.sin6_addr = *dst6;
+
+	/* Assume scopeid is valid and embed it directly */
+	if (IN6_IS_SCOPE_LINKLOCAL(dst6))
+		sin6.sin6_addr.s6_addr16[1] = htons(scopeid & 0xffff);
+
+	ret = 0;
+	RIB_RLOCK(rh);
+	rn = rh->rnh_matchaddr((void *)&sin6, &rh->head);
+	if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
+		rt = RNTORT(rn);
+#ifdef RADIX_MPATH
+		ret = check_urpf_mpath(rt, flags, src_if);
+#else
+		ret = check_urpf(rt->rt_nhop, flags, src_if);
+#endif
+	}
+	RIB_RUNLOCK(rh);
+
+	return (ret);
}
#endif
Index: sys/netinet6/in6_rmx.c
===================================================================
--- sys/netinet6/in6_rmx.c
+++ sys/netinet6/in6_rmx.c
@@ -82,6 +82,8 @@
#include <net/if_var.h>
#include <net/route.h>
#include <net/route_var.h>
+#include <net/route/nhop.h>
+#include <net/route/shared.h>
#include <netinet/in.h>
#include <netinet/ip_var.h>
@@ -103,6 +105,43 @@
extern int in6_detachhead(void **head, int off);
#endif
+/*
+ * Per-AF hook installed as rnh_preadd for the IPv6 tables: finalizes
+ * nexthop @nh before the route described by @addr/@mask is added.
+ * Clamps the nexthop MTU to the interface link MTU, marks default-route
+ * nexthops with NHF_DEFAULT and assigns a basic IPv6 nexthop type if
+ * none is set.  @fibnum and @addr are not consulted.
+ *
+ * Always returns 0.
+ */
+static int
+rib6_preadd(u_int fibnum, const struct sockaddr *addr, const struct sockaddr *mask,
+    struct nhop_object *nh)
+{
+	const struct sockaddr_in6 *mask6;
+	uint16_t nh_type;
+
+	/* XXX: RTF_LOCAL */
+
+	/* An unset or oversized route MTU falls back to the link MTU */
+	if (nh->nh_mtu == 0 || nh->nh_mtu > IN6_LINKMTU(nh->nh_ifp))
+		nh->nh_mtu = IN6_LINKMTU(nh->nh_ifp);
+
+	/*
+	 * A non-host route with an all-zero mask is the default route.
+	 * The mask is only inspected after the RTF_HOST test.
+	 */
+	mask6 = (const struct sockaddr_in6 *)mask;
+	if ((nhop_get_rtflags(nh) & RTF_HOST) == 0 &&
+	    IN6_IS_ADDR_UNSPECIFIED(&mask6->sin6_addr))
+		nh->nh_flags |= NHF_DEFAULT;
+
+	/* Pick the basic nexthop type unless one was already chosen */
+	if (nhop_get_type(nh) == 0) {
+		nh_type = (nh->nh_flags & NHF_GATEWAY) ?
+		    NH_TYPE_IPV6_ETHER_NHOP : NH_TYPE_IPV6_ETHER_RSLV;
+		nhop_set_type(nh, nh_type);
+	}
+
+	return (0);
+}
+
/*
* Do what we need to do when inserting a route.
*/
@@ -169,6 +208,7 @@
return (0);
rh->rnh_addaddr = in6_addroute;
+ rh->rnh_preadd = rib6_preadd;
#ifdef RADIX_MPATH
rt_mpath_init_rnh(rh);
#endif
Index: sys/sys/socket.h
===================================================================
--- sys/sys/socket.h
+++ sys/sys/socket.h
@@ -416,6 +416,7 @@
#define NET_RT_IFMALIST 4 /* return multicast address list */
#define NET_RT_IFLISTL 5 /* Survey interface list, using 'l'en
* versions of msghdr structs. */
+#define NET_RT_NHOP 6 /* dump routing nexthops */
#endif /* __BSD_VISIBLE */
/*
Index: usr.bin/netstat/Makefile
===================================================================
--- usr.bin/netstat/Makefile
+++ usr.bin/netstat/Makefile
@@ -5,7 +5,7 @@
PROG= netstat
SRCS= if.c inet.c main.c mbuf.c mroute.c netisr.c nl_symbols.c route.c \
- unix.c mroute6.c ipsec.c bpf.c pfkey.c sctp.c \
+ unix.c mroute6.c ipsec.c bpf.c pfkey.c sctp.c common.c nhops.c \
nl_defs.h
nl_symbols.c: nlist_symbols
Index: usr.bin/netstat/common.h
===================================================================
--- /dev/null
+++ usr.bin/netstat/common.h
@@ -0,0 +1,24 @@
+/*
+ * Helpers shared between the netstat route and nexthop printers.
+ *
+ * NOTE(review): this header uses u_long, size_t, struct sockaddr and
+ * IFNAMSIZ without including anything itself, so it appears to rely on
+ * the includer pulling in the system headers first -- confirm.
+ */
+#ifndef _NETSTAT_COMMON_H_
+#define _NETSTAT_COMMON_H_
+
+/*
+ * One entry of a flag description table.
+ * Tables are terminated by an entry with b_mask == 0.
+ */
+struct bits {
+ u_long b_mask;
+ char b_val;
+ const char *b_name;
+};
+/* Routing flag table, defined in route.c */
+extern struct bits rt_bits[];
+
+/* Returns a static string with the b_val letter of every flag set in f */
+const char *fmt_flags(const struct bits *p, int f);
+/* Emits flags in short form via format and as a list named tag_name */
+void print_flags_generic(int flags, const struct bits *pbits,
+ const char *format, const char *tag_name);
+/*
+ * NOTE(review): no definition of print_sockaddr() is visible in this
+ * change; the printer exported through netstat.h is p_sockaddr() --
+ * confirm whether this prototype is still needed.
+ */
+int print_sockaddr(const char *name, struct sockaddr *sa,
+ struct sockaddr *mask, int flags, int width);
+
+/* Interface name slot; prepare_ifmap() builds an array indexed by ifindex */
+struct ifmap_entry {
+ char ifname[IFNAMSIZ];
+};
+
+/* Builds the ifindex -> name table and stores its entry count in *ifmap_size */
+struct ifmap_entry *prepare_ifmap(size_t *ifmap_size);
+
+#endif
+
Index: usr.bin/netstat/common.c
===================================================================
--- /dev/null
+++ usr.bin/netstat/common.c
@@ -0,0 +1,140 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1983, 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/time.h>
+
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <arpa/inet.h>
+#include <ifaddrs.h>
+#include <libutil.h>
+#include <netdb.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+#include <err.h>
+#include <libxo/xo.h>
+#include "netstat.h"
+#include "common.h"
+
+/*
+ * Formats the flags set in @f as a string of one-letter codes taken
+ * from table @p (terminated by an entry with b_mask == 0).
+ *
+ * Returns a pointer to a static buffer which is overwritten by the
+ * next call.  Output is capped at 32 letters so that a table with more
+ * matching entries cannot overrun the buffer.
+ */
+const char *
+fmt_flags(const struct bits *p, int f)
+{
+	static char name[33];
+	char *flags, *end;
+
+	/* Keep the last byte for the terminating NUL */
+	end = name + sizeof(name) - 1;
+	for (flags = name; p->b_mask != 0 && flags < end; p++)
+		if (p->b_mask & f)
+			*flags++ = p->b_val;
+	*flags = '\0';
+	return (name);
+}
+
+/*
+ * Emits @flags in two forms: first the short letter string produced by
+ * fmt_flags() through @format, then one "{le:<tag_name>/%s}" item per
+ * set flag, wrapped in a list named @tag_name.  @pbits is the flag
+ * table to decode with.
+ */
+void
+print_flags_generic(int flags, const struct bits *pbits, const char *format,
+    const char *tag_name)
+{
+	char item_fmt[64];
+	const struct bits *bit;
+
+	/* Short form */
+	xo_emit(format, fmt_flags(pbits, flags));
+
+	/* Long form: the name of every flag that is set */
+	snprintf(item_fmt, sizeof(item_fmt), "{le:%s/%%s}", tag_name);
+	xo_open_list(tag_name);
+	bit = pbits;
+	while (bit->b_mask) {
+		if (bit->b_mask & flags)
+			xo_emit(item_fmt, bit->b_name);
+		bit++;
+	}
+	xo_close_list(tag_name);
+}
+
+/*
+ * Builds a table mapping interface index to interface name from the
+ * AF_LINK entries returned by getifaddrs(3).
+ *
+ * The table is heap-allocated, grown in steps of 32 entries and
+ * zero-filled, so unused slots have an empty name.  The number of
+ * entries is stored in *@pifmap_size.  Returns NULL with a size of 0
+ * when no link-level address was found.  Exits on getifaddrs() or
+ * allocation failure.
+ */
+struct ifmap_entry *
+prepare_ifmap(size_t *pifmap_size)
+{
+	int ifindex = 0, size;
+	struct ifaddrs *ifap, *ifa;
+	struct sockaddr_dl *sdl;
+
+	struct ifmap_entry *ifmap = NULL;
+	int ifmap_size = 0;
+
+	/*
+	 * Retrieve interface list at first
+	 * since we need #ifindex -> if_xname match
+	 */
+	if (getifaddrs(&ifap) != 0)
+		err(EX_OSERR, "getifaddrs");
+
+	for (ifa = ifap; ifa; ifa = ifa->ifa_next) {
+
+		/* getifaddrs(3) may return entries without an address */
+		if (ifa->ifa_addr == NULL ||
+		    ifa->ifa_addr->sa_family != AF_LINK)
+			continue;
+
+		sdl = (struct sockaddr_dl *)ifa->ifa_addr;
+		ifindex = sdl->sdl_index;
+
+		if (ifindex >= ifmap_size) {
+			size = roundup(ifindex + 1, 32) *
+			    sizeof(struct ifmap_entry);
+			if ((ifmap = realloc(ifmap, size)) == NULL)
+				errx(2, "realloc(%d) failed", size);
+			/* Zero only the newly added tail */
+			memset(&ifmap[ifmap_size], 0,
+			    size - ifmap_size *
+			    sizeof(struct ifmap_entry));
+
+			ifmap_size = roundup(ifindex + 1, 32);
+		}
+
+		/* First name seen for an index wins */
+		if (*ifmap[ifindex].ifname != '\0')
+			continue;
+
+		strlcpy(ifmap[ifindex].ifname, ifa->ifa_name, IFNAMSIZ);
+	}
+
+	freeifaddrs(ifap);
+
+	*pifmap_size = ifmap_size;
+
+	return (ifmap);
+}
+
Index: usr.bin/netstat/main.c
===================================================================
--- usr.bin/netstat/main.c
+++ usr.bin/netstat/main.c
@@ -214,6 +214,7 @@
int noutputs = 0; /* how much outputs before we exit */
int numeric_addr; /* show addresses numerically */
int numeric_port; /* show ports numerically */
+int oflag; /* show nexthop objects*/
int Pflag; /* show TCP log ID */
static int pflag; /* show given protocol */
static int Qflag; /* show netisr information */
@@ -248,7 +249,7 @@
if (argc < 0)
exit(EXIT_FAILURE);
- while ((ch = getopt(argc, argv, "46AaBbdF:f:ghI:iLlM:mN:nPp:Qq:RrSTsuWw:xz"))
+ while ((ch = getopt(argc, argv, "46AaBbdF:f:ghI:iLlM:mN:noPp:Qq:RrSTsuWw:xz"))
!= -1)
switch(ch) {
case '4':
@@ -345,6 +346,9 @@
case 'n':
numeric_addr = numeric_port = 1;
break;
+ case 'o':
+ oflag = 1;
+ break;
case 'P':
Pflag = 1;
break;
@@ -494,6 +498,14 @@
xo_finish();
exit(0);
}
+ if (oflag) {
+ xo_open_container("statistics");
+ nhops_print(fib, af);
+ xo_close_container("statistics");
+ xo_finish();
+ exit(0);
+ }
+
if (gflag) {
xo_open_container("statistics");
Index: usr.bin/netstat/netstat.h
===================================================================
--- usr.bin/netstat/netstat.h
+++ usr.bin/netstat/netstat.h
@@ -147,6 +147,10 @@
char *routename(struct sockaddr *, int);
const char *netname(struct sockaddr *, struct sockaddr *);
void routepr(int, int);
+int p_sockaddr(const char *name, struct sockaddr *sa,
+ struct sockaddr *mask, int flags, int width);
+const char *fmt_sockaddr(struct sockaddr *sa, struct sockaddr *mask,
+ int flags);
#ifdef NETGRAPH
void netgraphprotopr(u_long, const char *, int, int);
@@ -157,3 +161,4 @@
void mroutepr(void);
void mrt_stats(void);
void bpf_stats(char *);
+void nhops_print(int fibnum, int af);
Index: usr.bin/netstat/nhops.c
===================================================================
--- /dev/null
+++ usr.bin/netstat/nhops.c
@@ -0,0 +1,472 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1983, 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/time.h>
+
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <net/if_types.h>
+#include <net/route.h>
+#include <net/route/nhop.h>
+
+#include <netinet/in.h>
+#include <netgraph/ng_socket.h>
+
+#include <arpa/inet.h>
+#include <ifaddrs.h>
+#include <libutil.h>
+#include <netdb.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+#include <err.h>
+#include <libxo/xo.h>
+#include "netstat.h"
+#include "common.h"
+
+/* column widths; each followed by one space */
+/*
+ * Default widths depend on the address family and, with INET6, on
+ * whether addresses are printed numerically (numeric_addr); the
+ * interface column is wider with Wflag.
+ */
+#ifndef INET6
+#define WID_DST_DEFAULT(af) 18 /* width of destination column */
+#define WID_GW_DEFAULT(af) 18 /* width of gateway column */
+#define WID_IF_DEFAULT(af) (Wflag ? 10 : 8) /* width of netif column */
+#else
+#define WID_DST_DEFAULT(af) \
+ ((af) == AF_INET6 ? (numeric_addr ? 33: 18) : 18)
+#define WID_GW_DEFAULT(af) \
+ ((af) == AF_INET6 ? (numeric_addr ? 29 : 18) : 18)
+#define WID_IF_DEFAULT(af) ((af) == AF_INET6 ? 8 : (Wflag ? 10 : 8))
+#endif /*INET6*/
+/*
+ * Effective column widths, filled in by print_nhops_sysctl() before
+ * the table is printed.
+ * NOTE(review): wid_prepend is not assigned anywhere in the visible
+ * code, so it keeps its static initial value of 0 -- confirm.
+ */
+static int wid_dst;
+static int wid_gw;
+static int wid_flags;
+static int wid_pksent;
+static int wid_mtu;
+static int wid_if;
+static int wid_nhidx;
+static int wid_nhtype;
+static int wid_refcnt;
+static int wid_prepend;
+
+/*
+ * Nexthop flag table (mask, one-letter code, long name), terminated by
+ * a zero mask.  Consumed by p_nhflags().
+ */
+static struct bits nh_bits[] = {
+ { NHF_REJECT, 'R', "reject" },
+ { NHF_BLACKHOLE,'B', "blackhole" },
+ { NHF_REDIRECT, 'r', "redirect" },
+ { NHF_GATEWAY, 'G', "gateway" },
+ { NHF_DEFAULT, 'd', "default" },
+ { NHF_BROADCAST,'b', "broadcast" },
+ { 0 , 0, NULL }
+};
+
+/*
+ * Printable nexthop type names, indexed directly by the nh_type value
+ * of a dumped nexthop.
+ * NOTE(review): the order presumably has to mirror the kernel's
+ * NH_TYPE_* numbering -- confirm against net/route/nhop.h.
+ */
+static char *nh_types[] = {
+ "empty", /* 0 */
+ "v4/resolve", /* 1 */
+ "v4/gw",
+ "v6/resolve",
+ "v6/gw"
+};
+
+/* Printable summary of one nexthop: gateway string and interface name */
+struct nhop_entry {
+ char gw[64];
+ char ifname[IFNAMSIZ];
+};
+
+/*
+ * Growable array of nhop_entry slots indexed by nexthop index.
+ * size is the number of allocated slots; a slot with an empty ifname
+ * is treated as unused by nhop_get().
+ */
+struct nhop_map {
+ struct nhop_entry *ptr;
+ size_t size;
+};
+/* Filled by print_nhop_entry_sysctl() for every nexthop printed */
+static struct nhop_map global_nhop_map;
+
+static void nhop_map_update(struct nhop_map *map, uint32_t idx,
+ char *gw, char *ifname);
+static struct nhop_entry *nhop_get(struct nhop_map *map, uint32_t idx);
+
+
+static struct ifmap_entry *ifmap;
+static size_t ifmap_size;
+
+/*
+ * Writes a textual form of @sa into @buf (at most @bufsize bytes):
+ * the numeric address for AF_INET and AF_INET6, "unknown:<family>"
+ * for anything else.
+ */
+static void
+print_sockaddr_buf(char *buf, size_t bufsize, const struct sockaddr *sa)
+{
+	const void *rawaddr;
+
+	if (sa->sa_family == AF_INET)
+		rawaddr = &((const struct sockaddr_in *)sa)->sin_addr;
+	else if (sa->sa_family == AF_INET6)
+		rawaddr = &((const struct sockaddr_in6 *)sa)->sin6_addr;
+	else {
+		snprintf(buf, bufsize, "unknown:%d", sa->sa_family);
+		return;
+	}
+
+	inet_ntop(sa->sa_family, rawaddr, buf, bufsize);
+}
+
+/*
+ * Emits string @addr as field @name in a column of @width characters.
+ * A negative @width prints the string unpadded.  With Wflag or
+ * numeric_addr the full string is printed even when it is wider than
+ * the column; otherwise it is cut to @width.
+ *
+ * Returns how many characters the output ran past the column, or 0.
+ */
+static int
+print_addr(const char *name, const char *addr, int width)
+{
+	char fmt[128];
+	int overflow;
+
+	/* No column at all */
+	if (width < 0) {
+		snprintf(fmt, sizeof(fmt), "{:%s/%%s} ", name);
+		xo_emit(fmt, addr);
+		return (0);
+	}
+
+	/* Truncating column */
+	if (Wflag == 0 && !numeric_addr) {
+		snprintf(fmt, sizeof(fmt), "{[:%d}{:%s/%%-.*s}{]:} ",
+		    -width, name);
+		xo_emit(fmt, width, addr);
+		return (0);
+	}
+
+	/* Padded column that is allowed to overflow */
+	snprintf(fmt, sizeof(fmt), "{[:%d}{:%s/%%s}{]:} ",
+	    -width, name);
+	xo_emit(fmt, addr);
+	overflow = strlen(addr) - width;
+	return (overflow < 0 ? 0 : overflow);
+}
+
+
+/*
+ * Prints the column titles of the nexthop table using the wid_*
+ * widths.  The short layout has six columns; with Wflag the Type, Use,
+ * Mtu, Addrif and Prepend columns are added.  @af1 is unused.
+ */
+static void
+print_nhop_header(int af1 __unused)
+{
+
+	if (!Wflag) {
+		xo_emit("{T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%*.*s} "
+		    " {T:/%*s}\n",
+		    wid_nhidx, wid_nhidx, "Idx",
+		    wid_dst, wid_dst, "IFA",
+		    wid_gw, wid_gw, "Gateway",
+		    wid_flags, wid_flags, "Flags",
+		    wid_if, wid_if, "Netif",
+		    wid_prepend, "Refcnt");
+		return;
+	}
+
+	xo_emit("{T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%*.*s} "
+	    "{T:/%*.*s} {T:/%-*.*s} {T:/%*.*s} {T:/%*.*s} {T:/%*.*s} {T:/%*s}\n",
+	    wid_nhidx, wid_nhidx, "Idx",
+	    wid_nhtype, wid_nhtype, "Type",
+	    wid_dst, wid_dst, "IFA",
+	    wid_gw, wid_gw, "Gateway",
+	    wid_flags, wid_flags, "Flags",
+	    wid_pksent, wid_pksent, "Use",
+	    wid_mtu, wid_mtu, "Mtu",
+	    wid_if, wid_if, "Netif",
+	    wid_if, wid_if, "Addrif",
+	    wid_refcnt, wid_refcnt, "Refcnt",
+	    wid_prepend, "Prepend");
+}
+
+/*
+ * Stores the printable gateway @gw and interface name @ifname of
+ * nexthop @idx in @map, growing the map when @idx is out of range.
+ * The map doubles on growth (32 entries initially), or jumps to @idx
+ * rounded up to a multiple of 32 if doubling is not enough; new slots
+ * are zero-filled.  Exits on allocation failure.
+ */
+static void
+nhop_map_update(struct nhop_map *map, uint32_t idx, char *gw, char *ifname)
+{
+	if (idx >= map->size) {
+		size_t new_size, sz;
+
+		new_size = (map->size == 0) ? 32 : map->size * 2;
+		if (new_size <= idx)
+			new_size = roundup((size_t)idx + 1, 32);
+
+		sz = new_size * (sizeof(struct nhop_entry));
+		if ((map->ptr = realloc(map->ptr, sz)) == NULL)
+			errx(2, "realloc(%zu) failed", sz);
+
+		/* Zero only the slots that were just added */
+		memset(&map->ptr[map->size], 0,
+		    (new_size - map->size) * sizeof(struct nhop_entry));
+		map->size = new_size;
+	}
+
+	strlcpy(map->ptr[idx].ifname, ifname, sizeof(map->ptr[idx].ifname));
+	strlcpy(map->ptr[idx].gw, gw, sizeof(map->ptr[idx].gw));
+}
+
+/*
+ * Returns the entry recorded for nexthop @idx in @map, or NULL when
+ * @idx is beyond the map or the slot has no interface name stored.
+ */
+static struct nhop_entry *
+nhop_get(struct nhop_map *map, uint32_t idx)
+{
+	struct nhop_entry *slot;
+
+	if (idx < map->size) {
+		slot = &map->ptr[idx];
+		if (slot->ifname[0] != '\0')
+			return (slot);
+	}
+	return (NULL);
+}
+
+/*
+ * Prints one nexthop record of the NET_RT_NHOP sysctl dump as an
+ * instance of list @name, and records its gateway and interface
+ * strings in global_nhop_map.
+ *
+ * @rtm is the routing message header of the record; its rtm_flags are
+ * what the flags column shows.  @nh is the nexthop data following the
+ * header; the address block is located nh_len bytes after @nh.
+ * The Type, Use, Mtu, Addrif and Prepend columns are only printed
+ * with Wflag.
+ */
+static void
+print_nhop_entry_sysctl(const char *name, struct rt_msghdr *rtm, struct nhop_external *nh)
+{
+	char buffer[128];
+	char iface_name[128];
+	int protrusion;
+	char gw_addr[64];
+	struct nhop_addrs *na;
+	struct sockaddr *sa_gw, *sa_ifa;
+
+	xo_open_instance(name);
+
+	snprintf(buffer, sizeof(buffer), "{[:-%d}{:index/%%lu}{]:} ", wid_nhidx);
+	xo_emit(buffer, nh->nh_idx);
+
+	if (Wflag) {
+		const char *cp = "unknown";
+
+		/* The type comes from the dump; never index past the table */
+		if (nh->nh_type < nitems(nh_types))
+			cp = nh_types[nh->nh_type];
+		xo_emit("{t:type_str/%*s} ", wid_nhtype, cp);
+	}
+
+	/* Egress interface name; stays empty if the index is not mapped */
+	memset(iface_name, 0, sizeof(iface_name));
+	if (nh->ifindex < (uint32_t)ifmap_size) {
+		strlcpy(iface_name, ifmap[nh->ifindex].ifname,
+		    sizeof(iface_name));
+		if (*iface_name == '\0')
+			strlcpy(iface_name, "---", sizeof(iface_name));
+	}
+
+	/* Gateway and source addresses live at offsets inside nhop_addrs */
+	na = (struct nhop_addrs *)((char *)nh + nh->nh_len);
+	sa_gw = (struct sockaddr *)((char *)na + na->gw_sa_off);
+	sa_ifa = (struct sockaddr *)((char *)na + na->src_sa_off);
+	protrusion = p_sockaddr("ifa", sa_ifa, NULL, RTF_HOST, wid_dst);
+
+	if (nh->nh_flags & NHF_GATEWAY) {
+		const char *cp;
+		cp = fmt_sockaddr(sa_gw, NULL, RTF_HOST);
+		strlcpy(gw_addr, cp, sizeof(gw_addr));
+	} else
+		snprintf(gw_addr, sizeof(gw_addr), "%s/resolve", iface_name);
+	/*
+	 * Use the gateway column width, shrunk by whatever the previous
+	 * column overflowed (wid_gw and wid_dst are set to the same value
+	 * by print_nhops_sysctl()).
+	 */
+	protrusion = print_addr("gateway", gw_addr, wid_gw - protrusion);
+
+	nhop_map_update(&global_nhop_map, nh->nh_idx, gw_addr, iface_name);
+
+	snprintf(buffer, sizeof(buffer), "{[:-%d}{:flags/%%s}{]:} ",
+	    wid_flags - protrusion);
+
+	print_flags_generic(rtm->rtm_flags, rt_bits, buffer, "rt_flags_pretty");
+
+	if (Wflag) {
+		xo_emit("{t:use/%*lu} ", wid_pksent, nh->nh_pksent);
+		xo_emit("{t:mtu/%*lu} ", wid_mtu, nh->nh_mtu);
+	}
+
+	if (Wflag)
+		xo_emit("{t:interface-name/%*s}", wid_if, iface_name);
+	else
+		xo_emit("{t:interface-name/%*.*s}", wid_if, wid_if, iface_name);
+
+	/* Address interface is shown only when it differs from the egress one */
+	memset(iface_name, 0, sizeof(iface_name));
+	if (nh->aifindex < (uint32_t)ifmap_size && nh->ifindex != nh->aifindex) {
+		strlcpy(iface_name, ifmap[nh->aifindex].ifname,
+		    sizeof(iface_name));
+		if (*iface_name == '\0')
+			strlcpy(iface_name, "---", sizeof(iface_name));
+	}
+	if (Wflag)
+		xo_emit("{t:address-interface-name/%*s}", wid_if, iface_name);
+
+	xo_emit("{t:refcount/%*lu} ", wid_refcnt, nh->nh_refcount);
+	if (Wflag && nh->prepend_len) {
+		/* XXX: placeholder, the real prepend data is not decoded */
+		char *prepend_hex = "AABBCCDDEE";
+		xo_emit(" {:nhop-prepend/%*s}", wid_prepend, prepend_hex);
+	}
+
+	xo_emit("\n");
+	xo_close_instance(name);
+}
+
+/*
+ * Sort record used by print_nhops_sysctl(): the nexthop index together
+ * with a pointer to its message inside the sysctl buffer.
+ */
+struct nhops_map {
+ uint32_t idx;
+ struct rt_msghdr *rtm;
+};
+
+/*
+ * qsort(3) comparator ordering struct nhops_map records by ascending
+ * nexthop index.
+ */
+static int
+cmp_nh_idx(const void *_a, const void *_b)
+{
+	uint32_t lhs = ((const struct nhops_map *)_a)->idx;
+	uint32_t rhs = ((const struct nhops_map *)_b)->idx;
+
+	return ((lhs > rhs) - (lhs < rhs));
+}
+
+/*
+ * Fetches the nexthops of address family @af in fib @fibnum through
+ * the NET_RT_NHOP sysctl and prints them sorted by nexthop index.
+ *
+ * The column widths are derived from the family of the first nexthop
+ * after sorting.  Nothing but the enclosing containers is emitted when
+ * the dump is empty.  Exits on sysctl or allocation failure.
+ */
+static void
+print_nhops_sysctl(int fibnum, int af)
+{
+	size_t needed;
+	int mib[7];
+	char *buf, *next, *lim;
+	struct rt_msghdr *rtm;
+	struct nhop_external *nh;
+	int fam;
+	struct nhops_map *nh_map, *new_map;
+	size_t nh_count, nh_size;
+
+	mib[0] = CTL_NET;
+	mib[1] = PF_ROUTE;
+	mib[2] = 0;
+	mib[3] = af;
+	mib[4] = NET_RT_NHOP;
+	mib[5] = 0;
+	mib[6] = fibnum;
+	/* First call sizes the buffer, second call fills it */
+	if (sysctl(mib, nitems(mib), NULL, &needed, NULL, 0) < 0)
+		err(EX_OSERR, "sysctl: net.route.0.%d.nhdump.%d estimate", af,
+		    fibnum);
+	if ((buf = malloc(needed)) == NULL)
+		errx(2, "malloc(%lu)", (unsigned long)needed);
+	if (sysctl(mib, nitems(mib), buf, &needed, NULL, 0) < 0)
+		err(1, "sysctl: net.route.0.%d.nhdump.%d", af, fibnum);
+	lim = buf + needed;
+	xo_open_container("nhop-table");
+	xo_open_list("rt-family");
+
+	/*
+	 * Nexthops are received unsorted. Collect everything first, sort
+	 * and then display sorted.
+	 */
+	nh_count = 0;
+	nh_size = 16;
+	nh_map = calloc(nh_size, sizeof(struct nhops_map));
+	if (nh_map == NULL)
+		errx(2, "calloc(%zu) failed",
+		    nh_size * sizeof(struct nhops_map));
+	for (next = buf; next < lim; next += rtm->rtm_msglen) {
+		rtm = (struct rt_msghdr *)next;
+		/* A zero-length message would never advance the cursor */
+		if (rtm->rtm_msglen == 0)
+			break;
+		if (rtm->rtm_version != RTM_VERSION)
+			continue;
+
+		if (nh_count >= nh_size) {
+			nh_size *= 2;
+			new_map = realloc(nh_map,
+			    nh_size * sizeof(struct nhops_map));
+			if (new_map == NULL)
+				errx(2, "realloc(%zu) failed",
+				    nh_size * sizeof(struct nhops_map));
+			nh_map = new_map;
+		}
+
+		/* The nexthop data directly follows the message header */
+		nh = (struct nhop_external *)(rtm + 1);
+		nh_map[nh_count].idx = nh->nh_idx;
+		nh_map[nh_count].rtm = rtm;
+		nh_count++;
+	}
+
+	if (nh_count > 0) {
+		qsort(nh_map, nh_count, sizeof(struct nhops_map), cmp_nh_idx);
+		nh = (struct nhop_external *)(nh_map[0].rtm + 1);
+		fam = nh->nh_family;
+
+		wid_dst = WID_GW_DEFAULT(fam);
+		wid_gw = WID_GW_DEFAULT(fam);
+		wid_nhidx = 5;
+		wid_nhtype = 12;
+		wid_refcnt = 6;
+		wid_flags = 6;
+		wid_pksent = 8;
+		wid_mtu = 6;
+		wid_if = WID_IF_DEFAULT(fam);
+		xo_open_instance("rt-family");
+		pr_family(fam);
+		xo_open_list("nh-entry");
+
+		print_nhop_header(fam);
+
+		for (size_t i = 0; i < nh_count; i++) {
+			rtm = nh_map[i].rtm;
+			nh = (struct nhop_external *)(rtm + 1);
+			print_nhop_entry_sysctl("nh-entry", rtm, nh);
+		}
+
+		xo_close_list("nh-entry");
+		xo_close_instance("rt-family");
+	}
+	xo_close_list("rt-family");
+	xo_close_container("nhop-table");
+	free(nh_map);
+	free(buf);
+}
+
+/*
+ * Prints nexthop flags @f: the short letter form through @format,
+ * followed by the list "nh_flags_pretty" holding the name of every set
+ * flag.  Decoding uses the nh_bits table.
+ */
+static void
+p_nhflags(int f, const char *format)
+{
+
+	print_flags_generic(f, nh_bits, format, "nh_flags_pretty");
+}
+
+/*
+ * Entry point of the nexthop listing: prints the nexthops of address
+ * family @af in fib @fibnum.
+ *
+ * A @fibnum of -1 selects the fib reported by net.my_fibnum (0 if that
+ * cannot be read).  The fib is validated against net.fibs (assumed to
+ * be 1 if unreadable); an out-of-range fib terminates with EX_USAGE.
+ */
+void
+nhops_print(int fibnum, int af)
+{
+	size_t len = sizeof(int);
+	int maxfibs;
+
+	if (fibnum == -1) {
+		if (sysctlbyname("net.my_fibnum", &fibnum, &len, NULL, 0) == -1)
+			fibnum = 0;
+	}
+	if (sysctlbyname("net.fibs", &maxfibs, &len, NULL, 0) == -1)
+		maxfibs = 1;
+	if (fibnum < 0 || fibnum >= maxfibs)
+		errx(EX_USAGE, "%d: invalid fib", fibnum);
+
+	/* Interface names are needed to decode the dumped ifindexes */
+	ifmap = prepare_ifmap(&ifmap_size);
+
+	xo_open_container("route-nhop-information");
+	xo_emit("{T:Nexthop data}");
+	if (fibnum != 0)
+		xo_emit(" ({L:fib}: {:fib/%d})", fibnum);
+	xo_emit("\n");
+	print_nhops_sysctl(fibnum, af);
+	xo_close_container("route-nhop-information");
+}
+
Index: usr.bin/netstat/route.c
===================================================================
--- usr.bin/netstat/route.c
+++ usr.bin/netstat/route.c
@@ -69,16 +69,13 @@
#include <err.h>
#include <libxo/xo.h>
#include "netstat.h"
+#include "common.h"
#include "nl_defs.h"
/*
* Definitions for showing gateway flags.
*/
-static struct bits {
- u_long b_mask;
- char b_val;
- const char *b_name;
-} bits[] = {
+struct bits rt_bits[] = {
{ RTF_UP, 'U', "up" },
{ RTF_GATEWAY, 'G', "gateway" },
{ RTF_HOST, 'H', "host" },
@@ -99,11 +96,8 @@
{ 0 , 0, NULL }
};
-struct ifmap_entry {
- char ifname[IFNAMSIZ];
-};
static struct ifmap_entry *ifmap;
-static int ifmap_size;
+static size_t ifmap_size;
static struct timespec uptime;
static const char *netname4(in_addr_t, in_addr_t);
@@ -112,12 +106,7 @@
#endif
static void p_rtable_sysctl(int, int);
static void p_rtentry_sysctl(const char *name, struct rt_msghdr *);
-static int p_sockaddr(const char *name, struct sockaddr *, struct sockaddr *,
- int, int);
-static const char *fmt_sockaddr(struct sockaddr *sa, struct sockaddr *mask,
- int flags);
static void p_flags(int, const char *);
-static const char *fmt_flags(int f);
static void domask(char *, size_t, u_long);
@@ -229,7 +218,7 @@
wid_dst, wid_dst, "Destination",
wid_gw, wid_gw, "Gateway",
wid_flags, wid_flags, "Flags",
- wid_pksent, wid_pksent, "Use",
+ wid_mtu, wid_mtu, "Nhop#",
wid_mtu, wid_mtu, "Mtu",
wid_if, wid_if, "Netif",
wid_expire, "Expire");
@@ -252,47 +241,11 @@
char *buf, *next, *lim;
struct rt_msghdr *rtm;
struct sockaddr *sa;
- int fam = AF_UNSPEC, ifindex = 0, size;
+ int fam = AF_UNSPEC;
int need_table_close = false;
- struct ifaddrs *ifap, *ifa;
- struct sockaddr_dl *sdl;
+ ifmap = prepare_ifmap(&ifmap_size);
- /*
- * Retrieve interface list at first
- * since we need #ifindex -> if_xname match
- */
- if (getifaddrs(&ifap) != 0)
- err(EX_OSERR, "getifaddrs");
-
- for (ifa = ifap; ifa; ifa = ifa->ifa_next) {
-
- if (ifa->ifa_addr->sa_family != AF_LINK)
- continue;
-
- sdl = (struct sockaddr_dl *)ifa->ifa_addr;
- ifindex = sdl->sdl_index;
-
- if (ifindex >= ifmap_size) {
- size = roundup(ifindex + 1, 32) *
- sizeof(struct ifmap_entry);
- if ((ifmap = realloc(ifmap, size)) == NULL)
- errx(2, "realloc(%d) failed", size);
- memset(&ifmap[ifmap_size], 0,
- size - ifmap_size *
- sizeof(struct ifmap_entry));
-
- ifmap_size = roundup(ifindex + 1, 32);
- }
-
- if (*ifmap[ifindex].ifname != '\0')
- continue;
-
- strlcpy(ifmap[ifindex].ifname, ifa->ifa_name, IFNAMSIZ);
- }
-
- freeifaddrs(ifap);
-
mib[0] = CTL_NET;
mib[1] = PF_ROUTE;
mib[2] = 0;
@@ -377,7 +330,8 @@
wid_flags - protrusion);
p_flags(rtm->rtm_flags, buffer);
if (Wflag) {
- xo_emit("{t:use/%*lu} ", wid_pksent, rtm->rtm_rmx.rmx_pksent);
+ /* XXX: use=0? */
+ xo_emit("{t:nhop/%*lu} ", wid_mtu, rtm->rtm_rmx.rmx_nhidx);
if (rtm->rtm_rmx.rmx_mtu != 0)
xo_emit("{t:mtu/%*lu} ", wid_mtu, rtm->rtm_rmx.rmx_mtu);
@@ -410,7 +364,7 @@
xo_close_instance(name);
}
-static int
+int
p_sockaddr(const char *name, struct sockaddr *sa, struct sockaddr *mask,
int flags, int width)
{
@@ -442,7 +396,7 @@
return (protrusion);
}
-static const char *
+const char *
fmt_sockaddr(struct sockaddr *sa, struct sockaddr *mask, int flags)
{
static char buf[128];
@@ -519,30 +473,10 @@
static void
p_flags(int f, const char *format)
{
- struct bits *p;
- xo_emit(format, fmt_flags(f));
-
- xo_open_list("flags_pretty");
- for (p = bits; p->b_mask; p++)
- if (p->b_mask & f)
- xo_emit("{le:flags_pretty/%s}", p->b_name);
- xo_close_list("flags_pretty");
+ print_flags_generic(f, rt_bits, format, "flags_pretty");
}
-static const char *
-fmt_flags(int f)
-{
- static char name[33];
- char *flags;
- struct bits *p = bits;
-
- for (flags = name; p->b_mask; p++)
- if (p->b_mask & f)
- *flags++ = p->b_val;
- *flags = '\0';
- return (name);
-}
char *
routename(struct sockaddr *sa, int flags)

File Metadata

Mime Type
text/plain
Expires
Sun, Nov 17, 4:50 PM (19 h, 11 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
14682302
Default Alt Text
D24232.id70449.diff (123 KB)

Event Timeline