Page MenuHomeFreeBSD

D24141.diff
No OneTemporary

D24141.diff

This file is larger than 256 KB, so syntax highlighting was skipped.
Index: include/Makefile
===================================================================
--- include/Makefile
+++ include/Makefile
@@ -54,6 +54,7 @@
geom/mirror geom/mountver geom/multipath geom/nop \
geom/raid geom/raid3 geom/shsec geom/stripe geom/virstor \
net/altq \
+ net/route \
netgraph/atm netgraph/netflow \
netinet/cc \
netinet/netdump \
Index: lib/libc/gen/sysctl.3
===================================================================
--- lib/libc/gen/sysctl.3
+++ lib/libc/gen/sysctl.3
@@ -563,6 +563,8 @@
.It Dv NET_RT_IFLIST Ta 0 or if_index Ta None
.It Dv NET_RT_IFMALIST Ta 0 or if_index Ta None
.It Dv NET_RT_IFLISTL Ta 0 or if_index Ta None
+.It Dv NET_RT_NHOPS Ta None Ta fib number
+.It Dv NET_RT_NHGROUPS Ta None Ta fib number
.El
.Pp
The
@@ -583,6 +585,12 @@
.Va struct if_msghdrl
and
.Va struct ifa_msghdrl .
+.Pp
+.Dv NET_RT_NHOPS
+returns all nexthops for the specified address family in the given fib.
+.Pp
+.Dv NET_RT_NHGROUPS
+returns all multipath groups for the specified address family in the given fib.
.It Li PF_INET
Get or set various global information about the IPv4
(Internet Protocol version 4).
Index: sys/amd64/conf/GENERIC
===================================================================
--- sys/amd64/conf/GENERIC
+++ sys/amd64/conf/GENERIC
@@ -31,6 +31,7 @@
options INET # InterNETworking
options INET6 # IPv6 communications protocols
options IPSEC_SUPPORT # Allow kldload of ipsec and tcpmd5
+options ROUTE_MPATH # Enable multipath routing
options TCP_OFFLOAD # TCP offload
options TCP_BLACKBOX # Enhanced TCP event logging
options TCP_HHOOK # hhook(9) framework for TCP
Index: sys/conf/NOTES
===================================================================
--- sys/conf/NOTES
+++ sys/conf/NOTES
@@ -986,7 +986,7 @@
#
# TCP_HHOOK enables the hhook(9) framework hooks for the TCP stack.
#
-# RADIX_MPATH provides support for equal-cost multi-path routing.
+# ROUTE_MPATH provides support for multi-path routing.
#
options MROUTING # Multicast routing
options IPFIREWALL #firewall
Index: sys/conf/files
===================================================================
--- sys/conf/files
+++ sys/conf/files
@@ -4089,16 +4089,23 @@
net/debugnet_inet.c optional inet debugnet
net/pfil.c optional ether | inet
net/radix.c standard
-net/radix_mpath.c standard
net/raw_cb.c standard
net/raw_usrreq.c standard
net/route.c standard
net/route_temporal.c standard
+net/route/mpath_ctl.c optional route_mpath
+net/route/route_ctl.c standard
+net/route/route_helpers.c standard
+net/route/nhop.c standard
+net/route/nhop_ctl.c standard
+net/route/nhop_utils.c standard
+net/route/nhgrp.c optional route_mpath
+net/route/nhgrp_ctl.c optional route_mpath
net/rss_config.c optional inet rss | inet6 rss
net/rtsock.c standard
net/slcompress.c optional netgraph_vjc | sppp | \
netgraph_sppp
-net/toeplitz.c optional inet rss | inet6 rss
+net/toeplitz.c optional inet rss | inet6 rss | route_mpath
net/vnet.c optional vimage
net80211/ieee80211.c optional wlan
net80211/ieee80211_acl.c optional wlan wlan_acl
Index: sys/conf/options
===================================================================
--- sys/conf/options
+++ sys/conf/options
@@ -453,6 +453,7 @@
PCBGROUP opt_pcbgroup.h
PF_DEFAULT_TO_DROP opt_pf.h
RADIX_MPATH opt_mpath.h
+ROUTE_MPATH opt_route_mpath.h
ROUTETABLES opt_route.h
RSS opt_rss.h
SLIP_IFF_OPTS opt_slip.h
Index: sys/dev/cxgbe/tom/t4_connect.c
===================================================================
--- sys/dev/cxgbe/tom/t4_connect.c
+++ sys/dev/cxgbe/tom/t4_connect.c
@@ -49,6 +49,7 @@
#include <net/if_types.h>
#include <net/if_vlan_var.h>
#include <net/route.h>
+#include <net/route/nhop.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
@@ -224,13 +225,13 @@
* rtalloc1, RT_UNLOCK on rt.
*/
int
-t4_connect(struct toedev *tod, struct socket *so, struct rtentry *rt,
+t4_connect(struct toedev *tod, struct socket *so, struct nhop_object *nh,
struct sockaddr *nam)
{
struct adapter *sc = tod->tod_softc;
struct toepcb *toep = NULL;
struct wrqe *wr = NULL;
- struct ifnet *rt_ifp = rt->rt_ifp;
+ struct ifnet *rt_ifp = nh->nh_ifp;
struct vi_info *vi;
int qid_atid, rc, isipv6;
struct inpcb *inp = sotoinpcb(so);
@@ -275,7 +276,7 @@
DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM);
toep->l2te = t4_l2t_get(vi->pi, rt_ifp,
- rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway : nam);
+ nh->nh_flags & NHF_GATEWAY ? &nh->gw_sa : nam);
if (toep->l2te == NULL)
DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM);
Index: sys/dev/cxgbe/tom/t4_tom.h
===================================================================
--- sys/dev/cxgbe/tom/t4_tom.h
+++ sys/dev/cxgbe/tom/t4_tom.h
@@ -369,7 +369,7 @@
/* t4_connect.c */
void t4_init_connect_cpl_handlers(void);
void t4_uninit_connect_cpl_handlers(void);
-int t4_connect(struct toedev *, struct socket *, struct rtentry *,
+int t4_connect(struct toedev *, struct socket *, struct nhop_object *,
struct sockaddr *);
void act_open_failure_cleanup(struct adapter *, u_int, u_int);
Index: sys/fs/nfsclient/nfs_clvfsops.c
===================================================================
--- sys/fs/nfsclient/nfs_clvfsops.c
+++ sys/fs/nfsclient/nfs_clvfsops.c
@@ -473,10 +473,9 @@
sin.sin_len = sizeof(sin);
/* XXX MRT use table 0 for this sort of thing */
CURVNET_SET(TD_TO_VNET(td));
- error = rtrequest_fib(RTM_ADD, (struct sockaddr *)&sin,
- (struct sockaddr *)&nd->mygateway,
- (struct sockaddr *)&mask,
- RTF_UP | RTF_GATEWAY, NULL, RT_DEFAULT_FIB);
+ error = rib_request_simple(RIB_ADD, RT_DEFAULT_FIB,
+ (struct sockaddr *)&sin, (struct sockaddr *)&mask,
+ (struct sockaddr *)&nd->mygateway, RTF_UP | RTF_GATEWAY);
CURVNET_RESTORE();
if (error)
panic("nfs_mountroot: RTM_ADD: %d", error);
Index: sys/modules/tests/Makefile
===================================================================
--- sys/modules/tests/Makefile
+++ sys/modules/tests/Makefile
@@ -3,6 +3,7 @@
SUBDIR+= framework
SUBDIR+= .WAIT
SUBDIR+= callout_test
+SUBDIR+= routing
SUBDIR_PARALLEL=
Index: sys/modules/tests/routing/Makefile
===================================================================
--- /dev/null
+++ sys/modules/tests/routing/Makefile
@@ -0,0 +1,15 @@
+#
+# $FreeBSD$
+#
+
+.PATH: ${SRCTOP}/sys/tests/routing
+
+KMOD= routing_test
+SRCS= module.c test_route_ctl.c
+
+#
+# Enable full debugging
+#
+CFLAGS += -g -O0
+
+.include <bsd.kmod.mk>
Index: sys/net/debugnet.c
===================================================================
--- sys/net/debugnet.c
+++ sys/net/debugnet.c
@@ -57,6 +57,8 @@
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
+#include <netinet/in_fib.h>
+#include <net/route/nhop.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
@@ -644,7 +646,8 @@
if (pcb->dp_client == INADDR_ANY || pcb->dp_gateway == INADDR_ANY ||
pcb->dp_ifp == NULL) {
struct sockaddr_in dest_sin, *gw_sin, *local_sin;
- struct rtentry *dest_rt;
+ struct nhop_object *dest_nh;
+ struct in_addr dest_addr;
struct ifnet *rt_ifp;
memset(&dest_sin, 0, sizeof(dest_sin));
@@ -653,31 +656,32 @@
.sin_family = AF_INET,
.sin_addr.s_addr = pcb->dp_server,
};
+ dest_addr.s_addr = pcb->dp_server;
CURVNET_SET(vnet0);
- dest_rt = rtalloc1((struct sockaddr *)&dest_sin, 0,
- RTF_RNH_LOCKED);
+ dest_nh = fib4_lookup_nh_ptr(RT_DEFAULT_FIB, dest_addr, 0,
+ NHR_REF, 0);
CURVNET_RESTORE();
- if (dest_rt == NULL) {
+ if (dest_nh == NULL) {
printf("%s: Could not get route for that server.\n",
__func__);
error = ENOENT;
goto cleanup;
}
- if (dest_rt->rt_gateway->sa_family == AF_INET)
- gw_sin = (struct sockaddr_in *)dest_rt->rt_gateway;
+ if (dest_nh->gw4_sa.sin_family == AF_INET)
+ gw_sin = &dest_nh->gw4_sa;
else {
- if (dest_rt->rt_gateway->sa_family == AF_LINK)
+ if (dest_nh->gw4_sa.sin_family == AF_LINK)
DNETDEBUG("Destination address is on link.\n");
gw_sin = NULL;
}
- MPASS(dest_rt->rt_ifa->ifa_addr->sa_family == AF_INET);
- local_sin = (struct sockaddr_in *)dest_rt->rt_ifa->ifa_addr;
+ MPASS(dest_nh->nh_ifa->ifa_addr->sa_family == AF_INET);
+ local_sin = (struct sockaddr_in *)dest_nh->nh_ifa->ifa_addr;
- rt_ifp = dest_rt->rt_ifp;
+ rt_ifp = dest_nh->nh_ifp;
if (pcb->dp_client == INADDR_ANY)
pcb->dp_client = local_sin->sin_addr.s_addr;
@@ -686,7 +690,7 @@
if (pcb->dp_ifp == NULL)
pcb->dp_ifp = rt_ifp;
- RTFREE_LOCKED(dest_rt);
+ NH_FREE(dest_nh);
}
ifp = pcb->dp_ifp;
Index: sys/net/if.c
===================================================================
--- sys/net/if.c
+++ sys/net/if.c
@@ -1851,6 +1851,7 @@
struct sockaddr_dl null_sdl;
struct ifnet *ifp;
struct ifaddr *rti_ifa = NULL;
+ struct rib_cmd_info rc;
ifp = ifa->ifa_ifp;
@@ -1873,7 +1874,9 @@
info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl;
link_init_sdl(ifp, (struct sockaddr *)&null_sdl, ifp->if_type);
- error = rtrequest1_fib(cmd, &info, NULL, ifp->if_fib);
+ NET_EPOCH_ENTER(et);
+ error = rib_request(cmd, ifp->if_fib, &info, &rc);
+ NET_EPOCH_EXIT(et);
if (rti_ifa != NULL)
ifa_free(rti_ifa);
Index: sys/net/if_var.h
===================================================================
--- sys/net/if_var.h
+++ sys/net/if_var.h
@@ -61,6 +61,7 @@
*/
struct rtentry; /* ifa_rtrequest */
+struct nhop_object; /* ifa_rtrequest */
struct rt_addrinfo; /* ifa_rtrequest */
struct socket;
struct carp_if;
@@ -551,7 +552,7 @@
struct carp_softc *ifa_carp; /* pointer to CARP data */
CK_STAILQ_ENTRY(ifaddr) ifa_link; /* queue macro glue */
void (*ifa_rtrequest) /* check or clean routes (+ or -)'d */
- (int, struct rtentry *, struct rt_addrinfo *);
+ (int, struct rtentry *, struct nhop_object *, struct rt_addrinfo *);
u_short ifa_flags; /* mostly rt_flags for cloning */
#define IFA_ROUTE RTF_UP /* route installed */
#define IFA_RTSELF RTF_HOST /* loopback route to self installed */
Index: sys/net/route.h
===================================================================
--- sys/net/route.h
+++ sys/net/route.h
@@ -35,7 +35,6 @@
#ifndef _NET_ROUTE_H_
#define _NET_ROUTE_H_
-#include <sys/counter.h>
#include <net/vnet.h>
/*
@@ -45,13 +44,14 @@
* are set by making entries for all directly connected interfaces.
*/
+struct nhop_object;
/*
* Struct route consiste of a destination address,
* a route entry pointer, link-layer prepend data pointer along
* with its length.
*/
struct route {
- struct rtentry *ro_rt;
+ struct nhop_object *ro_nh;
struct llentry *ro_lle;
/*
* ro_prepend and ro_plen are only used for bpf to pass in a
@@ -118,6 +118,10 @@
extern u_int rt_numfibs; /* number of usable routing tables */
VNET_DECLARE(u_int, rt_add_addr_allfibs); /* Announce interfaces to all fibs */
#define V_rt_add_addr_allfibs VNET(rt_add_addr_allfibs)
+
+/* Calculate flowid for locally-originated packets */
+#define V_fib_hash_outbound VNET(fib_hash_outbound)
+VNET_DECLARE(u_int, fib_hash_outbound);
#endif
/*
@@ -128,41 +132,7 @@
* gateways are marked so that the output routines know to address the
* gateway rather than the ultimate destination.
*/
-#ifndef RNF_NORMAL
-#include <net/radix.h>
-#ifdef RADIX_MPATH
-#include <net/radix_mpath.h>
-#endif
-#endif
-#if defined(_KERNEL)
-struct rtentry {
- struct radix_node rt_nodes[2]; /* tree glue, and other values */
- /*
- * XXX struct rtentry must begin with a struct radix_node (or two!)
- * because the code does some casts of a 'struct radix_node *'
- * to a 'struct rtentry *'
- */
-#define rt_key(r) (*((struct sockaddr **)(&(r)->rt_nodes->rn_key)))
-#define rt_mask(r) (*((struct sockaddr **)(&(r)->rt_nodes->rn_mask)))
-#define rt_key_const(r) (*((const struct sockaddr * const *)(&(r)->rt_nodes->rn_key)))
-#define rt_mask_const(r) (*((const struct sockaddr * const *)(&(r)->rt_nodes->rn_mask)))
- struct sockaddr *rt_gateway; /* value */
- struct ifnet *rt_ifp; /* the answer: interface to use */
- struct ifaddr *rt_ifa; /* the answer: interface address to use */
- int rt_flags; /* up/down?, host/net */
- int rt_refcnt; /* # held references */
- u_int rt_fibnum; /* which FIB */
- u_long rt_mtu; /* MTU for this path */
- u_long rt_weight; /* absolute weight */
- u_long rt_expire; /* lifetime for route, e.g. redirect */
-#define rt_endzero rt_pksent
- counter_u64_t rt_pksent; /* packets sent using this route */
- struct mtx rt_mtx; /* mutex for routing entry */
- struct rtentry *rt_chain; /* pointer to next rtentry to delete */
-};
-#endif /* _KERNEL */
-
#define RTF_UP 0x1 /* route usable */
#define RTF_GATEWAY 0x2 /* destination is a gateway */
#define RTF_HOST 0x4 /* host entry (net otherwise) */
@@ -197,15 +167,17 @@
with existing routing apps */
/* Mask of RTF flags that are allowed to be modified by RTM_CHANGE. */
-#define RTF_FMASK \
- (RTF_PROTO1 | RTF_PROTO2 | RTF_PROTO3 | RTF_BLACKHOLE | \
- RTF_REJECT | RTF_STATIC | RTF_STICKY)
+#define RIB_RTE_CHANGE_MASK (RTF_GATEWAY | RTF_REJECT | RTF_DYNAMIC | \
+ RTF_STATIC | RTF_BLACKHOLE | RTF_PROTO1 | RTF_PROTO2 | RTF_PROTO3 | \
+ RTF_FIXEDMTU)
/*
* fib_ nexthop API flags.
*/
/* Consumer-visible nexthop info flags */
+#define NHF_INVALID 0x0001 /* Nexthop is not usable */
+#define NHF_MULTIPATH 0x0008 /* Nexthop is a multipath group */
#define NHF_REJECT 0x0010 /* RTF_REJECT */
#define NHF_BLACKHOLE 0x0020 /* RTF_BLACKHOLE */
#define NHF_REDIRECT 0x0040 /* RTF_DYNAMIC|RTF_MODIFIED */
@@ -215,27 +187,16 @@
#define NHF_HOST 0x0400 /* RTF_HOST */
/* Nexthop request flags */
+#define NHR_NONE 0x00 /* empty flags field */
#define NHR_IFAIF 0x01 /* Return ifa_ifp interface */
#define NHR_REF 0x02 /* For future use */
+/* uRPF */
+#define NHR_NODEFAULT 0x04 /* do not consider default route */
+
/* Control plane route request flags */
#define NHR_COPY 0x100 /* Copy rte data */
-#ifdef _KERNEL
-/* rte<>ro_flags translation */
-static inline void
-rt_update_ro_flags(struct route *ro)
-{
- int rt_flags = ro->ro_rt->rt_flags;
-
- ro->ro_flags &= ~ (RT_REJECT|RT_BLACKHOLE|RT_HAS_GW);
-
- ro->ro_flags |= (rt_flags & RTF_REJECT) ? RT_REJECT : 0;
- ro->ro_flags |= (rt_flags & RTF_BLACKHOLE) ? RT_BLACKHOLE : 0;
- ro->ro_flags |= (rt_flags & RTF_GATEWAY) ? RT_HAS_GW : 0;
-}
-#endif
-
/*
* Routing statistics.
*/
@@ -245,6 +206,15 @@
uint64_t rts_newgateway; /* routes modified by redirects */
uint64_t rts_unreach; /* lookups which failed */
uint64_t rts_wildcard; /* lookups satisfied by a wildcard */
+ uint64_t rts_add_success; /* number of routes added */
+ uint64_t rts_add_algo_fail; /* failures to add a routing entry */
+ uint64_t rts_add_pinned; /* number of pinned routes added */
+ uint64_t rts_add_retry; /* number of add retries */
+ uint64_t rts_mpath_ineligible; /* number of ineligible mpath add failures */
+ uint64_t rts_del_fail_priority; /* # of delete failures due to priority */
+ uint64_t rts_del_algo_fail; /* # of algorithm failures to delete an entry */
+ uint64_t rts_del_success; /* number of successful deletes */
+ uint64_t rts_del_retry; /* number of delete retries */
};
/*
@@ -338,7 +308,8 @@
#define RTAX_BRD 7 /* for NEWADDR, broadcast or p-p dest addr */
#define RTAX_MAX 8 /* size of array to allocate */
-typedef int rt_filter_f_t(const struct rtentry *, void *);
+struct rtentry;
+typedef int rt_filter_f_t(const struct rtentry *, const struct nhop_object *, void *);
struct rt_addrinfo {
int rti_addrs; /* Route RTF_ flags */
@@ -373,58 +344,22 @@
#define RT_LINK_IS_UP(ifp) (!((ifp)->if_capabilities & IFCAP_LINKSTATE) \
|| (ifp)->if_link_state == LINK_STATE_UP)
-#define RT_LOCK_INIT(_rt) \
- mtx_init(&(_rt)->rt_mtx, "rtentry", NULL, MTX_DEF | MTX_DUPOK | MTX_NEW)
-#define RT_LOCK(_rt) mtx_lock(&(_rt)->rt_mtx)
-#define RT_UNLOCK(_rt) mtx_unlock(&(_rt)->rt_mtx)
-#define RT_LOCK_DESTROY(_rt) mtx_destroy(&(_rt)->rt_mtx)
-#define RT_LOCK_ASSERT(_rt) mtx_assert(&(_rt)->rt_mtx, MA_OWNED)
-#define RT_UNLOCK_COND(_rt) do { \
- if (mtx_owned(&(_rt)->rt_mtx)) \
- mtx_unlock(&(_rt)->rt_mtx); \
-} while (0)
-
-#define RT_ADDREF(_rt) do { \
- RT_LOCK_ASSERT(_rt); \
- KASSERT((_rt)->rt_refcnt >= 0, \
- ("negative refcnt %d", (_rt)->rt_refcnt)); \
- (_rt)->rt_refcnt++; \
-} while (0)
-
-#define RT_REMREF(_rt) do { \
- RT_LOCK_ASSERT(_rt); \
- KASSERT((_rt)->rt_refcnt > 0, \
- ("bogus refcnt %d", (_rt)->rt_refcnt)); \
- (_rt)->rt_refcnt--; \
-} while (0)
-
-#define RTFREE_LOCKED(_rt) do { \
- if ((_rt)->rt_refcnt <= 1) \
- rtfree(_rt); \
- else { \
- RT_REMREF(_rt); \
- RT_UNLOCK(_rt); \
+#define RO_NHFREE(_ro) do { \
+ if ((_ro)->ro_nh) { \
+ NH_FREE((_ro)->ro_nh); \
+ (_ro)->ro_nh = NULL; \
} \
- /* guard against invalid refs */ \
- _rt = 0; \
} while (0)
-#define RTFREE(_rt) do { \
- RT_LOCK(_rt); \
- RTFREE_LOCKED(_rt); \
-} while (0)
-
-#define RO_RTFREE(_ro) do { \
- if ((_ro)->ro_rt) \
- RTFREE((_ro)->ro_rt); \
-} while (0)
-
#define RO_INVALIDATE_CACHE(ro) do { \
- RO_RTFREE(ro); \
if ((ro)->ro_lle != NULL) { \
LLE_FREE((ro)->ro_lle); \
(ro)->ro_lle = NULL; \
} \
+ if ((ro)->ro_nh != NULL) { \
+ NH_FREE((ro)->ro_nh); \
+ (ro)->ro_nh = NULL; \
+ } \
} while (0)
/*
@@ -432,7 +367,7 @@
* out-of-date cache, simply free it. Update the generation number
* for the new allocation
*/
-#define RT_VALIDATE(ro, cookiep, fibnum) do { \
+#define NH_VALIDATE(ro, cookiep, fibnum) do { \
rt_gen_t cookie = RT_GEN(fibnum, (ro)->ro_dst.sa_family); \
if (*(cookiep) != cookie) { \
RO_INVALIDATE_CACHE(ro); \
@@ -440,6 +375,25 @@
} \
} while (0)
+/* Keep values consistent with RTM_ ones for now */
+enum rib_cmd_type {
+ RIB_ADD = 1, /* Add route to the RIB */
+ RIB_DEL = 2, /* Delete route from the RIB */
+ RIB_CHANGE = 3, /* Change route properties */
+};
+
+struct rib_cmd_info {
+ uint8_t cmd; /* RTM_ADD|RTM_DEL|RTM_CHANGE */
+ uint8_t num_changed; /* Number of changed nhops */
+ uint8_t idx_changed; /* Index of the first changed nhop */
+ uint8_t spare;
+ uint32_t rt_weight; /* new weight */
+ struct rtentry *rt; /* Target entry */
+ struct nhop_object *nh_old; /* Target nhop OR mpath */
+ struct nhop_object *nh_new; /* Target nhop OR mpath */
+ uint64_t mask_changed; /* Bitmask of changed nhops */
+};
+
struct ifmultiaddr;
struct rib_head;
@@ -450,18 +404,18 @@
void rt_missmsg_fib(int, struct rt_addrinfo *, int, int, int);
void rt_newaddrmsg_fib(int, struct ifaddr *, struct rtentry *, int);
int rt_addrmsg(int, struct ifaddr *, int);
-int rt_routemsg(int, struct rtentry *, struct ifnet *ifp, int, int);
+int rt_routemsg(int, struct rtentry *, struct nhop_object *, int);
int rt_routemsg_info(int, struct rt_addrinfo *, int);
void rt_newmaddrmsg(int, struct ifmultiaddr *);
-int rt_setgate(struct rtentry *, struct sockaddr *, struct sockaddr *);
void rt_maskedcopy(struct sockaddr *, struct sockaddr *, struct sockaddr *);
struct rib_head *rt_table_init(int, int, u_int);
void rt_table_destroy(struct rib_head *);
u_int rt_tables_get_gen(int table, int fam);
+int rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum);
int rtsock_addrmsg(int, struct ifaddr *, int);
-int rtsock_routemsg(int, struct rtentry *, struct ifnet *ifp, int, int);
-int rtsock_routemsg_info(int, struct rt_addrinfo *, int);
+int rtsock_routemsg(int, struct rtentry *, struct nhop_object *, int);
+int rtsock_routemsg_info(int cmd, struct rt_addrinfo *info, int fibnum);
/*
* Note the following locking behavior:
@@ -487,25 +441,67 @@
/* XXX MRT COMPAT VERSIONS THAT SET UNIVERSE to 0 */
/* Thes are used by old code not yet converted to use multiple FIBS */
-struct rtentry *rtalloc1(struct sockaddr *, int, u_long);
int rtinit(struct ifaddr *, int, int);
/* XXX MRT NEW VERSIONS THAT USE FIBs
* For now the protocol indepedent versions are the same as the AF_INET ones
* but this will change..
*/
-void rtalloc_ign_fib(struct route *ro, u_long ignflags, u_int fibnum);
-struct rtentry *rtalloc1_fib(struct sockaddr *, int, u_long, u_int);
int rtioctl_fib(u_long, caddr_t, u_int);
-int rtrequest_fib(int, struct sockaddr *,
- struct sockaddr *, struct sockaddr *, int, struct rtentry **, u_int);
-int rtrequest1_fib(int, struct rt_addrinfo *, struct rtentry **, u_int);
+
int rib_lookup_info(uint32_t, const struct sockaddr *, uint32_t, uint32_t,
struct rt_addrinfo *);
void rib_free_info(struct rt_addrinfo *info);
int rib_add_redirect(u_int fibnum, struct sockaddr *dst,
struct sockaddr *gateway, struct sockaddr *author, struct ifnet *ifp,
int flags, int expire_sec);
+
+/* New API */
+#define ROUTE_DEFAULT_WEIGHT 100
+#define ROUTE_MAX_WEIGHT 16777215 /* Limit weight to 3 bytes */
+
+int rib_add_route(u_int fibnum, struct rt_addrinfo *info,
+ struct rib_cmd_info *rc);
+int rib_del_route(u_int fibnum, struct rt_addrinfo *info,
+ struct rib_cmd_info *rc);
+int rib_change_route(u_int fibnum, struct rt_addrinfo *info,
+ struct rib_cmd_info *rc);
+int rib_request(enum rib_cmd_type cmd, u_int fibnum,
+ struct rt_addrinfo *info, struct rib_cmd_info *rc);
+int rib_request_simple(enum rib_cmd_type cmd, u_int fibnum,
+ struct sockaddr *dst, struct sockaddr *mask, struct sockaddr *gw,
+ int rt_flags);
+
+int rib_lookup_route_netmask(u_int fibnum, const struct sockaddr *dst,
+ const struct sockaddr *mask, struct rtentry **ret);
+
+int rib_get_entry_prefix(const struct rtentry *rt, struct sockaddr *dst,
+ struct sockaddr *netmask, int *plen);
+struct sockaddr *rib_get_entry_netmask_sa(const struct rtentry *rt,
+ struct sockaddr *netmask, size_t sa_len, int *error);
+struct sockaddr *rib_get_entry_dst_sa(const struct rtentry *rt,
+ struct sockaddr *dst, size_t sa_len, int *error);
+int rib_get_entry_weight(const struct rtentry *rt);
+unsigned long rib_get_entry_expire_time(const struct rtentry *rt);
+int rib_get_entry_rtflags(const struct rtentry *rt,
+ const struct nhop_object *nh);
+const struct nhop_object *rib_get_entry_nhop(const struct rtentry *rt);
+sa_family_t rib_get_entry_family(const struct rtentry *rt);
+unsigned int rib_get_entry_fibnum(const struct rtentry *rt);
+int rib_get_entry_plen(const struct rtentry *rt);
+int rib_can_export_rte(struct ucred *td_ucred, const struct rtentry *rt);
+
+/* Helper functions */
+typedef void(route_notification_t)(int cmd, struct rib_head *rnh,
+ struct rt_addrinfo *info, struct rtentry *rt, struct nhop_object *nh_old,
+ struct nhop_object *nh_new, uint32_t weight, void *cbdata);
+int rib_decompose_notification(struct rib_head *rnh, struct rt_addrinfo *info,
+ struct rib_cmd_info *rc, route_notification_t *cb, void *cbdata);
+void rib_notify_subscribers(struct rib_head *rnh, struct rt_addrinfo *info,
+ struct rib_cmd_info *rc);
+
+int rib_print_sockaddr(char *buf, int buflen, const struct sockaddr *s);
+void rib_walk(int af, u_int fibnum, rt_walktree_f_t *wa_f, void *arg);
#endif
Index: sys/net/route.c
===================================================================
--- sys/net/route.c
+++ sys/net/route.c
@@ -61,7 +61,11 @@
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/route.h>
+#include <net/route/nhop.h>
+#include <net/route/shared.h>
+#define NEED_RTZONE
#include <net/route_var.h>
+#include <net/route/rtentry_var.h>
#include <net/vnet.h>
#ifdef RADIX_MPATH
@@ -108,7 +112,7 @@
SYSCTL_UINT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RWTUN | CTLFLAG_VNET,
&VNET_NAME(rt_add_addr_allfibs), 0, "");
-VNET_PCPUSTAT_DEFINE_STATIC(struct rtstat, rtstat);
+VNET_PCPUSTAT_DEFINE(struct rtstat, rtstat);
#define RTSTAT_ADD(name, val) \
VNET_PCPUSTAT_ADD(struct rtstat, rtstat, name, (val))
#define RTSTAT_INC(name) RTSTAT_ADD(name, 1)
@@ -124,47 +128,30 @@
VNET_DEFINE(int, rttrash); /* routes not in table but not freed */
#define V_rttrash VNET(rttrash)
+#if 0
+VNET_DEFINE(uma_zone_t, rtzone); /* Routing table UMA zone. */
+#endif
+uma_zone_t rtzone; /* Routing table UMA zone. */
-/*
- * Convert a 'struct radix_node *' to a 'struct rtentry *'.
- * The operation can be done safely (in this code) because a
- * 'struct rtentry' starts with two 'struct radix_node''s, the first
- * one representing leaf nodes in the routing tree, which is
- * what the code in radix.c passes us as a 'struct radix_node'.
- *
- * But because there are a lot of assumptions in this conversion,
- * do not cast explicitly, but always use the macro below.
- */
-#define RNTORT(p) ((struct rtentry *)(p))
-
-VNET_DEFINE_STATIC(uma_zone_t, rtzone); /* Routing table UMA zone. */
-#define V_rtzone VNET(rtzone)
-
EVENTHANDLER_LIST_DEFINE(rt_addrmsg);
-static int rt_getifa_fib(struct rt_addrinfo *, u_int);
-static int rtrequest1_fib_change(struct rib_head *, struct rt_addrinfo *,
- struct rtentry **, u_int);
-static void rt_setmetrics(const struct rt_addrinfo *, struct rtentry *);
-static int rt_ifdelroute(const struct rtentry *rt, void *arg);
-static struct rtentry *rt_unlinkrte(struct rib_head *rnh,
- struct rt_addrinfo *info, int *perror);
-static void rt_notifydelete(struct rtentry *rt, struct rt_addrinfo *info);
+static void destroy_rtentry(struct rtentry *rt);
+static void destroy_rtentry_epoch(epoch_context_t ctx);
+
+static int rt_ifdelroute(const struct rtentry *rt, const struct nhop_object *,
+ void *arg);
+static void rt_notifydelete(struct rtentry *rt, struct nhop_object *nh,
+ struct rt_addrinfo *info);
#ifdef RADIX_MPATH
static struct radix_node *rt_mpath_unlink(struct rib_head *rnh,
struct rt_addrinfo *info, struct rtentry *rto, int *perror);
#endif
-static int rt_exportinfo(struct rtentry *rt, struct rt_addrinfo *info,
- int flags);
+static int rt_exportinfo(struct rtentry *rt, struct nhop_object *nh,
+ struct rt_addrinfo *info, int flags);
-struct if_mtuinfo
-{
- struct ifnet *ifp;
- int mtu;
-};
+int p_sockaddr(char *buf, int buflen, const struct sockaddr *s, int family);
+int rt_print(char *buf, int buflen, const struct rtentry *rt);
-static int if_updatemtu_cb(struct radix_node *, void *);
-
/*
* handler for net.my_fibnum
*/
@@ -222,31 +209,11 @@
}
-/*
- * route initialization must occur before ip6_init2(), which happenas at
- * SI_ORDER_MIDDLE.
- */
-static void
-route_init(void)
-{
-
- /* whack the tunable ints into line. */
- if (rt_numfibs > RT_MAXFIBS)
- rt_numfibs = RT_MAXFIBS;
- if (rt_numfibs == 0)
- rt_numfibs = 1;
-}
-SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, NULL);
-
static int
rtentry_zinit(void *mem, int size, int how)
{
struct rtentry *rt = mem;
- rt->rt_pksent = counter_u64_alloc(how);
- if (rt->rt_pksent == NULL)
- return (ENOMEM);
-
RT_LOCK_INIT(rt);
return (0);
@@ -258,7 +225,6 @@
struct rtentry *rt = mem;
RT_LOCK_DESTROY(rt);
- counter_u64_free(rt->rt_pksent);
}
static int
@@ -267,7 +233,6 @@
struct rtentry *rt = mem;
bzero(rt, offsetof(struct rtentry, rt_endzero));
- counter_u64_zero(rt->rt_pksent);
rt->rt_chain = NULL;
return (0);
@@ -281,7 +246,26 @@
RT_UNLOCK_COND(rt);
}
+/*
+ * route initialization must occur before ip6_init2(), which happens at
+ * SI_ORDER_MIDDLE.
+ */
static void
+route_init(void)
+{
+
+ /* whack the tunable ints into line. */
+ if (rt_numfibs > RT_MAXFIBS)
+ rt_numfibs = RT_MAXFIBS;
+ if (rt_numfibs == 0)
+ rt_numfibs = 1;
+ V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry),
+ rtentry_ctor, rtentry_dtor,
+ rtentry_zinit, rtentry_zfini, UMA_ALIGN_PTR, 0);
+}
+SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, NULL);
+
+static void
vnet_route_init(const void *unused __unused)
{
struct domain *dom;
@@ -292,9 +276,11 @@
V_rt_tables = malloc(rt_numfibs * (AF_MAX+1) *
sizeof(struct rib_head *), M_RTABLE, M_WAITOK|M_ZERO);
+/*
V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry),
rtentry_ctor, rtentry_dtor,
rtentry_zinit, rtentry_zfini, UMA_ALIGN_PTR, 0);
+*/
for (dom = domains; dom; dom = dom->dom_next) {
if (dom->dom_rtattach == NULL)
continue;
@@ -323,6 +309,9 @@
struct domain *dom;
struct rib_head **rnh;
+ rnh = (struct rib_head **)V_rt_tables;
+ printf("--VNET V_rt_tables=%p\n", rnh);
+
for (dom = domains; dom; dom = dom->dom_next) {
if (dom->dom_rtdetach == NULL)
continue;
@@ -341,7 +330,7 @@
}
free(V_rt_tables, M_RTABLE);
- uma_zdestroy(V_rtzone);
+ //uma_zdestroy(V_rtzone);
}
VNET_SYSUNINIT(vnet_route_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST,
vnet_route_uninit, 0);
@@ -372,6 +361,8 @@
/* Init locks */
RIB_LOCK_INIT(rh);
+ nhops_init(rh);
+
/* Finally, set base callbacks */
rh->rnh_addaddr = rn_addroute;
rh->rnh_deladdr = rn_delete;
@@ -403,6 +394,8 @@
rn_walktree(&rh->rmhead.head, rt_freeentry, &rh->rmhead.head);
+ nhops_destroy(rh);
+
/* Assume table is already empty */
RIB_LOCK_DESTROY(rh);
free(rh, M_RTABLE);
@@ -423,39 +416,7 @@
return (0);
}
-/*
- * Packet routing routines.
- */
-void
-rtalloc_ign_fib(struct route *ro, u_long ignore, u_int fibnum)
-{
- struct rtentry *rt;
-
- if ((rt = ro->ro_rt) != NULL) {
- if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP)
- return;
- RTFREE(rt);
- ro->ro_rt = NULL;
- }
- ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, fibnum);
- if (ro->ro_rt)
- RT_UNLOCK(ro->ro_rt);
-}
-
-/*
- * Look up the route that matches the address given
- * Or, at least try.. Create a cloned route if needed.
- *
- * The returned route, if any, is locked.
- */
struct rtentry *
-rtalloc1(struct sockaddr *dst, int report, u_long ignflags)
-{
-
- return (rtalloc1_fib(dst, report, ignflags, RT_DEFAULT_FIB));
-}
-
-struct rtentry *
rtalloc1_fib(struct sockaddr *dst, int report, u_long ignflags,
u_int fibnum)
{
@@ -534,9 +495,34 @@
RT_REMREF(rt);
if (rt->rt_refcnt > 0) {
log(LOG_DEBUG, "%s: %p has %d refs\n", __func__, rt, rt->rt_refcnt);
- goto done;
+ RT_UNLOCK(rt);
+ return;
}
+ RT_UNLOCK(rt);
+ /* Save curvnet */
+#ifdef VIMAGE
+ rt->rt_chain = (struct rtentry *)curvnet;
+#else
+ rt->rt_chain = NULL;
+#endif
+
+ epoch_call(net_epoch_preempt, destroy_rtentry_epoch, &rt->rt_epoch_ctx);
+}
+
+
+__noinline static void
+destroy_rtentry(struct rtentry *rt)
+{
+#if 0
+ struct rib_head *rnh;
+
+#ifdef VIMAGE
+ CURVNET_SET((struct vnet *)rt->rt_chain);
+#endif
+ rnh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family);
+ KASSERT(rnh != NULL,("%s: NULL rnh", __func__));
+
/*
* On last reference give the "close method" a chance
* to cleanup private state. This also permits (for
@@ -548,50 +534,59 @@
*/
if (rt->rt_refcnt == 0 && rnh->rnh_close)
rnh->rnh_close((struct radix_node *)rt, &rnh->head);
+#endif
/*
* If we are no longer "up" (and ref == 0)
* then we can free the resources associated
* with the route.
*/
- if ((rt->rt_flags & RTF_UP) == 0) {
+ if (!RT_IS_UP(rt)) {
if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT))
panic("rtfree 2");
/*
* the rtentry must have been removed from the routing table
* so it is represented in rttrash.. remove that now.
*/
- V_rttrash--;
+ //V_rttrash--;
#ifdef DIAGNOSTIC
if (rt->rt_refcnt < 0) {
printf("rtfree: %p not freed (neg refs)\n", rt);
- goto done;
+ CURVNET_RESTORE();
+ return;
}
#endif
/*
- * release references on items we hold them on..
- * e.g other routes and ifaddrs.
- */
- if (rt->rt_ifa)
- ifa_free(rt->rt_ifa);
- /*
* The key is separatly alloc'd so free it (see rt_setgate()).
* This also frees the gateway, as they are always malloc'd
* together.
*/
- R_Free(rt_key(rt));
+ if (rt_key(rt) != &rt->rt_dst)
+ free(rt_key(rt), M_RTABLE);
/*
* and the rtentry itself of course
*/
uma_zfree(V_rtzone, rt);
- return;
}
-done:
- RT_UNLOCK(rt);
+
+ //CURVNET_RESTORE();
}
/*
+ * Epoch callback indicating rtentry is safe to destroy
+ */
+static void
+destroy_rtentry_epoch(epoch_context_t ctx)
+{
+ struct rtentry *rt;
+
+ rt = __containerof(ctx, struct rtentry, rt_epoch_ctx);
+
+ destroy_rtentry(rt);
+}
+
+/*
* Adds a temporal redirect entry to the routing table.
* @fibnum: fib number
* @dst: destination to install redirect to
@@ -607,9 +602,9 @@
rib_add_redirect(u_int fibnum, struct sockaddr *dst, struct sockaddr *gateway,
struct sockaddr *author, struct ifnet *ifp, int flags, int lifetime_sec)
{
- struct rtentry *rt;
int error;
struct rt_addrinfo info;
+ struct rib_cmd_info rc;
struct rt_metrics rti_rmx;
struct ifaddr *ifa;
@@ -641,7 +636,7 @@
info.rti_mflags |= RTV_EXPIRE;
info.rti_rmx = &rti_rmx;
- error = rtrequest1_fib(RTM_ADD, &info, &rt, fibnum);
+ error = rib_add_route(fibnum, &info, &rc);
ifa_free(ifa);
if (error != 0) {
@@ -649,9 +644,7 @@
return (error);
}
- RT_LOCK(rt);
- flags = rt->rt_flags;
- RTFREE_LOCKED(rt);
+ flags = rib_get_entry_rtflags(rc.rt, rc.nh_new);
RTSTAT_INC(rts_dynamic);
@@ -719,6 +712,7 @@
ifa = ifa_ifwithnet(gateway, 0, fibnum);
if (ifa == NULL) {
struct rtentry *rt;
+ struct nhop_object *nh;
rt = rtalloc1_fib(gateway, 0, flags, fibnum);
if (rt == NULL)
@@ -739,8 +733,9 @@
default:
break;
}
- if (!not_found && rt->rt_ifa != NULL) {
- ifa = rt->rt_ifa;
+ nh = RT_SELECT_NHOP(rt, 0);
+ if (!not_found && nh->nh_ifa != NULL) {
+ ifa = nh->nh_ifa;
}
RT_REMREF(rt);
RT_UNLOCK(rt);
@@ -758,33 +753,6 @@
}
/*
- * Do appropriate manipulations of a routing tree given
- * all the bits of info needed
- */
-int
-rtrequest_fib(int req,
- struct sockaddr *dst,
- struct sockaddr *gateway,
- struct sockaddr *netmask,
- int flags,
- struct rtentry **ret_nrt,
- u_int fibnum)
-{
- struct rt_addrinfo info;
-
- if (dst->sa_len == 0)
- return(EINVAL);
-
- bzero((caddr_t)&info, sizeof(info));
- info.rti_flags = flags;
- info.rti_info[RTAX_DST] = dst;
- info.rti_info[RTAX_GATEWAY] = gateway;
- info.rti_info[RTAX_NETMASK] = netmask;
- return rtrequest1_fib(req, &info, ret_nrt, fibnum);
-}
-
-
-/*
* Copy most of @rt data into @info.
*
* If @flags contains NHR_COPY, copies dst,netmask and gw to the
@@ -798,7 +766,8 @@
* Returns 0 on success.
*/
int
-rt_exportinfo(struct rtentry *rt, struct rt_addrinfo *info, int flags)
+rt_exportinfo(struct rtentry *rt, struct nhop_object *nh,
+ struct rt_addrinfo *info, int flags)
{
struct rt_metrics *rmx;
struct sockaddr *src, *dst;
@@ -833,9 +802,9 @@
}
/* Copy gateway is set && dst is non-zero */
- src = rt->rt_gateway;
+ src = &nh->gw_sa;
dst = info->rti_info[RTAX_GATEWAY];
- if ((rt->rt_flags & RTF_GATEWAY) && src != NULL && dst != NULL){
+ if ((nh->nh_flags & NHF_GATEWAY) && src != NULL && dst != NULL){
if (src->sa_len > dst->sa_len)
return (ENOMEM);
memcpy(dst, src, src->sa_len);
@@ -848,8 +817,8 @@
info->rti_info[RTAX_NETMASK] = rt_mask(rt);
info->rti_addrs |= RTA_NETMASK;
}
- if (rt->rt_flags & RTF_GATEWAY) {
- info->rti_info[RTAX_GATEWAY] = rt->rt_gateway;
+ if (nh->nh_flags & NHF_GATEWAY) {
+ info->rti_info[RTAX_GATEWAY] = &nh->gw_sa;
info->rti_addrs |= RTA_GATEWAY;
}
}
@@ -857,12 +826,12 @@
rmx = info->rti_rmx;
if (rmx != NULL) {
info->rti_mflags |= RTV_MTU;
- rmx->rmx_mtu = rt->rt_mtu;
+ rmx->rmx_mtu = nh->nh_mtu;
}
- info->rti_flags = rt->rt_flags;
- info->rti_ifp = rt->rt_ifp;
- info->rti_ifa = rt->rt_ifa;
+ info->rti_flags = rib_get_entry_rtflags(rt, nh);
+ info->rti_ifp = nh->nh_ifp;
+ info->rti_ifa = nh->nh_ifa;
if (flags & NHR_REF) {
if_ref(info->rti_ifp);
ifa_ref(info->rti_ifa);
@@ -889,6 +858,7 @@
struct rib_head *rh;
struct radix_node *rn;
struct rtentry *rt;
+ struct nhop_object *nh;
int error;
KASSERT((fibnum < rt_numfibs), ("rib_lookup_rte: bad fibnum"));
@@ -900,10 +870,11 @@
rn = rh->rnh_matchaddr(__DECONST(void *, dst), &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
rt = RNTORT(rn);
+ nh = RT_SELECT_NHOP((RNTORT(rn)), flowid);
/* Ensure route & ifp is UP */
- if (RT_LINK_IS_UP(rt->rt_ifp)) {
+ if (RT_LINK_IS_UP(nh->nh_ifp)) {
flags = (flags & NHR_REF) | NHR_COPY;
- error = rt_exportinfo(rt, info, flags);
+ error = rt_exportinfo(rt, nh, info, flags);
RIB_RUNLOCK(rh);
return (error);
@@ -927,10 +898,10 @@
}
/*
- * Iterates over all existing fibs in system calling
- * @setwa_f function prior to traversing each fib.
+ * Iterates over all existing fibs in system and deletes each element
+ * for which @filter_f function returns non-zero value.
* Calls @wa_f function for each element in current fib.
- * If af is not AF_UNSPEC, iterates over fibs in particular
+ * If @family is not AF_UNSPEC, iterates over fibs in particular
* address family.
*/
void
@@ -938,11 +909,11 @@
void *arg)
{
struct rib_head *rnh;
- uint32_t fibnum;
+ u_int fibnum;
int i;
for (fibnum = 0; fibnum < rt_numfibs; fibnum++) {
- /* Do we want some specific family? */
+ /* Do we want some specific af? */
if (af != AF_UNSPEC) {
rnh = rt_tables_get_rnh(fibnum, af);
if (rnh == NULL)
@@ -975,36 +946,148 @@
struct rt_addrinfo info;
struct rib_head *rnh;
struct rtentry *head;
+ int error_count;
};
+
+#ifdef ROUTE_MPATH
/*
- * Conditionally unlinks @rn from radix tree based
- * on info data passed in @arg.
+ * Helper function to remove matching paths from multipath route.
+ * @rt: prefix rtentry
+ * @di: filter function and data
*/
+static void
+rt_checkdelroute_mpath(struct rtentry *rt, struct rt_delinfo *di)
+{
+	struct nhop_object *nh_new;
+	struct nhop_mpath *mp, *mp_new;
+	struct rt_addrinfo *info;
+	struct weightened_nhop *wn;
+	struct radix_node *rn;
+	uint64_t del_mask;
+	uint32_t count, num_nhops, weight_new;
+
+	/*
+	 * Outcomes, by number of paths matched by the filter:
+	 *  - none:        route is left untouched;
+	 *  - all:         route is unlinked and chained on di->head;
+	 *  - all but one: route is switched to the single surviving nexthop;
+	 *  - otherwise:   route is switched to a new, smaller mpath group.
+	 * Failures are accounted in di->error_count.
+	 */
+	count = 0;
+	del_mask = 0;
+	info = &di->info;
+	nh_new = NULL;
+	weight_new = 0;
+
+	mp = (struct nhop_mpath *)rt->rt_nhop;
+	wn = mpath_get_nhops(mp, &num_nhops);
+	/*
+	 * del_mask carries one bit per path.
+	 * NOTE(review): assumes a group never holds more than 64 nexthops --
+	 * confirm against the group size limit.
+	 * NOTE(review): rt_checkdelroute() spells the filter call as
+	 * info->rti_filter(rt, nh, info->rti_filterdata) -- confirm which
+	 * form matches struct rt_addrinfo.
+	 */
+	for (uint32_t i = 0; i < num_nhops; i++) {
+		if (info->rti_filter.func(rt, wn[i].nh,
+		    info->rti_filter.data) != 0) {
+			/* Shift in 64 bits: (1 << i) is an int shift. */
+			del_mask |= ((uint64_t)1 << i);
+			count++;
+		} else {
+			/*
+			 * Remember a path that survives; it becomes the
+			 * route's nexthop if it is the only one left.
+			 */
+			nh_new = wn[i].nh;
+			weight_new = wn[i].weight;
+		}
+	}
+
+	if (count == 0) {
+		/* No matches, just return */
+		return;
+	}
+
+	if (count == num_nhops) {
+		/* Eliminated all paths */
+		rn = di->rnh->rnh_deladdr(rt_key(rt), rt_mask(rt),
+		    &di->rnh->head);
+		if (rn == NULL) {
+			di->error_count++;
+			return;
+		}
+		/*
+		 * Entry was unlinked. Lock, reference and chain it the same
+		 * way rt_checkdelroute() does: both kinds of entries end up
+		 * on the same di->head list.
+		 */
+		RT_LOCK(rt);
+		RT_ADDREF(rt);
+		rt->rte_flags &= ~RTF_UP;
+		rt->rt_chain = di->head;
+		di->head = rt;
+		return;
+	}
+
+	if (count + 1 == num_nhops) {
+		/*
+		 * Eliminated all but one path, so it's not multipath
+		 * group anymore.
+		 */
+		KASSERT((nh_new != NULL), ("nh_new == NULL"));
+
+		/* Switch mpath group to the single surviving nexthop */
+		nhop_ref_object(nh_new);
+
+		RT_LOCK(rt);
+		rt->rt_nhop = nh_new;
+		/* XXX: violates rte fields immutability */
+		rt->rt_weight = weight_new;
+		RT_UNLOCK(rt);
+
+		mpath_free_group(mp);
+		return;
+	}
+
+	/*
+	 * The worst case: new nhop group needs to be created, while radix
+	 * WLOCK is held
+	 */
+	mp_new = mpath_get_del_nhops(di->rnh->nh_control, mp, &del_mask);
+	if (mp_new == NULL) {
+		/*
+		 * Failed to create new nexthop group, thus
+		 * route deletion fails.
+		 *
+		 * Given that nexthops reference all necessary pieces
+		 * it may be not fatal for the system, however the stale route
+		 * has to be evicted somehow. Assume the routing daemon will
+		 * do the housekeeping.
+		 */
+		di->error_count++;
+		return;
+	}
+
+	/* Switch to a new mpath group, freeing the old one. */
+	RT_LOCK(rt);
+	rt->rt_nhop = (struct nhop_object *)mp_new;
+	RT_UNLOCK(rt);
+	mpath_free_group(mp);
+}
+#endif
+
static int
rt_checkdelroute(struct radix_node *rn, void *arg)
{
struct rt_delinfo *di;
struct rt_addrinfo *info;
struct rtentry *rt;
- int error;
+ struct nhop_object *nh;
+ struct rib_head *rnh;
di = (struct rt_delinfo *)arg;
rt = (struct rtentry *)rn;
+ rnh = di->rnh;
info = &di->info;
- error = 0;
- info->rti_info[RTAX_DST] = rt_key(rt);
- info->rti_info[RTAX_NETMASK] = rt_mask(rt);
- info->rti_info[RTAX_GATEWAY] = rt->rt_gateway;
+ nh = rt->rt_nhop;
- rt = rt_unlinkrte(di->rnh, info, &error);
- if (rt == NULL) {
- /* Either not allowed or not matched. Skip entry */
+#ifdef ROUTE_MPATH
+ if (NH_IS_MULTIPATH(nh)) {
+ rt_checkdelroute_mpath(rt, di);
return (0);
}
+#endif
+ if (info->rti_filter(rt, nh, info->rti_filterdata) == 0) {
+ /* Not matched */
+ return (0);
+ }
+ rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), &rnh->head);
+ if (rn == NULL) {
+ di->error_count++;
+ return (0);
+ }
/* Entry was unlinked. Add to the list and return */
+ RT_LOCK(rt);
+ RT_ADDREF(rt);
+ rt->rte_flags &= ~RTF_UP;
rt->rt_chain = di->head;
di->head = rt;
@@ -1019,6 +1102,10 @@
* @filter_f: function returning non-zero value for items to delete
* @arg: data to pass to the @filter_f function
* @report: true if rtsock notification is needed.
+ *
+ * Note: reporting only covers routes that are unlinked entirely.
+ * Paths removed from a multipath route that keeps at least one other
+ * path are NOT reported.
+ * The only current customer requiring reporting is temporal routes,
+ * which are not multipath.
*/
void
rib_walk_del(u_int fibnum, int family, rt_filter_f_t *filter_f, void *arg, bool report)
@@ -1026,6 +1113,7 @@
struct rib_head *rnh;
struct rt_delinfo di;
struct rtentry *rt;
+ struct nhop_object *nh;
rnh = rt_tables_get_rnh(fibnum, family);
if (rnh == NULL)
@@ -1053,12 +1141,37 @@
di.info.rti_info[RTAX_DST] = rt_key(rt);
di.info.rti_info[RTAX_NETMASK] = rt_mask(rt);
- rt_notifydelete(rt, &di.info);
+ nh = rt->rt_nhop;
+#ifdef ROUTE_MPATH
+ if (NH_IS_MULTIPATH(nh)) {
+ struct weightened_nhop *wn;
+ uint32_t num_nhops;
+
+ wn = nhgrp_get_nhops((struct nhgrp_object *)rt->rt_nhop,
+ &num_nhops);
+ for (uint32_t i = 0; i < num_nhops; i++) {
+ nh = wn[i].nh;
+ rt_notifydelete(rt, nh, &di.info);
+ if (report)
+ rt_routemsg(RTM_DELETE, rt, nh, fibnum);
+ }
+ nhgrp_free_group((struct nhgrp_object *)rt->rt_nhop);
+ } else
+#endif
+ {
+ rt_notifydelete(rt, nh, &di.info);
- if (report)
- rt_routemsg(RTM_DELETE, rt, rt->rt_ifp, 0, fibnum);
+ if (report)
+ rt_routemsg(RTM_DELETE, rt, nh, fibnum);
+ NH_FREE(nh);
+ }
+
RTFREE_LOCKED(rt);
}
+
+ if (di.error_count > 0)
+ log(LOG_ERR, "Unable to delete %u route(s) for fib %u in family %d\n",
+ di.error_count, fibnum, family);
}
/*
@@ -1107,18 +1220,18 @@
* errno failed - reason indicated
*/
static int
-rt_ifdelroute(const struct rtentry *rt, void *arg)
+rt_ifdelroute(const struct rtentry *rt, const struct nhop_object *nh, void *arg)
{
struct ifnet *ifp = arg;
- if (rt->rt_ifp != ifp)
+ if (nh->nh_ifp != ifp)
return (0);
/*
* Protect (sorta) against walktree recursion problems
* with cloned routes
*/
- if ((rt->rt_flags & RTF_UP) == 0)
+ if (!RT_IS_UP(rt))
return (0);
return (1);
@@ -1146,97 +1259,24 @@
rt_foreach_fib_walk_del(AF_UNSPEC, rt_ifdelroute, ifp);
}
-/*
- * Conditionally unlinks rtentry matching data inside @info from @rnh.
- * Returns unlinked, locked and referenced @rtentry on success,
- * Returns NULL and sets @perror to:
- * ESRCH - if prefix was not found,
- * EADDRINUSE - if trying to delete PINNED route without appropriate flag.
- * ENOENT - if supplied filter function returned 0 (not matched).
- */
-static struct rtentry *
-rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, int *perror)
-{
- struct sockaddr *dst, *netmask;
- struct rtentry *rt;
- struct radix_node *rn;
-
- dst = info->rti_info[RTAX_DST];
- netmask = info->rti_info[RTAX_NETMASK];
-
- rt = (struct rtentry *)rnh->rnh_lookup(dst, netmask, &rnh->head);
- if (rt == NULL) {
- *perror = ESRCH;
- return (NULL);
- }
-
- if ((info->rti_flags & RTF_PINNED) == 0) {
- /* Check if target route can be deleted */
- if (rt->rt_flags & RTF_PINNED) {
- *perror = EADDRINUSE;
- return (NULL);
- }
- }
-
- if (info->rti_filter != NULL) {
- if (info->rti_filter(rt, info->rti_filterdata) == 0) {
- /* Not matched */
- *perror = ENOENT;
- return (NULL);
- }
-
- /*
- * Filter function requested rte deletion.
- * Ease the caller work by filling in remaining info
- * from that particular entry.
- */
- info->rti_info[RTAX_GATEWAY] = rt->rt_gateway;
- }
-
- /*
- * Remove the item from the tree and return it.
- * Complain if it is not there and do no more processing.
- */
- *perror = ESRCH;
-#ifdef RADIX_MPATH
- if (rt_mpath_capable(rnh))
- rn = rt_mpath_unlink(rnh, info, rt, perror);
- else
-#endif
- rn = rnh->rnh_deladdr(dst, netmask, &rnh->head);
- if (rn == NULL)
- return (NULL);
-
- if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT))
- panic ("rtrequest delete");
-
- rt = RNTORT(rn);
- RT_LOCK(rt);
- RT_ADDREF(rt);
- rt->rt_flags &= ~RTF_UP;
-
- *perror = 0;
-
- return (rt);
-}
-
static void
-rt_notifydelete(struct rtentry *rt, struct rt_addrinfo *info)
+rt_notifydelete(struct rtentry *rt, struct nhop_object *nh, struct rt_addrinfo *info)
{
struct ifaddr *ifa;
/*
* give the protocol a chance to keep things in sync.
*/
- ifa = rt->rt_ifa;
+ ifa = nh->nh_ifa;
if (ifa != NULL && ifa->ifa_rtrequest != NULL)
- ifa->ifa_rtrequest(RTM_DELETE, rt, info);
+ ifa->ifa_rtrequest(RTM_DELETE, rt, nh, info);
/*
* One more rtentry floating around that is not
* linked to the routing table. rttrash will be decremented
* when RTFREE(rt) is eventually called.
*/
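+	/*
+	 * XXX: pcpu?
+	 * NOTE(review): the V_rttrash-- in the rtentry free path is commented
+	 * out by this change, so the comment above looks stale and the
+	 * counter would only grow -- confirm.
+	 */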
V_rttrash++;
}
@@ -1336,120 +1376,81 @@
return (error);
}
-static int
-if_updatemtu_cb(struct radix_node *rn, void *arg)
-{
- struct rtentry *rt;
- struct if_mtuinfo *ifmtu;
- rt = (struct rtentry *)rn;
- ifmtu = (struct if_mtuinfo *)arg;
-
- if (rt->rt_ifp != ifmtu->ifp)
- return (0);
-
- if (rt->rt_mtu >= ifmtu->mtu) {
- /* We have to decrease mtu regardless of flags */
- rt->rt_mtu = ifmtu->mtu;
- return (0);
- }
-
- /*
- * New MTU is bigger. Check if are allowed to alter it
- */
- if ((rt->rt_flags & (RTF_FIXEDMTU | RTF_GATEWAY | RTF_HOST)) != 0) {
-
- /*
- * Skip routes with user-supplied MTU and
- * non-interface routes
- */
- return (0);
- }
-
- /* We are safe to update route MTU */
- rt->rt_mtu = ifmtu->mtu;
-
- return (0);
-}
-
+/*
+ * Updates transmit mtu for all routes using interface @ifp.
+ */
void
rt_updatemtu(struct ifnet *ifp)
{
- struct if_mtuinfo ifmtu;
struct rib_head *rnh;
+ uint32_t mtu;
int i, j;
- ifmtu.ifp = ifp;
-
/*
* Try to update rt_mtu for all routes using this interface
* Unfortunately the only way to do this is to traverse all
* routing tables in all fibs/domains.
*/
for (i = 1; i <= AF_MAX; i++) {
- ifmtu.mtu = if_getmtu_family(ifp, i);
+ mtu = if_getmtu_family(ifp, i);
for (j = 0; j < rt_numfibs; j++) {
rnh = rt_tables_get_rnh(j, i);
if (rnh == NULL)
continue;
- RIB_WLOCK(rnh);
- rnh->rnh_walktree(&rnh->head, if_updatemtu_cb, &ifmtu);
- RIB_WUNLOCK(rnh);
+ nhops_update_ifmtu(rnh, ifp, mtu);
}
}
}
-
-#if 0
-int p_sockaddr(char *buf, int buflen, struct sockaddr *s);
-int rt_print(char *buf, int buflen, struct rtentry *rt);
-
int
-p_sockaddr(char *buf, int buflen, struct sockaddr *s)
+p_sockaddr(char *buf, int buflen, const struct sockaddr *s, int family)
{
- void *paddr = NULL;
+ const void *paddr = NULL;
- switch (s->sa_family) {
+ switch (family) {
case AF_INET:
- paddr = &((struct sockaddr_in *)s)->sin_addr;
+ paddr = &((const struct sockaddr_in *)s)->sin_addr;
break;
case AF_INET6:
- paddr = &((struct sockaddr_in6 *)s)->sin6_addr;
+ paddr = &((const struct sockaddr_in6 *)s)->sin6_addr;
break;
}
if (paddr == NULL)
return (0);
- if (inet_ntop(s->sa_family, paddr, buf, buflen) == NULL)
+ if (inet_ntop(family, paddr, buf, buflen) == NULL)
return (0);
return (strlen(buf));
}
int
-rt_print(char *buf, int buflen, struct rtentry *rt)
+rt_print(char *buf, int buflen, const struct rtentry *rt)
{
- struct sockaddr *addr, *mask;
+ const struct sockaddr *addr, *mask;
int i = 0;
+ int family;
- addr = rt_key(rt);
- mask = rt_mask(rt);
+ addr = rt_key_const(rt);
+ mask = rt_mask_const(rt);
+ family = addr->sa_family;
- i = p_sockaddr(buf, buflen, addr);
- if (!(rt->rt_flags & RTF_HOST)) {
+ i = p_sockaddr(buf, buflen, addr, family);
+ if (!(rt->rte_flags & RTF_HOST)) {
buf[i++] = '/';
- i += p_sockaddr(buf + i, buflen - i, mask);
+ i += p_sockaddr(buf + i, buflen - i, mask, family);
}
+#if 0
if (rt->rt_flags & RTF_GATEWAY) {
buf[i++] = '>';
- i += p_sockaddr(buf + i, buflen - i, rt->rt_gateway);
+ i += p_sockaddr(buf + i, buflen - i, rt->rt_gateway, family);
}
-
+#endif
return (i);
}
-#endif
#ifdef RADIX_MPATH
/*
@@ -1528,223 +1529,6 @@
}
#endif
-int
-rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt,
- u_int fibnum)
-{
- int error = 0;
- struct rtentry *rt, *rt_old;
- struct radix_node *rn;
- struct rib_head *rnh;
- struct ifaddr *ifa;
- struct sockaddr *ndst;
- struct sockaddr_storage mdst;
-
- KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum"));
- KASSERT((flags & RTF_RNH_LOCKED) == 0, ("rtrequest1_fib: locked"));
- switch (dst->sa_family) {
- case AF_INET6:
- case AF_INET:
- /* We support multiple FIBs. */
- break;
- default:
- fibnum = RT_DEFAULT_FIB;
- break;
- }
-
- /*
- * Find the correct routing tree to use for this Address Family
- */
- rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
- if (rnh == NULL)
- return (EAFNOSUPPORT);
-
- /*
- * If we are adding a host route then we don't want to put
- * a netmask in the tree, nor do we want to clone it.
- */
- if (flags & RTF_HOST)
- netmask = NULL;
-
- switch (req) {
- case RTM_DELETE:
- if (netmask) {
- if (dst->sa_len > sizeof(mdst))
- return (EINVAL);
- rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask);
- dst = (struct sockaddr *)&mdst;
- }
-
- RIB_WLOCK(rnh);
- rt = rt_unlinkrte(rnh, info, &error);
- RIB_WUNLOCK(rnh);
- if (error != 0)
- return (error);
-
- rt_notifydelete(rt, info);
-
- /*
- * If the caller wants it, then it can have it,
- * but it's up to it to free the rtentry as we won't be
- * doing it.
- */
- if (ret_nrt) {
- *ret_nrt = rt;
- RT_UNLOCK(rt);
- } else
- RTFREE_LOCKED(rt);
- break;
- case RTM_RESOLVE:
- /*
- * resolve was only used for route cloning
- * here for compat
- */
- break;
- case RTM_ADD:
- if ((flags & RTF_GATEWAY) && !gateway)
- return (EINVAL);
- if (dst && gateway && (dst->sa_family != gateway->sa_family) &&
- (gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK))
- return (EINVAL);
-
- if (info->rti_ifa == NULL) {
- error = rt_getifa_fib(info, fibnum);
- if (error)
- return (error);
- } else {
- ifa_ref(info->rti_ifa);
- }
- rt = uma_zalloc(V_rtzone, M_NOWAIT);
- if (rt == NULL) {
- ifa_free(info->rti_ifa);
- return (ENOBUFS);
- }
- rt->rt_flags = RTF_UP | flags;
- rt->rt_fibnum = fibnum;
- /*
- * Add the gateway. Possibly re-malloc-ing the storage for it.
- */
- if ((error = rt_setgate(rt, dst, gateway)) != 0) {
- ifa_free(info->rti_ifa);
- uma_zfree(V_rtzone, rt);
- return (error);
- }
-
- /*
- * point to the (possibly newly malloc'd) dest address.
- */
- ndst = (struct sockaddr *)rt_key(rt);
-
- /*
- * make sure it contains the value we want (masked if needed).
- */
- if (netmask) {
- rt_maskedcopy(dst, ndst, netmask);
- } else
- bcopy(dst, ndst, dst->sa_len);
-
- /*
- * We use the ifa reference returned by rt_getifa_fib().
- * This moved from below so that rnh->rnh_addaddr() can
- * examine the ifa and ifa->ifa_ifp if it so desires.
- */
- ifa = info->rti_ifa;
- rt->rt_ifa = ifa;
- rt->rt_ifp = ifa->ifa_ifp;
- rt->rt_weight = 1;
-
- rt_setmetrics(info, rt);
-
- RIB_WLOCK(rnh);
- RT_LOCK(rt);
-#ifdef RADIX_MPATH
- /* do not permit exactly the same dst/mask/gw pair */
- if (rt_mpath_capable(rnh) &&
- rt_mpath_conflict(rnh, rt, netmask)) {
- RIB_WUNLOCK(rnh);
-
- ifa_free(rt->rt_ifa);
- R_Free(rt_key(rt));
- uma_zfree(V_rtzone, rt);
- return (EEXIST);
- }
-#endif
-
- /* XXX mtu manipulation will be done in rnh_addaddr -- itojun */
- rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, rt->rt_nodes);
-
- if (rn != NULL && rt->rt_expire > 0)
- tmproutes_update(rnh, rt);
-
- rt_old = NULL;
- if (rn == NULL && (info->rti_flags & RTF_PINNED) != 0) {
-
- /*
- * Force removal and re-try addition
- * TODO: better multipath&pinned support
- */
- struct sockaddr *info_dst = info->rti_info[RTAX_DST];
- info->rti_info[RTAX_DST] = ndst;
- /* Do not delete existing PINNED(interface) routes */
- info->rti_flags &= ~RTF_PINNED;
- rt_old = rt_unlinkrte(rnh, info, &error);
- info->rti_flags |= RTF_PINNED;
- info->rti_info[RTAX_DST] = info_dst;
- if (rt_old != NULL)
- rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head,
- rt->rt_nodes);
- }
- RIB_WUNLOCK(rnh);
-
- if (rt_old != NULL)
- RT_UNLOCK(rt_old);
-
- /*
- * If it still failed to go into the tree,
- * then un-make it (this should be a function)
- */
- if (rn == NULL) {
- ifa_free(rt->rt_ifa);
- R_Free(rt_key(rt));
- uma_zfree(V_rtzone, rt);
- return (EEXIST);
- }
-
- if (rt_old != NULL) {
- rt_notifydelete(rt_old, info);
- RTFREE(rt_old);
- }
-
- /*
- * If this protocol has something to add to this then
- * allow it to do that as well.
- */
- if (ifa->ifa_rtrequest)
- ifa->ifa_rtrequest(req, rt, info);
-
- /*
- * actually return a resultant rtentry and
- * give the caller a single reference.
- */
- if (ret_nrt) {
- *ret_nrt = rt;
- RT_ADDREF(rt);
- }
- rnh->rnh_gen++; /* Routing table updated */
- RT_UNLOCK(rt);
- break;
- case RTM_CHANGE:
- RIB_WLOCK(rnh);
- error = rtrequest1_fib_change(rnh, info, ret_nrt, fibnum);
- RIB_WUNLOCK(rnh);
- break;
- default:
- error = EOPNOTSUPP;
- }
-
- return (error);
-}
-
#undef dst
#undef gateway
#undef netmask
@@ -1752,199 +1536,6 @@
#undef ifpaddr
#undef flags
-static int
-rtrequest1_fib_change(struct rib_head *rnh, struct rt_addrinfo *info,
- struct rtentry **ret_nrt, u_int fibnum)
-{
- struct rtentry *rt = NULL;
- int error = 0;
- int free_ifa = 0;
- int family, mtu;
- struct if_mtuinfo ifmtu;
-
- RIB_WLOCK_ASSERT(rnh);
-
- rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST],
- info->rti_info[RTAX_NETMASK], &rnh->head);
-
- if (rt == NULL)
- return (ESRCH);
-
-#ifdef RADIX_MPATH
- /*
- * If we got multipath routes,
- * we require users to specify a matching RTAX_GATEWAY.
- */
- if (rt_mpath_capable(rnh)) {
- rt = rt_mpath_matchgate(rt, info->rti_info[RTAX_GATEWAY]);
- if (rt == NULL)
- return (ESRCH);
- }
-#endif
-
- RT_LOCK(rt);
-
- rt_setmetrics(info, rt);
-
- /*
- * New gateway could require new ifaddr, ifp;
- * flags may also be different; ifp may be specified
- * by ll sockaddr when protocol address is ambiguous
- */
- if (((rt->rt_flags & RTF_GATEWAY) &&
- info->rti_info[RTAX_GATEWAY] != NULL) ||
- info->rti_info[RTAX_IFP] != NULL ||
- (info->rti_info[RTAX_IFA] != NULL &&
- !sa_equal(info->rti_info[RTAX_IFA], rt->rt_ifa->ifa_addr))) {
- /*
- * XXX: Temporarily set RTF_RNH_LOCKED flag in the rti_flags
- * to avoid rlock in the ifa_ifwithroute().
- */
- info->rti_flags |= RTF_RNH_LOCKED;
- error = rt_getifa_fib(info, fibnum);
- info->rti_flags &= ~RTF_RNH_LOCKED;
- if (info->rti_ifa != NULL)
- free_ifa = 1;
-
- if (error != 0)
- goto bad;
- }
-
- /* Check if outgoing interface has changed */
- if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa &&
- rt->rt_ifa != NULL) {
- if (rt->rt_ifa->ifa_rtrequest != NULL)
- rt->rt_ifa->ifa_rtrequest(RTM_DELETE, rt, info);
- ifa_free(rt->rt_ifa);
- rt->rt_ifa = NULL;
- }
- /* Update gateway address */
- if (info->rti_info[RTAX_GATEWAY] != NULL) {
- error = rt_setgate(rt, rt_key(rt), info->rti_info[RTAX_GATEWAY]);
- if (error != 0)
- goto bad;
-
- rt->rt_flags &= ~RTF_GATEWAY;
- rt->rt_flags |= (RTF_GATEWAY & info->rti_flags);
- }
-
- if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa) {
- ifa_ref(info->rti_ifa);
- rt->rt_ifa = info->rti_ifa;
- rt->rt_ifp = info->rti_ifp;
- }
- /* Allow some flags to be toggled on change. */
- rt->rt_flags &= ~RTF_FMASK;
- rt->rt_flags |= info->rti_flags & RTF_FMASK;
-
- if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest != NULL)
- rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, info);
-
- /* Alter route MTU if necessary */
- if (rt->rt_ifp != NULL) {
- family = info->rti_info[RTAX_DST]->sa_family;
- mtu = if_getmtu_family(rt->rt_ifp, family);
- /* Set default MTU */
- if (rt->rt_mtu == 0)
- rt->rt_mtu = mtu;
- if (rt->rt_mtu != mtu) {
- /* Check if we really need to update */
- ifmtu.ifp = rt->rt_ifp;
- ifmtu.mtu = mtu;
- if_updatemtu_cb(rt->rt_nodes, &ifmtu);
- }
- }
-
- /*
- * This route change may have modified the route's gateway. In that
- * case, any inpcbs that have cached this route need to invalidate their
- * llentry cache.
- */
- rnh->rnh_gen++;
-
- if (ret_nrt) {
- *ret_nrt = rt;
- RT_ADDREF(rt);
- }
-bad:
- RT_UNLOCK(rt);
- if (free_ifa != 0) {
- ifa_free(info->rti_ifa);
- info->rti_ifa = NULL;
- }
- return (error);
-}
-
-static void
-rt_setmetrics(const struct rt_addrinfo *info, struct rtentry *rt)
-{
-
- if (info->rti_mflags & RTV_MTU) {
- if (info->rti_rmx->rmx_mtu != 0) {
-
- /*
- * MTU was explicitly provided by user.
- * Keep it.
- */
- rt->rt_flags |= RTF_FIXEDMTU;
- } else {
-
- /*
- * User explicitly sets MTU to 0.
- * Assume rollback to default.
- */
- rt->rt_flags &= ~RTF_FIXEDMTU;
- }
- rt->rt_mtu = info->rti_rmx->rmx_mtu;
- }
- if (info->rti_mflags & RTV_WEIGHT)
- rt->rt_weight = info->rti_rmx->rmx_weight;
- /* Kernel -> userland timebase conversion. */
- if (info->rti_mflags & RTV_EXPIRE)
- rt->rt_expire = info->rti_rmx->rmx_expire ?
- info->rti_rmx->rmx_expire - time_second + time_uptime : 0;
-}
-
-int
-rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate)
-{
- /* XXX dst may be overwritten, can we move this to below */
- int dlen = SA_SIZE(dst), glen = SA_SIZE(gate);
-
- /*
- * Prepare to store the gateway in rt->rt_gateway.
- * Both dst and gateway are stored one after the other in the same
- * malloc'd chunk. If we have room, we can reuse the old buffer,
- * rt_gateway already points to the right place.
- * Otherwise, malloc a new block and update the 'dst' address.
- */
- if (rt->rt_gateway == NULL || glen > SA_SIZE(rt->rt_gateway)) {
- caddr_t new;
-
- R_Malloc(new, caddr_t, dlen + glen);
- if (new == NULL)
- return ENOBUFS;
- /*
- * XXX note, we copy from *dst and not *rt_key(rt) because
- * rt_setgate() can be called to initialize a newly
- * allocated route entry, in which case rt_key(rt) == NULL
- * (and also rt->rt_gateway == NULL).
- * Free()/free() handle a NULL argument just fine.
- */
- bcopy(dst, new, dlen);
- R_Free(rt_key(rt)); /* free old block, if any */
- rt_key(rt) = (struct sockaddr *)new;
- rt->rt_gateway = (struct sockaddr *)(new + dlen);
- }
-
- /*
- * Copy the new gateway value into the memory chunk.
- */
- bcopy(gate, rt->rt_gateway, glen);
-
- return (0);
-}
-
void
rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *netmask)
{
@@ -1975,7 +1566,6 @@
RIB_RLOCK_TRACKER;
struct sockaddr *dst;
struct sockaddr *netmask;
- struct rtentry *rt = NULL;
struct rt_addrinfo info;
int error = 0;
int startfib, endfib;
@@ -1984,6 +1574,7 @@
int a_failure = 0;
struct sockaddr_dl *sdl = NULL;
struct rib_head *rnh;
+ struct epoch_tracker et;
if (flags & RTF_HOST) {
dst = ifa->ifa_dstaddr;
@@ -2081,9 +1672,15 @@
}
}
#endif
- error = (rn == NULL ||
- (rn->rn_flags & RNF_ROOT) ||
- RNTORT(rn)->rt_ifa != ifa);
+ error = 0;
+ struct nhop_object *nh;
+ if (rn == NULL || (rn->rn_flags & RNF_ROOT))
+ error = 1;
+ else {
+ nh = RNTORT(rn)->rt_nhop;
+ if (NH_IS_MULTIPATH(nh) || nh->nh_ifa != ifa)
+ error = 1;
+ }
RIB_RUNLOCK(rnh);
if (error) {
/* this is only an error if bad on ALL tables */
@@ -2098,61 +1695,32 @@
info.rti_flags = flags |
(ifa->ifa_flags & ~IFA_RTSELF) | RTF_PINNED;
info.rti_info[RTAX_DST] = dst;
+ info.rti_info[RTAX_NETMASK] = netmask;
/*
* doing this for compatibility reasons
*/
- if (cmd == RTM_ADD)
+ struct rib_cmd_info rc;
+ bzero(&rc, sizeof(rc));
+ NET_EPOCH_ENTER(et);
+ if (cmd == RTM_ADD) {
info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)sdl;
- else
+ error = rib_add_route(fibnum, &info, &rc);
+ if (error == 0) {
+ rt_addrmsg(cmd, ifa, fibnum);
+ rt_routemsg(cmd, rc.rt, rc.nh_new, fibnum);
+ }
+ } else {
info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr;
- info.rti_info[RTAX_NETMASK] = netmask;
- error = rtrequest1_fib(cmd, &info, &rt, fibnum);
- if (error == 0 && rt != NULL) {
- /*
- * notify any listening routing agents of the change
- */
- RT_LOCK(rt);
-#ifdef RADIX_MPATH
- /*
- * in case address alias finds the first address
- * e.g. ifconfig bge0 192.0.2.246/24
- * e.g. ifconfig bge0 192.0.2.247/24
- * the address set in the route is 192.0.2.246
- * so we need to replace it with 192.0.2.247
- */
- if (memcmp(rt->rt_ifa->ifa_addr,
- ifa->ifa_addr, ifa->ifa_addr->sa_len)) {
- ifa_free(rt->rt_ifa);
- ifa_ref(ifa);
- rt->rt_ifp = ifa->ifa_ifp;
- rt->rt_ifa = ifa;
+ error = rib_del_route(fibnum, &info, &rc);
+ if (error == 0) {
+ rt_routemsg(cmd, rc.rt, rc.nh_old, fibnum);
+ rt_addrmsg(cmd, ifa, fibnum);
}
-#endif
- RT_ADDREF(rt);
- RT_UNLOCK(rt);
- rt_newaddrmsg_fib(cmd, ifa, rt, fibnum);
- RT_LOCK(rt);
- RT_REMREF(rt);
- if (cmd == RTM_DELETE) {
- /*
- * If we are deleting, and we found an entry,
- * then it's been removed from the tree..
- * now throw it away.
- */
- RTFREE_LOCKED(rt);
- } else {
- if (cmd == RTM_ADD) {
- /*
- * We just wanted to add it..
- * we don't actually need a reference.
- */
- RT_REMREF(rt);
- }
- RT_UNLOCK(rt);
- }
- didwork = 1;
}
- if (error)
+ NET_EPOCH_EXIT(et);
+ if (error == 0)
+ didwork = 1;
+ else
a_failure = error;
}
if (cmd == RTM_DELETE) {
@@ -2219,16 +1787,14 @@
* Announce kernel-originated route addition/removal to rtsock based on @rt data.
* cmd: RTM_ cmd
* @rt: valid rtentry
- * @ifp: target route interface
+ * @nh: nexthop of the route
* @fibnum: fib id or RT_ALL_FIBS
*
* Returns 0 on success.
*/
int
-rt_routemsg(int cmd, struct rtentry *rt, struct ifnet *ifp, int rti_addrs,
- int fibnum)
+rt_routemsg(int cmd, struct rtentry *rt, struct nhop_object *nh, int fibnum)
{
-
KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE,
("unexpected cmd %d", cmd));
@@ -2237,7 +1803,7 @@
KASSERT(rt_key(rt) != NULL, (":%s: rt_key must be supplied", __func__));
- return (rtsock_routemsg(cmd, rt, ifp, 0, fibnum));
+ return (rtsock_routemsg(cmd, rt, nh, fibnum));
}
/*
@@ -2280,10 +1846,10 @@
if (cmd == RTM_ADD) {
rt_addrmsg(cmd, ifa, fibnum);
if (rt != NULL)
- rt_routemsg(cmd, rt, ifa->ifa_ifp, 0, fibnum);
+ rt_routemsg(cmd, rt, rt->rt_nhop, fibnum);
} else {
if (rt != NULL)
- rt_routemsg(cmd, rt, ifa->ifa_ifp, 0, fibnum);
+ rt_routemsg(cmd, rt, rt->rt_nhop, fibnum);
rt_addrmsg(cmd, ifa, fibnum);
}
}
Index: sys/net/route/mpath_ctl.c
===================================================================
--- /dev/null
+++ sys/net/route/mpath_ctl.c
@@ -0,0 +1,558 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_inet.h"
+#include "opt_route.h"
+#include "opt_route_mpath.h"
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/rwlock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/kernel.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_dl.h>
+#include <net/route.h>
+#include <net/route_var.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/in_fib.h>
+
+#include <net/route/nhop_utils.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_var.h>
+#include <net/route/shared.h>
+#include <net/route/rtentry_var.h>
+
+/*
+ * This file contains the supporting functions for adding/deleting/updating
+ * multipath routes to the routing table. Terms "Nexthop group" and "multipath
+ * groups" are used in this file interchangeably.
+ *
+ * The per-VNET fib_hash_outbound knob defined below starts at 0 and is
+ * exported read-only as net.route.hash_outbound. It is set to 1 by
+ * add_route_mpath() once the first multipath route has been installed.
+ *
+ * try_add_route_mpath() and try_del_route_mpath() perform a single attempt
+ * and return EAGAIN when the caller is expected to retry.
+ */
+
+VNET_DEFINE(u_int, fib_hash_outbound) = 0;
+SYSCTL_UINT(_net_route, OID_AUTO, hash_outbound, CTLFLAG_RD | CTLFLAG_VNET,
+ &VNET_NAME(fib_hash_outbound), 0,
+ "Compute flowid for locally-originated packets");
+
+static int try_add_route_mpath(struct rib_head *rnh, struct rtentry *rt,
+ struct weightened_nhop *wn_orig, struct rt_addrinfo *info,
+ struct rib_cmd_info *rc);
+static int try_del_route_mpath(struct rib_head *rnh, struct rtentry *rt,
+ struct nhgrp_object **pmp_orig, struct rt_addrinfo *info,
+ struct rib_cmd_info *rc);
+
+
+/*
+ * Default entropy added to the hash calculation for outbound connections.
+ * The initializer below lists 40 bytes; MPATH_ENTROPY_KEY_LEN is defined
+ * elsewhere (NOTE(review): confirm that it equals 40).
+ */
+uint8_t mpath_entropy_key[MPATH_ENTROPY_KEY_LEN] = {
+ 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+ 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+ 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+ 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+ 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+};
+
+/*
+ * Tries to add the route denoted by @rt (with its single-path nexthop
+ * @rt->rt_nhop) to the, potentially multipath, nexthop described by @wn_orig.
+ * The caller holds references on @rt and @rt->rt_nhop.
+ *
+ * Returns 0 on success; the @rt and @rt->rt_nhop references are consumed
+ *  and @rc describes the result.
+ * EAGAIN if the in-tree nexthop changed in the meantime; @wn_orig is updated
+ *  with the current in-tree nexthop and weight so that the caller can retry.
+ * EEXIST if @rt->rt_nhop is already the installed nexthop, or if the current
+ *  in-tree nexthop is not eligible for multipath.
+ * Another errno otherwise (group allocation or radix insertion failure).
+ * It is the caller's responsibility to release the references on @rt and
+ * @rt->rt_nhop in case of an error.
+ */
+static int
+try_add_route_mpath(struct rib_head *rnh, struct rtentry *rt,
+    struct weightened_nhop *wn_orig, struct rt_addrinfo *info,
+    struct rib_cmd_info *rc)
+{
+	struct nhgrp_object *mp;
+	struct rtentry *rt_new;
+	struct sockaddr *ndst, *netmask;
+	struct weightened_nhop wn[2], *wn_tmp;
+	void *rn = NULL;
+	uint64_t addmask;
+	uint32_t num_nhops_tmp;
+	int error;
+
+	ndst = (struct sockaddr *)rt_key(rt);
+	netmask = info->rti_info[RTAX_NETMASK];
+
+	/*
+	 * Not every path below computes the mask (group allocation may
+	 * fail), so give it a defined value up front.
+	 */
+	addmask = 0;
+
+	/*
+	 * @rt->rt_nhop represents the new non-mpath nhop to be added.
+	 * The nhop in @wn_orig can be multipath (which is confusing).
+	 *
+	 * Try to create a multipath group based on the joined nexthops
+	 * above.
+	 */
+
+	wn[0].nh = rt->rt_nhop;
+	wn[0].weight = rt->rt_weight;
+
+	if (!NH_IS_MULTIPATH(wn_orig->nh)) {
+		/* Simple merge of 2 non-multipath nexthops */
+		if (wn_orig->nh == rt->rt_nhop) {
+			/*
+			 * This nexthop already exists. If the goal
+			 * was to change weight, change request should
+			 * have been executed for this nexthop.
+			 */
+			return (EEXIST);
+		}
+		wn[1].nh = wn_orig->nh;
+		wn[1].weight = wn_orig->weight;
+
+		mp = nhgrp_get_group(rnh, wn, 2, &error);
+		if (mp != NULL) {
+			/*
+			 * Calculate addition mask: the bit position is the
+			 * slot the new nexthop occupies in the group.
+			 */
+			wn_tmp = nhgrp_get_nhops(mp, &num_nhops_tmp);
+			if (wn_tmp[0].nh == rt->rt_nhop)
+				addmask = 1 << 0;
+			else
+				addmask = 1 << 1;
+		}
+	} else {
+		/* Get new nhop group with @rt->rt_nhop as an additional nhop */
+		mp = nhgrp_append_nhops(rnh, (struct nhgrp_object *)wn_orig->nh,
+		    wn, 1, &addmask, &error);
+	}
+
+	/*
+	 * As we haven't referenced multipath groups/nhops it is possible that
+	 * object in @wn_orig got scheduled for deletion. In that case, we need
+	 * to re-fetch latest data from the RIB and retry.
+	 *
+	 * It is also possible that allocation simply fails. In that case, return
+	 * immediately.
+	 */
+	if (mp == NULL) {
+		if (error != EAGAIN) {
+			/*
+			 * Some fatal allocation problem, most likely
+			 * memory-related.
+			 */
+			return (error);
+		}
+		/*
+		 * Rare case when the @wn_orig data got scheduled for deletion.
+		 * Zero the original data to indicate the need to refill it for
+		 * the code below.
+		 */
+		wn_orig->nh = NULL;
+		wn_orig->weight = 0;
+	}
+
+	RIB_WLOCK(rnh);
+
+	rt_new = (struct rtentry *)rnh->rnh_lookup(ndst, netmask, &rnh->head);
+	if (rt_new == NULL) {
+
+		/*
+		 * Our prefix got deleted, let's add proposed single route path
+		 * and return.
+		 */
+		RT_LOCK(rt);
+		rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, rt->rt_nodes);
+		if (rn != NULL)
+			rnh->rnh_gen++;
+		RIB_WUNLOCK(rnh);
+
+		/* In any case, dereference created multipath group */
+		if (mp != NULL)
+			nhgrp_free_group(mp);
+
+		if (rn == NULL) {
+			/*
+			 * Route addition failed, while there is no existing prefix.
+			 * Most probably, we ran out of memory.
+			 */
+			RT_UNLOCK(rt);
+			RTSTAT_INC(rts_add_algo_fail);
+			return (ENOMEM);
+		}
+
+		/*
+		 * Success! As a result, single-path route has been added.
+		 * By convention, references for original @rt and @rt->rt_nhop
+		 * get consumed.
+		 */
+
+		/* Finalise notification */
+		rc->rt = rt;
+		rc->rt_weight = rt->rt_weight;
+		rc->nh_new = rt->rt_nhop;
+		RT_UNLOCK(rt);
+
+		return (0);
+	}
+
+	/* Prefix exists, try to update */
+	RT_LOCK(rt_new);
+	/*
+	 * Only flip when a new group was actually built: with mp == NULL
+	 * (the EAGAIN case above) there is nothing valid to install.
+	 */
+	if (mp != NULL && (rt_new->rt_nhop == wn_orig->nh) &&
+	    (rt_new->rt_weight == wn_orig->weight)) {
+		/*
+		 * Most common case: nhop/mpath group hasn't changed.
+		 * Flip to the new precalculated one and return.
+		 */
+		DPRINTF("mpath replace: %p -> %p", wn_orig->nh, mp);
+		rt_new->rt_nhop = (struct nhop_object *)mp;
+
+		rnh->rnh_gen++;
+		RIB_WUNLOCK(rnh);
+		RT_UNLOCK(rt_new);
+
+		rc->rt = rt_new;
+		rc->nh_old = wn_orig->nh;
+		rc->nh_new = (struct nhop_object *)mp;
+		rc->mask_changed = addmask;
+
+		/*
+		 * As original @rt and @rt->rt_nhop were not used directly,
+		 * unref them.
+		 */
+		nhop_free_object(rt->rt_nhop);
+		RTFREE(rt);
+
+		return (0);
+	}
+
+	/*
+	 * Nhop/mpath group has been updated.
+	 * Need to back out the work done in this cycle and
+	 * return EAGAIN indicating the caller to retry.
+	 *
+	 * Check the new nexthop before returning as it
+	 * may be ineligible for multipath.
+	 */
+	RIB_WUNLOCK(rnh);
+
+	/* unref our newly-created group */
+	if (mp != NULL)
+		nhgrp_free_group(mp);
+
+	/*
+	 * Check if current in-tree nhop is eligible
+	 * for multipath and update @wn_orig with
+	 * its data.
+	 */
+	if (can_nh_multipath(rt_new->rt_nhop)) {
+		wn_orig->nh = rt_new->rt_nhop;
+		wn_orig->weight = rt_new->rt_weight;
+		error = EAGAIN;
+	} else
+		error = EEXIST;
+	RT_UNLOCK(rt_new);
+
+	return (error);
+}
+
+/*
+ * Adds @rt->rt_nhop to the nexthops already installed for the @rt prefix,
+ * which the caller observed as (@nh_orig, @weight_orig).
+ * @rt and @nh_orig are referenced and unlocked.
+ *
+ * Returns 0 on success: the references on @rt and @rt->rt_nhop are consumed
+ * and @rc is populated with referenced objects.
+ * Returns errno otherwise: the caller has to unlock/free @rt and
+ * @rt->rt_nhop.
+ */
+int
+add_route_mpath(struct rib_head *rnh, struct rtentry *rt, struct nhop_object *nh_orig,
+    u_long weight_orig, struct rt_addrinfo *info, struct rib_cmd_info *rc)
+{
+	struct weightened_nhop wn_cur;
+	int attempt, error;
+
+	/*
+	 * Several rtsock speakers (e.g. load-balancer automation) may race
+	 * on the same prefix. Repeated adds/changes should not alter the
+	 * outcome from the user standpoint, so a lost race is simply retried
+	 * a bounded number of times. The usual setup, a single routing
+	 * daemon on one route socket, sees little contention, hence the
+	 * first attempt is expected to succeed.
+	 */
+	wn_cur.nh = nh_orig;
+	wn_cur.weight = weight_orig;
+
+	for (attempt = 0; attempt < RIB_MAX_RETRIES; attempt++) {
+		error = try_add_route_mpath(rnh, rt, &wn_cur, info, rc);
+		if (error != EAGAIN)
+			break;
+		RTSTAT_INC(rts_add_retry);
+	}
+	if (error != 0)
+		return (error);
+
+	if (V_fib_hash_outbound == 0 && NH_IS_MULTIPATH(rc->nh_new)) {
+		/*
+		 * This is the first multipath route: turn on hashing of
+		 * locally-originated outbound connections.
+		 */
+		if (bootverbose)
+			printf("FIB: enabled flowid calculation for locally-originated packets\n");
+		V_fib_hash_outbound = 1;
+	}
+
+	return (0);
+}
+
+/*
+ * Deletes the paths matching the gateway from @info from the route defined
+ * by @rt and the multipath group passed in via @pmp_orig.
+ *
+ * Returns 0 on success, with @rc describing the change.
+ * ESRCH if no nexthop in the group matches the gateway or the prefix is
+ *  not in the tree anymore.
+ * EAGAIN if the prefix now points to a different multipath group;
+ *  *@pmp_orig is updated to that group so that the caller can retry.
+ * Another errno otherwise.
+ *
+ * NOTE(review): the deletion mask is 64 bits wide, so this assumes a group
+ * never holds more than 64 nexthops - confirm against the group size limit.
+ */
+static int
+try_del_route_mpath(struct rib_head *rnh, struct rtentry *rt,
+    struct nhgrp_object **pmp_orig, struct rt_addrinfo *info,
+    struct rib_cmd_info *rc)
+{
+	struct sockaddr *gw, *ndst, *netmask;
+	struct nhgrp_object *mp, *mp_orig;
+	struct nhop_object *nh_new;
+	struct weightened_nhop *wn;
+	unsigned long weight_new;
+	uint32_t count, num_nhops;
+	uint64_t del_mask;
+	int error;
+
+	ndst = (struct sockaddr *)rt_key(rt);
+	netmask = info->rti_info[RTAX_NETMASK];
+	gw = info->rti_info[RTAX_GATEWAY];
+	mp_orig = *pmp_orig;
+
+	KASSERT((mp_orig->mp_flags & MPF_MULTIPATH), ("mp_orig not mpath"));
+
+	del_mask = 0;
+	count = 0;
+
+	wn = nhgrp_get_nhops(mp_orig, &num_nhops);
+	for (uint32_t i = 0; i < num_nhops; i++) {
+		if (rib_match_nhop_gw(wn[i].nh, gw)) {
+			/* Shift in 64 bits: a plain int shift breaks at bit 31. */
+			del_mask |= ((uint64_t)1 << i);
+			count++;
+		}
+	}
+
+	if (count == 0) {
+		/*
+		 * Unable to find any matching nexthop to delete.
+		 */
+		return (ESRCH);
+	}
+
+	weight_new = 0;
+
+	if (num_nhops > count + 1) {
+		/*
+		 * The result will still be a multipath group.
+		 * mp is returned unlocked&referenced
+		 */
+		mp = nhgrp_get_del_nhops(rnh, mp_orig, &del_mask, &error);
+		if (mp == NULL)
+			return (error);
+		nh_new = (struct nhop_object *)mp;
+	} else if (num_nhops == count) {
+		/* All nexthops have been deleted, request prefix deletion */
+		nh_new = NULL;
+	} else {
+		/*
+		 * Not multipath group anymore. Set nh_new to the last remaining
+		 * nexthop.
+		 */
+		nh_new = NULL;
+		for (uint32_t i = 0; i < num_nhops; i++) {
+			if ((del_mask & ((uint64_t)1 << i)) == 0) {
+				nh_new = wn[i].nh;
+				weight_new = wn[i].weight;
+				break;
+			}
+		}
+		KASSERT((nh_new != NULL), ("nh_new == NULL"));
+		nhop_ref_object(nh_new);
+	}
+
+	/* New nexthop or nexthop group is stored in @nh_new and referenced. */
+
+	RIB_WLOCK(rnh);
+
+	rt = (struct rtentry *)rnh->rnh_lookup(ndst, netmask, &rnh->head);
+
+	if (rt == NULL) {
+		/*
+		 * Our prefix got deleted.
+		 * Free resources and return.
+		 */
+		RIB_WUNLOCK(rnh);
+
+		if (nh_new != NULL)
+			nhop_free_any(nh_new);
+
+		return (ESRCH);
+	}
+
+	/* Prefix still exists, try to update */
+	if (rt->rt_nhop == (struct nhop_object *)mp_orig) {
+		/*
+		 * Nhop/mpath group hasn't changed. Flip
+		 * to the new precalculated one and return
+		 */
+		if (nh_new == NULL) {
+			/*
+			 * Delete all of the routes for
+			 * the multipath prefix.
+			 */
+			struct radix_node *rn;
+
+			rn = rnh->rnh_deladdr(ndst, netmask, &rnh->head);
+			if (rn == NULL) {
+				RIB_WUNLOCK(rnh);
+				RTSTAT_INC(rts_del_algo_fail);
+				return (ESRCH);
+			}
+		} else {
+			/* Changing nexthop to a new one */
+			RT_LOCK(rt);
+			rt->rt_nhop = nh_new;
+			if (weight_new != 0)
+				rt->rt_weight = weight_new;
+			RT_UNLOCK(rt);
+		}
+
+		/* Both outcomes above changed the tree: bump the generation. */
+		rnh->rnh_gen++;
+		RIB_WUNLOCK(rnh);
+
+		/* Prepare notification */
+
+		rc->rt = rt;
+		rc->nh_old = (struct nhop_object *)mp_orig;
+		rc->mask_changed = del_mask;
+		rc->nh_new = nh_new;
+
+		/* Unref mp_orig, as it was referenced when attached to rte */
+		nhgrp_free_group(mp_orig);
+
+		return (0);
+	}
+
+	/*
+	 * Nexthop has changed. Check if it is not multipath anymore
+	 */
+	if (!NH_IS_MULTIPATH(rt->rt_nhop)) {
+		error = del_route_one(rnh, rt, info);
+
+		RIB_WUNLOCK(rnh);
+		/*
+		 * Regardless of operation result, created multipath
+		 * group is not needed anymore, hence free it.
+		 */
+		if (nh_new != NULL)
+			nhop_free_any(nh_new);
+
+		if (error != 0)
+			return (error);
+
+		/* Successfully deleted, prepare operation result */
+		rc->rt = rt;
+		rc->nh_old = rt->rt_nhop;
+		rc->rt_weight = rt->rt_weight;
+
+		return (0);
+	}
+
+	/*
+	 * The updated nexthop is a new multipath group.
+	 * Need to restart the operation.
+	 */
+	mp_orig = (struct nhgrp_object *)rt->rt_nhop;
+	RIB_WUNLOCK(rnh);
+
+	if (nh_new != NULL)
+		nhop_free_any(nh_new);
+
+	*pmp_orig = mp_orig;
+
+	return (EAGAIN);
+}
+
+/*
+ * Deletes the paths matching the @info gateway from the multipath route @rt
+ * whose multipath group is @mp_orig.
+ *
+ * Returns 0 on success and fills @rc with the operation results; returns
+ * the errno of the last attempt otherwise.
+ */
+int
+del_route_mpath(struct rib_head *rnh, struct rtentry *rt,
+    struct nhgrp_object *mp_orig, struct rt_addrinfo *info,
+    struct rib_cmd_info *rc)
+{
+	int attempt, error;
+
+	/*
+	 * Several rtsock speakers (e.g. load-balancer automation) may race
+	 * on the same prefix. Repeated adds/changes should not alter the
+	 * outcome from the user standpoint, so a lost race is simply retried
+	 * a bounded number of times. The usual setup, a single routing
+	 * daemon on one route socket, sees little contention, hence the
+	 * first attempt is expected to succeed.
+	 */
+	for (attempt = 0; attempt < RIB_MAX_RETRIES; attempt++) {
+		error = try_del_route_mpath(rnh, rt, &mp_orig, info, rc);
+		if (error != EAGAIN)
+			return (error);
+		RTSTAT_INC(rts_del_retry);
+	}
+
+	return (error);
+}
+
+
+
+
Index: sys/net/route/nhgrp.h
===================================================================
--- /dev/null
+++ sys/net/route/nhgrp.h
@@ -0,0 +1,50 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NET_ROUTE_NHGRP_H_
+#define _NET_ROUTE_NHGRP_H_
+
+#define MPF_MULTIPATH 0x08 /* must be kept consistent with NHF_MULTIPATH */
+
+struct nhgrp_object {
+ uint16_t mp_flags; /* mpath flags (MPF_*) */
+ uint8_t mp_size; /* size of mpath group used in selection */
+ uint8_t spare; /* not referenced in this header; presumably padding - confirm */
+ struct nhop_object *nhops[0]; /* trailing array of nhop pointers; presumably mp_size entries - confirm */
+};
+
+struct weightened_nhop {
+ struct nhop_object *nh; /* nexthop */
+ uint32_t weight; /* weight of this nexthop within its group */
+};
+
+struct nhop_mpath; /* forward declaration; only used by the prototype below */
+struct weightened_nhop *mpath_get_nhops(struct nhop_mpath *mp, uint32_t *pnum_nhops); /* NOTE(review): mpath_ctl.c calls nhgrp_get_nhops() with a struct nhgrp_object *; this prototype looks stale - confirm */
+
+#endif
Index: sys/net/route/nhgrp.c
===================================================================
--- /dev/null
+++ sys/net/route/nhgrp.c
@@ -0,0 +1,321 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_inet.h"
+#include "opt_route.h"
+#include "opt_route_mpath.h"
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/rwlock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/refcount.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/kernel.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_dl.h>
+#include <net/route.h>
+#include <net/route_var.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/in_fib.h>
+
+#include <net/route/nhop_utils.h>
+#include <net/route/shared.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_var.h>
+#include <net/route/nhgrp_var.h>
+
+/*
+ * This file contains data structures management logic for the nexthop
+ * groups ("nhgrp") route subsystem.
+ *
+ * Nexthop groups are used to store multiple routes available for the specific
+ * prefix. Nexthop groups are immutable and can be shared across multiple
+ * prefixes.
+ *
+ * Each group consists of a control plane part and a dataplane part.
+ * Control plane is basically a collection of nexthop objects with
+ * weights and refcount.
+ *
+ * Datapath consists of an array of nexthop pointers, compiled from control
+ * plane data to support O(1) nexthop selection.
+ *
+ * For example, consider the following group:
+ * [(nh1, weight=100), (nh2, weight=200)]
+ * It will compile to the following array:
+ * [nh1, nh2, nh2]
+ * i.e. each nexthop is repeated in proportion to its weight (1:2 here).
+ *
+ */
+
+static void consider_resize(struct nh_control *ctl, uint32_t new_nh_buckets, uint32_t new_idx_items);
+
+static int cmp_nhgrp(const struct nhgrp_priv *a, const struct nhgrp_priv *b);
+static unsigned int hash_nhgrp(const struct nhgrp_priv *obj);
+
+
+/*
+ * Hash callback: compares two groups by their control plane data.
+ * Returns non-zero when both groups hold the same number of nexthops and
+ * their (nexthop, weight) arrays are bytewise equal, 0 otherwise.
+ */
+static int
+cmp_nhgrp(const struct nhgrp_priv *a, const struct nhgrp_priv *b)
+{
+	size_t cmp_len;
+
+	/*
+	 * With consistent hashing, several multipath groups may share the
+	 * same "control plane" list of weighted nexthops while differing in
+	 * their "data plane" nexthop sets. The data plane is ignored for
+	 * now; only the control plane list is compared.
+	 */
+	if (a->gr_nh_count != b->gr_nh_count)
+		return (0);
+
+	/*
+	 * NOTE(review): this compares whole structures, padding included;
+	 * presumably the arrays are zero-filled on allocation - confirm.
+	 */
+	cmp_len = sizeof(struct weightened_nhop) * a->gr_nh_count;
+	return (memcmp(a->gr_nh_weights, b->gr_nh_weights, cmp_len) == 0);
+}
+
+/*
+ * Hash callback: hashes the raw bytes of the group's control plane
+ * (nexthop, weight) array, i.e. the same data cmp_nhgrp() compares.
+ */
+static unsigned int
+hash_nhgrp(const struct nhgrp_priv *obj)
+{
+	size_t key_len;
+
+	key_len = sizeof(struct weightened_nhop) * obj->gr_nh_count;
+
+	return (djb_hash((const unsigned char *)obj->gr_nh_weights, key_len));
+}
+
+/*
+ * Looks up the group matching @key in the @ctl group hash while holding
+ * the nexthop read lock.
+ *
+ * Returns the group referenced and unlocked, or NULL when there is no
+ * match or the matching group is already being deleted.
+ */
+struct nhgrp_priv *
+find_nhgrp(struct nh_control *ctl, const struct nhgrp_priv *key)
+{
+	struct nhgrp_priv *found;
+
+	NHOPS_RLOCK(ctl);
+	CHT_SLIST_FIND_BYOBJ(&ctl->gr_head, mpath, key, found);
+	/* A refcount of 0 means the group is being deleted: report a miss. */
+	if (found != NULL &&
+	    !refcount_acquire_if_not_zero(&found->gr_refcnt))
+		found = NULL;
+	NHOPS_RUNLOCK(ctl);
+
+	return (found);
+}
+
+/*
+ * Allocates an index for @grp_priv and inserts the group into the @ctl
+ * group hash. Afterwards, outside of the lock, gives consider_resize()
+ * a chance to grow the hash and the index bitmask.
+ *
+ * Returns 1 if the group has been linked, 0 if no index could be allocated.
+ */
+int
+link_nhgrp(struct nh_control *ctl, struct nhgrp_priv *grp_priv)
+{
+	uint32_t want_buckets, want_items;
+	uint16_t new_idx;
+	int linked;
+
+	NHOPS_WLOCK(ctl);
+	/* Sample the resize targets while the lock is held. */
+	want_buckets = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->gr_head);
+	want_items = bitmask_get_resize_items(&ctl->gr_idx_head);
+
+	linked = (bitmask_alloc_idx(&ctl->gr_idx_head, &new_idx) == 0);
+	if (linked) {
+		MP_PRIV_LOCK(grp_priv);
+		grp_priv->gr_idx = new_idx;
+		grp_priv->nh_control = ctl;
+		CHT_SLIST_INSERT_HEAD(&ctl->gr_head, mpath, grp_priv);
+		MP_PRIV_UNLOCK(grp_priv);
+	}
+	NHOPS_WUNLOCK(ctl);
+
+	if (!linked) {
+		DPRINTF("Unable to allocate mpath index");
+	}
+
+	/* Resize is attempted on both outcomes. */
+	consider_resize(ctl, want_buckets, want_items);
+
+	return (linked);
+}
+
+/*
+ * Removes the group matching @key from the @ctl group hash, releases its
+ * index and detaches it from @ctl.
+ *
+ * Returns the unlinked group, or NULL if no matching group was found.
+ */
+struct nhgrp_priv *
+unlink_nhgrp(struct nh_control *ctl, struct nhgrp_priv *key)
+{
+	struct nhgrp_priv *mp_ret;
+	int idx;
+
+	NHOPS_WLOCK(ctl);
+
+	CHT_SLIST_REMOVE_BYOBJ(&ctl->gr_head, mpath, key, mp_ret);
+
+	if (mp_ret == NULL) {
+		DPRINTF("Unable to find nhop group!");
+		NHOPS_WUNLOCK(ctl);
+		return (NULL);
+	}
+
+	idx = mp_ret->gr_idx;
+	/*
+	 * Do not drop the result silently. Presumably non-zero means
+	 * failure, as with bitmask_alloc_idx() - confirm.
+	 */
+	if (bitmask_free_idx(&ctl->gr_idx_head, idx) != 0) {
+		DPRINTF("Unable to free nhop group index %d", idx);
+	}
+	MP_PRIV_LOCK(mp_ret);
+	mp_ret->gr_idx = 0;
+	mp_ret->nh_control = NULL;
+	MP_PRIV_UNLOCK(mp_ret);
+
+	NHOPS_WUNLOCK(ctl);
+
+	return (mp_ret);
+}
+
+/*
+ * Checks if the group hash and/or the group index bitmask need resizing
+ * and performs the resize if necessary.
+ *
+ * @new_nh_buckets: new number of hash buckets, 0 if no hash resize is needed.
+ * @new_idx_items: new number of index items, 0 if no index resize is needed.
+ *
+ * Memory is allocated with M_NOWAIT before taking the lock; a failed
+ * allocation just skips that part of the resize.
+ */
+__noinline static void
+consider_resize(struct nh_control *ctl, uint32_t new_nh_buckets, uint32_t new_idx_items)
+{
+	void *nh_ptr, *nh_idx_ptr;
+	void *old_idx_ptr;
+	size_t alloc_size;
+
+	nh_ptr = NULL;
+	if (new_nh_buckets != 0) {
+		alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_nh_buckets);
+		nh_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO);
+	}
+
+	nh_idx_ptr = NULL;
+	if (new_idx_items != 0) {
+		alloc_size = bitmask_get_size(new_idx_items);
+		nh_idx_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO);
+	}
+
+	if (nh_ptr == NULL && nh_idx_ptr == NULL)
+		return;
+
+	DPRINTF("mp: going to resize: nh:[ptr:%p sz:%u] idx:[ptr:%p sz:%u]",
+	    nh_ptr, new_nh_buckets, nh_idx_ptr, new_idx_items);
+
+	old_idx_ptr = NULL;
+
+	NHOPS_WLOCK(ctl);
+	if (nh_ptr != NULL) {
+		CHT_SLIST_RESIZE(&ctl->gr_head, mpath, nh_ptr, new_nh_buckets);
+	}
+	if (nh_idx_ptr != NULL) {
+		/*
+		 * Copy from and swap into the *group* index: the group
+		 * bitmask is gr_idx_head everywhere else in this file.
+		 *
+		 * NOTE(review): the truth test on bitmask_copy() is kept
+		 * as is, but bitmask_alloc_idx() reports success as 0 -
+		 * confirm the return convention of bitmask_copy().
+		 */
+		if (bitmask_copy(&ctl->gr_idx_head, nh_idx_ptr, new_idx_items)) {
+			bitmask_swap(&ctl->gr_idx_head, nh_idx_ptr,
+			    new_idx_items, &old_idx_ptr);
+			/* The new bitmask is now owned by @ctl. */
+			nh_idx_ptr = NULL;
+		}
+	}
+	NHOPS_WUNLOCK(ctl);
+
+	if (nh_ptr != NULL)
+		free(nh_ptr, M_NHOP);
+	if (old_idx_ptr != NULL)
+		free(old_idx_ptr, M_NHOP);
+	/* Release the new bitmask if it has not been installed. */
+	if (nh_idx_ptr != NULL)
+		free(nh_idx_ptr, M_NHOP);
+}
+
+/*
+ * Function allocating the necessary group data structures.
+ *
+ * Allocates the initial group hash (8 buckets) and the index bitmask
+ * (128 items) using @malloc_flags | M_ZERO, and installs them into @ctl
+ * if the hash has not been set up yet (hash_size == 0).
+ *
+ * Returns 1 on success, including the case when the hash was already set
+ * up and the fresh allocations are simply freed.
+ * Returns 0 if either allocation fails.
+ */
+int
+nhgrp_ctl_alloc_default(struct nh_control *ctl, int malloc_flags)
+{
+ size_t alloc_size;
+ uint32_t num_buckets, num_items;
+ void *cht_ptr, *mask_ptr;
+
+
+ num_buckets = 8;
+ alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets);
+ cht_ptr = malloc(alloc_size, M_NHOP, malloc_flags | M_ZERO);
+
+ if (cht_ptr == NULL) {
+ DPRINTF("mpath init failed");
+ return (0);
+ }
+
+ /*
+ * Allocate nexthop group index bitmask.
+ */
+ num_items = 128;
+ mask_ptr = malloc(bitmask_get_size(num_items), M_NHOP, malloc_flags | M_ZERO);
+ if (mask_ptr == NULL) {
+ DPRINTF("mpath bitmask init failed");
+ free(cht_ptr, M_NHOP);
+ return (0);
+ }
+
+ /*
+ * Reinit hash, as the previous instance contained 0 items.
+ * The hash_size check is done under the write lock so that only
+ * one caller installs its allocations.
+ */
+ NHOPS_WLOCK(ctl);
+
+ if (ctl->gr_head.hash_size == 0) {
+ CHT_SLIST_INIT(&ctl->gr_head, cht_ptr, num_buckets);
+ bitmask_init(&ctl->gr_idx_head, mask_ptr, num_items);
+ NHOPS_WUNLOCK(ctl);
+ } else {
+ /* Other thread has already initialized hash/bitmask */
+ NHOPS_WUNLOCK(ctl);
+ free(cht_ptr, M_NHOP);
+ free(mask_ptr, M_NHOP);
+ }
+
+ DPRINTF("mpath init done for fib/af %d/%d", ctl->rh->rib_fibnum,
+ ctl->rh->rib_family);
+
+ return (1);
+}
+
+int
+nhgrp_ctl_init(struct nh_control *ctl)
+{
+ /*
+ * By default, do not allocate data structures, as multipath
+ * routes will not necessarily be used: the group hash and the
+ * group index bitmask are initialised with NULL storage and zero
+ * size. The storage is allocated by nhgrp_ctl_alloc_default().
+ *
+ * Always returns 0.
+ */
+ CHT_SLIST_INIT(&ctl->gr_head, NULL, 0);
+ bitmask_init(&ctl->gr_idx_head, NULL, 0);
+
+ return (0);
+}
+
+/*
+ * Releases the storage backing the group hash and the group index bitmask
+ * of @ctl. Either of them may be NULL if it was never allocated.
+ */
+void
+nhgrp_ctl_free(struct nh_control *ctl)
+{
+
+	/* free(9) does nothing for a NULL address, so no guards are needed. */
+	free(ctl->gr_head.ptr, M_NHOP);
+	free(ctl->gr_idx_head.idx, M_NHOP);
+}
+
+
Index: sys/net/route/nhgrp_ctl.c
===================================================================
--- /dev/null
+++ sys/net/route/nhgrp_ctl.c
@@ -0,0 +1,823 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_inet.h"
+#include "opt_route.h"
+#include "opt_route_mpath.h"
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/rwlock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/refcount.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/kernel.h>
+#include <sys/epoch.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_dl.h>
+#include <net/route.h>
+#include <net/route_var.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/in_fib.h>
+
+#include <net/route/nhop_utils.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_var.h>
+#include <net/route/shared.h>
+#include <net/route/nhgrp_var.h>
+
+/*
+ * This file contains the supporting functions for creating multipath groups
+ * and compiling their dataplane parts.
+ */
+
+/* MPF_MULTIPATH must be the same as NHF_MULTIPATH for nhop selection to work */
+CTASSERT(MPF_MULTIPATH == NHF_MULTIPATH);
+/* Offset and size of flags field has to be the same for nhop/nhop groups */
+CHK_STRUCT_FIELD_GENERIC(struct nhop_object, nh_flags, struct nhgrp_object, mp_flags);
+/* Cap multipath to 64, as the larger values would break rib_cmd_info bmasks */
+CTASSERT(RIB_MAX_MPATH_WIDTH <= 64);
+
+static int wn_cmp(const void *a, const void *b);
+static void sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops);
+
+static struct nhgrp_priv *get_nhgrp(struct nh_control *ctl,
+ struct weightened_nhop *wn, int num_nhops, int *perror);
+static void destroy_nhgrp(struct nhgrp_priv *gr_priv);
+static void destroy_nhgrp_epoch(epoch_context_t ctx);
+static void free_nhgrp_nhops(struct nhgrp_priv *gr_priv);
+
+static int dump_nhgrp_entry(struct rib_head *rh, struct nhgrp_priv *grp_priv,
+ char *buffer, struct sysctl_req *w);
+
+
+/*
+ * qsort(3)-style comparator for 'struct weightened_nhop'.
+ * Orders by weight ascending, ties are broken by nexthop pointer value.
+ */
+static int
+wn_cmp(const void *a, const void *b)
+{
+	const struct weightened_nhop *lhs = a, *rhs = b;
+
+	if (lhs->weight != rhs->weight)
+		return ((lhs->weight > rhs->weight) ? 1 : -1);
+
+	/* Equal weights: compare nexthops by pointer */
+	if (lhs->nh != rhs->nh)
+		return ((lhs->nh > rhs->nh) ? 1 : -1);
+
+	return (0);
+}
+
+/*
+ * Sorts the @num_nhops entries of @wn in place using wn_cmp().
+ *
+ * A deterministic order of nexthops/weights is what allows two requests
+ * for the same set of nexthops to map to the same group instead of
+ * creating duplicates. The order matters for the control plane only.
+ *
+ * Weight is the primary key, which eases the slot size calculation.
+ */
+static void
+sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops)
+{
+
+	qsort(wn, num_nhops, sizeof(*wn), wn_cmp);
+}
+
+/*
+ * Calculate minimum number of slots required to fit the existing
+ * set of weights while maintaining weight coefficients.
+ *
+ * Assume @wn is sorted by weight ascending.
+ *
+ * The smallest non-zero step between consecutive weights (starting from
+ * wn[0].weight itself) is used as the unit of accuracy; the result is the
+ * sum of all weights divided by that unit, capped at RIB_MAX_MPATH_WIDTH.
+ *
+ * Returns 0 if @num_items is 0 or the first (smallest) weight is 0.
+ * Callers have to treat 0 as "cannot build a group".
+ *
+ * Some examples:
+ * nh=1,weight=1 nh=2,weight=2 -> 3 slots [1, 2, 2]
+ * nh=1,weight=1000 nh=2,weight=2000 -> 3 slots: [1, 2, 2]
+ * nh=1,weight=17 nh=2,weight=37 -> 3 slots: [1, 2, 2]
+ * nh=1,weight=1 nh=2,weight=70 -> 64 slots: [1, 2, 2, ..]
+ */
+static uint32_t
+calc_min_mpath_slots(const struct weightened_nhop *wn, size_t num_items)
+{
+	uint32_t i, last, xmin;
+	uint64_t v, total = 0;
+
+	/*
+	 * xmin starts as wn[0].weight and is only ever replaced by a
+	 * non-zero difference, so it is zero iff wn[0].weight is zero.
+	 * Bail out early instead of dividing by zero below.
+	 */
+	if (num_items == 0 || wn[0].weight == 0)
+		return (0);
+
+	last = 0;
+	xmin = wn[0].weight;
+	for (i = 0; i < num_items; i++) {
+		total += wn[i].weight;
+		if ((wn[i].weight - last < xmin) && (wn[i].weight != last))
+			xmin = wn[i].weight - last;
+		last = wn[i].weight;
+	}
+	/* Got minimum unit of desired accuracy */
+	v = total / xmin;
+	if (v > RIB_MAX_MPATH_WIDTH) {
+		/*
+		 * TODO: round to the MAX_MPATH and
+		 * see if this reduces the group size.
+		 */
+		v = RIB_MAX_MPATH_WIDTH;
+	}
+
+	return ((uint32_t)v);
+}
+
+/*
+ * Returns the number of bytes needed to hold a complete nexthop group.
+ *
+ * A group is a single allocation made of two parts:
+ * 1) dataplane part: 'struct nhgrp_object' header followed by @mp_size
+ *    nexthop pointers.
+ * 2) control plane part: 'struct nhgrp_priv' header followed by
+ *    @num_nhops 'struct weightened_nhop' entries.
+ *
+ * Nexthop groups are (mostly) immutable, so everything is allocated
+ * in one go.
+ */
+__noinline static size_t
+get_nhgrp_alloc_size(uint32_t mp_size, uint32_t num_nhops)
+{
+	size_t dataplane_sz, control_sz;
+
+	dataplane_sz = sizeof(struct nhgrp_object) +
+	    mp_size * sizeof(struct nhop_object *);
+	control_sz = sizeof(struct nhgrp_priv) +
+	    num_nhops * sizeof(struct weightened_nhop);
+
+	return (dataplane_sz + control_sz);
+}
+
+
+/*
+ * Compile actual list of nexthops to be used by datapath from
+ * the weighted nexthop list @x into the nhops[] array of the group
+ * referenced by @dst_priv (dst_priv->gr).
+ *
+ * @dst_priv: control part of the group; gr and gr_nh_count must be set.
+ * @x: array of dst_priv->gr_nh_count weighted nexthops.
+ * @num_slots: number of nhops[] slots to distribute.
+ *
+ * Nexthops are processed in the order they appear in @x. Each one gets
+ * weight * remaining_slots / remaining_weight_sum slots (integer division,
+ * rounded down), so a nexthop with a small relative weight may end up with
+ * no slots at all, and the last nexthop with a non-zero weight takes
+ * whatever slots remain.
+ *
+ * For example, compiling control plane list of 2 nexthops
+ * [(200, A), (100, B)] would result in the datapath array
+ * [A, A, B]
+ */
+static void
+compile_nhgrp(struct nhgrp_priv *dst_priv, const struct weightened_nhop *x,
+ uint32_t num_slots)
+{
+ struct nhgrp_object *dst;
+ int i, slot_idx, remaining_slots;
+ uint64_t remaining_sum, nh_weight, nh_slots;
+
+ slot_idx = 0;
+ dst = dst_priv->gr;
+ /* Calculate sum of all weights */
+ remaining_sum = 0;
+ for (i = 0; i < dst_priv->gr_nh_count; i++)
+ remaining_sum += x[i].weight;
+ remaining_slots = num_slots;
+ DPRINTF("O: %u/%u", (uint32_t)remaining_sum, remaining_slots);
+ for (i = 0; i < dst_priv->gr_nh_count; i++) {
+ /* Calculate number of slots for the current nexthop */
+ if (remaining_sum > 0) {
+ nh_weight = (uint64_t)x[i].weight;
+ nh_slots = (nh_weight * remaining_slots / remaining_sum);
+ } else
+ nh_slots = 0;
+
+ /* Account for the weight and slots consumed by this nexthop */
+ remaining_sum -= x[i].weight;
+ remaining_slots -= nh_slots;
+
+ DPRINTF(" OO[%d]: %u/%u curr=%d slot_idx=%d", i,
+ (uint32_t)remaining_sum, remaining_slots,
+ (int)nh_slots, slot_idx);
+
+ /* Fill the next nh_slots consecutive slots with this nexthop */
+ while (nh_slots-- > 0)
+ dst->nhops[slot_idx++] = x[i].nh;
+ }
+}
+
+/*
+ * Builds a new, unlinked multipath group from the @num_nhops entries of @wn.
+ * @wn is sorted in place as a side effect.
+ *
+ * Does NOT reference any nexthops in the group.
+ * Returns the control part of the group with refcount=1, or NULL if the
+ * slot calculation yields zero or the allocation fails.
+ */
+static struct nhgrp_priv *
+alloc_nhgrp(struct weightened_nhop *wn, int num_nhops)
+{
+	struct nhgrp_object *grp;
+	struct nhgrp_priv *priv;
+	uint32_t num_slots;
+	size_t alloc_sz;
+
+	sort_weightened_nhops(wn, num_nhops);
+
+	num_slots = calc_min_mpath_slots(wn, num_nhops);
+	if (num_slots == 0)
+		return (NULL);	/* Zero weights, abort */
+
+	alloc_sz = get_nhgrp_alloc_size(num_slots, num_nhops);
+	grp = malloc(alloc_sz, M_NHOP, M_NOWAIT | M_ZERO);
+	if (grp == NULL)
+		return (NULL);
+
+	/* mp_size has to be set first to make NHGRP_PRIV() work */
+	grp->mp_size = num_slots;
+	DPRINTF("new mpath group: num_nhops: %u", (uint32_t)num_slots);
+	grp->mp_flags = MPF_MULTIPATH;
+
+	/* Fill in the control plane part located after the nhops[] array */
+	priv = NHGRP_PRIV(grp);
+	priv->gr = grp;
+	priv->gr_nh_count = num_nhops;
+	refcount_init(&priv->gr_refcnt, 1);
+	memcpy(priv->gr_nh_weights, wn, num_nhops * sizeof(*wn));
+
+	/* Populate the dataplane nhops[] array */
+	compile_nhgrp(priv, wn, grp->mp_size);
+
+	MP_PRIV_LOCK_INIT(priv);
+
+	return (priv);
+}
+
+/*
+ * Drops one reference from the group @gr.
+ *
+ * When the last reference goes away, the group is unlinked from its
+ * control structure (if it has one) and its destruction is scheduled
+ * via the network epoch.
+ */
+void
+nhgrp_free_group(struct nhgrp_object *gr)
+{
+	struct nhgrp_priv *priv = NHGRP_PRIV(gr);
+	struct nh_control *owner;
+
+	/* Nothing more to do unless this was the last reference */
+	if (!refcount_release(&priv->gr_refcnt))
+		return;
+
+	MP_PRIV_LOCK(priv);
+	owner = priv->nh_control;
+	MP_PRIV_UNLOCK(priv);
+
+	if (owner != NULL && unlink_nhgrp(owner, priv) == NULL) {
+		/* Failed to unlink. Do not try to reclaim */
+		/* XXX: error reporting */
+		return;
+	}
+
+	epoch_call(net_epoch_preempt, destroy_nhgrp_epoch,
+	    &priv->gr_epoch_ctx);
+}
+
+/*
+ * Final destructor: tears down the group mutex and frees the memory
+ * of the whole group (the allocation starts at gr_priv->gr).
+ *
+ * Nexthop references are NOT touched here.
+ */
+__noinline static void
+destroy_nhgrp_int(struct nhgrp_priv *gr_priv)
+{
+
+	/* The mutex is taken right before it is destroyed */
+	MP_PRIV_LOCK(gr_priv);
+	MP_PRIV_LOCK_DESTROY(gr_priv);
+
+	free(gr_priv->gr, M_NHOP);
+}
+
+/*
+ * Destroys a group that is no longer referenced nor indexed:
+ * releases the references held on its nexthops and frees the group.
+ *
+ * Asserts that both gr_refcnt and gr_idx are zero.
+ */
+__noinline static void
+destroy_nhgrp(struct nhgrp_priv *gr_priv)
+{
+
+	KASSERT((gr_priv->gr_refcnt == 0), ("gr_refcnt != 0"));
+
+	DPRINTF("DEL MPATH %p", gr_priv);
+
+	KASSERT((gr_priv->gr_idx == 0), ("gr_idx != 0"));
+
+	/* Nexthop references first, then the group memory itself */
+	free_nhgrp_nhops(gr_priv);
+	destroy_nhgrp_int(gr_priv);
+}
+
+/*
+ * Epoch callback: recovers the group from its embedded epoch context
+ * @ctx and destroys it.
+ */
+static void
+destroy_nhgrp_epoch(epoch_context_t ctx)
+{
+
+	destroy_nhgrp(__containerof(ctx, struct nhgrp_priv, gr_epoch_ctx));
+}
+
+
+/*
+ * Tries to take a reference on @gr_priv.
+ * Returns the result of refcount_acquire_if_not_zero(): non-zero if the
+ * reference was taken, 0 if the refcount had already dropped to zero.
+ */
+static int
+ref_nhgrp(struct nhgrp_priv *gr_priv)
+{
+	int acquired;
+
+	acquired = refcount_acquire_if_not_zero(&gr_priv->gr_refcnt);
+
+	return (acquired);
+}
+
+/*
+ * Public wrapper around ref_nhgrp() taking the dataplane object @gr.
+ * Returns whatever ref_nhgrp() returns for the group's control part.
+ */
+int
+nhgrp_ref_group(struct nhgrp_object *gr)
+{
+	struct nhgrp_priv *gr_priv = NHGRP_PRIV(gr);
+
+	return (ref_nhgrp(gr_priv));
+}
+
+/*
+ * Takes a reference on every nexthop listed in @gr_priv.
+ *
+ * All-or-nothing: if any nexthop cannot be referenced, the references
+ * taken so far are dropped again.
+ * Returns 1 if all nexthops were referenced, 0 otherwise.
+ */
+static int
+ref_nhgrp_nhops(struct nhgrp_priv *gr_priv)
+{
+	int i, j;
+
+	for (i = 0; i < gr_priv->gr_nh_count; i++) {
+		if (nhop_ref_object(gr_priv->gr_nh_weights[i].nh) == 0)
+			break;
+	}
+
+	if (i == gr_priv->gr_nh_count)
+		return (1);
+
+	/*
+	 * Failed to ref nexthop #i, b/c it's deleted.
+	 * Roll back the references taken on nexthops [0, i).
+	 */
+	for (j = 0; j < i; j++)
+		nhop_free_object(gr_priv->gr_nh_weights[j].nh);
+
+	return (0);
+}
+
+/*
+ * Drops the reference held on each nexthop of @gr_priv.
+ * NH_FREE() also resets the stored nexthop pointer to NULL.
+ */
+static void
+free_nhgrp_nhops(struct nhgrp_priv *gr_priv)
+{
+	struct weightened_nhop *wn;
+	int i;
+
+	for (i = 0; i < gr_priv->gr_nh_count; i++) {
+		wn = &gr_priv->gr_nh_weights[i];
+		NH_FREE(wn->nh);
+	}
+}
+
+
+/*
+ * Creates or looks up an existing nexthop group based on @wn and @num_nhops.
+ * @wn is sorted in place as a side effect of building the lookup key.
+ *
+ * Returns the nhop group or NULL, passing error code in @perror:
+ *  0      - success, group returned.
+ *  ENOMEM - group datastructures or the group itself cannot be allocated.
+ *  EAGAIN - some nexthop is being deleted or the new group cannot be
+ *           linked; no group is returned.
+ */
+static struct nhgrp_priv *
+get_nhgrp(struct nh_control *ctl, struct weightened_nhop *wn, int num_nhops,
+    int *perror)
+{
+	struct nhgrp_priv *key, *grp_priv;
+
+	if (ctl->gr_head.hash_size == 0) {
+		/* First multipath request. Bootstrap mpath datastructures. */
+		if (nhgrp_ctl_alloc_default(ctl, M_NOWAIT) == 0) {
+			*perror = ENOMEM;
+			return (NULL);
+		}
+	}
+
+	if ((key = alloc_nhgrp(wn, num_nhops)) == NULL) {
+		*perror = ENOMEM;
+		return (NULL);
+	}
+
+	grp_priv = find_nhgrp(ctl, key);
+	if (grp_priv != NULL) {
+		/*
+		 * Free originally-created group. As it hasn't been linked
+		 * and the dependent nexthops haven't been referenced, just
+		 * free the group.
+		 */
+		destroy_nhgrp_int(key);
+		*perror = 0;
+		return (grp_priv);
+	}
+
+	/* No existing group, try to link the new one */
+	if (ref_nhgrp_nhops(key) == 0) {
+		/*
+		 * Some of the nexthops have been scheduled for deletion.
+		 * As the group hasn't been linked / no nexthops have been
+		 * referenced, call the final destructor immediately.
+		 */
+		destroy_nhgrp_int(key);
+		*perror = EAGAIN;
+		return (NULL);
+	}
+
+	if (link_nhgrp(ctl, key) == 0) {
+		/*
+		 * Unable to allocate index?
+		 * The group was never published, so drop the nexthop
+		 * references taken above and free it right away.
+		 * destroy_nhgrp() is not used here: it asserts
+		 * gr_refcnt == 0, while an unlinked key still carries its
+		 * initial reference.
+		 * Return NULL: the key must not be handed out after
+		 * being freed and the error must not be overwritten.
+		 */
+		free_nhgrp_nhops(key);
+		destroy_nhgrp_int(key);
+		*perror = EAGAIN;
+		return (NULL);
+	}
+
+	*perror = 0;
+	return (key);
+}
+
+/*
+ * Creates/finds the nexthop group matching @wn and @num_nhops in the
+ * nexthop control structure of @rh.
+ *
+ * Returns the dataplane object of the group, or NULL with the reason
+ * stored in @perror. EAGAIN means the operation can be retried.
+ */
+struct nhgrp_object *
+nhgrp_get_group(struct rib_head *rh, struct weightened_nhop *wn, int num_nhops,
+    int *perror)
+{
+	struct nhgrp_priv *gr_priv;
+
+	gr_priv = get_nhgrp(rh->nh_control, wn, num_nhops, perror);
+
+	return ((gr_priv != NULL) ? gr_priv->gr : NULL);
+}
+
+/*
+ * Creates new nexthop group based on @src group with the nexthops defined
+ * in bitmask @nhop_mask removed.
+ *
+ * Bit (i % 64) of nhop_mask[i / 64] being set means that nexthop #i of
+ * @src (index in its weighted nexthop list) is NOT copied to the new group.
+ * At least 2 nexthops have to remain (asserted).
+ *
+ * Returns the nexthop group or NULL on failure, with the error code
+ * stored in @perror.
+ */
+struct nhgrp_object *
+nhgrp_get_del_nhops(struct rib_head *rh, const struct nhgrp_object *src,
+    uint64_t *nhop_mask, int *perror)
+{
+	char storage[64];
+	struct nh_control *ctl = rh->nh_control;
+	struct weightened_nhop *pnhops;
+	const struct nhgrp_priv *mp_priv, *src_priv;
+	size_t sz;
+	int i, num_nhops;
+
+	src_priv = NHGRP_PRIV_CONST(src);
+
+	sz = src_priv->gr_nh_count * (sizeof(struct weightened_nhop));
+	/* optimize for <= 4 paths, each path=16 bytes */
+	if (sz <= sizeof(storage))
+		pnhops = (struct weightened_nhop *)&storage[0];
+	else {
+		pnhops = malloc(sz, M_TEMP, M_NOWAIT);
+		if (pnhops == NULL) {
+			*perror = ENOMEM;
+			return (NULL);
+		}
+	}
+
+	/* Copy nhops first */
+	num_nhops = 0;
+	for (i = 0; i < src_priv->gr_nh_count; i++) {
+		/*
+		 * Do not copy deleted nexthops.
+		 * The bit has to be built from a 64-bit one: shifting a
+		 * plain int by 31 or more is undefined.
+		 */
+		if ((nhop_mask[i / 64] & ((uint64_t)1 << (i % 64))) != 0)
+			continue;
+		memcpy(&pnhops[num_nhops++], &src_priv->gr_nh_weights[i],
+		    sizeof(struct weightened_nhop));
+	}
+
+	KASSERT((num_nhops >= 2), ("num_nhops < 2 after deletion"));
+
+	mp_priv = get_nhgrp(ctl, pnhops, num_nhops, perror);
+
+	/* The temporary array is not needed once the group is built */
+	if (pnhops != (struct weightened_nhop *)&storage[0])
+		free(pnhops, M_TEMP);
+
+	if (mp_priv == NULL)
+		return (NULL);
+
+	return (mp_priv->gr);
+}
+
+
+#if 0
+/*
+ * Adds new nexthop to existing multipath group or a single nexthop.
+ *
+ * NOTE(review): this block is compiled out and is stale. It calls
+ * get_nhgrp() with 3 arguments and nhgrp_append_nhops() with
+ * (ctl, group, wn, 1, addmask, changemask), neither of which matches
+ * the definitions in this file, and it uses 'struct nhop_mpath'.
+ * It would have to be reworked before being enabled.
+ */
+struct nhgrp_object *
+nhgrp_append_nhop(struct rib_head *rh, struct nhop_object *nh_orig,
+ u_long weight_orig, struct nhop_object *nh_new, u_long weight_new,
+ uint64_t *addmask, uint64_t *changemask)
+{
+ struct nh_control *ctl = rh->nh_control;
+ struct weightened_nhop wn[2];
+ struct nhop_mpath *mp;
+
+ /*
+ * Add our nexthop we try to add as a first one.
+ */
+ wn[0].nh = nh_new;
+ wn[0].weight = weight_new;
+ if (!NH_IS_MULTIPATH(nh_orig)) {
+ /*
+ * Original nexthop is not multipath.
+ * Request new multipath group consisting of 2 nexthops.
+ */
+ wn[1].nh = nh_orig;
+ wn[1].weight = weight_orig;
+ mp = get_nhgrp(ctl, wn, 2);
+ *addmask = 1 << 0;
+ *changemask = 0;
+ } else {
+ /*
+ * Original nexthop is already multipath.
+ * Create a new multipath group from existing group
+ * and the new nexthop.
+ */
+ struct nhop_mpath *mp_orig = (struct nhop_mpath *)nh_orig;
+ *addmask = 0;
+ *changemask = 0;
+ mp = nhgrp_append_nhops(ctl, mp_orig, wn, 1, addmask,
+ changemask);
+ DPRINTF("mpath append returned %p from %p\n", mp, mp_orig);
+ }
+ /* mp has now referenced all nexthops */
+
+ return (mp);
+}
+#endif
+
+/*
+ * Appends one or more nexthops denoted by @wn to the nexthop group @gr_orig.
+ *
+ * @wn / @num_nhops: nexthops (with weights) to add.
+ * @paddmask: on success, bit j is set if entry #j of the resulting group's
+ *   weighted nexthop list matches (by nexthop and weight) one of the
+ *   entries in @wn. The mask is cleared first.
+ *   NOTE(review): a single uint64_t can describe at most 64 entries;
+ *   presumably callers keep groups within RIB_MAX_MPATH_WIDTH - confirm.
+ *
+ * Returns the nexthop group or NULL. In the latter case, @perror is
+ * filled with an error code.
+ * Note that function does NOT care if the new nexthops already exist
+ * in the @gr_orig. As a result, they will be added, resulting in the
+ * same nexthop being present multiple times in the new group.
+ */
+struct nhgrp_object *
+nhgrp_append_nhops(struct rib_head *rh, const struct nhgrp_object *gr_orig,
+    struct weightened_nhop *wn, int num_nhops, uint64_t *paddmask, int *perror)
+{
+	char storage[64];
+	struct weightened_nhop *pnhops;
+	const struct weightened_nhop *c_wn;
+	const struct nhgrp_priv *src_priv, *mp_priv;
+	size_t sz;
+	int curr_nhops;
+
+	src_priv = NHGRP_PRIV_CONST(gr_orig);
+	curr_nhops = src_priv->gr_nh_count;
+
+	*perror = 0;
+
+	sz = (src_priv->gr_nh_count + num_nhops) * (sizeof(struct weightened_nhop));
+	/* optimize for <= 4 paths, each path=16 bytes */
+	if (sz <= sizeof(storage))
+		pnhops = (struct weightened_nhop *)&storage[0];
+	else {
+		pnhops = malloc(sz, M_TEMP, M_NOWAIT);
+		if (pnhops == NULL) {
+			*perror = ENOMEM;
+			return (NULL);
+		}
+	}
+
+	/* Copy nhops from original group first, then the new ones */
+	memcpy(pnhops, src_priv->gr_nh_weights,
+	    curr_nhops * sizeof(struct weightened_nhop));
+	memcpy(&pnhops[curr_nhops], wn, num_nhops * sizeof(struct weightened_nhop));
+	curr_nhops += num_nhops;
+
+	mp_priv = get_nhgrp(rh->nh_control, pnhops, curr_nhops, perror);
+
+	if (pnhops != (struct weightened_nhop *)&storage[0])
+		free(pnhops, M_TEMP);
+
+	if (mp_priv == NULL)
+		return (NULL);
+
+	/*
+	 * Nhops are reordered in the new nexthop group.
+	 * Walk through old&new groups to calculate an addition mask.
+	 * TODO: optimize for large multipath groups.
+	 */
+	*paddmask = 0;
+	for (int i = 0; i < num_nhops; i++) {
+		for (int j = 0; j < curr_nhops; j++) {
+			c_wn = &mp_priv->gr_nh_weights[j];
+			if ((wn[i].nh == c_wn->nh) && (wn[i].weight == c_wn->weight)) {
+				/*
+				 * Found. Build the bit from a 64-bit one:
+				 * an int shift is undefined for j >= 31.
+				 */
+				*paddmask |= (uint64_t)1 << j;
+				break;
+			}
+		}
+	}
+
+	return (mp_priv->gr);
+}
+
+/*
+ * Replaces nexthop with index @replace_idx in @gr_orig with the new one in @wn.
+ *
+ * @wn points to a single weighted nexthop.
+ * @pmodmask: on success, the bit corresponding to the position of @wn in
+ *   the resulting group's weighted nexthop list is OR-ed into *pmodmask.
+ *   NOTE(review): the mask is not cleared here, so the caller has to
+ *   initialize it - confirm this is intended.
+ *
+ * Returns new nhop group or NULL with the error code in @perror
+ * (EINVAL if @replace_idx is out of range).
+ */
+struct nhgrp_object *
+nhgrp_get_replace_nhop(struct rib_head *rh, const struct nhgrp_object *gr_orig,
+    struct weightened_nhop *wn, uint8_t replace_idx, uint64_t *pmodmask,
+    int *perror)
+{
+	char storage[64];
+	struct weightened_nhop *pnhops;
+	const struct weightened_nhop *c_wn;
+	const struct nhgrp_priv *src_priv, *mp_priv;
+	size_t sz;
+	int curr_nhops;
+
+	src_priv = NHGRP_PRIV_CONST(gr_orig);
+	curr_nhops = src_priv->gr_nh_count;
+
+	if (replace_idx >= curr_nhops) {
+		*perror = EINVAL;
+		return (NULL);
+	}
+
+	*perror = 0;
+
+	sz = curr_nhops * (sizeof(struct weightened_nhop));
+	/* optimize for <= 4 paths, each path=16 bytes */
+	if (sz <= sizeof(storage))
+		pnhops = (struct weightened_nhop *)&storage[0];
+	else {
+		pnhops = malloc(sz, M_TEMP, M_NOWAIT);
+		if (pnhops == NULL) {
+			*perror = ENOMEM;
+			return (NULL);
+		}
+	}
+
+	/* Copy nhops from original group & update the relevant nhop */
+	memcpy(pnhops, src_priv->gr_nh_weights,
+	    curr_nhops * sizeof(struct weightened_nhop));
+	pnhops[replace_idx] = *wn;
+
+	mp_priv = get_nhgrp(rh->nh_control, pnhops, curr_nhops, perror);
+
+	if (pnhops != (struct weightened_nhop *)&storage[0])
+		free(pnhops, M_TEMP);
+
+	if (mp_priv == NULL)
+		return (NULL);
+
+	/*
+	 * In the resulting group, nhop can be reordered.
+	 * Re-iterate over the group to find where the replacement ended up.
+	 * @wn is a single element, so it is compared as-is against every
+	 * entry of the new group (it must not be indexed by i).
+	 * TODO: optimize for large multipath groups.
+	 */
+	for (int i = 0; i < mp_priv->gr_nh_count; i++) {
+		c_wn = &mp_priv->gr_nh_weights[i];
+		if ((wn->nh == c_wn->nh) && (wn->weight == c_wn->weight)) {
+			/* Found. 64-bit shift: int shift breaks for i >= 31 */
+			*pmodmask |= (uint64_t)1 << i;
+			break;
+		}
+	}
+
+	return (mp_priv->gr);
+}
+
+/*
+ * Gives access to the weighted nexthop list of the group @mp.
+ *
+ * Stores the number of entries into @pnum_nhops and returns a pointer to
+ * the array kept inside the group itself (not a copy).
+ * Asserts that @mp has MPF_MULTIPATH set.
+ * XXX: const!
+ */
+struct weightened_nhop *
+nhgrp_get_nhops(struct nhgrp_object *mp, uint32_t *pnum_nhops)
+{
+	struct nhgrp_priv *priv;
+
+	KASSERT(((mp->mp_flags & MPF_MULTIPATH) != 0), ("nhop is not mpath"));
+
+	priv = NHGRP_PRIV(mp);
+	*pnum_nhops = priv->gr_nh_count;
+
+	return (&priv->gr_nh_weights[0]);
+}
+
+/*
+ * Serializes one nexthop group into @buffer and passes it to the sysctl
+ * request @w via SYSCTL_OUT().
+ *
+ * Record layout, in order:
+ *   struct rt_msghdr (rtm_msglen = total size, RTM_VERSION, RTM_GET)
+ *   struct mpath_external
+ *   gr_nh_count x struct mpath_nhop_external (nexthop index + weight)
+ *   mp_size x uint32_t (nexthop index stored in each dataplane slot)
+ *
+ * @buffer has to be large enough for the whole record: its size is not
+ * passed in and is not checked here.
+ * @rh is currently not used by the function body.
+ *
+ * Returns the value returned by SYSCTL_OUT().
+ */
+__noinline static int
+dump_nhgrp_entry(struct rib_head *rh, struct nhgrp_priv *grp_priv,
+ char *buffer, struct sysctl_req *w)
+{
+
+ struct rt_msghdr *rtm;
+ struct mpath_external *mpe;
+ struct nhgrp_object *mp;
+ struct mpath_nhop_external *ext;
+ uint32_t *pidx;
+ int error;
+ size_t sz;
+
+ //DPRINTF("Dumping: head %p nh %p flags %X req %p\n", rh, nh, nh->nh_flags, w);
+
+ mp = grp_priv->gr;
+
+ /* Total record size: headers + per-nexthop data + per-slot indexes */
+ sz = sizeof(struct rt_msghdr) + sizeof(struct mpath_external);
+ sz += sizeof(struct mpath_nhop_external) * grp_priv->gr_nh_count;
+ sz += sizeof(uint32_t) * mp->mp_size;
+
+ bzero(buffer, sz);
+
+ rtm = (struct rt_msghdr *)buffer;
+ rtm->rtm_msglen = sz;
+ rtm->rtm_version = RTM_VERSION;
+ rtm->rtm_type = RTM_GET;
+
+ /* Group header follows the routing message header */
+ mpe = (struct mpath_external *)(rtm + 1);
+
+ mpe->mp_idx = grp_priv->gr_idx;
+ mpe->mp_refcount = grp_priv->gr_refcnt;
+ mpe->mp_nh_count = grp_priv->gr_nh_count;
+ mpe->mp_group_size = mp->mp_size;
+
+ /* Control plane view: every nexthop with its weight */
+ ext = (struct mpath_nhop_external *)(mpe + 1);
+ for (int i = 0; i < grp_priv->gr_nh_count; i++) {
+ ext[i].nh_idx = grp_priv->gr_nh_weights[i].nh->nh_priv->nh_idx;
+ ext[i].nh_weight = grp_priv->gr_nh_weights[i].weight;
+ }
+
+ /* Dataplane view: nexthop index for each slot of nhops[] */
+ pidx = (uint32_t *)&ext[grp_priv->gr_nh_count];
+ for (int i = 0; i < mp->mp_size; i++)
+ pidx[i] = mp->nhops[i]->nh_priv->nh_idx;
+
+ error = SYSCTL_OUT(w, buffer, sz);
+
+ /*
+ DPRINTF("Exported %d ifindex %d family %d type %d error %d\n", nh->nh_priv->nh_idx, pnhe->ifindex,
+ pnhe->nh_family, pnhe->nh_type, error);
+ */
+
+ return (error);
+}
+
+/*
+ * Exports all nexthop groups of @rh to the sysctl request @w,
+ * one record per group (see dump_nhgrp_entry() for the record layout).
+ *
+ * Returns 0 on success (including "no groups") or the first error
+ * returned while dumping a group.
+ */
+int
+nhgrp_dump_sysctl(struct rib_head *rh, struct sysctl_req *w)
+{
+	struct nh_control *ctl = rh->nh_control;
+	struct epoch_tracker et;
+	struct nhgrp_priv *grp_priv;
+	char *buffer;
+	size_t sz;
+	int error;
+
+	if (ctl->gr_head.items_count == 0)
+		return (0);
+
+	/*
+	 * Scratch buffer for a single record. It has to cover everything
+	 * dump_nhgrp_entry() writes, including the leading rt_msghdr.
+	 * NOTE(review): sized for RIB_MAX_MPATH_WIDTH nexthops and slots;
+	 * presumably groups never carry more nexthops than that - confirm.
+	 */
+	sz = sizeof(struct rt_msghdr) + sizeof(struct mpath_external);
+	sz += (sizeof(struct mpath_nhop_external) + sizeof(uint32_t)) *
+	    RIB_MAX_MPATH_WIDTH;
+	buffer = malloc(sz, M_TEMP, M_WAITOK);
+
+	DPRINTF("NHGRP DUMP: count=%u", ctl->gr_head.items_count);
+	NET_EPOCH_ENTER(et);
+	NHOPS_RLOCK(ctl);
+	CHT_SLIST_FOREACH(&ctl->gr_head, mpath, grp_priv) {
+		error = dump_nhgrp_entry(rh, grp_priv, buffer, w);
+		if (error != 0) {
+			/* Undo locking in reverse order before bailing out */
+			NHOPS_RUNLOCK(ctl);
+			NET_EPOCH_EXIT(et);
+			free(buffer, M_TEMP);
+			return (error);
+		}
+	} CHT_SLIST_FOREACH_END;
+	NHOPS_RUNLOCK(ctl);
+	NET_EPOCH_EXIT(et);
+
+	free(buffer, M_TEMP);
+
+	return (0);
+}
Index: sys/net/route/nhgrp_var.h
===================================================================
--- /dev/null
+++ sys/net/route/nhgrp_var.h
@@ -0,0 +1,85 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * This header file contains private definitions for the nexthop groups.
+ *
+ * Header is not intended to be included by the code external to the
+ * routing subsystem.
+ */
+
+#ifndef _NET_ROUTE_NHGRP_VAR_H_
+#define _NET_ROUTE_NHGRP_VAR_H_
+
+/* define mpath hash table */
+struct nhgrp_priv;
+//CHT_SLIST_DEFINE(mpath, struct nhgrp_priv);
+
+/* Hash definition */
+/* produce hash value for an object */
+#define mpath_hash_obj(_obj) (hash_nhgrp(_obj))
+/* compare two objects */
+#define mpath_cmp(_one, _two) (cmp_nhgrp(_one, _two))
+/* next object accessor */
+#define mpath_next(_obj) (_obj)->gr_priv_next
+
+/*
+ * Control plane part of a nexthop group.
+ *
+ * It lives in the same allocation as the dataplane 'struct nhgrp_object',
+ * right after its nhops[mp_size] array (see _NHGRP_PRIV() below), and is
+ * followed by gr_nh_count weighted nexthops.
+ */
+struct nhgrp_priv {
+ uint32_t gr_idx; /* group index, reported as mp_idx to userland */
+ uint8_t gr_nh_count; /* number of items in nh_weights */
+ uint8_t gr_spare[3];
+ u_int gr_refcnt; /* use refcount */
+ struct mtx gr_mtx; /* mutex */
+ struct nh_control *nh_control; /* parent control structure */
+ struct nhgrp_priv *gr_priv_next; /* next object link, see mpath_next() */
+ struct nhgrp_object *gr; /* dataplane part; start of the allocation */
+ struct epoch_context gr_epoch_ctx; /* epoch data for nhop */
+ struct weightened_nhop gr_nh_weights[0]; /* nexthops with weights */
+};
+
+#define _NHGRP_PRIV(_src) (&(_src)->nhops[(_src)->mp_size])
+#define NHGRP_PRIV(_src) ((struct nhgrp_priv *)_NHGRP_PRIV(_src))
+#define NHGRP_PRIV_CONST(_src) ((const struct nhgrp_priv *)_NHGRP_PRIV(_src))
+
+#define MP_PRIV_LOCK_INIT(_priv) mtx_init(&(_priv)->gr_mtx, "nhgrp", NULL, MTX_DEF)
+#define MP_PRIV_LOCK(_priv) mtx_lock(&(_priv)->gr_mtx)
+#define MP_PRIV_UNLOCK(_priv) mtx_unlock(&(_priv)->gr_mtx)
+#define MP_PRIV_LOCK_DESTROY(_priv) mtx_destroy(&(_priv)->gr_mtx)
+#define MP_PRIV_LOCK_ASSERT(_priv) mtx_assert(&(_priv)->gr_mtx, MA_OWNED)
+
+/* mpath */
+struct weightened_nhop;
+
+/* nhgrp.c */
+int nhgrp_ctl_alloc_default(struct nh_control *ctl, int malloc_flags);
+struct nhgrp_priv *find_nhgrp(struct nh_control *ctl, const struct nhgrp_priv *key);
+int link_nhgrp(struct nh_control *ctl, struct nhgrp_priv *grp_priv);
+struct nhgrp_priv *unlink_nhgrp(struct nh_control *ctl, struct nhgrp_priv *key);
+
+#endif
+
Index: sys/net/route/nhop.h
===================================================================
--- /dev/null
+++ sys/net/route/nhop.h
@@ -0,0 +1,228 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * This header file contains public definitions for the nexthop routing subsystem.
+ */
+
+#ifndef _NET_ROUTE_NHOP_H_
+#define _NET_ROUTE_NHOP_H_
+
+#include <netinet/in.h> /* sockaddr_in && sockaddr_in6 */
+
+#include <sys/counter.h>
+
+enum nhop_type {
+ NH_TYPE_IPV4_ETHER_RSLV = 1, /* IPv4 ethernet without GW */
+ NH_TYPE_IPV4_ETHER_NHOP = 2, /* IPv4 with pre-calculated ethernet encap */
+ NH_TYPE_IPV6_ETHER_RSLV = 3, /* IPv6 ethernet, without GW */
+ NH_TYPE_IPV6_ETHER_NHOP = 4 /* IPv6 with pre-calculated ethernet encap*/
+};
+
+#ifdef _KERNEL
+
+/*
+ * Currently the only use case of AF_LINK gateway is storing
+ * interface index of the interface of the source IPv6 address.
+ * This is used by the IPv6 code for the connections over loopback
+ * interface.
+ *
+ * The structure below copies 'struct sockaddr_dl', reducing the
+ * size of sdl_data buffer, as it is not used. This change
+ * allows to store the AF_LINK gateways in the nhop gateway itself,
+ * simplifying control plane handling.
+ */
+struct sockaddr_dl_short {
+ u_char sdl_len; /* Total length of sockaddr */
+ u_char sdl_family; /* AF_LINK */
+ u_short sdl_index; /* if != 0, system given index for interface */
+ u_char sdl_type; /* interface type */
+ u_char sdl_nlen; /* interface name length, no trailing 0 reqd. */
+ u_char sdl_alen; /* link level address length */
+ u_char sdl_slen; /* link layer selector length */
+ char sdl_data[8]; /* unused */
+};
+
+#define NHOP_RELATED_FLAGS \
+ (RTF_GATEWAY | RTF_HOST | RTF_REJECT | RTF_BLACKHOLE | \
+ RTF_FIXEDMTU | RTF_LOCAL | RTF_BROADCAST | RTF_MULTICAST)
+
+struct nhop_request {
+ struct ifnet *ifp;
+ struct ifaddr *ifa;
+ struct sockaddr *gw;
+ int family;
+ int mtu;
+ int rt_flags; /* gets converted to nh_flags later */
+ uint16_t nh_type;
+ uint16_t nh_flags_additional; /* Additional flags to set to the nh_flags */
+};
+
+struct nh_control;
+struct nhop_priv;
+
+/*
+ * Struct 'nhop_object' field description:
+ *
+ * nh_flags: NHF_ flags used in the dataplane code. NHF_GATEWAY or NHF_BLACKHOLE
+ * can be examples of such flags.
+ * nh_mtu: ready-to-use nexthop mtu. Already accounts for the link-level header,
+ * interface MTU and protocol-specific limitations.
+ * nh_prepend_len: link-level prepend length. Currently unused.
+ * nh_ifp: logical transmit interface. The one from which if_transmit() will be
+ * called. Guaranteed to be non-NULL.
+ * nh_aifp: ifnet of the source address. Same as nh_ifp except IPv6 loopback
+ * routes. See the example below.
+ * nh_ifa: interface address to use. Guaranteed to be non-NULL.
+ * nh_pksent: counter(9) reflecting the number of packets transmitted.
+ *
+ * gw_: storage suitable to hold AF_INET, AF_INET6 or AF_LINK gateway. More
+ * details are available in the examples below.
+ *
+ *
+ * Direct routes (routes w/o gateway):
+ * NHF_GATEWAY is NOT set.
+ * nh_ifp denotes the logical transmit interface.
+ * nh_aifp is the same as nh_ifp
+ * gw_sa contains AF_LINK sa with nh_aifp ifindex (compat)
+ * Loopback routes:
+ * NHF_GATEWAY is NOT set.
+ * nh_ifp points to the loopback interface (lo0).
+ * nh_aifp points to the interface where the destination address belongs to.
+ * This is useful in IPv6 link-local-over-loopback communications.
+ * gw_sa contains AF_LINK sa with nh_aifp ifindex (compat)
+ * GW routes:
+ * NHF_GATEWAY is set.
+ * nh_ifp denotes the logical transmit interface.
+ * nh_aifp is the same as nh_ifp
+ * gw_sa contains L3 address (either AF_INET or AF_INET6).
+ *
+ *
+ * Note: struct nhop_object fields are ordered in a way that
+ * supports memcmp-based comparisons.
+ *
+ */
+#define NHOP_END_CMP (__offsetof(struct nhop_object, nh_pksent))
+
+struct nhop_object {
+ uint16_t nh_flags; /* nhop flags */
+ uint16_t nh_mtu; /* nexthop mtu */
+ union {
+ struct sockaddr_in gw4_sa; /* GW accessor as IPv4 */
+ struct sockaddr_in6 gw6_sa; /* GW accessor as IPv6 */
+ struct sockaddr gw_sa;
+ struct sockaddr_dl_short gwl_sa; /* AF_LINK gw (compat) */
+ char gw_buf[28];
+ };
+ struct ifnet *nh_ifp; /* Logical egress interface. Always != NULL */
+ struct ifaddr *nh_ifa; /* interface address to use. Always != NULL */
+ struct ifnet *nh_aifp; /* ifnet of the source address. Always != NULL */
+ counter_u64_t nh_pksent; /* packets sent using this nhop */
+ /* 32 bytes + 4xPTR == 64(amd64) / 48(i386) */
+ uint8_t nh_prepend_len; /* length of prepend data */
+ uint8_t spare[3];
+ uint32_t spare1; /* alignment */
+ char nh_prepend[56]; /* L2 prepend */
+ /* -- 128 bytes -- */
+ struct nhop_priv *nh_priv; /* control plane data */
+ uint8_t spare2[16];
+};
+
+/*
+ * Nhop validness.
+ *
+ * Currently we verify whether link is up or not on every packet, which can be
+ * quite costly.
+ * TODO: subscribe for the interface notifications and update the nexthops
+ * with NHF_INVALID flag.
+ */
+
+//#define NH_IS_VALID(_nh) (((_nh)->nh_flags & NHF_INVALID) == 0)
+#define NH_IS_VALID(_nh) RT_LINK_IS_UP((_nh)->nh_ifp)
+#define NH_IS_MULTIPATH(_nh) ((_nh)->nh_flags & NHF_MULTIPATH)
+
+#define RT_GATEWAY(_rt) ((struct sockaddr *)&(_rt)->rt_nhop->gw4_sa)
+#define RT_GATEWAY_CONST(_rt) ((const struct sockaddr *)&(_rt)->rt_nhop->gw4_sa)
+
+#define NH_FREE(_nh) do { \
+ nhop_free_object(_nh); \
+ /* guard against invalid refs */ \
+ _nh = NULL; \
+} while (0)
+
+
+void nhop_free_object(struct nhop_object *nh);
+
+struct sysctl_req;
+struct sockaddr_dl;
+struct rib_head;
+
+uint32_t nhop_get_idx(const struct nhop_object *nh);
+void nhop_free(struct nhop_object *nh);
+
+#endif
+
+/* Kernel <> userland structures */
+
+/*
+ * Userland-visible description of a single nexthop.
+ *
+ * NOTE(review): the layout relies on implicit compiler padding (e.g.
+ * between nh_family and nh_type, and after prepend_len) - confirm the
+ * layout is what userland consumers expect on all supported ABIs.
+ */
+struct nhop_external {
+ uint32_t nh_idx;
+ uint32_t nh_fib;
+ uint32_t ifindex; /* transmit interface ifindex */
+ uint32_t aifindex; /* address ifindex */
+ uint8_t nh_family; /* address family */
+ uint16_t nh_type; /* nexthop type */
+ uint16_t nh_mtu; /* nexthop mtu */
+
+ uint16_t nh_flags; /* nhop flags */
+ struct in_addr nh_addr; /* GW/DST IPv4 address */
+ struct in_addr nh_src; /* default source IPv4 address */
+ uint64_t nh_pksent;
+ /* control plane */
+ /* lookup key: address, family, type */
+ char nh_prepend[64]; /* L2 prepend */
+ uint8_t prepend_len; /* length of the prepend */
+ uint64_t nh_refcount; /* number of references */
+};
+
+/* Userland-visible (nexthop index, weight) pair of a multipath group member */
+struct mpath_nhop_external {
+ uint32_t nh_idx; /* index of the nexthop */
+ uint32_t nh_weight; /* weight of the nexthop within the group */
+};
+
+/* Userland-visible header describing a multipath (nexthop) group */
+struct mpath_external {
+ uint32_t mp_idx; /* group index */
+ uint32_t mp_refcount; /* group reference count */
+ uint32_t mp_nh_count; /* number of weighted nexthops in the group */
+ uint32_t mp_group_size; /* number of dataplane slots in the group */
+};
+
+
+#endif
+
+
Index: sys/net/route/nhop.c
===================================================================
--- /dev/null
+++ sys/net/route/nhop.c
@@ -0,0 +1,346 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+
+#include <sys/cdefs.h>
+#include "opt_inet.h"
+#include "opt_route.h"
+#include "opt_route_mpath.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/rwlock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/kernel.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_dl.h>
+#include <net/route.h>
+#include <net/route_var.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/in_fib.h>
+
+#include <net/route/nhop_utils.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_var.h>
+#include <net/route/shared.h>
+
+/*
+ * This file contains data structures management logic for the nexthop ("nhop")
+ * route subsystem.
+ *
+ * Nexthops in the original sense are the objects containing all the necessary
+ * information to forward the packet to the selected destination.
+ * In particular, nexthop is defined by a combination of
+ * ifp, ifa, aifp, mtu, gw addr(if set), nh_type, nh_family, mask of rt_flags and
+ * NHF_DEFAULT
+ *
+ * All nexthops are stored in the resizable hash table.
+ * Additionally, each nexthop gets assigned its unique index (nexthop index)
+ * so userland programs can interact with the nexthops easier. Index allocation
+ * is backed by the bitmask array.
+ */
+
+/* malloc(9) type for nexthop objects and the nexthop control structures. */
+MALLOC_DEFINE(M_NHOP, "nhops", "nexthops data");
+
+
+/* Hash management functions */
+
+/*
+ * Creates the nexthop control structure for @rh: the nexthop hash table,
+ * the nexthop index bitmask and the lock protecting them. With ROUTE_MPATH
+ * the multipath group control data is initialized as well.
+ * All allocations use M_WAITOK. Always returns 0.
+ */
+int
+nhops_init(struct rib_head *rh)
+{
+	struct nh_control *ctl;
+	void *hash_mem, *idx_mem;
+	uint32_t num_buckets, num_items;
+	size_t alloc_size;
+
+	/*
+	 * Initial sizes: 16 hash buckets (this will be enough for most
+	 * of the cases) and a 128-byte index bitmask.
+	 */
+	num_buckets = 16;
+	num_items = 128 * 8; /* 128 bytes */
+
+	ctl = malloc(sizeof(struct nh_control), M_NHOP, M_WAITOK | M_ZERO);
+
+	/* Nexthop hash */
+	alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets);
+	hash_mem = malloc(alloc_size, M_NHOP, M_WAITOK | M_ZERO);
+	CHT_SLIST_INIT(&ctl->nh_head, hash_mem, num_buckets);
+
+	/* Nexthop index bitmask */
+	idx_mem = malloc(bitmask_get_size(num_items), M_NHOP,
+	    M_WAITOK | M_ZERO);
+	bitmask_init(&ctl->nh_idx_head, idx_mem, num_items);
+
+	NHOPS_LOCK_INIT(ctl);
+
+	/* Cross-link the routing table head and its nexthop control data */
+	ctl->rh = rh;
+	rh->nh_control = ctl;
+
+#ifdef ROUTE_MPATH
+	nhgrp_ctl_init(ctl);
+#endif
+
+	return (0);
+}
+
+/*
+ * Tears down the nexthop control structure of @rh.
+ *
+ * Every nexthop still present in the hash is marked as unlinked (nh_control
+ * and nh_idx are cleared under the per-nexthop lock), then the multipath
+ * control data (ROUTE_MPATH only), the hash memory, the index bitmask and
+ * the control structure itself are freed. The nexthop objects themselves
+ * are not freed here.
+ *
+ * NOTE(review): no counterpart to NHOPS_LOCK_INIT() is called before
+ * free(ctl) - confirm whether the lock needs explicit destruction.
+ */
+void
+nhops_destroy(struct rib_head *rh)
+{
+ struct nh_control *ctl;
+ struct nhop_priv *nh_priv;
+
+ ctl = rh->nh_control;
+
+ /*
+ * All routes should have been deleted in rt_table_destroy().
+ * However, TCP stack or other consumers may store referenced
+ * nexthop pointers. When these references go to zero,
+ * nhop_free_object() will try to unlink these records from the
+ * datastructures, most likely leading to panic.
+ *
+ * Avoid that by explicitly marking all of the remaining
+ * nexthops as unlinked.
+ */
+
+ NHOPS_RLOCK(ctl);
+ CHT_SLIST_FOREACH(&ctl->nh_head, nhops, nh_priv) {
+ DPRINTF("Unlinking referenced nhop %u", nh_priv->nh_idx);
+ NH_PRIV_LOCK(nh_priv);
+ nh_priv->nh_control = NULL;
+ nh_priv->nh_idx = 0;
+ NH_PRIV_UNLOCK(nh_priv);
+ } CHT_SLIST_FOREACH_END;
+ NHOPS_RUNLOCK(ctl);
+
+#ifdef ROUTE_MPATH
+ nhgrp_ctl_free(ctl);
+#endif
+ free(ctl->nh_head.ptr, M_NHOP);
+ free(ctl->nh_idx_head.idx, M_NHOP);
+ free(ctl, M_NHOP);
+}
+
+/*
+ * Nexthops distribution:
+ *
+ * 2 "mandatory" nexthops per interface ("interface route", "loopback").
+ * For direct peering: 1 nexthop for the peering router per ifp/af.
+ * For IX-like peering: tens to hundreds of neighbor nexthops per ifp/af.
+ * IGP control plane & broadcast segment: tens of nexthops per ifp/af.
+ *
+ * With that in mind, hash nexthops by the combination of the interface
+ * and GW IP address.
+ */
+/*
+ * Fixed-size key that hash_priv() fills in and feeds to djb_hash().
+ * The key is zeroed before use, so fields left unset hash as zero.
+ */
+struct _hash_data {
+ uint16_t ifindex; /* low 16 bits of the transmit interface index */
+ uint8_t family; /* sa_family of the stored gateway sockaddr */
+ uint8_t nh_type; /* low 8 bits of the nexthop type */
+ uint32_t gw_addr; /* IPv4 gateway address or last 32 bits of the IPv6 one */
+};
+
+/*
+ * Computes the hash of the nexthop described by @priv.
+ * The key is built from the transmit interface index, the gateway address
+ * family, the nexthop type and 32 bits of the gateway address (the whole
+ * IPv4 address or the last 32 bits of the IPv6 address; zero otherwise).
+ */
+static uint32_t
+hash_priv(const struct nhop_priv *priv)
+{
+	const struct nhop_object *nh = priv->nh;
+	struct _hash_data key;
+
+	/* Zero everything first so that every key byte is well defined */
+	memset(&key, 0, sizeof(key));
+
+	key.ifindex = nh->nh_ifp->if_index & 0xFFFF;
+	key.family = nh->gw_sa.sa_family;
+	key.nh_type = priv->nh_type & 0xFF;
+
+	switch (nh->gw_sa.sa_family) {
+	case AF_INET6:
+		memcpy(&key.gw_addr, &nh->gw6_sa.sin6_addr.s6_addr32[3], 4);
+		break;
+	case AF_INET:
+		memcpy(&key.gw_addr, &nh->gw4_sa.sin_addr, 4);
+		break;
+	default:
+		/* Other families do not contribute an address */
+		break;
+	}
+
+	return (uint32_t)(djb_hash((const unsigned char *)&key, sizeof(key)));
+}
+
+/*
+ * Resizes the nexthop hash and/or the nexthop index bitmask.
+ *
+ * @new_nh_buckets: requested number of hash buckets, 0 to leave the hash as is.
+ * @new_idx_items: requested number of bitmask items, 0 to leave the bitmask
+ *  as is.
+ *
+ * New memory is allocated with M_NOWAIT before the write lock is taken;
+ * a failed allocation simply skips the corresponding resize. Memory that is
+ * no longer used (replaced or never installed) is freed after the lock
+ * has been dropped.
+ */
+__noinline static void
+consider_resize(struct nh_control *ctl, uint32_t new_nh_buckets, uint32_t new_idx_items)
+{
+	void *nh_ptr, *nh_idx_ptr;
+	void *old_idx_ptr;
+	size_t alloc_size;
+
+	nh_ptr = NULL;
+	if (new_nh_buckets != 0) {
+		alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_nh_buckets);
+		nh_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO);
+	}
+
+	nh_idx_ptr = NULL;
+	if (new_idx_items != 0) {
+		alloc_size = bitmask_get_size(new_idx_items);
+		nh_idx_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO);
+	}
+
+	if (nh_ptr == NULL && nh_idx_ptr == NULL) {
+		/* Nothing to resize or both allocations failed */
+		return;
+	}
+
+	DPRINTF("going to resize: nh:[ptr:%p sz:%u] idx:[ptr:%p sz:%u]", nh_ptr,
+	    new_nh_buckets, nh_idx_ptr, new_idx_items);
+
+	old_idx_ptr = NULL;
+
+	NHOPS_WLOCK(ctl);
+	if (nh_ptr != NULL) {
+		/*
+		 * NOTE(review): nh_ptr is freed below, so the macro is
+		 * expected to leave the replaced bucket memory in nh_ptr -
+		 * confirm against CHT_SLIST_RESIZE().
+		 */
+		CHT_SLIST_RESIZE(&ctl->nh_head, nhops, nh_ptr, new_nh_buckets);
+	}
+	if (nh_idx_ptr != NULL) {
+		/*
+		 * NOTE(review): the swap is done on a non-zero return of
+		 * bitmask_copy(); confirm this matches its return convention.
+		 */
+		if (bitmask_copy(&ctl->nh_idx_head, nh_idx_ptr, new_idx_items)) {
+			bitmask_swap(&ctl->nh_idx_head, nh_idx_ptr,
+			    new_idx_items, &old_idx_ptr);
+		} else {
+			/*
+			 * The new bitmask was not installed: hand it to the
+			 * common free path instead of leaking it.
+			 */
+			old_idx_ptr = nh_idx_ptr;
+		}
+	}
+	NHOPS_WUNLOCK(ctl);
+
+	if (nh_ptr != NULL)
+		free(nh_ptr, M_NHOP);
+	if (old_idx_ptr != NULL)
+		free(old_idx_ptr, M_NHOP);
+}
+
+/*
+ * Links @nh_priv into the nexthop hash of @ctl and assigns it a nexthop
+ * index allocated from the index bitmask.
+ *
+ * Returns the assigned index, or 0 when no index could be allocated (the
+ * nexthop is not linked in that case). In both cases consider_resize() is
+ * called after the lock is dropped, using the sizes computed under the lock.
+ *
+ * NOTE(review): idx is 16 bits wide here while nhop_get_idx() reports the
+ * index as uint32_t - confirm this matches what bitmask_alloc_idx() expects.
+ */
+__noinline int
+link_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv)
+{
+ uint16_t idx;
+ uint32_t new_num_buckets, new_num_items;
+
+ NHOPS_WLOCK(ctl);
+ /* Check if we need to resize hash and index */
+ new_num_buckets = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->nh_head);
+ new_num_items = bitmask_get_resize_items(&ctl->nh_idx_head);
+
+ if (bitmask_alloc_idx(&ctl->nh_idx_head, &idx) != 0) {
+ NHOPS_WUNLOCK(ctl);
+ DPRINTF("Unable to allocate nhop index");
+ /* Give the bitmask a chance to grow for the next attempt */
+ consider_resize(ctl, new_num_buckets, new_num_items);
+ return (0);
+ }
+
+ nh_priv->nh_idx = idx;
+ nh_priv->nh_control = ctl;
+ CHT_SLIST_INSERT_HEAD(&ctl->nh_head, nhops, nh_priv);
+
+ NHOPS_WUNLOCK(ctl);
+
+ DPRINTF("Linked nhop priv %p to %d, hash %u, ctl %p", nh_priv, idx,
+ hash_priv(nh_priv), ctl);
+ consider_resize(ctl, new_num_buckets, new_num_items);
+
+ return (idx);
+}
+
+/*
+ * Removes @nh_priv from the nexthop hash of @ctl and releases its nexthop
+ * index.
+ *
+ * On success the removed entry has nh_idx and nh_control cleared (under the
+ * per-nexthop lock) and is returned. NULL is returned when the entry was
+ * not found in the hash; kernels built with assertions panic in that case.
+ * consider_resize() is called after the lock is dropped, using the sizes
+ * computed under the lock.
+ */
+__noinline struct nhop_priv *
+unlink_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv)
+{
+	struct nhop_priv *priv_ret;
+	int idx, ret;
+	uint32_t new_num_buckets, new_num_items;
+
+	idx = 0;
+	ret = 0;
+
+	NHOPS_WLOCK(ctl);
+	CHT_SLIST_REMOVE_BYOBJ(&ctl->nh_head, nhops, nh_priv, priv_ret);
+
+	if (priv_ret != NULL) {
+		NH_PRIV_LOCK(priv_ret);
+		idx = priv_ret->nh_idx;
+		priv_ret->nh_idx = 0;
+		priv_ret->nh_control = NULL;
+		NH_PRIV_UNLOCK(priv_ret);
+
+		KASSERT((idx != 0), ("bogus nhop index 0"));
+		ret = bitmask_free_idx(&ctl->nh_idx_head, idx);
+	}
+
+	/* Check if we need to resize hash and index */
+	new_num_buckets = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->nh_head);
+	new_num_items = bitmask_get_resize_items(&ctl->nh_idx_head);
+
+	NHOPS_WUNLOCK(ctl);
+
+	if (priv_ret == NULL) {
+		DPRINTF("Unable to unlink nhop priv %p from hash, hash %u ctl %p",
+		    nh_priv,hash_priv(nh_priv), ctl);
+	}
+	KASSERT(priv_ret != NULL,
+	    ("unable to unlink nhop priv %p from hash", nh_priv));
+	/* ret can only be non-zero when priv_ret was found */
+	if (ret != 0) {
+		DPRINTF("Unable to unlink index %d from nhop %p", idx, priv_ret->nh);
+	}
+
+	DPRINTF("Unlinked nhop %p priv idx %d", nh_priv, idx);
+
+	consider_resize(ctl, new_num_buckets, new_num_items);
+
+	return (priv_ret);
+}
+
+/*
+ * Looks up a nexthop equal to @nh_priv in the hash of @ctl.
+ * Returns the matching entry with an additional reference taken, or NULL
+ * when there is no match or the match is already being deleted.
+ */
+__noinline struct nhop_priv *
+find_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv)
+{
+	struct nhop_priv *match;
+
+	NHOPS_RLOCK(ctl);
+	CHT_SLIST_FIND_BYOBJ(&ctl->nh_head, nhops, nh_priv, match);
+	/* A refcount of 0 means the nexthop is being deleted: ignore it */
+	if (match != NULL &&
+	    refcount_acquire_if_not_zero(&match->nh_refcnt) == 0)
+		match = NULL;
+	NHOPS_RUNLOCK(ctl);
+
+	return (match);
+}
+
+
Index: sys/net/route/nhop_ctl.c
===================================================================
--- /dev/null
+++ sys/net/route/nhop_ctl.c
@@ -0,0 +1,740 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+
+#include <sys/cdefs.h>
+#include "opt_inet.h"
+#include "opt_route.h"
+#include "opt_route_mpath.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/kernel.h>
+#include <sys/epoch.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_dl.h>
+#include <net/route.h>
+#include <net/route_var.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/in_fib.h>
+
+#include <net/route/nhop_utils.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_var.h>
+#include <net/route/shared.h>
+
+/*
+ * This file contains core functionality for the nexthop ("nhop") route subsystem.
+ * The business logic needed to create nexthop objects is implemented here.
+ *
+ * Nexthops in the original sense are the objects containing all the necessary
+ * information to forward the packet to the selected destination.
+ * In particular, nexthop is defined by a combination of
+ * ifp, ifa, aifp, mtu, gw addr(if set), nh_type, nh_family, mask of rt_flags and
+ * NHF_DEFAULT
+ *
+ * Additionally, each nexthop gets assigned its unique index (nexthop index)
+ * so userland programs can interact with the nexthops easier. Index allocation
+ * is backed by the bitmask array.
+ * All nexthops are stored in the resizable hash table.
+ *
+ * Basically, this file revolves around supporting 2 functions:
+ * 1) fill_nhop(), which contains all business logic on filling the nexthop fields
+ * based on the provided request
+ * 2) nhop_get(), which gets a nexthop based on the provided request.
+ *
+ *
+ * Conventions:
+ * 1) non-exported functions start with verb
+ * 2) exported function starts with the subsystem prefix: "nhop"
+ *
+ */
+
+/* Nexthop construction and sysctl export */
+static struct nhop_object *alloc_nhop(const struct nhop_request *req);
+static int fill_nhop(const struct nhop_request *req, struct nhop_object *nh);
+static int dump_nhop_entry(struct rib_head *rh, struct nhop_object *nh, struct sysctl_req *w);
+
+/* Temporary nexthops, used only as hash lookup keys */
+static struct nhop_object *alloc_nhop_tmp(const struct nhop_request *req);
+static void free_nhop_tmp(struct nhop_object *nh);
+
+/* Helpers used while filling a nexthop */
+static struct ifnet *get_aifp(const struct nhop_request *req);
+static void fill_sdl_from_ifp(struct sockaddr_dl_short *sdl, const struct ifnet *ifp);
+
+/* Destruction, directly or from an epoch callback */
+static void destroy_nhop_epoch(epoch_context_t ctx);
+static void destroy_nhop(struct nhop_priv *nh_priv);
+
+/*
+ * Compile-time layout check: nh_ifp has to stay at byte offset 32 inside
+ * struct nhop_object.
+ * NOTE(review): the reason for this exact offset is not stated here -
+ * presumably it keeps the data path fields together; confirm.
+ */
+_Static_assert(__offsetof(struct nhop_object, nh_ifp) == 32,
+    "nhop_object: unexpected offset of nh_ifp");
+
+/*
+ * Returns the interface of the route source address for @req.
+ *
+ * Normally this is the transmit interface, req->ifp. When the transmit
+ * interface is a loopback and the gateway is an AF_LINK address, the
+ * interface is looked up by the index stored in that address instead, so
+ * that interface-address routes keep pointing at the real interface rather
+ * than at the loopback. This is needed to support link-local IPv6 loopback
+ * communications.
+ *
+ * The returned ifp is referenced.
+ */
+static struct ifnet *
+get_aifp(const struct nhop_request *req)
+{
+	struct epoch_tracker et;
+	struct sockaddr_dl *sdl;
+	struct ifnet *aifp;
+
+	/*
+	 * Packets looped back to ourselves go out via the loopback
+	 * interface, but we would rather know the interface associated
+	 * with the destination address (probably one of our own addresses).
+	 */
+	if ((req->ifp->if_flags & IFF_LOOPBACK) != 0 &&
+	    req->gw->sa_family == AF_LINK) {
+		sdl = (struct sockaddr_dl *)req->gw;
+		NET_EPOCH_ENTER(et);
+		aifp = ifnet_byindex_ref(sdl->sdl_index);
+		NET_EPOCH_EXIT(et);
+		if (aifp != NULL)
+			return (aifp);
+		DPRINTF("unable to get aifp for %s index %d",
+		    if_name(req->ifp), sdl->sdl_index);
+	}
+
+	/* Default (and fallback): the transmit interface itself */
+	aifp = req->ifp;
+	if_ref(aifp);
+
+	return (aifp);
+}
+
+/*
+ * The helpers up to the matching #endif are compiled out. They implement a
+ * field-by-field nexthop comparison with debug output; the build uses
+ * cmp_priv() instead.
+ */
+#if 0
+static int compare_sa(const struct sockaddr *first, const struct sockaddr *second);
+static int compare_nhop_addr(const struct nhop_object *nh, const struct nhop_object *nh_src);
+
+/*
+ * (Disabled code.) Compares two sockaddrs.
+ * Returns 1 when both pointers are the same or when family, length and the
+ * first sa_len bytes match; returns 0 otherwise, logging the first
+ * difference found.
+ */
+static int
+compare_sa(const struct sockaddr *first, const struct sockaddr *second)
+{
+ if (first == second)
+ return (1);
+ if (first->sa_family != second->sa_family) {
+ DPRINTF("family different: %d %d", (int)first->sa_family,
+ (int)second->sa_family);
+ return (0);
+ }
+ if (first->sa_len != second->sa_len) {
+ DPRINTF("size different: %d %d", (int)first->sa_len,
+ (int)second->sa_len);
+ return (0);
+ }
+ if (memcmp(first, second, first->sa_len)) {
+ DPRINTF("data different");
+ return (0);
+ }
+ return (1);
+}
+
+
+/*
+ * (Disabled code.) Compares the gateway addresses of @nh and @nh_src
+ * according to the nexthop type of @nh.
+ * Returns 1 when the gateways of an IPv4 or IPv6 "NHOP" type nexthop are
+ * equal per compare_sa(), 0 otherwise (including unknown types).
+ *
+ * NOTE(review): the nested "#if 0" cases refer to pgw and sa, which are not
+ * declared in this function; they would not compile if re-enabled as is.
+ */
+__noinline static int
+compare_nhop_addr(const struct nhop_object *nh, const struct nhop_object *nh_src)
+{
+ const struct sockaddr *nh_sa, *nh_src_sa;
+ //struct sockaddr_in6 gw6;
+ //struct sockaddr *sa;
+ //struct in_ifaddr *ia;
+
+ switch (nh->nh_priv->nh_type) {
+#if 0
+ case NH_TYPE_IPV4_ETHER_RSLV:
+ pgw = (struct sockaddr *)&gw6;
+ fill_empty_sa(pgw, AF_INET);
+ if (compare_sa(pgw, sa) != 0)
+ return (1);
+ break;
+#endif
+ case NH_TYPE_IPV4_ETHER_NHOP:
+ nh_sa = (const struct sockaddr *)&nh->gw4_sa;
+ nh_src_sa = (const struct sockaddr *)&nh_src->gw4_sa;
+ if (compare_sa(nh_sa, nh_src_sa) != 0)
+ return (1);
+ break;
+#if 0
+ case NH_TYPE_IPV6_ETHER_RSLV:
+ //nh_sa = (const struct sockaddr *)&ifatoia6(nh->nh_ifa)->ia_addr;
+ pgw = (struct sockaddr *)&gw6;
+ fill_empty_sa(pgw, AF_INET6);
+ if (compare_sa(pgw, sa) != 0)
+ return (1);
+ break;
+#endif
+ case NH_TYPE_IPV6_ETHER_NHOP:
+ nh_sa = (const struct sockaddr *)&nh->gw6_sa;
+ nh_src_sa = (const struct sockaddr *)&nh_src->gw6_sa;
+ if (compare_sa(nh_sa, nh_src_sa) != 0)
+ return (1);
+ break;
+ default:
+ DPRINTF("unknown nh_type: %d", (int)nh->nh_priv->nh_type);
+ }
+
+ return (0);
+}
+
+/*
+ * (Disabled code.) Verbose variant of cmp_priv(): compares two nexthops
+ * field by field and logs the first mismatch.
+ * Returns 1 when transmit interface, type, family, rt_flags, mtu, nh_flags,
+ * ifa, aifp and (for NHF_GATEWAY nexthops) the gateway address all match;
+ * returns 0 otherwise.
+ */
+int
+cmp_priv_debug(const struct nhop_priv *_one, const struct nhop_priv *_two)
+{
+ const struct nhop_object *nh, *nh_src;
+
+ DPRINTF("Q: cmp %p and %p", _one, _two);
+
+ nh = _one->nh;
+ nh_src = _two->nh;
+
+ /* Interface mismatch is the only check that is not logged */
+ if (nh->nh_ifp != nh_src->nh_ifp)
+ return (0);
+
+ if ((_one->nh_type != _two->nh_type) || (_one->nh_family != _two->nh_family)) {
+ DPRINTF("MISS: type: %d %d family %d %d", (int)_one->nh_type,
+ (int)_two->nh_type, (int)_one->nh_family,
+ (int)_two->nh_family);
+ return (0);
+ }
+ if (nh->nh_priv->rt_flags != nh_src->nh_priv->rt_flags) {
+ DPRINTF("MISS: rt_flags 0x%X 0x%X",
+ (unsigned int)nh->nh_priv->rt_flags,
+ (unsigned int)nh_src->nh_priv->rt_flags);
+ return (0);
+ }
+ if (nh->nh_mtu != nh_src->nh_mtu) {
+ DPRINTF("MISS: mtu %d %d", (int)nh->nh_mtu, (int)nh_src->nh_mtu);
+ return (0);
+ }
+ if (nh->nh_flags != nh_src->nh_flags) {
+ DPRINTF("MISS NHF_DEFAULT: nh_flags 0x%X 0x%X",
+ (unsigned int)nh->nh_flags, (unsigned int)nh_src->nh_flags);
+ return (0);
+ }
+ if (nh->nh_ifa != nh_src->nh_ifa) {
+ DPRINTF("MISS: ifa %p %p", nh->nh_ifa, nh_src->nh_ifa);
+ return (0);
+ }
+ if (nh->nh_aifp != nh_src->nh_aifp) {
+ DPRINTF("MISS: aifp %s %s", if_name(nh->nh_aifp),
+ if_name(nh_src->nh_aifp));
+ return (0);
+ }
+ if ((nh->nh_flags & NHF_GATEWAY) && (compare_nhop_addr(nh, nh_src) == 0)) {
+ DPRINTF("MISS: SA");
+ return (0);
+ }
+ /* Finally, loopback IPv6 nexthops */
+
+ return (1);
+}
+#endif
+
+/*
+ * Nexthop equality test used by the hash table.
+ * Two nexthops are equal when the first NHOP_END_CMP bytes of their
+ * nhop_object are identical and both nexthop type and family match.
+ * Returns 1 for equal nexthops, 0 otherwise.
+ */
+int
+cmp_priv(const struct nhop_priv *_one, const struct nhop_priv *_two)
+{
+
+	return (memcmp(_one->nh, _two->nh, NHOP_END_CMP) == 0 &&
+	    _one->nh_type == _two->nh_type &&
+	    _one->nh_family == _two->nh_family);
+}
+
+/*
+ * Returns a nexthop matching @req in the table @rh, creating and linking
+ * a new one when no existing nexthop matches.
+ * The returned nhop_object is referenced and linked; NULL is returned when
+ * an allocation or the linking fails.
+ */
+__noinline struct nhop_object *
+nhop_get(struct rib_head *rh, const struct nhop_request *req)
+{
+	struct nh_control *ctl;
+	struct nhop_object *nh_key, *nh_new;
+	struct nhop_priv *priv_found;
+
+	ctl = rh->nh_control;
+
+	/*
+	 * TODO: performance optimizations.
+	 * The hash lookup needs a mostly constructed nexthop as its key,
+	 * so a temporary one is built first. Nexthops are assumed to be
+	 * shared by _most_ routes, so it would make sense to optimize the
+	 * lookup process: currently all dependent objects are refcounted
+	 * even for the temporary nexthop, which is an overkill.
+	 */
+	nh_key = alloc_nhop_tmp(req);
+	if (nh_key == NULL)
+		return (NULL);
+	priv_found = find_nhop(ctl, nh_key->nh_priv);
+	free_nhop_tmp(nh_key);
+
+	/* Existing nexthop: find_nhop() has already taken the reference */
+	if (priv_found != NULL)
+		return (priv_found->nh);
+
+	nh_new = alloc_nhop(req);
+	if (nh_new == NULL) {
+		DPRINTF("nh_alloc failed!");
+		return (NULL);
+	}
+
+	if (link_nhop(ctl, nh_new->nh_priv) != 0)
+		return (nh_new);
+
+	/*
+	 * Adding the nexthop to the datastructures failed. It was never
+	 * visible to anyone, so destroy it right away without waiting
+	 * for the epoch end.
+	 */
+	DPRINTF("link_nhop failed!");
+	destroy_nhop(nh_new->nh_priv);
+
+	return (NULL);
+}
+
+/*
+ * Writes an AF_LINK address describing @ifp (interface index and type)
+ * into @sdl. Only family, length, index and type are set; the remaining
+ * fields of @sdl are left untouched.
+ */
+__noinline static void
+fill_sdl_from_ifp(struct sockaddr_dl_short *sdl, const struct ifnet *ifp)
+{
+
+	/* sockaddr header */
+	sdl->sdl_len = sizeof(struct sockaddr_dl_short);
+	sdl->sdl_family = AF_LINK;
+	/* interface identity */
+	sdl->sdl_type = ifp->if_type;
+	sdl->sdl_index = ifp->if_index;
+}
+
+/*
+ * Allocates a temporary nexthop filled from @req, to be used as a lookup
+ * key only. The private part and the object share a single M_TEMP
+ * allocation (object placed right after the private part); no packet
+ * counter is allocated. The result must be released with free_nhop_tmp().
+ * Returns NULL when the allocation or fill_nhop() fails.
+ */
+__noinline static struct nhop_object *
+alloc_nhop_tmp(const struct nhop_request *req)
+{
+	struct nhop_priv *nh_priv;
+	struct nhop_object *nh;
+	size_t alloc_size;
+
+	/* M_NOWAIT: IPv6 ND is unhappy otherwise */
+	alloc_size = sizeof(struct nhop_priv) + sizeof(struct nhop_object);
+	nh_priv = malloc(alloc_size, M_TEMP, M_NOWAIT | M_ZERO);
+	if (nh_priv == NULL)
+		return (NULL);
+
+	/* The object lives right behind its private part */
+	nh = (struct nhop_object *)(nh_priv + 1);
+	nh_priv->nh = nh;
+	nh->nh_priv = nh_priv;
+
+	if (fill_nhop(req, nh) == 0)
+		return (nh);
+
+	free(nh_priv, M_TEMP);
+	return (NULL);
+}
+
+/*
+ * Releases a temporary nexthop created by alloc_nhop_tmp(): drops the
+ * interface and address references and frees the combined allocation.
+ */
+__noinline static void
+free_nhop_tmp(struct nhop_object *nh)
+{
+	struct nhop_priv *nh_priv = nh->nh_priv;
+
+	/* TODO: rewrite */
+	if_rele(nh->nh_ifp);
+	if_rele(nh->nh_aifp);
+	ifa_free(nh->nh_ifa);
+
+	/* nh is part of the nh_priv allocation, so one free() covers both */
+	free(nh_priv, M_TEMP);
+}
+
+/*
+ * Debug helper: logs the main attributes of @nh, prefixed with @prefix.
+ * Gateway and source addresses are rendered for AF_INET and AF_INET6
+ * nexthops; for any other family they are printed as empty strings.
+ */
+__noinline static void
+print_nhop(const char *prefix, const struct nhop_object *nh)
+{
+	char src_buf[INET6_ADDRSTRLEN], addr_buf[INET6_ADDRSTRLEN];
+	int af = nh->nh_priv->nh_family;
+
+	/*
+	 * Neither branch below runs for other families, so make sure
+	 * the "%s" arguments are valid strings in every case.
+	 */
+	src_buf[0] = '\0';
+	addr_buf[0] = '\0';
+
+	if (af == AF_INET) {
+		const struct sockaddr_in *gw, *src;
+		gw = &nh->gw4_sa;
+		src = IA_SIN(ifatoia(nh->nh_ifa));
+		inet_ntop(af, &src->sin_addr, src_buf, sizeof(src_buf));
+		inet_ntop(af, &gw->sin_addr, addr_buf, sizeof(addr_buf));
+	} else if (af == AF_INET6) {
+		const struct sockaddr_in6 *gw, *src;
+		gw = &nh->gw6_sa;
+		src = &(ifatoia6(nh->nh_ifa)->ia_addr);
+		inet_ntop(af, &src->sin6_addr, src_buf, sizeof(src_buf));
+		inet_ntop(af, &gw->sin6_addr, addr_buf, sizeof(addr_buf));
+	}
+
+	DPRINTF("%s nhop: AF %d ifp %p %s addr %s src %p %s aifp %p %s mtu %d nh_flags %X",
+	    prefix, af, nh->nh_ifp, if_name(nh->nh_ifp), addr_buf, nh->nh_ifa,
+	    src_buf, nh->nh_aifp, if_name(nh->nh_aifp), nh->nh_mtu, nh->nh_flags);
+}
+
+/*
+ * Allocates a new nexthop (private part, object and per-cpu packet
+ * counter) and fills it from @req.
+ * The result has its lock initialized and a reference count of 1, but is
+ * not linked yet. Returns NULL when an allocation or fill_nhop() fails.
+ */
+__noinline static struct nhop_object *
+alloc_nhop(const struct nhop_request *req)
+{
+	struct nhop_priv *nh_priv;
+	struct nhop_object *nh;
+
+	KASSERT((req->mtu > 0), ("nh requested mtu is zero"));
+
+	/* All allocations are M_NOWAIT: IPv6 ND is unhappy otherwise */
+	nh_priv = malloc(sizeof(struct nhop_priv), M_NHOP, M_NOWAIT | M_ZERO);
+	if (nh_priv == NULL)
+		return (NULL);
+	nh = malloc(sizeof(struct nhop_object), M_NHOP, M_NOWAIT | M_ZERO);
+	if (nh == NULL) {
+		free(nh_priv, M_NHOP);
+		return (NULL);
+	}
+
+	/* Allocate per-cpu packet counter */
+	nh->nh_pksent = counter_u64_alloc(M_NOWAIT);
+	if (nh->nh_pksent == NULL)
+		goto fail_objects;
+
+	nh_priv->nh = nh;
+	nh->nh_priv = nh_priv;
+
+	/* Refcounting for all of the necessary resources done in fill_nhop() */
+	if (fill_nhop(req, nh) != 0)
+		goto fail_counter;
+
+	NH_PRIV_LOCK_INIT(nh_priv);
+	refcount_init(&nh_priv->nh_refcnt, 1);
+
+	print_nhop("ALLOC", nh);
+
+	return (nh);
+
+fail_counter:
+	counter_u64_free(nh->nh_pksent);
+fail_objects:
+	free(nh_priv, M_NHOP);
+	free(nh, M_NHOP);
+	return (NULL);
+}
+
+/*
+ * Frees a nexthop: the private part, the references on nh_ifp, nh_aifp and
+ * nh_ifa, the packet counter and finally the object itself.
+ * Called either from the epoch callback or directly for a nexthop that
+ * could not be linked.
+ *
+ * NOTE(review): the per-nexthop lock set up with NH_PRIV_LOCK_INIT() is
+ * not explicitly destroyed before nh_priv is freed - confirm whether a
+ * destroy call is required for this lock type.
+ */
+static void
+destroy_nhop(struct nhop_priv *nh_priv)
+{
+ struct nhop_object *nh = nh_priv->nh;
+
+ NH_PRIV_LOCK(nh_priv);
+ DPRINTF("DEL nhop: AF %d ifp %p %s src %p mtu %d nh_flags %X",
+ nh_priv->nh_family, nh->nh_ifp, nh->nh_ifp->if_xname,
+ nh->nh_ifa, nh->nh_mtu, nh->nh_flags);
+
+ NH_PRIV_UNLOCK(nh_priv);
+
+ /* nh was saved above; nh_priv must not be touched past this point */
+ free(nh_priv, M_NHOP);
+
+ if_rele(nh->nh_ifp);
+ if_rele(nh->nh_aifp);
+ ifa_free(nh->nh_ifa);
+ counter_u64_free(nh->nh_pksent);
+
+ free(nh, M_NHOP);
+}
+
+/*
+ * Epoch callback: the nexthop embedding @ctx (as nh_epoch_ctx) is safe to
+ * destroy now.
+ */
+static void
+destroy_nhop_epoch(epoch_context_t ctx)
+{
+
+	destroy_nhop(__containerof(ctx, struct nhop_priv, nh_epoch_ctx));
+}
+
+/*
+ * Fills @nh fields with the data supplied in the @req.
+ *
+ * Returns 0 on success; nh_aifp, nh_ifp and nh_ifa are referenced in that
+ * case. Returns ENOMEM when a gateway sockaddr is larger than
+ * struct sockaddr_in6; no reference has been taken at that point.
+ *
+ * NOTE(review): get_aifp() reads req->gw whenever req->ifp is a loopback
+ * interface, also for requests without RTF_GATEWAY - this assumes req->gw
+ * is never NULL; confirm against the callers.
+ */
+__noinline static int
+fill_nhop(const struct nhop_request *req, struct nhop_object *nh)
+{
+ int rt_flags;
+
+ /* Only the flags in NHOP_RT_FLAG_MASK are kept in the nexthop */
+ rt_flags = req->rt_flags & NHOP_RT_FLAG_MASK;
+
+ nh->nh_ifp = req->ifp;
+ nh->nh_mtu = req->mtu;
+ nh->nh_flags = fib_rte_to_nh_flags(rt_flags);
+ nh->nh_flags |= (req->nh_flags_additional & NHF_DEFAULT);
+ nh->nh_priv->rt_flags = rt_flags;
+ nh->nh_ifa = req->ifa;
+
+ if (req->rt_flags & RTF_GATEWAY) {
+ /* The gateway has to fit into the storage used for gw6_sa */
+ if (req->gw->sa_len > sizeof(struct sockaddr_in6)) {
+ DPRINTF("nhop SA size too big: AF %d len %u",
+ req->gw->sa_family, req->gw->sa_len);
+ return (ENOMEM);
+ }
+ memcpy(&nh->gw_sa, req->gw, req->gw->sa_len);
+ } else {
+ /*
+ * Interface route. Currently the route.c code adds
+ * empty sa of type AF_LINK, which is 56 bytes long.
+ * The only place where this data is used is the IPv6
+ * loopback output, where we need to preserve the original
+ * interface to maintain proper scoping.
+ * Current code stores original interface in the separate field
+ * (nh_aifp, see below). Given that, write a short AF_LINK SA
+ * describing the transmit interface instead.
+ */
+ fill_sdl_from_ifp(&nh->gwl_sa, req->ifp);
+ }
+
+ /* IPv6 (mostly) helper */
+ nh->nh_aifp = get_aifp(req);
+ DPRINTF("AIFP: %p req->ifp %p nh_ifp %p", nh->nh_aifp, req->ifp, nh->nh_ifp);
+
+ /* Reference the needed objects (note nh_aifp is already referenced) */
+ if_ref(nh->nh_ifp);
+ ifa_ref(nh->nh_ifa);
+
+ /* TODO: verify blackhole/reject behavior | NHF_HOST */
+
+ nh->nh_priv->nh_family = req->family;
+ nh->nh_priv->nh_type = req->nh_type;
+
+ return (0);
+}
+
+/*
+ * Takes a reference on @nh unless its reference count has already dropped
+ * to zero. Returns non-zero when the reference was acquired, 0 otherwise.
+ */
+int
+nhop_ref_object(struct nhop_object *nh)
+{
+	struct nhop_priv *nh_priv = nh->nh_priv;
+
+	return (refcount_acquire_if_not_zero(&nh_priv->nh_refcnt));
+}
+
+/*
+ * Drops one reference on @nh.
+ *
+ * When the last reference goes away the nexthop is unlinked from its
+ * control structure (if it is still linked) and its destruction is
+ * scheduled through an epoch callback. If unlinking fails the nexthop is
+ * intentionally not reclaimed.
+ */
+void
+nhop_free_object(struct nhop_object *nh)
+{
+ struct nh_control *ctl;
+ struct nhop_priv *nh_priv = nh->nh_priv;
+
+ /* Nothing more to do unless this was the last reference */
+ if (!refcount_release(&nh_priv->nh_refcnt))
+ return;
+
+ NH_PRIV_LOCK(nh_priv);
+ ctl = nh_priv->nh_control;
+ /* Use nh_control as an indicator of linked/unlinked entry */
+ nh_priv->nh_control = NULL;
+ NH_PRIV_UNLOCK(nh_priv);
+
+ /* ctl is NULL when nhops_destroy() has already marked the entry unlinked */
+ if (ctl != NULL) {
+ if (unlink_nhop(ctl, nh_priv) == NULL) {
+ /* Do not try to reclaim */
+ return;
+ }
+ }
+
+ /* Defer the actual free until the current network epoch has ended */
+ epoch_call(net_epoch_preempt, destroy_nhop_epoch,
+ &nh_priv->nh_epoch_ctx);
+}
+
+/*
+ * Takes a reference on @nh, which is either a plain nexthop or, with
+ * ROUTE_MPATH, a multipath group flagged with NHF_MULTIPATH.
+ * Returns the result of the underlying reference function.
+ */
+int
+nhop_ref_any(struct nhop_object *nh)
+{
+
+#ifdef ROUTE_MPATH
+	if (NH_IS_MULTIPATH(nh))
+		return (nhgrp_ref_group((struct nhgrp_object *)nh));
+#endif
+	return (nhop_ref_object(nh));
+}
+
+/*
+ * Drops a reference on @nh, which is either a plain nexthop or a multipath
+ * group flagged with NHF_MULTIPATH.
+ * Without ROUTE_MPATH an object flagged as multipath is left untouched.
+ */
+void
+nhop_free_any(struct nhop_object *nh)
+{
+
+	if (!NH_IS_MULTIPATH(nh)) {
+		nhop_free_object(nh);
+		return;
+	}
+#ifdef ROUTE_MPATH
+	nhgrp_free_group((struct nhgrp_object *)nh);
+#endif
+}
+
+
+/* Helper functions */
+
+/*
+ * Returns the nexthop index of @nh as stored in its private part
+ * (0 once the nexthop has been unlinked).
+ */
+uint32_t
+nhop_get_idx(const struct nhop_object *nh)
+{
+	const struct nhop_priv *nh_priv = nh->nh_priv;
+
+	return (nh_priv->nh_idx);
+}
+
+/*
+ * Applies a new interface MTU to the nexthops of @rh that transmit via
+ * @ifp. Nexthops without RTF_FIXEDMTU always take the new value; nexthops
+ * with RTF_FIXEDMTU are only lowered, i.e. updated when their current MTU
+ * exceeds @mtu.
+ */
+__noinline void
+nhops_update_ifmtu(struct rib_head *rh, struct ifnet *ifp, uint32_t mtu)
+{
+	struct nh_control *ctl;
+	struct nhop_priv *nh_priv;
+	struct nhop_object *nh;
+
+	ctl = rh->nh_control;
+
+	NHOPS_RLOCK(ctl);
+	CHT_SLIST_FOREACH(&ctl->nh_head, nhops, nh_priv) {
+		nh = nh_priv->nh;
+		if (nh->nh_ifp == ifp &&
+		    ((nh_priv->rt_flags & RTF_FIXEDMTU) == 0 ||
+		    nh->nh_mtu > mtu)) {
+			/* Update under the per-nexthop lock */
+			NH_PRIV_LOCK(nh_priv);
+			nh->nh_mtu = mtu;
+			NH_PRIV_UNLOCK(nh_priv);
+		}
+	} CHT_SLIST_FOREACH_END;
+	NHOPS_RUNLOCK(ctl);
+}
+
+/*
+ * Exports one nexthop of @rh to the sysctl request @w.
+ *
+ * The record consists of a struct rt_msghdr immediately followed by a
+ * struct nhop_external, then the gateway sockaddr and the source sockaddr
+ * of the nexthop. rtm_msglen covers all four parts.
+ *
+ * Returns 0 on success, the SYSCTL_OUT() error otherwise, or 1 when the
+ * nexthop family is neither AF_INET nor AF_INET6.
+ * NOTE(review): the value 1 is not a symbolic errno although it is returned
+ * through the same path as SYSCTL_OUT() errors - confirm this is intended.
+ */
+__noinline static int
+dump_nhop_entry(struct rib_head *rh, struct nhop_object *nh, struct sysctl_req *w)
+{
+ struct {
+ struct rt_msghdr rtm;
+ struct nhop_external nhe;
+ } arpc;
+ struct nhop_external *pnhe;
+ struct sockaddr *gw_sa, *src_sa;
+ struct sockaddr_storage ss;
+ int error;
+
+ //DPRINTF("Dumping: head %p nh %p flags %X req %p\n", rh, nh, nh->nh_flags, w);
+
+ /* Zero the whole record so padding and unset fields are not leaked */
+ memset(&arpc, 0, sizeof(arpc));
+
+ arpc.rtm.rtm_msglen = sizeof(arpc);
+ arpc.rtm.rtm_version = RTM_VERSION;
+ arpc.rtm.rtm_type = RTM_GET;
+ //arpc.rtm.rtm_flags = RTF_UP;
+ arpc.rtm.rtm_flags = nh->nh_priv->rt_flags;
+
+ pnhe = &arpc.nhe;
+
+ /* nh_addr and nh_src of the exported structure are left zeroed */
+ pnhe->nh_idx = nh->nh_priv->nh_idx;
+ pnhe->nh_fib = rh->rib_fibnum;
+ pnhe->ifindex = nh->nh_ifp->if_index;
+ pnhe->aifindex = nh->nh_aifp->if_index;
+ pnhe->nh_family = nh->nh_priv->nh_family;
+ pnhe->nh_type = nh->nh_priv->nh_type;
+ /* NOTE(review): exported nh_mtu is 16 bits wide - confirm no truncation */
+ pnhe->nh_mtu = nh->nh_mtu;
+ pnhe->nh_flags = nh->nh_flags;
+
+ size_t len = 0;
+ /* The stored gateway sockaddr is exported with whatever sa_len it carries */
+ gw_sa = (struct sockaddr *)&nh->gw4_sa;
+ // KASSRT sin6_len > 0
+ len += gw_sa->sa_len;
+ //DPRINTF("ADDING gw_sa %lu len, af %d nh_sa: %p\n", len, nh->gw6_sa.sin6_family, &nh->gw6_sa);
+
+ int af = nh->nh_priv->nh_family;
+ if (af == AF_INET) {
+ src_sa = (struct sockaddr *)IA_SIN(ifatoia(nh->nh_ifa));
+ } else if (af == AF_INET6) {
+ src_sa = (struct sockaddr *)&ifatoia6(nh->nh_ifa)->ia_addr;
+ } else {
+ return (1);
+ }
+ if (src_sa->sa_family != af) {
+ /* ifa can be link address. XXX: AF_NULL ? */
+ /* Export a short AF_LINK address of the transmit interface instead */
+ memset(&ss, 0, sizeof(struct sockaddr_storage));
+ fill_sdl_from_ifp((struct sockaddr_dl_short *)&ss, nh->nh_ifp);
+ src_sa = (struct sockaddr *)&ss;
+ }
+
+ //memcpy(sa, nh_sa, nh_sa->sa_len);
+ len += src_sa->sa_len;
+ arpc.rtm.rtm_msglen += len;
+ //DPRINTF("ADDING %lu len, af %d nh_sa: %p\n", len, src_sa->sa_len, src_sa);
+
+ /*
+ * NOTE(review): copies sizeof(nh->nh_prepend) bytes into the 64-byte
+ * exported nh_prepend - assumes the kernel field is not larger; confirm.
+ */
+ memcpy(pnhe->nh_prepend, nh->nh_prepend, sizeof(nh->nh_prepend));
+ pnhe->prepend_len = nh->nh_prepend_len;
+ pnhe->nh_refcount = nh->nh_priv->nh_refcnt;
+
+ pnhe->nh_pksent = counter_u64_fetch(nh->nh_pksent);
+
+ /* Header + nhop_external first, then the two variable-length sockaddrs */
+ error = SYSCTL_OUT(w, &arpc, sizeof(arpc));
+ if (error == 0)
+ error = SYSCTL_OUT(w, gw_sa, gw_sa->sa_len);
+ if (error == 0)
+ error = SYSCTL_OUT(w, src_sa, src_sa->sa_len);
+
+ /*
+ DPRINTF("Exported %d ifindex %d family %d type %d error %d\n", nh->nh_priv->nh_idx, pnhe->ifindex,
+ pnhe->nh_family, pnhe->nh_type, error);
+ */
+
+ return (error);
+}
+
+/*
+ * Exports all nexthops of @rh to the sysctl request @w, one record per
+ * nexthop. Stops at the first failing record and returns its error;
+ * returns 0 when every nexthop was exported.
+ *
+ * NOTE(review): unlike the other iterations over the nexthop hash, this
+ * one does not take the nexthop lock itself - confirm that the caller
+ * provides the necessary protection.
+ */
+int
+nhops_dump_sysctl(struct rib_head *rh, struct sysctl_req *w)
+{
+	struct nh_control *ctl;
+	struct nhop_priv *nh_priv;
+	int rc;
+
+	ctl = rh->nh_control;
+
+	DPRINTF("NHDUMP: count=%u", ctl->nh_head.items_count);
+	CHT_SLIST_FOREACH(&ctl->nh_head, nhops, nh_priv) {
+		rc = dump_nhop_entry(rh, nh_priv->nh, w);
+		if (rc != 0)
+			return (rc);
+	} CHT_SLIST_FOREACH_END;
+
+	return (0);
+}
+
Index: sys/net/route/nhop_utils.h
===================================================================
--- /dev/null
+++ sys/net/route/nhop_utils.h
@@ -0,0 +1,200 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NET_ROUTE_NHOP_UTILS_H_
+#define _NET_ROUTE_NHOP_UTILS_H_
+
+/* Chained hash table: type-erased head, same leading layout as CHT_SLIST_DEFINE() heads */
+struct _cht_head {
+ uint32_t hash_size; /* number of buckets in ptr[] */
+ uint32_t items_count; /* number of items linked into the table */
+ void **ptr; /* bucket array, one chain head per slot */
+};
+
+static inline uint32_t
+_cht_get_resize_size(const struct _cht_head *head)
+{
+	const uint32_t buckets = head->hash_size, used = head->items_count;
+
+	/* Grow above 50% load, shrink below 25%; 0 means "leave as is". */
+	if (used * 2 > buckets && buckets < 65536)
+		return (buckets * 2);
+	if (used * 4 < buckets && buckets > 16)
+		return (buckets / 2);
+	return (0);
+}
+
+static inline int
+_cht_need_resize(const struct _cht_head *head)
+{
+	/* A zero bucket count from the sizing helper means "no resize". */
+	return (_cht_get_resize_size(head) != 0);
+}
+
+
+#ifndef typeof
+#define typeof __typeof
+#endif
+
+#define CHT_SLIST_NEED_RESIZE(_head) \
+ _cht_need_resize((const struct _cht_head *)(_head))
+#define CHT_SLIST_GET_RESIZE_BUCKETS(_head) \
+ _cht_get_resize_size((const struct _cht_head *)(_head))
+#define CHT_SLIST_GET_RESIZE_SIZE(_buckets) ((_buckets) * sizeof(void *))
+
+#define CHT_SLIST_DEFINE(_HNAME, _ITEM_TYPE) \
+struct _HNAME##_head { \
+ uint32_t hash_size; \
+ uint32_t items_count; \
+ _ITEM_TYPE **ptr; \
+}
+
+#define CHT_SLIST_INIT(_head, _ptr, _num_buckets) \
+ (_head)->hash_size = _num_buckets; \
+ (_head)->items_count = 0; \
+ (_head)->ptr = _ptr;
+
+/* Default hash method for constant-size keys: hash masked by (hash_size - 1), which presumes a power-of-2 bucket count */
+
+#define CHT_GET_BUCK(_head, _PX, _key) (_PX##_hash_key(_key) & ((_head)->hash_size - 1))
+#define CHT_GET_BUCK_OBJ(_head, _PX, _obj) (_PX##_hash_obj(_obj) & ((_head)->hash_size - 1))
+
+#define CHT_FIRST(_head, idx) _CHT_FIRST((_head)->ptr, idx)
+#define _CHT_FIRST(_ptr, idx) (_ptr)[idx]
+
+#define CHT_SLIST_FIND(_head, _PX, _key, _ret) do { \
+ uint32_t _buck = CHT_GET_BUCK(_head, _PX, _key); \
+ _ret = CHT_FIRST(_head, _buck); \
+ for ( ; _ret != NULL; _ret = _PX##_next(_ret)) { \
+ if (_PX##_cmp(_key, (_ret))) \
+ break; \
+ } \
+} while(0)
+
+/*
+ * Looks up the item matching @_obj (per _PX##_cmp) in the bucket picked by _PX##_hash_obj; @_ret is NULL if none.
+ */
+#define CHT_SLIST_FIND_BYOBJ(_head, _PX, _obj, _ret) do { \
+ uint32_t _buck = CHT_GET_BUCK_OBJ(_head, _PX, _obj); \
+ _ret = CHT_FIRST(_head, _buck); \
+ for ( ; _ret != NULL; _ret = _PX##_next(_ret)) { \
+ if (_PX##_cmp(_obj, _ret)) \
+ break; \
+ } \
+} while(0)
+
+#define CHT_SLIST_INSERT_HEAD(_head, _PX, _obj) do { \
+ uint32_t _buck = CHT_GET_BUCK_OBJ(_head, _PX, _obj); \
+ _PX##_next(_obj) = CHT_FIRST(_head, _buck); \
+ CHT_FIRST(_head, _buck) = _obj; \
+ (_head)->items_count++; \
+} while(0)
+
+#define CHT_SLIST_REMOVE(_head, _PX, _key, _ret) do { \
+ typeof(*(_head)->ptr) _tmp; \
+ uint32_t _buck = CHT_GET_BUCK(_head, _PX, _key); \
+ _ret = CHT_FIRST(_head, _buck); \
+ _tmp = NULL; \
+ for ( ; _ret != NULL; _tmp = _ret, _ret = _PX##_next(_ret)) { \
+ if (_PX##_cmp(_key, _ret)) \
+ break; \
+ } \
+ if (_ret != NULL) { \
+ if (_tmp == NULL) \
+ CHT_FIRST(_head, _buck) = _PX##_next(_ret); \
+ else \
+ _PX##_next(_tmp) = _PX##_next(_ret); \
+ (_head)->items_count--; \
+ } \
+} while(0)
+
+#define CHT_SLIST_REMOVE_BYOBJ(_head, _PX, _obj, _ret) do { \
+ typeof(*(_head)->ptr) _tmp; \
+ uint32_t _buck = CHT_GET_BUCK_OBJ(_head, _PX, _obj); \
+ _ret = CHT_FIRST(_head, _buck); \
+ _tmp = NULL; \
+ for ( ; _ret != NULL; _tmp = _ret, _ret = _PX##_next(_ret)) { \
+ if (_PX##_cmp(_obj, _ret)) \
+ break; \
+ } \
+ if (_ret != NULL) { \
+ if (_tmp == NULL) \
+ CHT_FIRST(_head, _buck) = _PX##_next(_ret); \
+ else \
+ _PX##_next(_tmp) = _PX##_next(_ret); \
+ (_head)->items_count--; \
+ } \
+} while(0)
+
+
+#define CHT_SLIST_FOREACH(_head, _PX, _x) \
+ for (uint32_t _i = 0; _i < (_head)->hash_size; _i++) { \
+ for (_x = CHT_FIRST(_head, _i); _x; _x = _PX##_next(_x))
+
+#define CHT_SLIST_FOREACH_END }
+
+#define CHT_SLIST_RESIZE(_head, _PX, _new_void_ptr, _new_hsize) \
+ uint32_t _new_idx; /* NB: no do/while wrapper, locals land in the caller's scope */ \
+ typeof((_head)->ptr) _new_ptr = (void *)_new_void_ptr; \
+ typeof(*(_head)->ptr) _x, _y; \
+ for (uint32_t _old_idx = 0; _old_idx < (_head)->hash_size; _old_idx++) {\
+ _x = CHT_FIRST(_head, _old_idx); \
+ _y = _x; \
+ while (_y != NULL) { \
+ _y = _PX##_next(_x); /* save the successor before relinking _x */ \
+ _new_idx = _PX##_hash_obj(_x) & (_new_hsize - 1);\
+ _PX##_next(_x) = _CHT_FIRST(_new_ptr, _new_idx);\
+ _CHT_FIRST(_new_ptr, _new_idx) = _x; \
+ _x = _y; \
+ } \
+ } \
+ (_head)->hash_size = _new_hsize; \
+ _new_void_ptr = (void *)(_head)->ptr; /* old bucket array is handed back via this argument */ \
+ (_head)->ptr = _new_ptr;
+
+/* bitmasks */
+
+struct bitmask_head {
+ uint16_t free_off; /* index of the first potentially free block */
+ uint16_t blocks; /* number of 4/8-byte (u_long) blocks in the index */
+ uint32_t items_count; /* total number of allocated items */
+ u_long *idx; /* bit array, one bit per index; a set bit marks a free index */
+};
+
+size_t bitmask_get_size(uint32_t items);
+uint32_t bitmask_get_resize_items(const struct bitmask_head *nh);
+int bitmask_should_resize(const struct bitmask_head *bh);
+void bitmask_swap(struct bitmask_head *bh, void *new_idx, uint32_t new_items, void **pidx);
+void bitmask_init(struct bitmask_head *bh, void *idx, uint32_t num_items);
+int bitmask_copy(const struct bitmask_head *bi, void *new_idx, uint32_t new_items);
+int bitmask_alloc_idx(struct bitmask_head *bi, uint16_t *pidx);
+int bitmask_free_idx(struct bitmask_head *bi, uint16_t idx);
+
+#endif
+
Index: sys/net/route/nhop_utils.c
===================================================================
--- /dev/null
+++ sys/net/route/nhop_utils.c
@@ -0,0 +1,220 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+#include "opt_inet.h"
+#include "opt_route.h"
+#include "opt_mpath.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+
+#include <net/route/nhop_utils.h>
+
+#define BLOCK_ITEMS (8 * sizeof(u_long)) /* Number of items for ffsl() */
+
+#define _BLOCKS_TO_SZ(_blocks) ((size_t)(_blocks) * sizeof(u_long))
+#define _BLOCKS_TO_ITEMS(_blocks) ((uint32_t)(_blocks) * BLOCK_ITEMS)
+#define _ITEMS_TO_BLOCKS(_items) ((_items) / BLOCK_ITEMS)
+
+
+static void _bitmask_init_idx(void *index, uint32_t items);
+
+void
+bitmask_init(struct bitmask_head *bh, void *idx, uint32_t num_items)
+{
+	/* Start from a clean head: no items, free_off at block 0. */
+	memset(bh, 0, sizeof(*bh));
+	bh->idx = idx;
+	bh->blocks = _ITEMS_TO_BLOCKS(num_items);
+	/* A supplied bit array is reset to "all free" (index 0 excepted). */
+	if (idx != NULL)
+		_bitmask_init_idx(idx, num_items);
+}
+
+uint32_t
+bitmask_get_resize_items(const struct bitmask_head *bh)
+{
+	/* Double the capacity once over half full (below 64k items); else 0. */
+	const int grow = bh->items_count < 65536 &&
+	    bh->items_count * 2 > _BLOCKS_TO_ITEMS(bh->blocks);
+	return (grow ? _BLOCKS_TO_ITEMS(bh->blocks) * 2 : 0);
+}
+
+int
+bitmask_should_resize(const struct bitmask_head *bh)
+{
+	/* True whenever the sizing policy proposes a new item count. */
+	return (bitmask_get_resize_items(bh) > 0);
+}
+
+#if 0
+uint32_t
+_bitmask_get_blocks(uint32_t items)
+{
+
+ return (items / BLOCK_ITEMS);
+}
+#endif
+
+size_t
+bitmask_get_size(uint32_t items)
+{
+	/* Byte size of a bit array tracking @items indices (whole blocks only). */
+#ifdef _KERNEL
+	KASSERT((items % BLOCK_ITEMS) == 0,
+	    ("bitmask size needs to be a multiple of %zu", BLOCK_ITEMS));
+#else
+	assert((items % BLOCK_ITEMS) == 0);
+#endif
+
+	return (items / 8);
+}
+
+static void
+_bitmask_init_idx(void *_idx, uint32_t items)
+{
+	u_long *first_block = _idx;
+
+	/* Set every bit (set == free) across the whole array ... */
+	memset(_idx, 0xFF, bitmask_get_size(items));
+	/* ... then clear bit 0 so that index 0 is always skipped. */
+	first_block[0] &= ~(u_long)1;
+}
+
+
+/*
+ * Initializes @new_idx for @new_items and copies the state of @bi into it.
+ * Returns 0 on success, 1 if @new_idx is too small (_try_merge api to allow shrinking?).
+ */
+int
+bitmask_copy(const struct bitmask_head *bi, void *new_idx, uint32_t new_items)
+{
+	uint32_t new_blocks = _ITEMS_TO_BLOCKS(new_items);
+
+	_bitmask_init_idx(new_idx, new_items);
+
+	/*
+	 * XXX: shrinking would need to verify that the dropped blocks hold
+	 * no allocated index; refuse rather than overrun @new_idx.
+	 */
+	if (bi->blocks > new_blocks)
+		return (1);
+
+	/* Carry over the existing blocks; the tail stays all-free. */
+	if (bi->blocks > 0)
+		memcpy(new_idx, bi->idx, _BLOCKS_TO_SZ(bi->blocks));
+	return (0);
+}
+
+void
+bitmask_swap(struct bitmask_head *bh, void *new_idx, uint32_t new_items, void **pidx)
+{
+	void *prev_idx = bh->idx;
+
+	/* Install the replacement array together with its block count */
+	bh->blocks = _ITEMS_TO_BLOCKS(new_items);
+	bh->idx = new_idx;
+
+	/* Report the previous array through @pidx when requested */
+	if (pidx != NULL)
+		*pidx = prev_idx;
+}
+
+/*
+ * Allocates the first free index at or after block free_off and stores it in @pidx.
+ * Returns 0 on success, 1 if no index representable in @pidx is free.
+ */
+int
+bitmask_alloc_idx(struct bitmask_head *bi, uint16_t *pidx)
+{
+	u_long *mask;
+	int i, off, v;
+
+	off = bi->free_off;
+	mask = &bi->idx[off];
+
+	for (i = off; i < bi->blocks; i++, mask++) {
+		if ((v = ffsl(*mask)) == 0)
+			continue;
+		/* Refuse indices that would be truncated by uint16_t @pidx */
+		v = BLOCK_ITEMS * i + v - 1;
+		if (v > 0xffff)
+			break;
+
+		/* Mark as busy */
+		*mask &= ~((u_long)1 << (v % BLOCK_ITEMS));
+		bi->free_off = i;
+		*pidx = v;
+		bi->items_count++;
+		return (0);
+	}
+
+	return (1);
+}
+
+/*
+ * Returns allocated index @idx to the set.
+ * Returns 0 on success, 1 if @idx is 0, out of range or already free.
+ */
+int
+bitmask_free_idx(struct bitmask_head *bi, uint16_t idx)
+{
+	const int block = idx / BLOCK_ITEMS;
+	const u_long bit = (u_long)1 << (idx % BLOCK_ITEMS);
+	u_long *word;
+
+	/* Index 0 is always skipped by the allocator */
+	if (idx == 0)
+		return (1);
+
+	/* Index lies beyond the current bit array */
+	if (block >= bi->blocks)
+		return (1);
+
+	word = &bi->idx[block];
+	/* A set bit means the index is free already */
+	if ((*word & bit) != 0)
+		return (1);
+
+	/* Mark as free */
+	*word |= bit;
+	bi->items_count--;
+
+	/* Pull the search start back if this block precedes it */
+	if (bi->free_off > block)
+		bi->free_off = block;
+
+	return (0);
+}
+
Index: sys/net/route/nhop_var.h
===================================================================
--- /dev/null
+++ sys/net/route/nhop_var.h
@@ -0,0 +1,127 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * This header file contains private definitions for nexthop routing.
+ *
+ * Header is not intended to be included by the code external to the
+ * routing subsystem.
+ */
+
+#ifndef _NET_ROUTE_NHOP_VAR_H_
+#define _NET_ROUTE_NHOP_VAR_H_
+
+static unsigned
+djb_hash(const unsigned char *h, const int len)
+{
+	unsigned int acc = 0;
+	/* acc = (acc * 33) ^ byte for every byte, first to last. */
+	for (int pos = 0; pos < len; pos++) {
+		acc *= 33;
+		acc ^= h[pos];
+	}
+	return (acc);
+}
+
+/* define nhop hash table */
+struct nhop_priv;
+CHT_SLIST_DEFINE(nhops, struct nhop_priv);
+/* produce hash value for an object */
+#define nhops_hash_obj(_obj) hash_priv(_obj)
+/* compare two objects */
+#define nhops_cmp(_one, _two) cmp_priv(_one, _two)
+/* next object accessor */
+#define nhops_next(_obj) (_obj)->nh_next
+
+/* XXX: declare! */
+/* define mpath hash table */
+struct nhgrp_priv;
+CHT_SLIST_DEFINE(mpath, struct nhgrp_priv);
+
+
+struct nh_control {
+ struct nhops_head nh_head; /* hash table head */
+ struct bitmask_head nh_idx_head; /* nhop index head */
+ struct mpath_head gr_head; /* nhgrp hash table head */
+ struct bitmask_head gr_idx_head; /* nhgrp index head */
+ struct rwlock nhop_lock; /* overall ctl lock */
+ struct rib_head *rh; /* pointer back to rnh */
+};
+
+#define NHOPS_WLOCK(ctl) rw_wlock(&(ctl)->nhop_lock)
+#define NHOPS_RLOCK(ctl) rw_rlock(&(ctl)->nhop_lock)
+#define NHOPS_WUNLOCK(ctl) rw_wunlock(&(ctl)->nhop_lock)
+#define NHOPS_RUNLOCK(ctl) rw_runlock(&(ctl)->nhop_lock)
+#define NHOPS_LOCK_INIT(ctl) rw_init(&(ctl)->nhop_lock, "ctl")
+#define NHOPS_LOCK_DESTROY(ctl) rw_destroy(&(ctl)->nhop_lock)
+#define NHOPS_WLOCK_ASSERT(ctl) rw_assert(&(ctl)->nhop_lock, RA_WLOCKED)
+
+
+/* Control plane-only nhop data */
+struct nhop_object;
+struct nhop_priv {
+ uint32_t nh_idx; /* nexthop index */
+ uint8_t nh_family; /* address family */
+ uint16_t nh_type; /* nexthop type */
+ void *cb_func; /* function handling additional rewrite caps */
+ u_int nh_refcnt; /* number of references */
+ int rt_flags; /* routing flags for the control plane */
+ struct nhop_object *nh; /* backreference to the dataplane nhop */
+ struct nh_control *nh_control; /* backreference to the rnh */
+ struct nhop_priv *nh_next; /* hash table membership */
+ struct mtx nh_mtx; /* mutex */
+ struct epoch_context nh_epoch_ctx; /* epoch data for nhop */
+};
+
+#define NH_PRIV_LOCK_INIT(_priv) mtx_init(&(_priv)->nh_mtx, "nhop", NULL, MTX_DEF)
+#define NH_PRIV_LOCK(_priv) mtx_lock(&(_priv)->nh_mtx)
+#define NH_PRIV_UNLOCK(_priv) mtx_unlock(&(_priv)->nh_mtx)
+#define NH_PRIV_LOCK_DESTROY(_priv) mtx_destroy(&(_priv)->nh_mtx)
+#define NH_PRIV_LOCK_ASSERT(_priv) mtx_assert(&(_priv)->nh_mtx, MA_OWNED)
+
+#define NH_LOCK(_nh) NH_PRIV_LOCK((_nh)->nh_priv)
+#define NH_UNLOCK(_nh) NH_PRIV_UNLOCK((_nh)->nh_priv)
+
+#define NH_IS_PINNED(_nh) ((_nh)->nh_priv->rt_flags & RTF_PINNED)
+
+/* nhop.c */
+struct nhop_priv *find_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv);
+int link_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv);
+struct nhop_priv *unlink_nhop(struct nh_control *ctl, struct nhop_priv *nh_priv);
+
+/* nhop_ctl.c */
+void free_nhop(struct nhop_priv *nh_priv);
+int cmp_priv(const struct nhop_priv *_one, const struct nhop_priv *_two);
+
+/* mpath */
+struct weightened_nhop;
+
+
+#endif
+
Index: sys/net/route/route_ctl.c
===================================================================
--- /dev/null
+++ sys/net/route/route_ctl.c
@@ -0,0 +1,1601 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_route_mpath.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_dl.h>
+#include <net/vnet.h>
+#include <net/route.h>
+#define NEED_RTZONE
+#include <net/route_var.h>
+#include <net/route/nhop_utils.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_var.h>
+#include <net/route/shared.h>
+#include <netinet/in.h>
+#include <net/route/rtentry_var.h>
+
+#include <vm/uma.h>
+
+/*
+ * This file contains control plane routing tables functions.
+ *
+ * All functions assume they are called in net epoch.
+ */
+
+#define V_rib_route_multipath VNET(rib_route_multipath)
+#ifdef ROUTE_MPATH
+VNET_DEFINE(u_int, rib_route_multipath) = 1;
+#define MP_FLAGS CTLFLAG_RWTUN
+#else
+VNET_DEFINE(u_int, rib_route_multipath) = 0;
+#define MP_FLAGS CTLFLAG_RD
+#endif
+SYSCTL_UINT(_net_route, OID_AUTO, multipath, MP_FLAGS | CTLFLAG_VNET,
+ &VNET_NAME(rib_route_multipath), 0, "Enable route multipath");
+
+
+static void set_req_mtu(const struct rt_addrinfo *info, struct nhop_request *req);
+static int create_rte_from_info(struct rib_head *rnh, struct rt_addrinfo *info,
+ struct rtentry **ret_rt);
+static int can_rib_multipath(struct rib_head *rh);
+
+static int add_route(struct rib_head *rnh, struct rtentry *rt,
+ struct rt_addrinfo *info, struct rib_cmd_info *rc);
+static int del_route(struct rib_head *rnh, struct rt_addrinfo *info,
+ struct rib_cmd_info *rc);
+
+static int replace_rte(struct rib_head *rnh, struct sockaddr *dst,
+ struct sockaddr *mask, struct rtentry *rt_new);
+static int update_gateway_metadata(struct rt_addrinfo *info, int fibnum);
+static void fill_nh_request(struct rib_head *rnh, struct rt_addrinfo *info,
+ struct nhop_request *nh_req);
+static void fill_nh_request_from_nhop(const struct nhop_object *nh,
+ struct sockaddr_storage *gw_storage, struct nhop_request *nh_req);
+
+
+/*
+ * Returns the address family of the RTAX_DST sockaddr carried in @info.
+ */
+static sa_family_t
+get_family_from_info(const struct rt_addrinfo *info)
+{
+	const struct sockaddr *dst = info->rti_info[RTAX_DST];
+	return (dst->sa_family);
+}
+
+
+/*
+ * Applies the MTU metric of @info, if any, to @nh_req.
+ */
+static void
+set_req_mtu(const struct rt_addrinfo *info, struct nhop_request *nh_req)
+{
+
+	/* Requests that carry no MTU metric leave @nh_req untouched. */
+	if ((info->rti_mflags & RTV_MTU) == 0)
+		return;
+
+	if (info->rti_rmx->rmx_mtu == 0) {
+		/*
+		 * An explicit zero is treated as "roll back to the
+		 * default", so the MTU is no longer pinned.
+		 */
+		nh_req->rt_flags &= ~RTF_FIXEDMTU;
+	} else {
+		/*
+		 * A non-zero MTU is an explicit user choice: pin it.
+		 */
+		nh_req->rt_flags |= RTF_FIXEDMTU;
+	}
+
+	nh_req->mtu = info->rti_rmx->rmx_mtu;
+}
+
+
+/*
+ * Builds a fresh nexthop request in @nh_req out of the fields of @info.
+ */
+static void
+fill_nh_request(struct rib_head *rnh, struct rt_addrinfo *info,
+    struct nhop_request *nh_req)
+{
+
+	bzero(nh_req, sizeof(*nh_req));
+	nh_req->family = info->rti_info[RTAX_DST]->sa_family;
+	nh_req->gw = info->rti_info[RTAX_GATEWAY];
+	nh_req->ifa = info->rti_ifa;
+	nh_req->ifp = info->rti_ifa->ifa_ifp;
+	nh_req->rt_flags = info->rti_flags;	/* start from the original rt flags */
+	nh_req->nh_type = 0;			/* the hook is responsible for the type */
+	set_req_mtu(info, nh_req);
+}
+
+/*
+ * Fill @nh_req based on the real @nh, using @gw_storage to hold the gateway.
+ */
+static void
+fill_nh_request_from_nhop(const struct nhop_object *nh,
+    struct sockaddr_storage *gw_storage, struct nhop_request *nh_req)
+{
+
+	memset(nh_req, 0, sizeof(struct nhop_request));
+	nh_req->ifp = nh->nh_ifp;
+	nh_req->ifa = nh->nh_ifa;
+	nh_req->family = nh->nh_priv->nh_family;
+	nh_req->mtu = nh->nh_mtu;
+	nh_req->rt_flags = nh->nh_priv->rt_flags;
+	nh_req->nh_type = nh->nh_priv->nh_type;
+	/*
+	 * Zero the storage so a non-gateway request never exposes stack garbage.
+	 */
+	memset(gw_storage, 0, sizeof(*gw_storage));
+	if (nh_req->rt_flags & RTF_GATEWAY) {
+		/* Assume size is already validated */
+		memcpy(gw_storage, &nh->gw4_sa, nh->gw4_sa.sin_len);
+	}
+	nh_req->gw = (struct sockaddr *)gw_storage;
+}
+
+/*
+ * Update @nh_req request data based on the parameters supplied in @info.
+ * This is a helper function to support route changes.
+ *
+ * It limits the changes that can be done to the route to the following:
+ * 1) all combinations of gateway changes (gw, interface, blackhole/reject)
+ * 2) route flags (FLAG[123],STATIC,BLACKHOLE,REJECT)
+ * 3) route MTU
+ * 4) route weight (handled by the caller)
+ * 5) route lifetime (setting rte expiration time is handled by the caller)
+ *
+ * Assumes nh_req->gw points to sockaddr_storage-sized memory supplied by the caller
+ *
+ * Returns:
+ * 0 on success, nh_req->ifa and nh_req->ifp referenced
+ * error code otherwise
+ */
+static int
+alter_nh_request(struct rt_addrinfo *info, u_int fibnum, struct nhop_request *nh_req)
+{
+ int error;
+
+ /* Update MTU if set in the request */
+ set_req_mtu(info, nh_req);
+
+ /* XXX: allow only one of BLACKHOLE,REJECT,GATEWAY */
+
+ /* Allow the flags in RIB_RTE_CHANGE_MASK (FLAG1,STATIC,BLACKHOLE,REJECT) to be toggled on change. */
+ nh_req->rt_flags &= ~RIB_RTE_CHANGE_MASK;
+ nh_req->rt_flags |= info->rti_flags & RIB_RTE_CHANGE_MASK;
+
+ /* Consider gateway change. NOTE(review): nh_req->gw itself is not updated below - confirm intended */
+ struct sockaddr *info_gw = info->rti_info[RTAX_GATEWAY];
+
+ if (info_gw != NULL) {
+ error = update_gateway_metadata(info, fibnum);
+ if (error != 0)
+ return (error);
+ /* ifa/ifp are already referenced by update_gateway_metadata() */
+ nh_req->ifa = info->rti_ifa;
+ nh_req->ifp = info->rti_ifp;
+ /* Update RTF_GATEWAY flag status */
+ nh_req->rt_flags &= ~RTF_GATEWAY;
+ nh_req->rt_flags |= (RTF_GATEWAY & info->rti_flags);
+ } else {
+ /* The copy of the original nexthop data has not been referenced yet, do it now */
+ ifa_ref(nh_req->ifa);
+ if_ref(nh_req->ifp);
+ }
+
+ return (0);
+}
+
+/*
+ * Creates a new nexthop based on the information in @info.
+ *
+ * Returns:
+ * 0 on success, filling @nh_ret with the desired nexthop object ptr
+ * errno otherwise
+ */
+static int
+create_nhop_from_info(struct rib_head *rnh, struct rt_addrinfo *info,
+    struct nhop_object **nh_ret)
+{
+	struct sockaddr *dst, *netmask;
+	struct nhop_request nh_req;
+	int error;
+
+	/* The gateway reaches the request via fill_nh_request() */
+	fill_nh_request(rnh, info, &nh_req);
+
+	/* Give the protocols chance to augment the request data */
+	dst = info->rti_info[RTAX_DST];
+	netmask = info->rti_info[RTAX_NETMASK];
+
+	if (rnh->rnh_preadd != NULL) {
+		error = rnh->rnh_preadd(rnh->rib_fibnum, dst, netmask, &nh_req);
+		if (error != 0)
+			return (error);
+	}
+
+	*nh_ret = nhop_get(rnh, &nh_req);
+	if (*nh_ret == NULL) {
+		DPRINTF("failed to get the nexthop from req");
+		return (EAGAIN);
+	}
+
+	return (0);
+}
+
+/*
+ * Derives a new nexthop from @nh_old, applying the changes requested in @info.
+ * Helper for route changes; alter_nh_request() documents which attributes
+ * may be modified.
+ *
+ * Returns:
+ * 0 on success, filling @nh_ret with the desired nexthop object
+ * errno otherwise
+ */
+static int
+create_nhop_from_nhop(struct rib_head *rnh, const struct nhop_object *nh_old,
+    struct rt_addrinfo *info, struct nhop_object **nh_ret)
+{
+	struct sockaddr_storage gw_storage;
+	struct nhop_request nh_req;
+	int error;
+
+	/* Seed the request with the attributes of the existing nexthop */
+	fill_nh_request_from_nhop(nh_old, &gw_storage, &nh_req);
+
+	/* On success nh_req.ifa and nh_req.ifp come back referenced */
+	error = alter_nh_request(info, rnh->rib_fibnum, &nh_req);
+	if (error != 0)
+		return (error);
+
+	/* Give protocol chance to alter the nexthop request */
+	if (rnh->rnh_preadd != NULL) {
+		error = rnh->rnh_preadd(rnh->rib_fibnum, info->rti_info[RTAX_DST],
+		    info->rti_info[RTAX_NETMASK], &nh_req);
+		if (error != 0) {
+			DPRINTF("failed to create nhop: prehook returned %d",
+			    error);
+			goto drop_refs;
+		}
+	}
+
+	*nh_ret = nhop_get(rnh, &nh_req);
+	if (*nh_ret != NULL)
+		return (0);
+
+	DPRINTF("failed to create nhop: nhop_get() failed");
+	/* XXX: verify */
+	error = EAGAIN;
+
+drop_refs:
+	/* Undo the references taken by alter_nh_request() */
+	ifa_free(nh_req.ifa);
+	if_rele(nh_req.ifp);
+	return (error);
+}
+
+/*
+ * Gets kernel-usable time of the route expiration from @info.
+ * Userland provides absolute expiration timestamp (UTC), this function
+ * converts it to the kernel uptime-based timestamp.
+ *
+ * Returns: kernel uptime-based timestamp of the route expiration or 0.
+ */
+static u_long
+get_expire_from_info(const struct rt_addrinfo *info)
+{
+
+	/* No expiration metric supplied (or not a positive one): report 0. */
+	if ((info->rti_mflags & RTV_EXPIRE) == 0 || !(info->rti_rmx->rmx_expire > 0))
+		return (0);
+
+	/* Userland (wall clock) -> kernel (uptime) timebase conversion. */
+	return (info->rti_rmx->rmx_expire - time_second + time_uptime);
+}
+
+/*
+ * Gets route weight from @info.
+ * If weight is not set (true in most cases, 2020-01), returns
+ * ROUTE_DEFAULT_WEIGHT (100). If the weight is too high,
+ * caps it to ROUTE_MAX_WEIGHT (2^24 -1).
+ */
+static uint32_t
+get_weight_from_info(const struct rt_addrinfo *info)
+{
+	uint32_t weight = ROUTE_DEFAULT_WEIGHT;
+
+	/*
+	 * Cap before narrowing to uint32_t so oversized metrics cannot wrap.
+	 */
+	if ((info->rti_mflags & RTV_WEIGHT) && (info->rti_rmx->rmx_weight > 0)) {
+		if (info->rti_rmx->rmx_weight > ROUTE_MAX_WEIGHT)
+			weight = ROUTE_MAX_WEIGHT;
+		else
+			weight = info->rti_rmx->rmx_weight;
+	}
+	return (weight);
+}
+
+/*
+ * Creates rtentry based on dst, mask and other metadata in @info.
+ * Sets the key, weight, expiration and flags; rt_nhop is not assigned here.
+ *
+ * Returns 0 on success, filling @ret_rt with referenced & unlocked
+ * rtentry. Returns ENOBUFS if an allocation fails.
+ */
+static int
+create_rte_from_info(struct rib_head *rnh, struct rt_addrinfo *info,
+ struct rtentry **ret_rt)
+{
+ struct sockaddr *gateway, *dst, *ndst, *netmask;
+ struct rtentry *rt;
+
+ dst = info->rti_info[RTAX_DST];
+ netmask = info->rti_info[RTAX_NETMASK];
+ gateway = info->rti_info[RTAX_GATEWAY]; /* NOTE(review): assigned but never read in this function */
+
+ rt = uma_zalloc(V_rtzone, M_NOWAIT);
+ if (rt == NULL) {
+ return (ENOBUFS);
+ }
+
+ /* Bump refcount to return referenced rte */
+ rt->rt_refcnt = 1;
+ rt->rt_fibnum = rnh->rib_fibnum;
+
+ if (dst->sa_len <= sizeof(struct sockaddr_in6)) {
+ memcpy(&rt->rt_dst, dst, dst->sa_len);
+ rt_key(rt) = &rt->rt_dst;
+ } else {
+ /* dst size is too big. Alloc separately; contents are filled by the copy below */
+ rt_key(rt) = malloc(dst->sa_len, M_RTABLE, M_NOWAIT);
+ if (rt_key(rt) == NULL) {
+ uma_zfree(V_rtzone, rt);
+ return (ENOBUFS);
+ }
+ }
+
+ /*
+ * point to the (possibly newly malloc'd) dest address.
+ */
+ ndst = (struct sockaddr *)rt_key(rt);
+
+ /*
+ * make sure it contains the value we want (masked if needed).
+ */
+ if (netmask != NULL) {
+ /* TODO: verify instead of masked copy */
+ rt_maskedcopy(dst, ndst, netmask);
+ if (!sa_equal(dst, ndst)) {
+ /* contract violation: currently only logged, the disabled code below would reject it */
+ char abuf[INET6_ADDRSTRLEN];
+ rib_print_sockaddr(abuf, INET6_ADDRSTRLEN, dst);
+ DPRINTF("warn: masked dst != dst (%s)", abuf);
+ /* XXX: fix callers! */
+#if 0
+ uma_zfree(V_rtzone, rt);
+ return (EINVAL);
+#endif
+ }
+ } else
+ bcopy(dst, ndst, dst->sa_len);
+
+ rt->rt_weight = get_weight_from_info(info);
+ rt->rt_expire = get_expire_from_info(info);
+ rt->rte_flags = info->rti_flags & RTE_RT_FLAG_MASK;
+
+ *ret_rt = rt;
+
+ return (0);
+}
+
+/*
+ * Allocates a copy of @rt_orig (key, weight, expiry, flags, nhop) in @rnh's fib.
+ */
+int
+create_rte_from_rte(struct rib_head *rnh, struct rtentry *rt_orig,
+    struct rtentry **ret_rt)
+{
+	struct sockaddr *key = rt_key(rt_orig);
+	struct rtentry *copy;
+
+	copy = uma_zalloc(V_rtzone, M_NOWAIT);
+	if (copy == NULL)
+		return (ENOBUFS);
+
+	/* Returned entry carries a single reference for the caller */
+	copy->rt_refcnt = 1;
+	copy->rt_fibnum = rnh->rib_fibnum;
+
+	if (key->sa_len > sizeof(struct sockaddr_in6)) {
+		/* Key does not fit the embedded storage: allocate it */
+		rt_key(copy) = malloc(key->sa_len, M_RTABLE, M_NOWAIT);
+		if (rt_key(copy) == NULL) {
+			uma_zfree(V_rtzone, copy);
+			return (ENOBUFS);
+		}
+		memcpy(rt_key(copy), key, key->sa_len);
+	} else {
+		memcpy(&copy->rt_dst, key, key->sa_len);
+		rt_key(copy) = &copy->rt_dst;
+	}
+
+	/* Inherit the remaining attributes; the nexthop pointer is copied as-is */
+	copy->rt_weight = rt_orig->rt_weight;
+	copy->rt_expire = rt_orig->rt_expire;
+	copy->rte_flags = rt_orig->rte_flags;
+	copy->rt_nhop = rt_orig->rt_nhop;
+
+	*ret_rt = copy;
+	return (0);
+}
+
+/*
+ * Swaps the radix entry for @dst/@mask with @rt_new; asserts the RIB write lock.
+ */
+int
+replace_rte(struct rib_head *rnh, struct sockaddr *dst, struct sockaddr *mask,
+    struct rtentry *rt_new)
+{
+
+	RIB_WLOCK_ASSERT(rnh);
+
+	/* Nothing to replace if the prefix cannot be removed from the tree */
+	if (rnh->rnh_deladdr(dst, mask, &rnh->head) == NULL)
+		return (ESRCH);
+
+	/* NB: if insertion fails the old entry has already been unlinked */
+	if (rnh->rnh_addaddr(dst, mask, &rnh->head, rt_new->rt_nodes) == NULL)
+		return (ENOBUFS);
+
+	return (0);
+}
+
+/*
+ * Verify that the combination of dst and gateway address families is supported.
+ *
+ * Combinations accepted by this check:
+ * gw_af == dst_af: default option for the routes with RTF_GATEWAY
+ * gw_af == AF_LINK: IPv4/IPv6 interface routes, storing interface index in sdl.
+ * gw_af == AF_UNSPEC: was used to provide raw ethernet header (not rejected here).
+ *
+ * Return 0 on success (also when dst or gateway is absent), EINVAL otherwise.
+ */
+static int
+verify_gateway_family(const struct rt_addrinfo *info)
+{
+	const struct sockaddr *dst = info->rti_info[RTAX_DST];
+	const struct sockaddr *gateway = info->rti_info[RTAX_GATEWAY];
+
+	if (dst == NULL || gateway == NULL)
+		return (0);
+	if (gateway->sa_family == dst->sa_family ||
+	    gateway->sa_family == AF_UNSPEC || gateway->sa_family == AF_LINK)
+		return (0);
+
+	return (EINVAL);
+}
+
+/*
+ * Checks whether the nhop is multipath-eligible.
+ * Nexthops without gateways and redirects are not.
+ *
+ * Returns 1 for multipath-eligible nexthop,
+ * 0 otherwise.
+ */
+int
+can_nh_multipath(const struct nhop_object *nh)
+{
+
+	/* Nexthops already flagged multipath qualify unconditionally */
+	if (nh->nh_flags & NHF_MULTIPATH)
+		return (1);
+	/* Everything else needs a gateway and must not be a redirect */
+	if ((nh->nh_flags & NHF_GATEWAY) == 0 ||
+	    (nh->nh_flags & NHF_REDIRECT) != 0)
+		return (0);
+	return (1);
+}
+
+/*
+ * Get relative route priority across other routes.
+ * Interface routes (RTF_PINNED) are the highest,
+ * normal routes go next.
+ * Redirected routes (RTF_DYNAMIC) have the least priority.
+ *
+ * Returns preference as a number, higher is better.
+ *
+ */
+static uint16_t
+get_rt_preference(int rt_flags)
+{
+	const int pinned = (rt_flags & RTF_PINNED) != 0;
+	const int dynamic = (rt_flags & RTF_DYNAMIC) != 0;
+
+	/*
+	 * bit 2: pinned, bit 1: not dynamic, bit 0: dynamic
+	 */
+	return ((pinned << 2) | ((!dynamic) << 1) | dynamic);
+}
+
+/*
+ * Tries to add route @rt_new to the RIB @rnh.
+ * Assumes @rt_new and @rt_new->rt_nhop are referenced and unlocked.
+ * @rc is written to unconditionally, so it must not be NULL.
+ *
+ * Depending on what is already stored for the prefix:
+ *  - prefix is absent: @rt_new is linked and @rc describes an RTM_ADD;
+ *  - prefix exists, the request carries RTF_PINNED and the existing
+ *    nexthop is not pinned: the existing rtentry is replaced, @rc describes
+ *    an RTM_CHANGE and the references on the old rtentry/nexthop are dropped;
+ *  - otherwise, with ROUTE_MPATH and both nexthops multipath-eligible, the
+ *    request is handed over to add_route_mpath();
+ *  - in all remaining cases EEXIST is returned.
+ *
+ * Return values:
+ * 0 for success. @rt_new and @rt_new->rt_nhop are consumed.
+ * != 0: Error code is returned. It is caller responsibility to free
+ *  @rt_new / @rt_new->rt_nhop.
+ */
+static int
+add_route(struct rib_head *rnh, struct rtentry *rt_new, struct rt_addrinfo *info,
+ struct rib_cmd_info *rc)
+{
+ struct rtentry *rt_orig;
+ struct nhop_object *nh_orig, *nh_new;
+ struct sockaddr *ndst, *netmask;
+ int error;
+
+ ndst = (struct sockaddr *)rt_key(rt_new);
+ netmask = info->rti_info[RTAX_NETMASK];
+ nh_new = rt_new->rt_nhop;
+
+ /* Default outcome; switched to RTM_CHANGE if a route gets replaced. */
+ rc->cmd = RTM_ADD;
+
+ RIB_WLOCK(rnh);
+ RT_LOCK(rt_new);
+
+ /* rnh_addaddr() returns NULL when the node could not be inserted. */
+ rt_orig = (struct rtentry *)rnh->rnh_addaddr(ndst, netmask, &rnh->head,
+ rt_new->rt_nodes);
+
+ if (rt_orig != NULL) {
+ /* Success. Update generation id. */
+ rnh->rnh_gen++;
+ /* Notify temporal routes of a new route */
+ if (rt_new->rt_expire != 0)
+ tmproutes_update(rnh, rt_new);
+ RIB_WUNLOCK(rnh);
+
+ /*
+ * Prepare notification:
+ * RTM_ADD, nh_old: NULL, nh_new: rt_new->rt_nhop
+ */
+ rc->nh_new = nh_new;
+ rc->rt = rt_new;
+ RT_UNLOCK(rt_new);
+
+ return (0);
+ }
+
+ /* Route addition failed. Inspect the prefix in the rib to determine the cause */
+ rt_orig = (struct rtentry *)rnh->rnh_lookup(ndst, netmask, &rnh->head);
+ if (rt_orig == NULL) {
+ /*
+ * The only reason this can happen is when
+ * rnh_addaddr fails to allocate memory, so the first error
+ * was not really "prefix exists".
+ * Unlock everything and return.
+ */
+ RIB_WUNLOCK(rnh);
+ RT_UNLOCK(rt_new);
+ RTSTAT_INC(rts_add_algo_fail);
+ return (ENOMEM);
+ }
+
+ /* We have existing route in the RIB. */
+ nh_orig = rt_orig->rt_nhop;
+ /* TODO: generalise to the protocol preferences */
+ if ((info->rti_flags & RTF_PINNED) && !NH_IS_PINNED(nh_orig)) {
+ /*
+ * Our new proposed route is an interface route so it
+ * takes precedence. Replace old nexthop & rte with a new pair.
+ */
+ error = replace_rte(rnh, ndst, netmask, rt_new);
+ RT_UNLOCK(rt_new);
+ /* The generation id changes only if the table was modified. */
+ if (error == 0)
+ rnh->rnh_gen++;
+ RIB_WUNLOCK(rnh);
+
+ if (error != 0)
+ return (error);
+
+ RTSTAT_INC(rts_add_pinned);
+
+ /* Update notification data */
+ rc->cmd = RTM_CHANGE;
+ rc->rt = rt_new;
+ rc->nh_new = nh_new;
+ rc->nh_old = nh_orig;
+
+ /*
+ * The replaced pair is no longer in the table: drop its references.
+ * NOTE(review): rc->nh_old still points at nh_orig after this;
+ * confirm it stays valid for the notification consumers.
+ */
+ RTFREE(rt_orig);
+ nhop_free_any(nh_orig);
+
+ return (0);
+ }
+
+#ifdef ROUTE_MPATH
+ /* Eligible for multipath? */
+ if (!can_rib_multipath(rnh) || (can_nh_multipath(rt_new->rt_nhop) == 0) ||
+ (can_nh_multipath(rt_orig->rt_nhop) == 0)) {
+ /*
+ * Multipath not enabled OR
+ * new NH is not a route with gw OR
+ * existing NH is NOT multipath group / gateway
+ */
+ RIB_WUNLOCK(rnh);
+ RT_UNLOCK(rt_new);
+ RTSTAT_INC(rts_mpath_ineligible);
+ return (EEXIST);
+ }
+
+ /*
+ * One or more routes is already in the RIB and we need to add
+ * another one, which requires getting a new nexthop group.
+ */
+ /* Snapshot weight and nexthop of the existing route before unlocking. */
+ unsigned int weight_orig = rt_orig->rt_weight;
+ nh_orig = rt_orig->rt_nhop;
+ RIB_WUNLOCK(rnh);
+ RT_UNLOCK(rt_new);
+
+ error = add_route_mpath(rnh, rt_new, nh_orig, weight_orig, info, rc);
+#else
+ /* Without multipath support an existing prefix is a plain conflict. */
+ RIB_WUNLOCK(rnh);
+ RT_UNLOCK(rt_new);
+ error = EEXIST;
+#endif
+ return (error);
+}
+
+/*
+ * Compares gateway address @gw with the gateway stored in nexthop @nh.
+ *
+ * Address families have to be equal. IPv4 and IPv6 gateways are compared
+ * by address only, any other family is compared bytewise over the length
+ * of the gateway stored in the nexthop.
+ *
+ * Returns 1 if the gateways match, 0 otherwise.
+ */
+int
+rib_match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw)
+{
+	const struct sockaddr_in *gw4;
+	const struct sockaddr_in6 *gw6;
+
+	if (nh->gw_sa.sa_family != gw->sa_family)
+		return (0);
+
+	if (gw->sa_family == AF_INET) {
+		gw4 = (const struct sockaddr_in *)gw;
+		return (nh->gw4_sa.sin_addr.s_addr == gw4->sin_addr.s_addr);
+	}
+
+	if (gw->sa_family == AF_INET6) {
+		/*
+		 * Currently (2020-01) IPv6 gws in kernel have their
+		 * scope embedded. Once this becomes false, this code
+		 * has to be revisited.
+		 */
+		gw6 = (const struct sockaddr_in6 *)gw;
+		return (IN6_ARE_ADDR_EQUAL(&nh->gw6_sa.sin6_addr,
+		    &gw6->sin6_addr) ? 1 : 0);
+	}
+
+	/* Generic case: raw comparison of the stored sockaddr. */
+	return (memcmp(&nh->gw_sa, gw, nh->gw_sa.sa_len) == 0);
+}
+
+/*
+ * Removes non-multipath route @rt from @rnh if it satisfies the delete
+ * request @info. The caller must hold the RIB write lock.
+ *
+ * The request is refused when:
+ *  - @rt is pinned but the request lacks RTF_PINNED (EADDRINUSE),
+ *  - the request names a gateway that differs from the one of @rt (ESRCH),
+ *  - the request carries a filter function that rejects @rt (ESRCH).
+ *
+ * Returns 0 on success; @rt is then unlinked and has RTF_UP cleared.
+ */
+int
+del_route_one(struct rib_head *rnh, struct rtentry *rt, struct rt_addrinfo *info)
+{
+	struct sockaddr *gw;
+	struct radix_node *rn_removed;
+	int pinned_req, gw_req;
+
+	RIB_WLOCK_ASSERT(rnh);
+	KASSERT((!NH_IS_MULTIPATH(rt->rt_nhop)), ("called with mpath route"));
+
+	/* TODO: generalise priorities */
+	pinned_req = ((info->rti_flags & RTF_PINNED) != 0);
+	if (NH_IS_PINNED(rt->rt_nhop) && !pinned_req) {
+		/* Only a pinned request may remove a pinned route. */
+		RTSTAT_INC(rts_del_fail_priority);
+		return (EADDRINUSE);
+	}
+
+	/* A gateway given in the request has to match the stored one. */
+	gw = info->rti_info[RTAX_GATEWAY];
+	gw_req = ((info->rti_flags & RTF_GATEWAY) != 0 && gw != NULL);
+	if (gw_req && rib_match_nhop_gw(rt->rt_nhop, gw) == 0)
+		return (ESRCH);
+
+	/* Let the optional caller-provided filter have the final say. */
+	if (info->rti_filter != NULL &&
+	    info->rti_filter(rt, rt->rt_nhop, info->rti_filterdata) == 0)
+		return (ESRCH);
+
+	/* All checks passed, unlink the record. */
+	rn_removed = rnh->rnh_deladdr(info->rti_info[RTAX_DST],
+	    info->rti_info[RTAX_NETMASK], &rnh->head);
+	if (rn_removed == NULL) {
+		/* Should not happen */
+		RTSTAT_INC(rts_del_algo_fail);
+		return (ESRCH);
+	}
+	KASSERT((struct rtentry *)rn_removed == rt,
+	    ("rnh_deladdr returned wrong rte: expected %p got %p", rt, rn_removed));
+
+	/* Mark rte as deleted */
+	rt->rte_flags &= ~RTF_UP;
+
+	return (0);
+}
+
+/*
+ * Tries to delete route specified by @info from @rnh.
+ *
+ * Multipath routes are handed over to del_route_mpath() (ROUTE_MPATH only)
+ * and require a gateway in the request; single-path routes are removed
+ * by del_route_one().
+ *
+ * Returns 0 on success. In the single-path case @rc is then filled with
+ * the removed rtentry, its nexthop and weight, and one reference on each
+ * of the rtentry and the nexthop is dropped before returning.
+ * NOTE(review): @rc keeps pointing at both objects after the references
+ * are dropped; confirm they remain valid until the notification is sent.
+ */
+static int
+del_route(struct rib_head *rnh, struct rt_addrinfo *info, struct rib_cmd_info *rc)
+{
+ struct sockaddr *dst, *netmask;
+ struct rtentry *rt;
+#ifdef ROUTE_MPATH
+ struct nhgrp_object *mp;
+#endif
+ int error;
+
+ dst = info->rti_info[RTAX_DST];
+ netmask = info->rti_info[RTAX_NETMASK];
+
+ rc->cmd = RTM_DELETE;
+
+ RIB_WLOCK(rnh);
+ /* Exact prefix lookup: the route to delete must exist. */
+ rt = (struct rtentry *)rnh->rnh_lookup(dst, netmask, &rnh->head);
+ if (rt == NULL) {
+ RIB_WUNLOCK(rnh);
+ return (ESRCH);
+ }
+
+ if (NH_IS_MULTIPATH(rt->rt_nhop)) {
+ /* We hit multipath group */
+#ifdef ROUTE_MPATH
+ if (info->rti_info[RTAX_GATEWAY] == NULL) {
+ /*
+ * No gateway specification in the delete
+ * request, aborting.
+ */
+ RIB_WUNLOCK(rnh);
+ return (ESRCH);
+ }
+
+ mp = (struct nhgrp_object *)rt->rt_nhop;
+
+ /*
+ * NOTE(review): the lock is released before del_route_mpath()
+ * runs; @rt and @mp are passed on without an extra reference.
+ */
+ RIB_WUNLOCK(rnh);
+
+ return (del_route_mpath(rnh, rt, mp, info, rc));
+#else
+ RIB_WUNLOCK(rnh);
+ return (ENOTSUP);
+#endif
+ }
+
+ error = del_route_one(rnh, rt, info);
+ RIB_WUNLOCK(rnh);
+ if (error != 0)
+ return (error);
+
+ RTSTAT_INC(rts_del_success);
+
+ /* Finalise notification data */
+ rc->rt = rt;
+ rc->nh_old = rt->rt_nhop;
+ rc->rt_weight = rt->rt_weight;
+
+ /*
+ * rt was removed from the tree as well as rt_nhop.
+ * Decrease their reference counts.
+ */
+ NH_FREE(rt->rt_nhop);
+ RTFREE(rt);
+
+ return (0);
+}
+
+/*
+ * Creates a copy of @rt_orig if the change request @info asks for an
+ * expiration time or a weight that differs from the one stored in
+ * @rt_orig. These values live in the rtentry itself, so changing them
+ * requires inserting a fresh rtentry.
+ *
+ * The requested expire/weight values are stored in the copy.
+ * Weight changes of multipath routes are not handled here, expire
+ * changes of multipath routes are refused.
+ *
+ * Returns 0 on success, storing the copy or NULL (nothing to change)
+ * in @ret. Returns ENOTSUP for an expire change on a multipath route or
+ * the error reported by create_rte_from_rte(); @ret is left untouched then.
+ */
+static int
+clone_rte_conditional(struct rib_head *rnh, struct rtentry *rt_orig,
+    struct nhop_object *nh_orig, struct rt_addrinfo *info, struct rtentry **ret)
+{
+	struct rtentry *rt_new;
+	int error;
+
+	rt_new = NULL;
+	if (info->rti_mflags & RTV_EXPIRE) {
+		if (NH_IS_MULTIPATH(nh_orig))
+			return (ENOTSUP);
+
+		if (get_expire_from_info(info) != rt_orig->rt_expire) {
+			error = create_rte_from_rte(rnh, rt_orig, &rt_new);
+			if (error != 0)
+				return (error);
+			/* The copy inherited the old value, apply the new one. */
+			rt_new->rt_expire = get_expire_from_info(info);
+		}
+	}
+	if ((info->rti_mflags & RTV_WEIGHT) && !NH_IS_MULTIPATH(nh_orig) &&
+	    get_weight_from_info(info) != rt_orig->rt_weight) {
+		/* Reuse the copy made for the expire change, if any. */
+		if (rt_new == NULL) {
+			error = create_rte_from_rte(rnh, rt_orig, &rt_new);
+			if (error != 0)
+				return (error);
+		}
+		rt_new->rt_weight = get_weight_from_info(info);
+	}
+
+	*ret = rt_new;
+	return (0);
+}
+
+/*
+ * Changes the route matching the prefix (and, optionally, the gateway)
+ * given in @info.
+ *
+ * The function works in three phases:
+ *  1) under the RIB read lock, find the rtentry and pick the nexthop to
+ *     change (for multipath routes the one whose gateway matches);
+ *  2) without any lock, build the replacement objects: an optional cloned
+ *     rtentry (weight/expire change), a new nexthop and, for multipath
+ *     routes, a new nexthop group;
+ *  3) under the RIB write lock, verify that the route has not changed
+ *     in the meantime and install the new objects.
+ *
+ * Returns 0 on success and fills @rc (cmd, rt, nh_old, nh_new).
+ * Returns EAGAIN if the route changed between phases 1 and 3, so that the
+ * caller can retry, ESRCH if no matching route/nexthop exists, ENOTSUP
+ * for multipath routes in kernels without ROUTE_MPATH, or the error
+ * reported by one of the helpers.
+ */
+static int
+change_route(struct rib_head *rnh, struct rt_addrinfo *info,
+    struct rib_cmd_info *rc)
+{
+	struct nhop_object *nh_orig, *nh_new, *nh_src, *nh_insert;
+	struct sockaddr *gw;
+	struct rtentry *rt_orig, *rt_curr, *rt_new;
+	int error;
+#ifdef ROUTE_MPATH
+	struct weightened_nhop *wn_orig, wn_new;
+	struct nhgrp_object *mp_new;
+	uint32_t changed_idx, num_nhops;
+	uint64_t modmask;
+#endif
+	RIB_RLOCK_TRACKER;
+
+	gw = info->rti_info[RTAX_GATEWAY];
+#ifdef ROUTE_MPATH
+	changed_idx = 0;
+	wn_new.nh = NULL;
+	wn_new.weight = 0;
+	mp_new = NULL;
+#endif
+
+	/* Notification code dispatches on the command, so it has to be set. */
+	rc->cmd = RTM_CHANGE;
+
+	RIB_RLOCK(rnh);
+	rt_orig = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST],
+	    info->rti_info[RTAX_NETMASK], &rnh->head);
+
+	if (rt_orig == NULL) {
+		RIB_RUNLOCK(rnh);
+		return (ESRCH);
+	}
+
+	nh_orig = rt_orig->rt_nhop;
+	if (NH_IS_MULTIPATH(nh_orig)) {
+#ifdef ROUTE_MPATH
+		if (gw == NULL) {
+			/* Unable to choose the proper nexthop */
+			RIB_RUNLOCK(rnh);
+			return (ESRCH);
+		}
+
+		nh_src = NULL;
+		wn_orig = nhgrp_get_nhops((struct nhgrp_object *)nh_orig,
+		    &num_nhops);
+		for (uint32_t i = 0; i < num_nhops; i++) {
+			if (rib_match_nhop_gw(wn_orig[i].nh, gw)) {
+				nh_src = wn_orig[i].nh;
+				/*
+				 * Remember the current weight while the
+				 * group is protected by the lock.
+				 */
+				wn_new.weight = wn_orig[i].weight;
+				changed_idx = i;
+				break;
+			}
+		}
+
+		if (nh_src == NULL) {
+			RIB_RUNLOCK(rnh);
+			return (ESRCH);
+		}
+#else
+		RIB_RUNLOCK(rnh);
+		return (ENOTSUP);
+#endif
+	} else {
+		if (gw != NULL && !rib_match_nhop_gw(nh_orig, gw)) {
+			RIB_RUNLOCK(rnh);
+			return (ESRCH);
+		}
+		nh_src = nh_orig;
+	}
+
+	/*
+	 * Chosen nexthop is nh_src, original rt is rt_orig, original
+	 * nhop/nhop group is nh_orig.
+	 * Drop the lock and try to create a new nexthop and a new
+	 * nhop group if needed.
+	 */
+	RIB_RUNLOCK(rnh);
+
+	/*
+	 * Route change may request weight / expire time change.
+	 * As these changes have to be stored in rtentry and we need
+	 * to maintain immutability of most fields, we clone&insert
+	 * cloned rtentry in the rib in such cases.
+	 *
+	 * Note: rt_new CAN be NULL and is NULL for all common cases.
+	 */
+	error = clone_rte_conditional(rnh, rt_orig, nh_orig, info, &rt_new);
+	if (error != 0)
+		return (error);
+
+	error = create_nhop_from_nhop(rnh, nh_src, info, &nh_new);
+	if (error != 0) {
+		if (rt_new != NULL)
+			RTFREE(rt_new);
+		return (error);
+	}
+	DPRINTF("Update nhop: %d -> %d", nh_orig->nh_priv->nh_idx,
+	    nh_new->nh_priv->nh_idx);
+
+#ifdef ROUTE_MPATH
+	if (NH_IS_MULTIPATH(nh_orig)) {
+		/*
+		 * Create mpath group with an updated nhop/weight.
+		 * The slot has to carry the freshly created nexthop,
+		 * otherwise the requested change would be lost.
+		 */
+		wn_new.nh = nh_new;
+		if (info->rti_mflags & RTV_WEIGHT)
+			wn_new.weight = get_weight_from_info(info);
+
+		mp_new = nhgrp_get_replace_nhop(rnh,
+		    (struct nhgrp_object *)nh_orig, &wn_new,
+		    changed_idx, &modmask, &error);
+
+		if (mp_new == NULL) {
+			/* Release the reference obtained above. */
+			NH_FREE(nh_new);
+			if (rt_new != NULL)
+				RTFREE(rt_new);
+			return (error);
+		}
+		/*
+		 * NOTE(review): confirm whether the group takes its own
+		 * reference on nh_new or consumes the one held here.
+		 */
+		nh_insert = (struct nhop_object *)mp_new;
+	} else
+#endif
+		nh_insert = nh_new;
+
+	if (rt_new != NULL)
+		rt_new->rt_nhop = nh_insert;
+
+	/*
+	 * Update notification metadata.
+	 * NOTE(review): for multipath routes nh_old is the single changed
+	 * nexthop while nh_new is the whole new group; verify that this is
+	 * what the notification consumers expect.
+	 */
+	rc->nh_old = nh_src;
+	rc->nh_new = nh_insert;
+
+	RIB_WLOCK(rnh);
+
+	/*
+	 * Lookup route once again as it may have been changed or deleted.
+	 */
+	rt_curr = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST],
+	    info->rti_info[RTAX_NETMASK], &rnh->head);
+
+	/* Check if anything has changed */
+	if ((rt_curr != rt_orig) || (rt_curr->rt_nhop != nh_orig)) {
+		/*
+		 * The original nexthop has changed. Free the resources
+		 * and return EAGAIN, so the caller can retry.
+		 */
+		RIB_WUNLOCK(rnh);
+		NH_FREE(nh_new);
+#ifdef ROUTE_MPATH
+		if (mp_new != NULL)
+			nhgrp_free_group(mp_new);
+#endif
+		if (rt_new != NULL)
+			RTFREE(rt_new);
+		return (EAGAIN);
+	}
+
+	if (rt_new != NULL) {
+		/* Weight/expire change: swap the whole rtentry. */
+		error = replace_rte(rnh, info->rti_info[RTAX_DST],
+		    info->rti_info[RTAX_NETMASK], rt_new);
+	} else {
+		/* Common case: only the nexthop pointer is switched. */
+		RT_LOCK(rt_orig);
+		rt_orig->rt_nhop = nh_insert;
+		RT_UNLOCK(rt_orig);
+		error = 0;
+	}
+
+	if (error == 0)
+		rnh->rnh_gen++;
+	RIB_WUNLOCK(rnh);
+
+	if (error != 0) {
+		/*
+		 * Failed to install new rte with new nexthop.
+		 * Free resources.
+		 */
+		NH_FREE(nh_new);
+#ifdef ROUTE_MPATH
+		if (mp_new != NULL)
+			nhgrp_free_group(mp_new);
+#endif
+		if (rt_new != NULL)
+			RTFREE(rt_new);
+
+		return (error);
+	}
+
+	/* Remove refcount from the old nhop */
+	nhop_free_any(nh_orig);
+
+	if (rt_new != NULL) {
+		RTFREE(rt_orig);
+		rc->rt = rt_new;
+	} else
+		rc->rt = rt_orig;
+
+	return (0);
+}
+
+/*
+ * Fills in the interface address and interface for the gateway
+ * stored in @info, looking them up in fib @fibnum when @info does not
+ * carry an ifa yet.
+ *
+ * Assumes RTAX_GATEWAY is set.
+ * Returns 0 on success, with references held on info->rti_ifa and
+ * info->rti_ifp.
+ * XXX: verify freeing refcount
+ */
+static int
+update_gateway_metadata(struct rt_addrinfo *info, int fibnum)
+{
+	int error;
+
+	KASSERT((info->rti_info[RTAX_GATEWAY] != NULL), ("gateway is NULL"));
+
+	/* Apply the same family rules as the route creation path. */
+	error = verify_gateway_family(info);
+	if (error != 0)
+		return (error);
+
+	if (info->rti_ifa != NULL) {
+		/* Caller-supplied ifa: take our own reference. */
+		ifa_ref(info->rti_ifa);
+	} else {
+		error = rt_getifa_fib(info, fibnum);
+		if (error != 0)
+			return (error);
+	}
+
+	/* Derive the interface from the ifa unless it was given. */
+	if (info->rti_ifp == NULL)
+		info->rti_ifp = info->rti_ifa->ifa_ifp;
+	if_ref(info->rti_ifp);
+
+	return (0);
+}
+
+
+
+/*
+ * Normalises the request @info before it is applied to the RIB:
+ * host routes (RTF_HOST) are stored without a netmask, so any netmask
+ * supplied with such a request is dropped.
+ */
+static void
+refine_info(struct rt_addrinfo *info)
+{
+
+	if ((info->rti_flags & RTF_HOST) == 0)
+		return;
+
+	/*
+	 * If we are adding a host route then we don't want to put
+	 * a netmask in the tree, nor do we want to clone it.
+	 */
+	info->rti_info[RTAX_NETMASK] = NULL;
+}
+
+/*
+ * Allocates rtentry and gets referenced&linked nhop, both described
+ * by @info.
+ *
+ * Returns 0 on success, storing rtentry with the valid nhop into @ret_rt.
+ * On failure nothing is stored in @ret_rt and everything allocated here
+ * is released, including a route key kept outside of the rtentry.
+ */
+static int
+create_rt_nh_pair_from_info(struct rib_head *rnh, struct rt_addrinfo *info,
+    struct rtentry **ret_rt)
+{
+	struct rtentry *rt;
+	struct nhop_object *nh;
+	int error;
+
+	error = create_rte_from_info(rnh, info, &rt);
+	if (error != 0) {
+		DPRINTF("failed to create rte: %d", error);
+		return (error);
+	}
+
+	DPRINTF("new rte %p af %d", rt, (int)(info->rti_info[RTAX_DST])->sa_family);
+
+	error = create_nhop_from_info(rnh, info, &nh);
+	if (error != 0) {
+		DPRINTF("failed to create nhop: %d", error);
+		/*
+		 * Undo the rtentry allocation the same way rib_add_route()
+		 * does: a key stored outside of the rtentry is freed too.
+		 */
+		if (rt_key(rt) != &rt->rt_dst)
+			R_Free(rt_key(rt));
+		uma_zfree(V_rtzone, rt);
+		return (error);
+	}
+
+	rt->rt_nhop = nh;
+
+	*ret_rt = rt;
+
+	return (0);
+}
+
+
+/*
+ * Adds route defined by @info into the kernel table specified by @fibnum and
+ * sa_family in @info->rti_info[RTAX_DST].
+ *
+ * Has to be called within the network epoch.
+ *
+ * Returns 0 on success and fills in operation metadata into @rc.
+ * Returns EINVAL for an RTF_GATEWAY request without a gateway or for an
+ * unsupported dst/gateway family combination, EAFNOSUPPORT if there is no
+ * routing table for the given fib/family, or the error reported by
+ * the helpers.
+ */
+int
+rib_add_route(u_int fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc)
+{
+	int error = 0;
+	struct rtentry *rt;
+	struct sockaddr *gateway, *dst;
+	struct rib_head *rnh;
+	int ifa_referenced = 0;
+
+	NET_EPOCH_ASSERT_INVARIANTS();
+
+	refine_info(info);
+
+	dst = info->rti_info[RTAX_DST];
+	gateway = info->rti_info[RTAX_GATEWAY];
+
+	if ((info->rti_flags & RTF_GATEWAY) && gateway == NULL)
+		return (EINVAL);
+	error = verify_gateway_family(info);
+	if (error != 0)
+		return (error);
+
+	/* ensure route is UP */
+	info->rti_flags |= RTF_UP;
+
+	if (info->rti_ifa == NULL) {
+		/* rt_getifa_fib() references ifa upon successful completion */
+		error = rt_getifa_fib(info, fibnum);
+		if (error != 0)
+			return (error);
+		ifa_referenced = 1;
+	}
+
+	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
+	if (rnh == NULL) {
+		/* No table for this fib/family: nothing to add the route to. */
+		if (ifa_referenced != 0)
+			ifa_free(info->rti_ifa);
+		return (EAFNOSUPPORT);
+	}
+
+	error = create_rt_nh_pair_from_info(rnh, info, &rt);
+	if (error != 0) {
+		if (ifa_referenced != 0)
+			ifa_free(info->rti_ifa);
+		return (error);
+	}
+
+	bzero(rc, sizeof(struct rib_cmd_info));
+
+	error = add_route(rnh, rt, info, rc);
+
+	/*
+	 * If it still failed to go into the tree,
+	 * then un-make it (this should be a function)
+	 */
+	if (error != 0) {
+		NH_FREE(rt->rt_nhop);
+		if (rt_key(rt) != &rt->rt_dst)
+			R_Free(rt_key(rt));
+		uma_zfree(V_rtzone, rt);
+		if (ifa_referenced != 0)
+			ifa_free(info->rti_ifa);
+		return (error);
+	}
+	RTSTAT_INC(rts_add_success);
+
+	rib_notify_subscribers(rnh, info, rc);
+
+	return (0);
+}
+
+
+/*
+ * Removes route defined by @info from the kernel table specified by @fibnum and
+ * sa_family in @info->rti_info[RTAX_DST].
+ *
+ * Has to be called within the network epoch.
+ *
+ * Returns 0 on success and fills in operation metadata into @rc.
+ * Returns EAFNOSUPPORT if there is no routing table for the given
+ * fib/family, or the error reported by del_route().
+ */
+int
+rib_del_route(u_int fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc)
+{
+	struct rib_head *rnh;
+	int error = 0;
+
+	NET_EPOCH_ASSERT_INVARIANTS();
+
+	refine_info(info);
+
+	rnh = rt_tables_get_rnh(fibnum, get_family_from_info(info));
+	if (rnh == NULL)
+		return (EAFNOSUPPORT);
+
+	bzero(rc, sizeof(struct rib_cmd_info));
+
+	error = del_route(rnh, info, rc);
+
+	/* Subscribers only hear about deletions that really happened. */
+	if (error == 0)
+		rib_notify_subscribers(rnh, info, rc);
+
+	return (error);
+}
+
+
+/*
+ * Changes route properties defined by @info in the kernel table specified by
+ * @fibnum and sa_family in @info->rti_info[RTAX_DST].
+ *
+ * Has to be called within the network epoch.
+ * The change is retried up to RIB_MAX_RETRIES times when it collides with
+ * a concurrent update of the same route (EAGAIN from change_route()).
+ *
+ * Returns 0 on success and fills in operation metadata into @rc.
+ * Returns EAFNOSUPPORT if there is no routing table for the given
+ * fib/family, or the error of the last change_route() attempt.
+ */
+int
+rib_change_route(u_int fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc)
+{
+	struct rib_head *rnh;
+	int error = 0;
+
+	NET_EPOCH_ASSERT_INVARIANTS();
+
+	refine_info(info);
+
+	rnh = rt_tables_get_rnh(fibnum, get_family_from_info(info));
+	if (rnh == NULL)
+		return (EAFNOSUPPORT);
+
+	bzero(rc, sizeof(struct rib_cmd_info));
+
+	for (int i = 0; i < RIB_MAX_RETRIES; i++) {
+		error = change_route(rnh, info, rc);
+		if (error != EAGAIN)
+			break;
+	}
+
+	if (error == 0)
+		rib_notify_subscribers(rnh, info, rc);
+
+	return (error);
+}
+
+/*
+ * Reads the multipath switch (V_rib_route_multipath) in the context
+ * of the vnet that RIB @rh belongs to.
+ *
+ * Returns 1 if the switch is non-zero, 0 otherwise.
+ */
+static int
+can_rib_multipath(struct rib_head *rh)
+{
+	int enabled;
+
+	CURVNET_SET(rh->rib_vnet);
+	enabled = (V_rib_route_multipath != 0);
+	CURVNET_RESTORE();
+
+	return (enabled);
+}
+
+/*
+ * Finds the route for @dst in fib @fibnum.
+ *
+ * @dst: destination to lookup.
+ * @mask: route netmask for exact prefix match. When NULL, a longest
+ *  prefix match on @dst is done instead.
+ *
+ * Returns 0 on success, filling @ret with found rtentry.
+ * rtentry is returned locked.
+ * Returns EAFNOSUPPORT if there is no table for the family of @dst and
+ * ESRCH if nothing (or only a radix root node) was found.
+ */
+int
+rib_lookup_route_netmask(u_int fibnum, const struct sockaddr *dst,
+    const struct sockaddr *mask, struct rtentry **ret)
+{
+	RIB_RLOCK_TRACKER;
+	struct rib_head *rnh;
+	struct radix_node *rn;
+	struct rtentry *rt;
+
+	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
+	if (rnh == NULL)
+		return (EAFNOSUPPORT);
+
+	RIB_RLOCK(rnh);
+
+	/* Longest prefix match without a mask, exact match with one. */
+	rn = (mask == NULL) ?
+	    rnh->rnh_matchaddr(__DECONST(void *, dst), &rnh->head) :
+	    rnh->rnh_lookup(__DECONST(void *, dst),
+	    __DECONST(void *, mask), &rnh->head);
+
+	/* Radix root nodes do not represent real routes. */
+	if (rn == NULL || (rn->rn_flags & RNF_ROOT) != 0) {
+		RIB_RUNLOCK(rnh);
+		return (ESRCH);
+	}
+
+	/* Lock the entry before the table lock is released. */
+	rt = RNTORT(rn);
+	RT_LOCK(rt);
+	RIB_RUNLOCK(rnh);
+
+	*ret = rt;
+	return (0);
+}
+
+/*
+ * Computes the prefix length of the mask stored in the first @len bits
+ * of @p, most significant bit of each byte first.
+ *
+ * Returns the number of leading set bits, or -1 if a set bit follows
+ * the first unset one (mask is not contiguous).
+ */
+static int
+contigmask(const uint8_t *p, int len)
+{
+	int plen, pos;
+
+	/* Count the leading run of ones. */
+	plen = 0;
+	while (plen < len && (p[plen / 8] & (0x80 >> (plen % 8))) != 0)
+		plen++;
+
+	/* Every bit after the first zero has to be zero as well. */
+	for (pos = plen + 1; pos < len; pos++) {
+		if ((p[pos / 8] & (0x80 >> (pos % 8))) != 0)
+			return (-1);
+	}
+
+	return (plen);
+}
+
+
+/*
+ * Retrieves address and prefix from @rt.
+ * @dst: prefix dst storage. Can be NULL, if not NULL, buffer size in sa_len.
+ * @netmask: prefix mask storage. Can be NULL, if not NULL, buffer size in
+ *  sa_len. The stored mask gets the family and length of the route key.
+ * @plen: CIDR len, can be NULL. -1 on failure (non-contig mask or
+ *  a family other than AF_INET/AF_INET6).
+ *
+ * Routes without a mask (host routes) are reported with an all-ones
+ * address for AF_INET/AF_INET6 and the maximum prefix length; for other
+ * families the mask body is zero-filled.
+ *
+ * Returns 0 on success, ENOBUFS if @dst or @netmask is too small to hold
+ * a sockaddr of the route key size.
+ */
+int
+rib_get_entry_prefix(const struct rtentry *rt, struct sockaddr *dst,
+    struct sockaddr *netmask, int *plen)
+{
+	const struct sockaddr *key, *mask;
+
+	key = rt_key_const(rt);
+	mask = rt_mask_const(rt);
+
+	if (dst != NULL) {
+		if (key->sa_len > dst->sa_len)
+			return (ENOBUFS);
+		memcpy(dst, key, key->sa_len);
+	}
+
+	if (netmask != NULL) {
+		size_t copy_len;
+
+		/*
+		 * The result always has the size of the key, so this is
+		 * the size the caller's buffer has to provide.
+		 */
+		if (key->sa_len > netmask->sa_len)
+			return (ENOBUFS);
+		bzero(netmask, key->sa_len);
+
+		if (mask != NULL) {
+			/*
+			 * Currently in-tree netmasks
+			 * a) do not have address family attached
+			 * b) have different notion of sa_len,
+			 *    limiting it to the amount of
+			 *    non-zero bytes in netmask to
+			 *    speedup lookup.
+			 * Copy the meaningful part only; the remainder
+			 * stays zeroed, family/len come from the key.
+			 */
+			copy_len = mask->sa_len;
+			if (copy_len > key->sa_len)
+				copy_len = key->sa_len;
+			memcpy(netmask, mask, copy_len);
+		} else if (key->sa_family == AF_INET) {
+			/* Host route: every address bit is significant. */
+			((struct sockaddr_in *)netmask)->sin_addr.s_addr =
+			    0xffffffff;
+		} else if (key->sa_family == AF_INET6) {
+			uint8_t *m6;
+
+			m6 = (uint8_t *)
+			    &((struct sockaddr_in6 *)netmask)->sin6_addr;
+			for (size_t i = 0; i < sizeof(struct in6_addr); i++)
+				m6[i] = 0xff;
+		}
+		netmask->sa_family = key->sa_family;
+		netmask->sa_len = key->sa_len;
+	}
+
+	if (plen != NULL) {
+		const uint8_t *bits = NULL;
+		int max_prefix = -1;
+
+		if (key->sa_family == AF_INET) {
+			max_prefix = 32;
+			if (mask != NULL)
+				bits = (const uint8_t *)
+				    &((const struct sockaddr_in *)mask)->sin_addr;
+		} else if (key->sa_family == AF_INET6) {
+			max_prefix = 128;
+			if (mask != NULL)
+				bits = (const uint8_t *)
+				    &((const struct sockaddr_in6 *)mask)->sin6_addr;
+		}
+
+		/* No mask means a host route: the prefix is the full address. */
+		if (bits != NULL)
+			*plen = contigmask(bits, max_prefix);
+		else
+			*plen = max_prefix;
+	}
+
+	return (0);
+}
+
+/*
+ * Copies the destination (route key) of @rt into @dst, a buffer
+ * of @sa_len bytes.
+ *
+ * Returns @dst and sets @error to 0 on success.
+ * Returns NULL and sets @error to ENOBUFS if the buffer is too small.
+ */
+struct sockaddr *
+rib_get_entry_dst_sa(const struct rtentry *rt, struct sockaddr *dst,
+    size_t sa_len, int *error)
+{
+	const struct sockaddr *key;
+
+	key = rt_key_const(rt);
+	if (key->sa_len <= sa_len) {
+		memcpy(dst, key, key->sa_len);
+		*error = 0;
+		return (dst);
+	}
+
+	*error = ENOBUFS;
+	return (NULL);
+}
+
+/*
+ * Stores the netmask of @rt into @netmask, a buffer of @sa_len bytes.
+ * The stored sockaddr gets the family and the length of the route key.
+ *
+ * Returns @netmask and sets @error to 0 on success.
+ * Returns NULL with @error set to 0 if @rt has no netmask.
+ * Returns NULL with @error set to ENOBUFS if the buffer cannot hold
+ * a sockaddr of the route key size.
+ */
+struct sockaddr *
+rib_get_entry_netmask_sa(const struct rtentry *rt, struct sockaddr *netmask,
+    size_t sa_len, int *error)
+{
+	const struct sockaddr *src = rt_mask_const(rt);
+	const struct sockaddr *dst;
+	size_t copy_len;
+
+	if (src == NULL) {
+		*error = 0;
+		return (NULL);
+	}
+
+	/*
+	 * The result is written with the length of the key, which can be
+	 * larger than the (possibly truncated) length of the stored mask.
+	 * Both have to fit into the caller's buffer.
+	 */
+	dst = rt_key_const(rt);
+	if (src->sa_len > sa_len || dst->sa_len > sa_len) {
+		*error = ENOBUFS;
+		return (NULL);
+	}
+
+	*error = 0;
+
+	/*
+	 * Currently in-tree netmasks
+	 * a) do not have address family attached
+	 * b) have different notion of sa_len,
+	 *    limiting it to the amount of
+	 *    non-zero bytes in netmask to
+	 *    speedup lookup.
+	 * Fix this by taking the remaining data
+	 * from the key.
+	 */
+	bzero(netmask, dst->sa_len);
+	netmask->sa_len = dst->sa_len;
+	netmask->sa_family = dst->sa_family;
+
+	switch (dst->sa_family) {
+	case AF_INET:
+		((struct sockaddr_in *)netmask)->sin_addr =
+		    ((const struct sockaddr_in *)src)->sin_addr;
+		break;
+	case AF_INET6:
+		((struct sockaddr_in6 *)netmask)->sin6_addr =
+		    ((const struct sockaddr_in6 *)src)->sin6_addr;
+		break;
+	default:
+		/* Copy the meaningful part only, the rest stays zeroed. */
+		copy_len = src->sa_len;
+		if (copy_len > dst->sa_len)
+			copy_len = dst->sa_len;
+		memcpy(netmask, src, copy_len);
+		netmask->sa_family = dst->sa_family;
+		netmask->sa_len = dst->sa_len;
+	}
+
+	return (netmask);
+}
+
+/*
+ * Returns the CIDR prefix length of @rt.
+ *
+ * For AF_INET/AF_INET6 routes this is the length of the stored mask, or
+ * the maximum length (32/128) when the route has no mask.
+ * Returns -1 for a non-contiguous mask and for any other family.
+ */
+int
+rib_get_entry_plen(const struct rtentry *rt)
+{
+	int family = (rt_key_const(rt))->sa_family;
+	const struct sockaddr *src = rt_mask_const(rt);
+	const uint8_t *bits = NULL;
+	int max_prefix = -1;
+
+	/* Only look into the mask when there is one. */
+	if (family == AF_INET) {
+		max_prefix = 32;
+		if (src != NULL)
+			bits = (const uint8_t *)
+			    &((const struct sockaddr_in *)src)->sin_addr;
+	} else if (family == AF_INET6) {
+		max_prefix = 128;
+		if (src != NULL)
+			bits = (const uint8_t *)
+			    &((const struct sockaddr_in6 *)src)->sin6_addr;
+	}
+
+	if (bits == NULL)
+		return (max_prefix);
+
+	return (contigmask(bits, max_prefix));
+}
+
+/*
+ * Returns the weight stored in @rt.
+ */
+int
+rib_get_entry_weight(const struct rtentry *rt)
+{
+	int weight = rt->rt_weight;
+
+	return (weight);
+}
+
+/*
+ * Returns the route flags of @rt with nexthop @nh: the flags kept in
+ * the rtentry combined with the ones kept in the nexthop.
+ */
+int
+rib_get_entry_rtflags(const struct rtentry *rt, const struct nhop_object *nh)
+{
+	int flags = (rt->rte_flags | nh->nh_priv->rt_flags);
+
+	return (flags);
+}
+
+/*
+ * Returns the nexthop (or nexthop group) currently attached to @rt.
+ */
+const struct nhop_object *
+rib_get_entry_nhop(const struct rtentry *rt)
+{
+	const struct nhop_object *nh = rt->rt_nhop;
+
+	return (nh);
+}
+
+/*
+ * Returns the address family of @rt, taken from its key.
+ */
+sa_family_t
+rib_get_entry_family(const struct rtentry *rt)
+{
+	const struct sockaddr *key = rt_key_const(rt);
+
+	return (key->sa_family);
+}
+
+/*
+ * Returns the fib number stored in @rt.
+ */
+unsigned int
+rib_get_entry_fibnum(const struct rtentry *rt)
+{
+	unsigned int fibnum = rt->rt_fibnum;
+
+	return (fibnum);
+}
+
+/*
+ * Returns the expiration time stored in @rt.
+ */
+unsigned long
+rib_get_entry_expire_time(const struct rtentry *rt)
+{
+	unsigned long expire = rt->rt_expire;
+
+	return (expire);
+}
+
+/*
+int
+rib_is_host_entry(const struct rtentry *rt)
+{
+
+ return (rt->rt_flags & RTF_HOST);
+}
+*/
+
+
+
+
+
+
+#include "tests/routing/test_route_ctl.h"
+
Index: sys/net/route/route_helpers.c
===================================================================
--- /dev/null
+++ sys/net/route/route_helpers.c
@@ -0,0 +1,387 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_route.h"
+#include "opt_route_mpath.h"
+
+#include <sys/param.h>
+#include <sys/jail.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/sysproto.h>
+#include <sys/proc.h>
+#include <sys/domain.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_dl.h>
+#include <net/route.h>
+#include <net/route_var.h>
+#include <net/route/shared.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/ip_mroute.h>
+
+#include <net/route/nhop_utils.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_var.h>
+#include <net/route/rtentry_var.h>
+
+#include <vm/uma.h>
+
+/*
+ * RIB helper functions.
+ */
+
+/*
+ * Operation results generated by the rib_<add|delete|change>_route()
+ * can represent fairly complex operations on multiple paths, making
+ * the notification handlers much more complex than they need to be.
+ * This function serves as a helper, decomposing such notifications into
+ * a list of simple <path, add/del/change action> operations and calling
+ * the provided callback on each primitive operation.
+ *
+ * @rc: result of the operation; rc->cmd selects the decomposition.
+ * @cb: callback invoked as cb(cmd, rnh, info, rt, nh_old, nh_new,
+ *  weight, cbdata) for every primitive operation.
+ *
+ * Commands other than RTM_ADD/RTM_DELETE/RTM_CHANGE produce no callbacks.
+ * Always returns 0.
+ */
+int
+rib_decompose_notification(struct rib_head *rnh, struct rt_addrinfo *info,
+    struct rib_cmd_info *rc, route_notification_t *cb, void *cbdata)
+{
+#ifdef ROUTE_MPATH
+	struct weightened_nhop *wn;
+	uint32_t num_nhops;
+#endif
+
+	DPRINTF("rnh=%p cb=%p info=%p, cmd=%d nh_old=%p nh_new=%p change_mask=%X",
+	    rnh, cb, info, rc->cmd, rc->nh_old, rc->nh_new,
+	    (uint32_t)rc->mask_changed);
+	switch (rc->cmd) {
+	case RTM_ADD:
+		if (!NH_IS_MULTIPATH(rc->nh_new)) {
+			cb(RTM_ADD, rnh, info, rc->rt, NULL, rc->nh_new,
+			    rc->rt->rt_weight, cbdata);
+			break;
+		}
+#ifdef ROUTE_MPATH
+		/* Report only the group members flagged as changed. */
+		wn = nhgrp_get_nhops((struct nhgrp_object *)rc->nh_new, &num_nhops);
+		for (uint32_t i = 0; i < num_nhops; i++) {
+			if (rc->mask_changed & ((uint64_t)1 << i))
+				cb(RTM_ADD, rnh, info, rc->rt, NULL, wn[i].nh,
+				    wn[i].weight, cbdata);
+		}
+#endif
+		break;
+	case RTM_DELETE:
+		if (!NH_IS_MULTIPATH(rc->nh_old)) {
+			cb(RTM_DELETE, rnh, info, rc->rt, rc->nh_old, NULL,
+			    rc->rt->rt_weight, cbdata);
+			break;
+		}
+#ifdef ROUTE_MPATH
+		wn = nhgrp_get_nhops((struct nhgrp_object *)rc->nh_old, &num_nhops);
+		for (uint32_t i = 0; i < num_nhops; i++) {
+			if (rc->mask_changed & ((uint64_t)1 << i)) {
+				DPRINTF("RTM_DELETE: rnh=%p info=%p i=%d wn=%p",
+				    rnh, info, i, wn);
+				cb(RTM_DELETE, rnh, info, rc->rt, wn[i].nh,
+				    NULL, wn[i].weight, cbdata);
+			}
+		}
+#endif
+		break;
+	case RTM_CHANGE:
+		/*
+		 * Current rtsock API does not allow changing more than one path at
+		 * once. This will change in the future, as the most
+		 * efficient way of dealing with large number of multipath routes
+		 * is to allow routing daemon to have direct control over nexthops
+		 * and multipath objects.
+		 *
+		 * Additionally, there is a case with a force switch from multipath
+		 * route to the interface route. This is a corner case, which should
+		 * be infrequent.
+		 *
+		 * Given that, implement mpath <> mpath support in the easiest way,
+		 * postponing more performant implementation till other
+		 * rtsock / netlink changes.
+		 */
+#ifdef ROUTE_MPATH
+		if (NH_IS_MULTIPATH(rc->nh_old) || NH_IS_MULTIPATH(rc->nh_new)) {
+			uint32_t num_old, num_new;
+			struct weightened_nhop *wn_old, *wno, *wn_new, *wnn;
+			struct weightened_nhop tmp = { NULL, 0 };
+			struct nhgrp_object *mp;
+			uint32_t idx_old, idx_new;
+			uint64_t bmask;
+
+			/*
+			 * Present both sides as arrays of weighted nexthops.
+			 * At most one side is a single nexthop here, so one
+			 * temporary element is enough.
+			 */
+			if (NH_IS_MULTIPATH(rc->nh_old)) {
+				mp = (struct nhgrp_object *)rc->nh_old;
+				wn_old = nhgrp_get_nhops(mp, &num_old);
+			} else {
+				tmp.nh = rc->nh_old;
+				tmp.weight = rc->rt_weight;
+				wn_old = &tmp;
+				num_old = 1;
+			}
+			if (NH_IS_MULTIPATH(rc->nh_new)) {
+				mp = (struct nhgrp_object *)rc->nh_new;
+				wn_new = nhgrp_get_nhops(mp, &num_new);
+			} else {
+				tmp.nh = rc->nh_new;
+				tmp.weight = rc->rt_weight;
+				wn_new = &tmp;
+				num_new = 1;
+			}
+
+			int found;
+			/* bmask: new nexthops that were present in the old set */
+			bmask = 0;
+			for (idx_old = 0; idx_old < num_old; idx_old++) {
+				wno = &wn_old[idx_old];
+				found = 0;
+				for (idx_new = 0; idx_new < num_new; idx_new++) {
+					wnn = &wn_new[idx_new];
+					if (wno->nh != wnn->nh)
+						continue;
+					bmask |= ((uint64_t)1 << idx_new);
+					found = 1;
+					/* Same nexthop: only a weight change matters. */
+					if (wno->weight != wnn->weight) {
+						cb(RTM_CHANGE, rnh, info, rc->rt,
+						    wno->nh, wnn->nh, wnn->weight,
+						    cbdata);
+					}
+					break;
+				}
+				if (found == 0) {
+					DPRINTF("RTM_DELETE: rnh=%p info=%p rt=%p wn=%p nh[%d]=%p nh_old=%p nh_new=%p",
+					    rnh, info, rc->rt, wn_old,
+					    idx_old, wno->nh, rc->nh_old,
+					    rc->nh_new);
+					cb(RTM_DELETE, rnh, info, rc->rt, wno->nh,
+					    NULL, wno->weight, cbdata);
+				}
+			}
+			/* Nexthops not matched above are the newly added ones. */
+			for (idx_new = 0; idx_new < num_new; idx_new++) {
+				if ((bmask & ((uint64_t)1 << idx_new)) != 0)
+					continue;
+				wnn = &wn_new[idx_new];
+				cb(RTM_ADD, rnh, info, rc->rt, NULL, wnn->nh,
+				    wnn->weight, cbdata);
+			}
+
+			break;
+		}
+#endif
+
+		/* Weight changes ? */
+		cb(RTM_CHANGE, rnh, info, rc->rt, rc->nh_old, rc->nh_new,
+		    rc->rt->rt_weight, cbdata);
+		break;
+	default:
+		/* Nothing to decompose for other commands. */
+		break;
+	}
+
+	return (0);
+}
+
+/*
+ * Invokes the ifa_rtrequest handler of the interface address attached
+ * to nexthop @nh, if that address has one.
+ */
+static void
+rt_ifa_rtrequest_one(int cmd, struct rtentry *rt, struct nhop_object *nh,
+    struct rt_addrinfo *info)
+{
+	struct ifaddr *ifa;
+
+	ifa = nh->nh_ifa;
+	if (ifa->ifa_rtrequest != NULL)
+		ifa->ifa_rtrequest(cmd, rt, nh, info);
+}
+
+/*
+ * Notification callback passing primitive route operations on to the
+ * interface addresses involved:
+ *  RTM_ADD notifies the address of @nh_new,
+ *  RTM_DELETE notifies the address of @nh_old,
+ *  RTM_CHANGE is reported as a delete on the old address followed by an
+ *  add on the new one, unless both nexthops use the same address.
+ * @rnh, @weight and @cbdata are not used.
+ */
+static void
+rt_notify_ifa_handler(int cmd, struct rib_head *rnh, struct rt_addrinfo *info,
+    struct rtentry *rt, struct nhop_object *nh_old, struct nhop_object *nh_new, uint32_t weight, void *cbdata)
+{
+
+	if (cmd == RTM_ADD) {
+		rt_ifa_rtrequest_one(RTM_ADD, rt, nh_new, info);
+	} else if (cmd == RTM_DELETE) {
+		rt_ifa_rtrequest_one(RTM_DELETE, rt, nh_old, info);
+	} else if (cmd == RTM_CHANGE) {
+		/* Same interface address on both sides: nothing to report. */
+		if (nh_old->nh_ifa == nh_new->nh_ifa)
+			return;
+		rt_ifa_rtrequest_one(RTM_DELETE, rt, nh_old, info);
+		rt_ifa_rtrequest_one(RTM_ADD, rt, nh_new, info);
+	}
+}
+
+/*
+ * old_nh, new_nh, bmask
+ *
+ * ADD [] -> [1] + (NULL, new, 1)
+ * CHANGE [1] -> [2] + (old, new, 1)
+ * CHANGE [1:w1] -> [1:w2] -> ?
+ * DEL [1] (old, NULL, ?)
+ * -
+ * ADD [1] -> [1, 2] + (old, new, 2)
+ * ADD [1] -> [1, 2, 3] + (old, new, 2,3)
+ * CHANGE [1, 2] -> [3, 4] ? old_bmask?
+ * CHANGE [1, 2:w1] -> [1, 2:w2]
+ * CHANGE [1, 2:w1, 3] -> [2:w2]
+ * DEL [1, 2, 3] -> [1]
+ *
+ */
+/*
+ * Reports the operation described by @rc to the interested parties.
+ * The operation is split into primitive per-path events, each of which
+ * is delivered to rt_notify_ifa_handler().
+ */
+void
+rib_notify_subscribers(struct rib_head *rnh, struct rt_addrinfo *info,
+    struct rib_cmd_info *rc)
+{
+	route_notification_t *handler = rt_notify_ifa_handler;
+
+	rib_decompose_notification(rnh, info, rc, handler, NULL);
+}
+
+
+/*
+ * Prints sockaddr @s into supplied buffer @buf of size @buflen.
+ * AF_INET and AF_INET6 addresses are printed in their presentation
+ * format, any other family as "unknown_af:<family>:len:<sa_len>".
+ *
+ * Returns length of the resulting string excluding last '\0';
+ * 0 if the address could not be converted.
+ */
+int
+rib_print_sockaddr(char *buf, int buflen, const struct sockaddr *s)
+{
+	const void *paddr;
+
+	if (s->sa_family == AF_INET) {
+		paddr = &((const struct sockaddr_in *)s)->sin_addr;
+	} else if (s->sa_family == AF_INET6) {
+		paddr = &((const struct sockaddr_in6 *)s)->sin6_addr;
+	} else {
+		/* No printer for this family, describe it instead. */
+		return (snprintf(buf, buflen, "unknown_af:%d:len:%d",
+		    s->sa_family, s->sa_len));
+	}
+
+	if (inet_ntop(s->sa_family, paddr, buf, buflen) == NULL)
+		return (0);
+
+	return (strlen(buf));
+}
+
+/*
+ * Calls @wa_f with @arg for each entry in the table specified by
+ * @af and @fibnum. Does nothing if there is no such table.
+ *
+ * Table is traversed under read lock.
+ */
+void
+rib_walk(int af, u_int fibnum, rt_walktree_f_t *wa_f, void *arg)
+{
+	RIB_RLOCK_TRACKER;
+	struct rib_head *rnh;
+
+	rnh = rt_tables_get_rnh(fibnum, af);
+	if (rnh != NULL) {
+		RIB_RLOCK(rnh);
+		rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f, arg);
+		RIB_RUNLOCK(rnh);
+	}
+}
+
+/*
+ * Dispatches the RIB operation @cmd on fib @fibnum to
+ * rib_add_route(), rib_del_route() or rib_change_route().
+ *
+ * Returns the result of the selected operation, or ENOTSUP for an
+ * unknown @cmd.
+ */
+int
+rib_request(enum rib_cmd_type cmd, u_int fibnum, struct rt_addrinfo *info,
+    struct rib_cmd_info *rc)
+{
+
+	switch (cmd) {
+	case RIB_ADD:
+		return (rib_add_route(fibnum, info, rc));
+	case RIB_DEL:
+		return (rib_del_route(fibnum, info, rc));
+	case RIB_CHANGE:
+		return (rib_change_route(fibnum, info, rc));
+	default:
+		return (ENOTSUP);
+	}
+}
+
+/*
+ * Convenience wrapper around rib_request(): performs operation @cmd for
+ * the route denoted by @dst, @mask and @gw with flags @rt_flags in fib
+ * @fibnum. The network epoch is entered for the duration of the request.
+ *
+ * Return 0 on success.
+ */
+int
+rib_request_simple(enum rib_cmd_type cmd, u_int fibnum, struct sockaddr *dst,
+    struct sockaddr *mask, struct sockaddr *gw, int rt_flags)
+{
+	struct epoch_tracker et;
+	struct rib_cmd_info rc;
+	struct rt_addrinfo req;
+	int error;
+
+	/* Build the request out of the supplied pieces. */
+	bzero(&req, sizeof(req));
+	req.rti_info[RTAX_DST] = dst;
+	req.rti_info[RTAX_NETMASK] = mask;
+	req.rti_info[RTAX_GATEWAY] = gw;
+	req.rti_flags = rt_flags;
+
+	NET_EPOCH_ENTER(et);
+	error = rib_request(cmd, fibnum, &req, &rc);
+	NET_EPOCH_EXIT(et);
+
+	return (error);
+}
+
+/*
+ * Checks if rte can be exported w.r.t. jails/vnets.
+ *
+ * Host routes are checked with prison_if() against the route key,
+ * all other routes with jailed_without_vnet().
+ *
+ * Returns 1 if it can, 0 otherwise.
+ */
+int
+rib_can_export_rte(struct ucred *td_ucred, const struct rtentry *rt)
+{
+	int denied;
+
+	if (!RT_IS_HOST_ROUTE(rt))
+		denied = jailed_without_vnet(td_ucred);
+	else
+		denied = (prison_if(td_ucred, rt_key_const(rt)) != 0);
+
+	return (denied ? 0 : 1);
+}
+
Index: sys/net/route/rtentry_var.h
===================================================================
--- /dev/null
+++ sys/net/route/rtentry_var.h
@@ -0,0 +1,146 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1980, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)route.h 8.4 (Berkeley) 1/9/95
+ * $FreeBSD$
+ */
+
+/*
+ * This header contains struct rtentry definition and supporting macro.
+ *
+ * Header is not intended to be included by the code external to the
+ * routing subsystem.
+ */
+
+#ifndef _NET_RTENTRY_VAR_H_
+#define _NET_RTENTRY_VAR_H_
+
+#if defined(_KERNEL)
+
+#include <sys/counter.h>
+
+#define rt_key_const(r) (*((const struct sockaddr * const *)(&(r)->rt_nodes->rn_key)))
+#define rt_mask_const(r) (*((const struct sockaddr * const *)(&(r)->rt_nodes->rn_mask)))
+struct rtentry {
+ struct radix_node rt_nodes[2]; /* tree glue, and other values */
+ /*
+ * XXX struct rtentry must begin with a struct radix_node (or two!)
+ * because the code does some casts of a 'struct radix_node *'
+ * to a 'struct rtentry *' (see RNTORT()).
+ */
+#define rt_key(r) (*((struct sockaddr **)(&(r)->rt_nodes->rn_key)))
+#define rt_mask(r) (*((struct sockaddr **)(&(r)->rt_nodes->rn_mask)))
+ /*
+ * The 2 radix nodes above consist of 2x6 pointers, leaving
+ * 4 pointers / 32 bytes in the cacheline on amd64.
+ */
+ struct nhop_object *rt_nhop; /* nexthop data */
+ union { /* one storage area, typed as IPv4, IPv6 or generic sockaddr */
+ struct sockaddr_in rt_dst4;
+ struct sockaddr_in6 rt_dst6;
+ struct sockaddr rt_dst;
+ };
+
+ /*
+ * sizeof(struct sockaddr_in6) == 28 on amd64,
+ * however, the dataplane-relevant part (e.g. address)
+ * lies at offset 8..24, so it still fits at the end of the cache line.
+ */
+
+ int rte_flags; /* up/down?, host/net */
+ int rt_refcnt; /* # held references */
+ u_int rt_fibnum; /* which FIB */
+ u_long rt_weight; /* absolute weight */
+ u_long rt_expire; /* lifetime for route, e.g. redirect */
+#define rt_endzero rt_mtx /* presumably marks where zeroing stops - verify */
+ struct mtx rt_mtx; /* mutex for routing entry */
+ struct rtentry *rt_chain; /* pointer to next rtentry to delete */
+ struct epoch_context rt_epoch_ctx; /* net epoch tracker */
+};
+
+#define RT_LOCK_INIT(_rt) /* wrappers around the per-rtentry rt_mtx */ \
+ mtx_init(&(_rt)->rt_mtx, "rtentry", NULL, MTX_DEF | MTX_DUPOK | MTX_NEW)
+#define RT_LOCK(_rt) mtx_lock(&(_rt)->rt_mtx)
+#define RT_UNLOCK(_rt) mtx_unlock(&(_rt)->rt_mtx)
+#define RT_LOCK_DESTROY(_rt) mtx_destroy(&(_rt)->rt_mtx)
+#define RT_LOCK_ASSERT(_rt) mtx_assert(&(_rt)->rt_mtx, MA_OWNED) /* must be held */
+#define RT_UNLOCK_COND(_rt) do { /* unlock only when mtx_owned() says so */ \
+ if (mtx_owned(&(_rt)->rt_mtx)) \
+ mtx_unlock(&(_rt)->rt_mtx); \
+} while (0)
+
+#define RT_ADDREF(_rt) do { /* take one reference; rt_mtx must be held */ \
+ RT_LOCK_ASSERT(_rt); \
+ KASSERT((_rt)->rt_refcnt >= 0, \
+ ("negative refcnt %d", (_rt)->rt_refcnt)); \
+ (_rt)->rt_refcnt++; \
+} while (0)
+
+#define RT_REMREF(_rt) do { /* drop one reference; never frees the entry */ \
+ RT_LOCK_ASSERT(_rt); \
+ KASSERT((_rt)->rt_refcnt > 0, \
+ ("bogus refcnt %d", (_rt)->rt_refcnt)); \
+ (_rt)->rt_refcnt--; \
+} while (0)
+
+#define RTFREE_LOCKED(_rt) do { /* release a reference on a locked rtentry */ \
+ if ((_rt)->rt_refcnt <= 1) /* last reference: hand over to rtfree() */ \
+ rtfree(_rt); \
+ else { /* otherwise drop one reference and unlock */ \
+ RT_REMREF(_rt); \
+ RT_UNLOCK(_rt); \
+ } \
+ /* guard against invalid refs: the caller's pointer is cleared */ \
+ _rt = 0; \
+} while (0)
+
+#define RTFREE(_rt) do { /* lock, then release as RTFREE_LOCKED() does */ \
+ RT_LOCK(_rt); \
+ RTFREE_LOCKED(_rt); \
+} while (0)
+
+#define RT_IS_UP(_rt) ((_rt)->rte_flags & RTF_UP) /* non-zero if RTF_UP is set */
+#define RT_IS_HOST_ROUTE(_rt) ((_rt)->rte_flags & RTF_HOST) /* non-zero if RTF_HOST is set */
+
+/*
+ * Convert a 'struct radix_node *' to a 'struct rtentry *'.
+ * The operation can be done safely (in this code) because a
+ * 'struct rtentry' starts with two 'struct radix_node' objects, the
+ * first one representing leaf nodes in the routing tree, which is
+ * what the code in radix.c passes us as a 'struct radix_node'.
+ *
+ * But because there are a lot of assumptions in this conversion,
+ * do not cast explicitly, but always use the macro below.
+ */
+#define RNTORT(p) ((struct rtentry *)(p))
+
+#endif /* _KERNEL */
+
+#endif
Index: sys/net/route/shared.h
===================================================================
--- /dev/null
+++ sys/net/route/shared.h
@@ -0,0 +1,126 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Contains various definitions shared between the parts of a routing subsystem.
+ *
+ * Header is not intended to be included by the code external to the
+ * routing subsystem.
+ */
+
+#ifndef _NET_ROUTE_SHARED_H_
+#define _NET_ROUTE_SHARED_H_
+
+#ifdef INVARIANTS
+#define NET_EPOCH_ASSERT_INVARIANTS() NET_EPOCH_ASSERT()
+#else /* !INVARIANTS: the assertion expands to nothing */
+#define NET_EPOCH_ASSERT_INVARIANTS()
+#endif /* INVARIANTS */
+
+#ifdef RTDEBUG
+#define DPRINTF(_fmt, ...) printf("%s: " _fmt "\n", __func__ , ## __VA_ARGS__)
+#else /* !RTDEBUG: debug output is compiled out */
+#define DPRINTF(_fmt, ...)
+#endif /* RTDEBUG */
+
+struct rib_head;
+
+/* Malloc type shared across nexthops and nexthop groups */
+MALLOC_DECLARE(M_NHOP);
+
+/* Nexthops */
+int nhops_init(struct rib_head *rh);
+void nhops_destroy(struct rib_head *rh);
+struct nhop_object *nhop_get(struct rib_head *rh, const struct nhop_request *req);
+int nhop_ref_object(struct nhop_object *nh);
+int nhop_ref_any(struct nhop_object *nh);
+void nhop_free_any(struct nhop_object *nh);
+
+void nhops_update_ifmtu(struct rib_head *rh, struct ifnet *ifp, uint32_t mtu);
+int nhops_dump_sysctl(struct rib_head *rh, struct sysctl_req *w);
+
+
+/* multipath: flag bits stored in nhgrp_object.mp_flags */
+#define MPF_MULTIPATH 0x08 /* need to be consistent with NHF_MULTIPATH */
+#define MPF_LINKED 0x10 /* mpath group is linked */
+
+struct nhgrp_object {
+ uint16_t mp_flags; /* mpath flags */
+ uint8_t mp_size; /* size of mpath group used in selection */
+ uint8_t spare;
+ struct nhop_object *nhops[0]; /* nhops (zero-length trailing array) */
+};
+
+struct weightened_nhop {
+ struct nhop_object *nh; /* nexthop */
+ uint32_t weight; /* weight paired with @nh */
+};
+
+
+/* nhgrp.c */
+int nhgrp_ctl_init(struct nh_control *ctl);
+void nhgrp_ctl_free(struct nh_control *ctl);
+
+struct nhgrp_object; /* forward declaration for the prototypes below */
+
+/* nhgrp_ctl.c */
+struct weightened_nhop *nhgrp_get_nhops(struct nhgrp_object *mp,
+ uint32_t *pnum_nhops);
+int nhgrp_dump_sysctl(struct rib_head *rh, struct sysctl_req *w);
+
+struct nhgrp_object *nhgrp_get_group(struct rib_head *rh,
+ struct weightened_nhop *wn, int num_nhops, int *perror);
+struct nhgrp_object *nhgrp_append_nhops(struct rib_head *rh,
+ const struct nhgrp_object *gr_orig, struct weightened_nhop *wn,
+ int num_nhops, uint64_t *paddmask, int *perror);
+struct nhgrp_object *nhgrp_get_del_nhops(struct rib_head *rh,
+ const struct nhgrp_object *src, uint64_t *nhop_mask, int *perror);
+struct nhgrp_object *nhgrp_get_replace_nhop(struct rib_head *rh,
+ const struct nhgrp_object *gr_orig, struct weightened_nhop *wn,
+ uint8_t replace_idx, uint64_t *pmodmask, int *perror);
+
+void nhgrp_free_group(struct nhgrp_object *gr);
+int nhgrp_ref_group(struct nhgrp_object *gr);
+
+/* nhgrp */
+
+/* route_ctl.c */
+int can_nh_multipath(const struct nhop_object *nh);
+int create_rte_from_rte(struct rib_head *rnh, struct rtentry *rt_orig,
+ struct rtentry **ret_rt);
+int del_route_one(struct rib_head *rnh, struct rtentry *rt,
+ struct rt_addrinfo *info);
+
+int rib_match_nhop_gw(const struct nhop_object *nh,
+ const struct sockaddr *gw); /* tests nexthop @nh against gateway @gw */
+
+#endif
+
+
+
Index: sys/net/route_temporal.c
===================================================================
--- sys/net/route_temporal.c
+++ sys/net/route_temporal.c
@@ -40,7 +40,12 @@
#include <sys/lock.h>
#include <sys/rmlock.h>
#include <sys/callout.h>
+#include <sys/counter.h>
+#include <sys/sysctl.h>
+#include <sys/epoch.h>
+#include <netinet/in.h>
+
#include <net/if.h>
#include <net/route.h>
#include <net/route_var.h>
@@ -51,14 +56,17 @@
* Updates time of the next nearest route expiration as a side effect.
*/
static int
-expire_route(const struct rtentry *rt, void *arg)
+expire_route(const struct rtentry *rt, const struct nhop_object *nh, void *arg)
{
time_t *next_callout;
+ unsigned long rt_expire;
- if (rt->rt_expire == 0)
+ rt_expire = rib_get_entry_expire_time(rt);
+
+ if (rt_expire == 0)
return (0);
- if (rt->rt_expire <= time_uptime)
+ if (rt_expire <= time_uptime)
return (1);
next_callout = (time_t *)arg;
@@ -67,8 +75,8 @@
* Update next_callout to determine the next ts to
* run the callback at.
*/
- if (*next_callout == 0 || *next_callout > rt->rt_expire)
- *next_callout = rt->rt_expire;
+ if (*next_callout == 0 || *next_callout > rt_expire)
+ *next_callout = rt_expire;
return (0);
}
@@ -124,23 +132,26 @@
tmproutes_update(struct rib_head *rnh, struct rtentry *rt)
{
int seconds;
+ unsigned long rt_expire;
RIB_WLOCK_ASSERT(rnh);
- if (rnh->next_expire == 0 || rnh->next_expire > rt->rt_expire) {
+ rt_expire = rib_get_entry_expire_time(rt);
+
+ if (rnh->next_expire == 0 || rnh->next_expire > rt_expire) {
/*
* Callback is not scheduled, is executing,
* or is scheduled for a later time than we need.
*
* Schedule the one for the current @rt expiration time.
*/
- seconds = (rt->rt_expire - time_uptime);
+ seconds = (rt_expire - time_uptime);
if (seconds < 0)
seconds = 0;
callout_reset_sbt(&rnh->expire_callout, SBT_1S * seconds,
SBT_1MS * 500, expire_callout, rnh, 0);
- rnh->next_expire = rt->rt_expire;
+ rnh->next_expire = rt_expire;
}
}
Index: sys/net/route_var.h
===================================================================
--- sys/net/route_var.h
+++ sys/net/route_var.h
@@ -32,6 +32,15 @@
#ifndef _NET_ROUTE_VAR_H_
#define _NET_ROUTE_VAR_H_
+#ifndef RNF_NORMAL
+#include <net/radix.h>
+#endif
+
+struct nh_control;
+struct nhop_request;
+typedef int rnh_preadd_entry_f_t(u_int fibnum, const struct sockaddr *addr,
+ const struct sockaddr *mask, struct nhop_request *req);
+
struct rib_head {
struct radix_head head;
rn_matchaddr_f_t *rnh_matchaddr; /* longest match for sockaddr */
@@ -41,6 +50,7 @@
rn_walktree_t *rnh_walktree; /* traverse tree */
rn_walktree_from_t *rnh_walktree_from; /* traverse tree below a */
rn_close_t *rnh_close; /*do something when the last ref drops*/
+ rnh_preadd_entry_f_t *rnh_preadd; /* hook to alter record prior to insertion */
rt_gen_t rnh_gen; /* generation counter */
int rnh_multipath; /* multipath capable ? */
struct radix_node rnh_nodes[3]; /* empty tree for common case */
@@ -51,6 +61,7 @@
u_int rib_fibnum; /* fib number */
struct callout expire_callout; /* Callout for expiring dynamic routes */
time_t next_expire; /* Next expire run ts */
+ struct nh_control *nh_control; /* nexthop subsystem data */
};
#define RIB_RLOCK_TRACKER struct rm_priotracker _rib_tracker
@@ -74,7 +85,7 @@
CHK_STRUCT_FIELD_GENERIC(struct route, _field, _route_new, _field)
#define CHK_STRUCT_ROUTE_FIELDS(_route_new) \
- _CHK_ROUTE_FIELD(_route_new, ro_rt) \
+ _CHK_ROUTE_FIELD(_route_new, ro_nh) \
_CHK_ROUTE_FIELD(_route_new, ro_lle) \
_CHK_ROUTE_FIELD(_route_new, ro_prepend)\
_CHK_ROUTE_FIELD(_route_new, ro_plen) \
@@ -89,6 +100,74 @@
struct rib_head *rt_tables_get_rnh(int fib, int family);
+#ifdef NEED_RTZONE
+#if 0 /* VNET-based declaration, currently compiled out */
+VNET_DECLARE(uma_zone_t, rtzone); /* Routing table UMA zone. */
+#define V_rtzone VNET(rtzone)
+#endif /* 0 */
+extern uma_zone_t rtzone; /* Routing table UMA zone. */
+#define V_rtzone rtzone
+#endif /* NEED_RTZONE */
+
+VNET_PCPUSTAT_DECLARE(struct rtstat, rtstat);
+#define RTSTAT_ADD(name, val) /* adds @val to the rtstat field @name */ \
+ VNET_PCPUSTAT_ADD(struct rtstat, rtstat, name, (val))
+#define RTSTAT_INC(name) RTSTAT_ADD(name, 1) /* RTSTAT_ADD() with val 1 */
+
+SYSCTL_DECL(_net_route);
+
+/* Constants */
+
+/*
+ * Number of times to retry an operation such as RTM_CHANGE
+ * on an error caused by concurrent rtable changes before returning
+ * to userland with an error.
+ */
+#define RIB_MAX_RETRIES 3
+
+/*
+ * Maximum width of a multipath group (presumably its nexthop count - verify).
+ */
+#define RIB_MAX_MPATH_WIDTH 64
+
+
+/*
+ * With the split between the routing entry and the nexthop,
+ * rt_flags has to be split between these 2 entries. As rtentry
+ * mostly contains prefix data and is thought to be generic enough
+ * so one can transparently change the nexthop pointer w/o requiring
+ * any other rtentry changes, most of rt_flags shifts to the particular nexthop.
+ *
+ *
+ * RTF_UP: rtentry, as an indication that it is linked.
+ * RTF_HOST: rtentry, nhop. The latter indication is needed for the datapath
+ * RTF_DYNAMIC: nhop, to make rtentry generic.
+ * RTF_MODIFIED: nhop, to make rtentry generic. (legacy)
+ * -- "native" path (nhop) properties:
+ * RTF_GATEWAY, RTF_STATIC, RTF_PROTO1, RTF_PROTO2, RTF_PROTO3, RTF_FIXEDMTU,
+ * RTF_PINNED, RTF_REJECT, RTF_BLACKHOLE, RTF_BROADCAST
+ */
+
+/* Nexthop rt flags mask: RTF_* bits that are kept on the nexthop */
+#define NHOP_RT_FLAG_MASK (RTF_GATEWAY | RTF_HOST | RTF_REJECT | RTF_DYNAMIC | \
+ RTF_MODIFIED | RTF_STATIC | RTF_BLACKHOLE | RTF_PROTO1 | RTF_PROTO2 | \
+ RTF_PROTO3 | RTF_FIXEDMTU | RTF_PINNED | RTF_BROADCAST)
+
+/* rtentry rt flag mask: RTF_* bits that stay on the rtentry itself */
+#define RTE_RT_FLAG_MASK (RTF_UP | RTF_HOST)
+
+/* Nexthop selection: a multipath group yields nhops[flowid % mp_size] */
+#define _NH2MP(_nh) ((struct nhgrp_object *)(_nh))
+#define _SELECT_NHOP(_nh, _flowid) /* assumes mp_size != 0 - TODO confirm */ \
+ (_NH2MP(_nh))->nhops[(_flowid) % (_NH2MP(_nh))->mp_size]
+#define _RT_SELECT_NHOP(_nh, _flowid) /* non-multipath @_nh is returned as is */ \
+ ((!NH_IS_MULTIPATH(_nh)) ? (_nh) : _SELECT_NHOP(_nh, _flowid))
+#define RT_SELECT_NHOP(_rt, _flowid) _RT_SELECT_NHOP((_rt)->rt_nhop, _flowid)
+
+/* Entropy data used for outbound hashing */
+#define MPATH_ENTROPY_KEY_LEN 40
+extern uint8_t mpath_entropy_key[MPATH_ENTROPY_KEY_LEN];
+
/* rte<>nhop translation */
static inline uint16_t
fib_rte_to_nh_flags(int rt_flags)
@@ -105,8 +184,24 @@
return (res);
}
+/* route.c */
+struct rtentry *rtalloc1_fib(struct sockaddr *dst, int report,
+ u_long ignflags, u_int fibnum);
void tmproutes_update(struct rib_head *rnh, struct rtentry *rt);
void tmproutes_init(struct rib_head *rh);
void tmproutes_destroy(struct rib_head *rh);
+
+/* route_ctl.c */
+int match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw);
+
+/* mpath_ctl.c */
+struct nhgrp_object;
+
+int add_route_mpath(struct rib_head *rnh, struct rtentry *rt,
+ struct nhop_object *nh_orig, u_long weight_orig, struct rt_addrinfo *info,
+ struct rib_cmd_info *rc);
+int del_route_mpath(struct rib_head *rnh, struct rtentry *rt,
+ struct nhgrp_object *mp_orig, struct rt_addrinfo *info,
+ struct rib_cmd_info *rc);
#endif
Index: sys/net/rtsock.c
===================================================================
--- sys/net/rtsock.c
+++ sys/net/rtsock.c
@@ -32,7 +32,7 @@
* $FreeBSD$
*/
#include "opt_ddb.h"
-#include "opt_mpath.h"
+#include "opt_route_mpath.h"
#include "opt_inet.h"
#include "opt_inet6.h"
@@ -68,6 +68,8 @@
#include <net/raw_cb.h>
#include <net/route.h>
#include <net/route_var.h>
+#include <net/route/rtentry_var.h>
+#include <net/route/shared.h>
#include <net/vnet.h>
#include <netinet/in.h>
@@ -77,6 +79,7 @@
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#endif
+#include <net/route/nhop.h>
#ifdef COMPAT_FREEBSD32
#include <sys/mount.h>
@@ -158,8 +161,7 @@
#define RTSOCK_UNLOCK() mtx_unlock(&rtsock_mtx)
#define RTSOCK_LOCK_ASSERT() mtx_assert(&rtsock_mtx, MA_OWNED)
-static SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
- "");
+SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
struct walkarg {
int w_tmemsize;
@@ -168,25 +170,30 @@
struct sysctl_req *w_req;
};
+struct nh_walkarg;
+
static void rts_input(struct mbuf *m);
static struct mbuf *rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo);
static int rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo,
struct walkarg *w, int *plen);
static int rt_xaddrs(caddr_t cp, caddr_t cplim,
struct rt_addrinfo *rtinfo);
+static int sysctl_dump_rt_nhop(struct nhop_object *nh, uint32_t rt_weight,
+ struct nh_walkarg *nw);
static int sysctl_dumpentry(struct radix_node *rn, void *vw);
static int sysctl_iflist(int af, struct walkarg *w);
static int sysctl_ifmalist(int af, struct walkarg *w);
static int route_output(struct mbuf *m, struct socket *so, ...);
-static void rt_getmetrics(const struct rtentry *rt, struct rt_metrics *out);
+static void rt_getmetrics(const struct rtentry *rt, const struct nhop_object *nh,
+ uint32_t weight, struct rt_metrics *out);
static void rt_dispatch(struct mbuf *, sa_family_t);
static struct sockaddr *rtsock_fix_netmask(struct sockaddr *dst,
- struct sockaddr *smask, struct sockaddr_storage *dmask);
-static int handle_rtm_get(struct rt_addrinfo *info, u_int fibnum,
- struct rt_msghdr *rtm, struct rtentry **ret_nrt);
-static int update_rtm_from_rte(struct rt_addrinfo *info,
- struct rt_msghdr **prtm, int alloc_len,
- struct rtentry *rt);
+ struct sockaddr *smask, struct sockaddr_in6 *dmask);
+static int handle_rtm_get(u_int fibnum, struct rt_addrinfo *info,
+ struct rt_msghdr **prtm, int alloc_len);
+static int
+update_rtm_from_rte(struct rt_addrinfo *info, struct rt_msghdr **prtm,
+ int alloc_len, const struct rtentry *rt, struct nhop_object *nh, uint32_t rt_weight);
static void send_rtm_reply(struct socket *so, struct rt_msghdr *rtm,
struct mbuf *m, sa_family_t saf, u_int fibnum,
int rtm_errno);
@@ -455,15 +462,15 @@
static int
rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp,
- struct rtentry *rt, union sockaddr_union *saun, struct ucred *cred)
+ const struct nhop_object *nh, union sockaddr_union *saun, struct ucred *cred)
{
#if defined(INET) || defined(INET6)
struct epoch_tracker et;
#endif
/* First, see if the returned address is part of the jail. */
- if (prison_if(cred, rt->rt_ifa->ifa_addr) == 0) {
- info->rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
+ if (prison_if(cred, nh->nh_ifa->ifa_addr) == 0) {
+ info->rti_info[RTAX_IFA] = nh->nh_ifa->ifa_addr;
return (0);
}
@@ -497,7 +504,7 @@
/*
* As a last resort return the 'default' jail address.
*/
- ia = ((struct sockaddr_in *)rt->rt_ifa->ifa_addr)->
+ ia = ((struct sockaddr_in *)nh->nh_ifa->ifa_addr)->
sin_addr;
if (prison_get_ip4(cred, &ia) != 0)
return (ESRCH);
@@ -540,7 +547,7 @@
/*
* As a last resort return the 'default' jail address.
*/
- ia6 = ((struct sockaddr_in6 *)rt->rt_ifa->ifa_addr)->
+ ia6 = ((struct sockaddr_in6 *)nh->nh_ifa->ifa_addr)->
sin6_addr;
if (prison_get_ip6(cred, &ia6) != 0)
return (ESRCH);
@@ -570,8 +577,13 @@
fill_addrinfo(struct rt_msghdr *rtm, int len, u_int fibnum, struct rt_addrinfo *info)
{
int error;
- sa_family_t saf;
+ /*
+ * Starting from here, it is possible
+ * to alter original message and insert
+ * caller PID and error value.
+ */
+
rtm->rtm_pid = curproc->p_pid;
info->rti_addrs = rtm->rtm_addrs;
@@ -594,7 +606,6 @@
(info->rti_info[RTAX_GATEWAY] != NULL &&
info->rti_info[RTAX_GATEWAY]->sa_family >= AF_MAX))
return (EINVAL);
- saf = info->rti_info[RTAX_DST]->sa_family;
/*
* Verify that the caller has the appropriate privilege; RTM_GET
* is the only operation the non-superuser is allowed.
@@ -653,43 +664,26 @@
* Handles RTM_GET message from routing socket, returning matching rt.
*
* Returns:
- * 0 on success, with locked and referenced matching rt in @rt_nrt
+ * 0 on success, with @prtm updated from the matching rt and nexthop
* errno of failure
*/
static int
-handle_rtm_get(struct rt_addrinfo *info, u_int fibnum,
- struct rt_msghdr *rtm, struct rtentry **ret_nrt)
+handle_rtm_get(u_int fibnum, struct rt_addrinfo *info,
+ struct rt_msghdr **prtm, int alloc_len)
{
- RIB_RLOCK_TRACKER;
struct rtentry *rt;
- struct rib_head *rnh;
- sa_family_t saf;
+ struct nhop_object *nh;
+ uint32_t rt_weight;
+ int error;
- saf = info->rti_info[RTAX_DST]->sa_family;
+ error = rib_lookup_route_netmask(fibnum, info->rti_info[RTAX_DST],
+ info->rti_info[RTAX_NETMASK], &rt);
- rnh = rt_tables_get_rnh(fibnum, saf);
- if (rnh == NULL)
- return (EAFNOSUPPORT);
+ if (error != 0)
+ return (error);
- RIB_RLOCK(rnh);
+ /* rt is locked and unreferenced. */
- if (info->rti_info[RTAX_NETMASK] == NULL) {
- /*
- * Provide longest prefix match for
- * address lookup (no mask).
- * 'route -n get addr'
- */
- rt = (struct rtentry *) rnh->rnh_matchaddr(
- info->rti_info[RTAX_DST], &rnh->head);
- } else
- rt = (struct rtentry *) rnh->rnh_lookup(
- info->rti_info[RTAX_DST],
- info->rti_info[RTAX_NETMASK], &rnh->head);
-
- if (rt == NULL) {
- RIB_RUNLOCK(rnh);
- return (ESRCH);
- }
#ifdef RADIX_MPATH
/*
* for RTM_GET, gate is optional even with multipath.
@@ -704,12 +698,46 @@
}
}
#endif
+ rt_weight = rt->rt_weight;
+ nh = rt->rt_nhop;
+#ifdef ROUTE_MPATH
+ if (NH_IS_MULTIPATH(nh)) {
+ struct weightened_nhop *wn;
+ uint32_t num_nhops;
+ struct sockaddr *gw;
+
+ nh = NULL;
+ gw = info->rti_info[RTAX_GATEWAY];
+ wn = nhgrp_get_nhops((struct nhgrp_object *)rt->rt_nhop,
+ &num_nhops);
+ if (gw != NULL) {
+ for (uint32_t i = 0; i < num_nhops; i++) {
+ if (rib_match_nhop_gw(wn[i].nh, gw)) {
+ nh = wn[i].nh;
+ rt_weight = wn[i].weight;
+ break;
+ }
+ }
+ if (nh == NULL) {
+ RT_UNLOCK(rt);
+ return (ESRCH);
+ }
+ } else {
+ /* By default, use the first control plane nexthop */
+ nh = wn[0].nh;
+ rt_weight = wn[0].weight;
+ }
+ }
+#endif
+
/*
* If performing proxied L2 entry insertion, and
* the actual PPP host entry is found, perform
* another search to retrieve the prefix route of
* the local end point of the PPP link.
*/
+ /* XXX: fix RTF_ANNOUNCE */
+#if 0
if (rtm->rtm_flags & RTF_ANNOUNCE) {
struct sockaddr laddr;
@@ -740,19 +768,24 @@
return (ESRCH);
}
}
- RT_LOCK(rt);
- RT_ADDREF(rt);
- RIB_RUNLOCK(rnh);
+#endif
- *ret_nrt = rt;
+ if (!can_export_rte(curthread->td_ucred, rt)) {
+ RT_UNLOCK(rt);
+ return (ESRCH);
+ }
- return (0);
+ error = update_rtm_from_rte(info, prtm, alloc_len, rt, nh, rt_weight);
+ RT_UNLOCK(rt);
+ if (error != 0)
+ printf("%s: ret %d\n", __func__, error);
+
+ return (error);
}
/*
* Update sockaddrs, flags, etc in @prtm based on @rt data.
* Assumes @rt is locked.
- * rtm can be reallocated.
*
* Returns 0 on success, along with pointer to (potentially reallocated)
* rtm.
@@ -760,36 +793,41 @@
*/
static int
update_rtm_from_rte(struct rt_addrinfo *info, struct rt_msghdr **prtm,
- int alloc_len, struct rtentry *rt)
+ int alloc_len, const struct rtentry *rt, struct nhop_object *nh, uint32_t rt_weight)
{
- struct sockaddr_storage netmask_ss;
- struct walkarg w;
- union sockaddr_union saun;
- struct rt_msghdr *rtm, *orig_rtm = NULL;
+ struct sockaddr_in6 dst, mask;
struct ifnet *ifp;
+ struct rt_msghdr *rtm, *orig_rtm = NULL;
+ struct walkarg w;
int error, len;
+ union sockaddr_union saun;
RT_LOCK_ASSERT(rt);
- rtm = *prtm;
-
- info->rti_info[RTAX_DST] = rt_key(rt);
- info->rti_info[RTAX_GATEWAY] = rt->rt_gateway;
- info->rti_info[RTAX_NETMASK] = rtsock_fix_netmask(rt_key(rt),
- rt_mask(rt), &netmask_ss);
+ info->rti_info[RTAX_DST] = rib_get_entry_dst_sa(rt,
+ (struct sockaddr *)&dst, sizeof(dst), &error);
+ if (error != 0)
+ return (error);
+ info->rti_info[RTAX_NETMASK] = rib_get_entry_netmask_sa(rt,
+ (struct sockaddr *)&mask, sizeof(mask), &error);
+ if (error != 0)
+ return (error);
info->rti_info[RTAX_GENMASK] = 0;
- ifp = rt->rt_ifp;
+ info->rti_info[RTAX_GATEWAY] = &nh->gw_sa;
+
+ ifp = nh->nh_ifp;
+ rtm = *prtm;
if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) {
if (ifp) {
info->rti_info[RTAX_IFP] =
ifp->if_addr->ifa_addr;
- error = rtm_get_jailed(info, ifp, rt,
+ error = rtm_get_jailed(info, ifp, nh,
&saun, curthread->td_ucred);
if (error != 0)
return (error);
if (ifp->if_flags & IFF_POINTOPOINT)
info->rti_info[RTAX_BRD] =
- rt->rt_ifa->ifa_dstaddr;
+ nh->nh_ifa->ifa_dstaddr;
rtm->rtm_index = ifp->if_index;
} else {
info->rti_info[RTAX_IFP] = NULL;
@@ -821,12 +859,14 @@
w.w_tmemsize = alloc_len;
rtsock_msg_buffer(rtm->rtm_type, info, &w, &len);
- if (rt->rt_flags & RTF_GWFLAG_COMPAT)
+ int rt_flags = rib_get_entry_rtflags(rt, nh);
+ /* XXX: Eliminate RTF_GWFLAG_COMPAT */
+ if (rt->rte_flags & RTF_GWFLAG_COMPAT)
rtm->rtm_flags = RTF_GATEWAY |
- (rt->rt_flags & ~RTF_GWFLAG_COMPAT);
+ (rt_flags & ~RTF_GWFLAG_COMPAT);
else
- rtm->rtm_flags = rt->rt_flags;
- rt_getmetrics(rt, &rtm->rtm_rmx);
+ rtm->rtm_flags = rt_flags;
+ rt_getmetrics(rt, nh, rt_weight, &rtm->rtm_rmx);
rtm->rtm_addrs = info->rti_addrs;
if (orig_rtm != NULL)
@@ -841,11 +881,10 @@
route_output(struct mbuf *m, struct socket *so, ...)
{
struct rt_msghdr *rtm = NULL;
- struct rtentry *rt = NULL;
struct rt_addrinfo info;
+ struct sockaddr_storage ss;
struct epoch_tracker et;
#ifdef INET6
- struct sockaddr_storage ss;
struct sockaddr_in6 *sin6;
int i, rti_need_deembed = 0;
#endif
@@ -909,35 +948,50 @@
goto flush;
}
- switch (rtm->rtm_type) {
- struct rtentry *saved_nrt;
+ struct rib_cmd_info rc;
+ bzero(&rc, sizeof(rc));
+ switch (rtm->rtm_type) {
case RTM_ADD:
case RTM_CHANGE:
if (rtm->rtm_type == RTM_ADD) {
if (info.rti_info[RTAX_GATEWAY] == NULL)
senderr(EINVAL);
- }
- saved_nrt = NULL;
- error = rtrequest1_fib(rtm->rtm_type, &info, &saved_nrt,
- fibnum);
- if (error == 0 && saved_nrt != NULL) {
+ error = rib_add_route(fibnum, &info, &rc);
+ } else
+ error = rib_change_route(fibnum, &info, &rc);
+ if (error == 0) {
#ifdef INET6
rti_need_deembed = (V_deembed_scopeid) ? 1 : 0;
#endif
- RT_LOCK(saved_nrt);
- rtm->rtm_index = saved_nrt->rt_ifp->if_index;
- RT_REMREF(saved_nrt);
- RT_UNLOCK(saved_nrt);
+#ifdef ROUTE_MPATH
+ if (NH_IS_MULTIPATH(rc.nh_new) && rc.mask_changed) {
+ uint32_t num_nhops, idx;
+ struct weightened_nhop *wn;
+
+ /* Find the index of the added nhop. */
+ idx = ffsll(rc.mask_changed) - 1;
+ wn = nhgrp_get_nhops((struct nhgrp_object *)rc.nh_new,
+ &num_nhops);
+ rtm->rtm_index = wn[idx].nh->nh_ifp->if_index;
+ } else
+#endif
+ rtm->rtm_index = rc.nh_new->nh_ifp->if_index;
}
break;
case RTM_DELETE:
- saved_nrt = NULL;
- error = rtrequest1_fib(RTM_DELETE, &info, &saved_nrt, fibnum);
+ error = rib_del_route(fibnum, &info, &rc);
if (error == 0) {
- RT_LOCK(saved_nrt);
- rt = saved_nrt;
+ /* XXX: mpath */
+ if (can_export_rte(curthread->td_ucred, rc.rt)) {
+ RT_LOCK(rc.rt);
+ error = update_rtm_from_rte(&info, &rtm,
+ alloc_len, rc.rt, rc.nh_old,
+ rc.rt->rt_weight);
+ RT_UNLOCK(rc.rt);
+ } else
+ error = ESRCH;
goto report;
}
#ifdef INET6
@@ -947,17 +1001,12 @@
break;
case RTM_GET:
- error = handle_rtm_get(&info, fibnum, rtm, &rt);
+ /* XXX: verify deembed on errors */
+ error = handle_rtm_get(fibnum, &info, &rtm, alloc_len);
if (error != 0)
senderr(error);
report:
- RT_LOCK_ASSERT(rt);
- if (!can_export_rte(curthread->td_ucred, rt)) {
- RT_UNLOCK(rt);
- senderr(ESRCH);
- }
- error = update_rtm_from_rte(&info, &rtm, alloc_len, rt);
/*
* Note that some sockaddr pointers may have changed to
* point to memory outsize @rtm. Some may be pointing
@@ -973,7 +1022,6 @@
#ifdef INET6
rti_need_deembed = 0;
#endif
- RT_UNLOCK(rt);
if (error != 0)
senderr(error);
break;
@@ -984,8 +1032,6 @@
flush:
NET_EPOCH_EXIT(et);
- if (rt != NULL)
- RTFREE(rt);
#ifdef INET6
if (rtm != NULL) {
@@ -1069,13 +1115,14 @@
static void
-rt_getmetrics(const struct rtentry *rt, struct rt_metrics *out)
+rt_getmetrics(const struct rtentry *rt, const struct nhop_object *nh,
+ uint32_t rt_weight, struct rt_metrics *out)
{
bzero(out, sizeof(*out));
- out->rmx_mtu = rt->rt_mtu;
- out->rmx_weight = rt->rt_weight;
- out->rmx_pksent = counter_u64_fetch(rt->rt_pksent);
+ out->rmx_mtu = nh->nh_mtu;
+ out->rmx_weight = rt_weight;
+ out->rmx_pksent = nhop_get_idx(nh);
/* Kernel -> userland timebase conversion. */
out->rmx_expire = rt->rt_expire ?
rt->rt_expire - time_uptime + time_second : 0;
@@ -1126,23 +1173,84 @@
/*
* Fill in @dmask with valid netmask leaving original @smask
- * intact. Mostly used with radix netmasks.
+ * intact. Used with radix-originated netmasks.
*/
static struct sockaddr *
rtsock_fix_netmask(struct sockaddr *dst, struct sockaddr *smask,
- struct sockaddr_storage *dmask)
+ struct sockaddr_in6 *dmask)
{
if (dst == NULL || smask == NULL)
return (NULL);
- memset(dmask, 0, dst->sa_len);
- memcpy(dmask, smask, smask->sa_len);
- dmask->ss_len = dst->sa_len;
- dmask->ss_family = dst->sa_family;
+ if (dst->sa_len > sizeof(struct sockaddr_in6)) {
+ printf("NETMASK SA_LEN: %d\n", dst->sa_len);
+ return (NULL);
+ }
+ bzero(dmask, dst->sa_len);
+ dmask->sin6_len = dst->sa_len;
+ dmask->sin6_family = dst->sa_family;
- return ((struct sockaddr *)dmask);
+ switch (dst->sa_family) {
+ case AF_INET:
+ ((struct sockaddr_in *)dmask)->sin_addr =
+ ((struct sockaddr_in *)smask)->sin_addr;
+ return ((struct sockaddr *)dmask);
+ case AF_INET6:
+ ((struct sockaddr_in6 *)dmask)->sin6_addr =
+ ((struct sockaddr_in6 *)smask)->sin6_addr;
+ return ((struct sockaddr *)dmask);
+ }
+
+ return (smask);
}
+#ifdef COMPAT_FREEBSD32
+/*
+ * Evaluates to SA_SIZE32(@_sa) when @_compat32 is non-zero and to
+ * SA_SIZE(@_sa) otherwise.  It is an expression, so it can be assigned.
+ */
+#define SA_SIZE_COMPAT(_sa, _compat32) \
+ ((_compat32) ? SA_SIZE32(_sa) : SA_SIZE(_sa))
+#else
+#define SA_SIZE_COMPAT(_sa, _compat32) SA_SIZE(_sa)
+#endif
+
+
+#if 0 /* compiled out; returns the sockaddr for @addr, storing its size in @sa_len */
+inline static struct sockaddr *
+prepare_sockaddr(struct rt_addrinfo *info, int addr, struct sockaddr *buf,
+ int buflen, int *sa_len, int compat32, int deembed)
+{
+ struct sockaddr *sa = info->rti_info[addr];
+ /* The netmask is sized after @dst and rebuilt by rtsock_fix_netmask(). */
+ if (addr == RTAX_NETMASK) {
+ struct sockaddr *dst = info->rti_info[RTAX_DST];
+ *sa_len = SA_SIZE_COMPAT(dst, compat32);
+
+ if (buf == NULL)
+ return (NULL);
+ return (rtsock_fix_netmask(dst, sa, buf));
+ }
+
+ *sa_len = SA_SIZE_COMPAT(sa, compat32);
+ if (buf == NULL)
+ return (NULL);
+#ifdef INET6
+ if ((sa->sa_family == AF_INET6) && deembed) {
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa;
+ if ((IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) ||
+ IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr))){
+ memcpy(buf, sin6, sin6->sin6_len);
+ }
+
+ }
+ /* XXX: the INET6 build has no return statement past this point. */
+#else
+ return (sa);
+#endif
+}
+#endif
+
/*
* Writes information related to @rtinfo object to newly-allocated mbuf.
* Assumes MCLBYTES is enough to construct any message.
@@ -1220,9 +1328,15 @@
m_freem(m);
return (NULL);
}
+
+ /*
+ * The following 3 fields are the only fields shared
+ * by the rtsock messages.
+ */
rtm->rtm_msglen = len;
rtm->rtm_version = RTM_VERSION;
rtm->rtm_type = type;
+
return (m);
}
@@ -1441,7 +1555,7 @@
struct mbuf *m;
struct ifa_msghdr *ifam;
struct ifnet *ifp = ifa->ifa_ifp;
- struct sockaddr_storage ss;
+ struct sockaddr_in6 ss;
if (V_route_cb.any_count == 0)
return (0);
@@ -1481,10 +1595,9 @@
* Returns 0 on success.
*/
int
-rtsock_routemsg(int cmd, struct rtentry *rt, struct ifnet *ifp, int rti_addrs,
- int fibnum)
+rtsock_routemsg(int cmd, struct rtentry *rt, struct nhop_object *nh, int fibnum)
{
- struct sockaddr_storage ss;
+ struct sockaddr_in6 ss;
struct rt_addrinfo info;
if (V_route_cb.any_count == 0)
@@ -1493,9 +1606,9 @@
bzero((caddr_t)&info, sizeof(info));
info.rti_info[RTAX_DST] = rt_key(rt);
info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(rt_key(rt), rt_mask(rt), &ss);
- info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
- info.rti_flags = rt->rt_flags;
- info.rti_ifp = ifp;
+ info.rti_info[RTAX_GATEWAY] = &nh->gw_sa;
+ info.rti_flags = rib_get_entry_rtflags(rt, nh);
+ info.rti_ifp = nh->nh_aifp;
return (rtsock_routemsg_info(cmd, &info, fibnum));
}
@@ -1695,7 +1808,7 @@
can_export_rte(struct ucred *td_ucred, const struct rtentry *rt)
{
- if ((rt->rt_flags & RTF_HOST) == 0
+ if (!RT_IS_HOST_ROUTE(rt)
? jailed_without_vnet(td_ucred)
: prison_if(td_ucred, rt_key_const(rt)) != 0)
return (0);
@@ -1705,32 +1818,83 @@
/*
* This is used in dumping the kernel table via sysctl().
*/
+/*
+ * Argument bundle passed from sysctl_dumpentry() to sysctl_dump_rt_nhop():
+ * the sysctl walker state plus the route entry being exported.
+ */
+struct nh_walkarg {
+ struct walkarg *w;
+ struct rtentry *rt;
+};
+
static int
sysctl_dumpentry(struct radix_node *rn, void *vw)
{
struct walkarg *w = vw;
struct rtentry *rt = (struct rtentry *)rn;
- int error = 0, size;
- struct rt_addrinfo info;
- struct sockaddr_storage ss;
+ struct nhop_object *nh;
+ struct nh_walkarg nw;
+ int rt_flags;
NET_EPOCH_ASSERT();
- if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg))
+ nh = rt->rt_nhop;
+ rt_flags = rib_get_entry_rtflags(rt, nh);
+
+ if (w->w_op == NET_RT_FLAGS && !(rt_flags & w->w_arg))
return 0;
if (!can_export_rte(w->w_req->td->td_ucred, rt))
return (0);
+
+ nw.w = w;
+ nw.rt = rt;
+
+ if (!NH_IS_MULTIPATH(nh))
+ return (sysctl_dump_rt_nhop(nh, rib_get_entry_weight(rt), &nw));
+
+#ifdef ROUTE_MPATH
+ struct weightened_nhop *wn;
+ uint32_t num_nhops;
+ int error;
+
+ wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops);
+ for (uint32_t i = 0; i < num_nhops; i++) {
+ error = sysctl_dump_rt_nhop(wn[i].nh, wn[i].weight, &nw);
+ if (error != 0)
+ return (error);
+ }
+#endif
+
+ return (0);
+}
+
+
+__noinline static int
+sysctl_dump_rt_nhop(struct nhop_object *nh, uint32_t rt_weight,
+ struct nh_walkarg *nw)
+{
+ int error = 0, size;
+ struct walkarg *w = nw->w;
+ struct rtentry *rt = nw->rt;
+ struct rt_addrinfo info;
+ struct sockaddr_in6 dst, netmask;
+ int rt_flags;
+
+ rt_flags = rib_get_entry_rtflags(rt, nh);
+
bzero((caddr_t)&info, sizeof(info));
- info.rti_info[RTAX_DST] = rt_key(rt);
- info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
- info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(rt_key(rt),
- rt_mask(rt), &ss);
+ rib_get_entry_dst_sa(rt, (struct sockaddr *)&dst, sizeof(dst), &error);
+ info.rti_info[RTAX_DST] = rib_get_entry_dst_sa(rt,
+ (struct sockaddr *)&dst, sizeof(dst), &error);
+ if (error != 0)
+ return (error);
+ info.rti_info[RTAX_NETMASK] = rib_get_entry_netmask_sa(rt,
+ (struct sockaddr *)&netmask, sizeof(netmask), &error);
+ if (error != 0)
+ return (error);
info.rti_info[RTAX_GENMASK] = 0;
- if (rt->rt_ifp && !(rt->rt_ifp->if_flags & IFF_DYING)) {
- info.rti_info[RTAX_IFP] = rt->rt_ifp->if_addr->ifa_addr;
- info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
- if (rt->rt_ifp->if_flags & IFF_POINTOPOINT)
- info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr;
+ info.rti_info[RTAX_GATEWAY] = &nh->gw_sa;
+ if (nh->nh_ifp && !(nh->nh_ifp->if_flags & IFF_DYING)) {
+ info.rti_info[RTAX_IFP] = nh->nh_ifp->if_addr->ifa_addr;
+ info.rti_info[RTAX_IFA] = nh->nh_ifa->ifa_addr;
+ if (nh->nh_ifp->if_flags & IFF_POINTOPOINT)
+ info.rti_info[RTAX_BRD] = nh->nh_ifa->ifa_dstaddr;
}
if ((error = rtsock_msg_buffer(RTM_GET, &info, w, &size)) != 0)
return (error);
@@ -1739,13 +1903,13 @@
bzero(&rtm->rtm_index,
sizeof(*rtm) - offsetof(struct rt_msghdr, rtm_index));
- if (rt->rt_flags & RTF_GWFLAG_COMPAT)
+ if (rt_flags & RTF_GWFLAG_COMPAT)
rtm->rtm_flags = RTF_GATEWAY |
- (rt->rt_flags & ~RTF_GWFLAG_COMPAT);
+ (rt_flags & ~RTF_GWFLAG_COMPAT);
else
- rtm->rtm_flags = rt->rt_flags;
- rt_getmetrics(rt, &rtm->rtm_rmx);
- rtm->rtm_index = rt->rt_ifp->if_index;
+ rtm->rtm_flags = rt_flags;
+ rt_getmetrics(rt, nh, rt_weight, &rtm->rtm_rmx);
+ rtm->rtm_index = nh->nh_ifp->if_index;
rtm->rtm_addrs = info.rti_addrs;
error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size);
return (error);
@@ -1901,7 +2065,7 @@
struct if_data ifd;
struct rt_addrinfo info;
int len, error = 0;
- struct sockaddr_storage ss;
+ struct sockaddr_in6 ss;
bzero((caddr_t)&info, sizeof(info));
bzero(&ifd, sizeof(ifd));
@@ -2025,7 +2189,7 @@
namelen--;
if (req->newptr)
return (EPERM);
- if (name[1] == NET_RT_DUMP) {
+ if (name[1] == NET_RT_DUMP || name[1] == NET_RT_NHOP || name[1] == NET_RT_NHGROUPS) {
if (namelen == 3)
fib = req->td->td_proc->p_fibnum;
else if (namelen == 4)
@@ -2092,7 +2256,30 @@
error = EAFNOSUPPORT;
}
break;
-
+	case NET_RT_NHOP:
+	case NET_RT_NHGROUPS:
+		/* Allow dumping one specific af/fib at a time */
+		if (namelen < 4) {
+			error = EINVAL;
+			break;
+		}
+		fib = name[3];
+		/* Valid fib numbers are 0 .. rt_numfibs - 1. */
+		if (fib < 0 || fib >= rt_numfibs) {
+			error = EINVAL;
+			break;
+		}
+		rnh = rt_tables_get_rnh(fib, af);
+		if (rnh == NULL) {
+			error = EAFNOSUPPORT;
+			break;
+		}
+		if (w.w_op == NET_RT_NHOP)
+			error = nhops_dump_sysctl(rnh, w.w_req);
+		else
+#ifdef ROUTE_MPATH
+			error = nhgrp_dump_sysctl(rnh, w.w_req);
+#else
+			/* Multipath groups are not compiled in; report it. */
+			error = EOPNOTSUPP;
+#endif
+		break;
case NET_RT_IFLIST:
case NET_RT_IFLISTL:
error = sysctl_iflist(af, &w);
@@ -2215,31 +2402,34 @@
static int
rt_dumpentry_ddb(struct radix_node *rn, void *arg __unused)
{
- struct sockaddr_storage ss;
+ struct sockaddr_in6 ss;
struct rtentry *rt;
+ struct nhop_object *nh;
int flags, idx;
/* If RNTORT is important, put it in a header. */
rt = (void *)rn;
+ /* XXX: mpath */
+ nh = rt->rt_nhop;
rt_dumpaddr_ddb("dst", rt_key(rt));
- rt_dumpaddr_ddb("gateway", rt->rt_gateway);
+ rt_dumpaddr_ddb("gateway", &nh->gw_sa);
rt_dumpaddr_ddb("netmask", rtsock_fix_netmask(rt_key(rt), rt_mask(rt),
&ss));
- if (rt->rt_ifp != NULL && (rt->rt_ifp->if_flags & IFF_DYING) == 0) {
- rt_dumpaddr_ddb("ifp", rt->rt_ifp->if_addr->ifa_addr);
- rt_dumpaddr_ddb("ifa", rt->rt_ifa->ifa_addr);
+ if (nh->nh_ifp != NULL && (nh->nh_ifp->if_flags & IFF_DYING) == 0) {
+ rt_dumpaddr_ddb("ifp", nh->nh_ifp->if_addr->ifa_addr);
+ rt_dumpaddr_ddb("ifa", nh->nh_ifa->ifa_addr);
}
db_printf("flags ");
- flags = rt->rt_flags;
+ flags = rt->rte_flags;
if (flags == 0)
db_printf("none");
while ((idx = ffs(flags)) > 0) {
idx--;
- if (flags != rt->rt_flags)
+ if (flags != rt->rte_flags)
db_printf(",");
db_printf("%s", rt_flag_name(idx));
@@ -2522,7 +2712,7 @@
db_printf("Looking up route to destination '%s'\n", bp);
CURVNET_SET(vnet0);
- rt = rtalloc1(dstp, 0, RTF_RNH_LOCKED);
+ rt = rtalloc1_fib(dstp, 0, RTF_RNH_LOCKED, 0);
CURVNET_RESTORE();
if (rt == NULL) {
Index: sys/netinet/icmp6.h
===================================================================
--- sys/netinet/icmp6.h
+++ sys/netinet/icmp6.h
@@ -693,7 +693,7 @@
#ifdef _KERNEL
# ifdef __STDC__
-struct rtentry;
+struct nhop_object;
struct rttimer;
struct in6_multi;
# endif
@@ -705,7 +705,7 @@
void icmp6_slowtimo(void);
void icmp6_prepare(struct mbuf *);
void icmp6_redirect_input(struct mbuf *, int);
-void icmp6_redirect_output(struct mbuf *, struct rtentry *);
+void icmp6_redirect_output(struct mbuf *, struct nhop_object *);
struct ip6ctlparam;
void icmp6_mtudisc_update(struct ip6ctlparam *, int);
Index: sys/netinet/in_fib.h
===================================================================
--- sys/netinet/in_fib.h
+++ sys/netinet/in_fib.h
@@ -32,6 +32,19 @@
#ifndef _NETINET_IN_FIB_H_
#define _NETINET_IN_FIB_H_
+/*
+ * IPv4 variant of 'struct route': the common leading fields followed by a
+ * sockaddr_in destination.  in_fib.c asserts the layout against
+ * 'struct route' with CHK_STRUCT_ROUTE_COMPAT(struct route_in, ro_dst4).
+ */
+struct route_in {
+ /* common fields shared among all 'struct route' */
+ struct nhop_object *ro_nh;
+ struct llentry *ro_lle;
+ char *ro_prepend;
+ uint16_t ro_plen;
+ uint16_t ro_flags;
+ uint16_t ro_mtu; /* saved ro_rt mtu */
+ uint16_t spare;
+ /* custom sockaddr */
+ struct sockaddr_in ro_dst4;
+};
+
/* Basic nexthop info used for uRPF/mtu checks */
struct nhop4_basic {
struct ifnet *nh_ifp; /* Logical egress interface */
@@ -57,6 +70,17 @@
int fib4_lookup_nh_ext(uint32_t fibnum, struct in_addr dst, uint32_t flags,
uint32_t flowid, struct nhop4_extended *pnh4);
void fib4_free_nh_ext(uint32_t fibnum, struct nhop4_extended *pnh4);
+
+struct nhop_object *fib4_lookup_nh_ptr(uint32_t fibnum, struct in_addr dst,
+ uint32_t scopeid, uint32_t flags, uint32_t flowid);
+int fib4_lookup_nh_route(uint32_t fibnum, struct route_in *ro4, uint32_t flags,
+ uint32_t flowid);
+int fib4_lookup_urpf(uint32_t fibnum, struct in_addr dst, uint32_t scopeid,
+ uint32_t flags, const struct ifnet *src_if);
+
+uint32_t fib4_calc_software_hash(struct in_addr src, struct in_addr dst,
+ unsigned short src_port, unsigned short dst_port, char proto,
+ uint32_t *phashtype);
#endif
Index: sys/netinet/in_fib.c
===================================================================
--- sys/netinet/in_fib.c
+++ sys/netinet/in_fib.c
@@ -33,6 +33,7 @@
#include "opt_inet.h"
#include "opt_route.h"
#include "opt_mpath.h"
+#include "opt_route_mpath.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -49,70 +50,98 @@
#include <net/if_dl.h>
#include <net/route.h>
#include <net/route_var.h>
+#include <net/route/nhop.h>
+#include <net/route/shared.h>
+#include <net/toeplitz.h>
#include <net/vnet.h>
-#ifdef RADIX_MPATH
-#include <net/radix_mpath.h>
-#endif
-
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_fib.h>
+#include <net/route/rtentry_var.h>
#ifdef INET
-static void fib4_rte_to_nh_basic(struct rtentry *rte, struct in_addr dst,
+
+/* Verify struct route compatibility */
+/* Assert 'struct route_in' is compatible with 'struct route' */
+CHK_STRUCT_ROUTE_COMPAT(struct route_in, ro_dst4);
+
+static void fib4_rte_to_nh_basic(struct nhop_object *nh, struct in_addr dst,
uint32_t flags, struct nhop4_basic *pnh4);
-static void fib4_rte_to_nh_extended(struct rtentry *rte, struct in_addr dst,
+static void fib4_rte_to_nh_extended(struct nhop_object *nh, struct in_addr dst,
uint32_t flags, struct nhop4_extended *pnh4);
-#define RNTORT(p) ((struct rtentry *)(p))
+#ifdef ROUTE_MPATH
+/*
+ * Input record for the software flow hash.  The explicit spare bytes keep
+ * the record at exactly 16 bytes, which the static assert below enforces.
+ */
+struct _hash_5tuple_ipv4 {
+	struct in_addr src;
+	struct in_addr dst;
+	unsigned short src_port;
+	unsigned short dst_port;
+	char proto;
+	char spare[3];
+};
+_Static_assert(sizeof(struct _hash_5tuple_ipv4) == 16,
+    "_hash_5tuple_ipv4 size is wrong");
+
+/*
+ * Computes a Toeplitz hash over the IPv4 5-tuple using mpath_entropy_key.
+ * Stores M_HASHTYPE_OPAQUE in @phashtype and returns the hash value.
+ */
+uint32_t
+fib4_calc_software_hash(struct in_addr src, struct in_addr dst,
+    unsigned short src_port, unsigned short dst_port, char proto,
+    uint32_t *phashtype)
+{
+	/* spare[] is not named, so the initializer zero-fills it. */
+	struct _hash_5tuple_ipv4 tuple = {
+		.src = src,
+		.dst = dst,
+		.src_port = src_port,
+		.dst_port = dst_port,
+		.proto = proto,
+	};
+	uint32_t hash;
+
+	hash = toeplitz_hash(MPATH_ENTROPY_KEY_LEN, mpath_entropy_key,
+	    sizeof(tuple), (uint8_t *)&tuple);
+	*phashtype = M_HASHTYPE_OPAQUE;
+
+	return (hash);
+}
+#endif
+
static void
-fib4_rte_to_nh_basic(struct rtentry *rte, struct in_addr dst,
+fib4_rte_to_nh_basic(struct nhop_object *nh, struct in_addr dst,
uint32_t flags, struct nhop4_basic *pnh4)
{
- struct sockaddr_in *gw;
if ((flags & NHR_IFAIF) != 0)
- pnh4->nh_ifp = rte->rt_ifa->ifa_ifp;
+ pnh4->nh_ifp = nh->nh_ifa->ifa_ifp;
else
- pnh4->nh_ifp = rte->rt_ifp;
- pnh4->nh_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu);
- if (rte->rt_flags & RTF_GATEWAY) {
- gw = (struct sockaddr_in *)rte->rt_gateway;
- pnh4->nh_addr = gw->sin_addr;
- } else
+ pnh4->nh_ifp = nh->nh_ifp;
+ pnh4->nh_mtu = nh->nh_mtu;
+ if (nh->nh_flags & NHF_GATEWAY)
+ pnh4->nh_addr = nh->gw4_sa.sin_addr;
+ else
pnh4->nh_addr = dst;
/* Set flags */
- pnh4->nh_flags = fib_rte_to_nh_flags(rte->rt_flags);
- gw = (struct sockaddr_in *)rt_key(rte);
- if (gw->sin_addr.s_addr == 0)
- pnh4->nh_flags |= NHF_DEFAULT;
+ pnh4->nh_flags = nh->nh_flags;
/* TODO: Handle RTF_BROADCAST here */
}
static void
-fib4_rte_to_nh_extended(struct rtentry *rte, struct in_addr dst,
+fib4_rte_to_nh_extended(struct nhop_object *nh, struct in_addr dst,
uint32_t flags, struct nhop4_extended *pnh4)
{
- struct sockaddr_in *gw;
if ((flags & NHR_IFAIF) != 0)
- pnh4->nh_ifp = rte->rt_ifa->ifa_ifp;
+ pnh4->nh_ifp = nh->nh_ifa->ifa_ifp;
else
- pnh4->nh_ifp = rte->rt_ifp;
- pnh4->nh_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu);
- if (rte->rt_flags & RTF_GATEWAY) {
- gw = (struct sockaddr_in *)rte->rt_gateway;
- pnh4->nh_addr = gw->sin_addr;
- } else
+ pnh4->nh_ifp = nh->nh_ifp;
+ pnh4->nh_mtu = nh->nh_mtu;
+ if (nh->nh_flags & NHF_GATEWAY)
+ pnh4->nh_addr = nh->gw4_sa.sin_addr;
+ else
pnh4->nh_addr = dst;
/* Set flags */
- pnh4->nh_flags = fib_rte_to_nh_flags(rte->rt_flags);
- gw = (struct sockaddr_in *)rt_key(rte);
- if (gw->sin_addr.s_addr == 0)
- pnh4->nh_flags |= NHF_DEFAULT;
- pnh4->nh_ia = ifatoia(rte->rt_ifa);
+ pnh4->nh_flags = nh->nh_flags;
+ pnh4->nh_ia = ifatoia(nh->nh_ifa);
pnh4->nh_src = IA_SIN(pnh4->nh_ia)->sin_addr;
}
@@ -135,7 +164,7 @@
struct rib_head *rh;
struct radix_node *rn;
struct sockaddr_in sin;
- struct rtentry *rte;
+ struct nhop_object *nh;
KASSERT((fibnum < rt_numfibs), ("fib4_lookup_nh_basic: bad fibnum"));
rh = rt_tables_get_rnh(fibnum, AF_INET);
@@ -150,10 +179,10 @@
RIB_RLOCK(rh);
rn = rh->rnh_matchaddr((void *)&sin, &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- rte = RNTORT(rn);
+ nh = RT_SELECT_NHOP((RNTORT(rn)), flowid);
/* Ensure route & ifp is UP */
- if (RT_LINK_IS_UP(rte->rt_ifp)) {
- fib4_rte_to_nh_basic(rte, dst, flags, pnh4);
+ if (RT_LINK_IS_UP(nh->nh_ifp)) {
+ fib4_rte_to_nh_basic(nh, dst, flags, pnh4);
RIB_RUNLOCK(rh);
return (0);
@@ -183,8 +212,8 @@
RIB_RLOCK_TRACKER;
struct rib_head *rh;
struct radix_node *rn;
+ struct nhop_object *nh;
struct sockaddr_in sin;
- struct rtentry *rte;
KASSERT((fibnum < rt_numfibs), ("fib4_lookup_nh_ext: bad fibnum"));
rh = rt_tables_get_rnh(fibnum, AF_INET);
@@ -193,23 +222,18 @@
/* Prepare lookup key */
memset(&sin, 0, sizeof(sin));
+ sin.sin_family = AF_INET;
sin.sin_len = sizeof(struct sockaddr_in);
sin.sin_addr = dst;
+ nh = NULL;
RIB_RLOCK(rh);
rn = rh->rnh_matchaddr((void *)&sin, &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- rte = RNTORT(rn);
-#ifdef RADIX_MPATH
- rte = rt_mpath_select(rte, flowid);
- if (rte == NULL) {
- RIB_RUNLOCK(rh);
- return (ENOENT);
- }
-#endif
+ nh = RT_SELECT_NHOP((RNTORT(rn)), flowid);
/* Ensure route & ifp is UP */
- if (RT_LINK_IS_UP(rte->rt_ifp)) {
- fib4_rte_to_nh_extended(rte, dst, flags, pnh4);
+ if (RT_LINK_IS_UP(nh->nh_ifp)) {
+ fib4_rte_to_nh_extended(nh, dst, flags, pnh4);
if ((flags & NHR_REF) != 0) {
/* TODO: lwref on egress ifp's ? */
}
@@ -229,4 +253,197 @@
}
+/*
+ * Looks up the nexthop for @dst in the AF_INET table of fib @fibnum.
+ * @flowid is handed to RT_SELECT_NHOP() when picking the nexthop.
+ * If NHR_REF is set in @flags, the nexthop is referenced before the rib
+ * lock is dropped.  @scopeid is currently unused.
+ *
+ * Returns the nexthop, or NULL when the fib has no AF_INET table, no
+ * non-root radix node matches, or the nexthop interface fails the
+ * RT_LINK_IS_UP() check.
+ */
+struct nhop_object *
+fib4_lookup_nh_ptr(uint32_t fibnum, struct in_addr dst, uint32_t scopeid,
+    uint32_t flags, uint32_t flowid)
+{
+	RIB_RLOCK_TRACKER;
+	struct rib_head *rh;
+	struct radix_node *rn;
+	struct nhop_object *nh;
+	struct sockaddr_in sin4;
+
+	KASSERT((fibnum < rt_numfibs), ("fib4_lookup_nh_ptr: bad fibnum"));
+	rh = rt_tables_get_rnh(fibnum, AF_INET);
+	if (rh == NULL)
+		return (NULL);
+
+	/* Prepare lookup key */
+	memset(&sin4, 0, sizeof(sin4));
+	sin4.sin_family = AF_INET;
+	sin4.sin_len = sizeof(struct sockaddr_in);
+	sin4.sin_addr = dst;
+
+	nh = NULL;
+	RIB_RLOCK(rh);
+	rn = rh->rnh_matchaddr((void *)&sin4, &rh->head);
+	if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
+		nh = RT_SELECT_NHOP((RNTORT(rn)), flowid);
+		/* Ensure route & ifp is UP */
+		if (RT_LINK_IS_UP(nh->nh_ifp)) {
+			if (flags & NHR_REF)
+				nhop_ref_object(nh);
+			RIB_RUNLOCK(rh);
+			return (nh);
+		}
+	}
+	RIB_RUNLOCK(rh);
+
+	RTSTAT_INC(rts_unreach);
+	return (NULL);
+}
+
+/*
+ * uRPF predicate for a single lookup result.
+ *
+ * With @src_if set, succeeds when the nexthop (or, for a multipath group,
+ * any member nexthop) has @src_if as its nh_aifp.
+ * With @src_if NULL, succeeds unless NHR_NODEFAULT is set in @flags and the
+ * nexthop (the first member for a group) carries NHF_DEFAULT.
+ *
+ * Returns 1 on success, 0 otherwise.
+ */
+inline static int
+check_urpf(const struct nhop_object *nh, uint32_t flags,
+    const struct ifnet *src_if)
+{
+#ifdef ROUTE_MPATH
+	if (NH_IS_MULTIPATH(nh)) {
+		const struct nhgrp_object *grp;
+
+		grp = (const struct nhgrp_object *)nh;
+		if (src_if != NULL) {
+			/* TODO: consider iterating control plane nhop list */
+			for (int idx = 0; idx < grp->mp_size; idx++) {
+				if (grp->nhops[idx]->nh_aifp == src_if)
+					return (1);
+			}
+			return (0);
+		}
+
+		/* No interface constraint: only the default-route rule. */
+		if ((flags & NHR_NODEFAULT) == 0 ||
+		    (grp->nhops[0]->nh_flags & NHF_DEFAULT) == 0)
+			return (1);
+		return (0);
+	}
+#endif
+
+	if (src_if != NULL)
+		return (nh->nh_aifp == src_if ? 1 : 0);
+
+	if ((flags & NHR_NODEFAULT) == 0)
+		return (1);
+
+	return ((nh->nh_flags & NHF_DEFAULT) == 0 ? 1 : 0);
+}
+
+/*
+ * Performs reverse path forwarding lookup for @dst in fib @fibnum.
+ * If @src_if is non-NULL, verifies that at least 1 path goes via
+ * this interface.
+ * If @src_if is NULL, verifies that a route exists; if @flags contains
+ * NHR_NODEFAULT, the default route is not considered (see check_urpf()).
+ * @scopeid is currently unused.
+ *
+ * Returns 1 if a route matching the conditions is found, 0 otherwise
+ * (including when the fib has no AF_INET table).
+ */
+int
+fib4_lookup_urpf(uint32_t fibnum, struct in_addr dst, uint32_t scopeid,
+    uint32_t flags, const struct ifnet *src_if)
+{
+	RIB_RLOCK_TRACKER;
+	struct rib_head *rh;
+	struct radix_node *rn;
+	struct nhop_object *nh;
+	struct sockaddr_in sin4;
+	int ret;
+
+	KASSERT((fibnum < rt_numfibs), ("fib4_lookup_urpf: bad fibnum"));
+	rh = rt_tables_get_rnh(fibnum, AF_INET);
+	if (rh == NULL)
+		return (0);
+
+	/* Prepare lookup key, filled in the same way as the other lookups */
+	memset(&sin4, 0, sizeof(sin4));
+	sin4.sin_family = AF_INET;
+	sin4.sin_len = sizeof(struct sockaddr_in);
+	sin4.sin_addr = dst;
+
+	RIB_RLOCK(rh);
+	rn = rh->rnh_matchaddr((void *)&sin4, &rh->head);
+	if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
+		/* Inspect the raw route nexthop; no per-flow selection here. */
+		nh = (RNTORT(rn))->rt_nhop;
+		ret = check_urpf(nh, flags, src_if);
+		RIB_RUNLOCK(rh);
+		return (ret);
+	}
+	RIB_RUNLOCK(rh);
+
+	return (0);
+}
+
+/*
+ * Looks up the route for the destination stored in @ro4->ro_dst4 and
+ * caches the resulting nexthop in @ro4->ro_nh.
+ *
+ * NHR_REF is always added to @flags below, so the cached nexthop is
+ * always a referenced one:
+ * - if the lookup yields the nexthop already stored in @ro4->ro_nh,
+ *   nothing is changed;
+ * - if it yields a different nexthop, the new one is referenced and
+ *   stored, and the old one (if any) is released after the rib lock
+ *   has been dropped;
+ * - if no usable nexthop is found, @ro4->ro_nh (if any) is released
+ *   and set to NULL.
+ *
+ * Returns:
+ * 0 if a nexthop passing the RT_LINK_IS_UP() check was found,
+ * EAFNOSUPPORT if fib @fibnum has no AF_INET table,
+ * ESRCH otherwise.
+ */
+int
+fib4_lookup_nh_route(uint32_t fibnum, struct route_in *ro4,
+ uint32_t flags, uint32_t flowid)
+{
+ RIB_RLOCK_TRACKER;
+ struct rib_head *rh;
+ struct radix_node *rn;
+ struct nhop_object *nh, *nh_old;
+
+ KASSERT((fibnum < rt_numfibs), ("fib4_lookup_nh_route: bad fibnum"));
+ rh = rt_tables_get_rnh(fibnum, AF_INET);
+ if (rh == NULL)
+ return (EAFNOSUPPORT);
+
+ /* Prepare lookup key */
+ struct sockaddr_in sin4;
+ memset(&sin4, 0, sizeof(sin4));
+ sin4.sin_family = AF_INET;
+ sin4.sin_len = sizeof(struct sockaddr_in);
+ sin4.sin_addr = ro4->ro_dst4.sin_addr;
+
+ nh_old = NULL;
+ /* Force referencing: every NHR_REF test below is therefore true. */
+ flags |= NHR_REF;
+
+ RIB_RLOCK(rh);
+ rn = rh->rnh_matchaddr((void *)&sin4, &rh->head);
+ if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
+ nh = RT_SELECT_NHOP((RNTORT(rn)), flowid);
+ /* Ensure route & ifp is UP */
+ if (RT_LINK_IS_UP(nh->nh_ifp)) {
+ /* Valid nhop is found */
+ if (ro4->ro_nh != nh) {
+ /* Swap in the new nexthop; the old one is freed after unlock. */
+ nh_old = ro4->ro_nh;
+ if (flags & NHR_REF)
+ nhop_ref_object(nh);
+ ro4->ro_nh = nh;
+ }
+ RIB_RUNLOCK(rh);
+ if ((nh_old != NULL) && (flags & NHR_REF))
+ nhop_free_object(nh_old);
+ return (0);
+ }
+ }
+ RIB_RUNLOCK(rh);
+
+ /* Not found */
+ if ((ro4->ro_nh != NULL) && (flags & NHR_REF)) {
+ nhop_free_object(ro4->ro_nh);
+ ro4->ro_nh = NULL;
+ }
+
+ return (ESRCH);
+}
#endif
Index: sys/netinet/in_pcb.c
===================================================================
--- sys/netinet/in_pcb.c
+++ sys/netinet/in_pcb.c
@@ -46,7 +46,7 @@
#include "opt_inet6.h"
#include "opt_ratelimit.h"
#include "opt_pcbgroup.h"
-#include "opt_rss.h"
+#include "opt_route_mpath.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -86,8 +86,10 @@
#if defined(INET) || defined(INET6)
#include <netinet/in.h>
#include <netinet/in_pcb.h>
+#include <netinet/in_rss.h>
#ifdef INET
#include <netinet/in_var.h>
+#include <netinet/in_fib.h>
#endif
#include <netinet/ip_var.h>
#include <netinet/tcp_var.h>
@@ -101,7 +103,9 @@
#include <netinet6/in6_pcb.h>
#include <netinet6/in6_var.h>
#include <netinet6/ip6_var.h>
+#include <netinet6/in6_fib.h>
#endif /* INET6 */
+#include <net/route/nhop.h>
#endif
#include <netipsec/ipsec_support.h>
@@ -111,6 +115,7 @@
#define INPCBLBGROUP_SIZMIN 8
#define INPCBLBGROUP_SIZMAX 256
+
static struct callout ipport_tick_callout;
/*
@@ -1033,8 +1038,8 @@
{
struct ifaddr *ifa;
struct sockaddr *sa;
- struct sockaddr_in *sin;
- struct route sro;
+ struct sockaddr_in *sin, dst;
+ struct nhop_object *nh = NULL;
int error;
NET_EPOCH_ASSERT();
@@ -1047,9 +1052,9 @@
return (0);
error = 0;
- bzero(&sro, sizeof(sro));
- sin = (struct sockaddr_in *)&sro.ro_dst;
+ bzero(&dst, sizeof(dst));
+ sin = &dst;
sin->sin_family = AF_INET;
sin->sin_len = sizeof(struct sockaddr_in);
sin->sin_addr.s_addr = faddr->s_addr;
@@ -1061,7 +1066,8 @@
* Find out route to destination.
*/
if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
- in_rtalloc_ign(&sro, 0, inp->inp_inc.inc_fibnum);
+ nh = fib4_lookup_nh_ptr(inp->inp_inc.inc_fibnum, *faddr,
+ 0, NHR_NONE, 0);
/*
* If we found a route, use the address corresponding to
@@ -1071,7 +1077,7 @@
* network and try to find a corresponding interface to take
* the source address from.
*/
- if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) {
+ if (nh == NULL || nh->nh_ifp == NULL) {
struct in_ifaddr *ia;
struct ifnet *ifp;
@@ -1124,22 +1130,22 @@
* belonging to this jail. If so use it.
* 3. as a last resort return the 'default' jail address.
*/
- if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) {
+ if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) {
struct in_ifaddr *ia;
struct ifnet *ifp;
/* If not jailed, use the default returned. */
if (cred == NULL || !prison_flag(cred, PR_IP4)) {
- ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
+ ia = (struct in_ifaddr *)nh->nh_ifa;
laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
goto done;
}
/* Jailed. */
/* 1. Check if the iface address belongs to the jail. */
- sin = (struct sockaddr_in *)sro.ro_rt->rt_ifa->ifa_addr;
+ sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr;
if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
- ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
+ ia = (struct in_ifaddr *)nh->nh_ifa;
laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
goto done;
}
@@ -1149,7 +1155,7 @@
* belonging to this jail.
*/
ia = NULL;
- ifp = sro.ro_rt->rt_ifp;
+ ifp = nh->nh_ifp;
CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
sa = ifa->ifa_addr;
if (sa->sa_family != AF_INET)
@@ -1179,7 +1185,7 @@
* In case of jails, check that it is an address of the jail
* and if we cannot find, fall back to the 'default' jail address.
*/
- if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) {
+ if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) {
struct sockaddr_in sain;
struct in_ifaddr *ia;
@@ -1234,8 +1240,6 @@
}
done:
- if (sro.ro_rt != NULL)
- RTFREE(sro.ro_rt);
return (error);
}
@@ -1266,6 +1270,9 @@
struct in_addr laddr, faddr;
u_short lport, fport;
int error;
+#ifdef ROUTE_MPATH
+ uint32_t hash_val, hash_type;
+#endif
/*
* Because a global state change doesn't actually occur here, a read
@@ -1288,6 +1295,15 @@
faddr = sin->sin_addr;
fport = sin->sin_port;
+#ifdef ROUTE_MPATH
+ if (V_fib_hash_outbound) {
+ hash_val = fib4_calc_software_hash(laddr, faddr, 0, fport,
+ inp->inp_socket->so_proto->pr_protocol, &hash_type);
+
+ inp->inp_flowid = hash_val;
+ inp->inp_flowtype = hash_type;
+ }
+#endif
if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) {
/*
* If the destination address is INADDR_ANY,
@@ -3367,22 +3383,6 @@
mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag);
}
} else {
- error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
- }
- if (error == 0 || error == EOPNOTSUPP)
- inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
-
- return (error);
-}
-
-/*
- * This function should be called when the INP_RATE_LIMIT_CHANGED flag
- * is set in the fast path and will attach/detach/modify the TX rate
- * limit send tag based on the socket's so_max_pacing_rate value.
- */
-void
-in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
-{
struct socket *socket;
uint32_t max_pacing_rate;
bool did_upgrade;
Index: sys/netinet/in_rmx.c
===================================================================
--- sys/netinet/in_rmx.c
+++ sys/netinet/in_rmx.c
@@ -41,6 +41,7 @@
#include <net/if_var.h>
#include <net/route.h>
#include <net/route_var.h>
+#include <net/route/nhop.h>
#include <net/vnet.h>
#include <netinet/in.h>
@@ -48,67 +49,63 @@
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>
#include <netinet/ip_var.h>
+#include <net/route/rtentry_var.h>
extern int in_inithead(void **head, int off, u_int fibnum);
#ifdef VIMAGE
extern int in_detachhead(void **head, int off);
#endif
-/*
- * Do what we need to do when inserting a route.
- */
-static struct radix_node *
-in_addroute(void *v_arg, void *n_arg, struct radix_head *head,
- struct radix_node *treenodes)
+static int
+rib4_preadd(u_int fibnum, const struct sockaddr *addr, const struct sockaddr *mask,
+ struct nhop_request *req)
{
- struct rtentry *rt = (struct rtentry *)treenodes;
- struct sockaddr_in *sin = (struct sockaddr_in *)rt_key(rt);
+ const struct sockaddr_in *addr4 = (const struct sockaddr_in *)addr;
- /*
- * A little bit of help for both IP output and input:
- * For host routes, we make sure that RTF_BROADCAST
- * is set for anything that looks like a broadcast address.
- * This way, we can avoid an expensive call to in_broadcast()
- * in ip_output() most of the time (because the route passed
- * to ip_output() is almost always a host route).
- *
- * We also do the same for local addresses, with the thought
- * that this might one day be used to speed up ip_input().
- *
- * We also mark routes to multicast addresses as such, because
- * it's easy to do and might be useful (but this is much more
- * dubious since it's so easy to inspect the address).
- */
- if (rt->rt_flags & RTF_HOST) {
- struct epoch_tracker et;
- bool bcast;
+ /* XXX: RTF_LOCAL && RTF_MULTICAST */
- NET_EPOCH_ENTER(et);
- bcast = in_broadcast(sin->sin_addr, rt->rt_ifp);
- NET_EPOCH_EXIT(et);
- if (bcast)
- rt->rt_flags |= RTF_BROADCAST;
- else if (satosin(rt->rt_ifa->ifa_addr)->sin_addr.s_addr ==
- sin->sin_addr.s_addr)
- rt->rt_flags |= RTF_LOCAL;
+ if (req->rt_flags & RTF_HOST) {
+ /*
+ * Backward compatibility:
+ * if the destination is broadcast,
+ * mark route as broadcast.
+ * This behavior was useful when route cloning
+ * was in place, so there was an explicit cloned
+ * route for every broadcasted address.
+ * Currently (2019-12) there is no kernel machinery
+ * to do route cloning, though someone might explicitly
+ * add these routes to support some cases with active-active
+ * load balancing. Given that, retain this support.
+ */
+ if (in_broadcast(addr4->sin_addr, req->ifp))
+ req->rt_flags |= RTF_BROADCAST;
}
- if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
- rt->rt_flags |= RTF_MULTICAST;
- if (rt->rt_ifp != NULL) {
- /*
- * Check route MTU:
- * inherit interface MTU if not set or
- * check if MTU is too large.
- */
- if (rt->rt_mtu == 0) {
- rt->rt_mtu = rt->rt_ifp->if_mtu;
- } else if (rt->rt_mtu > rt->rt_ifp->if_mtu)
- rt->rt_mtu = rt->rt_ifp->if_mtu;
+ /*
+ * Check route MTU:
+ * inherit interface MTU if not set or
+ * check if MTU is too large.
+ */
+ if (req->mtu == 0) {
+ req->mtu = req->ifp->if_mtu;
+ } else if (req->mtu > req->ifp->if_mtu)
+ req->mtu = req->ifp->if_mtu;
+
+ /* Ensure that default route nhop has special flag */
+ const struct sockaddr_in *mask4 = (const struct sockaddr_in *)mask;
+ if ((req->rt_flags & RTF_HOST) == 0 && mask4->sin_addr.s_addr == 0)
+ req->nh_flags_additional |= NHF_DEFAULT;
+
+ /* Set nhop type to basic per-AF nhop */
+ if (req->nh_type == 0) {
+ if (req->rt_flags & RTF_GATEWAY)
+ req->nh_type = NH_TYPE_IPV4_ETHER_NHOP;
+ else
+ req->nh_type = NH_TYPE_IPV4_ETHER_RSLV;
}
- return (rn_addroute(v_arg, n_arg, head, treenodes));
+ return (0);
}
static int _in_rt_was_here;
@@ -124,7 +121,7 @@
if (rh == NULL)
return (0);
- rh->rnh_addaddr = in_addroute;
+ rh->rnh_preadd = rib4_preadd;
*head = (void *)rh;
if (_in_rt_was_here == 0 ) {
@@ -158,14 +155,15 @@
};
static int
-in_ifadownkill(const struct rtentry *rt, void *xap)
+in_ifadownkill(const struct rtentry *rt, const struct nhop_object *nh, void *xap)
{
struct in_ifadown_arg *ap = xap;
- if (rt->rt_ifa != ap->ifa)
+ if (nh->nh_ifa != ap->ifa)
return (0);
- if ((rt->rt_flags & RTF_STATIC) != 0 && ap->del == 0)
+ int rt_flags = rib_get_entry_rtflags(rt, nh);
+ if ((rt_flags & RTF_STATIC) != 0 && ap->del == 0)
return (0);
return (1);
@@ -184,16 +182,5 @@
rt_foreach_fib_walk_del(AF_INET, in_ifadownkill, &arg);
ifa->ifa_flags &= ~IFA_ROUTE; /* XXXlocking? */
-}
-
-/*
- * inet versions of rt functions. These have fib extensions and
- * for now will just reference the _fib variants.
- * eventually this order will be reversed,
- */
-void
-in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum)
-{
- rtalloc_ign_fib(ro, ignflags, fibnum);
}
Index: sys/netinet/in_var.h
===================================================================
--- sys/netinet/in_var.h
+++ sys/netinet/in_var.h
@@ -473,7 +473,6 @@
/* XXX */
-void in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum);
#endif /* _KERNEL */
/* INET6 stuff */
Index: sys/netinet/ip_fastfwd.c
===================================================================
--- sys/netinet/ip_fastfwd.c
+++ sys/netinet/ip_fastfwd.c
@@ -96,6 +96,7 @@
#include <net/if_dl.h>
#include <net/pfil.h>
#include <net/route.h>
+#include <net/route/nhop.h>
#include <net/vnet.h>
#include <netinet/in.h>
@@ -111,11 +112,13 @@
#include <machine/in_cksum.h>
static int
-ip_findroute(struct nhop4_basic *pnh, struct in_addr dest, struct mbuf *m)
+ip_findroute(struct nhop_object **pnh, struct in_addr dest, struct mbuf *m)
{
+ struct nhop_object *nh;
- bzero(pnh, sizeof(*pnh));
- if (fib4_lookup_nh_basic(M_GETFIB(m), dest, 0, 0, pnh) != 0) {
+ nh = fib4_lookup_nh_ptr(M_GETFIB(m), dest, 0, NHR_NONE,
+ m->m_pkthdr.flowid);
+ if (nh == NULL) {
IPSTAT_INC(ips_noroute);
IPSTAT_INC(ips_cantforward);
icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
@@ -124,18 +127,20 @@
/*
* Drop blackholed traffic and directed broadcasts.
*/
- if ((pnh->nh_flags & (NHF_BLACKHOLE | NHF_BROADCAST)) != 0) {
+ if ((nh->nh_flags & (NHF_BLACKHOLE | NHF_BROADCAST)) != 0) {
IPSTAT_INC(ips_cantforward);
m_freem(m);
return (EHOSTUNREACH);
}
- if (pnh->nh_flags & NHF_REJECT) {
+ if (nh->nh_flags & NHF_REJECT) {
IPSTAT_INC(ips_cantforward);
icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
return (EHOSTUNREACH);
}
+ *pnh = nh;
+
return (0);
}
@@ -151,7 +156,7 @@
{
struct ip *ip;
struct mbuf *m0 = NULL;
- struct nhop4_basic nh;
+ struct nhop_object *nh;
struct sockaddr_in dst;
struct in_addr dest, odest, rtdest;
uint16_t ip_len, ip_off;
@@ -323,7 +328,7 @@
if (!PFIL_HOOKED_OUT(V_inet_pfil_head))
goto passout;
- if (pfil_run_hooks(V_inet_pfil_head, &m, nh.nh_ifp,
+ if (pfil_run_hooks(V_inet_pfil_head, &m, nh->nh_ifp,
PFIL_OUT | PFIL_FWD, NULL) != PFIL_PASS)
goto drop;
@@ -376,12 +381,15 @@
bzero(&dst, sizeof(dst));
dst.sin_family = AF_INET;
dst.sin_len = sizeof(dst);
- dst.sin_addr = nh.nh_addr;
+ if (nh->nh_flags & NHF_GATEWAY)
+ dst.sin_addr = nh->gw4_sa.sin_addr;
+ else
+ dst.sin_addr = dest;
/*
* Check if packet fits MTU or if hardware will fragment for us
*/
- if (ip_len <= nh.nh_mtu) {
+ if (ip_len <= nh->nh_mtu) {
/*
* Avoid confusing lower layers.
*/
@@ -389,8 +397,8 @@
/*
* Send off the packet via outgoing interface
*/
- IP_PROBE(send, NULL, NULL, ip, nh.nh_ifp, ip, NULL);
- error = (*nh.nh_ifp->if_output)(nh.nh_ifp, m,
+ IP_PROBE(send, NULL, NULL, ip, nh->nh_ifp, ip, NULL);
+ error = (*nh->nh_ifp->if_output)(nh->nh_ifp, m,
(struct sockaddr *)&dst, NULL);
} else {
/*
@@ -399,15 +407,15 @@
if (ip_off & IP_DF) {
IPSTAT_INC(ips_cantfrag);
icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG,
- 0, nh.nh_mtu);
+ 0, nh->nh_mtu);
goto consumed;
} else {
/*
* We have to fragment the packet
*/
m->m_pkthdr.csum_flags |= CSUM_IP;
- if (ip_fragment(ip, &m, nh.nh_mtu,
- nh.nh_ifp->if_hwassist) != 0)
+ if (ip_fragment(ip, &m, nh->nh_mtu,
+ nh->nh_ifp->if_hwassist) != 0)
goto drop;
KASSERT(m != NULL, ("null mbuf and no error"));
/*
@@ -423,10 +431,10 @@
m_clrprotoflags(m);
IP_PROBE(send, NULL, NULL,
- mtod(m, struct ip *), nh.nh_ifp,
+ mtod(m, struct ip *), nh->nh_ifp,
mtod(m, struct ip *), NULL);
/* XXX: we can use cached route here */
- error = (*nh.nh_ifp->if_output)(nh.nh_ifp, m,
+ error = (*nh->nh_ifp->if_output)(nh->nh_ifp, m,
(struct sockaddr *)&dst, NULL);
if (error)
break;
Index: sys/netinet/ip_icmp.c
===================================================================
--- sys/netinet/ip_icmp.c
+++ sys/netinet/ip_icmp.c
@@ -52,6 +52,7 @@
#include <net/if_var.h>
#include <net/if_types.h>
#include <net/route.h>
+#include <net/route/nhop.h>
#include <net/vnet.h>
#include <netinet/in.h>
@@ -945,7 +946,7 @@
icmp_verify_redirect_gateway(struct sockaddr_in *src, struct sockaddr_in *dst,
struct sockaddr_in *gateway, u_int fibnum)
{
- struct rtentry *rt;
+ struct nhop_object *nh;
struct ifaddr *ifa;
NET_EPOCH_ASSERT();
@@ -958,8 +959,8 @@
if (ifa_ifwithaddr_check((struct sockaddr *)gateway))
return (EHOSTUNREACH);
- rt = rtalloc1_fib((struct sockaddr *)dst, 0, 0UL, fibnum); /* NB: rt is locked */
- if (rt == NULL)
+ nh = fib4_lookup_nh_ptr(fibnum, dst->sin_addr, 0, NHR_NONE, 0);
+ if (nh == NULL)
return (EINVAL);
/*
@@ -968,28 +969,19 @@
* we have a routing loop, perhaps as a result of an interface
* going down recently.
*/
- if (!sa_equal((struct sockaddr *)src, rt->rt_gateway)) {
- RTFREE_LOCKED(rt);
+ if (!sa_equal((struct sockaddr *)src, &nh->gw_sa))
return (EINVAL);
- }
- if (rt->rt_ifa != ifa && ifa->ifa_addr->sa_family != AF_LINK) {
- RTFREE_LOCKED(rt);
+ if (nh->nh_ifa != ifa && ifa->ifa_addr->sa_family != AF_LINK)
return (EINVAL);
- }
/* If host route already exists, ignore redirect. */
- if (rt->rt_flags & RTF_HOST) {
- RTFREE_LOCKED(rt);
+ if (nh->nh_flags & NHF_HOST)
return (EEXIST);
- }
/* If the prefix is directly reachable, ignore redirect. */
- if (!(rt->rt_flags & RTF_GATEWAY)) {
- RTFREE_LOCKED(rt);
+ if (!(nh->nh_flags & NHF_GATEWAY))
return (EEXIST);
- }
- RTFREE_LOCKED(rt);
return (0);
}
Index: sys/netinet/ip_input.c
===================================================================
--- sys/netinet/ip_input.c
+++ sys/netinet/ip_input.c
@@ -63,6 +63,7 @@
#include <net/if_dl.h>
#include <net/pfil.h>
#include <net/route.h>
+#include <net/route/nhop.h>
#include <net/netisr.h>
#include <net/rss_config.h>
#include <net/vnet.h>
@@ -72,6 +73,7 @@
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
+#include <netinet/in_fib.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/ip_fw.h>
@@ -980,10 +982,11 @@
ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
M_GETFIB(m));
#else
- in_rtalloc_ign(&ro, 0, M_GETFIB(m));
+ ro.ro_nh = fib4_lookup_nh_ptr(M_GETFIB(m), ip->ip_dst, 0, NHR_REF,
+ m->m_pkthdr.flowid);
#endif
- if (ro.ro_rt != NULL) {
- ia = ifatoia(ro.ro_rt->rt_ifa);
+ if (ro.ro_nh != NULL) {
+ ia = ifatoia(ro.ro_nh->nh_ifa);
} else
ia = NULL;
/*
@@ -1045,19 +1048,18 @@
dest.s_addr = 0;
if (!srcrt && V_ipsendredirects &&
ia != NULL && ia->ia_ifp == m->m_pkthdr.rcvif) {
- struct rtentry *rt;
+ struct nhop_object *nh;
- rt = ro.ro_rt;
+ nh = ro.ro_nh;
- if (rt && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 &&
- satosin(rt_key(rt))->sin_addr.s_addr != 0) {
-#define RTA(rt) ((struct in_ifaddr *)(rt->rt_ifa))
+ if (nh != NULL && ((nh->nh_flags & (NHF_REDIRECT|NHF_DEFAULT)) == 0)) {
+ struct in_ifaddr *nh_ia = (struct in_ifaddr *)(nh->nh_ifa);
u_long src = ntohl(ip->ip_src.s_addr);
- if (RTA(rt) &&
- (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) {
- if (rt->rt_flags & RTF_GATEWAY)
- dest.s_addr = satosin(rt->rt_gateway)->sin_addr.s_addr;
+ if (nh_ia &&
+ (src & nh_ia->ia_subnetmask) == nh_ia->ia_subnet) {
+ if (nh->nh_flags & NHF_GATEWAY)
+ dest.s_addr = nh->gw4_sa.sin_addr.s_addr;
else
dest.s_addr = ip->ip_dst.s_addr;
/* Router requirements says to only send host redirects */
@@ -1069,9 +1071,9 @@
error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL);
- if (error == EMSGSIZE && ro.ro_rt)
- mtu = ro.ro_rt->rt_mtu;
- RO_RTFREE(&ro);
+ if (error == EMSGSIZE && ro.ro_nh)
+ mtu = ro.ro_nh->nh_mtu;
+ RO_NHFREE(&ro);
if (error)
IPSTAT_INC(ips_cantforward);
Index: sys/netinet/ip_output.c
===================================================================
--- sys/netinet/ip_output.c
+++ sys/netinet/ip_output.c
@@ -67,6 +67,7 @@
#include <net/netisr.h>
#include <net/pfil.h>
#include <net/route.h>
+#include <net/route/nhop.h>
#ifdef RADIX_MPATH
#include <net/radix_mpath.h>
#endif
@@ -78,6 +79,7 @@
#include <netinet/in_kdtrace.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
+#include <netinet/in_fib.h>
#include <netinet/in_pcb.h>
#include <netinet/in_rss.h>
#include <netinet/in_var.h>
@@ -287,6 +289,18 @@
return (error);
}
+/*
+ * Refresh the RT_REJECT, RT_BLACKHOLE and RT_HAS_GW bits of ro->ro_flags
+ * from the flags of the nexthop cached in ro->ro_nh.  All other ro_flags
+ * bits are left untouched.  ro->ro_nh is dereferenced unconditionally, so
+ * the caller must only invoke this with a nexthop attached.
+ */
+static inline void
+rt_update_ro_flags(struct route *ro)
+{
+	const int nhf = ro->ro_nh->nh_flags;
+	int derived = 0;
+
+	if (nhf & NHF_REJECT)
+		derived |= RT_REJECT;
+	if (nhf & NHF_BLACKHOLE)
+		derived |= RT_BLACKHOLE;
+	if (nhf & NHF_GATEWAY)
+		derived |= RT_HAS_GW;
+
+	/* Drop the stale copies, then install the freshly derived ones. */
+	ro->ro_flags = (ro->ro_flags & ~(RT_REJECT|RT_BLACKHOLE|RT_HAS_GW)) |
+	    derived;
+}
+
/*
* IP output. The packet in mbuf chain m contains a skeletal IP
* header (with len, off, ttl, proto, tos, src, dst).
@@ -368,7 +382,7 @@
dst = (struct sockaddr_in *)&ro->ro_dst;
else
dst = &sin;
- if (ro == NULL || ro->ro_rt == NULL) {
+ if (ro == NULL || ro->ro_nh == NULL) {
bzero(dst, sizeof(*dst));
dst->sin_family = AF_INET;
dst->sin_len = sizeof(*dst);
@@ -380,8 +394,8 @@
* Validate route against routing table additions;
* a better/more specific route might have been added.
*/
- if (inp != NULL && ro != NULL && ro->ro_rt != NULL)
- RT_VALIDATE(ro, &inp->inp_rt_cookie, fibnum);
+ if (inp != NULL && ro != NULL && ro->ro_nh != NULL)
+ NH_VALIDATE(ro, &inp->inp_rt_cookie, fibnum);
/*
* If there is a cached route,
* check that it is to the same destination
@@ -390,9 +404,9 @@
* cache with IPv6.
* Also check whether routing cache needs invalidation.
*/
- if (ro != NULL && ro->ro_rt != NULL &&
- ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
- ro->ro_rt->rt_ifp == NULL || !RT_LINK_IS_UP(ro->ro_rt->rt_ifp) ||
+ if (ro != NULL && ro->ro_nh != NULL &&
+ ((!NH_IS_VALID(ro->ro_nh)) ||
+ ro->ro_nh->nh_ifp == NULL || !RT_LINK_IS_UP(ro->ro_nh->nh_ifp) ||
dst->sin_family != AF_INET ||
dst->sin_addr.s_addr != ip->ip_dst.s_addr))
RO_INVALIDATE_CACHE(ro);
@@ -450,25 +464,23 @@
else
src.s_addr = INADDR_ANY;
} else if (ro != NULL) {
- if (ro->ro_rt == NULL) {
+ if (ro->ro_nh == NULL) {
/*
* We want to do any cloning requested by the link
* layer, as this is probably required in all cases
* for correct operation (as it is for ARP).
*/
-#ifdef RADIX_MPATH
- rtalloc_mpath_fib(ro,
- ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
- fibnum);
-#else
- in_rtalloc_ign(ro, 0, fibnum);
+ fib4_lookup_nh_route(fibnum, (struct route_in *)ro,
+ NHR_NONE, m->m_pkthdr.flowid);
+#if 0
+ char xbuf[20];
+ inet_ntop(AF_INET, &((struct sockaddr_in *)&ro->ro_dst)->sin_addr, xbuf, sizeof(xbuf));
+ printf("lookup for %s in fib %u returned ro_nh=%p\n", xbuf, fibnum, ro->ro_nh);
#endif
- if (ro->ro_rt == NULL ||
- (ro->ro_rt->rt_flags & RTF_UP) == 0 ||
- ro->ro_rt->rt_ifp == NULL ||
- !RT_LINK_IS_UP(ro->ro_rt->rt_ifp)) {
+ if (ro->ro_nh == NULL || (!NH_IS_VALID(ro->ro_nh)) ||
+ !RT_LINK_IS_UP(ro->ro_nh->nh_ifp)) {
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
- /*
+ /*
* There is no route for this packet, but it is
* possible that a matching SPD entry exists.
*/
@@ -481,29 +493,29 @@
goto bad;
}
}
- ia = ifatoia(ro->ro_rt->rt_ifa);
- ifp = ro->ro_rt->rt_ifp;
- counter_u64_add(ro->ro_rt->rt_pksent, 1);
+ ia = ifatoia(ro->ro_nh->nh_ifa);
+ ifp = ro->ro_nh->nh_ifp;
+ counter_u64_add(ro->ro_nh->nh_pksent, 1);
rt_update_ro_flags(ro);
- if (ro->ro_rt->rt_flags & RTF_GATEWAY)
- gw = (struct sockaddr_in *)ro->ro_rt->rt_gateway;
- if (ro->ro_rt->rt_flags & RTF_HOST)
- isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST);
+ if (ro->ro_nh->nh_flags & NHF_GATEWAY)
+ gw = &ro->ro_nh->gw4_sa;
+ if (ro->ro_nh->nh_flags & NHF_HOST)
+ isbroadcast = (ro->ro_nh->nh_flags & NHF_BROADCAST);
else if (ifp->if_flags & IFF_BROADCAST)
isbroadcast = in_ifaddr_broadcast(gw->sin_addr, ia);
else
isbroadcast = 0;
- if (ro->ro_rt->rt_flags & RTF_HOST)
- mtu = ro->ro_rt->rt_mtu;
+ if (ro->ro_nh->nh_flags & NHF_HOST)
+ mtu = ro->ro_nh->nh_mtu;
else
mtu = ifp->if_mtu;
src = IA_SIN(ia)->sin_addr;
} else {
- struct nhop4_extended nh;
+ struct nhop_object *nh;
- bzero(&nh, sizeof(nh));
- if (fib4_lookup_nh_ext(M_GETFIB(m), ip->ip_dst, 0, 0, &nh) !=
- 0) {
+ nh = fib4_lookup_nh_ptr(M_GETFIB(m), ip->ip_dst, 0, NHR_NONE,
+ m->m_pkthdr.flowid);
+ if (nh == NULL) {
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
/*
* There is no route for this packet, but it is
@@ -514,11 +526,12 @@
goto sendit;
#endif
IPSTAT_INC(ips_noroute);
+ printf("NOTOUTE: extended\n");
error = EHOSTUNREACH;
goto bad;
}
- ifp = nh.nh_ifp;
- mtu = nh.nh_mtu;
+ ifp = nh->nh_ifp;
+ mtu = nh->nh_mtu;
/*
* We are rewriting here dst to be gw actually, contradicting
* comment at the beginning of the function. However, in this
@@ -527,19 +540,20 @@
* function, the dst would be rewritten by ip_output_pfil().
*/
MPASS(dst == &sin);
- dst->sin_addr = nh.nh_addr;
- ia = nh.nh_ia;
- src = nh.nh_src;
- isbroadcast = (((nh.nh_flags & (NHF_HOST | NHF_BROADCAST)) ==
+ if (nh->nh_flags & NHF_GATEWAY)
+ dst->sin_addr = nh->gw4_sa.sin_addr;
+ ia = (struct in_ifaddr *)nh->nh_ifa;
+ src = ia->ia_addr.sin_addr;
+ isbroadcast = (((nh->nh_flags & (NHF_HOST | NHF_BROADCAST)) ==
(NHF_HOST | NHF_BROADCAST)) ||
((ifp->if_flags & IFF_BROADCAST) &&
in_ifaddr_broadcast(dst->sin_addr, ia)));
}
/* Catch a possible divide by zero later. */
- KASSERT(mtu > 0, ("%s: mtu %d <= 0, ro=%p (rt_flags=0x%08x) ifp=%p",
+ KASSERT(mtu > 0, ("%s: mtu %d <= 0, ro=%p (nh_flags=0x%08x) ifp=%p",
__func__, mtu, ro,
- (ro != NULL && ro->ro_rt != NULL) ? ro->ro_rt->rt_flags : 0, ifp));
+ (ro != NULL && ro->ro_nh != NULL) ? ro->ro_nh->nh_flags : 0, ifp));
if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
m->m_flags |= M_MCAST;
@@ -702,7 +716,7 @@
case -1: /* Need to try again */
/* Reset everything for a new round */
if (ro != NULL) {
- RO_RTFREE(ro);
+ RO_NHFREE(ro);
ro->ro_prepend = NULL;
}
gw = dst;
Index: sys/netinet/raw_ip.c
===================================================================
--- sys/netinet/raw_ip.c
+++ sys/netinet/raw_ip.c
@@ -38,6 +38,7 @@
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
+#include "opt_route_mpath.h"
#include <sys/param.h>
#include <sys/jail.h>
@@ -67,6 +68,7 @@
#include <netinet/in.h>
#include <netinet/in_systm.h>
+#include <netinet/in_fib.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/if_ether.h>
@@ -455,6 +457,9 @@
IP_ALLOWBROADCAST;
int cnt, hlen;
u_char opttype, optlen, *cp;
+#ifdef ROUTE_MPATH
+ uint32_t hash_val, hash_type;
+#endif
va_start(ap, so);
dst = va_arg(ap, u_long);
@@ -484,6 +489,15 @@
ip->ip_len = htons(m->m_pkthdr.len);
ip->ip_src = inp->inp_laddr;
ip->ip_dst.s_addr = dst;
+#ifdef ROUTE_MPATH
+ if (V_fib_hash_outbound) {
+ hash_val = fib4_calc_software_hash(ip->ip_src,
+ ip->ip_dst, 0, 0, ip->ip_p, &hash_type);
+ m->m_pkthdr.flowid = hash_val;
+ M_HASHTYPE_SET(m, hash_type);
+ flags |= IP_NODEFAULTFLOWID;
+ }
+#endif
if (jailed(inp->inp_cred)) {
/*
* prison_local_ip4() would be good enough but would
@@ -519,6 +533,15 @@
return (EINVAL);
ip = mtod(m, struct ip *);
}
+#ifdef ROUTE_MPATH
+ if (V_fib_hash_outbound) {
+ hash_val = fib4_calc_software_hash(ip->ip_dst,
+ ip->ip_src, 0, 0, ip->ip_p, &hash_type);
+ m->m_pkthdr.flowid = hash_val;
+ M_HASHTYPE_SET(m, hash_type);
+ flags |= IP_NODEFAULTFLOWID;
+ }
+#endif
INP_RLOCK(inp);
/*
Index: sys/netinet/sctp_asconf.c
===================================================================
--- sys/netinet/sctp_asconf.c
+++ sys/netinet/sctp_asconf.c
@@ -981,8 +981,7 @@
((ifn == NULL) ||
(SCTP_GET_IF_INDEX_FROM_ROUTE(&net->ro) != ifn->ifn_index))) {
/* clear any cached route */
- RTFREE(net->ro.ro_rt);
- net->ro.ro_rt = NULL;
+ RO_NHFREE(&net->ro);
}
/* clear any cached source address */
if (net->src_addr_selected) {
@@ -1091,10 +1090,7 @@
if (addrnum == 1) {
TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
/* clear any cached route and source address */
- if (net->ro.ro_rt) {
- RTFREE(net->ro.ro_rt);
- net->ro.ro_rt = NULL;
- }
+ RO_NHFREE(&net->ro);
if (net->src_addr_selected) {
sctp_free_ifa(net->ro._s_addr);
net->ro._s_addr = NULL;
@@ -1113,9 +1109,9 @@
/* Multiple local addresses exsist in the association. */
TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
/* clear any cached route and source address */
- if (net->ro.ro_rt) {
- RTFREE(net->ro.ro_rt);
- net->ro.ro_rt = NULL;
+ if (net->ro.ro_nh) {
+ NH_FREE(net->ro.ro_nh);
+ net->ro.ro_nh = NULL;
}
if (net->src_addr_selected) {
sctp_free_ifa(net->ro._s_addr);
@@ -1132,7 +1128,7 @@
SCTP_RTALLOC((sctp_route_t *)&net->ro,
stcb->sctp_ep->def_vrf_id,
stcb->sctp_ep->fibnum);
- if (net->ro.ro_rt == NULL)
+ if (net->ro.ro_nh == NULL)
continue;
changed = 0;
@@ -2215,18 +2211,13 @@
struct sctp_nets *net;
TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
- sctp_rtentry_t *rt;
/* delete this address if cached */
if (net->ro._s_addr == ifa) {
sctp_free_ifa(net->ro._s_addr);
net->ro._s_addr = NULL;
net->src_addr_selected = 0;
- rt = net->ro.ro_rt;
- if (rt) {
- RTFREE(rt);
- net->ro.ro_rt = NULL;
- }
+ RO_NHFREE(&net->ro);
/*
* Now we deleted our src address,
* should we not also now reset the
Index: sys/netinet/sctp_os_bsd.h
===================================================================
--- sys/netinet/sctp_os_bsd.h
+++ sys/netinet/sctp_os_bsd.h
@@ -71,11 +71,13 @@
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/route.h>
+#include <net/route/nhop.h>
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
+#include <netinet/in_fib.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
@@ -85,6 +87,7 @@
#ifdef INET6
#include <sys/domain.h>
#include <netinet/ip6.h>
+#include <netinet6/in6_fib.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6protosw.h>
@@ -199,15 +202,15 @@
#define SCTP_INIT_VRF_TABLEID(vrf)
#define SCTP_IFN_IS_IFT_LOOP(ifn) ((ifn)->ifn_type == IFT_LOOP)
-#define SCTP_ROUTE_IS_REAL_LOOP(ro) ((ro)->ro_rt && (ro)->ro_rt->rt_ifa && (ro)->ro_rt->rt_ifa->ifa_ifp && (ro)->ro_rt->rt_ifa->ifa_ifp->if_type == IFT_LOOP)
+#define SCTP_ROUTE_IS_REAL_LOOP(ro) ((ro)->ro_nh && (ro)->ro_nh->nh_ifa && (ro)->ro_nh->nh_ifa->ifa_ifp && (ro)->ro_nh->nh_ifa->ifa_ifp->if_type == IFT_LOOP)
/*
* Access to IFN's to help with src-addr-selection
*/
/* This could return VOID if the index works but for BSD we provide both. */
-#define SCTP_GET_IFN_VOID_FROM_ROUTE(ro) (void *)ro->ro_rt->rt_ifp
-#define SCTP_GET_IF_INDEX_FROM_ROUTE(ro) (ro)->ro_rt->rt_ifp->if_index
-#define SCTP_ROUTE_HAS_VALID_IFN(ro) ((ro)->ro_rt && (ro)->ro_rt->rt_ifp)
+/*
+ * The two GET macros dereference ro->ro_nh and its nh_ifp without a NULL
+ * check; SCTP_ROUTE_HAS_VALID_IFN() is the guard that tests both pointers.
+ * Arguments and results are fully parenthesized so that expressions such
+ * as &net->ro can be passed safely.
+ */
+#define SCTP_GET_IFN_VOID_FROM_ROUTE(ro) ((void *)(ro)->ro_nh->nh_ifp)
+#define SCTP_GET_IF_INDEX_FROM_ROUTE(ro) ((ro)->ro_nh->nh_ifp->if_index)
+#define SCTP_ROUTE_HAS_VALID_IFN(ro) ((ro)->ro_nh && (ro)->ro_nh->nh_ifp)
/*
* general memory allocation
@@ -304,12 +307,10 @@
/* MTU */
/*************************/
#define SCTP_GATHER_MTU_FROM_IFN_INFO(ifn, ifn_index, af) ((struct ifnet *)ifn)->if_mtu
-#define SCTP_GATHER_MTU_FROM_ROUTE(sctp_ifa, sa, rt) ((uint32_t)((rt != NULL) ? rt->rt_mtu : 0))
+#define SCTP_GATHER_MTU_FROM_ROUTE(sctp_ifa, sa, nh) ((uint32_t)((nh != NULL) ? nh->nh_mtu : 0))
#define SCTP_GATHER_MTU_FROM_INTFC(sctp_ifn) ((sctp_ifn->ifn_p != NULL) ? ((struct ifnet *)(sctp_ifn->ifn_p))->if_mtu : 0)
-#define SCTP_SET_MTU_OF_ROUTE(sa, rt, mtu) do { \
- if (rt != NULL) \
- rt->rt_mtu = mtu; \
- } while(0)
+/* XXX: Setting MTU from the protocol in this way is simply incorrect */
+#define SCTP_SET_MTU_OF_ROUTE(sa, rt, mtu)
/* (de-)register interface event notifications */
#define SCTP_REGISTER_INTERFACE(ifhandle, af)
@@ -365,7 +366,7 @@
*/
/* get the v6 hop limit */
-#define SCTP_GET_HLIM(inp, ro) in6_selecthlim(&inp->ip_inp.inp, (ro ? (ro->ro_rt ? (ro->ro_rt->rt_ifp) : (NULL)) : (NULL)));
+#define SCTP_GET_HLIM(inp, ro) in6_selecthlim(&inp->ip_inp.inp, (ro ? (ro->ro_nh ? (ro->ro_nh->nh_ifp) : (NULL)) : (NULL)));
/* is the endpoint v6only? */
#define SCTP_IPV6_V6ONLY(sctp_inpcb) ((sctp_inpcb)->ip_inp.inp.inp_flags & IN6P_IPV6_V6ONLY)
@@ -400,7 +401,14 @@
typedef struct rtentry sctp_rtentry_t;
+/*
+ * Look up and cache a nexthop for ro->ro_dst in the given fib, unless one
+ * is already cached in ro->ro_nh.  vrf_id is not used.
+ *
+ * The IPv4 lookup passes NHR_REF in the flags slot (4th argument, after
+ * the scope id), matching the other fib4_lookup_nh_ptr() callers in this
+ * change, so the cached nexthop is referenced and can later be dropped
+ * with RO_NHFREE()/NH_FREE().
+ *
+ * NOTE(review): the fib6_lookup_nh_ptr() call still passes NHR_REF as its
+ * 3rd argument.  If its prototype mirrors fib4_lookup_nh_ptr()
+ * (fib, addr, scopeid, flags, flowid) it needs the same fix -- confirm
+ * against netinet6/in6_fib.h.
+ *
+ * Wrapped in do { } while (0) so the macro behaves as a single statement.
+ */
#define SCTP_RTALLOC(ro, vrf_id, fibnum) \
- rtalloc_ign_fib((struct route *)ro, 0UL, fibnum)
+do { \
+	if ((ro)->ro_nh == NULL) { \
+		if ((ro)->ro_dst.sa_family == AF_INET) \
+			(ro)->ro_nh = fib4_lookup_nh_ptr(fibnum, ((struct sockaddr_in *)&(ro)->ro_dst)->sin_addr, 0, NHR_REF, 0); \
+		if ((ro)->ro_dst.sa_family == AF_INET6) \
+			(ro)->ro_nh = fib6_lookup_nh_ptr(fibnum, &((struct sockaddr_in6 *)&(ro)->ro_dst)->sin6_addr, NHR_REF, 0, 0); \
+	} \
+} while (0)
/*
* SCTP protocol specific mbuf flags.
Index: sys/netinet/sctp_output.c
===================================================================
--- sys/netinet/sctp_output.c
+++ sys/netinet/sctp_output.c
@@ -3387,13 +3387,13 @@
* addresses. If the bound set is NOT assigned to the interface then
* we must use rotation amongst the bound addresses..
*/
- if (ro->ro_rt == NULL) {
+ if (ro->ro_nh == NULL) {
/*
* Need a route to cache.
*/
SCTP_RTALLOC(ro, vrf_id, inp->fibnum);
}
- if (ro->ro_rt == NULL) {
+ if (ro->ro_nh == NULL) {
return (NULL);
}
fam = ro->ro_dst.sa_family;
@@ -4131,10 +4131,7 @@
sctp_free_ifa(net->ro._s_addr);
net->ro._s_addr = NULL;
net->src_addr_selected = 0;
- if (ro->ro_rt) {
- RTFREE(ro->ro_rt);
- ro->ro_rt = NULL;
- }
+ RO_NHFREE(ro);
}
if (net->src_addr_selected == 0) {
/* Cache the source address */
@@ -4206,7 +4203,7 @@
* catch that somewhere and abort the association
* right away (assuming this is an INIT being sent).
*/
- if (ro->ro_rt == NULL) {
+ if (ro->ro_nh == NULL) {
/*
* src addr selection failed to find a route
* (or valid source addr), so we can't get
@@ -4225,7 +4222,7 @@
SCTPDBG(SCTP_DEBUG_OUTPUT3, "Destination is %x\n",
(uint32_t)(ntohl(ip->ip_dst.s_addr)));
SCTPDBG(SCTP_DEBUG_OUTPUT3, "RTP route is %p through\n",
- (void *)ro->ro_rt);
+ (void *)ro->ro_nh);
if (SCTP_GET_HEADER_FOR_OUTPUT(o_pak)) {
/* failed to prepend data, give up */
@@ -4278,13 +4275,13 @@
SCTPDBG(SCTP_DEBUG_OUTPUT3, "IP output returns %d\n", ret);
if (net == NULL) {
/* free tempy routes */
- RO_RTFREE(ro);
+ RO_NHFREE(ro);
} else {
- if ((ro->ro_rt != NULL) && (net->ro._s_addr) &&
+ if ((ro->ro_nh != NULL) && (net->ro._s_addr) &&
((net->dest_state & SCTP_ADDR_NO_PMTUD) == 0)) {
uint32_t mtu;
- mtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._l_addr.sa, ro->ro_rt);
+ mtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._l_addr.sa, ro->ro_nh);
if (mtu > 0) {
if (net->port) {
mtu -= sizeof(struct udphdr);
@@ -4296,7 +4293,7 @@
net->mtu = mtu;
}
}
- } else if (ro->ro_rt == NULL) {
+ } else if (ro->ro_nh == NULL) {
/* route was freed */
if (net->ro._s_addr &&
net->src_addr_selected) {
@@ -4426,10 +4423,7 @@
sctp_free_ifa(net->ro._s_addr);
net->ro._s_addr = NULL;
net->src_addr_selected = 0;
- if (ro->ro_rt) {
- RTFREE(ro->ro_rt);
- ro->ro_rt = NULL;
- }
+ RO_NHFREE(ro);
}
if (net->src_addr_selected == 0) {
sin6 = (struct sockaddr_in6 *)&net->ro._l_addr;
@@ -4489,7 +4483,7 @@
}
lsa6->sin6_port = inp->sctp_lport;
- if (ro->ro_rt == NULL) {
+ if (ro->ro_nh == NULL) {
/*
* src addr selection failed to find a route
* (or valid source addr), so we can't get
@@ -4625,13 +4619,13 @@
}
if (net == NULL) {
/* Now if we had a temp route free it */
- RO_RTFREE(ro);
+ RO_NHFREE(ro);
} else {
/*
* PMTU check versus smallest asoc MTU goes
* here
*/
- if (ro->ro_rt == NULL) {
+ if (ro->ro_nh == NULL) {
/* Route was freed */
if (net->ro._s_addr &&
net->src_addr_selected) {
@@ -4640,11 +4634,11 @@
}
net->src_addr_selected = 0;
}
- if ((ro->ro_rt != NULL) && (net->ro._s_addr) &&
+ if ((ro->ro_nh != NULL) && (net->ro._s_addr) &&
((net->dest_state & SCTP_ADDR_NO_PMTUD) == 0)) {
uint32_t mtu;
- mtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._l_addr.sa, ro->ro_rt);
+ mtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._l_addr.sa, ro->ro_nh);
if (mtu > 0) {
if (net->port) {
mtu -= sizeof(struct udphdr);
@@ -13836,7 +13830,7 @@
struct nd_pfxrouter *pfxrtr = NULL;
struct sockaddr_in6 gw6;
- if (ro == NULL || ro->ro_rt == NULL || src6->sin6_family != AF_INET6)
+ if (ro == NULL || ro->ro_nh == NULL || src6->sin6_family != AF_INET6)
return (0);
/* get prefix entry of address */
@@ -13869,8 +13863,8 @@
SCTPDBG(SCTP_DEBUG_OUTPUT2, "prefix router is ");
SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, (struct sockaddr *)&gw6);
SCTPDBG(SCTP_DEBUG_OUTPUT2, "installed router is ");
- SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, ro->ro_rt->rt_gateway);
- if (sctp_cmpaddr((struct sockaddr *)&gw6, ro->ro_rt->rt_gateway)) {
+ SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, &ro->ro_nh->gw_sa);
+ if (sctp_cmpaddr((struct sockaddr *)&gw6, &ro->ro_nh->gw_sa)) {
ND6_RUNLOCK();
SCTPDBG(SCTP_DEBUG_OUTPUT2, "pfxrouter is installed\n");
return (1);
@@ -13890,7 +13884,7 @@
struct ifaddr *ifa;
struct in_addr srcnetaddr, gwnetaddr;
- if (ro == NULL || ro->ro_rt == NULL ||
+ if (ro == NULL || ro->ro_nh == NULL ||
sifa->address.sa.sa_family != AF_INET) {
return (0);
}
@@ -13902,10 +13896,10 @@
SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, &sifa->address.sa);
SCTPDBG(SCTP_DEBUG_OUTPUT1, "network address is %x\n", srcnetaddr.s_addr);
- sin = (struct sockaddr_in *)ro->ro_rt->rt_gateway;
+ sin = &ro->ro_nh->gw4_sa;
gwnetaddr.s_addr = (sin->sin_addr.s_addr & mask->sin_addr.s_addr);
SCTPDBG(SCTP_DEBUG_OUTPUT1, "match_nexthop4: nexthop is ");
- SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, ro->ro_rt->rt_gateway);
+ SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, &ro->ro_nh->gw4_sa);
SCTPDBG(SCTP_DEBUG_OUTPUT1, "network address is %x\n", gwnetaddr.s_addr);
if (srcnetaddr.s_addr == gwnetaddr.s_addr) {
return (1);
Index: sys/netinet/sctp_pcb.c
===================================================================
--- sys/netinet/sctp_pcb.c
+++ sys/netinet/sctp_pcb.c
@@ -3978,9 +3978,11 @@
} else {
imtu = 0;
}
- rmtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._l_addr.sa, net->ro.ro_rt);
+ rmtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._l_addr.sa, net->ro.ro_nh);
hcmtu = sctp_hc_get_mtu(&net->ro._l_addr, stcb->sctp_ep->fibnum);
net->mtu = sctp_min_mtu(hcmtu, rmtu, imtu);
+ /* XXX: disabled -- the route MTU can no longer be set from the protocol (SCTP_SET_MTU_OF_ROUTE is now an empty macro) */
+#if 0
if (rmtu == 0) {
/*
* Start things off to match mtu of
@@ -3989,6 +3991,7 @@
SCTP_SET_MTU_OF_ROUTE(&net->ro._l_addr.sa,
net->ro.ro_rt, net->mtu);
}
+#endif
}
}
if (net->mtu == 0) {
@@ -4069,19 +4072,19 @@
*netp = net;
}
netfirst = TAILQ_FIRST(&stcb->asoc.nets);
- if (net->ro.ro_rt == NULL) {
+ if (net->ro.ro_nh == NULL) {
/* Since we have no route put it at the back */
TAILQ_INSERT_TAIL(&stcb->asoc.nets, net, sctp_next);
} else if (netfirst == NULL) {
/* We are the first one in the pool. */
TAILQ_INSERT_HEAD(&stcb->asoc.nets, net, sctp_next);
- } else if (netfirst->ro.ro_rt == NULL) {
+ } else if (netfirst->ro.ro_nh == NULL) {
/*
* First one has NO route. Place this one ahead of the first
* one.
*/
TAILQ_INSERT_HEAD(&stcb->asoc.nets, net, sctp_next);
- } else if (net->ro.ro_rt->rt_ifp != netfirst->ro.ro_rt->rt_ifp) {
+ } else if (net->ro.ro_nh->nh_ifp != netfirst->ro.ro_nh->nh_ifp) {
/*
* This one has a different interface than the one at the
* top of the list. Place it ahead.
@@ -4102,11 +4105,11 @@
/* End of the list */
TAILQ_INSERT_TAIL(&stcb->asoc.nets, net, sctp_next);
break;
- } else if (netlook->ro.ro_rt == NULL) {
+ } else if (netlook->ro.ro_nh == NULL) {
/* next one has NO route */
TAILQ_INSERT_BEFORE(netfirst, net, sctp_next);
break;
- } else if (netlook->ro.ro_rt->rt_ifp != net->ro.ro_rt->rt_ifp) {
+ } else if (netlook->ro.ro_nh->nh_ifp != net->ro.ro_nh->nh_ifp) {
TAILQ_INSERT_AFTER(&stcb->asoc.nets, netlook,
net, sctp_next);
break;
@@ -4119,8 +4122,8 @@
/* got to have a primary set */
if (stcb->asoc.primary_destination == 0) {
stcb->asoc.primary_destination = net;
- } else if ((stcb->asoc.primary_destination->ro.ro_rt == NULL) &&
- (net->ro.ro_rt) &&
+ } else if ((stcb->asoc.primary_destination->ro.ro_nh == NULL) &&
+ (net->ro.ro_nh) &&
((net->dest_state & SCTP_ADDR_UNCONFIRMED) == 0)) {
/* No route to current primary adopt new primary */
stcb->asoc.primary_destination = net;
@@ -5461,14 +5464,9 @@
TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
if (net->ro._s_addr == laddr->ifa) {
/* Yep, purge src address selected */
- sctp_rtentry_t *rt;
/* delete this address if cached */
- rt = net->ro.ro_rt;
- if (rt != NULL) {
- RTFREE(rt);
- net->ro.ro_rt = NULL;
- }
+ RO_NHFREE(&net->ro);
sctp_free_ifa(net->ro._s_addr);
net->ro._s_addr = NULL;
net->src_addr_selected = 0;
Index: sys/netinet/sctp_structs.h
===================================================================
--- sys/netinet/sctp_structs.h
+++ sys/netinet/sctp_structs.h
@@ -189,7 +189,7 @@
#define SCTP_ITERATOR_STOP_CUR_INP 0x00000008
struct sctp_net_route {
- sctp_rtentry_t *ro_rt;
+ struct nhop_object *ro_nh;
struct llentry *ro_lle;
char *ro_prepend;
uint16_t ro_plen;
Index: sys/netinet/sctp_timer.c
===================================================================
--- sys/netinet/sctp_timer.c
+++ sys/netinet/sctp_timer.c
@@ -350,7 +350,7 @@
return (NULL);
}
}
- if (alt->ro.ro_rt == NULL) {
+ if (alt->ro.ro_nh == NULL) {
if (alt->ro._s_addr) {
sctp_free_ifa(alt->ro._s_addr);
alt->ro._s_addr = NULL;
@@ -358,7 +358,7 @@
alt->src_addr_selected = 0;
}
if (((alt->dest_state & SCTP_ADDR_REACHABLE) == SCTP_ADDR_REACHABLE) &&
- (alt->ro.ro_rt != NULL) &&
+ (alt->ro.ro_nh != NULL) &&
(!(alt->dest_state & SCTP_ADDR_UNCONFIRMED))) {
/* Found a reachable address */
break;
@@ -937,10 +937,7 @@
net->src_addr_selected = 0;
/* Force a route allocation too */
- if (net->ro.ro_rt) {
- RTFREE(net->ro.ro_rt);
- net->ro.ro_rt = NULL;
- }
+ RO_NHFREE(&net->ro);
/* Was it our primary? */
if ((stcb->asoc.primary_destination == net) && (alt != net)) {
@@ -1501,7 +1498,7 @@
net->src_addr_selected = 1;
}
if (net->ro._s_addr) {
- mtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._s_addr.sa, net->ro.ro_rt);
+ mtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._s_addr.sa, net->ro.ro_nh);
#if defined(INET) || defined(INET6)
if (net->port) {
mtu -= sizeof(struct udphdr);
Index: sys/netinet/sctp_var.h
===================================================================
--- sys/netinet/sctp_var.h
+++ sys/netinet/sctp_var.h
@@ -187,9 +187,9 @@
if ((__net)) { \
if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&(__net)->ref_count)) { \
(void)SCTP_OS_TIMER_STOP(&(__net)->rxt_timer.timer); \
- if ((__net)->ro.ro_rt) { \
- RTFREE((__net)->ro.ro_rt); \
- (__net)->ro.ro_rt = NULL; \
+ if ((__net)->ro.ro_nh) { \
+ NH_FREE((__net)->ro.ro_nh); \
+ (__net)->ro.ro_nh = NULL; \
} \
if ((__net)->src_addr_selected) { \
sctp_free_ifa((__net)->ro._s_addr); \
Index: sys/netinet/tcp_offload.c
===================================================================
--- sys/netinet/tcp_offload.c
+++ sys/netinet/tcp_offload.c
@@ -41,8 +41,11 @@
#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>
+#include <net/route/nhop.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
+#include <netinet/in_fib.h>
+#include <netinet6/in6_fib.h>
#include <netinet/tcp.h>
#include <netinet/tcp_offload.h>
#define TCPOUTFLAGS
@@ -60,7 +63,8 @@
{
struct ifnet *ifp;
struct toedev *tod;
- struct rtentry *rt;
+ struct nhop_object *nh;
+ struct epoch_tracker et;
int error = EOPNOTSUPP;
INP_WLOCK_ASSERT(sotoinpcb(so));
@@ -70,13 +74,20 @@
if (registered_toedevs == 0)
return (error);
- rt = rtalloc1(nam, 0, 0);
- if (rt)
- RT_UNLOCK(rt);
- else
+ NET_EPOCH_ENTER(et);
+ nh = NULL;
+ if (nam->sa_family == AF_INET)
+ nh = fib4_lookup_nh_ptr(0, ((struct sockaddr_in *)nam)->sin_addr,
+ NHR_NONE, 0, 0);
+ else if (nam->sa_family == AF_INET6)
+ nh = fib6_lookup_nh_ptr(0, &((struct sockaddr_in6 *)nam)->sin6_addr,
+ NHR_NONE, 0, 0);
+ if (nh == NULL) {
+ NET_EPOCH_EXIT(et);
return (EHOSTUNREACH);
+ }
- ifp = rt->rt_ifp;
+ ifp = nh->nh_ifp;
if (nam->sa_family == AF_INET && !(ifp->if_capenable & IFCAP_TOE4))
goto done;
@@ -85,9 +96,9 @@
tod = TOEDEV(ifp);
if (tod != NULL)
- error = tod->tod_connect(tod, so, rt, nam);
+ error = tod->tod_connect(tod, so, nh, nam);
done:
- RTFREE(rt);
+ NET_EPOCH_EXIT(et);
return (error);
}
Index: sys/netinet/tcp_output.c
===================================================================
--- sys/netinet/tcp_output.c
+++ sys/netinet/tcp_output.c
@@ -64,6 +64,7 @@
#include <net/if.h>
#include <net/route.h>
+#include <net/route/nhop.h>
#include <net/vnet.h>
#include <netinet/in.h>
@@ -1411,8 +1412,8 @@
((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0),
NULL, NULL, tp->t_inpcb);
- if (error == EMSGSIZE && tp->t_inpcb->inp_route6.ro_rt != NULL)
- mtu = tp->t_inpcb->inp_route6.ro_rt->rt_mtu;
+ if (error == EMSGSIZE && tp->t_inpcb->inp_route6.ro_nh != NULL)
+ mtu = tp->t_inpcb->inp_route6.ro_nh->nh_mtu;
}
#endif /* INET6 */
#if defined(INET) && defined(INET6)
@@ -1454,8 +1455,8 @@
((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0,
tp->t_inpcb);
- if (error == EMSGSIZE && tp->t_inpcb->inp_route.ro_rt != NULL)
- mtu = tp->t_inpcb->inp_route.ro_rt->rt_mtu;
+ if (error == EMSGSIZE && tp->t_inpcb->inp_route.ro_nh != NULL)
+ mtu = tp->t_inpcb->inp_route.ro_nh->nh_mtu;
}
#endif /* INET */
Index: sys/netinet/tcp_subr.c
===================================================================
--- sys/netinet/tcp_subr.c
+++ sys/netinet/tcp_subr.c
@@ -76,6 +76,7 @@
#include <vm/uma.h>
#include <net/route.h>
+#include <net/route/nhop.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/vnet.h>
@@ -2199,9 +2200,9 @@
if (tp->t_state == TCPS_ESTABLISHED &&
(error == EHOSTUNREACH || error == ENETUNREACH ||
error == EHOSTDOWN)) {
- if (inp->inp_route.ro_rt) {
- RTFREE(inp->inp_route.ro_rt);
- inp->inp_route.ro_rt = (struct rtentry *)NULL;
+ if (inp->inp_route.ro_nh) {
+ NH_FREE(inp->inp_route.ro_nh);
+ inp->inp_route.ro_nh = (struct nhop_object *)NULL;
}
return (inp);
} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
Index: sys/netinet/toecore.h
===================================================================
--- sys/netinet/toecore.h
+++ sys/netinet/toecore.h
@@ -41,6 +41,7 @@
struct tcphdr;
struct in_conninfo;
struct tcp_info;
+struct nhop_object;
struct ktls_session;
struct toedev {
@@ -51,7 +52,7 @@
* Active open. If a failure occurs, it is reported back by the driver
* via toe_connect_failed.
*/
- int (*tod_connect)(struct toedev *, struct socket *, struct rtentry *,
+ int (*tod_connect)(struct toedev *, struct socket *, struct nhop_object *,
struct sockaddr *);
/* Passive open. */
Index: sys/netinet/toecore.c
===================================================================
--- sys/netinet/toecore.c
+++ sys/netinet/toecore.c
@@ -77,7 +77,7 @@
static int
toedev_connect(struct toedev *tod __unused, struct socket *so __unused,
- struct rtentry *rt __unused, struct sockaddr *nam __unused)
+ struct nhop_object *nh __unused, struct sockaddr *nam __unused)
{
return (ENOTSUP);
Index: sys/netinet/udp_usrreq.c
===================================================================
--- sys/netinet/udp_usrreq.c
+++ sys/netinet/udp_usrreq.c
@@ -71,6 +71,7 @@
#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>
+#include <net/route/nhop.h>
#include <net/rss_config.h>
#include <netinet/in.h>
@@ -761,9 +762,9 @@
INP_WLOCK_ASSERT(inp);
if ((errno == EHOSTUNREACH || errno == ENETUNREACH ||
- errno == EHOSTDOWN) && inp->inp_route.ro_rt) {
- RTFREE(inp->inp_route.ro_rt);
- inp->inp_route.ro_rt = (struct rtentry *)NULL;
+ errno == EHOSTDOWN) && inp->inp_route.ro_nh) {
+ NH_FREE(inp->inp_route.ro_nh);
+ inp->inp_route.ro_nh = (struct nhop_object *)NULL;
}
inp->inp_socket->so_error = errno;
Index: sys/netinet6/icmp6.c
===================================================================
--- sys/netinet6/icmp6.c
+++ sys/netinet6/icmp6.c
@@ -93,6 +93,7 @@
#include <net/if_llatbl.h>
#include <net/if_types.h>
#include <net/route.h>
+#include <net/route/nhop.h>
#include <net/vnet.h>
#include <netinet/in.h>
@@ -2412,7 +2413,7 @@
}
void
-icmp6_redirect_output(struct mbuf *m0, struct rtentry *rt)
+icmp6_redirect_output(struct mbuf *m0, struct nhop_object *nh)
{
struct ifnet *ifp; /* my outgoing interface */
struct in6_addr *ifp_ll6;
@@ -2435,7 +2436,7 @@
goto fail;
/* sanity check */
- if (!m0 || !rt || !(rt->rt_flags & RTF_UP) || !(ifp = rt->rt_ifp))
+ if (!m0 || !nh || !(NH_IS_VALID(nh)) || !(ifp = nh->nh_ifp))
goto fail;
/*
@@ -2469,7 +2470,7 @@
m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
if (m == NULL)
goto fail;
- M_SETFIB(m, rt->rt_fibnum);
+ M_SETFIB(m, M_GETFIB(m0));
maxlen = M_TRAILINGSPACE(m);
maxlen = min(IPV6_MMTU, maxlen);
/* just for safety */
@@ -2491,9 +2492,9 @@
}
/* get ip6 linklocal address for the router. */
- if (rt->rt_gateway && (rt->rt_flags & RTF_GATEWAY)) {
+ if (nh->nh_flags & NHF_GATEWAY) {
struct sockaddr_in6 *sin6;
- sin6 = (struct sockaddr_in6 *)rt->rt_gateway;
+ sin6 = &nh->gw6_sa;
router_ll6 = &sin6->sin6_addr;
if (!IN6_IS_ADDR_LINKLOCAL(router_ll6))
router_ll6 = (struct in6_addr *)NULL;
@@ -2517,7 +2518,7 @@
nd_rd->nd_rd_type = ND_REDIRECT;
nd_rd->nd_rd_code = 0;
nd_rd->nd_rd_reserved = 0;
- if (rt->rt_flags & RTF_GATEWAY) {
+ if (nh->nh_flags & NHF_GATEWAY) {
/*
* nd_rd->nd_rd_target must be a link-local address in
* better router cases.
Index: sys/netinet6/in6.h
===================================================================
--- sys/netinet6/in6.h
+++ sys/netinet6/in6.h
@@ -375,8 +375,9 @@
* IP6 route structure
*/
#if __BSD_VISIBLE
+struct nhop_object;
struct route_in6 {
- struct rtentry *ro_rt;
+ struct nhop_object *ro_nh;
struct llentry *ro_lle;
/*
* ro_prepend and ro_plen are only used for bpf to pass in a
Index: sys/netinet6/in6_fib.h
===================================================================
--- sys/netinet6/in6_fib.h
+++ sys/netinet6/in6_fib.h
@@ -58,5 +58,14 @@
uint32_t scopeid, uint32_t flags, uint32_t flowid,
struct nhop6_extended *pnh6);
void fib6_free_nh_ext(uint32_t fibnum, struct nhop6_extended *pnh6);
+struct nhop_object *fib6_lookup_nh_ptr(uint32_t fibnum,
+ const struct in6_addr *dst6, uint32_t scopeid, uint32_t flags,
+ uint32_t flowid);
+int fib6_lookup_urpf(uint32_t fibnum, const struct in6_addr *dst6,
+ uint32_t scopeid, uint32_t flags, const struct ifnet *src_if);
+
+uint32_t fib6_calc_software_hash(const struct in6_addr *src,
+ const struct in6_addr *dst, unsigned short src_port, unsigned short dst_port,
+ char proto, uint32_t *phashtype);
#endif
Index: sys/netinet6/in6_fib.c
===================================================================
--- sys/netinet6/in6_fib.c
+++ sys/netinet6/in6_fib.c
@@ -34,6 +34,7 @@
#include "opt_inet6.h"
#include "opt_route.h"
#include "opt_mpath.h"
+#include "opt_route_mpath.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -50,12 +51,11 @@
#include <net/if_dl.h>
#include <net/route.h>
#include <net/route_var.h>
+#include <net/route/nhop.h>
+#include <net/route/shared.h>
+#include <net/toeplitz.h>
#include <net/vnet.h>
-#ifdef RADIX_MPATH
-#include <net/radix_mpath.h>
-#endif
-
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip_mroute.h>
@@ -64,98 +64,99 @@
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>
#include <netinet6/scope6_var.h>
+#include <net/route/rtentry_var.h>
#include <net/if_types.h>
#ifdef INET6
-static void fib6_rte_to_nh_extended(struct rtentry *rte,
+static void fib6_rte_to_nh_extended(const struct nhop_object *nh,
const struct in6_addr *dst, uint32_t flags, struct nhop6_extended *pnh6);
-static void fib6_rte_to_nh_basic(struct rtentry *rte, const struct in6_addr *dst,
+static void fib6_rte_to_nh_basic(const struct nhop_object *nh, const struct in6_addr *dst,
uint32_t flags, struct nhop6_basic *pnh6);
-static struct ifnet *fib6_get_ifaifp(struct rtentry *rte);
#define RNTORT(p) ((struct rtentry *)(p))
#define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa))
CHK_STRUCT_ROUTE_COMPAT(struct route_in6, ro_dst);
-/*
- * Gets real interface for the @rte.
- * Returns rt_ifp for !IFF_LOOPBACK routers.
- * Extracts "real" address interface from interface address
- * loopback routes.
- */
-static struct ifnet *
-fib6_get_ifaifp(struct rtentry *rte)
+#ifdef ROUTE_MPATH
+struct _hash_5tuple_ipv6 {
+ struct in6_addr src;
+ struct in6_addr dst;
+ unsigned short src_port;
+ unsigned short dst_port;
+ char proto;
+ char spare[3];
+};
+_Static_assert(sizeof(struct _hash_5tuple_ipv6) == 40,
+ "_hash_5tuple_ipv6 size is wrong");
+
+uint32_t
+fib6_calc_software_hash(const struct in6_addr *src, const struct in6_addr *dst,
+ unsigned short src_port, unsigned short dst_port, char proto,
+ uint32_t *phashtype)
{
- struct ifnet *ifp;
- struct sockaddr_dl *sdl;
+ struct _hash_5tuple_ipv6 data;
- ifp = rte->rt_ifp;
- if ((ifp->if_flags & IFF_LOOPBACK) &&
- rte->rt_gateway->sa_family == AF_LINK) {
- sdl = (struct sockaddr_dl *)rte->rt_gateway;
- return (ifnet_byindex(sdl->sdl_index));
- }
+ data.src = *src;
+ data.dst = *dst;
+ data.src_port = src_port;
+ data.dst_port = dst_port;
+ data.proto = proto;
+ data.spare[0] = data.spare[1] = data.spare[2] = 0;
- return (ifp);
+ *phashtype = M_HASHTYPE_OPAQUE_HASH;
+
+ return (toeplitz_hash(MPATH_ENTROPY_KEY_LEN, mpath_entropy_key,
+ sizeof(data), (uint8_t *)&data));
}
+#endif
static void
-fib6_rte_to_nh_basic(struct rtentry *rte, const struct in6_addr *dst,
+fib6_rte_to_nh_basic(const struct nhop_object *nh, const struct in6_addr *dst,
uint32_t flags, struct nhop6_basic *pnh6)
{
- struct sockaddr_in6 *gw;
/* Do explicit nexthop zero unless we're copying it */
memset(pnh6, 0, sizeof(*pnh6));
if ((flags & NHR_IFAIF) != 0)
- pnh6->nh_ifp = fib6_get_ifaifp(rte);
+ pnh6->nh_ifp = nh->nh_aifp;
else
- pnh6->nh_ifp = rte->rt_ifp;
+ pnh6->nh_ifp = nh->nh_ifp;
- pnh6->nh_mtu = min(rte->rt_mtu, IN6_LINKMTU(rte->rt_ifp));
- if (rte->rt_flags & RTF_GATEWAY) {
+ pnh6->nh_mtu = nh->nh_mtu;
+ if (nh->nh_flags & NHF_GATEWAY) {
/* Return address with embedded scope. */
- gw = (struct sockaddr_in6 *)rte->rt_gateway;
- pnh6->nh_addr = gw->sin6_addr;
+ pnh6->nh_addr = nh->gw6_sa.sin6_addr;
} else
pnh6->nh_addr = *dst;
/* Set flags */
- pnh6->nh_flags = fib_rte_to_nh_flags(rte->rt_flags);
- gw = (struct sockaddr_in6 *)rt_key(rte);
- if (IN6_IS_ADDR_UNSPECIFIED(&gw->sin6_addr))
- pnh6->nh_flags |= NHF_DEFAULT;
+ pnh6->nh_flags = nh->nh_flags;
}
static void
-fib6_rte_to_nh_extended(struct rtentry *rte, const struct in6_addr *dst,
+fib6_rte_to_nh_extended(const struct nhop_object *nh, const struct in6_addr *dst,
uint32_t flags, struct nhop6_extended *pnh6)
{
- struct sockaddr_in6 *gw;
/* Do explicit nexthop zero unless we're copying it */
memset(pnh6, 0, sizeof(*pnh6));
if ((flags & NHR_IFAIF) != 0)
- pnh6->nh_ifp = fib6_get_ifaifp(rte);
+ pnh6->nh_ifp = nh->nh_aifp;
else
- pnh6->nh_ifp = rte->rt_ifp;
+ pnh6->nh_ifp = nh->nh_ifp;
- pnh6->nh_mtu = min(rte->rt_mtu, IN6_LINKMTU(rte->rt_ifp));
- if (rte->rt_flags & RTF_GATEWAY) {
+ pnh6->nh_mtu = nh->nh_mtu;
+ if (nh->nh_flags & NHF_GATEWAY) {
/* Return address with embedded scope. */
- gw = (struct sockaddr_in6 *)rte->rt_gateway;
- pnh6->nh_addr = gw->sin6_addr;
+ pnh6->nh_addr = nh->gw6_sa.sin6_addr;
} else
pnh6->nh_addr = *dst;
/* Set flags */
- pnh6->nh_flags = fib_rte_to_nh_flags(rte->rt_flags);
- gw = (struct sockaddr_in6 *)rt_key(rte);
- if (IN6_IS_ADDR_UNSPECIFIED(&gw->sin6_addr))
- pnh6->nh_flags |= NHF_DEFAULT;
- pnh6->nh_ia = ifatoia6(rte->rt_ifa);
+ pnh6->nh_flags = nh->nh_flags;
+ pnh6->nh_ia = ifatoia6(nh->nh_ifa);
}
/*
@@ -180,7 +181,7 @@
struct rib_head *rh;
struct radix_node *rn;
struct sockaddr_in6 sin6;
- struct rtentry *rte;
+ struct nhop_object *nh;
KASSERT((fibnum < rt_numfibs), ("fib6_lookup_nh_basic: bad fibnum"));
rh = rt_tables_get_rnh(fibnum, AF_INET6);
@@ -198,10 +199,10 @@
RIB_RLOCK(rh);
rn = rh->rnh_matchaddr((void *)&sin6, &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- rte = RNTORT(rn);
+ nh = RT_SELECT_NHOP((RNTORT(rn)), flowid);
/* Ensure route & ifp is UP */
- if (RT_LINK_IS_UP(rte->rt_ifp)) {
- fib6_rte_to_nh_basic(rte, &sin6.sin6_addr, flags, pnh6);
+ if (RT_LINK_IS_UP(nh->nh_ifp)) {
+ fib6_rte_to_nh_basic(nh, &sin6.sin6_addr, flags, pnh6);
RIB_RUNLOCK(rh);
return (0);
}
@@ -230,7 +231,7 @@
struct rib_head *rh;
struct radix_node *rn;
struct sockaddr_in6 sin6;
- struct rtentry *rte;
+ struct nhop_object *nh;
KASSERT((fibnum < rt_numfibs), ("fib6_lookup_nh_ext: bad fibnum"));
rh = rt_tables_get_rnh(fibnum, AF_INET6);
@@ -248,17 +249,10 @@
RIB_RLOCK(rh);
rn = rh->rnh_matchaddr((void *)&sin6, &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- rte = RNTORT(rn);
-#ifdef RADIX_MPATH
- rte = rt_mpath_select(rte, flowid);
- if (rte == NULL) {
- RIB_RUNLOCK(rh);
- return (ENOENT);
- }
-#endif
+ nh = RT_SELECT_NHOP((RNTORT(rn)), flowid);
/* Ensure route & ifp is UP */
- if (RT_LINK_IS_UP(rte->rt_ifp)) {
- fib6_rte_to_nh_extended(rte, &sin6.sin6_addr, flags,
+ if (RT_LINK_IS_UP(nh->nh_ifp)) {
+ fib6_rte_to_nh_extended(nh, &sin6.sin6_addr, flags,
pnh6);
if ((flags & NHR_REF) != 0) {
/* TODO: Do lwref on egress ifp's */
@@ -277,6 +271,138 @@
fib6_free_nh_ext(uint32_t fibnum, struct nhop6_extended *pnh6)
{
+}
+
+/*
+ * Looks up the nexthop used to reach @dst6 in the AF_INET6 table of
+ * fib @fibnum.
+ * Assumes scope is deembedded and provided in @scopeid.
+ * @flowid is handed to RT_SELECT_NHOP() to pick the nexthop of the
+ * matched route.
+ *
+ * Returns the nexthop pointer if a non-root route matches and its
+ * interface passes the RT_LINK_IS_UP() check, NULL otherwise (in which
+ * case rts_unreach is incremented, unless the table itself is missing).
+ * A reference is taken on the returned nexthop only if @flags contains
+ * NHR_REF.
+ */
+struct nhop_object *
+fib6_lookup_nh_ptr(uint32_t fibnum, const struct in6_addr *dst6,
+    uint32_t scopeid, uint32_t flags, uint32_t flowid)
+{
+	RIB_RLOCK_TRACKER;
+	struct rib_head *rh;
+	struct radix_node *rn;
+	struct nhop_object *nh;
+	struct sockaddr_in6 sin6;
+
+	KASSERT((fibnum < rt_numfibs), ("%s: bad fibnum", __func__));
+	rh = rt_tables_get_rnh(fibnum, AF_INET6);
+	if (rh == NULL)
+		return (NULL);
+
+	/* TODO: radix changes */
+	/* Prepare lookup key */
+	memset(&sin6, 0, sizeof(sin6));
+	sin6.sin6_len = sizeof(struct sockaddr_in6);
+	sin6.sin6_addr = *dst6;
+
+	/* Assume scopeid is valid and embed it directly */
+	if (IN6_IS_SCOPE_LINKLOCAL(dst6))
+		sin6.sin6_addr.s6_addr16[1] = htons(scopeid & 0xffff);
+
+	RIB_RLOCK(rh);
+	rn = rh->rnh_matchaddr((void *)&sin6, &rh->head);
+	if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
+		nh = RT_SELECT_NHOP((RNTORT(rn)), flowid);
+		/* Ensure route & ifp is UP */
+		if (RT_LINK_IS_UP(nh->nh_ifp)) {
+			/* Take the reference while still under the RIB lock */
+			if (flags & NHR_REF)
+				nhop_ref_object(nh);
+			RIB_RUNLOCK(rh);
+			return (nh);
+		}
+	}
+	RIB_RUNLOCK(rh);
+
+	RTSTAT_INC(rts_unreach);
+	return (NULL);
+}
+
+/*
+ * Evaluates the reverse-path condition against nexthop @nh.
+ * With a non-NULL @src_if, succeeds if the nexthop (or, for a multipath
+ * group, any nexthop of the group) has @src_if as its nh_aifp.
+ * With a NULL @src_if, succeeds unless @flags has NHR_NODEFAULT set and
+ * the nexthop (first group member for multipath) carries NHF_DEFAULT.
+ *
+ * Returns 1 on success, 0 otherwise.
+ */
+static inline int
+check_urpf(const struct nhop_object *nh, uint32_t flags,
+    const struct ifnet *src_if)
+{
+#ifdef ROUTE_MPATH
+	if (NH_IS_MULTIPATH(nh)) {
+		const struct nhgrp_object *grp;
+
+		grp = (const struct nhgrp_object *)nh;
+		if (src_if != NULL) {
+			/* TODO: consider iterating control plane nhop list */
+			for (int idx = 0; idx < grp->mp_size; idx++) {
+				if (grp->nhops[idx]->nh_aifp == src_if)
+					return (1);
+			}
+			return (0);
+		}
+
+		/* No interface to match: judge by the first group member */
+		return ((flags & NHR_NODEFAULT) == 0 ||
+		    (grp->nhops[0]->nh_flags & NHF_DEFAULT) == 0);
+	}
+#endif
+
+	if (src_if != NULL)
+		return (nh->nh_aifp == src_if);
+
+	return ((flags & NHR_NODEFAULT) == 0 ||
+	    (nh->nh_flags & NHF_DEFAULT) == 0);
+}
+
+/*
+ * Performs reverse path forwarding lookup for @dst6 in fib @fibnum.
+ * Assumes scope is deembedded and provided in @scopeid.
+ * If @src_if is non-NULL, verifies that at least 1 path goes via
+ * this interface.
+ * If @src_if is NULL, verifies that route exists.
+ * If @flags contains NHR_NODEFAULT, do not consider default route
+ * (check_urpf() consults this flag only when @src_if is NULL).
+ *
+ * Returns 1 if route matching conditions is found, 0 otherwise.
+ */
+int
+fib6_lookup_urpf(uint32_t fibnum, const struct in6_addr *dst6,
+    uint32_t scopeid, uint32_t flags, const struct ifnet *src_if)
+{
+	RIB_RLOCK_TRACKER;
+	struct rib_head *rh;
+	struct radix_node *rn;
+	struct nhop_object *nh;
+	struct sockaddr_in6 sin6;
+	int ret;
+
+	KASSERT((fibnum < rt_numfibs), ("%s: bad fibnum", __func__));
+	rh = rt_tables_get_rnh(fibnum, AF_INET6);
+	if (rh == NULL)
+		return (0);
+
+	/*
+	 * Prepare lookup key.  rnh_matchaddr() is given a sockaddr_in6 with
+	 * sin6_len set, same as the other fib6 lookups in this file; a bare
+	 * in6_addr is not a valid key.
+	 */
+	memset(&sin6, 0, sizeof(sin6));
+	sin6.sin6_len = sizeof(struct sockaddr_in6);
+	sin6.sin6_addr = *dst6;
+
+	/* Assume scopeid is valid and embed it directly */
+	if (IN6_IS_SCOPE_LINKLOCAL(dst6))
+		sin6.sin6_addr.s6_addr16[1] = htons(scopeid & 0xffff);
+
+	RIB_RLOCK(rh);
+	rn = rh->rnh_matchaddr((void *)&sin6, &rh->head);
+	if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
+		/*
+		 * Use the route's nexthop object as-is (no per-flow
+		 * selection) so that check_urpf() can examine every path
+		 * of a multipath group.
+		 */
+		nh = (RNTORT(rn))->rt_nhop;
+		ret = check_urpf(nh, flags, src_if);
+		RIB_RUNLOCK(rh);
+		return (ret);
+	}
+	RIB_RUNLOCK(rh);
+
+	return (0);
}
#endif
Index: sys/netinet6/in6_pcb.c
===================================================================
--- sys/netinet6/in6_pcb.c
+++ sys/netinet6/in6_pcb.c
@@ -74,6 +74,7 @@
#include "opt_ipsec.h"
#include "opt_pcbgroup.h"
#include "opt_rss.h"
+#include "opt_route_mpath.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -97,6 +98,7 @@
#include <net/if_llatbl.h>
#include <net/if_types.h>
#include <net/route.h>
+#include <net/route/nhop.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
@@ -109,6 +111,7 @@
#include <netinet6/nd6.h>
#include <netinet/in_pcb.h>
#include <netinet6/in6_pcb.h>
+#include <netinet6/in6_fib.h>
#include <netinet6/scope6_var.h>
static struct inpcb *in6_pcblookup_hash_locked(struct inpcbinfo *,
@@ -417,10 +420,22 @@
struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
struct in6_addr addr6;
int error;
+#ifdef ROUTE_MPATH
+ uint32_t hash_val, hash_type;
+#endif
INP_WLOCK_ASSERT(inp);
INP_HASH_WLOCK_ASSERT(pcbinfo);
+#ifdef ROUTE_MPATH
+ if (V_fib_hash_outbound) {
+ hash_val = fib6_calc_software_hash(&inp->in6p_laddr,
+ &sin6->sin6_addr, 0, sin6->sin6_port,
+ inp->inp_socket->so_proto->pr_protocol, &hash_type);
+ inp->inp_flowid = hash_val;
+ inp->inp_flowtype = hash_type;
+ }
+#endif
/*
* Call inner routine, to assign local interface address.
* in6_pcbladdr() may automatically fill in sin6_scope_id.
Index: sys/netinet6/in6_rmx.c
===================================================================
--- sys/netinet6/in6_rmx.c
+++ sys/netinet6/in6_rmx.c
@@ -73,6 +73,7 @@
#include <sys/socketvar.h>
#include <sys/mbuf.h>
#include <sys/rwlock.h>
+#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/callout.h>
@@ -80,6 +81,8 @@
#include <net/if_var.h>
#include <net/route.h>
#include <net/route_var.h>
+#include <net/route/nhop.h>
+#include <net/route/rtentry_var.h>
#include <netinet/in.h>
#include <netinet/ip_var.h>
@@ -101,6 +104,40 @@
extern int in6_detachhead(void **head, int off);
#endif
+/*
+ * IPv6 hook run on a nexthop request before a route is added; installed
+ * as the rnh_preadd callback of the IPv6 routing table head.
+ *
+ * Adjusts @req in place:
+ *  - clamps req->mtu to the link MTU of req->ifp (or inherits it if unset),
+ *  - marks the nexthop with NHF_DEFAULT for a non-host route whose @mask
+ *    is the unspecified address,
+ *  - picks a nexthop type if the caller did not set one.
+ *
+ * @fibnum and @addr are currently unused.  Always returns 0.
+ */
+static int
+rib6_preadd(u_int fibnum, const struct sockaddr *addr, const struct sockaddr *mask,
+    struct nhop_request *req)
+{
+	const struct sockaddr_in6 *mask6;
+
+	/* XXX: RTF_LOCAL */
+
+	/*
+	 * Check route MTU:
+	 * inherit interface MTU if not set or
+	 * check if MTU is too large.
+	 */
+	if (req->mtu == 0 || req->mtu > IN6_LINKMTU(req->ifp))
+		req->mtu = IN6_LINKMTU(req->ifp);
+
+	/*
+	 * Ensure that default route nhop has special flag.
+	 * A request without a netmask cannot be the default route; do not
+	 * dereference a NULL @mask.
+	 */
+	mask6 = (const struct sockaddr_in6 *)mask;
+	if ((req->rt_flags & RTF_HOST) == 0 && mask6 != NULL &&
+	    IN6_IS_ADDR_UNSPECIFIED(&mask6->sin6_addr))
+		req->nh_flags_additional |= NHF_DEFAULT;
+
+	/* Set nexthop type */
+	if (req->nh_type == 0) {
+		if (req->rt_flags & RTF_GATEWAY)
+			req->nh_type = NH_TYPE_IPV6_ETHER_NHOP;
+		else
+			req->nh_type = NH_TYPE_IPV6_ETHER_RSLV;
+	}
+
+	return (0);
+}
+
+#if 0
/*
* Do what we need to do when inserting a route.
*/
@@ -151,6 +188,7 @@
return (rn_addroute(v_arg, n_arg, head, treenodes));
}
+#endif
/*
* Initialize our routing tree.
@@ -166,7 +204,7 @@
if (rh == NULL)
return (0);
- rh->rnh_addaddr = in6_addroute;
+ rh->rnh_preadd = rib6_preadd;
*head = (void *)rh;
return (1);
@@ -186,31 +224,4 @@
/*
* Extended API for IPv6 FIB support.
*/
-int
-in6_rtrequest(int req, struct sockaddr *dst, struct sockaddr *gw,
- struct sockaddr *mask, int flags, struct rtentry **ret_nrt, u_int fibnum)
-{
- return (rtrequest_fib(req, dst, gw, mask, flags, ret_nrt, fibnum));
-}
-
-void
-in6_rtalloc(struct route_in6 *ro, u_int fibnum)
-{
-
- rtalloc_ign_fib((struct route *)ro, 0ul, fibnum);
-}
-
-void
-in6_rtalloc_ign(struct route_in6 *ro, u_long ignflags, u_int fibnum)
-{
-
- rtalloc_ign_fib((struct route *)ro, ignflags, fibnum);
-}
-
-struct rtentry *
-in6_rtalloc1(struct sockaddr *dst, int report, u_long ignflags, u_int fibnum)
-{
-
- return (rtalloc1_fib(dst, report, ignflags, fibnum));
-}
Index: sys/netinet6/in6_src.c
===================================================================
--- sys/netinet6/in6_src.c
+++ sys/netinet6/in6_src.c
@@ -91,6 +91,7 @@
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/route.h>
+#include <net/route/nhop.h>
#include <net/if_llatbl.h>
#ifdef RADIX_MPATH
#include <net/radix_mpath.h>
@@ -134,7 +135,7 @@
static int selectroute(struct sockaddr_in6 *, struct ip6_pktopts *,
struct ip6_moptions *, struct route_in6 *, struct ifnet **,
- struct rtentry **, int, u_int);
+ struct nhop_object **, int, u_int, uint32_t);
static int in6_selectif(struct sockaddr_in6 *, struct ip6_pktopts *,
struct ip6_moptions *, struct ifnet **,
struct ifnet *, u_int);
@@ -625,11 +626,12 @@
static int
selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
struct ip6_moptions *mopts, struct route_in6 *ro,
- struct ifnet **retifp, struct rtentry **retrt, int norouteok, u_int fibnum)
+ struct ifnet **retifp, struct nhop_object **retnh, int norouteok,
+ u_int fibnum, uint32_t flowid)
{
int error = 0;
struct ifnet *ifp = NULL;
- struct rtentry *rt = NULL;
+ struct nhop_object *nh = NULL;
struct sockaddr_in6 *sin6_next;
struct in6_pktinfo *pi = NULL;
struct in6_addr *dst = &dstsock->sin6_addr;
@@ -654,7 +656,7 @@
/* XXX boundary check is assumed to be already done. */
ifp = ifnet_byindex(pi->ipi6_ifindex);
if (ifp != NULL &&
- (norouteok || retrt == NULL ||
+ (norouteok || retnh == NULL ||
IN6_IS_ADDR_MULTICAST(dst))) {
/*
* we do not have to check or get the route for
@@ -707,26 +709,31 @@
}
ron = &opts->ip6po_nextroute;
/* Use a cached route if it exists and is valid. */
- if (ron->ro_rt != NULL && (
- (ron->ro_rt->rt_flags & RTF_UP) == 0 ||
+ if (ron->ro_nh != NULL && (
+ !NH_IS_VALID(ron->ro_nh) ||
ron->ro_dst.sin6_family != AF_INET6 ||
!IN6_ARE_ADDR_EQUAL(&ron->ro_dst.sin6_addr,
&sin6_next->sin6_addr)))
- RO_RTFREE(ron);
- if (ron->ro_rt == NULL) {
+ RO_NHFREE(ron);
+ if (ron->ro_nh == NULL) {
ron->ro_dst = *sin6_next;
- in6_rtalloc(ron, fibnum); /* multi path case? */
+ /*
+ * sin6_next is not link-local OR scopeid is 0,
+ * no need to clear scope
+ */
+ ron->ro_nh = fib6_lookup_nh_ptr(fibnum,
+ &sin6_next->sin6_addr, 0, NHR_REF, flowid);
}
/*
* The node identified by that address must be a
* neighbor of the sending host.
*/
- if (ron->ro_rt == NULL ||
- (ron->ro_rt->rt_flags & RTF_GATEWAY) != 0)
+ if (ron->ro_nh == NULL ||
+ (ron->ro_nh->nh_flags & NHF_GATEWAY) != 0)
error = EHOSTUNREACH;
else {
- rt = ron->ro_rt;
- ifp = rt->rt_ifp;
+ nh = ron->ro_nh;
+ ifp = nh->nh_ifp;
}
goto done;
}
@@ -737,15 +744,14 @@
* cached destination, in case of sharing the cache with IPv4.
*/
if (ro) {
- if (ro->ro_rt &&
- (!(ro->ro_rt->rt_flags & RTF_UP) ||
+ if (ro->ro_nh &&
+ (!NH_IS_VALID(ro->ro_nh) ||
((struct sockaddr *)(&ro->ro_dst))->sa_family != AF_INET6 ||
!IN6_ARE_ADDR_EQUAL(&satosin6(&ro->ro_dst)->sin6_addr,
dst))) {
- RTFREE(ro->ro_rt);
- ro->ro_rt = (struct rtentry *)NULL;
+ RO_NHFREE(ro);
}
- if (ro->ro_rt == (struct rtentry *)NULL) {
+ if (ro->ro_nh == (struct nhop_object *)NULL) {
struct sockaddr_in6 *sa6;
/* No route yet, so try to acquire one */
@@ -754,15 +760,28 @@
*sa6 = *dstsock;
sa6->sin6_scope_id = 0;
+ /*
+ * Currently dst has scopeid embedded.
+ * New routing API accepts scopeid as a separate argument.
+ * Convert dst before/after doing lookup
+ */
+ uint32_t scopeid = 0;
+ if (IN6_IS_SCOPE_LINKLOCAL(&sa6->sin6_addr)) {
+ /* Unwrap in6_getscope() and in6_clearscope() */
+ scopeid = ntohs(sa6->sin6_addr.s6_addr16[1]);
+ sa6->sin6_addr.s6_addr16[1] = 0;
+
+ }
+
#ifdef RADIX_MPATH
rtalloc_mpath_fib((struct route *)ro,
ntohl(sa6->sin6_addr.s6_addr32[3]), fibnum);
#else
- ro->ro_rt = in6_rtalloc1((struct sockaddr *)
- &ro->ro_dst, 0, 0UL, fibnum);
- if (ro->ro_rt)
- RT_UNLOCK(ro->ro_rt);
+ ro->ro_nh = fib6_lookup_nh_ptr(fibnum,
+ &sa6->sin6_addr, scopeid, NHR_REF, flowid);
#endif
+ if (IN6_IS_SCOPE_LINKLOCAL(&sa6->sin6_addr))
+ sa6->sin6_addr.s6_addr16[1] = htons(scopeid);
}
/*
@@ -772,17 +791,11 @@
if (opts && opts->ip6po_nexthop)
goto done;
- if (ro->ro_rt) {
- ifp = ro->ro_rt->rt_ifp;
-
- if (ifp == NULL) { /* can this really happen? */
- RTFREE(ro->ro_rt);
- ro->ro_rt = NULL;
- }
- }
- if (ro->ro_rt == NULL)
+ if (ro->ro_nh)
+ ifp = ro->ro_nh->nh_ifp;
+ else
error = EHOSTUNREACH;
- rt = ro->ro_rt;
+ nh = ro->ro_nh;
/*
* Check if the outgoing interface conflicts with
@@ -803,7 +816,7 @@
}
done:
- if (ifp == NULL && rt == NULL) {
+ if (ifp == NULL && nh == NULL) {
/*
* This can happen if the caller did not pass a cached route
* nor any other hints. We treat this case an error.
@@ -814,26 +827,14 @@
IP6STAT_INC(ip6s_noroute);
if (retifp != NULL) {
- *retifp = ifp;
-
- /*
- * Adjust the "outgoing" interface. If we're going to loop
- * the packet back to ourselves, the ifp would be the loopback
- * interface. However, we'd rather know the interface associated
- * to the destination address (which should probably be one of
- * our own addresses.)
- */
- if (rt) {
- if ((rt->rt_ifp->if_flags & IFF_LOOPBACK) &&
- (rt->rt_gateway->sa_family == AF_LINK))
- *retifp =
- ifnet_byindex(((struct sockaddr_dl *)
- rt->rt_gateway)->sdl_index);
- }
+ if (nh != NULL)
+ *retifp = nh->nh_aifp;
+ else
+ *retifp = ifp;
}
- if (retrt != NULL)
- *retrt = rt; /* rt may be NULL */
+ if (retnh != NULL)
+ *retnh = nh; /* nh may be NULL */
return (error);
}
@@ -845,20 +846,20 @@
{
int error;
struct route_in6 sro;
- struct rtentry *rt = NULL;
- int rt_flags;
+ struct nhop_object *nh = NULL;
+ uint16_t nh_flags;
KASSERT(retifp != NULL, ("%s: retifp is NULL", __func__));
bzero(&sro, sizeof(sro));
- rt_flags = 0;
+ nh_flags = 0;
- error = selectroute(dstsock, opts, mopts, &sro, retifp, &rt, 1, fibnum);
+ error = selectroute(dstsock, opts, mopts, &sro, retifp, &nh, 1, fibnum, 0);
- if (rt)
- rt_flags = rt->rt_flags;
- if (rt && rt == sro.ro_rt)
- RTFREE(rt);
+ if (nh != NULL)
+ nh_flags = nh->nh_flags;
+ if (nh != NULL && nh == sro.ro_nh)
+ NH_FREE(nh);
if (error != 0) {
/* Help ND. See oifp comment in in6_selectsrc(). */
@@ -887,8 +888,8 @@
* We thus reject the case here.
*/
- if (rt_flags & (RTF_REJECT | RTF_BLACKHOLE)) {
- error = (rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH);
+ if (nh_flags & (NHF_REJECT | NHF_BLACKHOLE)) {
+ error = (nh_flags & NHF_HOST ? EHOSTUNREACH : ENETUNREACH);
return (error);
}
@@ -899,11 +900,11 @@
int
in6_selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
struct ip6_moptions *mopts, struct route_in6 *ro,
- struct ifnet **retifp, struct rtentry **retrt, u_int fibnum)
+ struct ifnet **retifp, struct nhop_object **retnh, u_int fibnum, uint32_t flowid)
{
return (selectroute(dstsock, opts, mopts, ro, retifp,
- retrt, 0, fibnum));
+ retnh, 0, fibnum, flowid));
}
/*
Index: sys/netinet6/in6_var.h
===================================================================
--- sys/netinet6/in6_var.h
+++ sys/netinet6/in6_var.h
@@ -915,11 +915,6 @@
* Extended API for IPv6 FIB support.
*/
struct mbuf *ip6_tryforward(struct mbuf *);
-int in6_rtrequest(int, struct sockaddr *, struct sockaddr *,
- struct sockaddr *, int, struct rtentry **, u_int);
-void in6_rtalloc(struct route_in6 *, u_int);
-void in6_rtalloc_ign(struct route_in6 *, u_long, u_int);
-struct rtentry *in6_rtalloc1(struct sockaddr *, int, u_long, u_int);
#endif /* _KERNEL */
#endif /* _NETINET6_IN6_VAR_H_ */
Index: sys/netinet6/ip6_fastfwd.c
===================================================================
--- sys/netinet6/ip6_fastfwd.c
+++ sys/netinet6/ip6_fastfwd.c
@@ -40,6 +40,7 @@
#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>
+#include <net/route/nhop.h>
#include <net/pfil.h>
#include <net/vnet.h>
@@ -55,30 +56,35 @@
#include <netinet6/nd6.h>
static int
-ip6_findroute(struct nhop6_basic *pnh, const struct sockaddr_in6 *dst,
+ip6_findroute(struct nhop_object **pnh, const struct sockaddr_in6 *dst,
struct mbuf *m)
{
+ struct nhop_object *nh;
- if (fib6_lookup_nh_basic(M_GETFIB(m), &dst->sin6_addr,
- dst->sin6_scope_id, 0, dst->sin6_flowinfo, pnh) != 0) {
+ nh = fib6_lookup_nh_ptr(M_GETFIB(m), &dst->sin6_addr,
+ dst->sin6_scope_id, NHR_NONE, m->m_pkthdr.flowid);
+ if (nh == NULL) {
IP6STAT_INC(ip6s_noroute);
IP6STAT_INC(ip6s_cantforward);
icmp6_error(m, ICMP6_DST_UNREACH,
ICMP6_DST_UNREACH_NOROUTE, 0);
return (EHOSTUNREACH);
}
- if (pnh->nh_flags & NHF_BLACKHOLE) {
+ if (nh->nh_flags & NHF_BLACKHOLE) {
IP6STAT_INC(ip6s_cantforward);
m_freem(m);
return (EHOSTUNREACH);
}
- if (pnh->nh_flags & NHF_REJECT) {
+ if (nh->nh_flags & NHF_REJECT) {
IP6STAT_INC(ip6s_cantforward);
icmp6_error(m, ICMP6_DST_UNREACH,
ICMP6_DST_UNREACH_REJECT, 0);
return (EHOSTUNREACH);
}
+
+ *pnh = nh;
+
return (0);
}
@@ -86,7 +92,7 @@
ip6_tryforward(struct mbuf *m)
{
struct sockaddr_in6 dst;
- struct nhop6_basic nh;
+ struct nhop_object *nh;
struct m_tag *fwd_tag;
struct ip6_hdr *ip6;
struct ifnet *rcvif;
@@ -196,9 +202,9 @@
goto dropin;
}
if (!PFIL_HOOKED_OUT(V_inet6_pfil_head)) {
- if (m->m_pkthdr.len > nh.nh_mtu) {
- in6_ifstat_inc(nh.nh_ifp, ifs6_in_toobig);
- icmp6_error(m, ICMP6_PACKET_TOO_BIG, 0, nh.nh_mtu);
+ if (m->m_pkthdr.len > nh->nh_mtu) {
+ in6_ifstat_inc(nh->nh_ifp, ifs6_in_toobig);
+ icmp6_error(m, ICMP6_PACKET_TOO_BIG, 0, nh->nh_mtu);
m = NULL;
goto dropout;
}
@@ -208,7 +214,7 @@
/*
* Outgoing packet firewall processing.
*/
- if (pfil_run_hooks(V_inet6_pfil_head, &m, nh.nh_ifp, PFIL_OUT |
+ if (pfil_run_hooks(V_inet6_pfil_head, &m, nh->nh_ifp, PFIL_OUT |
PFIL_FWD, NULL) != PFIL_PASS)
goto dropout;
@@ -216,9 +222,9 @@
* We used slow path processing for packets with scoped addresses.
* So, scope checks aren't needed here.
*/
- if (m->m_pkthdr.len > nh.nh_mtu) {
- in6_ifstat_inc(nh.nh_ifp, ifs6_in_toobig);
- icmp6_error(m, ICMP6_PACKET_TOO_BIG, 0, nh.nh_mtu);
+ if (m->m_pkthdr.len > nh->nh_mtu) {
+ in6_ifstat_inc(nh->nh_ifp, ifs6_in_toobig);
+ icmp6_error(m, ICMP6_PACKET_TOO_BIG, 0, nh->nh_mtu);
m = NULL;
goto dropout;
}
@@ -272,16 +278,17 @@
}
m_clrprotoflags(m); /* Avoid confusing lower layers. */
- IP_PROBE(send, NULL, NULL, ip6, nh.nh_ifp, NULL, ip6);
+ IP_PROBE(send, NULL, NULL, ip6, nh->nh_ifp, NULL, ip6);
- dst.sin6_addr = nh.nh_addr;
- error = (*nh.nh_ifp->if_output)(nh.nh_ifp, m,
+ if (nh->nh_flags & NHF_GATEWAY)
+ dst.sin6_addr = nh->gw6_sa.sin6_addr;
+ error = (*nh->nh_ifp->if_output)(nh->nh_ifp, m,
(struct sockaddr *)&dst, NULL);
if (error != 0) {
- in6_ifstat_inc(nh.nh_ifp, ifs6_out_discard);
+ in6_ifstat_inc(nh->nh_ifp, ifs6_out_discard);
IP6STAT_INC(ip6s_cantforward);
} else {
- in6_ifstat_inc(nh.nh_ifp, ifs6_out_forward);
+ in6_ifstat_inc(nh->nh_ifp, ifs6_out_forward);
IP6STAT_INC(ip6s_forward);
}
return (NULL);
@@ -289,7 +296,7 @@
in6_ifstat_inc(rcvif, ifs6_in_discard);
goto drop;
dropout:
- in6_ifstat_inc(nh.nh_ifp, ifs6_out_discard);
+ in6_ifstat_inc(nh->nh_ifp, ifs6_out_discard);
drop:
if (m != NULL)
m_freem(m);
Index: sys/netinet6/ip6_forward.c
===================================================================
--- sys/netinet6/ip6_forward.c
+++ sys/netinet6/ip6_forward.c
@@ -56,6 +56,7 @@
#include <net/if_var.h>
#include <net/netisr.h>
#include <net/route.h>
+#include <net/route/nhop.h>
#include <net/pfil.h>
#include <netinet/in.h>
@@ -65,6 +66,7 @@
#include <netinet/ip_var.h>
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
+#include <netinet6/in6_fib.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#include <netinet/icmp6.h>
@@ -90,14 +92,13 @@
ip6_forward(struct mbuf *m, int srcrt)
{
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
- struct sockaddr_in6 *dst = NULL;
- struct rtentry *rt = NULL;
- struct route_in6 rin6;
+ struct sockaddr_in6 dst;
+ struct nhop_object *nh = NULL;
int error, type = 0, code = 0;
struct mbuf *mcopy = NULL;
struct ifnet *origifp; /* maybe unnecessary */
u_int32_t inzone, outzone;
- struct in6_addr src_in6, dst_in6, odst;
+ struct in6_addr odst;
struct m_tag *fwd_tag;
char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
@@ -165,18 +166,27 @@
/* No IPsec processing required */
}
#endif
+ /*
+ * ip6_forward() operates with IPv6 addresses with deembedded scope.
+ *
+ * There are 3 sources of IPv6 destination address:
+ *
+ * 1) ip6_input(), where ip6_dst contains deembedded address.
+ * In order to deal with forwarding of link-local packets,
+ * calculate the scope based on input interface (RFC 4007, clause 9).
+ * 2) packet filters changing ip6_dst directly. It would embed scope
+ * for LL addresses, so in6_localip() performs properly.
+ * 3) packet filters attaching PACKET_TAG_IPFORWARD would embed
+ * scope for the nexthop.
+ */
+ bzero(&dst, sizeof(struct sockaddr_in6));
+ dst.sin6_family = AF_INET6;
+ dst.sin6_addr = ip6->ip6_dst;
+ dst.sin6_scope_id = in6_get_unicast_scopeid(&ip6->ip6_dst, m->m_pkthdr.rcvif);
again:
- bzero(&rin6, sizeof(struct route_in6));
- dst = (struct sockaddr_in6 *)&rin6.ro_dst;
- dst->sin6_len = sizeof(struct sockaddr_in6);
- dst->sin6_family = AF_INET6;
- dst->sin6_addr = ip6->ip6_dst;
-again2:
- rin6.ro_rt = in6_rtalloc1((struct sockaddr *)dst, 0, 0, M_GETFIB(m));
- rt = rin6.ro_rt;
- if (rin6.ro_rt != NULL)
- RT_UNLOCK(rin6.ro_rt);
- else {
+ nh = fib6_lookup_nh_ptr(M_GETFIB(m), &dst.sin6_addr, dst.sin6_scope_id,
+ NHR_REF, m->m_pkthdr.flowid);
+ if (nh == NULL) {
IP6STAT_INC(ip6s_noroute);
in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_noroute);
if (mcopy) {
@@ -195,8 +205,10 @@
* will possibly modify its first argument.
* [draft-ietf-ipngwg-icmp-v3-04.txt, Section 3.1]
*/
- src_in6 = ip6->ip6_src;
- if (in6_setscope(&src_in6, rt->rt_ifp, &outzone)) {
+ outzone = in6_get_unicast_scopeid(&ip6->ip6_src, nh->nh_ifp);
+ inzone = in6_get_unicast_scopeid(&ip6->ip6_src, m->m_pkthdr.rcvif);
+#if 0
+ if (in6_setscope(&src_in6, nh->nh_ifp, &outzone)) {
/* XXX: this should not happen */
IP6STAT_INC(ip6s_cantforward);
IP6STAT_INC(ip6s_badscope);
@@ -207,10 +219,11 @@
IP6STAT_INC(ip6s_badscope);
goto bad;
}
+#endif
if (inzone != outzone) {
IP6STAT_INC(ip6s_cantforward);
IP6STAT_INC(ip6s_badscope);
- in6_ifstat_inc(rt->rt_ifp, ifs6_in_discard);
+ in6_ifstat_inc(nh->nh_ifp, ifs6_in_discard);
if (V_ip6_log_time + V_ip6_log_interval < time_uptime) {
V_ip6_log_time = time_uptime;
@@ -220,7 +233,7 @@
ip6_sprintf(ip6bufs, &ip6->ip6_src),
ip6_sprintf(ip6bufd, &ip6->ip6_dst),
ip6->ip6_nxt,
- if_name(m->m_pkthdr.rcvif), if_name(rt->rt_ifp));
+ if_name(m->m_pkthdr.rcvif), if_name(nh->nh_ifp));
}
if (mcopy)
icmp6_error(mcopy, ICMP6_DST_UNREACH,
@@ -235,17 +248,21 @@
* we need an explicit check because we may mistakenly forward the
* packet to a different zone by (e.g.) a default route.
*/
- dst_in6 = ip6->ip6_dst;
- if (in6_setscope(&dst_in6, m->m_pkthdr.rcvif, &inzone) != 0 ||
- in6_setscope(&dst_in6, rt->rt_ifp, &outzone) != 0 ||
- inzone != outzone) {
+ inzone = in6_get_unicast_scopeid(&ip6->ip6_dst, m->m_pkthdr.rcvif);
+ outzone = in6_get_unicast_scopeid(&ip6->ip6_dst, nh->nh_ifp);
+
+ if (inzone != outzone) {
IP6STAT_INC(ip6s_cantforward);
IP6STAT_INC(ip6s_badscope);
goto bad;
}
- if (rt->rt_flags & RTF_GATEWAY)
- dst = (struct sockaddr_in6 *)rt->rt_gateway;
+ if (nh->nh_flags & NHF_GATEWAY) {
+ /* Store gateway address in deembedded form */
+ dst.sin6_addr = nh->gw6_sa.sin6_addr;
+ dst.sin6_scope_id = ntohs(in6_getscope(&dst.sin6_addr));
+ in6_clearscope(&dst.sin6_addr);
+ }
/*
* If we are to forward the packet using the same interface
@@ -256,9 +273,9 @@
* Also, don't send redirect if forwarding using a route
* modified by a redirect.
*/
- if (V_ip6_sendredirects && rt->rt_ifp == m->m_pkthdr.rcvif && !srcrt &&
- (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0) {
- if ((rt->rt_ifp->if_flags & IFF_POINTOPOINT) != 0) {
+ if (V_ip6_sendredirects && nh->nh_ifp == m->m_pkthdr.rcvif && !srcrt &&
+ (nh->nh_flags & NHF_REDIRECT) == 0) {
+ if ((nh->nh_ifp->if_flags & IFF_POINTOPOINT) != 0) {
/*
* If the incoming interface is equal to the outgoing
* one, and the link attached to the interface is
@@ -284,7 +301,7 @@
* link identifiers, we can do this stuff after making a copy for
* returning an error.
*/
- if ((rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) {
+ if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) {
/*
* See corresponding comments in ip6_output.
* XXX: but is it possible that ip6_forward() sends a packet
@@ -305,14 +322,14 @@
ip6_sprintf(ip6bufs, &ip6->ip6_src),
ip6_sprintf(ip6bufd, &ip6->ip6_dst),
ip6->ip6_nxt, if_name(m->m_pkthdr.rcvif),
- if_name(rt->rt_ifp));
+ if_name(nh->nh_ifp));
}
/* we can just use rcvif in forwarding. */
origifp = m->m_pkthdr.rcvif;
}
else
- origifp = rt->rt_ifp;
+ origifp = nh->nh_ifp;
/*
* clear embedded scope identifiers if necessary.
* in6_clearscope will touch the addresses only when necessary.
@@ -326,7 +343,7 @@
odst = ip6->ip6_dst;
/* Run through list of hooks for forwarded packets. */
- if (pfil_run_hooks(V_inet6_pfil_head, &m, rt->rt_ifp, PFIL_OUT |
+ if (pfil_run_hooks(V_inet6_pfil_head, &m, nh->nh_ifp, PFIL_OUT |
PFIL_FWD, NULL) != PFIL_PASS)
goto freecopy;
ip6 = mtod(m, struct ip6_hdr *);
@@ -338,7 +355,12 @@
if (in6_localip(&ip6->ip6_dst))
m->m_flags |= M_FASTFWD_OURS;
else {
- RTFREE(rt);
+ NH_FREE(nh);
+
+ /* Update address and scopeid. Assume scope is embedded */
+ dst.sin6_scope_id = ntohs(in6_getscope(&ip6->ip6_dst));
+ dst.sin6_addr = ip6->ip6_dst;
+ in6_clearscope(&dst.sin6_addr);
goto again; /* Redo the routing table lookup. */
}
}
@@ -362,32 +384,43 @@
/* Or forward to some other address? */
if ((m->m_flags & M_IP6_NEXTHOP) &&
(fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
- dst = (struct sockaddr_in6 *)&rin6.ro_dst;
- bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in6));
+ struct sockaddr_in6 *gw6 = (struct sockaddr_in6 *)(fwd_tag + 1);
+
+ /* Update address and scopeid. Assume scope is embedded */
+ dst.sin6_scope_id = ntohs(in6_getscope(&gw6->sin6_addr));
+ dst.sin6_addr = gw6->sin6_addr;
+ in6_clearscope(&dst.sin6_addr);
+
m->m_flags |= M_SKIP_FIREWALL;
m->m_flags &= ~M_IP6_NEXTHOP;
m_tag_delete(m, fwd_tag);
- RTFREE(rt);
- goto again2;
+ NH_FREE(nh);
+ goto again;
}
pass:
/* See if the size was changed by the packet filter. */
- if (m->m_pkthdr.len > IN6_LINKMTU(rt->rt_ifp)) {
- in6_ifstat_inc(rt->rt_ifp, ifs6_in_toobig);
+ /* TODO: change to nh->nh_mtu */
+ if (m->m_pkthdr.len > IN6_LINKMTU(nh->nh_ifp)) {
+ in6_ifstat_inc(nh->nh_ifp, ifs6_in_toobig);
if (mcopy)
icmp6_error(mcopy, ICMP6_PACKET_TOO_BIG, 0,
- IN6_LINKMTU(rt->rt_ifp));
+ IN6_LINKMTU(nh->nh_ifp));
goto bad;
}
- error = nd6_output_ifp(rt->rt_ifp, origifp, m, dst, NULL);
+ /* The LLE layer currently stores IPv6 addresses with the scope embedded, so re-embed it for link-local destinations */
+ if (IN6_IS_SCOPE_LINKLOCAL(&dst.sin6_addr)) {
+ in6_set_unicast_scopeid(&dst.sin6_addr, dst.sin6_scope_id);
+ dst.sin6_scope_id = 0;
+ }
+ error = nd6_output_ifp(nh->nh_ifp, origifp, m, &dst, NULL);
if (error) {
- in6_ifstat_inc(rt->rt_ifp, ifs6_out_discard);
+ in6_ifstat_inc(nh->nh_ifp, ifs6_out_discard);
IP6STAT_INC(ip6s_cantforward);
} else {
IP6STAT_INC(ip6s_forward);
- in6_ifstat_inc(rt->rt_ifp, ifs6_out_forward);
+ in6_ifstat_inc(nh->nh_ifp, ifs6_out_forward);
if (type)
IP6STAT_INC(ip6s_redirectsent);
else {
@@ -401,7 +434,7 @@
switch (error) {
case 0:
if (type == ND_REDIRECT) {
- icmp6_redirect_output(mcopy, rt);
+ icmp6_redirect_output(mcopy, nh);
goto out;
}
goto freecopy;
@@ -432,6 +465,6 @@
bad:
m_freem(m);
out:
- if (rt != NULL)
- RTFREE(rt);
+ if (nh != NULL)
+ NH_FREE(nh);
}
Index: sys/netinet6/ip6_output.c
===================================================================
--- sys/netinet6/ip6_output.c
+++ sys/netinet6/ip6_output.c
@@ -95,6 +95,7 @@
#include <net/if_llatbl.h>
#include <net/netisr.h>
#include <net/route.h>
+#include <net/route/nhop.h>
#include <net/pfil.h>
#include <net/rss_config.h>
#include <net/vnet.h>
@@ -403,18 +404,15 @@
* This function may modify ver and hlim only.
* The mbuf chain containing the packet will be freed.
* The mbuf opt, if present, will not be freed.
- * If route_in6 ro is present and has ro_rt initialized, route lookup would be
- * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL,
- * then result of route lookup is stored in ro->ro_rt.
+ * If route_in6 ro is present and has ro_nh initialized, route lookup would be
+ * skipped and ro->ro_nh would be used. If ro is present but ro->ro_nh is NULL,
+ * then result of route lookup is stored in ro->ro_nh.
*
* Type of "mtu": rt_mtu is u_long, ifnet.ifr_mtu is int, and nd_ifinfo.linkmtu
* is uint32_t. So we use u_long to hold largest one, which is rt_mtu.
*
* ifpp - XXX: just for statistics
*/
-/*
- * XXX TODO: no flowid is assigned for outbound flows?
- */
int
ip6_output(struct mbuf *m0, struct ip6_pktopts *opt,
struct route_in6 *ro, int flags, struct ip6_moptions *im6o,
@@ -425,7 +423,7 @@
struct mbuf *m = m0;
struct mbuf *mprev;
struct route_in6 *ro_pmtu;
- struct rtentry *rt;
+ struct nhop_object *nh;
struct sockaddr_in6 *dst, sin6, src_sa, dst_sa;
struct in6_addr odst;
u_char *nexthdrp;
@@ -666,7 +664,7 @@
ip6->ip6_hlim = V_ip6_defmcasthlim;
}
- if (ro == NULL || ro->ro_rt == NULL) {
+ if (ro == NULL || ro->ro_nh == NULL) {
bzero(dst, sizeof(*dst));
dst->sin6_family = AF_INET6;
dst->sin6_len = sizeof(*dst);
@@ -676,29 +674,26 @@
* Validate route against routing table changes.
* Make sure that the address family is set in route.
*/
- rt = NULL;
+ nh = NULL;
ifp = NULL;
mtu = 0;
if (ro != NULL) {
- if (ro->ro_rt != NULL && inp != NULL) {
+ if (ro->ro_nh != NULL && inp != NULL) {
ro->ro_dst.sin6_family = AF_INET6; /* XXX KASSERT? */
- RT_VALIDATE((struct route *)ro, &inp->inp_rt_cookie,
+ NH_VALIDATE((struct route *)ro, &inp->inp_rt_cookie,
fibnum);
}
- if (ro->ro_rt != NULL && fwd_tag == NULL &&
- ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
- ro->ro_rt->rt_ifp == NULL ||
- !RT_LINK_IS_UP(ro->ro_rt->rt_ifp) ||
+ if (ro->ro_nh != NULL && fwd_tag == NULL &&
+ (!NH_IS_VALID(ro->ro_nh) ||
ro->ro_dst.sin6_family != AF_INET6 ||
!IN6_ARE_ADDR_EQUAL(&ro->ro_dst.sin6_addr, &ip6->ip6_dst)))
RO_INVALIDATE_CACHE(ro);
- if (ro->ro_rt != NULL && fwd_tag == NULL &&
- (ro->ro_rt->rt_flags & RTF_UP) &&
+ if (ro->ro_nh != NULL && fwd_tag == NULL &&
ro->ro_dst.sin6_family == AF_INET6 &&
IN6_ARE_ADDR_EQUAL(&ro->ro_dst.sin6_addr, &ip6->ip6_dst)) {
- rt = ro->ro_rt;
- ifp = ro->ro_rt->rt_ifp;
+ nh = ro->ro_nh;
+ ifp = nh->nh_ifp;
} else {
if (ro->ro_lle)
LLE_FREE(ro->ro_lle); /* zeros ro_lle */
@@ -710,7 +705,7 @@
dst_sa.sin6_addr = ip6->ip6_dst;
}
error = in6_selectroute(&dst_sa, opt, im6o, ro, &ifp,
- &rt, fibnum);
+ &nh, fibnum, m->m_pkthdr.flowid);
if (error != 0) {
IP6STAT_INC(ip6s_noroute);
if (ifp != NULL)
@@ -720,17 +715,17 @@
if (ifp != NULL)
mtu = ifp->if_mtu;
}
- if (rt == NULL) {
+ if (nh == NULL) {
/*
- * If in6_selectroute() does not return a route entry
+ * If in6_selectroute() does not return a nexthop,
* dst may not have been updated.
*/
*dst = dst_sa; /* XXX */
} else {
- if (rt->rt_flags & RTF_HOST)
- mtu = rt->rt_mtu;
- ia = (struct in6_ifaddr *)(rt->rt_ifa);
- counter_u64_add(rt->rt_pksent, 1);
+ if (nh->nh_flags & NHF_HOST)
+ mtu = nh->nh_mtu;
+ ia = (struct in6_ifaddr *)(nh->nh_ifa);
+ counter_u64_add(nh->nh_pksent, 1);
}
} else {
struct nhop6_extended nh6;
@@ -763,8 +758,8 @@
}
}
- error = fib6_lookup_nh_ext(fibnum, &kdst, scopeid, NHR_REF, 0,
- &nh6);
+ error = fib6_lookup_nh_ext(fibnum, &kdst, scopeid, NHR_REF,
+ m->m_pkthdr.flowid, &nh6);
if (error != 0) {
IP6STAT_INC(ip6s_noroute);
/* No ifp in6_ifstat_inc(ifp, ifs6_out_discard); */
@@ -781,7 +776,7 @@
;
}
- /* Then rt (for unicast) and ifp must be non-NULL valid values. */
+ /* Then nh (for unicast) and ifp must be non-NULL valid values. */
if ((flags & IPV6_FORWARDING) == 0) {
/* XXX: the FORWARDING flag can be set for mrouting. */
in6_ifstat_inc(ifp, ifs6_out_request);
@@ -852,8 +847,8 @@
}
/* All scope ID checks are successful. */
- if (rt && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
- if (opt && opt->ip6po_nextroute.ro_rt) {
+ if (nh && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
+ if (opt && opt->ip6po_nextroute.ro_nh) {
/*
* The nexthop is explicitly specified by the
* application. We assume the next hop is an IPv6
@@ -861,8 +856,8 @@
*/
dst = (struct sockaddr_in6 *)opt->ip6po_nexthop;
}
- else if ((rt->rt_flags & RTF_GATEWAY))
- dst = (struct sockaddr_in6 *)rt->rt_gateway;
+ else if ((nh->nh_flags & NHF_GATEWAY))
+ dst = &nh->gw6_sa;
}
if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
@@ -1517,8 +1512,8 @@
mtu = ro_pmtu->ro_mtu;
}
- if (ro_pmtu != NULL && ro_pmtu->ro_rt != NULL)
- mtu = ro_pmtu->ro_rt->rt_mtu;
+ if (ro_pmtu != NULL && ro_pmtu->ro_nh != NULL)
+ mtu = ro_pmtu->ro_nh->nh_mtu;
return (ip6_calcmtu(ifp, dst, mtu, mtup, alwaysfragp, proto));
}
@@ -2646,9 +2641,9 @@
if (optname == -1 || optname == IPV6_TCLASS)
pktopt->ip6po_tclass = -1;
if (optname == -1 || optname == IPV6_NEXTHOP) {
- if (pktopt->ip6po_nextroute.ro_rt) {
- RTFREE(pktopt->ip6po_nextroute.ro_rt);
- pktopt->ip6po_nextroute.ro_rt = NULL;
+ if (pktopt->ip6po_nextroute.ro_nh) {
+ NH_FREE(pktopt->ip6po_nextroute.ro_nh);
+ pktopt->ip6po_nextroute.ro_nh = NULL;
}
if (pktopt->ip6po_nexthop)
free(pktopt->ip6po_nexthop, M_IP6OPT);
@@ -2668,9 +2663,9 @@
if (pktopt->ip6po_rhinfo.ip6po_rhi_rthdr)
free(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT);
pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL;
- if (pktopt->ip6po_route.ro_rt) {
- RTFREE(pktopt->ip6po_route.ro_rt);
- pktopt->ip6po_route.ro_rt = NULL;
+ if (pktopt->ip6po_route.ro_nh) {
+ NH_FREE(pktopt->ip6po_route.ro_nh);
+ pktopt->ip6po_route.ro_nh = NULL;
}
}
if (optname == -1 || optname == IPV6_DSTOPTS) {
Index: sys/netinet6/ip6_var.h
===================================================================
--- sys/netinet6/ip6_var.h
+++ sys/netinet6/ip6_var.h
@@ -416,7 +416,7 @@
uint32_t, struct ifnet *, struct in6_addr *, int *);
int in6_selectroute(struct sockaddr_in6 *, struct ip6_pktopts *,
struct ip6_moptions *, struct route_in6 *, struct ifnet **,
- struct rtentry **, u_int);
+ struct nhop_object **, u_int, uint32_t);
u_int32_t ip6_randomid(void);
u_int32_t ip6_randomflowlabel(void);
void in6_delayed_cksum(struct mbuf *m, uint32_t plen, u_short offset);
Index: sys/netinet6/nd6.c
===================================================================
--- sys/netinet6/nd6.c
+++ sys/netinet6/nd6.c
@@ -62,6 +62,7 @@
#include <net/if_dl.h>
#include <net/if_types.h>
#include <net/route.h>
+#include <net/route/nhop.h>
#include <net/vnet.h>
#include <netinet/in.h>
@@ -136,7 +137,8 @@
static void nd6_llinfo_timer(void *);
static void nd6_llinfo_settimer_locked(struct llentry *, long);
static void clear_llinfo_pqueue(struct llentry *);
-static void nd6_rtrequest(int, struct rtentry *, struct rt_addrinfo *);
+static void nd6_rtrequest(int, struct rtentry *, struct nhop_object *,
+ struct rt_addrinfo *);
static int nd6_resolve_slow(struct ifnet *, int, struct mbuf *,
const struct sockaddr_in6 *, u_char *, uint32_t *, struct llentry **);
static int nd6_need_cache(struct ifnet *);
@@ -1526,14 +1528,17 @@
}
static int
-nd6_isdynrte(const struct rtentry *rt, void *xap)
+nd6_isdynrte(const struct rtentry *rt, const struct nhop_object *nh, void *xap)
{
- if (rt->rt_flags == (RTF_UP | RTF_HOST | RTF_DYNAMIC))
+ int rt_flags = rib_get_entry_rtflags(rt, nh);
+
+ if (rt_flags == (RTF_UP | RTF_HOST | RTF_DYNAMIC))
return (1);
return (0);
}
+
/*
* Remove the rtentry for the given llentry,
* both of which were installed by a redirect.
@@ -1544,6 +1549,7 @@
int fibnum;
struct sockaddr_in6 sin6;
struct rt_addrinfo info;
+ struct rib_cmd_info rc;
lltable_fill_sa_entry(ln, (struct sockaddr *)&sin6);
memset(&info, 0, sizeof(info));
@@ -1551,7 +1557,7 @@
info.rti_filter = nd6_isdynrte;
for (fibnum = 0; fibnum < rt_numfibs; fibnum++)
- rtrequest1_fib(RTM_DELETE, &info, NULL, fibnum);
+ rib_del_route(fibnum, &info, &rc);
}
/*
@@ -1559,14 +1565,15 @@
* processing.
*/
void
-nd6_rtrequest(int req, struct rtentry *rt, struct rt_addrinfo *info)
+nd6_rtrequest(int req, struct rtentry *rt, struct nhop_object *nh,
+ struct rt_addrinfo *info)
{
struct sockaddr_in6 *gateway;
struct nd_defrouter *dr;
struct ifnet *ifp;
- gateway = (struct sockaddr_in6 *)rt->rt_gateway;
- ifp = rt->rt_ifp;
+ gateway = &nh->gw6_sa;
+ ifp = nh->nh_ifp;
switch (req) {
case RTM_ADD:
@@ -1578,14 +1585,13 @@
/*
* Only indirect routes are interesting.
*/
- if ((rt->rt_flags & RTF_GATEWAY) == 0)
+ if ((nh->nh_flags & NHF_GATEWAY) == 0)
return;
/*
* check for default route
*/
- if (IN6_ARE_ADDR_EQUAL(&in6addr_any,
- &SIN6(rt_key(rt))->sin6_addr)) {
- dr = defrouter_lookup(&gateway->sin6_addr, ifp);
+ if (nh->nh_flags & NHF_DEFAULT) {
+ dr = defrouter_lookup(&nh->gw6_sa.sin6_addr, ifp);
if (dr != NULL) {
dr->installed = 0;
defrouter_rele(dr);
Index: sys/netinet6/nd6_rtr.c
===================================================================
--- sys/netinet6/nd6_rtr.c
+++ sys/netinet6/nd6_rtr.c
@@ -59,8 +59,7 @@
#include <net/if_types.h>
#include <net/if_dl.h>
#include <net/route.h>
-#include <net/route_var.h>
-#include <net/radix.h>
+#include <net/route/nhop.h>
#include <net/vnet.h>
#include <netinet/in.h>
@@ -603,14 +602,6 @@
m_freem(m);
}
-/* tell the change to user processes watching the routing socket. */
-static void
-nd6_rtmsg(int cmd, struct rtentry *rt)
-{
-
- rt_routemsg(cmd, rt, rt->rt_ifp, 0, rt->rt_fibnum);
-}
-
/* PFXRTR */
static struct nd_pfxrouter *
pfxrtr_lookup(struct nd_prefix *pr, struct nd_defrouter *dr)
@@ -680,7 +671,8 @@
defrouter_addreq(struct nd_defrouter *new)
{
struct sockaddr_in6 def, mask, gate;
- struct rtentry *newrt = NULL;
+ struct epoch_tracker et;
+ u_int fibnum;
int error;
bzero(&def, sizeof(def));
@@ -692,15 +684,25 @@
def.sin6_family = gate.sin6_family = AF_INET6;
gate.sin6_addr = new->rtaddr;
- error = in6_rtrequest(RTM_ADD, (struct sockaddr *)&def,
- (struct sockaddr *)&gate, (struct sockaddr *)&mask,
- RTF_GATEWAY, &newrt, new->ifp->if_fib);
- if (newrt) {
- nd6_rtmsg(RTM_ADD, newrt); /* tell user process */
- RTFREE(newrt);
- }
- if (error == 0)
+ struct rt_addrinfo info;
+
+ bzero(&info, sizeof(info));
+ info.rti_info[RTAX_DST] = (struct sockaddr *)&def;
+ info.rti_info[RTAX_NETMASK] = (struct sockaddr *)&mask;
+ info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&gate;
+ info.rti_flags = RTF_GATEWAY;
+
+ struct rib_cmd_info rc;
+
+ NET_EPOCH_ENTER(et);
+ fibnum = new->ifp->if_fib;
+ error = rib_add_route(fibnum, &info, &rc);
+
+ if (error == 0) {
new->installed = 1;
+ rt_routemsg(RTM_ADD, rc.rt, rc.nh_new, fibnum);
+ }
+ NET_EPOCH_EXIT(et);
}
/*
@@ -712,7 +714,9 @@
defrouter_delreq(struct nd_defrouter *dr)
{
struct sockaddr_in6 def, mask, gate;
- struct rtentry *oldrt = NULL;
+ struct epoch_tracker et;
+ u_int fibnum;
+ int error;
bzero(&def, sizeof(def));
bzero(&mask, sizeof(mask));
@@ -723,15 +727,25 @@
def.sin6_family = gate.sin6_family = AF_INET6;
gate.sin6_addr = dr->rtaddr;
- in6_rtrequest(RTM_DELETE, (struct sockaddr *)&def,
- (struct sockaddr *)&gate,
- (struct sockaddr *)&mask, RTF_GATEWAY, &oldrt, dr->ifp->if_fib);
- if (oldrt) {
- nd6_rtmsg(RTM_DELETE, oldrt);
- RTFREE(oldrt);
- }
+ struct rt_addrinfo info;
- dr->installed = 0;
+ bzero(&info, sizeof(info));
+ info.rti_info[RTAX_DST] = (struct sockaddr *)&def;
+ info.rti_info[RTAX_NETMASK] = (struct sockaddr *)&mask;
+ info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&gate;
+ info.rti_flags = RTF_GATEWAY;
+
+ struct rib_cmd_info rc;
+
+ NET_EPOCH_ENTER(et);
+ fibnum = dr->ifp->if_fib;
+ error = rib_del_route(fibnum, &info, &rc);
+
+ if (error == 0) {
+ dr->installed = 0;
+ rt_routemsg(RTM_DELETE, rc.rt, rc.nh_old, fibnum);
+ }
+ NET_EPOCH_EXIT(et);
}
static void
@@ -2010,11 +2024,13 @@
nd6_prefix_onlink_rtrequest(struct nd_prefix *pr, struct ifaddr *ifa)
{
struct sockaddr_dl sdl;
- struct rtentry *rt;
struct sockaddr_in6 mask6;
u_long rtflags;
int error, a_failure, fibnum, maxfib;
+ struct rt_addrinfo info;
+ struct epoch_tracker et;
+
/*
* in6_ifinit() sets nd6_rtrequest to ifa_rtrequest for all ifaddrs.
* ifa->ifa_rtrequest = nd6_rtrequest;
@@ -2030,6 +2046,14 @@
sdl.sdl_type = ifa->ifa_ifp->if_type;
sdl.sdl_index = ifa->ifa_ifp->if_index;
+ bzero(&info, sizeof(struct rt_addrinfo));
+ info.rti_info[RTAX_DST] = (struct sockaddr *)&pr->ndpr_prefix;
+ info.rti_info[RTAX_NETMASK] = (struct sockaddr *)&mask6;
+ info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&sdl;
+ info.rti_flags = rtflags;
+
+ struct rib_cmd_info rc;
+
if(V_rt_add_addr_allfibs) {
fibnum = 0;
maxfib = rt_numfibs;
@@ -2040,17 +2064,10 @@
a_failure = 0;
for (; fibnum < maxfib; fibnum++) {
- rt = NULL;
- error = in6_rtrequest(RTM_ADD,
- (struct sockaddr *)&pr->ndpr_prefix, (struct sockaddr *)&sdl,
- (struct sockaddr *)&mask6, rtflags, &rt, fibnum);
+ NET_EPOCH_ENTER(et);
+ error = rib_add_route(fibnum, &info, &rc);
if (error == 0) {
- KASSERT(rt != NULL, ("%s: in6_rtrequest return no "
- "error(%d) but rt is NULL, pr=%p, ifa=%p", __func__,
- error, pr, ifa));
- RT_LOCK(rt);
- nd6_rtmsg(RTM_ADD, rt);
- RT_UNLOCK(rt);
+ rt_routemsg(RTM_ADD, rc.rt, rc.nh_new, fibnum);
pr->ndpr_stateflags |= NDPRF_ONLINK;
} else {
char ip6buf[INET6_ADDRSTRLEN];
@@ -2071,12 +2088,7 @@
/* Save last error to return, see rtinit(). */
a_failure = error;
}
-
- if (rt != NULL) {
- RT_LOCK(rt);
- RT_REMREF(rt);
- RT_UNLOCK(rt);
- }
+ NET_EPOCH_EXIT(et);
}
/* Return the last error we got. */
@@ -2175,7 +2187,6 @@
struct ifnet *ifp = pr->ndpr_ifp;
struct nd_prefix *opr;
struct sockaddr_in6 sa6, mask6;
- struct rtentry *rt;
char ip6buf[INET6_ADDRSTRLEN];
uint64_t genid;
int fibnum, maxfib, a_failure;
@@ -2204,22 +2215,27 @@
maxfib = fibnum + 1;
}
+ struct rt_addrinfo info;
+
+ bzero(&info, sizeof(info));
+ info.rti_info[RTAX_DST] = (struct sockaddr *)&sa6;
+ info.rti_info[RTAX_NETMASK] = (struct sockaddr *)&mask6;
+
+ struct rib_cmd_info rc;
+ struct epoch_tracker et;
+
a_failure = 0;
for (; fibnum < maxfib; fibnum++) {
- rt = NULL;
- error = in6_rtrequest(RTM_DELETE, (struct sockaddr *)&sa6, NULL,
- (struct sockaddr *)&mask6, 0, &rt, fibnum);
+ NET_EPOCH_ENTER(et);
+ error = rib_del_route(fibnum, &info, &rc);
if (error == 0) {
/* report the route deletion to the routing socket. */
- if (rt != NULL)
- nd6_rtmsg(RTM_DELETE, rt);
+ rt_routemsg(RTM_DELETE, rc.rt, rc.nh_old, fibnum);
} else {
/* Save last error to return, see rtinit(). */
a_failure = error;
}
- if (rt != NULL) {
- RTFREE(rt);
- }
+ NET_EPOCH_EXIT(et);
}
error = a_failure;
a_failure = 1;
@@ -2406,16 +2422,21 @@
return (0);
}
+struct rt6_args {
+ struct ifnet *ifp;
+ struct in6_addr *gateway;
+};
+
static int
-rt6_deleteroute(const struct rtentry *rt, void *arg)
+rt6_deleteroute(const struct rtentry *rt, const struct nhop_object *nh, void *arg)
{
-#define SIN6(s) ((struct sockaddr_in6 *)s)
- struct in6_addr *gate = (struct in6_addr *)arg;
+ struct rt6_args *args = (struct rt6_args *)arg;
+ int rt_flags = rib_get_entry_rtflags(rt, nh);
- if (rt->rt_gateway == NULL || rt->rt_gateway->sa_family != AF_INET6)
+ if (nh->gw6_sa.sin6_family != AF_INET6)
return (0);
- if (!IN6_ARE_ADDR_EQUAL(gate, &SIN6(rt->rt_gateway)->sin6_addr)) {
+ if (!IN6_ARE_ADDR_EQUAL(args->gateway, &nh->gw6_sa.sin6_addr)) {
return (0);
}
@@ -2424,18 +2445,17 @@
* XXX: this seems to be a bit ad-hoc. Should we consider the
* 'cloned' bit instead?
*/
- if ((rt->rt_flags & RTF_STATIC) != 0)
+ if ((rt_flags & RTF_STATIC) != 0)
return (0);
/*
* We delete only host route. This means, in particular, we don't
* delete default route.
*/
- if ((rt->rt_flags & RTF_HOST) == 0)
+ if ((rt_flags & RTF_HOST) == 0)
return (0);
return (1);
-#undef SIN6
}
/*
@@ -2446,13 +2466,17 @@
void
rt6_flush(struct in6_addr *gateway, struct ifnet *ifp)
{
+ struct rt6_args args;
/* We'll care only link-local addresses */
if (!IN6_IS_ADDR_LINKLOCAL(gateway))
return;
+ args.ifp = ifp;
+ args.gateway = gateway;
+
/* XXX Do we really need to walk any but the default FIB? */
- rt_foreach_fib_walk_del(AF_INET6, rt6_deleteroute, (void *)gateway);
+ rt_foreach_fib_walk_del(AF_INET6, rt6_deleteroute, (void *)&args);
}
int
Index: sys/netinet6/raw_ip6.c
===================================================================
--- sys/netinet6/raw_ip6.c
+++ sys/netinet6/raw_ip6.c
@@ -66,6 +66,7 @@
#include "opt_ipsec.h"
#include "opt_inet6.h"
+#include "opt_route_mpath.h"
#include <sys/param.h>
#include <sys/errno.h>
@@ -99,6 +100,7 @@
#include <netinet/ip_var.h>
#include <netinet6/ip6protosw.h>
#include <netinet6/ip6_mroute.h>
+#include <netinet6/in6_fib.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet6/nd6.h>
@@ -404,6 +406,9 @@
int use_defzone = 0;
int hlim = 0;
struct in6_addr in6a;
+#ifdef ROUTE_MPATH
+ uint32_t hash_val, hash_type;
+#endif
va_list ap;
va_start(ap, so);
@@ -462,6 +467,15 @@
}
ip6 = mtod(m, struct ip6_hdr *);
+#ifdef ROUTE_MPATH
+ if (V_fib_hash_outbound) {
+ hash_val = fib6_calc_software_hash(&inp->in6p_laddr,
+ &dstsock->sin6_addr, 0, 0, so->so_proto->pr_protocol,
+ &hash_type);
+ inp->inp_flowid = hash_val;
+ inp->inp_flowtype = hash_type;
+ }
+#endif
/*
* Source address selection.
*/
Index: sys/netinet6/scope6.c
===================================================================
--- sys/netinet6/scope6.c
+++ sys/netinet6/scope6.c
@@ -466,6 +466,28 @@
}
/*
+ * Returns scope zone id for the unicast address @in6.
+ *
+ * Returns 0 for addresses that are not link-local (e.g. global unicast and loopback).
+ * Returns the interface index of @ifp for link-local addresses.
+ */
+uint32_t
+in6_get_unicast_scopeid(const struct in6_addr *in6, const struct ifnet *ifp)
+{
+
+ if (IN6_IS_SCOPE_LINKLOCAL(in6))
+ return (ifp->if_index);
+ return (0);
+}
+
+void
+in6_set_unicast_scopeid(struct in6_addr *in6, uint32_t scopeid)
+{
+
+ in6->s6_addr16[1] = htons(scopeid & 0xffff);
+}
+
+/*
* Return pointer to ifnet structure, corresponding to the zone id of
* link-local scope.
*/
Index: sys/netinet6/scope6_var.h
===================================================================
--- sys/netinet6/scope6_var.h
+++ sys/netinet6/scope6_var.h
@@ -67,6 +67,9 @@
uint32_t in6_getscopezone(const struct ifnet *, int);
void in6_splitscope(const struct in6_addr *, struct in6_addr *, uint32_t *);
struct ifnet* in6_getlinkifnet(uint32_t);
+uint32_t in6_get_unicast_scopeid(const struct in6_addr *, const struct ifnet *);
+void in6_set_unicast_scopeid(struct in6_addr *, uint32_t);
+
#endif /* _KERNEL */
#endif /* _NETINET6_SCOPE6_VAR_H_ */
Index: sys/netinet6/udp6_usrreq.c
===================================================================
--- sys/netinet6/udp6_usrreq.c
+++ sys/netinet6/udp6_usrreq.c
@@ -1048,6 +1048,7 @@
static int
udp6_attach(struct socket *so, int proto, struct thread *td)
{
+ static uint32_t udp_flowid;
struct inpcb *inp;
struct inpcbinfo *pcbinfo;
int error;
@@ -1071,6 +1072,8 @@
inp->inp_vflag |= INP_IPV6;
if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
inp->inp_vflag |= INP_IPV4;
+ inp->inp_flowid = atomic_fetchadd_int(&udp_flowid, 1);
+ inp->inp_flowtype = M_HASHTYPE_OPAQUE;
inp->in6p_hops = -1; /* use kernel default */
inp->in6p_cksum = -1; /* just to be sure */
/*
Index: sys/netpfil/ipfw/ip_fw_table_algo.c
===================================================================
--- sys/netpfil/ipfw/ip_fw_table_algo.c
+++ sys/netpfil/ipfw/ip_fw_table_algo.c
@@ -51,7 +51,6 @@
#include <net/if.h> /* ip_fw.h requires IFNAMSIZ */
#include <net/radix.h>
#include <net/route.h>
-#include <net/route_var.h>
#include <netinet/in.h>
#include <netinet/in_fib.h>
@@ -3918,10 +3917,10 @@
tinfo->flags = IPFW_TATFLAGS_AFDATA;
tinfo->taclass4 = IPFW_TACLASS_RADIX;
tinfo->count4 = 0;
- tinfo->itemsize4 = sizeof(struct rtentry);
+ tinfo->itemsize4 = 0;
tinfo->taclass6 = IPFW_TACLASS_RADIX;
tinfo->count6 = 0;
- tinfo->itemsize6 = sizeof(struct rtentry);
+ tinfo->itemsize6 = 0;
}
static int
@@ -3943,11 +3942,17 @@
ta_dump_kfib_tentry(void *ta_state, struct table_info *ti, void *e,
ipfw_obj_tentry *tent)
{
- struct rtentry *rte;
+ struct rtentry *rt;
+ struct sockaddr_in6 dst, mask;
- rte = (struct rtentry *)e;
+ rt = (struct rtentry *)e;
- return ta_dump_kfib_tentry_int(rt_key(rte), rt_mask(rte), tent);
+ dst.sin6_len = sizeof(struct sockaddr_in6);
+ mask.sin6_len = sizeof(struct sockaddr_in6);
+
+ rib_get_entry_prefix(rt, (struct sockaddr *)&dst, (struct sockaddr *)&mask, NULL);
+
+ return ta_dump_kfib_tentry_int((struct sockaddr *)&dst, (struct sockaddr *)&mask, tent);
}
static int
@@ -4047,23 +4052,9 @@
ta_foreach_kfib(void *ta_state, struct table_info *ti, ta_foreach_f *f,
void *arg)
{
- RIB_RLOCK_TRACKER;
- struct rib_head *rh;
- int error;
- rh = rt_tables_get_rnh(ti->data, AF_INET);
- if (rh != NULL) {
- RIB_RLOCK(rh);
- error = rh->rnh_walktree(&rh->head, (walktree_f_t *)f, arg);
- RIB_RUNLOCK(rh);
- }
-
- rh = rt_tables_get_rnh(ti->data, AF_INET6);
- if (rh != NULL) {
- RIB_RLOCK(rh);
- error = rh->rnh_walktree(&rh->head, (walktree_f_t *)f, arg);
- RIB_RUNLOCK(rh);
- }
+ rib_walk(AF_INET, ti->data, (rt_walktree_f_t *)f, arg);
+ rib_walk(AF_INET6, ti->data, (rt_walktree_f_t *)f, arg);
}
struct table_algo addr_kfib = {
Index: sys/netpfil/pf/pf.c
===================================================================
--- sys/netpfil/pf/pf.c
+++ sys/netpfil/pf/pf.c
@@ -69,7 +69,6 @@
#include <net/if_types.h>
#include <net/if_vlan_var.h>
#include <net/route.h>
-#include <net/radix_mpath.h>
#include <net/vnet.h>
#include <net/pfil.h>
@@ -5338,122 +5337,12 @@
return (p);
}
-#ifdef RADIX_MPATH
-static int
-pf_routable_oldmpath(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif,
- int rtableid)
-{
- struct radix_node_head *rnh;
- struct sockaddr_in *dst;
- int ret = 1;
- int check_mpath;
-#ifdef INET6
- struct sockaddr_in6 *dst6;
- struct route_in6 ro;
-#else
- struct route ro;
-#endif
- struct radix_node *rn;
- struct rtentry *rt;
- struct ifnet *ifp;
-
- check_mpath = 0;
- /* XXX: stick to table 0 for now */
- rnh = rt_tables_get_rnh(0, af);
- if (rnh != NULL && rn_mpath_capable(rnh))
- check_mpath = 1;
- bzero(&ro, sizeof(ro));
- switch (af) {
- case AF_INET:
- dst = satosin(&ro.ro_dst);
- dst->sin_family = AF_INET;
- dst->sin_len = sizeof(*dst);
- dst->sin_addr = addr->v4;
- break;
-#ifdef INET6
- case AF_INET6:
- /*
- * Skip check for addresses with embedded interface scope,
- * as they would always match anyway.
- */
- if (IN6_IS_SCOPE_EMBED(&addr->v6))
- goto out;
- dst6 = (struct sockaddr_in6 *)&ro.ro_dst;
- dst6->sin6_family = AF_INET6;
- dst6->sin6_len = sizeof(*dst6);
- dst6->sin6_addr = addr->v6;
- break;
-#endif /* INET6 */
- default:
- return (0);
- }
-
- /* Skip checks for ipsec interfaces */
- if (kif != NULL && kif->pfik_ifp->if_type == IFT_ENC)
- goto out;
-
- switch (af) {
-#ifdef INET6
- case AF_INET6:
- in6_rtalloc_ign(&ro, 0, rtableid);
- break;
-#endif
-#ifdef INET
- case AF_INET:
- in_rtalloc_ign((struct route *)&ro, 0, rtableid);
- break;
-#endif
- }
-
- if (ro.ro_rt != NULL) {
- /* No interface given, this is a no-route check */
- if (kif == NULL)
- goto out;
-
- if (kif->pfik_ifp == NULL) {
- ret = 0;
- goto out;
- }
-
- /* Perform uRPF check if passed input interface */
- ret = 0;
- rn = (struct radix_node *)ro.ro_rt;
- do {
- rt = (struct rtentry *)rn;
- ifp = rt->rt_ifp;
-
- if (kif->pfik_ifp == ifp)
- ret = 1;
- rn = rn_mpath_next(rn);
- } while (check_mpath == 1 && rn != NULL && ret == 0);
- } else
- ret = 0;
-out:
- if (ro.ro_rt != NULL)
- RTFREE(ro.ro_rt);
- return (ret);
-}
-#endif
-
int
pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif,
int rtableid)
{
-#ifdef INET
- struct nhop4_basic nh4;
-#endif
-#ifdef INET6
- struct nhop6_basic nh6;
-#endif
struct ifnet *ifp;
-#ifdef RADIX_MPATH
- struct radix_node_head *rnh;
- /* XXX: stick to table 0 for now */
- rnh = rt_tables_get_rnh(0, af);
- if (rnh != NULL && rn_mpath_capable(rnh))
- return (pf_routable_oldmpath(addr, af, kif, rtableid));
-#endif
/*
* Skip check for addresses with embedded interface scope,
* as they would always match anyway.
@@ -5468,35 +5357,21 @@
if (kif != NULL && kif->pfik_ifp->if_type == IFT_ENC)
return (1);
- ifp = NULL;
+ ifp = (kif != NULL) ? kif->pfik_ifp : NULL;
switch (af) {
#ifdef INET6
case AF_INET6:
- if (fib6_lookup_nh_basic(rtableid, &addr->v6, 0, 0, 0, &nh6)!=0)
- return (0);
- ifp = nh6.nh_ifp;
- break;
+ return (fib6_lookup_urpf(rtableid, &addr->v6, 0, NHR_NONE,
+ ifp));
#endif
#ifdef INET
case AF_INET:
- if (fib4_lookup_nh_basic(rtableid, addr->v4, 0, 0, &nh4) != 0)
- return (0);
- ifp = nh4.nh_ifp;
- break;
+ return (fib4_lookup_urpf(rtableid, addr->v4, 0, NHR_NONE,
+ ifp));
#endif
}
- /* No interface given, this is a no-route check */
- if (kif == NULL)
- return (1);
-
- if (kif->pfik_ifp == NULL)
- return (0);
-
- /* Perform uRPF check if passed input interface */
- if (kif->pfik_ifp == ifp)
- return (1);
return (0);
}
Index: sys/nfs/bootp_subr.c
===================================================================
--- sys/nfs/bootp_subr.c
+++ sys/nfs/bootp_subr.c
@@ -347,6 +347,7 @@
bootpboot_p_sa(rt_key(rt), rt_mask(rt));
printf(" ");
+ /* XXX: still reads rt->rt_gateway directly; convert to the nexthop API */
bootpboot_p_sa(rt->rt_gateway, NULL);
printf(" ");
printf("flags %x", (unsigned short) rt->rt_flags);
@@ -1082,11 +1083,12 @@
clear_sinaddr(&defdst);
clear_sinaddr(&defmask);
- error = rtrequest_fib(RTM_ADD, (struct sockaddr *)&defdst,
- (struct sockaddr *) &ifctx->gw, (struct sockaddr *)&defmask,
- (RTF_UP | RTF_GATEWAY | RTF_STATIC), NULL, RT_DEFAULT_FIB);
+ error = rib_request_simple(RIB_ADD, RT_DEFAULT_FIB,
+ (struct sockaddr *)&defdst, (struct sockaddr *)&defmask,
+ (struct sockaddr *) &ifctx->gw, RTF_UP | RTF_GATEWAY | RTF_STATIC);
+
if (error != 0) {
- printf("%s: RTM_ADD, error=%d\n", __func__, error);
+ printf("%s: RIB_ADD, error=%d\n", __func__, error);
}
}
@@ -1103,11 +1105,11 @@
clear_sinaddr(&defdst);
clear_sinaddr(&defmask);
- error = rtrequest_fib(RTM_DELETE, (struct sockaddr *)&defdst,
- (struct sockaddr *) &ifctx->gw, (struct sockaddr *)&defmask,
- (RTF_UP | RTF_GATEWAY | RTF_STATIC), NULL, RT_DEFAULT_FIB);
+ error = rib_request_simple(RIB_DEL, RT_DEFAULT_FIB,
+ (struct sockaddr *)&defdst, (struct sockaddr *)&defmask,
+ (struct sockaddr *) &ifctx->gw, 0);
if (error != 0) {
- printf("%s: RTM_DELETE, error=%d\n", __func__, error);
+ printf("%s: RIB_DEL, error=%d\n", __func__, error);
}
}
Index: sys/ofed/drivers/infiniband/core/ib_addr.c
===================================================================
--- sys/ofed/drivers/infiniband/core/ib_addr.c
+++ sys/ofed/drivers/infiniband/core/ib_addr.c
@@ -44,14 +44,17 @@
#include <linux/workqueue.h>
#include <linux/module.h>
#include <net/route.h>
+#include <net/route/nhop.h>
#include <net/netevent.h>
#include <rdma/ib_addr.h>
#include <rdma/ib.h>
+#include <netinet/in_fib.h>
#include <netinet/if_ether.h>
#include <netinet/ip_var.h>
#include <netinet6/scope6_var.h>
#include <netinet6/in6_pcb.h>
+#include <netinet6/in6_fib.h>
#include "core_priv.h"
@@ -275,7 +278,7 @@
struct sockaddr_in dst_tmp = *dst_in;
in_port_t src_port;
struct sockaddr *saddr = NULL;
- struct rtentry *rte;
+ struct nhop_object *nh;
struct ifnet *ifp;
int error;
int type;
@@ -293,8 +296,7 @@
type |= ADDR_DST_ANY;
/*
- * Make sure the socket address length field
- * is set, else rtalloc1() will fail.
+ * Make sure the socket address length field is set.
*/
dst_tmp.sin_len = sizeof(dst_tmp);
@@ -303,16 +305,12 @@
case ADDR_VALID:
case ADDR_SRC_ANY:
/* regular destination route lookup */
- rte = rtalloc1((struct sockaddr *)&dst_tmp, 1, 0);
- if (rte == NULL) {
+ nh = fib4_lookup_nh_ptr(RT_DEFAULT_FIB, dst_tmp.sin_addr,
+ 0, NHR_NONE, 0);
+ if (nh == NULL) {
error = EHOSTUNREACH;
goto done;
- } else if (rte->rt_ifp == NULL || RT_LINK_IS_UP(rte->rt_ifp) == 0) {
- RTFREE_LOCKED(rte);
- error = EHOSTUNREACH;
- goto done;
}
- RT_UNLOCK(rte);
break;
default:
error = ENETUNREACH;
@@ -332,14 +330,14 @@
/* check source interface */
if (ifp == NULL) {
error = ENETUNREACH;
- goto error_rt_free;
+ goto done;
} else if (ifp->if_flags & IFF_LOOPBACK) {
/*
* Source address cannot be a loopback device.
*/
error = EHOSTUNREACH;
goto error_put_ifp;
- } else if (rte->rt_ifp->if_flags & IFF_LOOPBACK) {
+ } else if (nh->nh_ifp->if_flags & IFF_LOOPBACK) {
if (memcmp(&src_in->sin_addr, &dst_in->sin_addr,
sizeof(src_in->sin_addr))) {
/*
@@ -352,9 +350,9 @@
}
/* get destination network interface from route */
dev_put(ifp);
- ifp = rte->rt_ifp;
+ ifp = nh->nh_ifp;
dev_hold(ifp);
- } else if (ifp != rte->rt_ifp) {
+ } else if (ifp != nh->nh_ifp) {
/*
* Source and destination interfaces are
* different.
@@ -365,13 +363,13 @@
break;
case ADDR_SRC_ANY:
/* check for loopback device */
- if (rte->rt_ifp->if_flags & IFF_LOOPBACK)
+ if (nh->nh_ifp->if_flags & IFF_LOOPBACK)
saddr = (struct sockaddr *)&dst_tmp;
else
- saddr = rte->rt_ifa->ifa_addr;
+ saddr = nh->nh_ifa->ifa_addr;
/* get destination network interface from route */
- ifp = rte->rt_ifp;
+ ifp = nh->nh_ifp;
dev_hold(ifp);
break;
default:
@@ -386,7 +384,7 @@
ifp->if_addrlen, MAX_ADDR_LEN);
error = 0;
} else if (IN_MULTICAST(ntohl(dst_tmp.sin_addr.s_addr))) {
- bool is_gw = (rte->rt_flags & RTF_GATEWAY) != 0;
+ bool is_gw = (nh->nh_flags & NHF_GATEWAY) != 0;
error = addr_resolve_multi(edst, ifp, (struct sockaddr *)&dst_tmp);
if (error != 0)
goto error_put_ifp;
@@ -396,10 +394,10 @@
memset(edst, 0, MAX_ADDR_LEN);
error = 0;
} else {
- bool is_gw = (rte->rt_flags & RTF_GATEWAY) != 0;
+ bool is_gw = (nh->nh_flags & NHF_GATEWAY) != 0;
memset(edst, 0, MAX_ADDR_LEN);
error = arpresolve(ifp, is_gw, NULL, is_gw ?
- rte->rt_gateway : (const struct sockaddr *)&dst_tmp,
+ &nh->gw_sa : (const struct sockaddr *)&dst_tmp,
edst, NULL, NULL);
if (error != 0)
goto error_put_ifp;
@@ -416,17 +414,12 @@
src_in->sin_port = src_port; /* preserve port number */
}
- if (rte != NULL)
- RTFREE(rte);
-
*ifpp = ifp;
goto done;
error_put_ifp:
dev_put(ifp);
-error_rt_free:
- RTFREE(rte);
done:
CURVNET_RESTORE();
@@ -460,7 +453,7 @@
struct sockaddr_in6 dst_tmp = *dst_in;
in_port_t src_port;
struct sockaddr *saddr = NULL;
- struct rtentry *rte;
+ struct nhop_object *nh;
struct ifnet *ifp;
int error;
int type;
@@ -478,8 +471,7 @@
type |= ADDR_DST_ANY;
/*
- * Make sure the socket address length field
- * is set, else rtalloc1() will fail.
+ * Make sure the socket address length field is set.
*/
dst_tmp.sin6_len = sizeof(dst_tmp);
@@ -502,16 +494,12 @@
/* FALLTHROUGH */
case ADDR_SRC_ANY:
/* regular destination route lookup */
- rte = rtalloc1((struct sockaddr *)&dst_tmp, 1, 0);
- if (rte == NULL) {
+ nh = fib6_lookup_nh_ptr(RT_DEFAULT_FIB, &dst_in->sin6_addr,
+ addr->bound_dev_if, NHR_NONE, 0);
+ if (nh == NULL) {
error = EHOSTUNREACH;
goto done;
- } else if (rte->rt_ifp == NULL || RT_LINK_IS_UP(rte->rt_ifp) == 0) {
- RTFREE_LOCKED(rte);
- error = EHOSTUNREACH;
- goto done;
}
- RT_UNLOCK(rte);
break;
default:
error = ENETUNREACH;
@@ -531,14 +519,14 @@
/* check source interface */
if (ifp == NULL) {
error = ENETUNREACH;
- goto error_rt_free;
+ goto done;
} else if (ifp->if_flags & IFF_LOOPBACK) {
/*
* Source address cannot be a loopback device.
*/
error = EHOSTUNREACH;
goto error_put_ifp;
- } else if (rte->rt_ifp->if_flags & IFF_LOOPBACK) {
+ } else if (nh->nh_ifp->if_flags & IFF_LOOPBACK) {
if (memcmp(&src_in->sin6_addr, &dst_in->sin6_addr,
sizeof(src_in->sin6_addr))) {
/*
@@ -551,9 +539,9 @@
}
/* get destination network interface from route */
dev_put(ifp);
- ifp = rte->rt_ifp;
+ ifp = nh->nh_ifp;
dev_hold(ifp);
- } else if (ifp != rte->rt_ifp) {
+ } else if (ifp != nh->nh_ifp) {
/*
* Source and destination interfaces are
* different.
@@ -564,13 +552,13 @@
break;
case ADDR_SRC_ANY:
/* check for loopback device */
- if (rte->rt_ifp->if_flags & IFF_LOOPBACK)
+ if (nh->nh_ifp->if_flags & IFF_LOOPBACK)
saddr = (struct sockaddr *)&dst_tmp;
else
- saddr = rte->rt_ifa->ifa_addr;
+ saddr = nh->nh_ifa->ifa_addr;
/* get destination network interface from route */
- ifp = rte->rt_ifp;
+ ifp = nh->nh_ifp;
dev_hold(ifp);
break;
default:
@@ -581,21 +569,21 @@
* Step 3 - resolve destination MAC address
*/
if (IN6_IS_ADDR_MULTICAST(&dst_tmp.sin6_addr)) {
- bool is_gw = (rte->rt_flags & RTF_GATEWAY) != 0;
+ bool is_gw = (nh->nh_flags & NHF_GATEWAY) != 0;
error = addr_resolve_multi(edst, ifp,
(struct sockaddr *)&dst_tmp);
if (error != 0)
goto error_put_ifp;
else if (is_gw)
addr->network = RDMA_NETWORK_IPV6;
- } else if (rte->rt_ifp->if_flags & IFF_LOOPBACK) {
+ } else if (nh->nh_ifp->if_flags & IFF_LOOPBACK) {
memset(edst, 0, MAX_ADDR_LEN);
error = 0;
} else {
- bool is_gw = (rte->rt_flags & RTF_GATEWAY) != 0;
+ bool is_gw = (nh->nh_flags & NHF_GATEWAY) != 0;
memset(edst, 0, MAX_ADDR_LEN);
error = nd6_resolve(ifp, is_gw, NULL, is_gw ?
- rte->rt_gateway : (const struct sockaddr *)&dst_tmp,
+ &nh->gw_sa : (const struct sockaddr *)&dst_tmp,
edst, NULL, NULL);
if (error != 0)
goto error_put_ifp;
@@ -612,17 +600,12 @@
src_in->sin6_port = src_port; /* preserve port number */
}
- if (rte != NULL)
- RTFREE(rte);
-
*ifpp = ifp;
goto done;
error_put_ifp:
dev_put(ifp);
-error_rt_free:
- RTFREE(rte);
done:
CURVNET_RESTORE();
Index: sys/ofed/drivers/infiniband/core/ib_cma.c
===================================================================
--- sys/ofed/drivers/infiniband/core/ib_cma.c
+++ sys/ofed/drivers/infiniband/core/ib_cma.c
@@ -50,10 +50,14 @@
#include <linux/slab.h>
#include <linux/module.h>
#include <net/route.h>
+#include <net/route/nhop.h>
#include <net/tcp.h>
#include <net/ipv6.h>
+#include <netinet/in_fib.h>
+
+#include <netinet6/in6_fib.h>
#include <netinet6/scope6_var.h>
#include <netinet6/ip6_var.h>
@@ -1355,7 +1359,7 @@
__be32 daddr = dst_addr->sin_addr.s_addr,
saddr = src_addr->sin_addr.s_addr;
struct net_device *dst_dev;
- struct rtentry *rte;
+ struct nhop_object *nh;
bool ret;
if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
@@ -1385,13 +1389,12 @@
src_tmp.sin_len = sizeof(src_tmp);
CURVNET_SET(net_dev->if_vnet);
- rte = rtalloc1((struct sockaddr *)&src_tmp, 1, 0);
- if (rte != NULL) {
- ret = (rte->rt_ifp == net_dev);
- RTFREE_LOCKED(rte);
- } else {
+ nh = fib4_lookup_nh_ptr(RT_DEFAULT_FIB, src_addr->sin_addr,
+ 0, NHR_NONE, 0);
+ if (nh != NULL)
+ ret = (nh->nh_ifp == net_dev);
+ else
ret = false;
- }
CURVNET_RESTORE();
return ret;
#else
@@ -1407,7 +1410,7 @@
struct sockaddr_in6 src_tmp = *src_addr;
struct sockaddr_in6 dst_tmp = *dst_addr;
struct net_device *dst_dev;
- struct rtentry *rte;
+ struct nhop_object *nh;
bool ret;
dst_dev = ip6_dev_find(net_dev->if_vnet, dst_tmp.sin6_addr,
@@ -1446,13 +1449,12 @@
ret = true;
} else {
/* non-loopback case */
- rte = rtalloc1((struct sockaddr *)&src_tmp, 1, 0);
- if (rte != NULL) {
- ret = (rte->rt_ifp == net_dev);
- RTFREE_LOCKED(rte);
- } else {
+ nh = fib6_lookup_nh_ptr(RT_DEFAULT_FIB, &src_addr->sin6_addr,
+ net_dev->if_index, NHR_NONE, 0);
+ if (nh != NULL)
+ ret = (nh->nh_ifp == net_dev);
+ else
ret = false;
- }
}
CURVNET_RESTORE();
return ret;
@@ -1512,6 +1514,7 @@
*src_addr = (struct sockaddr *)&src_addr_storage;
struct net_device *net_dev;
const union ib_gid *gid = req->has_gid ? &req->local_gid : NULL;
+ struct epoch_tracker et;
int err;
err = cma_save_ip_info(listen_addr, src_addr, ib_event,
@@ -1530,10 +1533,13 @@
if (!net_dev)
return ERR_PTR(-ENODEV);
+ NET_EPOCH_ENTER(et);
if (!validate_net_dev(net_dev, listen_addr, src_addr)) {
+ NET_EPOCH_EXIT(et);
dev_put(net_dev);
return ERR_PTR(-EHOSTUNREACH);
}
+ NET_EPOCH_EXIT(et);
return net_dev;
}
Index: sys/sys/socket.h
===================================================================
--- sys/sys/socket.h
+++ sys/sys/socket.h
@@ -416,6 +416,8 @@
#define NET_RT_IFMALIST 4 /* return multicast address list */
#define NET_RT_IFLISTL 5 /* Survey interface list, using 'l'en
* versions of msghdr structs. */
+#define NET_RT_NHOP 6 /* dump routing nexthops */
+#define NET_RT_NHGROUPS 7 /* dump routing multipath groups */
#endif /* __BSD_VISIBLE */
/*
Index: sys/tests/routing/module.h
===================================================================
--- /dev/null
+++ sys/tests/routing/module.h
@@ -0,0 +1,39 @@
+#ifndef SYS_TESTS_ROUTING_MODULE_H_
+#define SYS_TESTS_ROUTING_MODULE_H_
+
+typedef int (ktest_f_t)(void);
+
+struct ktest_item {
+ char *name;
+ ktest_f_t *fn;
+ char *descr;
+};
+#define DECLARE_KTEST(_f) {#_f, &(_f), ""}
+#define DECLARE_KTEST_DESC(_f, _d) {#_f, &(_f), _d}
+
+struct ktests {
+ char *name;
+ char *descr;
+ struct ktest_item *tests;
+ int num_tests;
+};
+
+#define ARRAYLEN(_a)	(sizeof(_a) / sizeof((_a)[0]))	/* element count; _a must be a real array */
+#define DEFINE_KTESTS(_name, _descr, _tests)	/* _descr is a string literal and is stored as-is */ \
+	struct ktests kt_##_name = {#_name, _descr, _tests, ARRAYLEN(_tests)}
+
+#define DECLARE_KTESTS(_name) extern struct ktests kt_##_name
+
+#define TPRINTF(_arg, ...) printf("KTEST:%s:%d " _arg "\n", __func__, __LINE__, ##__VA_ARGS__)
+
+#define TASSERT(_cond, _fmt, ...) do {	/* expects an 'int error' in the caller's scope */ \
+	if (!(_cond)) {						\
+		TPRINTF(_fmt, ##__VA_ARGS__);			\
+		error = EINVAL;	/* record the failure, keep running the test */ \
+	}							\
+} while (0)
+
+
+DECLARE_KTESTS(route_ctl);
+
+#endif
Index: sys/tests/routing/module.c
===================================================================
--- /dev/null
+++ sys/tests/routing/module.c
@@ -0,0 +1,163 @@
+/*-
+ * Copyright (c) 2019, Alexander V. Chernikov <melifaro@freebsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Neither the name of Alexander V. Chernikov nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/proc.h>
+#include <sys/counter.h>
+#include <sys/epoch.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+
+#include "tests/routing/module.h"
+
+static int inited;
+#define ET_EXITING 0x1
+static volatile int state_flags;
+
+struct sysctl_ctx_list ctx;
+static struct ktest_item *ki = NULL;
+static int ki_size = 0, ki_count = 0;
+
+static int
+invoke_test_handler(SYSCTL_HANDLER_ARGS)
+{
+ int error, v;
+ struct ktest_item *item;
+ /* Sysctl handler: runs the test selected by arg2 and returns its result. */
+ if (inited == 0)
+ return (ENOENT);
+ /* Reads always report 0; only a write of a non-zero value runs the test. */
+ v = 0;
+ error = sysctl_handle_int(oidp, &v, 0, req);
+ if (error)
+ return (error);
+ if (req->newptr == NULL)
+ return (error);
+ if (v == 0)
+ return (0);
+ /* arg2 is used directly as an index into ki[]; it is not range-checked here. */
+ item = &ki[arg2];
+ printf("running item %d: %s\n", (int)arg2, item->name);
+ error = item->fn();
+ printf("done running item %d: %s - ret %d\n", (int)arg2, item->name, error);
+
+ return (error);
+}
+
+SYSCTL_NODE(_kern, OID_AUTO, test, CTLFLAG_RW, 0, "Test framework");
+SYSCTL_NODE(_kern_test, OID_AUTO, routing, CTLFLAG_RW, 0, "Routing test framework");
+
+static int
+test_init_one(struct ktests *kt)
+{
+	struct sysctl_oid *tree;
+	/* Register one suite: grow ki[] if needed, then add one sysctl node per test. */
+	if (kt->num_tests + ki_count > ki_size) {
+		size_t new_size = roundup2(kt->num_tests + ki_count, 32);
+		void *new_ptr;
+		new_ptr = malloc(new_size * sizeof(struct ktest_item), M_TEMP, M_WAITOK | M_ZERO);
+		if (ki_count > 0)
+			memcpy(new_ptr, ki, ki_count * sizeof(struct ktest_item));
+		free(ki, M_TEMP);
+		ki = new_ptr;
+		ki_size = new_size;
+	}
+
+	tree = SYSCTL_ADD_NODE(&ctx, SYSCTL_STATIC_CHILDREN(_kern_test_routing),
+	    OID_AUTO, kt->name, CTLFLAG_RW, 0, "routing tests");
+	if (tree == NULL)	/* no suite node: SYSCTL_CHILDREN(tree) below would fault */
+		return (ENOMEM);
+	memcpy(&ki[ki_count], kt->tests, kt->num_tests * sizeof(struct ktest_item));
+	/* arg2 of every test node is that test's index in ki[]. */
+	for (int i = 0; i < kt->num_tests; i++)
+		SYSCTL_ADD_PROC(&ctx, SYSCTL_CHILDREN(tree), OID_AUTO,
+		    kt->tests[i].name, (CTLTYPE_INT | CTLFLAG_RW), NULL, ki_count + i,
+		    invoke_test_handler, "I", kt->tests[i].descr);
+	ki_count += kt->num_tests;
+
+	return (0);
+}
+
+static int
+test_modinit(void)
+{
+ sysctl_ctx_init(&ctx);
+
+ test_init_one(&kt_route_ctl);
+
+ inited = 1;
+ return (0);
+}
+
+
+static int
+routing_test_module_event_handler(module_t mod, int what, void *arg __unused)
+{
+	int err;
+
+	switch (what) {
+	case MOD_LOAD:
+		if ((err = test_modinit()) != 0)
+			return (err);
+		break;
+	case MOD_UNLOAD:
+		state_flags = ET_EXITING;
+		/* The test nodes index into ki[]: free it only once they are gone. */
+		if ((err = sysctl_ctx_free(&ctx)) != 0)
+			return (err);	/* nodes still registered: fail the unload */
+		free(ki, M_TEMP);
+		ki = NULL;
+		/* yes --- gross */
+		pause("epoch unload", 2 * hz);
+		break;
+	default:
+		return (EOPNOTSUPP);
+	}
+
+	return (0);
+}
+
+static moduledata_t routing_test_moduledata = {
+ "routing_test",
+ routing_test_module_event_handler,
+ NULL
+};
+
+MODULE_VERSION(routing_test, 1);
+DECLARE_MODULE(routing_test, routing_test_moduledata, SI_SUB_PSEUDO, SI_ORDER_ANY);
Index: sys/tests/routing/test_route_ctl.h
===================================================================
--- /dev/null
+++ sys/tests/routing/test_route_ctl.h
@@ -0,0 +1,73 @@
+#ifndef _SYS_TESTS_ROUTING_TEST_ROUTE_CTL_H_
+#define _SYS_TESTS_ROUTING_TEST_ROUTE_CTL_H_
+
+int create_rte_from_info_wrapper(struct rib_head *rnh, struct rt_addrinfo *info,
+ struct rtentry **ret_rt);
+
+int create_nhop_from_info_wrapper(struct rib_head *rnh, struct rt_addrinfo *info,
+ struct nhop_object **nh_ret);
+
+int create_rt_nh_pair_from_info_wrapper(struct rib_head *rnh,
+ struct rt_addrinfo *info, struct rtentry **ret_rt);
+
+int add_route_wrapper(struct rib_head *rnh, struct rtentry *rt_new,
+ struct rt_addrinfo *info, struct rib_cmd_info *rc);
+
+int del_route_one_wrapper(struct rib_head *rnh, struct rtentry *rt,
+ struct rt_addrinfo *info);
+
+int change_route_wrapper(struct rib_head *rnh, struct rt_addrinfo *info,
+ struct rib_cmd_info *rc);
+
+#ifndef _TEST_CALLER
+int
+create_nhop_from_info_wrapper(struct rib_head *rnh, struct rt_addrinfo *info,
+ struct nhop_object **nh_ret)
+{
+
+ return (create_nhop_from_info(rnh, info, nh_ret));
+}
+
+int
+create_rte_from_info_wrapper(struct rib_head *rnh, struct rt_addrinfo *info,
+ struct rtentry **ret_rt)
+{
+
+ return (create_rte_from_info(rnh, info, ret_rt));
+}
+
+int
+create_rt_nh_pair_from_info_wrapper(struct rib_head *rnh,
+ struct rt_addrinfo *info, struct rtentry **ret_rt)
+{
+
+ return (create_rt_nh_pair_from_info(rnh, info, ret_rt));
+}
+
+int
+add_route_wrapper(struct rib_head *rnh, struct rtentry *rt_new,
+ struct rt_addrinfo *info, struct rib_cmd_info *rc)
+{
+
+ return (add_route(rnh, rt_new, info, rc));
+}
+
+int
+del_route_one_wrapper(struct rib_head *rnh, struct rtentry *rt,
+ struct rt_addrinfo *info)
+{
+
+ return (del_route_one(rnh, rt, info));
+}
+
+int
+change_route_wrapper(struct rib_head *rnh, struct rt_addrinfo *info,
+ struct rib_cmd_info *rc)
+{
+
+ return (change_route(rnh, info, rc));
+}
+#endif
+
+#endif
+
Index: sys/tests/routing/test_route_ctl.c
===================================================================
--- /dev/null
+++ sys/tests/routing/test_route_ctl.c
@@ -0,0 +1,451 @@
+/*-
+ * Copyright (c) 2020, Alexander V. Chernikov <melifaro@freebsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Neither the name of Alexander V. Chernikov nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/domain.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/socket.h>
+#include <sys/systm.h>
+#include <sys/counter.h>
+#include <sys/epoch.h>
+#include <sys/sysctl.h>
+
+#include <netinet/in.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/route.h>
+#include <net/route_var.h>
+#include <net/route/rtentry_var.h>
+
+#include <net/route/nhop.h>
+
+#include "tests/routing/module.h"
+#define _TEST_CALLER
+#include "tests/routing/test_route_ctl.h"
+
+static struct rib_head *
+create_rnh(int family, u_long fibnum)
+{
+	struct domain *dom;
+	struct rib_head *rnh = NULL;	/* stays NULL if the attach hook stores nothing */
+	/* Ask the domain for 'family' to attach a private routing table for 'fibnum'. */
+	for (dom = domains; dom; dom = dom->dom_next) {
+		if (dom->dom_family != family)
+			continue;
+		dom->dom_rtattach((void **)&rnh, 0, fibnum);
+		return (rnh);
+	}
+
+	return (NULL);
+}
+
+static void
+free_rnh(struct rib_head *rnh)
+{
+ struct domain *dom;
+ /* Pass 'rnh' to the dom_rtdetach hook of the domain matching its family; NULL is ignored. */
+ if (rnh == NULL)
+ return;
+
+ for (dom = domains; dom; dom = dom->dom_next) {
+ if (dom->dom_family != rnh->rib_family)
+ continue;
+ dom->dom_rtdetach((void **)&rnh, 0);
+ break;
+ }
+
+}
+
+static size_t
+fill_sa(struct sockaddr *sa, const char *addr)
+{
+ size_t sz;
+ /* Parse 'addr' into 'sa' (IPv6 if it contains ':', else IPv4); returns the sockaddr size. */
+ if (strchr(addr, ':')) {
+ struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)sa;
+ /* The inet_pton() result is not checked. */
+ sz = sizeof(struct sockaddr_in6);
+ bzero(sa6, sz);
+ sa6->sin6_family = AF_INET6;
+ sa6->sin6_len = sz;
+ inet_pton(AF_INET6, addr, &sa6->sin6_addr);
+ } else {
+ struct sockaddr_in *sa4 = (struct sockaddr_in *)sa;
+
+ sz = sizeof(struct sockaddr_in);
+ bzero(sa4, sz);
+ sa4->sin_family = AF_INET;
+ sa4->sin_len = sz;
+ inet_pton(AF_INET, addr, &sa4->sin_addr);
+ }
+
+ return (sz);
+}
+
+static void
+sa_fill_mask4(struct sockaddr_in *sin, int plen)
+{
+	/* Netmask for a 'plen'-bit prefix; 1U because 1 << 31 (plen == 1) overflows int. */
+	memset(sin, 0, sizeof(struct sockaddr_in));
+	sin->sin_family = AF_INET;
+	sin->sin_len = sizeof(struct sockaddr_in);
+	sin->sin_addr.s_addr = htonl(plen ? ~((1U << (32 - plen)) - 1) : 0);
+}
+
+/* Fill 'sin6' with the IPv6 netmask whose leading 'mask' bits are set. */
+static void
+sa_fill_mask6(struct sockaddr_in6 *sin6, uint8_t mask)
+{
+	uint32_t *cp;
+
+	memset(sin6, 0, sizeof(struct sockaddr_in6));
+	sin6->sin6_family = AF_INET6;
+	sin6->sin6_len = sizeof(struct sockaddr_in6);
+	mask = (mask > 128) ? 128 : mask;	/* never write past the 16-byte sin6_addr */
+	for (cp = (uint32_t *)&sin6->sin6_addr; mask >= 32; mask -= 32)
+		*cp++ = 0xFFFFFFFF;
+	if (mask > 0)	/* 1..31 bits left; 1U keeps the 31-bit shift defined */
+		*cp = htonl(~((1U << (32 - mask)) - 1));
+}
+
+static struct rt_addrinfo *
+get_info(char *buf, char *_prefix, char *gw_s)
+{
+ struct rt_addrinfo *info;
+ struct sockaddr *dst, *gw;
+ struct sockaddr_in6 *sa6;
+ struct sockaddr_in *sa4;
+ char prefix[128], *d;
+ struct epoch_tracker et;
+ size_t sz;
+ /* Carve a rt_addrinfo and its sockaddrs (mask, dst, gateway) out of the caller's 'buf'. */
+ info = (struct rt_addrinfo *)buf;
+ buf += sizeof(struct rt_addrinfo);
+
+ bzero(info, sizeof(struct rt_addrinfo));
+ /* Split "addr/plen" at the '/'; a prefix length yields an RTAX_NETMASK entry. */
+ strlcpy(prefix, _prefix, sizeof(prefix));
+ d = strchr(prefix, '/');
+ if (d != NULL) {
+ *d++ = '\0';
+ if (strchr(prefix, ':')) {
+ sa6 = (struct sockaddr_in6 *)buf;
+ sa_fill_mask6(sa6, strtol(d, NULL, 10));
+ sz = sa6->sin6_len;
+ } else {
+ sa4 = (struct sockaddr_in *)buf;
+ sa_fill_mask4(sa4, strtol(d, NULL, 10));
+ sz = sa4->sin_len;
+ }
+ info->rti_info[RTAX_NETMASK] = (struct sockaddr *)buf;
+ buf += sz;
+ }
+
+ dst = (struct sockaddr *)buf;
+ buf += fill_sa(dst, prefix);
+ info->rti_info[RTAX_DST] = dst;
+
+ if (gw_s != NULL) {
+ gw = (struct sockaddr *)buf;
+ buf += fill_sa(gw, gw_s);
+ info->rti_info[RTAX_GATEWAY] = gw;
+ /* rti_ifa/rti_ifp come from ifa_ifwithnet() on the gateway and stay NULL if it finds none. */
+ NET_EPOCH_ENTER(et);
+ info->rti_ifa = ifa_ifwithnet(gw, 0, 0);
+ NET_EPOCH_EXIT(et);
+ if (info->rti_ifa != NULL)
+ info->rti_ifp = info->rti_ifa->ifa_ifp;
+ }
+
+ return (info);
+}
+
+static int
+test_add_route_plain_add_success()
+{
+ int error;
+ struct rib_cmd_info rc;
+ struct rt_addrinfo *info;
+ struct rib_head *rnh;
+ struct rtentry *rt;
+ struct radix_node *rn;
+ struct epoch_tracker et;
+
+ char *buf = malloc(1024, M_TEMP, M_WAITOK | M_ZERO);
+
+ rnh = create_rnh(AF_INET6, 0);
+ info = get_info(buf, "2001:db8:1::/64", "::1");
+
+ if (info == NULL || info->rti_ifp == NULL || info->rti_ifa == NULL) {
+ TPRINTF("failed to create info");
+ free_rnh(rnh);
+ free(buf, M_TEMP);
+ return (EINVAL);
+ }
+
+ /* done by rib_add_route() */
+ bzero(&rc, sizeof(struct rib_cmd_info));
+
+ NET_EPOCH_ENTER(et);
+
+ error = create_rt_nh_pair_from_info_wrapper(rnh, info, &rt);
+ if (error != 0) {
+ TPRINTF("create_rt_nh_pair_from_info() failed: %d", error);
+ } else {
+ error = add_route_wrapper(rnh, rt, info, &rc);
+ if (error == 0) {
+ rn = rnh->rnh_lookup(rt_key(rt), rt_mask(rt), &rnh->head);
+ TASSERT((struct rtentry *)rn == rt, "inserted rt not found");
+ /* verify rc */
+ TASSERT(rc.cmd == RTM_ADD, "cmd!=RTM_ADD:%d", rc.cmd);
+ TASSERT(rc.rt == rt, "rc.rt!=rt");
+ TASSERT(rc.nh_old == NULL, "rc.nh_old!=NULL");
+ TASSERT(rc.nh_new == rt->rt_nhop, "rc.nh_new!=rt.rt_nhop");
+ } else {
+ TPRINTF("add_route() returned %d", error);
+ }
+ }
+ NET_EPOCH_EXIT(et);
+ free(buf, M_TEMP);
+ free_rnh(rnh);
+
+ return (error);
+}
+
+static int
+test_add_route_exist_fail()
+{
+ int error;
+ struct rib_cmd_info rc;
+ struct rt_addrinfo *info;
+ struct rib_head *rnh;
+ struct rtentry *rt, *rt2;
+ struct epoch_tracker et;
+
+ char *buf = malloc(1024, M_TEMP, M_WAITOK | M_ZERO);
+
+ rnh = create_rnh(AF_INET6, 0);
+ info = get_info(buf, "2001:db8:1::/64", "::1");
+
+ if (info == NULL || info->rti_ifp == NULL || info->rti_ifa == NULL) {
+ TPRINTF("failed to create info");
+ free_rnh(rnh);
+ free(buf, M_TEMP);
+ return (EINVAL);
+ }
+ /* Do not set RTF_GATEWAY so the first route is multipath ineligible */
+
+ /* done by rib_add_route() */
+ bzero(&rc, sizeof(struct rib_cmd_info));
+
+ NET_EPOCH_ENTER(et);
+
+ error = create_rt_nh_pair_from_info_wrapper(rnh, info, &rt);
+ if (error != 0) {
+ TPRINTF("create_rt_nh_pair_from_info() failed: %d", error);
+ goto cleanup;
+ }
+ /* Set RTF_GATEWAY so the new nexthop is different */
+ info->rti_flags |= RTF_GATEWAY;
+ error = create_rt_nh_pair_from_info_wrapper(rnh, info, &rt2);
+ if (error != 0) {
+ TPRINTF("second create_rt_nh_pair_from_info() failed: %d", error);
+ goto cleanup;
+ }
+
+ error = add_route_wrapper(rnh, rt, info, &rc);
+ if (error != 0) {
+ TPRINTF("add_route() returned %d", error);
+ goto cleanup;
+ }
+
+ if (rnh->rnh_lookup(rt_key(rt), rt_mask(rt), &rnh->head) == NULL) {
+ TPRINTF("added route not found");
+ error = EINVAL;
+ goto cleanup;
+ }
+
+ error = add_route_wrapper(rnh, rt2, info, &rc);
+ if (error != EEXIST) {
+ TPRINTF("add_route() returned %d instead of EEXIST", error);
+ goto cleanup;
+ }
+
+ error = 0;
+cleanup:
+ NET_EPOCH_EXIT(et);
+ free(buf, M_TEMP);
+ free_rnh(rnh);
+
+ return (error);
+}
+
+static int
+test_add_route_pinned_success()
+{
+ int error;
+ struct rib_cmd_info rc;
+ struct rt_addrinfo *info;
+ struct rib_head *rnh;
+ struct rtentry *rt, *rt2;
+ struct epoch_tracker et;
+
+ char *buf = malloc(1024, M_TEMP, M_WAITOK | M_ZERO);
+
+ rnh = create_rnh(AF_INET6, 0);
+ info = get_info(buf, "2001:db8:1::/64", "::1");
+
+ if (info == NULL || info->rti_ifp == NULL || info->rti_ifa == NULL) {
+ TPRINTF("failed to create info");
+ free_rnh(rnh);
+ free(buf, M_TEMP);
+ return (EINVAL);
+ }
+
+ /* done by rib_add_route() */
+ bzero(&rc, sizeof(struct rib_cmd_info));
+
+ NET_EPOCH_ENTER(et);
+
+ error = create_rt_nh_pair_from_info_wrapper(rnh, info, &rt);
+ if (error != 0) {
+ TPRINTF("create_rt_nh_pair_from_info() failed: %d", error);
+ goto cleanup;
+ }
+
+ info->rti_flags |= RTF_PINNED;
+ error = create_rt_nh_pair_from_info_wrapper(rnh, info, &rt2);
+ if (error != 0) {
+ TPRINTF("second create_rt_nh_pair_from_info() failed: %d", error);
+ goto cleanup;
+ }
+
+ error = add_route_wrapper(rnh, rt, info, &rc);
+ if (error != 0) {
+ TPRINTF("add_route() returned %d", error);
+ goto cleanup;
+ }
+
+ if (rnh->rnh_lookup(rt_key(rt), rt_mask(rt), &rnh->head) == NULL) {
+ TPRINTF("added route not found");
+ error = EINVAL;
+ goto cleanup;
+ }
+
+ error = add_route_wrapper(rnh, rt2, info, &rc);
+ if (error != 0) {
+ TPRINTF("second add_route() returned %d", error);
+ goto cleanup;
+ }
+cleanup:
+ NET_EPOCH_EXIT(et);
+ free(buf, M_TEMP);
+ free_rnh(rnh);
+
+ return (error);
+}
+
+static int
+test_del_route_plain_del_success()
+{
+	int error;
+	struct rib_cmd_info rc;
+	struct rt_addrinfo *info;
+	struct rib_head *rnh;
+	struct rtentry *rt;
+	struct epoch_tracker et;
+
+	char *buf = malloc(1024, M_TEMP, M_WAITOK | M_ZERO);
+	/* Test: a route inserted with add_route() can be removed with del_route_one(). */
+	rnh = create_rnh(AF_INET6, 0);
+	info = get_info(buf, "2001:db8:1::/64", "::1");
+
+	if (info == NULL || info->rti_ifp == NULL || info->rti_ifa == NULL) {
+		TPRINTF("failed to create info");
+		free_rnh(rnh);
+		free(buf, M_TEMP);
+		return (EINVAL);
+	}
+
+	/* done by rib_del_route() */
+	bzero(&rc, sizeof(struct rib_cmd_info));
+
+	NET_EPOCH_ENTER(et);
+	/* Insert a route first so there is something to delete. */
+	error = create_rt_nh_pair_from_info_wrapper(rnh, info, &rt);
+	if (error != 0) {
+		TPRINTF("create_rt_nh_pair_from_info() failed: %d", error);
+		goto cleanup;
+	}
+
+	error = add_route_wrapper(rnh, rt, info, &rc);
+	if (error != 0) {
+		TPRINTF("add_route() returned %d", error);
+		goto cleanup;
+	}
+	/* The wrapper takes the rtentry pointer itself (not its address), under the RIB write lock. */
+	RIB_WLOCK(rnh);
+	error = del_route_one_wrapper(rnh, rt, info);
+	RIB_WUNLOCK(rnh);
+
+	if (error != 0) {
+		TPRINTF("del_route_one() returned %d", error);
+		goto cleanup;
+	}
+	/* The prefix must no longer be found in the tree. */
+	if (rnh->rnh_lookup(rt_key(rt), rt_mask(rt), &rnh->head) != NULL) {
+		TPRINTF("deleted route still in tree");
+		error = EINVAL;
+		goto cleanup;
+	}
+cleanup:
+	NET_EPOCH_EXIT(et);
+	free(buf, M_TEMP);
+	free_rnh(rnh);
+
+	return (error);
+}
+
+struct ktest_item tests[] = {
+ DECLARE_KTEST(test_add_route_plain_add_success),
+ DECLARE_KTEST(test_add_route_exist_fail),
+ DECLARE_KTEST(test_add_route_pinned_success),
+ DECLARE_KTEST(test_del_route_plain_del_success),
+};
+DEFINE_KTESTS(route_ctl, "routing control plane tests", tests);
+
Index: usr.bin/netstat/Makefile
===================================================================
--- usr.bin/netstat/Makefile
+++ usr.bin/netstat/Makefile
@@ -5,7 +5,7 @@
PROG= netstat
SRCS= if.c inet.c main.c mbuf.c mroute.c netisr.c nl_symbols.c route.c \
- unix.c mroute6.c ipsec.c bpf.c pfkey.c sctp.c \
+ unix.c mroute6.c ipsec.c bpf.c pfkey.c sctp.c common.c nhops.c \
nl_defs.h
nl_symbols.c: nlist_symbols
Index: usr.bin/netstat/common.h
===================================================================
--- /dev/null
+++ usr.bin/netstat/common.h
@@ -0,0 +1,24 @@
+#ifndef _NETSTAT_COMMON_H_
+#define _NETSTAT_COMMON_H_
+
+struct bits {
+ u_long b_mask;
+ char b_val;
+ const char *b_name;
+};
+extern struct bits rt_bits[];
+
+const char *fmt_flags(const struct bits *p, int f);
+void print_flags_generic(int flags, const struct bits *pbits,
+ const char *format, const char *tag_name);
+int print_sockaddr(const char *name, struct sockaddr *sa,
+ struct sockaddr *mask, int flags, int width);
+
+struct ifmap_entry {
+ char ifname[IFNAMSIZ];
+};
+
+struct ifmap_entry *prepare_ifmap(size_t *ifmap_size);
+
+#endif
+
Index: usr.bin/netstat/common.c
===================================================================
--- /dev/null
+++ usr.bin/netstat/common.c
@@ -0,0 +1,140 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1983, 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/time.h>
+
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <arpa/inet.h>
+#include <ifaddrs.h>
+#include <libutil.h>
+#include <netdb.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+#include <err.h>
+#include <libxo/xo.h>
+#include "netstat.h"
+#include "common.h"
+
+const char *
+fmt_flags(const struct bits *p, int f)
+{
+	static char name[33];	/* shared result buffer, overwritten by every call */
+	char *flags;
+	/* One b_val per table entry whose b_mask is set in 'f'; stop before name[] overflows. */
+	for (flags = name; p->b_mask && flags < &name[sizeof(name) - 1]; p++)
+		if (p->b_mask & f)
+			*flags++ = p->b_val;
+	*flags = '\0';
+	return (name);
+}
+
+void
+print_flags_generic(int flags, const struct bits *pbits, const char *format,
+ const char *tag_name)
+{
+ const struct bits *p;
+ char tag_fmt[64];
+
+ xo_emit(format, fmt_flags(pbits, flags));
+
+ snprintf(tag_fmt, sizeof(tag_fmt), "{le:%s/%%s}", tag_name);
+ xo_open_list(tag_name);
+ for (p = pbits; p->b_mask; p++)
+ if (p->b_mask & flags)
+ xo_emit(tag_fmt, p->b_name);
+ xo_close_list(tag_name);
+}
+
+struct ifmap_entry *
+prepare_ifmap(size_t *pifmap_size)
+{
+	int ifindex = 0, size;
+	struct ifaddrs *ifap, *ifa;
+	struct sockaddr_dl *sdl;
+
+	struct ifmap_entry *ifmap = NULL;
+	int ifmap_size = 0;
+	/* Returns a heap array indexed by interface index; its entry count goes to *pifmap_size. */
+	/*
+	 * Retrieve interface list at first
+	 * since we need #ifindex -> if_xname match
+	 */
+	if (getifaddrs(&ifap) != 0)
+		err(EX_OSERR, "getifaddrs");
+
+	for (ifa = ifap; ifa; ifa = ifa->ifa_next) {
+		/* Only link-level entries carry the index; an entry may have no address at all. */
+		if (ifa->ifa_addr == NULL || ifa->ifa_addr->sa_family != AF_LINK)
+			continue;
+
+		sdl = (struct sockaddr_dl *)ifa->ifa_addr;
+		ifindex = sdl->sdl_index;
+
+		if (ifindex >= ifmap_size) {
+			size = roundup(ifindex + 1, 32) *
+			    sizeof(struct ifmap_entry);
+			if ((ifmap = realloc(ifmap, size)) == NULL)
+				errx(2, "realloc(%d) failed", size);
+			memset(&ifmap[ifmap_size], 0,
+			    size - ifmap_size *
+			    sizeof(struct ifmap_entry));
+			/* Zero only the newly added tail so earlier names survive. */
+			ifmap_size = roundup(ifindex + 1, 32);
+		}
+
+		if (*ifmap[ifindex].ifname != '\0')
+			continue;
+
+		strlcpy(ifmap[ifindex].ifname, ifa->ifa_name, IFNAMSIZ);
+	}
+
+	freeifaddrs(ifap);
+
+	*pifmap_size = ifmap_size;
+
+	return (ifmap);
+}
+
Index: usr.bin/netstat/main.c
===================================================================
--- usr.bin/netstat/main.c
+++ usr.bin/netstat/main.c
@@ -214,6 +214,7 @@
int noutputs = 0; /* how much outputs before we exit */
int numeric_addr; /* show addresses numerically */
int numeric_port; /* show ports numerically */
+int oflag; /* show nexthop objects*/
int Pflag; /* show TCP log ID */
static int pflag; /* show given protocol */
static int Qflag; /* show netisr information */
@@ -248,7 +249,7 @@
if (argc < 0)
exit(EXIT_FAILURE);
- while ((ch = getopt(argc, argv, "46AaBbdF:f:ghI:iLlM:mN:nPp:Qq:RrSTsuWw:xz"))
+ while ((ch = getopt(argc, argv, "46AaBbdF:f:ghI:iLlM:mN:noPp:Qq:RrSTsuWw:xz"))
!= -1)
switch(ch) {
case '4':
@@ -345,6 +346,9 @@
case 'n':
numeric_addr = numeric_port = 1;
break;
+ case 'o':
+ oflag = 1;
+ break;
case 'P':
Pflag = 1;
break;
@@ -494,6 +498,15 @@
xo_finish();
exit(0);
}
+ if (oflag) {
+ xo_open_container("statistics");
+ nhops_print(fib, af);
+ nhgrp_print(fib, af);
+ xo_close_container("statistics");
+ xo_finish();
+ exit(0);
+ }
+
if (gflag) {
xo_open_container("statistics");
Index: usr.bin/netstat/netstat.h
===================================================================
--- usr.bin/netstat/netstat.h
+++ usr.bin/netstat/netstat.h
@@ -147,6 +147,10 @@
char *routename(struct sockaddr *, int);
const char *netname(struct sockaddr *, struct sockaddr *);
void routepr(int, int);
+int p_sockaddr(const char *name, struct sockaddr *sa,
+ struct sockaddr *mask, int flags, int width);
+const char *fmt_sockaddr(struct sockaddr *sa, struct sockaddr *mask,
+ int flags);
#ifdef NETGRAPH
void netgraphprotopr(u_long, const char *, int, int);
@@ -157,3 +161,6 @@
void mroutepr(void);
void mrt_stats(void);
void bpf_stats(char *);
+void nhops_print(int fibnum, int af);
+void nhgrp_print(int fibnum, int af);
+
Index: usr.bin/netstat/nhops.c
===================================================================
--- /dev/null
+++ usr.bin/netstat/nhops.c
@@ -0,0 +1,687 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1983, 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/time.h>
+
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <net/if_types.h>
+#include <net/route.h>
+#include <net/route/nhop.h>
+
+#include <netinet/in.h>
+#include <netgraph/ng_socket.h>
+
+#include <arpa/inet.h>
+#include <ifaddrs.h>
+#include <libutil.h>
+#include <netdb.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <sysexits.h>
+#include <unistd.h>
+#include <err.h>
+#include <libxo/xo.h>
+#include "netstat.h"
+#include "common.h"
+
+/* Default column widths by address family; each column is followed by one space. */
+#ifndef INET6
+#define WID_DST_DEFAULT(af) 18 /* width of destination column */
+#define WID_GW_DEFAULT(af) 18 /* width of gateway column */
+#define WID_IF_DEFAULT(af) (Wflag ? 10 : 8) /* width of netif column */
+#else
+#define WID_DST_DEFAULT(af) \
+ ((af) == AF_INET6 ? (numeric_addr ? 33: 18) : 18)
+#define WID_GW_DEFAULT(af) \
+ ((af) == AF_INET6 ? (numeric_addr ? 29 : 18) : 18)
+#define WID_IF_DEFAULT(af) ((af) == AF_INET6 ? 8 : (Wflag ? 10 : 8))
+#endif /* INET6; note WID_DST_DEFAULT is not referenced anywhere in this file */
+static int wid_dst; /* "IFA" column; assigned from WID_GW_DEFAULT() */
+static int wid_gw; /* "Gateway" column */
+static int wid_flags; /* "Flags" column */
+static int wid_pksent; /* "Use" column (wide output only) */
+static int wid_mtu; /* "Mtu" column (wide output only) */
+static int wid_if; /* "Netif" and "Addrif" columns */
+static int wid_nhidx; /* "Idx" column and every numeric column of the group table */
+static int wid_nhtype; /* "Type" column (wide output only) */
+static int wid_refcnt; /* nexthop "Refcnt" column */
+static int wid_prepend; /* "Prepend" column; never assigned in this file, so it stays 0 */
+
+static struct bits nh_bits[] = { /* NHF_* flag -> one-letter code and long name */
+ { NHF_REJECT, 'R', "reject" },
+ { NHF_BLACKHOLE,'B', "blackhole" },
+ { NHF_REDIRECT, 'r', "redirect" },
+ { NHF_GATEWAY, 'G', "gateway" },
+ { NHF_DEFAULT, 'd', "default" },
+ { NHF_BROADCAST,'b', "broadcast" },
+ { 0 , 0, NULL } /* zero mask terminates the table */
+};
+
+static char *nh_types[] = { /* display names, indexed by nh_type in print_nhop_entry_sysctl() */
+ "empty", /* 0 */
+ "v4/resolve", /* 1 */
+ "v4/gw", /* 2 */
+ "v6/resolve", /* 3 */
+ "v6/gw" /* 4 */
+};
+
+struct nhop_entry { /* printable summary of one nexthop, reused by the group listing */
+ char gw[64]; /* gateway text as stored by nhop_map_update() */
+ char ifname[IFNAMSIZ]; /* interface name; nhop_get() treats an empty string as "no entry" */
+};
+
+struct nhop_map { /* growable array of nhop_entry, indexed by nexthop index */
+ struct nhop_entry *ptr; /* realloc()ed storage; NULL until the first update */
+ size_t size; /* number of entries allocated in ptr */
+};
+static struct nhop_map global_nhop_map; /* written while printing nexthops, read while printing groups */
+
+static void nhop_map_update(struct nhop_map *map, uint32_t idx,
+ char *gw, char *ifname);
+static struct nhop_entry *nhop_get(struct nhop_map *map, uint32_t idx);
+
+
+static struct ifmap_entry *ifmap; /* interface index -> name table from prepare_ifmap() */
+static size_t ifmap_size; /* number of entries in ifmap */
+
+/*
+ * Write the numeric text form of the address in sa into buf (bufsize bytes).
+ *
+ * AF_INET and AF_INET6 are converted with inet_ntop(3); any other family
+ * yields the string "unknown:<family>".  If the conversion fails, buf is
+ * set to the empty string rather than left with undefined contents.
+ *
+ * NOTE(review): no caller of this helper is visible in this file.
+ */
+static void
+print_sockaddr_buf(char *buf, size_t bufsize, const struct sockaddr *sa)
+{
+	const void *addr;
+
+	switch (sa->sa_family) {
+	case AF_INET:
+		addr = &((const struct sockaddr_in *)sa)->sin_addr;
+		break;
+	case AF_INET6:
+		addr = &((const struct sockaddr_in6 *)sa)->sin6_addr;
+		break;
+	default:
+		snprintf(buf, bufsize, "unknown:%d", sa->sa_family);
+		return;
+	}
+
+	/* inet_ntop() returns NULL on failure, e.g. when buf is too small. */
+	if (inet_ntop(sa->sa_family, addr, buf, bufsize) == NULL &&
+	    bufsize > 0)
+		buf[0] = '\0';
+}
+
+/*
+ * Emit addr as the libxo field called name, followed by one space.
+ *
+ *  - width < 0: the text is printed as is, with no padding.
+ *  - Wflag or numeric_addr set: the text is padded to width but never cut;
+ *    the number of characters by which it exceeds width is returned so the
+ *    caller can shrink the next column.
+ *  - otherwise: the text is padded and truncated to width.
+ *
+ * Returns the overhang described above, or 0 in the other two cases.
+ */
+static int
+print_addr(const char *name, const char *addr, int width)
+{
+	char fmt[128];
+	int overhang;
+
+	if (width < 0) {
+		snprintf(fmt, sizeof(fmt), "{:%s/%%s} ", name);
+		xo_emit(fmt, addr);
+		return (0);
+	}
+
+	if (Wflag == 0 && !numeric_addr) {
+		/* Fixed-width mode: the precision argument clips the text. */
+		snprintf(fmt, sizeof(fmt), "{[:%d}{:%s/%%-.*s}{]:} ",
+		    -width, name);
+		xo_emit(fmt, width, addr);
+		return (0);
+	}
+
+	snprintf(fmt, sizeof(fmt), "{[:%d}{:%s/%%s}{]:} ",
+	    -width, name);
+	xo_emit(fmt, addr);
+	overhang = strlen(addr) - width;
+	return (overhang < 0 ? 0 : overhang);
+}
+
+
+/*
+ * Print the heading line of the nexthop table.
+ *
+ * With Wflag the full set of columns is shown (Idx, Type, IFA, Gateway,
+ * Flags, Use, Mtu, Netif, Addrif, Refcnt, Prepend); otherwise only Idx,
+ * IFA, Gateway, Flags, Netif and Refcnt.  Column widths come from the
+ * file-scope wid_* variables, which the caller sets beforehand.  The
+ * address family argument is not used.
+ */
+static void
+print_nhop_header(int af1 __unused)
+{
+
+	if (Wflag) {
+		xo_emit("{T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%*.*s} "
+		    "{T:/%*.*s} {T:/%-*.*s} {T:/%*.*s} {T:/%*.*s} {T:/%*.*s} {T:/%*s}\n",
+		    wid_nhidx, wid_nhidx, "Idx",
+		    wid_nhtype, wid_nhtype, "Type",
+		    wid_dst, wid_dst, "IFA",
+		    wid_gw, wid_gw, "Gateway",
+		    wid_flags, wid_flags, "Flags",
+		    wid_pksent, wid_pksent, "Use",
+		    wid_mtu, wid_mtu, "Mtu",
+		    wid_if, wid_if, "Netif",
+		    wid_if, wid_if, "Addrif",
+		    wid_refcnt, wid_refcnt, "Refcnt",
+		    wid_prepend, "Prepend");
+	} else {
+		/*
+		 * The last column is the reference count, so it is sized by
+		 * wid_refcnt like the rows are, not by the prepend width.
+		 */
+		xo_emit("{T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%*.*s} "
+		    " {T:/%*s}\n",
+		    wid_nhidx, wid_nhidx, "Idx",
+		    wid_dst, wid_dst, "IFA",
+		    wid_gw, wid_gw, "Gateway",
+		    wid_flags, wid_flags, "Flags",
+		    wid_if, wid_if, "Netif",
+		    wid_refcnt, "Refcnt");
+	}
+}
+
+/*
+ * Print the heading line of the nexthop-group table: MpIdx, NHIdx, Weight,
+ * Slots, Gateway, Netif and Refcnt.  All numeric columns share wid_nhidx;
+ * the address family argument is not used.
+ */
+static void
+print_nhgroup_header(int af1 __unused)
+{
+	const int num_w = wid_nhidx;
+
+	xo_emit(
+	    "{T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} {T:/%-*.*s} "
+	    "{T:/%*.*s} {T:/%-*.*s} {T:/%*s}\n",
+	    num_w, num_w, "MpIdx",
+	    num_w, num_w, "NHIdx",
+	    num_w, num_w, "Weight",
+	    num_w, num_w, "Slots",
+	    wid_gw, wid_gw, "Gateway",
+	    wid_if, wid_if, "Netif",
+	    num_w, "Refcnt");
+}
+
+/*
+ * Print one nexthop group as a libxo instance called name.
+ *
+ * The message body is read as: struct mpath_external, followed by
+ * mp_nh_count struct mpath_nhop_external records, followed by
+ * mp_group_size uint32_t nexthop indices (the forwarding slots).
+ *
+ * Output is a summary row (group index, placeholder dashes, refcount) and
+ * then one row per member nexthop with its index, weight, the number of
+ * forwarding slots that point at it, and - when the nexthop is present in
+ * global_nhop_map - its gateway and interface.
+ *
+ * Slot indices come from the kernel dump and are only counted when they
+ * fall inside global_nhop_map; anything else is ignored instead of being
+ * used as an array index.  rtm is accepted for symmetry with the nexthop
+ * printer but is not used.
+ *
+ * NOTE(review): the field name "dummy-2" is emitted twice in the summary
+ * row, and the remaining %lu arguments rely on the mpath_* fields being
+ * unsigned long sized - confirm against the structure definitions.
+ */
+static void
+print_nhgroup_entry_sysctl(const char *name, struct rt_msghdr *rtm __unused,
+    struct mpath_external *mpe)
+{
+	char buffer[128];
+	struct mpath_nhop_external *ext;
+	struct nhop_entry *ne;
+	uint32_t *fwd_c, *pidx;
+	size_t map_size;
+	unsigned long slots;
+
+	xo_open_instance(name);
+
+	snprintf(buffer, sizeof(buffer), "{[:-%d}{:mp_index/%%lu}{]:} ", wid_nhidx);
+	xo_emit(buffer, mpe->mp_idx);
+
+	xo_emit("{t:dummy-1/%*.*s}", wid_nhidx, wid_nhidx, "----");
+	xo_emit("{t:dummy-2/%*.*s}", wid_nhidx, wid_nhidx, "----");
+	xo_emit("{t:dummy-2/%*.*s}", wid_nhidx, wid_nhidx, "----");
+	xo_emit("{t:dummy-3/%*.*s}", wid_gw, wid_gw, "----");
+	xo_emit("{t:dummy-4/%*.*s}", wid_if, wid_if, "----");
+	xo_emit("{t:mp-refcnt/%*lu}", wid_nhidx, mpe->mp_refcount);
+	xo_emit("\n");
+
+	ext = (struct mpath_nhop_external *)(mpe + 1);
+	pidx = (uint32_t *)&ext[mpe->mp_nh_count];
+
+	/*
+	 * Count how many forwarding slots reference each nexthop index.
+	 * Allocate at least one counter so an empty map still yields a
+	 * valid pointer.
+	 */
+	map_size = global_nhop_map.size;
+	fwd_c = calloc(map_size != 0 ? map_size : 1, sizeof(*fwd_c));
+	if (fwd_c == NULL)
+		errx(2, "calloc(%zu) failed", map_size);
+	for (uint32_t i = 0; i < mpe->mp_group_size; i++) {
+		if (pidx[i] < map_size)
+			fwd_c[pidx[i]]++;
+	}
+
+	xo_open_list("nhop_weights");
+	for (uint32_t i = 0; i < mpe->mp_nh_count; i++) {
+		xo_open_instance("nhop-weight");
+		/* Empty first column keeps members aligned under the group. */
+		snprintf(buffer, sizeof(buffer), "{[:-%d}{:d-0/%%s}{]:} ", wid_nhidx);
+		xo_emit(buffer, "");
+		xo_emit("{t:nh-index/%*lu} ", wid_nhidx, ext[i].nh_idx);
+		xo_emit("{t:nh-weight/%*lu} ", wid_nhidx, ext[i].nh_weight);
+		slots = 0;
+		if (ext[i].nh_idx < map_size)
+			slots = fwd_c[ext[i].nh_idx];
+		xo_emit("{t:nh-slots/%*lu} ", wid_nhidx, slots);
+		ne = nhop_get(&global_nhop_map, ext[i].nh_idx);
+		if (ne != NULL) {
+			xo_emit("{t:nh-gw/%*.*s}", wid_gw, wid_gw, ne->gw);
+			xo_emit("{t:nh-interface/%*.*s}", wid_if, wid_if, ne->ifname);
+		}
+		xo_emit("\n");
+		xo_close_instance("nhop-weight");
+	}
+	xo_close_list("nhop_weights");
+
+	free(fwd_c);
+	xo_close_instance(name);
+}
+
+
+/*
+ * Fetch the nexthop-group dump for (af, fibnum) through the
+ * CTL_NET/PF_ROUTE/NET_RT_NHGROUPS sysctl and print every group in it.
+ *
+ * The dump is a sequence of rt_msghdr-framed messages; each one is followed
+ * directly by a struct mpath_external.  Messages with a foreign
+ * rtm_version are skipped.  The family banner, the column widths and the
+ * table header are set up once, when the first printable message is seen,
+ * so they are also produced when af is AF_UNSPEC.
+ *
+ * Any sysctl or allocation failure terminates the program.
+ */
+static void
+print_nhgrp_sysctl(int fibnum, int af)
+{
+	size_t needed;
+	int mib[7];
+	char *buf, *next, *lim;
+	struct rt_msghdr *rtm;
+	struct mpath_external *mp;
+	bool need_table_close = false;
+
+	mib[0] = CTL_NET;
+	mib[1] = PF_ROUTE;
+	mib[2] = 0;
+	mib[3] = af;
+	mib[4] = NET_RT_NHGROUPS;
+	mib[5] = 0;
+	mib[6] = fibnum;
+	/* First call sizes the buffer, second call fills it. */
+	if (sysctl(mib, nitems(mib), NULL, &needed, NULL, 0) < 0)
+		err(EX_OSERR, "sysctl: net.route.0.%d.nhgrpdump.%d estimate",
+		    af, fibnum);
+	if ((buf = malloc(needed)) == NULL)
+		errx(2, "malloc(%lu)", (unsigned long)needed);
+	if (sysctl(mib, nitems(mib), buf, &needed, NULL, 0) < 0)
+		err(1, "sysctl: net.route.0.%d.nhgrpdump.%d", af, fibnum);
+	lim = buf + needed;
+	xo_open_container("nhgrp-table");
+	xo_open_list("rt-family");
+	for (next = buf; next < lim; next += rtm->rtm_msglen) {
+		rtm = (struct rt_msghdr *)next;
+		/* A zero-length message would never advance the cursor. */
+		if (rtm->rtm_msglen == 0)
+			break;
+		if (rtm->rtm_version != RTM_VERSION)
+			continue;
+
+		mp = (struct mpath_external *)(rtm + 1);
+		/* Only print family and header first time. */
+		if (!need_table_close) {
+			need_table_close = true;
+
+			wid_dst = WID_GW_DEFAULT(af);
+			wid_gw = WID_GW_DEFAULT(af);
+			wid_nhidx = 5;
+			wid_nhtype = 12;
+			wid_refcnt = 6;
+			wid_flags = 6;
+			wid_pksent = 8;
+			wid_mtu = 6;
+			wid_if = WID_IF_DEFAULT(af);
+			xo_open_instance("rt-family");
+			pr_family(af);
+			xo_open_list("nhgrp-entry");
+
+			print_nhgroup_header(af);
+		}
+		print_nhgroup_entry_sysctl("nhgrp-entry", rtm, mp);
+	}
+	if (need_table_close) {
+		xo_close_list("nhgrp-entry");
+		xo_close_instance("rt-family");
+	}
+	xo_close_list("rt-family");
+	xo_close_container("nhgrp-table");
+	free(buf);
+}
+
+/*
+ * Record the gateway text and interface name of nexthop idx in map.
+ *
+ * The array is grown on demand: it starts at 32 entries, doubles after
+ * that, and is rounded up to a multiple of 32 past idx when doubling is not
+ * enough.  New entries are zero-filled.  Both strings are copied with
+ * truncation to the size of the destination field.  Allocation failure
+ * terminates the program.
+ */
+static void
+nhop_map_update(struct nhop_map *map, uint32_t idx, char *gw, char *ifname)
+{
+	if (idx >= map->size) {
+		size_t new_size, sz;
+
+		new_size = (map->size == 0) ? 32 : map->size * 2;
+		/* Widen before adding 1 so the largest index cannot wrap. */
+		if (new_size <= idx)
+			new_size = roundup((size_t)idx + 1, 32);
+
+		sz = new_size * sizeof(struct nhop_entry);
+		if ((map->ptr = realloc(map->ptr, sz)) == NULL)
+			errx(2, "realloc(%zu) failed", sz);
+
+		memset(&map->ptr[map->size], 0,
+		    (new_size - map->size) * sizeof(struct nhop_entry));
+		map->size = new_size;
+	}
+
+	strlcpy(map->ptr[idx].ifname, ifname, sizeof(map->ptr[idx].ifname));
+	strlcpy(map->ptr[idx].gw, gw, sizeof(map->ptr[idx].gw));
+}
+
+/*
+ * Look up nexthop idx in map.  Returns the stored entry, or NULL when idx
+ * lies outside the array or the slot has an empty interface name (i.e. it
+ * was never filled in).
+ */
+static struct nhop_entry *
+nhop_get(struct nhop_map *map, uint32_t idx)
+{
+	struct nhop_entry *slot;
+
+	if (idx < map->size) {
+		slot = &map->ptr[idx];
+		if (slot->ifname[0] != '\0')
+			return (slot);
+	}
+	return (NULL);
+}
+
+/*
+ * Print one nexthop as a libxo instance called name and remember its
+ * gateway/interface in global_nhop_map for the nexthop-group listing.
+ *
+ * The two sockaddrs that follow the nhop_external header are read as the
+ * gateway first and the interface address second, the latter located
+ * sa_len bytes after the former.  A nexthop without NHF_GATEWAY is shown as
+ * "<ifname>/resolve".  The type, use, mtu, address-interface and prepend
+ * columns are printed only with Wflag.
+ *
+ * The type name is looked up with a range check, so a type value outside
+ * nh_types[] prints as "unknown" instead of reading past the table.
+ *
+ * NOTE(review): the prepend column prints a fixed placeholder string, not
+ * data taken from the nexthop.
+ */
+static void
+print_nhop_entry_sysctl(const char *name, struct rt_msghdr *rtm, struct nhop_external *nh)
+{
+	char buffer[128];
+	char iface_name[128];
+	int protrusion;
+	char gw_addr[64];
+	struct sockaddr *sa_gw, *sa_ifa;
+
+	xo_open_instance(name);
+
+	snprintf(buffer, sizeof(buffer), "{[:-%d}{:index/%%lu}{]:} ", wid_nhidx);
+	xo_emit(buffer, nh->nh_idx);
+
+	if (Wflag) {
+		const char *cp = "unknown";
+
+		if ((size_t)nh->nh_type < nitems(nh_types))
+			cp = nh_types[nh->nh_type];
+		xo_emit("{t:type_str/%*s} ", wid_nhtype, cp);
+	}
+	/* Outgoing interface: "---" when the index has no recorded name. */
+	memset(iface_name, 0, sizeof(iface_name));
+	if (nh->ifindex < (uint32_t)ifmap_size) {
+		strlcpy(iface_name, ifmap[nh->ifindex].ifname,
+		    sizeof(iface_name));
+		if (*iface_name == '\0')
+			strlcpy(iface_name, "---", sizeof(iface_name));
+	}
+
+	sa_gw = (struct sockaddr *)(nh + 1);
+	sa_ifa = (struct sockaddr *)((char *)sa_gw + sa_gw->sa_len);
+	protrusion = p_sockaddr("ifa", sa_ifa, NULL, RTF_HOST, wid_dst);
+
+	if (nh->nh_flags & NHF_GATEWAY) {
+		const char *cp;
+		cp = fmt_sockaddr(sa_gw, NULL, RTF_HOST);
+		strlcpy(gw_addr, cp, sizeof(gw_addr));
+	} else
+		snprintf(gw_addr, sizeof(gw_addr), "%s/resolve", iface_name);
+	/* Each column gives back whatever the previous one overran. */
+	protrusion = print_addr("gateway", gw_addr, wid_dst - protrusion);
+
+	nhop_map_update(&global_nhop_map, nh->nh_idx, gw_addr, iface_name);
+
+	snprintf(buffer, sizeof(buffer), "{[:-%d}{:flags/%%s}{]:} ",
+	    wid_flags - protrusion);
+
+	print_flags_generic(rtm->rtm_flags, rt_bits, buffer, "rt_flags_pretty");
+
+	if (Wflag) {
+		xo_emit("{t:use/%*lu} ", wid_pksent, nh->nh_pksent);
+		xo_emit("{t:mtu/%*lu} ", wid_mtu, nh->nh_mtu);
+	}
+
+	if (Wflag)
+		xo_emit("{t:interface-name/%*s}", wid_if, iface_name);
+	else
+		xo_emit("{t:interface-name/%*.*s}", wid_if, wid_if, iface_name);
+
+	/* Address interface is shown only when it differs from ifindex. */
+	memset(iface_name, 0, sizeof(iface_name));
+	if (nh->aifindex < (uint32_t)ifmap_size && nh->ifindex != nh->aifindex) {
+		strlcpy(iface_name, ifmap[nh->aifindex].ifname,
+		    sizeof(iface_name));
+		if (*iface_name == '\0')
+			strlcpy(iface_name, "---", sizeof(iface_name));
+	}
+	if (Wflag)
+		xo_emit("{t:address-interface-name/%*s}", wid_if, iface_name);
+
+	xo_emit("{t:refcount/%*lu} ", wid_refcnt, nh->nh_refcount);
+	if (Wflag && nh->prepend_len) {
+		char *prepend_hex = "AABBCCDDEE";
+		xo_emit(" {:nhop-prepend/%*s}", wid_prepend, prepend_hex);
+	}
+
+	xo_emit("\n");
+	xo_close_instance(name);
+}
+
+/*
+ * Fetch the nexthop dump for (af, fibnum) through the
+ * CTL_NET/PF_ROUTE/NET_RT_NHOP sysctl and print every nexthop in it.
+ *
+ * The dump is walked as rt_msghdr-framed messages, each followed directly
+ * by a struct nhop_external; messages with a foreign rtm_version are
+ * skipped.  Whenever the nexthop family changes, the previous family
+ * section is closed, the column widths are reset and a new banner and
+ * header are printed.  Sysctl or allocation failures end the program.
+ */
+static void
+print_nhops_sysctl(int fibnum, int af)
+{
+	int mib[7] = { CTL_NET, PF_ROUTE, 0, af, NET_RT_NHOP, 0, fibnum };
+	struct nhop_external *nh;
+	struct rt_msghdr *rtm;
+	char *buf, *cur, *end;
+	size_t len;
+	int cur_fam = AF_UNSPEC;
+	int family_open = false;
+
+	/* Size probe first, then the real read into a heap buffer. */
+	if (sysctl(mib, nitems(mib), NULL, &len, NULL, 0) < 0)
+		err(EX_OSERR, "sysctl: net.route.0.%d.nhdump.%d estimate", af,
+		    fibnum);
+	buf = malloc(len);
+	if (buf == NULL)
+		errx(2, "malloc(%lu)", (unsigned long)len);
+	if (sysctl(mib, nitems(mib), buf, &len, NULL, 0) < 0)
+		err(1, "sysctl: net.route.0.%d.nhdump.%d", af, fibnum);
+
+	end = buf + len;
+	xo_open_container("nhop-table");
+	xo_open_list("rt-family");
+	for (cur = buf; cur < end; cur += rtm->rtm_msglen) {
+		rtm = (struct rt_msghdr *)cur;
+		if (rtm->rtm_version != RTM_VERSION)
+			continue;
+
+		nh = (struct nhop_external *)(rtm + 1);
+
+		/* Start a new family section on the first entry of a family. */
+		if (nh->nh_family != cur_fam) {
+			if (family_open) {
+				xo_close_list("nh-entry");
+				xo_close_instance("rt-family");
+			}
+			family_open = true;
+			cur_fam = nh->nh_family;
+
+			wid_dst = WID_GW_DEFAULT(cur_fam);
+			wid_gw = WID_GW_DEFAULT(cur_fam);
+			wid_nhidx = 5;
+			wid_nhtype = 12;
+			wid_refcnt = 6;
+			wid_flags = 6;
+			wid_pksent = 8;
+			wid_mtu = 6;
+			wid_if = WID_IF_DEFAULT(cur_fam);
+
+			xo_open_instance("rt-family");
+			pr_family(cur_fam);
+			xo_open_list("nh-entry");
+			print_nhop_header(cur_fam);
+		}
+		print_nhop_entry_sysctl("nh-entry", rtm, nh);
+	}
+	if (family_open) {
+		xo_close_list("nh-entry");
+		xo_close_instance("rt-family");
+	}
+	xo_close_list("rt-family");
+	xo_close_container("nhop-table");
+	free(buf);
+}
+
+/*
+ * Emit nexthop flags f: the compact letter string through the caller's
+ * libxo format, then the "nh_flags_pretty" list of long flag names.  This
+ * is the nh_bits counterpart of the route flag printer and shares its
+ * implementation, print_flags_generic().
+ *
+ * NOTE(review): the only reference to this helper in this file is in a
+ * commented-out line.
+ */
+static void
+p_nhflags(int f, const char *format)
+{
+
+	print_flags_generic(f, nh_bits, format, "nh_flags_pretty");
+}
+
+/*
+ * Print the nexthop table of one FIB.
+ *
+ * A fibnum of -1 selects the FIB reported by the net.my_fibnum sysctl
+ * (0 if that fails).  The number is validated against net.fibs (taken as
+ * 1 if unavailable) and an out-of-range value ends the program with
+ * EX_USAGE.  The interface name table is rebuilt before printing.
+ */
+void
+nhops_print(int fibnum, int af)
+{
+	size_t len = sizeof(int);
+	int numfibs;
+
+	if (fibnum == -1) {
+		if (sysctlbyname("net.my_fibnum", &fibnum, &len, NULL, 0) == -1)
+			fibnum = 0;
+	}
+	if (sysctlbyname("net.fibs", &numfibs, &len, NULL, 0) == -1)
+		numfibs = 1;
+	if (!(fibnum >= 0 && fibnum <= numfibs - 1))
+		errx(EX_USAGE, "%d: invalid fib", fibnum);
+
+	ifmap = prepare_ifmap(&ifmap_size);
+
+	xo_open_container("route-nhop-information");
+	xo_emit("{T:Nexthop data}");
+	/* The FIB number is shown only for non-default FIBs. */
+	if (fibnum != 0)
+		xo_emit(" ({L:fib}: {:fib/%d})", fibnum);
+	xo_emit("\n");
+	print_nhops_sysctl(fibnum, af);
+	xo_close_container("route-nhop-information");
+}
+
+/*
+ * Print the nexthop-group table of one FIB.
+ *
+ * A fibnum of -1 selects the FIB reported by the net.my_fibnum sysctl
+ * (0 if that fails).  The number is validated against net.fibs (taken as
+ * 1 if unavailable) and an out-of-range value ends the program with
+ * EX_USAGE.
+ *
+ * The interface name table is rebuilt here; a table left behind by an
+ * earlier call in this file is released first so it is not leaked.
+ */
+void
+nhgrp_print(int fibnum, int af)
+{
+	size_t intsize;
+	int numfibs;
+
+	intsize = sizeof(int);
+	if (fibnum == -1 &&
+	    sysctlbyname("net.my_fibnum", &fibnum, &intsize, NULL, 0) == -1)
+		fibnum = 0;
+	if (sysctlbyname("net.fibs", &numfibs, &intsize, NULL, 0) == -1)
+		numfibs = 1;
+	if (fibnum < 0 || fibnum > numfibs - 1)
+		errx(EX_USAGE, "%d: invalid fib", fibnum);
+
+	/* free(NULL) is a no-op, so this is safe on the first call too. */
+	free(ifmap);
+	ifmap = prepare_ifmap(&ifmap_size);
+
+	xo_open_container("route-nhgrp-information");
+	xo_emit("{T:Nexthop groups data}");
+	if (fibnum)
+		xo_emit(" ({L:fib}: {:fib/%d})", fibnum);
+	xo_emit("\n");
+	print_nhgrp_sysctl(fibnum, af);
+	xo_close_container("route-nhgrp-information");
+}
Index: usr.bin/netstat/route.c
===================================================================
--- usr.bin/netstat/route.c
+++ usr.bin/netstat/route.c
@@ -69,16 +69,13 @@
#include <err.h>
#include <libxo/xo.h>
#include "netstat.h"
+#include "common.h"
#include "nl_defs.h"
/*
* Definitions for showing gateway flags.
*/
-static struct bits {
- u_long b_mask;
- char b_val;
- const char *b_name;
-} bits[] = {
+struct bits rt_bits[] = {
{ RTF_UP, 'U', "up" },
{ RTF_GATEWAY, 'G', "gateway" },
{ RTF_HOST, 'H', "host" },
@@ -99,11 +96,8 @@
{ 0 , 0, NULL }
};
-struct ifmap_entry {
- char ifname[IFNAMSIZ];
-};
static struct ifmap_entry *ifmap;
-static int ifmap_size;
+static size_t ifmap_size;
static struct timespec uptime;
static const char *netname4(in_addr_t, in_addr_t);
@@ -112,12 +106,7 @@
#endif
static void p_rtable_sysctl(int, int);
static void p_rtentry_sysctl(const char *name, struct rt_msghdr *);
-static int p_sockaddr(const char *name, struct sockaddr *, struct sockaddr *,
- int, int);
-static const char *fmt_sockaddr(struct sockaddr *sa, struct sockaddr *mask,
- int flags);
static void p_flags(int, const char *);
-static const char *fmt_flags(int f);
static void domask(char *, size_t, u_long);
@@ -229,7 +218,7 @@
wid_dst, wid_dst, "Destination",
wid_gw, wid_gw, "Gateway",
wid_flags, wid_flags, "Flags",
- wid_pksent, wid_pksent, "Use",
+ wid_mtu, wid_mtu, "Nhop#",
wid_mtu, wid_mtu, "Mtu",
wid_if, wid_if, "Netif",
wid_expire, "Expire");
@@ -252,47 +241,11 @@
char *buf, *next, *lim;
struct rt_msghdr *rtm;
struct sockaddr *sa;
- int fam = AF_UNSPEC, ifindex = 0, size;
+ int fam = AF_UNSPEC;
int need_table_close = false;
- struct ifaddrs *ifap, *ifa;
- struct sockaddr_dl *sdl;
+ ifmap = prepare_ifmap(&ifmap_size);
- /*
- * Retrieve interface list at first
- * since we need #ifindex -> if_xname match
- */
- if (getifaddrs(&ifap) != 0)
- err(EX_OSERR, "getifaddrs");
-
- for (ifa = ifap; ifa; ifa = ifa->ifa_next) {
-
- if (ifa->ifa_addr->sa_family != AF_LINK)
- continue;
-
- sdl = (struct sockaddr_dl *)ifa->ifa_addr;
- ifindex = sdl->sdl_index;
-
- if (ifindex >= ifmap_size) {
- size = roundup(ifindex + 1, 32) *
- sizeof(struct ifmap_entry);
- if ((ifmap = realloc(ifmap, size)) == NULL)
- errx(2, "realloc(%d) failed", size);
- memset(&ifmap[ifmap_size], 0,
- size - ifmap_size *
- sizeof(struct ifmap_entry));
-
- ifmap_size = roundup(ifindex + 1, 32);
- }
-
- if (*ifmap[ifindex].ifname != '\0')
- continue;
-
- strlcpy(ifmap[ifindex].ifname, ifa->ifa_name, IFNAMSIZ);
- }
-
- freeifaddrs(ifap);
-
mib[0] = CTL_NET;
mib[1] = PF_ROUTE;
mib[2] = 0;
@@ -377,7 +330,8 @@
wid_flags - protrusion);
p_flags(rtm->rtm_flags, buffer);
if (Wflag) {
- xo_emit("{t:use/%*lu} ", wid_pksent, rtm->rtm_rmx.rmx_pksent);
+ /* XXX: use=0? */
+ xo_emit("{t:nhop/%*lu} ", wid_mtu, rtm->rtm_rmx.rmx_pksent);
if (rtm->rtm_rmx.rmx_mtu != 0)
xo_emit("{t:mtu/%*lu} ", wid_mtu, rtm->rtm_rmx.rmx_mtu);
@@ -410,7 +364,7 @@
xo_close_instance(name);
}
-static int
+int
p_sockaddr(const char *name, struct sockaddr *sa, struct sockaddr *mask,
int flags, int width)
{
@@ -442,7 +396,7 @@
return (protrusion);
}
-static const char *
+const char *
fmt_sockaddr(struct sockaddr *sa, struct sockaddr *mask, int flags)
{
static char buf[128];
@@ -519,30 +473,10 @@
static void
p_flags(int f, const char *format)
{
- struct bits *p;
- xo_emit(format, fmt_flags(f));
-
- xo_open_list("flags_pretty");
- for (p = bits; p->b_mask; p++)
- if (p->b_mask & f)
- xo_emit("{le:flags_pretty/%s}", p->b_name);
- xo_close_list("flags_pretty");
+ print_flags_generic(f, rt_bits, format, "flags_pretty");
}
-static const char *
-fmt_flags(int f)
-{
- static char name[33];
- char *flags;
- struct bits *p = bits;
-
- for (flags = name; p->b_mask; p++)
- if (p->b_mask & f)
- *flags++ = p->b_val;
- *flags = '\0';
- return (name);
-}
char *
routename(struct sockaddr *sa, int flags)

File Metadata

Mime Type
text/plain
Expires
Mon, Apr 28, 9:31 PM (7 h, 1 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
17833924
Default Alt Text
D24141.diff (441 KB)

Event Timeline