D27401.id80103.diff
Index: sys/conf/files
===================================================================
--- sys/conf/files
+++ sys/conf/files
@@ -4174,6 +4174,7 @@
net/route/nhop.c standard
net/route/nhop_ctl.c standard
net/route/nhop_utils.c standard
+net/route/route_algo.c optional route_algo
net/route/route_ctl.c standard
net/route/route_ddb.c optional ddb
net/route/route_helpers.c standard
@@ -4324,6 +4325,7 @@
netinet/in_kdtrace.c optional inet | inet6
netinet/ip_carp.c optional inet carp | inet6 carp
netinet/in_fib.c optional inet
+netinet/in_fib_algo.c optional inet route_algo
netinet/in_gif.c optional gif inet | netgraph_gif inet
netinet/ip_gre.c optional gre inet
netinet/ip_id.c optional inet
@@ -4400,6 +4402,7 @@
netinet6/in6.c optional inet6
netinet6/in6_cksum.c optional inet6
netinet6/in6_fib.c optional inet6
+netinet6/in6_fib_algo.c optional inet6 route_algo
netinet6/in6_gif.c optional gif inet6 | netgraph_gif inet6
netinet6/in6_ifattach.c optional inet6
netinet6/in6_jail.c optional inet6
Index: sys/conf/options
===================================================================
--- sys/conf/options
+++ sys/conf/options
@@ -454,6 +454,7 @@
PF_DEFAULT_TO_DROP opt_pf.h
RADIX_MPATH opt_mpath.h
ROUTE_MPATH opt_route.h
+ROUTE_ALGO opt_route.h
ROUTETABLES opt_route.h
RSS opt_rss.h
SLIP_IFF_OPTS opt_slip.h
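
Note: with the option declared above, a statically configured kernel would enable the framework through the usual options(4) mechanism; a minimal, illustrative config fragment (kernel name is hypothetical):

	include		GENERIC
	ident		ROUTEALGO
	options 	ROUTE_ALGO

followed by the standard config(8) / make buildkernel cycle; the DPDK lookup modules below are additionally hooked up via the sys/conf/files entries in this diff.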
Index: sys/contrib/dpdk_rte_lpm/dpdk_lpm.c
===================================================================
--- /dev/null
+++ sys/contrib/dpdk_rte_lpm/dpdk_lpm.c
@@ -0,0 +1,480 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <net/vnet.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+
+#include <netinet/in.h>
+#include <netinet/in_fib.h>
+#include <netinet/ip.h>
+
+#include <net/route.h>
+#include <net/route/nhop.h>
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <net/route/route_algo.h>
+#define RTDEBUG
+
+#include "rte_shim.h"
+#include "rte_lpm.h"
+
+#define LPM_MIN_TBL8 8 /* 2 pages of memory */
+#define LPM_MAX_TBL8	(65536 * 16)	/* 256M */
+
+struct fib_algo_calldata {
+ void *lookup;
+ void *arg;
+};
+
+struct dpdk_lpm_data {
+ struct rte_lpm *lpm;
+ uint32_t number_tbl8s;
+ uint64_t routes_added;
+ uint64_t routes_failed;
+ uint32_t fibnum;
+ uint8_t hit_tables;
+ uint8_t hit_records;
+ struct fib_algo_calldata fa;
+ struct fib_data *fd;
+};
+
+/*
+ * Main datapath routing
+ */
+static struct nhop_object *
+lookup_ptr(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid)
+{
+ struct rte_lpm *lpm;
+ const struct rte_lpm_external *rte_ext;
+ uint32_t nhidx = 0;
+ int ret;
+
+ lpm = (struct rte_lpm *)algo_data;
+ rte_ext = (const struct rte_lpm_external *)lpm;
+
+ ret = rte_lpm_lookup(lpm, key.addr4.s_addr, &nhidx);
+	if (ret == 0) {
+		/* Success: return the nexthop stored under the found index */
+		return (rte_ext->nh_idx[nhidx]);
+	}
+
+	/* Not found: fall back to the default-route slot */
+	return (rte_ext->nh_idx[rte_ext->default_idx]);
+}
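+/*
+ * lookup_ptr() above is published to the framework as the per-fib
+ * datapath function via struct fib_dp (filled in check_dump_success()
+ * below).  Conceptually, each lookup is then dispatched as:
+ *
+ *	struct flm_lookup_key key = { .addr4 = dst };
+ *	nh = dp->f(dp->arg, key, scopeid);
+ *
+ * where the dispatch site itself belongs to the route_algo framework,
+ * not this file.
+ */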
+
+static uint8_t
+rte_get_pref(const struct rib_rtable_info *rinfo)
+{
+
+ if (rinfo->num_prefixes < 10)
+ return (1);
+ else if (rinfo->num_prefixes < 1000)
+ return (rinfo->num_prefixes / 10);
+ else if (rinfo->num_prefixes < 500000)
+ return (100 + rinfo->num_prefixes / 3334);
+ else
+ return (250);
+}
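+/*
+ * Hand-computed samples of the resulting preference curve:
+ *
+ *	num_prefixes      5 ->   1
+ *	num_prefixes    500 ->  50	(500 / 10)
+ *	num_prefixes 350000 -> 204	(100 + 350000 / 3334)
+ *	num_prefixes 700000 -> 250	(capped)
+ *
+ * the intent being that the LPM wins the algorithm election only once
+ * the routing table is large enough for it to pay off.
+ */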
+
+static int
+contigmask(const uint8_t *p, int len)
+{
+ int i, n;
+
+	for (i = 0; i < len; i++)
+		if ((p[i / 8] & (1 << (7 - (i % 8)))) == 0) /* first bit unset */
+			break;
+	for (n = i + 1; n < len; n++)
+		if ((p[n / 8] & (1 << (7 - (n % 8)))) != 0)
+			return (-1); /* mask not contiguous */
+	return (i);
+}
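+/*
+ * contigmask() returns the prefix length of a contiguous mask and -1
+ * otherwise.  Hand-checked examples for an IPv4 mask (len = 32):
+ *
+ *	255.255.252.0 (ff ff fc 00): first clear bit is #22 and no bit
+ *	is set after it -> returns 22;
+ *	255.0.255.0 (ff 00 ff 00): bit #16 is set after clear bit #8
+ *	-> returns -1.
+ */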
+
+static uint8_t
+rt_get_plen(const struct rtentry *rt)
+{
+ const struct sockaddr *sa;
+ int family;
+ int plen;
+
+ sa = rt_key_const(rt);
+ family = sa->sa_family;
+ sa = rt_mask_const(rt);
+ switch (family) {
+ case AF_INET:
+ if (sa != NULL) {
+ const struct in_addr *addr4;
+ addr4 = &((const struct sockaddr_in *)sa)->sin_addr;
+ plen = contigmask((const uint8_t *)addr4, 32);
+ if (plen == -1)
+ plen = 0;
+ } else
+ plen = 32;
+ break;
+ case AF_INET6:
+ if (sa != NULL) {
+ const struct in6_addr *addr6;
+ addr6 = &((const struct sockaddr_in6 *)sa)->sin6_addr;
+ plen = contigmask((const uint8_t *)addr6, 128);
+ if (plen == -1)
+ plen = 0;
+ } else
+ plen = 128;
+ break;
+ default:
+ plen = 0;
+ }
+
+ return (plen);
+}
+
+static enum flm_op_result
+handle_default_change(struct dpdk_lpm_data *dd, struct rib_cmd_info *rc)
+{
+ struct rte_lpm_external *rte_ext;
+ rte_ext = (struct rte_lpm_external *)dd->lpm;
+ uint32_t old_nhidx = rte_ext->default_idx;
+
+ if (rc->rc_cmd != RTM_DELETE) {
+ /* Reference new */
+ uint32_t nhidx = fib_get_nhop_idx(dd->fd, rc->rc_nh_new);
+
+ if (nhidx == 0)
+ return (FLM_REBUILD);
+ rte_ext->default_idx = nhidx;
+ } else {
+ /* No default route */
+ rte_ext->default_idx = 0;
+ }
+
+ if (old_nhidx != 0)
+ fib_free_nhop_idx(dd->fd, old_nhidx);
+
+ return (FLM_SUCCESS);
+}
+
+static void
+get_parent_rule(struct dpdk_lpm_data *dd, struct in_addr addr, uint8_t *plen, uint32_t *nhop_idx)
+{
+ struct route_nhop_data rnd;
+ struct rtentry *rt;
+
+ rt = fib4_lookup_rt(dd->fibnum, addr, 0, NHR_UNLOCKED, &rnd);
+ if ((rt != NULL) && (*plen = rt_get_plen(rt)) > 0) {
+ *nhop_idx = fib_get_nhop_idx(dd->fd, rnd.rnd_nhop);
+ } else {
+ *nhop_idx = 0;
+ *plen = 0;
+ }
+}
+
+static enum flm_op_result
+handle_gu_change(struct dpdk_lpm_data *dd, const struct rib_cmd_info *rc,
+ const struct in_addr addr, int plen)
+{
+ uint32_t nhidx = 0;
+ int ret;
+ char abuf[INET_ADDRSTRLEN];
+ inet_ntop(AF_INET, &addr, abuf, sizeof(abuf));
+
+ /* So we get sin, plen and nhidx */
+ if (rc->rc_cmd != RTM_DELETE) {
+ /*
+ * Addition or change. Save nhop in the internal table
+ * and get index.
+ */
+ nhidx = fib_get_nhop_idx(dd->fd, rc->rc_nh_new);
+ if (nhidx == 0) {
+ DPRINTF("nhop limit reached, need rebuild");
+ return (FLM_REBUILD);
+ }
+
+ ret = rte_lpm_add(dd->lpm, addr.s_addr, plen, nhidx);
+ DPRINTF("DPDK GU: %s %s/%d nhop %u = %d", (rc->rc_cmd == RTM_ADD) ? "ADD" : "UPDATE",
+ abuf, plen, nhidx, ret);
+ } else {
+ /*
+ * Need to lookup parent. Assume deletion happened already
+ */
+ const struct sockaddr_in *dst;
+ dst = (const struct sockaddr_in *)rt_key_const(rc->rc_rt);
+
+ uint8_t parent_plen;
+ uint32_t parent_nhop_idx;
+ get_parent_rule(dd, dst->sin_addr, &parent_plen, &parent_nhop_idx);
+
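+		/*
+		 * This port's rte_lpm_delete() takes the covering
+		 * (less-specific) rule so that table entries under the
+		 * removed prefix can be repointed to it instead of cleared.
+		 */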
+ ret = rte_lpm_delete(dd->lpm, addr.s_addr, plen, parent_plen, parent_nhop_idx);
+ DPRINTF("DPDK: %s %s/%d nhop %u = %d", "DEL", abuf, plen, nhidx, ret);
+ }
+
+ if (rc->rc_nh_old != NULL)
+ fib_free_nhop(dd->fd, rc->rc_nh_old);
+
+ if (ret != 0) {
+ DPRINTF("error: %d", ret);
+ if (ret == -EOVERFLOW)
+ return (FLM_REBUILD);
+ return (FLM_ERROR);
+ }
+ return (FLM_SUCCESS);
+}
+
+static enum flm_op_result
+handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
+ void *_data)
+{
+ const struct sockaddr_in *sin;
+ int plen = rt_get_plen(rc->rc_rt);
+ enum flm_op_result ret;
+ struct dpdk_lpm_data *dd;
+
+ dd = (struct dpdk_lpm_data *)_data;
+ sin = (const struct sockaddr_in *)rt_key_const(rc->rc_rt);
+
+ if (plen != 0)
+ ret = handle_gu_change(dd, rc, sin->sin_addr, plen);
+ else
+ ret = handle_default_change(dd, rc);
+
+ if (ret != 0)
+ DPRINTF("error handling route");
+ return (ret);
+}
+
+static void
+destroy_table(void *_data)
+{
+ struct dpdk_lpm_data *dd = (struct dpdk_lpm_data *)_data;
+
+ DPRINTF("destroy dd %p", dd);
+ if (dd->lpm != NULL)
+ rte_lpm_free(dd->lpm);
+ free(dd, M_TEMP);
+}
+
+static enum flm_op_result
+add_route_cb(struct rtentry *rt, void *_data)
+{
+ struct dpdk_lpm_data *dd = (struct dpdk_lpm_data *)_data;
+ const struct sockaddr_in *sin;
+ int plen = rt_get_plen(rt);
+ int ret;
+
+ sin = (const struct sockaddr_in *)rt_key_const(rt);
+
+ char abuf[INET_ADDRSTRLEN];
+ char mbuf[INET_ADDRSTRLEN];
+ inet_ntop(AF_INET, &sin->sin_addr, abuf, sizeof(abuf));
+
+ const struct sockaddr_in *mask;
+ mask = (const struct sockaddr_in *)rt_mask_const(rt);
+ if (mask != NULL) {
+ inet_ntop(AF_INET, &mask->sin_addr, mbuf, sizeof(mbuf));
+ } else
+ mbuf[0] = '\0';
+
+ DPRINTF("Operating on %s/%d [%s]", abuf, plen, mbuf);
+
+ if (plen == 0) {
+ struct rib_cmd_info rc;
+
+ bzero(&rc, sizeof(rc));
+ rc.rc_cmd = RTM_ADD;
+ rc.rc_nh_new = rt->rt_nhop;
+ DPRINTF("Adding default route");
+ return (handle_default_change(dd, &rc));
+ }
+
+ uint32_t nhidx = fib_get_nhop_idx(dd->fd, rt->rt_nhop);
+ if (nhidx == 0) {
+ DPRINTF("unable to get nhop index");
+ return (FLM_REBUILD);
+ }
+ ret = rte_lpm_add(dd->lpm, sin->sin_addr.s_addr, plen, nhidx);
+ DPRINTF("ADD %p %s/%d nh %u = %d", dd->lpm, abuf, plen, nhidx, ret);
+
+ if (ret != 0) {
+ DPRINTF("rte_lpm_add() returned %d", ret);
+ if (ret == -ENOSPC) {
+ dd->hit_tables = 1;
+ return (FLM_REBUILD);
+ }
+ dd->routes_failed++;
+ } else
+ dd->routes_added++;
+
+ return (FLM_SUCCESS);
+}
+
+static enum flm_op_result
+check_dump_success(void *_data, struct fib_dp *dp)
+{
+ struct dpdk_lpm_data *dd;
+
+ dd = (struct dpdk_lpm_data *)_data;
+
+ DPRINTF("scan completed. added: %zu failed: %zu",
+ dd->routes_added, dd->routes_failed);
+ if (dd->hit_tables || dd->routes_failed > 0)
+ return (FLM_REBUILD);
+
+ DPRINTF("DPDK lookup engine synced with IPv4 RIB id %u, %zu routes",
+ dd->fibnum, dd->routes_added);
+
+ dp->f = lookup_ptr;
+	dp->arg = dd->lpm;
+
+ return (FLM_SUCCESS);
+}
+
+static void
+estimate_scale(const struct dpdk_lpm_data *dd_src, struct dpdk_lpm_data *dd)
+{
+
+ /* XXX: update at 75% capacity */
+ if (dd_src->hit_tables)
+ dd->number_tbl8s = dd_src->number_tbl8s * 2;
+ else
+ dd->number_tbl8s = dd_src->number_tbl8s;
+
+ /* TODO: look into the appropriate RIB to adjust */
+}
+
+static struct dpdk_lpm_data *
+build_table(struct dpdk_lpm_data *dd_prev)
+{
+ struct dpdk_lpm_data *dd;
+ struct rte_lpm *lpm;
+
+ dd = malloc(sizeof(struct dpdk_lpm_data), M_TEMP, M_NOWAIT | M_ZERO);
+ if (dd == NULL) {
+ DPRINTF("Unable to allocate base datastructure");
+ return (NULL);
+ }
+ dd->fibnum = dd_prev->fibnum;
+ dd->fd = dd_prev->fd;
+
+ estimate_scale(dd_prev, dd);
+
+ struct rte_lpm_config cfg = {.number_tbl8s = dd->number_tbl8s};
+ lpm = rte_lpm_create("test", 0, &cfg);
+ if (lpm == NULL) {
+ DPRINTF("unable to create lpm");
+ free(dd, M_TEMP);
+ return (NULL);
+ }
+ dd->lpm = lpm;
+ struct rte_lpm_external *ext = (struct rte_lpm_external *)lpm;
+ ext->nh_idx = fib_get_nhop_array(dd->fd);
+
+ DPRINTF("allocated %u tbl8s", dd->number_tbl8s);
+
+ return (dd);
+}
+
+static enum flm_op_result
+init_table(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **data)
+{
+ struct dpdk_lpm_data *dd, dd_base;
+
+ if (_old_data == NULL) {
+ bzero(&dd_base, sizeof(struct dpdk_lpm_data));
+ dd_base.fibnum = fibnum;
+ dd_base.fd = fd;
+ /* TODO: get rib statistics */
+ dd_base.number_tbl8s = LPM_MIN_TBL8;
+ dd = &dd_base;
+ } else {
+ DPRINTF("Starting with old data");
+ dd = (struct dpdk_lpm_data *)_old_data;
+ }
+
+ /* Guaranteed to be in epoch */
+ dd = build_table(dd);
+ if (dd == NULL) {
+ DPRINTF("table creation failed");
+ return (FLM_REBUILD);
+ }
+
+ *data = dd;
+ return (FLM_SUCCESS);
+}
+
+static struct fib_lookup_module dpdk_lpm4 = {
+ .flm_name = "dpdk_lpm4",
+ .flm_family = AF_INET,
+ .flm_init_cb = init_table,
+ .flm_destroy_cb = destroy_table,
+ .flm_dump_rib_item_cb = add_route_cb,
+ .flm_dump_end_cb = check_dump_success,
+ .flm_change_rib_item_cb = handle_rtable_change_cb,
+ .flm_get_pref = rte_get_pref,
+};
+
+static int
+lpm4_modevent(module_t mod, int type, void *unused)
+{
+ int error = 0;
+
+ switch (type) {
+ case MOD_LOAD:
+ fib_module_register(&dpdk_lpm4);
+ break;
+ case MOD_UNLOAD:
+ error = fib_module_unregister(&dpdk_lpm4);
+ break;
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+ return (error);
+}
+
+static moduledata_t lpm4mod = {
+ "dpdk_lpm4",
+ lpm4_modevent,
+ 0
+};
+
+DECLARE_MODULE(lpm4mod, lpm4mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
+MODULE_VERSION(lpm4mod, 1);
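
Note: the module registers itself with the lookup framework on MOD_LOAD via fib_module_register(). Assuming the accompanying module Makefile (not part of this excerpt) builds this file as dpdk_lpm4.ko, runtime usage would follow the usual pattern:

	# kldload dpdk_lpm4

after which the framework can elect the algorithm according to its rte_get_pref() preference.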
Index: sys/contrib/dpdk_rte_lpm/dpdk_lpm6.h
===================================================================
--- sys/contrib/dpdk_rte_lpm/dpdk_lpm6.h
+++ sys/contrib/dpdk_rte_lpm/dpdk_lpm6.h
@@ -1,6 +1,7 @@
/*-
- * Copyright (c) 2015
- * Alexander V. Chernikov <melifaro@FreeBSD.org>
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -10,14 +11,11 @@
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
*
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
@@ -29,17 +27,31 @@
* $FreeBSD$
*/
-#ifndef _NETINET6_IN6_FIB_H_
-#define _NETINET6_IN6_FIB_H_
+/*
+ * Kernel-facing definitions for the DPDK rte_lpm6 (IPv6 longest
+ * prefix match) library.
+ */
+
+#ifndef _NETINET6_DPDK_LPM6_H_
+#define _NETINET6_DPDK_LPM6_H_
+
+/** LPM structure. */
+struct rte_lpm6;
+
+/** LPM configuration structure. */
+struct rte_lpm6_config {
+ uint32_t max_rules; /**< Max number of rules. */
+ uint32_t number_tbl8s; /**< Number of tbl8s to allocate. */
+ int flags; /**< This field is currently unused. */
+};
+
+struct rte_lpm6 *
+rte_lpm6_create(const char *name, int socket_id,
+ const struct rte_lpm6_config *config);
+void
+rte_lpm6_free(struct rte_lpm6 *lpm);
+int
+rte_lpm6_add(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth,
+ uint32_t next_hop, int is_new_rule);
-struct nhop_object *fib6_lookup(uint32_t fibnum,
- const struct in6_addr *dst6, uint32_t scopeid, uint32_t flags,
- uint32_t flowid);
-int fib6_check_urpf(uint32_t fibnum, const struct in6_addr *dst6,
- uint32_t scopeid, uint32_t flags, const struct ifnet *src_if);
-struct nhop_object *fib6_lookup_debugnet(uint32_t fibnum,
- const struct in6_addr *dst6, uint32_t scopeid, uint32_t flags);
-uint32_t fib6_calc_software_hash(const struct in6_addr *src,
- const struct in6_addr *dst, unsigned short src_port, unsigned short dst_port,
- char proto, uint32_t *phashtype);
#endif
Index: sys/contrib/dpdk_rte_lpm/dpdk_lpm6.c
===================================================================
--- /dev/null
+++ sys/contrib/dpdk_rte_lpm/dpdk_lpm6.c
@@ -0,0 +1,560 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <net/vnet.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet6/ip6_var.h>
+#include <netinet6/in6_fib.h>
+
+#include <net/route.h>
+#include <net/route/nhop.h>
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <net/route/route_algo.h>
+#define RTDEBUG
+
+#include "rte_lpm6.h"
+
+#define LPM6_MIN_TBL8 8 /* 2 pages of memory */
+#define LPM6_MAX_TBL8	(65536 * 16)	/* 256M */
+
+struct fib_algo_calldata {
+ void *lookup;
+ void *arg;
+};
+
+struct dpdk_lpm6_data {
+ struct rte_lpm6 *lpm6;
+ uint32_t number_tbl8s;
+ uint64_t routes_added;
+ uint64_t routes_failed;
+ uint32_t fibnum;
+ uint8_t hit_tables;
+ struct fib_data *fd;
+};
+
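+/*
+ * Link-local destinations carry a scope id that the LPM key cannot
+ * express, so they are resolved with a classic radix lookup in the
+ * RIB, KAME-style scope embedding included.
+ */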
+static struct nhop_object *
+lookup_ptr_ll(const struct rte_lpm6 *lpm6, const struct in6_addr *dst6,
+ uint32_t scopeid)
+{
+ const struct rte_lpm6_external *rte_ext;
+ struct nhop_object *nh = NULL;
+ struct sockaddr_in6 sin6;
+ struct rib_head *rh;
+ struct radix_node *rn;
+ RIB_RLOCK_TRACKER;
+
+ memset(&sin6, 0, sizeof(sin6));
+ sin6.sin6_len = sizeof(struct sockaddr_in6);
+ sin6.sin6_addr = *dst6;
+ /* Assume scopeid is valid and embed it directly */
+ sin6.sin6_addr.s6_addr16[1] = htons(scopeid & 0xffff);
+
+ rte_ext = (const struct rte_lpm6_external *)lpm6;
+ rh = rt_tables_get_rnh(rte_ext->fibnum, AF_INET6);
+ if (rh == NULL)
+ return (NULL);
+
+ RIB_RLOCK(rh);
+ rn = rh->rnh_matchaddr((void *)&sin6, &rh->head);
+ if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0))
+ nh = RNTORT(rn)->rt_nhop;
+ RIB_RUNLOCK(rh);
+	/* TODO: check LL nhops refcounting */
+
+ return (nh);
+}
+
+/*
+ * Main datapath routing
+ */
+static struct nhop_object *
+lookup_ptr(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid)
+{
+ const struct rte_lpm6 *lpm6;
+ const struct rte_lpm6_external *rte_ext;
+ const struct in6_addr *addr6;
+ uint32_t nhidx = 0;
+ int ret;
+
+ lpm6 = (const struct rte_lpm6 *)algo_data;
+ addr6 = key.addr6;
+ rte_ext = (const struct rte_lpm6_external *)lpm6;
+
+ if (!IN6_IS_SCOPE_LINKLOCAL(addr6)) {
+ ret = rte_lpm6_lookup(lpm6, (const uint8_t *)addr6, &nhidx);
+ if (ret == 0) {
+ /* Success! */
+ return (rte_ext->nh_idx[nhidx]);
+ } else {
+ /* Not found. Check default route */
+ if (rte_ext->default_idx > 0)
+ return (rte_ext->nh_idx[rte_ext->default_idx]);
+ else
+ return (NULL);
+ }
+ } else {
+ /* LL */
+ return (lookup_ptr_ll(lpm6, addr6, scopeid));
+ }
+}
+
+static uint8_t
+rte6_get_pref(const struct rib_rtable_info *rinfo)
+{
+
+ if (rinfo->num_prefixes < 10)
+ return (1);
+ else if (rinfo->num_prefixes < 1000)
+ return (rinfo->num_prefixes / 10);
+ else if (rinfo->num_prefixes < 500000)
+ return (100 + rinfo->num_prefixes / 3334);
+ else
+ return (250);
+}
+
+static int
+contigmask(const uint8_t *p, int len)
+{
+ int i, n;
+
+	for (i = 0; i < len; i++)
+		if ((p[i / 8] & (1 << (7 - (i % 8)))) == 0) /* first bit unset */
+			break;
+	for (n = i + 1; n < len; n++)
+		if ((p[n / 8] & (1 << (7 - (n % 8)))) != 0)
+			return (-1); /* mask not contiguous */
+	return (i);
+}
+
+static uint8_t
+rt_get_plen(const struct rtentry *rt)
+{
+ const struct sockaddr *sa;
+ int family;
+ int plen;
+
+ sa = rt_key_const(rt);
+ family = sa->sa_family;
+ sa = rt_mask_const(rt);
+ switch (family) {
+ case AF_INET:
+ if (sa != NULL) {
+ const struct in_addr *addr4;
+ addr4 = &((const struct sockaddr_in *)sa)->sin_addr;
+ plen = contigmask((const uint8_t *)addr4, 32);
+ if (plen == -1)
+ plen = 0;
+ } else
+ plen = 32;
+ break;
+ case AF_INET6:
+ if (sa != NULL) {
+ const struct in6_addr *addr6;
+ addr6 = &((const struct sockaddr_in6 *)sa)->sin6_addr;
+ plen = contigmask((const uint8_t *)addr6, 128);
+ if (plen == -1)
+ plen = 0;
+ } else
+ plen = 128;
+ break;
+ default:
+ plen = 0;
+ }
+
+ return (plen);
+}
+
+static enum flm_op_result
+handle_default_change(struct dpdk_lpm6_data *dd, struct rib_cmd_info *rc)
+{
+ struct rte_lpm6_external *rte_ext;
+ rte_ext = (struct rte_lpm6_external *)dd->lpm6;
+ uint32_t old_nhidx = rte_ext->default_idx;
+
+ if (rc->rc_cmd != RTM_DELETE) {
+ /* Reference new */
+ uint32_t nhidx = fib_get_nhop_idx(dd->fd, rc->rc_nh_new);
+
+ if (nhidx == 0)
+ return (FLM_REBUILD);
+ rte_ext->default_idx = nhidx;
+ } else {
+ /* No default route */
+ rte_ext->default_idx = 0;
+ }
+
+ if (old_nhidx != 0)
+ fib_free_nhop_idx(dd->fd, old_nhidx);
+
+ return (FLM_SUCCESS);
+}
+
+static enum flm_op_result
+handle_ll_change(struct dpdk_lpm6_data *dd, struct rib_cmd_info *rc,
+ const struct sockaddr_in6 *sin6, int plen)
+{
+
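+	/*
+	 * Nothing to update: link-local prefixes are never inserted into
+	 * the LPM and are served from the RIB by lookup_ptr_ll().
+	 */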
+ return (FLM_SUCCESS);
+}
+
+static struct rte_lpm6_rule *
+pack_parent_rule(struct dpdk_lpm6_data *dd, const struct in6_addr *addr6,
+ char *buffer)
+{
+ struct rte_lpm6_rule *lsp_rule = NULL;
+ struct route_nhop_data rnd;
+ struct rtentry *rt;
+ int plen;
+
+ rt = fib6_lookup_rt(dd->fibnum, addr6, 0, NHR_UNLOCKED, &rnd);
+ /* plen = 0 means default route and it's out of scope */
+ if ((rt != NULL) && (plen = rt_get_plen(rt)) > 0) {
+ uint32_t nhidx = fib_get_nhop_idx(dd->fd, rnd.rnd_nhop);
+ if (nhidx == 0) {
+ /*
+ * shouldn't happen as we already have parent route.
+ * It will trigger rebuild automatically.
+ */
+ return (NULL);
+ }
+ const struct sockaddr_in6 *psin6;
+ const uint8_t *uaddr6;
+ psin6 = (const struct sockaddr_in6 *)rt_key_const(rt);
+ uaddr6 = (const uint8_t *)&psin6->sin6_addr;
+ lsp_rule = fill_rule6(buffer, uaddr6, plen, nhidx);
+ }
+
+ return (lsp_rule);
+}
+
+static enum flm_op_result
+handle_gu_change(struct dpdk_lpm6_data *dd, const struct rib_cmd_info *rc,
+ const struct in6_addr *addr6, int plen)
+{
+ uint32_t nhidx = 0;
+ int ret;
+ char abuf[INET6_ADDRSTRLEN];
+ inet_ntop(AF_INET6, addr6, abuf, sizeof(abuf));
+
+ /* So we get sin6, plen and nhidx */
+ if (rc->rc_cmd != RTM_DELETE) {
+ /*
+ * Addition or change. Save nhop in the internal table
+ * and get index.
+ */
+ nhidx = fib_get_nhop_idx(dd->fd, rc->rc_nh_new);
+ if (nhidx == 0) {
+ DPRINTF("FCK: nhop limit reached, need rebuild");
+ return (FLM_REBUILD);
+ }
+
+ ret = rte_lpm6_add(dd->lpm6, (const uint8_t *)addr6,
+ plen, nhidx, (rc->rc_cmd == RTM_ADD) ? 1 : 0);
+ DPRINTF("DPDK GU: %s %s/%d nhop %u = %d", (rc->rc_cmd == RTM_ADD) ? "ADD" : "UPDATE",
+ abuf, plen, nhidx, ret);
+ } else {
+ /*
+ * Need to lookup parent. Assume deletion happened already
+ */
+ char buffer[RTE_LPM6_RULE_SIZE];
+ struct rte_lpm6_rule *lsp_rule = NULL;
+ const struct sockaddr_in6 *dst6;
+ dst6 = (const struct sockaddr_in6 *)rt_key_const(rc->rc_rt);
+
+ lsp_rule = pack_parent_rule(dd, &dst6->sin6_addr, buffer);
+
+ ret = rte_lpm6_delete(dd->lpm6, (const uint8_t *)addr6, plen, lsp_rule);
+ DPRINTF("DPDK GU: %s %s/%d nhop %u = %d", "DEL", abuf, plen, nhidx, ret);
+ }
+
+ if (rc->rc_nh_old != NULL)
+ fib_free_nhop(dd->fd, rc->rc_nh_old);
+
+ if (ret != 0) {
+ DPRINTF("error: %d", ret);
+ if (ret == -EOVERFLOW)
+ return (FLM_REBUILD);
+ return (FLM_ERROR);
+ }
+ return (FLM_SUCCESS);
+}
+
+static enum flm_op_result
+handle_any_change(struct dpdk_lpm6_data *dd, struct rib_cmd_info *rc)
+{
+ const struct sockaddr_in6 *sin6;
+ int plen = rt_get_plen(rc->rc_rt);
+ enum flm_op_result ret;
+
+ sin6 = (const struct sockaddr_in6 *)rt_key_const(rc->rc_rt);
+
+ if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr))
+ ret = handle_ll_change(dd, rc, sin6, plen);
+ else if (plen == 0)
+ ret = handle_default_change(dd, rc);
+ else
+ ret = handle_gu_change(dd, rc, &sin6->sin6_addr, plen);
+
+ if (ret != 0)
+ DPRINTF("error handling route");
+ return (ret);
+}
+
+static enum flm_op_result
+handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
+ void *_data)
+{
+ struct dpdk_lpm6_data *dd;
+
+ dd = (struct dpdk_lpm6_data *)_data;
+
+ return (handle_any_change(dd, rc));
+}
+
+static void
+destroy_dd(struct dpdk_lpm6_data *dd)
+{
+
+ DPRINTF("destroy dd %p", dd);
+ if (dd->lpm6 != NULL)
+ rte_lpm6_free(dd->lpm6);
+ free(dd, M_TEMP);
+}
+
+static void
+destroy_table(void *_data)
+{
+
+ destroy_dd((struct dpdk_lpm6_data *)_data);
+}
+
+static enum flm_op_result
+add_route_cb(struct rtentry *rt, void *_data)
+{
+ struct dpdk_lpm6_data *dd = (struct dpdk_lpm6_data *)_data;
+ const struct sockaddr_in6 *sin6;
+ int plen = rt_get_plen(rt);
+ int ret;
+
+ sin6 = (const struct sockaddr_in6 *)rt_key_const(rt);
+
+ char abuf[INET6_ADDRSTRLEN];
+ char mbuf[INET6_ADDRSTRLEN];
+ inet_ntop(AF_INET6, &sin6->sin6_addr, abuf, sizeof(abuf));
+
+ const struct sockaddr_in6 *mask6;
+ mask6 = (const struct sockaddr_in6 *)rt_mask_const(rt);
+ if (mask6 != NULL) {
+		inet_ntop(AF_INET6, &mask6->sin6_addr, mbuf, sizeof(mbuf));
+ } else
+ mbuf[0] = '\0';
+
+ DPRINTF("Operating on %s/%d [%s]", abuf, plen, mbuf);
+
+ if (plen == 0) {
+ struct rib_cmd_info rc;
+
+ bzero(&rc, sizeof(rc));
+ rc.rc_cmd = RTM_ADD;
+ rc.rc_nh_new = rt->rt_nhop;
+ DPRINTF("Adding default route");
+ return (handle_default_change(dd, &rc));
+ }
+
+ uint32_t nhidx = fib_get_nhop_idx(dd->fd, rt->rt_nhop);
+ if (nhidx == 0) {
+ DPRINTF("unable to get nhop index");
+ return (FLM_REBUILD);
+ }
+ ret = rte_lpm6_add(dd->lpm6, (const uint8_t *)&sin6->sin6_addr, plen,
+ nhidx, 1);
+ DPRINTF("ADD %p %s/%d nh %u = %d", dd->lpm6, abuf, plen, nhidx, ret);
+
+ if (ret != 0) {
+ DPRINTF("rte_lpm6_add() returned %d", ret);
+ if (ret == -ENOSPC) {
+ dd->hit_tables = 1;
+ return (FLM_REBUILD);
+ }
+ dd->routes_failed++;
+ } else
+ dd->routes_added++;
+
+ return (FLM_SUCCESS);
+}
+
+static enum flm_op_result
+check_dump_success(void *_data, struct fib_dp *dp)
+{
+ struct dpdk_lpm6_data *dd;
+
+ dd = (struct dpdk_lpm6_data *)_data;
+
+ DPRINTF("scan completed. added: %zu failed: %zu",
+ dd->routes_added, dd->routes_failed);
+ if (dd->hit_tables || dd->routes_failed > 0)
+ return (FLM_REBUILD);
+
+ DPRINTF("DPDK lookup engine synced with IPv6 RIB id %u, %zu routes",
+ dd->fibnum, dd->routes_added);
+
+ dp->f = lookup_ptr;
+ dp->arg = dd->lpm6;
+
+ return (FLM_SUCCESS);
+}
+
+static void
+estimate_scale(const struct dpdk_lpm6_data *dd_src, struct dpdk_lpm6_data *dd)
+{
+
+ /* XXX: update at 75% capacity */
+ if (dd_src->hit_tables)
+ dd->number_tbl8s = dd_src->number_tbl8s * 2;
+ else
+ dd->number_tbl8s = dd_src->number_tbl8s;
+
+ /* TODO: look into the appropriate RIB to adjust */
+}
+
+static struct dpdk_lpm6_data *
+build_table(struct dpdk_lpm6_data *dd_prev)
+{
+ struct dpdk_lpm6_data *dd;
+ struct rte_lpm6 *lpm6;
+
+ dd = malloc(sizeof(struct dpdk_lpm6_data), M_TEMP, M_NOWAIT | M_ZERO);
+ if (dd == NULL) {
+ DPRINTF("Unable to allocate base datastructure");
+ return (NULL);
+ }
+ dd->fibnum = dd_prev->fibnum;
+ dd->fd = dd_prev->fd;
+
+ estimate_scale(dd_prev, dd);
+
+ struct rte_lpm6_config cfg = {.number_tbl8s = dd->number_tbl8s};
+ lpm6 = rte_lpm6_create("test", 0, &cfg);
+ if (lpm6 == NULL) {
+ DPRINTF("unable to create lpm6");
+ free(dd, M_TEMP);
+ return (NULL);
+ }
+ dd->lpm6 = lpm6;
+ struct rte_lpm6_external *ext = (struct rte_lpm6_external *)lpm6;
+ ext->nh_idx = fib_get_nhop_array(dd->fd);
+
+ DPRINTF("allocated %u tbl8s", dd->number_tbl8s);
+
+ return (dd);
+}
+
+static enum flm_op_result
+init_table(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **data)
+{
+ struct dpdk_lpm6_data *dd, dd_base;
+
+ if (_old_data == NULL) {
+ bzero(&dd_base, sizeof(struct dpdk_lpm6_data));
+ dd_base.fibnum = fibnum;
+ dd_base.fd = fd;
+ /* TODO: get rib statistics */
+ dd_base.number_tbl8s = LPM6_MIN_TBL8;
+ dd = &dd_base;
+ } else {
+ DPRINTF("Starting with old data");
+ dd = (struct dpdk_lpm6_data *)_old_data;
+ }
+
+ /* Guaranteed to be in epoch */
+ dd = build_table(dd);
+ if (dd == NULL) {
+ DPRINTF("table creation failed");
+ return (FLM_REBUILD);
+ }
+
+ *data = dd;
+ return (FLM_SUCCESS);
+}
+
+static struct fib_lookup_module dpdk_lpm6 = {
+ .flm_name = "dpdk_lpm6",
+ .flm_family = AF_INET6,
+ .flm_init_cb = init_table,
+ .flm_destroy_cb = destroy_table,
+ .flm_dump_rib_item_cb = add_route_cb,
+ .flm_dump_end_cb = check_dump_success,
+ .flm_change_rib_item_cb = handle_rtable_change_cb,
+ .flm_get_pref = rte6_get_pref,
+};
+
+static int
+lpm6_modevent(module_t mod, int type, void *unused)
+{
+ int error = 0;
+
+ switch (type) {
+ case MOD_LOAD:
+ fib_module_register(&dpdk_lpm6);
+ break;
+ case MOD_UNLOAD:
+ error = fib_module_unregister(&dpdk_lpm6);
+ break;
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+ return (error);
+}
+
+static moduledata_t lpm6mod = {
+ "dpdk_lpm6",
+ lpm6_modevent,
+ 0
+};
+
+DECLARE_MODULE(lpm6mod, lpm6mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
+MODULE_VERSION(lpm6mod, 1);
Index: sys/contrib/dpdk_rte_lpm/rte_branch_prediction.h
===================================================================
--- /dev/null
+++ sys/contrib/dpdk_rte_lpm/rte_branch_prediction.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+/**
+ * @file
+ * Branch Prediction Helpers in RTE
+ */
+
+#ifndef _RTE_BRANCH_PREDICTION_H_
+#define _RTE_BRANCH_PREDICTION_H_
+
+/**
+ * Check if a branch is likely to be taken.
+ *
+ * This compiler builtin allows the developer to indicate if a branch is
+ * likely to be taken. Example:
+ *
+ * if (likely(x > 1))
+ * do_stuff();
+ *
+ */
+#ifndef likely
+#define likely(x) __builtin_expect(!!(x), 1)
+#endif /* likely */
+
+/**
+ * Check if a branch is unlikely to be taken.
+ *
+ * This compiler builtin allows the developer to indicate if a branch is
+ * unlikely to be taken. Example:
+ *
+ * if (unlikely(x < 1))
+ * do_stuff();
+ *
+ */
+#ifndef unlikely
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#endif /* unlikely */
+
+#endif /* _RTE_BRANCH_PREDICTION_H_ */
Index: sys/contrib/dpdk_rte_lpm/rte_common.h
===================================================================
--- /dev/null
+++ sys/contrib/dpdk_rte_lpm/rte_common.h
@@ -0,0 +1,838 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2019 Intel Corporation
+ */
+
+#ifndef _RTE_COMMON_H_
+#define _RTE_COMMON_H_
+
+/**
+ * @file
+ *
+ * Generic, commonly-used macro and inline function definitions
+ * for DPDK.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#include <rte_config.h>
+
+/* OS specific include */
+//#include <rte_os.h>
+
+#ifndef typeof
+#define typeof __typeof__
+#endif
+
+#ifndef asm
+#define asm __asm__
+#endif
+
+/** C extension macro for environments lacking C11 features. */
+#if !defined(__STDC_VERSION__) || __STDC_VERSION__ < 201112L
+#define RTE_STD_C11 __extension__
+#else
+#define RTE_STD_C11
+#endif
+
+/*
+ * RTE_TOOLCHAIN_GCC is defined if the target is built with GCC,
+ * while a host application (like pmdinfogen) may have another compiler.
+ * RTE_CC_IS_GNU is true if the file is compiled with GCC,
+ * no matter whether it is a target or host application.
+ */
+#define RTE_CC_IS_GNU 0
+#if defined __clang__
+#define RTE_CC_CLANG
+#elif defined __INTEL_COMPILER
+#define RTE_CC_ICC
+#elif defined __GNUC__
+#define RTE_CC_GCC
+#undef RTE_CC_IS_GNU
+#define RTE_CC_IS_GNU 1
+#endif
+#if RTE_CC_IS_GNU
+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + \
+ __GNUC_PATCHLEVEL__)
+#endif
+
+/**
+ * Force alignment
+ */
+#define __rte_aligned(a) __attribute__((__aligned__(a)))
+
+#ifdef RTE_ARCH_STRICT_ALIGN
+typedef uint64_t unaligned_uint64_t __rte_aligned(1);
+typedef uint32_t unaligned_uint32_t __rte_aligned(1);
+typedef uint16_t unaligned_uint16_t __rte_aligned(1);
+#else
+typedef uint64_t unaligned_uint64_t;
+typedef uint32_t unaligned_uint32_t;
+typedef uint16_t unaligned_uint16_t;
+#endif
+
+/**
+ * Force a structure to be packed
+ */
+#define __rte_packed __attribute__((__packed__))
+
+/******* Macro to mark functions and fields scheduled for removal *****/
+#define __rte_deprecated __attribute__((__deprecated__))
+
+/**
+ * Mark a function or variable to a weak reference.
+ */
+#define __rte_weak __attribute__((__weak__))
+
+/**
+ * Force symbol to be generated even if it appears to be unused.
+ */
+#define __rte_used __attribute__((used))
+
+/*********** Macros to eliminate unused variable warnings ********/
+
+/**
+ * short definition to mark a function parameter unused
+ */
+#define __rte_unused __attribute__((__unused__))
+
+/**
+ * definition to mark a variable or function parameter as used so
+ * as to avoid a compiler warning
+ */
+#define RTE_SET_USED(x) (void)(x)
+
+/**
+ * Check format string and its arguments at compile-time.
+ *
+ * GCC on Windows assumes MS-specific format string by default,
+ * even if the underlying stdio implementation is ANSI-compliant,
+ * so this must be overridden.
+ */
+#if RTE_CC_IS_GNU
+#define __rte_format_printf(format_index, first_arg) \
+ __attribute__((format(gnu_printf, format_index, first_arg)))
+#else
+#define __rte_format_printf(format_index, first_arg) \
+ __attribute__((format(printf, format_index, first_arg)))
+#endif
+
+#define RTE_PRIORITY_LOG 101
+#define RTE_PRIORITY_BUS 110
+#define RTE_PRIORITY_CLASS 120
+#define RTE_PRIORITY_LAST 65535
+
+#define RTE_PRIO(prio) \
+ RTE_PRIORITY_ ## prio
+
+/**
+ * Run function before main() with high priority.
+ *
+ * @param func
+ * Constructor function.
+ * @param prio
+ * Priority number must be above 100.
+ * Lowest number is the first to run.
+ */
+#ifndef RTE_INIT_PRIO /* Allow to override from EAL */
+#define RTE_INIT_PRIO(func, prio) \
+static void __attribute__((constructor(RTE_PRIO(prio)), used)) func(void)
+#endif
+
+/**
+ * Run function before main() with low priority.
+ *
+ * The constructor will be run after prioritized constructors.
+ *
+ * @param func
+ * Constructor function.
+ */
+#define RTE_INIT(func) \
+ RTE_INIT_PRIO(func, LAST)
+
+/**
+ * Run after main() with low priority.
+ *
+ * @param func
+ * Destructor function name.
+ * @param prio
+ * Priority number must be above 100.
+ * Lowest number is the last to run.
+ */
+#ifndef RTE_FINI_PRIO /* Allow to override from EAL */
+#define RTE_FINI_PRIO(func, prio) \
+static void __attribute__((destructor(RTE_PRIO(prio)), used)) func(void)
+#endif
+
+/**
+ * Run after main() with high priority.
+ *
+ * The destructor will be run *before* prioritized destructors.
+ *
+ * @param func
+ * Destructor function name.
+ */
+#define RTE_FINI(func) \
+ RTE_FINI_PRIO(func, LAST)
+
+/**
+ * Hint never returning function
+ */
+#define __rte_noreturn __attribute__((noreturn))
+
+/**
+ * Force a function to be inlined
+ */
+#define __rte_always_inline inline __attribute__((always_inline))
+
+/**
+ * Force a function to be noinlined
+ */
+#define __rte_noinline __attribute__((noinline))
+
+/**
+ * Hint function in the hot path
+ */
+#define __rte_hot __attribute__((hot))
+
+/**
+ * Hint function in the cold path
+ */
+#define __rte_cold __attribute__((cold))
+
+/*********** Macros for pointer arithmetic ********/
+
+/**
+ * add a byte-value offset to a pointer
+ */
+#define RTE_PTR_ADD(ptr, x) ((void*)((uintptr_t)(ptr) + (x)))
+
+/**
+ * subtract a byte-value offset from a pointer
+ */
+#define RTE_PTR_SUB(ptr, x) ((void*)((uintptr_t)ptr - (x)))
+
+/**
+ * get the difference between two pointer values, i.e. how far apart
+ * in bytes are the locations they point to. It is assumed that
+ * ptr1 is greater than ptr2.
+ */
+#define RTE_PTR_DIFF(ptr1, ptr2) ((uintptr_t)(ptr1) - (uintptr_t)(ptr2))
+
+/**
+ * Workaround to cast a const field of a structure to non-const type.
+ */
+#define RTE_CAST_FIELD(var, field, type) \
+ (*(type *)((uintptr_t)(var) + offsetof(typeof(*(var)), field)))
+
+/*********** Macros/static functions for doing alignment ********/
+
+
+/**
+ * Macro to align a pointer to a given power-of-two. The resultant
+ * pointer will be a pointer of the same type as the first parameter, and
+ * point to an address no higher than the first parameter. Second parameter
+ * must be a power-of-two value.
+ */
+#define RTE_PTR_ALIGN_FLOOR(ptr, align) \
+ ((typeof(ptr))RTE_ALIGN_FLOOR((uintptr_t)ptr, align))
+
+/**
+ * Macro to align a value to a given power-of-two. The resultant value
+ * will be of the same type as the first parameter, and will be no
+ * bigger than the first parameter. Second parameter must be a
+ * power-of-two value.
+ */
+#define RTE_ALIGN_FLOOR(val, align) \
+ (typeof(val))((val) & (~((typeof(val))((align) - 1))))
+
+/**
+ * Macro to align a pointer to a given power-of-two. The resultant
+ * pointer will be a pointer of the same type as the first parameter, and
+ * point to an address no lower than the first parameter. Second parameter
+ * must be a power-of-two value.
+ */
+#define RTE_PTR_ALIGN_CEIL(ptr, align) \
+ RTE_PTR_ALIGN_FLOOR((typeof(ptr))RTE_PTR_ADD(ptr, (align) - 1), align)
+
+/**
+ * Macro to align a value to a given power-of-two. The resultant value
+ * will be of the same type as the first parameter, and will be no lower
+ * than the first parameter. Second parameter must be a power-of-two
+ * value.
+ */
+#define RTE_ALIGN_CEIL(val, align) \
+ RTE_ALIGN_FLOOR(((val) + ((typeof(val)) (align) - 1)), align)
+
+/**
+ * Macro to align a pointer to a given power-of-two. The resultant
+ * pointer will be a pointer of the same type as the first parameter, and
+ * point to an address no lower than the first parameter. Second parameter
+ * must be a power-of-two value.
+ * This function is the same as RTE_PTR_ALIGN_CEIL
+ */
+#define RTE_PTR_ALIGN(ptr, align) RTE_PTR_ALIGN_CEIL(ptr, align)
+
+/**
+ * Macro to align a value to a given power-of-two. The resultant
+ * value will be of the same type as the first parameter, and
+ * will be no lower than the first parameter. Second parameter
+ * must be a power-of-two value.
+ * This function is the same as RTE_ALIGN_CEIL
+ */
+#define RTE_ALIGN(val, align) RTE_ALIGN_CEIL(val, align)
+
+/**
+ * Macro to align a value to the multiple of given value. The resultant
+ * value will be of the same type as the first parameter and will be no lower
+ * than the first parameter.
+ */
+#define RTE_ALIGN_MUL_CEIL(v, mul) \
+ (((v + (typeof(v))(mul) - 1) / ((typeof(v))(mul))) * (typeof(v))(mul))
+
+/**
+ * Macro to align a value to the multiple of given value. The resultant
+ * value will be of the same type as the first parameter and will be no higher
+ * than the first parameter.
+ */
+#define RTE_ALIGN_MUL_FLOOR(v, mul) \
+ ((v / ((typeof(v))(mul))) * (typeof(v))(mul))
+
+/**
+ * Macro to align value to the nearest multiple of the given value.
+ * The resultant value might be greater than or less than the first parameter
+ * whichever difference is the lowest.
+ */
+#define RTE_ALIGN_MUL_NEAR(v, mul) \
+ ({ \
+ typeof(v) ceil = RTE_ALIGN_MUL_CEIL(v, mul); \
+ typeof(v) floor = RTE_ALIGN_MUL_FLOOR(v, mul); \
+ (ceil - v) > (v - floor) ? floor : ceil; \
+ })
+
+/**
+ * Checks if a pointer is aligned to a given power-of-two value
+ *
+ * @param ptr
+ * The pointer whose alignment is to be checked
+ * @param align
+ * The power-of-two value to which the ptr should be aligned
+ *
+ * @return
+ * True(1) where the pointer is correctly aligned, false(0) otherwise
+ */
+static inline int
+rte_is_aligned(void *ptr, unsigned align)
+{
+ return RTE_PTR_ALIGN(ptr, align) == ptr;
+}
+
+/*********** Macros for compile type checks ********/
+
+/**
+ * Triggers an error at compilation time if the condition is true.
+ */
+#define RTE_BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
+
+/*********** Cache line related macros ********/
+
+/** Cache line mask. */
+#define RTE_CACHE_LINE_MASK (RTE_CACHE_LINE_SIZE-1)
+
+/** Return the first cache-aligned value greater or equal to size. */
+#define RTE_CACHE_LINE_ROUNDUP(size) \
+ (RTE_CACHE_LINE_SIZE * ((size + RTE_CACHE_LINE_SIZE - 1) / \
+ RTE_CACHE_LINE_SIZE))
+
+/** Cache line size in terms of log2 */
+#if RTE_CACHE_LINE_SIZE == 64
+#define RTE_CACHE_LINE_SIZE_LOG2 6
+#elif RTE_CACHE_LINE_SIZE == 128
+#define RTE_CACHE_LINE_SIZE_LOG2 7
+#else
+#error "Unsupported cache line size"
+#endif
+
+/** Minimum Cache line size. */
+#define RTE_CACHE_LINE_MIN_SIZE 64
+
+/** Force alignment to cache line. */
+#define __rte_cache_aligned __rte_aligned(RTE_CACHE_LINE_SIZE)
+
+/** Force minimum cache line alignment. */
+#define __rte_cache_min_aligned __rte_aligned(RTE_CACHE_LINE_MIN_SIZE)
+
+/*********** PA/IOVA type definitions ********/
+
+/** Physical address */
+typedef uint64_t phys_addr_t;
+#define RTE_BAD_PHYS_ADDR ((phys_addr_t)-1)
+
+/**
+ * IO virtual address type.
+ * When the physical addressing mode (IOVA as PA) is in use,
+ * the translation from an IO virtual address (IOVA) to a physical address
+ * is a direct mapping, i.e. the same value.
+ * Otherwise, in virtual mode (IOVA as VA), an IOMMU may do the translation.
+ */
+typedef uint64_t rte_iova_t;
+#define RTE_BAD_IOVA ((rte_iova_t)-1)
+
+/*********** Structure alignment markers ********/
+
+/** Generic marker for any place in a structure. */
+__extension__ typedef void *RTE_MARKER[0];
+/** Marker for 1B alignment in a structure. */
+__extension__ typedef uint8_t RTE_MARKER8[0];
+/** Marker for 2B alignment in a structure. */
+__extension__ typedef uint16_t RTE_MARKER16[0];
+/** Marker for 4B alignment in a structure. */
+__extension__ typedef uint32_t RTE_MARKER32[0];
+/** Marker for 8B alignment in a structure. */
+__extension__ typedef uint64_t RTE_MARKER64[0];
+
+/**
+ * Propagates the most significant set bit of a 32b input into all less
+ * significant bits, constructing a value with the same MSB as x
+ * but all 1's under it.
+ *
+ * @param x
+ * The integer whose MSBs need to be combined with its LSBs
+ * @return
+ * The combined value.
+ */
+static inline uint32_t
+rte_combine32ms1b(register uint32_t x)
+{
+ x |= x >> 1;
+ x |= x >> 2;
+ x |= x >> 4;
+ x |= x >> 8;
+ x |= x >> 16;
+
+ return x;
+}
+
+/**
+ * Propagates the most significant set bit of a 64b input into all less
+ * significant bits, constructing a value with the same MSB as v
+ * but all 1's under it.
+ *
+ * @param v
+ * The integer whose MSBs need to be combined with its LSBs
+ * @return
+ * The combined value.
+ */
+static inline uint64_t
+rte_combine64ms1b(register uint64_t v)
+{
+ v |= v >> 1;
+ v |= v >> 2;
+ v |= v >> 4;
+ v |= v >> 8;
+ v |= v >> 16;
+ v |= v >> 32;
+
+ return v;
+}
+
+/*********** Macros to work with powers of 2 ********/
+
+/**
+ * Macro to return 1 if n is a power of 2, 0 otherwise
+ */
+#define RTE_IS_POWER_OF_2(n) ((n) && !(((n) - 1) & (n)))
+
+/**
+ * Returns true if n is a power of 2
+ * @param n
+ * Number to check
+ * @return 1 if true, 0 otherwise
+ */
+static inline int
+rte_is_power_of_2(uint32_t n)
+{
+ return n && !(n & (n - 1));
+}
+
+/**
+ * Aligns input parameter to the next power of 2
+ *
+ * @param x
+ * The integer value to align
+ *
+ * @return
+ * Input parameter aligned to the next power of 2
+ */
+static inline uint32_t
+rte_align32pow2(uint32_t x)
+{
+ x--;
+ x = rte_combine32ms1b(x);
+
+ return x + 1;
+}
+
+/**
+ * Aligns input parameter to the previous power of 2
+ *
+ * @param x
+ * The integer value to align
+ *
+ * @return
+ * Input parameter aligned to the previous power of 2
+ */
+static inline uint32_t
+rte_align32prevpow2(uint32_t x)
+{
+ x = rte_combine32ms1b(x);
+
+ return x - (x >> 1);
+}
+
+/**
+ * Aligns 64b input parameter to the next power of 2
+ *
+ * @param v
+ * The 64b value to align
+ *
+ * @return
+ * Input parameter aligned to the next power of 2
+ */
+static inline uint64_t
+rte_align64pow2(uint64_t v)
+{
+ v--;
+ v = rte_combine64ms1b(v);
+
+ return v + 1;
+}
+
+/**
+ * Aligns 64b input parameter to the previous power of 2
+ *
+ * @param v
+ * The 64b value to align
+ *
+ * @return
+ * Input parameter aligned to the previous power of 2
+ */
+static inline uint64_t
+rte_align64prevpow2(uint64_t v)
+{
+ v = rte_combine64ms1b(v);
+
+ return v - (v >> 1);
+}
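+/*
+ * Hand-checked samples for the helpers above:
+ *
+ *	rte_combine32ms1b(0x00012345)   == 0x0001ffff (1's below the MSB)
+ *	rte_align32pow2(0x00012345)     == 0x00020000 (rounded up)
+ *	rte_align32pow2(64)             == 64         (already a power of 2)
+ *	rte_align32prevpow2(0x00012345) == 0x00010000 (rounded down)
+ */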
+
+/*********** Macros for calculating min and max **********/
+
+/**
+ * Macro to return the minimum of two numbers
+ */
+#define RTE_MIN(a, b) \
+ __extension__ ({ \
+ typeof (a) _a = (a); \
+ typeof (b) _b = (b); \
+ _a < _b ? _a : _b; \
+ })
+
+/**
+ * Macro to return the maximum of two numbers
+ */
+#define RTE_MAX(a, b) \
+ __extension__ ({ \
+ typeof (a) _a = (a); \
+ typeof (b) _b = (b); \
+ _a > _b ? _a : _b; \
+ })
+
+/*********** Other general functions / macros ********/
+
+/**
+ * Searches the input parameter for the least significant set bit
+ * (starting from zero).
+ * If a least significant 1 bit is found, its bit index is returned.
+ * If the content of the input parameter is zero, then the content of the return
+ * value is undefined.
+ * @param v
+ * input parameter, should not be zero.
+ * @return
+ * least significant set bit in the input parameter.
+ */
+static inline uint32_t
+rte_bsf32(uint32_t v)
+{
+ return (uint32_t)__builtin_ctz(v);
+}
+
+/**
+ * Searches the input parameter for the least significant set bit
+ * (starting from zero). Safe version (checks for input parameter being zero).
+ *
+ * @warning ``pos`` must be a valid pointer. It is not checked!
+ *
+ * @param v
+ * The input parameter.
+ * @param pos
+ * If ``v`` was not 0, this value will contain position of least significant
+ * bit within the input parameter.
+ * @return
+ * Returns 0 if ``v`` was 0, otherwise returns 1.
+ */
+static inline int
+rte_bsf32_safe(uint32_t v, uint32_t *pos)
+{
+ if (v == 0)
+ return 0;
+
+ *pos = rte_bsf32(v);
+ return 1;
+}
+
+/**
+ * Return the rounded-up log2 of a 32-bit integer.
+ *
+ * @note Contrary to the logarithm mathematical operation,
+ * rte_log2_u32(0) == 0 and not -inf.
+ *
+ * @param v
+ * The input parameter.
+ * @return
+ * The rounded-up log2 of the input, or 0 if the input is 0.
+ */
+static inline uint32_t
+rte_log2_u32(uint32_t v)
+{
+ if (v == 0)
+ return 0;
+ v = rte_align32pow2(v);
+ return rte_bsf32(v);
+}
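+/*
+ * e.g. rte_log2_u32(100) == 7: 100 is first rounded up to 128 == 1 << 7.
+ * rte_log2_u32(1) == 0 and, by the convention above, rte_log2_u32(0) == 0.
+ */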
+
+
+/**
+ * Return the last (most-significant) bit set.
+ *
+ * @note The last (most significant) bit is at position 32.
+ * @note rte_fls_u32(0) = 0, rte_fls_u32(1) = 1, rte_fls_u32(0x80000000) = 32
+ *
+ * @param x
+ * The input parameter.
+ * @return
+ * The last (most-significant) bit set, or 0 if the input is 0.
+ */
+static inline int
+rte_fls_u32(uint32_t x)
+{
+ return (x == 0) ? 0 : 32 - __builtin_clz(x);
+}
+
+/**
+ * Searches the input parameter for the least significant set bit
+ * (starting from zero).
+ * If a least significant 1 bit is found, its bit index is returned.
+ * If the content of the input parameter is zero, then the content of the return
+ * value is undefined.
+ * @param v
+ * input parameter, should not be zero.
+ * @return
+ * least significant set bit in the input parameter.
+ */
+static inline int
+rte_bsf64(uint64_t v)
+{
+ return (uint32_t)__builtin_ctzll(v);
+}
+
+/**
+ * Searches the input parameter for the least significant set bit
+ * (starting from zero). Safe version (checks for input parameter being zero).
+ *
+ * @warning ``pos`` must be a valid pointer. It is not checked!
+ *
+ * @param v
+ * The input parameter.
+ * @param pos
+ * If ``v`` was not 0, this value will contain position of least significant
+ * bit within the input parameter.
+ * @return
+ * Returns 0 if ``v`` was 0, otherwise returns 1.
+ */
+static inline int
+rte_bsf64_safe(uint64_t v, uint32_t *pos)
+{
+ if (v == 0)
+ return 0;
+
+ *pos = rte_bsf64(v);
+ return 1;
+}
+
+/**
+ * Return the last (most-significant) bit set.
+ *
+ * @note The last (most significant) bit is at position 64.
+ * @note rte_fls_u64(0) = 0, rte_fls_u64(1) = 1,
+ * rte_fls_u64(0x8000000000000000) = 64
+ *
+ * @param x
+ * The input parameter.
+ * @return
+ * The last (most-significant) bit set, or 0 if the input is 0.
+ */
+static inline int
+rte_fls_u64(uint64_t x)
+{
+ return (x == 0) ? 0 : 64 - __builtin_clzll(x);
+}
+
+/**
+ * Return the rounded-up log2 of a 64-bit integer.
+ *
+ * @note Contrary to the logarithm mathematical operation,
+ * rte_log2_u64(0) == 0 and not -inf.
+ *
+ * @param v
+ * The input parameter.
+ * @return
+ * The rounded-up log2 of the input, or 0 if the input is 0.
+ */
+static inline uint32_t
+rte_log2_u64(uint64_t v)
+{
+ if (v == 0)
+ return 0;
+ v = rte_align64pow2(v);
+ /* we checked for v being 0 already, so no undefined behavior */
+ return rte_bsf64(v);
+}
+
+#ifndef offsetof
+/** Return the offset of a field in a structure. */
+#define offsetof(TYPE, MEMBER) __builtin_offsetof (TYPE, MEMBER)
+#endif
+
+/**
+ * Return pointer to the wrapping struct instance.
+ *
+ * Example:
+ *
+ * struct wrapper {
+ * ...
+ * struct child c;
+ * ...
+ * };
+ *
+ * struct child *x = obtain(...);
+ * struct wrapper *w = container_of(x, struct wrapper, c);
+ */
+#ifndef container_of
+#define container_of(ptr, type, member) __extension__ ({ \
+ const typeof(((type *)0)->member) *_ptr = (ptr); \
+ __rte_unused type *_target_ptr = \
+ (type *)(ptr); \
+ (type *)(((uintptr_t)_ptr) - offsetof(type, member)); \
+ })
+#endif
+
+/**
+ * Get the size of a field in a structure.
+ *
+ * @param type
+ * The type of the structure.
+ * @param field
+ * The field in the structure.
+ * @return
+ * The size of the field in the structure, in bytes.
+ */
+#define RTE_SIZEOF_FIELD(type, field) (sizeof(((type *)0)->field))
+
+#define _RTE_STR(x) #x
+/** Take a macro value and get a string version of it */
+#define RTE_STR(x) _RTE_STR(x)
+
+/**
+ * ISO C helpers to modify format strings using variadic macros.
+ * This is a replacement for the ", ## __VA_ARGS__" GNU extension.
+ * An empty %s argument is appended to avoid a dangling comma.
+ */
+#define RTE_FMT(fmt, ...) fmt "%.0s", __VA_ARGS__ ""
+#define RTE_FMT_HEAD(fmt, ...) fmt
+#define RTE_FMT_TAIL(fmt, ...) __VA_ARGS__
+
+/** Mask value of type "tp" for the first "ln" bit set. */
+#define RTE_LEN2MASK(ln, tp) \
+ ((tp)((uint64_t)-1 >> (sizeof(uint64_t) * CHAR_BIT - (ln))))
+
+/** Number of elements in the array. */
+#define RTE_DIM(a) (sizeof (a) / sizeof ((a)[0]))
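+/*
+ * e.g. given "uint32_t arr[16];", RTE_DIM(arr) == 16, and
+ * RTE_LEN2MASK(5, uint8_t) == 0x1f (the five least significant bits set).
+ */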
+
+/**
+ * Converts a numeric string to the equivalent uint64_t value.
+ * As well as straight number conversion, also recognises the suffixes
+ * k, m and g for kilobytes, megabytes and gigabytes respectively.
+ *
+ * If a negative number is passed in, i.e. a string with the first non-blank
+ * character being "-", zero is returned. Zero is also returned in the case of
+ * an error with the strtoull call in the function.
+ *
+ * @param str
+ * String containing number to convert.
+ * @return
+ * Number.
+ */
+#if 0
+static inline uint64_t
+rte_str_to_size(const char *str)
+{
+ char *endptr;
+ unsigned long long size;
+
+ while (isspace((int)*str))
+ str++;
+ if (*str == '-')
+ return 0;
+
+ errno = 0;
+ size = strtoull(str, &endptr, 0);
+ if (errno)
+ return 0;
+
+ if (*endptr == ' ')
+ endptr++; /* allow 1 space gap */
+
+ switch (*endptr){
+ case 'G': case 'g': size *= 1024; /* fall-through */
+ case 'M': case 'm': size *= 1024; /* fall-through */
+ case 'K': case 'k': size *= 1024; /* fall-through */
+ default:
+ break;
+ }
+ return size;
+}
+#endif
+
+/**
+ * Function to terminate the application immediately, printing an error
+ * message and returning the exit_code back to the shell.
+ *
+ * This function never returns
+ *
+ * @param exit_code
+ * The exit code to be returned by the application
+ * @param format
+ * The format string to be used for printing the message. This can include
+ * printf format characters which will be expanded using any further parameters
+ * to the function.
+ */
+__rte_noreturn void
+rte_exit(int exit_code, const char *format, ...)
+ __rte_format_printf(2, 3);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
Index: sys/contrib/dpdk_rte_lpm/rte_debug.h
===================================================================
--- /dev/null
+++ sys/contrib/dpdk_rte_lpm/rte_debug.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#ifndef _RTE_DEBUG_H_
+#define _RTE_DEBUG_H_
+
+/**
+ * @file
+ *
+ * Debug Functions in RTE
+ *
+ * This file defines a generic API for debug operations. Part of
+ * the implementation is architecture-specific.
+ */
+
+//#include "rte_log.h"
+#include "rte_branch_prediction.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Dump the stack of the calling core to the console.
+ */
+void rte_dump_stack(void);
+
+/**
+ * Dump the registers of the calling core to the console.
+ *
+ * Note: Not implemented in a userapp environment; use gdb instead.
+ */
+void rte_dump_registers(void);
+
+/**
+ * Provide notification of a critical non-recoverable error and terminate
+ * execution abnormally.
+ *
+ * Display the format string and its expanded arguments (printf-like).
+ *
+ * In a linux environment, this function dumps the stack and calls
+ * abort() resulting in a core dump if enabled.
+ *
+ * The function never returns.
+ *
+ * @param ...
+ * The format string, followed by the variable list of arguments.
+ */
+#define rte_panic(...) rte_panic_(__func__, __VA_ARGS__, "dummy")
+#define rte_panic_(func, format, ...) __rte_panic(func, format "%.0s", __VA_ARGS__)
+
+#ifdef RTE_ENABLE_ASSERT
+#define RTE_ASSERT(exp) RTE_VERIFY(exp)
+#else
+#define RTE_ASSERT(exp) do {} while (0)
+#endif
+#define RTE_VERIFY(exp) do { \
+ if (unlikely(!(exp))) \
+ rte_panic("line %d\tassert \"%s\" failed\n", __LINE__, #exp); \
+} while (0)
+
+/*
+ * Provide notification of a critical non-recoverable error and stop.
+ *
+ * This function should not be called directly. Refer to rte_panic() macro
+ * documentation.
+ */
+void __rte_panic(const char *funcname, const char *format, ...)
+    __attribute__((__noreturn__, __format__(__printf__, 2, 3)));
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_DEBUG_H_ */
Index: sys/contrib/dpdk_rte_lpm/rte_jhash.h
===================================================================
--- /dev/null
+++ sys/contrib/dpdk_rte_lpm/rte_jhash.h
@@ -0,0 +1,379 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2015 Intel Corporation.
+ */
+
+#ifndef _RTE_JHASH_H
+#define _RTE_JHASH_H
+
+/**
+ * @file
+ *
+ * jhash functions.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//#include <rte_byteorder.h>
+
+/* jhash.h: Jenkins hash support.
+ *
+ * Copyright (C) 2006 Bob Jenkins (bob_jenkins@burtleburtle.net)
+ *
+ * http://burtleburtle.net/bob/hash/
+ *
+ * These are the credits from Bob's sources:
+ *
+ * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
+ *
+ * These are functions for producing 32-bit hashes for hash table lookup.
+ * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
+ * are externally useful functions. Routines to test the hash are included
+ * if SELF_TEST is defined. You can use this free for any purpose. It's in
+ * the public domain. It has no warranty.
+ *
+ * $FreeBSD$
+ */
+
+#define rot(x, k) (((x) << (k)) | ((x) >> (32-(k))))
+
+/** @internal Internal function. NOTE: Arguments are modified. */
+#define __rte_jhash_mix(a, b, c) do { \
+ a -= c; a ^= rot(c, 4); c += b; \
+ b -= a; b ^= rot(a, 6); a += c; \
+ c -= b; c ^= rot(b, 8); b += a; \
+ a -= c; a ^= rot(c, 16); c += b; \
+ b -= a; b ^= rot(a, 19); a += c; \
+ c -= b; c ^= rot(b, 4); b += a; \
+} while (0)
+
+#define __rte_jhash_final(a, b, c) do { \
+ c ^= b; c -= rot(b, 14); \
+ a ^= c; a -= rot(c, 11); \
+ b ^= a; b -= rot(a, 25); \
+ c ^= b; c -= rot(b, 16); \
+ a ^= c; a -= rot(c, 4); \
+ b ^= a; b -= rot(a, 14); \
+ c ^= b; c -= rot(b, 24); \
+} while (0)
+
+/** The golden ratio: an arbitrary value. */
+#define RTE_JHASH_GOLDEN_RATIO 0xdeadbeef
+
+#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
+#define BIT_SHIFT(x, y, k) (((x) >> (k)) | ((uint64_t)(y) << (32-(k))))
+#else
+#define BIT_SHIFT(x, y, k) (((uint64_t)(x) << (k)) | ((y) >> (32-(k))))
+#endif
+
+#define LOWER8b_MASK rte_le_to_cpu_32(0xff)
+#define LOWER16b_MASK rte_le_to_cpu_32(0xffff)
+#define LOWER24b_MASK rte_le_to_cpu_32(0xffffff)
+
+static inline void
+__rte_jhash_2hashes(const void *key, uint32_t length, uint32_t *pc,
+ uint32_t *pb, unsigned check_align)
+{
+ uint32_t a, b, c;
+
+ /* Set up the internal state */
+ a = b = c = RTE_JHASH_GOLDEN_RATIO + ((uint32_t)length) + *pc;
+ c += *pb;
+
+ /*
+	 * Check key alignment. For the x86 architecture, the first case is
+	 * always optimal. If check_align is not set, the first case will be used.
+ */
+#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_I686) || defined(RTE_ARCH_X86_X32)
+ const uint32_t *k = (const uint32_t *)key;
+ const uint32_t s = 0;
+#else
+ const uint32_t *k = (uint32_t *)((uintptr_t)key & (uintptr_t)~3);
+ const uint32_t s = ((uintptr_t)key & 3) * CHAR_BIT;
+#endif
+ if (!check_align || s == 0) {
+ while (length > 12) {
+ a += k[0];
+ b += k[1];
+ c += k[2];
+
+ __rte_jhash_mix(a, b, c);
+
+ k += 3;
+ length -= 12;
+ }
+
+ switch (length) {
+ case 12:
+ c += k[2]; b += k[1]; a += k[0]; break;
+ case 11:
+ c += k[2] & LOWER24b_MASK; b += k[1]; a += k[0]; break;
+ case 10:
+ c += k[2] & LOWER16b_MASK; b += k[1]; a += k[0]; break;
+ case 9:
+ c += k[2] & LOWER8b_MASK; b += k[1]; a += k[0]; break;
+ case 8:
+ b += k[1]; a += k[0]; break;
+ case 7:
+ b += k[1] & LOWER24b_MASK; a += k[0]; break;
+ case 6:
+ b += k[1] & LOWER16b_MASK; a += k[0]; break;
+ case 5:
+ b += k[1] & LOWER8b_MASK; a += k[0]; break;
+ case 4:
+ a += k[0]; break;
+ case 3:
+ a += k[0] & LOWER24b_MASK; break;
+ case 2:
+ a += k[0] & LOWER16b_MASK; break;
+ case 1:
+ a += k[0] & LOWER8b_MASK; break;
+ /* zero length strings require no mixing */
+ case 0:
+ *pc = c;
+ *pb = b;
+ return;
+		}
+ } else {
+ /* all but the last block: affect some 32 bits of (a, b, c) */
+ while (length > 12) {
+ a += BIT_SHIFT(k[0], k[1], s);
+ b += BIT_SHIFT(k[1], k[2], s);
+ c += BIT_SHIFT(k[2], k[3], s);
+ __rte_jhash_mix(a, b, c);
+
+ k += 3;
+ length -= 12;
+ }
+
+ /* last block: affect all 32 bits of (c) */
+ switch (length) {
+ case 12:
+ a += BIT_SHIFT(k[0], k[1], s);
+ b += BIT_SHIFT(k[1], k[2], s);
+ c += BIT_SHIFT(k[2], k[3], s);
+ break;
+ case 11:
+ a += BIT_SHIFT(k[0], k[1], s);
+ b += BIT_SHIFT(k[1], k[2], s);
+ c += BIT_SHIFT(k[2], k[3], s) & LOWER24b_MASK;
+ break;
+ case 10:
+ a += BIT_SHIFT(k[0], k[1], s);
+ b += BIT_SHIFT(k[1], k[2], s);
+ c += BIT_SHIFT(k[2], k[3], s) & LOWER16b_MASK;
+ break;
+ case 9:
+ a += BIT_SHIFT(k[0], k[1], s);
+ b += BIT_SHIFT(k[1], k[2], s);
+ c += BIT_SHIFT(k[2], k[3], s) & LOWER8b_MASK;
+ break;
+ case 8:
+ a += BIT_SHIFT(k[0], k[1], s);
+ b += BIT_SHIFT(k[1], k[2], s);
+ break;
+ case 7:
+ a += BIT_SHIFT(k[0], k[1], s);
+ b += BIT_SHIFT(k[1], k[2], s) & LOWER24b_MASK;
+ break;
+ case 6:
+ a += BIT_SHIFT(k[0], k[1], s);
+ b += BIT_SHIFT(k[1], k[2], s) & LOWER16b_MASK;
+ break;
+ case 5:
+ a += BIT_SHIFT(k[0], k[1], s);
+ b += BIT_SHIFT(k[1], k[2], s) & LOWER8b_MASK;
+ break;
+ case 4:
+ a += BIT_SHIFT(k[0], k[1], s);
+ break;
+ case 3:
+ a += BIT_SHIFT(k[0], k[1], s) & LOWER24b_MASK;
+ break;
+ case 2:
+ a += BIT_SHIFT(k[0], k[1], s) & LOWER16b_MASK;
+ break;
+ case 1:
+ a += BIT_SHIFT(k[0], k[1], s) & LOWER8b_MASK;
+ break;
+ /* zero length strings require no mixing */
+ case 0:
+ *pc = c;
+ *pb = b;
+ return;
+ }
+ }
+
+ __rte_jhash_final(a, b, c);
+
+ *pc = c;
+ *pb = b;
+}
+
+/**
+ * Same as rte_jhash, but takes two seeds and returns two uint32_ts.
+ * pc and pb must be non-null, and *pc and *pb must both be initialized
+ * with seeds. If you pass in (*pb)=0, the output (*pc) will be
+ * the same as the return value from rte_jhash.
+ *
+ * @param key
+ * Key to calculate hash of.
+ * @param length
+ * Length of key in bytes.
+ * @param pc
+ * IN: seed OUT: primary hash value.
+ * @param pb
+ * IN: second seed OUT: secondary hash value.
+ */
+static inline void
+rte_jhash_2hashes(const void *key, uint32_t length, uint32_t *pc, uint32_t *pb)
+{
+ __rte_jhash_2hashes(key, length, pc, pb, 1);
+}
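+
+/*
+ * Usage sketch (illustrative, not upstream DPDK): deriving two hash values
+ * in a single pass, e.g. for a two-choice hash table. The seed value is an
+ * arbitrary example:
+ *
+ *	uint32_t h1 = 0x12345678;	// IN: first seed, OUT: primary hash
+ *	uint32_t h2 = 0;		// IN: second seed, OUT: secondary hash
+ *	rte_jhash_2hashes(key, key_len, &h1, &h2);
+ *	// With h2 seeded to 0, h1 equals rte_jhash(key, key_len, 0x12345678).
+ */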
+
+/**
+ * Same as rte_jhash_32b, but takes two seeds and returns two uint32_ts.
+ * pc and pb must be non-null, and *pc and *pb must both be initialized
+ * with seeds. If you pass in (*pb)=0, the output (*pc) will be
+ * the same as the return value from rte_jhash_32b.
+ *
+ * @param k
+ * Key to calculate hash of.
+ * @param length
+ * Length of key in units of 4 bytes.
+ * @param pc
+ * IN: seed OUT: primary hash value.
+ * @param pb
+ * IN: second seed OUT: secondary hash value.
+ */
+static inline void
+rte_jhash_32b_2hashes(const uint32_t *k, uint32_t length, uint32_t *pc, uint32_t *pb)
+{
+ __rte_jhash_2hashes((const void *) k, (length << 2), pc, pb, 0);
+}
+
+/**
+ * The most generic version, hashes an arbitrary sequence
+ * of bytes. No alignment or length assumptions are made about
+ * the input key. For keys not aligned to four byte boundaries
+ * or a multiple of four bytes in length, the memory region
+ * just after may be read (but not used in the computation).
+ * This may cross a page boundary.
+ *
+ * @param key
+ * Key to calculate hash of.
+ * @param length
+ * Length of key in bytes.
+ * @param initval
+ * Initialising value of hash.
+ * @return
+ * Calculated hash value.
+ */
+static inline uint32_t
+rte_jhash(const void *key, uint32_t length, uint32_t initval)
+{
+ uint32_t initval2 = 0;
+
+ rte_jhash_2hashes(key, length, &initval, &initval2);
+
+ return initval;
+}
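+
+/*
+ * Usage sketch (illustrative, not upstream DPDK): hashing an arbitrary byte
+ * string into a bucket index. nbuckets is a hypothetical power-of-two table
+ * size:
+ *
+ *	const char key[] = "10.0.0.0/8";
+ *	uint32_t h = rte_jhash(key, sizeof(key) - 1, 0);
+ *	uint32_t bucket = h & (nbuckets - 1);
+ */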
+
+/**
+ * A special optimized version that handles one or more uint32_ts.
+ * The length parameter here is the number of uint32_ts in the key.
+ *
+ * @param k
+ * Key to calculate hash of.
+ * @param length
+ * Length of key in units of 4 bytes.
+ * @param initval
+ * Initialising value of hash.
+ * @return
+ * Calculated hash value.
+ */
+static inline uint32_t
+rte_jhash_32b(const uint32_t *k, uint32_t length, uint32_t initval)
+{
+ uint32_t initval2 = 0;
+
+ rte_jhash_32b_2hashes(k, length, &initval, &initval2);
+
+ return initval;
+}
+
+static inline uint32_t
+__rte_jhash_3words(uint32_t a, uint32_t b, uint32_t c, uint32_t initval)
+{
+ a += RTE_JHASH_GOLDEN_RATIO + initval;
+ b += RTE_JHASH_GOLDEN_RATIO + initval;
+ c += RTE_JHASH_GOLDEN_RATIO + initval;
+
+ __rte_jhash_final(a, b, c);
+
+ return c;
+}
+
+/**
+ * A special ultra-optimized version that knows it is hashing exactly
+ * 3 words.
+ *
+ * @param a
+ * First word to calculate hash of.
+ * @param b
+ * Second word to calculate hash of.
+ * @param c
+ * Third word to calculate hash of.
+ * @param initval
+ * Initialising value of hash.
+ * @return
+ * Calculated hash value.
+ */
+static inline uint32_t
+rte_jhash_3words(uint32_t a, uint32_t b, uint32_t c, uint32_t initval)
+{
+ return __rte_jhash_3words(a + 12, b + 12, c + 12, initval);
+}
+
+/**
+ * A special ultra-optimized version that knows it is hashing exactly
+ * 2 words.
+ *
+ * @param a
+ * First word to calculate hash of.
+ * @param b
+ * Second word to calculate hash of.
+ * @param initval
+ * Initialising value of hash.
+ * @return
+ * Calculated hash value.
+ */
+static inline uint32_t
+rte_jhash_2words(uint32_t a, uint32_t b, uint32_t initval)
+{
+ return __rte_jhash_3words(a + 8, b + 8, 8, initval);
+}
+
+/**
+ * A special ultra-optimized version that knows it is hashing exactly
+ * 1 word.
+ *
+ * @param a
+ * Word to calculate hash of.
+ * @param initval
+ * Initialising value of hash.
+ * @return
+ * Calculated hash value.
+ */
+static inline uint32_t
+rte_jhash_1word(uint32_t a, uint32_t initval)
+{
+ return __rte_jhash_3words(a + 4, 4, 4, initval);
+}
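+
+/*
+ * Usage sketch (illustrative, not upstream DPDK): the fixed-width helpers
+ * above skip the generic byte loop entirely. saddr, daddr, ifindex and seed
+ * are hypothetical values:
+ *
+ *	uint32_t h2 = rte_jhash_2words(saddr, daddr, seed);
+ *	uint32_t h1 = rte_jhash_1word(ifindex, seed);
+ */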
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_JHASH_H */
Index: sys/contrib/dpdk_rte_lpm/rte_log.h
===================================================================
--- /dev/null
+++ sys/contrib/dpdk_rte_lpm/rte_log.h
@@ -0,0 +1,383 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2017 Intel Corporation
+ */
+
+#ifndef _RTE_LOG_H_
+#define _RTE_LOG_H_
+
+/**
+ * @file
+ *
+ * RTE Logs API
+ *
+ * This file provides a log API to RTE applications.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <sys/queue.h>
+
+#include <rte_common.h>
+#include <rte_config.h>
+#include <rte_compat.h>
+
+struct rte_log_dynamic_type;
+
+/** The rte_log structure. */
+struct rte_logs {
+ uint32_t type; /**< Bitfield with enabled logs. */
+ uint32_t level; /**< Log level. */
+ FILE *file; /**< Output file set by rte_openlog_stream, or NULL. */
+ size_t dynamic_types_len;
+ struct rte_log_dynamic_type *dynamic_types;
+};
+
+/** Global log information */
+extern struct rte_logs rte_logs;
+
+/* SDK log type */
+#define RTE_LOGTYPE_EAL 0 /**< Log related to eal. */
+#define RTE_LOGTYPE_MALLOC 1 /**< Log related to malloc. */
+#define RTE_LOGTYPE_RING 2 /**< Log related to ring. */
+#define RTE_LOGTYPE_MEMPOOL 3 /**< Log related to mempool. */
+#define RTE_LOGTYPE_TIMER 4 /**< Log related to timers. */
+#define RTE_LOGTYPE_PMD 5 /**< Log related to poll mode driver. */
+#define RTE_LOGTYPE_HASH 6 /**< Log related to hash table. */
+#define RTE_LOGTYPE_LPM 7 /**< Log related to LPM. */
+#define RTE_LOGTYPE_KNI 8 /**< Log related to KNI. */
+#define RTE_LOGTYPE_ACL 9 /**< Log related to ACL. */
+#define RTE_LOGTYPE_POWER 10 /**< Log related to power. */
+#define RTE_LOGTYPE_METER 11 /**< Log related to QoS meter. */
+#define RTE_LOGTYPE_SCHED 12 /**< Log related to QoS port scheduler. */
+#define RTE_LOGTYPE_PORT 13 /**< Log related to port. */
+#define RTE_LOGTYPE_TABLE 14 /**< Log related to table. */
+#define RTE_LOGTYPE_PIPELINE 15 /**< Log related to pipeline. */
+#define RTE_LOGTYPE_MBUF 16 /**< Log related to mbuf. */
+#define RTE_LOGTYPE_CRYPTODEV 17 /**< Log related to cryptodev. */
+#define RTE_LOGTYPE_EFD 18 /**< Log related to EFD. */
+#define RTE_LOGTYPE_EVENTDEV 19 /**< Log related to eventdev. */
+#define RTE_LOGTYPE_GSO 20 /**< Log related to GSO. */
+
+/* these log types can be used in an application */
+#define RTE_LOGTYPE_USER1 24 /**< User-defined log type 1. */
+#define RTE_LOGTYPE_USER2 25 /**< User-defined log type 2. */
+#define RTE_LOGTYPE_USER3 26 /**< User-defined log type 3. */
+#define RTE_LOGTYPE_USER4 27 /**< User-defined log type 4. */
+#define RTE_LOGTYPE_USER5 28 /**< User-defined log type 5. */
+#define RTE_LOGTYPE_USER6 29 /**< User-defined log type 6. */
+#define RTE_LOGTYPE_USER7 30 /**< User-defined log type 7. */
+#define RTE_LOGTYPE_USER8 31 /**< User-defined log type 8. */
+
+/** First identifier for extended logs */
+#define RTE_LOGTYPE_FIRST_EXT_ID 32
+
+/* Can't use 0, as it gives compiler warnings */
+#define RTE_LOG_EMERG 1U /**< System is unusable. */
+#define RTE_LOG_ALERT 2U /**< Action must be taken immediately. */
+#define RTE_LOG_CRIT 3U /**< Critical conditions. */
+#define RTE_LOG_ERR 4U /**< Error conditions. */
+#define RTE_LOG_WARNING 5U /**< Warning conditions. */
+#define RTE_LOG_NOTICE 6U /**< Normal but significant condition. */
+#define RTE_LOG_INFO 7U /**< Informational. */
+#define RTE_LOG_DEBUG 8U /**< Debug-level messages. */
+
+/**
+ * Change the stream that will be used by the logging system.
+ *
+ * This can be done at any time. The f argument represents the stream
+ * to be used to send the logs. If f is NULL, the default output is
+ * used (stderr).
+ *
+ * @param f
+ * Pointer to the stream.
+ * @return
+ * - 0 on success.
+ * - Negative on error.
+ */
+int rte_openlog_stream(FILE *f);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Retrieve the stream used by the logging system (see rte_openlog_stream()
+ * to change it).
+ *
+ * @return
+ * Pointer to the stream.
+ */
+__rte_experimental
+FILE *rte_log_get_stream(void);
+
+/**
+ * Set the global log level.
+ *
+ * After this call, logs with a level lower than or equal to the level
+ * passed as argument will be displayed.
+ *
+ * @param level
+ * Log level. A value between RTE_LOG_EMERG (1) and RTE_LOG_DEBUG (8).
+ */
+void rte_log_set_global_level(uint32_t level);
+
+/**
+ * Get the global log level.
+ *
+ * @return
+ * The current global log level.
+ */
+uint32_t rte_log_get_global_level(void);
+
+/**
+ * Get the log level for a given type.
+ *
+ * @param logtype
+ * The log type identifier.
+ * @return
+ * 0 on success, a negative value if logtype is invalid.
+ */
+int rte_log_get_level(uint32_t logtype);
+
+/**
+ * For a given `logtype`, check if a log with `loglevel` can be printed.
+ *
+ * @param logtype
+ * The log type identifier
+ * @param loglevel
+ * Log level. A value between RTE_LOG_EMERG (1) and RTE_LOG_DEBUG (8).
+ * @return
+ * Returns 'true' if log can be printed and 'false' if it can't.
+ */
+__rte_experimental
+bool rte_log_can_log(uint32_t logtype, uint32_t loglevel);
+
+/**
+ * Set the log level for a given type based on shell pattern.
+ *
+ * @param pattern
+ * The match pattern identifying the log type.
+ * @param level
+ * The level to be set.
+ * @return
+ * 0 on success, a negative value if level is invalid.
+ */
+int rte_log_set_level_pattern(const char *pattern, uint32_t level);
+
+/**
+ * Set the log level for a given type based on regular expression.
+ *
+ * @param regex
+ * The regular expression identifying the log type.
+ * @param level
+ * The level to be set.
+ * @return
+ * 0 on success, a negative value if level is invalid.
+ */
+int rte_log_set_level_regexp(const char *regex, uint32_t level);
+
+/**
+ * Set the log level for a given type.
+ *
+ * @param logtype
+ * The log type identifier.
+ * @param level
+ * The level to be set.
+ * @return
+ * 0 on success, a negative value if logtype or level is invalid.
+ */
+int rte_log_set_level(uint32_t logtype, uint32_t level);
+
+/**
+ * Get the current loglevel for the message being processed.
+ *
+ * Before calling the user-defined stream for logging, the log
+ * subsystem sets a per-lcore variable containing the loglevel and the
+ * logtype of the message being processed. This information can be
+ * accessed by the user-defined log output function through this
+ * function.
+ *
+ * @return
+ * The loglevel of the message being processed.
+ */
+int rte_log_cur_msg_loglevel(void);
+
+/**
+ * Get the current logtype for the message being processed.
+ *
+ * Before calling the user-defined stream for logging, the log
+ * subsystem sets a per-lcore variable containing the loglevel and the
+ * logtype of the message being processed. This information can be
+ * accessed by the user-defined log output function through this
+ * function.
+ *
+ * @return
+ * The logtype of the message being processed.
+ */
+int rte_log_cur_msg_logtype(void);
+
+/**
+ * Register a dynamic log type
+ *
+ * If a log is already registered with the same type, the returned value
+ * is the same as the previous one.
+ *
+ * @param name
+ * The string identifying the log type.
+ * @return
+ * - >0: success, the returned value is the log type identifier.
+ * - (-ENOMEM): cannot allocate memory.
+ */
+int rte_log_register(const char *name);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Register a dynamic log type and try to pick its level from EAL options
+ *
+ * rte_log_register() is called inside. If successful, the function tries
+ * to search for matching regexp in the list of EAL log level options and
+ * pick the level from the last matching entry. If nothing can be applied
+ * from the list, the level will be set to the user-defined default value.
+ *
+ * @param name
+ * Name for the log type to be registered
+ * @param level_def
+ * Fallback level to be set if the global list has no matching options
+ * @return
+ * - >=0: the newly registered log type
+ * - <0: rte_log_register() error value
+ */
+__rte_experimental
+int rte_log_register_type_and_pick_level(const char *name, uint32_t level_def);
+
+/**
+ * Dump log information.
+ *
+ * Dump the global level and the registered log types.
+ *
+ * @param f
+ * The output stream where the dump should be sent.
+ */
+void rte_log_dump(FILE *f);
+
+/**
+ * Generates a log message.
+ *
+ * The message will be sent in the stream defined by the previous call
+ * to rte_openlog_stream().
+ *
+ * The level argument determines if the log should be displayed or
+ * not, depending on the global rte_logs variable.
+ *
+ * The preferred alternative is RTE_LOG() because it adds the
+ * level and type to the logged string.
+ *
+ * @param level
+ * Log level. A value between RTE_LOG_EMERG (1) and RTE_LOG_DEBUG (8).
+ * @param logtype
+ * The log type, for example, RTE_LOGTYPE_EAL.
+ * @param format
+ * The format string, as in printf(3), followed by the variable arguments
+ * required by the format.
+ * @return
+ * - 0: Success.
+ * - Negative on error.
+ */
+int rte_log(uint32_t level, uint32_t logtype, const char *format, ...)
+#ifdef __GNUC__
+#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 2))
+ __rte_cold
+#endif
+#endif
+ __rte_format_printf(3, 4);
+
+/**
+ * Generates a log message.
+ *
+ * The message will be sent in the stream defined by the previous call
+ * to rte_openlog_stream().
+ *
+ * The level argument determines if the log should be displayed or
+ * not, depending on the global rte_logs variable. A trailing
+ * newline may be added if needed.
+ *
+ * The preferred alternative is RTE_LOG() because it adds the
+ * level and type to the logged string.
+ *
+ * @param level
+ * Log level. A value between RTE_LOG_EMERG (1) and RTE_LOG_DEBUG (8).
+ * @param logtype
+ * The log type, for example, RTE_LOGTYPE_EAL.
+ * @param format
+ * The format string, as in printf(3), followed by the variable arguments
+ * required by the format.
+ * @param ap
+ * The va_list of the variable arguments required by the format.
+ * @return
+ * - 0: Success.
+ * - Negative on error.
+ */
+int rte_vlog(uint32_t level, uint32_t logtype, const char *format, va_list ap)
+ __rte_format_printf(3, 0);
+
+/**
+ * Generates a log message.
+ *
+ * RTE_LOG() is a helper that prefixes the string with the log level
+ * and type, and calls rte_log().
+ *
+ * @param l
+ * Log level. A value between EMERG (1) and DEBUG (8). The short name is
+ * expanded by the macro, so it cannot be an integer value.
+ * @param t
+ * The log type, for example, EAL. The short name is expanded by the
+ * macro, so it cannot be an integer value.
+ * @param ...
+ * The fmt string, as in printf(3), followed by the variable arguments
+ * required by the format.
+ * @return
+ * - 0: Success.
+ * - Negative on error.
+ */
+#define RTE_LOG(l, t, ...) \
+ rte_log(RTE_LOG_ ## l, \
+ RTE_LOGTYPE_ ## t, # t ": " __VA_ARGS__)
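+
+/*
+ * Usage sketch (illustrative, not upstream DPDK): both arguments are short
+ * names expanded by the macro; nroutes is a hypothetical counter:
+ *
+ *	rte_log_set_global_level(RTE_LOG_INFO);
+ *	RTE_LOG(INFO, LPM, "added %u routes\n", nroutes);
+ *	// expands to rte_log(RTE_LOG_INFO, RTE_LOGTYPE_LPM, "LPM: added ...")
+ */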
+
+/**
+ * Generates a log message for data path.
+ *
+ * Similar to RTE_LOG(), except that it is removed at compilation time
+ * if the RTE_LOG_DP_LEVEL configuration option is lower than the log
+ * level argument.
+ *
+ * @param l
+ * Log level. A value between EMERG (1) and DEBUG (8). The short name is
+ * expanded by the macro, so it cannot be an integer value.
+ * @param t
+ * The log type, for example, EAL. The short name is expanded by the
+ * macro, so it cannot be an integer value.
+ * @param ...
+ * The fmt string, as in printf(3), followed by the variable arguments
+ * required by the format.
+ * @return
+ * - 0: Success.
+ * - Negative on error.
+ */
+#define RTE_LOG_DP(l, t, ...) \
+ (void)((RTE_LOG_ ## l <= RTE_LOG_DP_LEVEL) ? \
+ rte_log(RTE_LOG_ ## l, \
+ RTE_LOGTYPE_ ## t, # t ": " __VA_ARGS__) : \
+ 0)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_LOG_H_ */
Index: sys/contrib/dpdk_rte_lpm/rte_lpm.h
===================================================================
--- /dev/null
+++ sys/contrib/dpdk_rte_lpm/rte_lpm.h
@@ -0,0 +1,403 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#ifndef _RTE_LPM_H_
+#define _RTE_LPM_H_
+
+/**
+ * @file
+ * RTE Longest Prefix Match (LPM)
+ */
+
+/*
+#include <errno.h>
+#include <sys/queue.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <rte_branch_prediction.h>
+#include <rte_byteorder.h>
+#include <rte_config.h>
+#include <rte_memory.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+*/
+#include "rte_branch_prediction.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** Max number of characters in LPM name. */
+#define RTE_LPM_NAMESIZE 32
+
+/** Maximum depth value possible for IPv4 LPM. */
+#define RTE_LPM_MAX_DEPTH 32
+
+/** @internal Total number of tbl24 entries. */
+#define RTE_LPM_TBL24_NUM_ENTRIES (1 << 24)
+
+/** @internal Number of entries in a tbl8 group. */
+#define RTE_LPM_TBL8_GROUP_NUM_ENTRIES 256
+
+/** @internal Max number of tbl8 groups in the tbl8. */
+#define RTE_LPM_MAX_TBL8_NUM_GROUPS (1 << 24)
+
+/** @internal Total number of tbl8 groups in the tbl8. */
+#define RTE_LPM_TBL8_NUM_GROUPS 256
+
+/** @internal Total number of tbl8 entries. */
+#define RTE_LPM_TBL8_NUM_ENTRIES (RTE_LPM_TBL8_NUM_GROUPS * \
+ RTE_LPM_TBL8_GROUP_NUM_ENTRIES)
+
+/** @internal Macro to enable/disable run-time checks. */
+#if defined(RTE_LIBRTE_LPM_DEBUG)
+#define RTE_LPM_RETURN_IF_TRUE(cond, retval) do { \
+ if (cond) return (retval); \
+} while (0)
+#else
+#define RTE_LPM_RETURN_IF_TRUE(cond, retval)
+#endif
+
+/** @internal bitmask with valid and valid_group fields set */
+#define RTE_LPM_VALID_EXT_ENTRY_BITMASK 0x03000000
+
+/** Bitmask used to indicate successful lookup */
+#define RTE_LPM_LOOKUP_SUCCESS 0x01000000
+
+#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
+/** @internal Tbl24 entry structure. */
+__extension__
+struct rte_lpm_tbl_entry {
+ /**
+ * Stores Next hop (tbl8 or tbl24 when valid_group is not set) or
+ * a group index pointing to a tbl8 structure (tbl24 only, when
+ * valid_group is set)
+ */
+ uint32_t next_hop :24;
+ /* Using single uint8_t to store 3 values. */
+ uint32_t valid :1; /**< Validation flag. */
+ /**
+ * For tbl24:
+ * - valid_group == 0: entry stores a next hop
+ * - valid_group == 1: entry stores a group_index pointing to a tbl8
+ * For tbl8:
+ * - valid_group indicates whether the current tbl8 is in use or not
+ */
+ uint32_t valid_group :1;
+ uint32_t depth :6; /**< Rule depth. */
+};
+
+#else
+
+__extension__
+struct rte_lpm_tbl_entry {
+ uint32_t depth :6;
+ uint32_t valid_group :1;
+ uint32_t valid :1;
+ uint32_t next_hop :24;
+
+};
+
+#endif
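+
+/*
+ * Editorial note (not an upstream diagram): exact bit-field packing is
+ * compiler-defined, but with the usual little-endian layout (first member
+ * in the least significant bits) the entry above packs as:
+ *
+ *	 31     26 25          24    23                       0
+ *	+---------+------------+-----+------------------------+
+ *	|  depth  |valid_group |valid|  next_hop / group_idx  |
+ *	+---------+------------+-----+------------------------+
+ *
+ * which matches RTE_LPM_VALID_EXT_ENTRY_BITMASK (0x03000000, bits 24-25)
+ * and RTE_LPM_LOOKUP_SUCCESS (0x01000000, bit 24) below.
+ */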
+
+/** LPM configuration structure. */
+struct rte_lpm_config {
+ uint32_t max_rules; /**< Max number of rules. */
+ uint32_t number_tbl8s; /**< Number of tbl8s to allocate. */
+ int flags; /**< This field is currently unused. */
+};
+
+/** @internal Rule structure. */
+struct rte_lpm_rule {
+ uint32_t ip; /**< Rule IP address. */
+ uint32_t next_hop; /**< Rule next hop. */
+};
+
+/** @internal Contains metadata about the rules table. */
+struct rte_lpm_rule_info {
+ uint32_t used_rules; /**< Used rules so far. */
+ uint32_t first_rule; /**< Indexes the first rule of a given depth. */
+};
+
+struct nhop_object;
+struct rte_lpm_external {
+ struct nhop_object **nh_idx; /**< # -> idx mappings */
+ uint32_t default_idx; /* nhop index of default route */
+ uint32_t fibnum; /* fib index */
+};
+
+/** @internal LPM structure. */
+struct rte_lpm {
+ /* LPM metadata. */
+ struct rte_lpm_external ext;
+ char name[RTE_LPM_NAMESIZE]; /**< Name of the lpm. */
+ uint32_t max_rules; /**< Max. balanced rules per lpm. */
+ uint32_t number_tbl8s; /**< Number of tbl8s. */
+ struct rte_lpm_rule_info rule_info[RTE_LPM_MAX_DEPTH]; /**< Rule info table. */
+
+ /* LPM Tables. */
+ struct rte_lpm_tbl_entry tbl24[RTE_LPM_TBL24_NUM_ENTRIES]
+ __rte_cache_aligned; /**< LPM tbl24 table. */
+ struct rte_lpm_tbl_entry *tbl8; /**< LPM tbl8 table. */
+ struct rte_lpm_rule *rules_tbl; /**< LPM rules. */
+};
+
+/**
+ * Create an LPM object.
+ *
+ * @param name
+ * LPM object name
+ * @param socket_id
+ * NUMA socket ID for LPM table memory allocation
+ * @param config
+ * Structure containing the configuration
+ * @return
+ * Handle to LPM object on success, NULL otherwise with rte_errno set
+ *   to an appropriate value. Possible rte_errno values include:
+ * - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure
+ * - E_RTE_SECONDARY - function was called from a secondary process instance
+ * - EINVAL - invalid parameter passed to function
+ * - ENOSPC - the maximum number of memzones has already been allocated
+ * - EEXIST - a memzone with the same name already exists
+ * - ENOMEM - no appropriate memory area found in which to create memzone
+ */
+struct rte_lpm *
+rte_lpm_create(const char *name, int socket_id,
+ const struct rte_lpm_config *config);
+
+/**
+ * Find an existing LPM object and return a pointer to it.
+ *
+ * @param name
+ * Name of the lpm object as passed to rte_lpm_create()
+ * @return
+ * Pointer to lpm object or NULL if object not found with rte_errno
+ * set appropriately. Possible rte_errno values include:
+ * - ENOENT - required entry not available to return.
+ */
+struct rte_lpm *
+rte_lpm_find_existing(const char *name);
+
+/**
+ * Free an LPM object.
+ *
+ * @param lpm
+ * LPM object handle
+ * @return
+ * None
+ */
+void
+rte_lpm_free(struct rte_lpm *lpm);
+
+/**
+ * Add a rule to the LPM table.
+ *
+ * @param lpm
+ * LPM object handle
+ * @param ip
+ * IP of the rule to be added to the LPM table
+ * @param depth
+ * Depth of the rule to be added to the LPM table
+ * @param next_hop
+ * Next hop of the rule to be added to the LPM table
+ * @return
+ * 0 on success, negative value otherwise
+ */
+int
+rte_lpm_add(struct rte_lpm *lpm, uint32_t ip, uint8_t depth, uint32_t next_hop);
+
+/**
+ * Check if a rule is present in the LPM table,
+ * and provide its next hop if it is.
+ *
+ * @param lpm
+ * LPM object handle
+ * @param ip
+ * IP of the rule to be searched
+ * @param depth
+ *   Depth of the rule to be searched
+ * @param next_hop
+ * Next hop of the rule (valid only if it is found)
+ * @return
+ * 1 if the rule exists, 0 if it does not, a negative value on failure
+ */
+int
+rte_lpm_is_rule_present(struct rte_lpm *lpm, uint32_t ip, uint8_t depth,
+uint32_t *next_hop);
+
+/**
+ * Delete a rule from the LPM table.
+ *
+ * @param lpm
+ * LPM object handle
+ * @param ip
+ * IP of the rule to be deleted from the LPM table
+ * @param depth
+ * Depth of the rule to be deleted from the LPM table
+ * @param sub_rule_depth
+ *   Depth of the parent rule
+ * @param sub_rule_nhop
+ *   Nexthop index of the parent rule
+ * @return
+ * 0 on success, negative value otherwise
+ */
+int
+rte_lpm_delete(struct rte_lpm *lpm, uint32_t ip, uint8_t depth,
+ uint8_t sub_rule_depth, uint32_t sub_rule_nhop);
+
+/**
+ * Delete all rules from the LPM table.
+ *
+ * @param lpm
+ * LPM object handle
+ */
+void
+rte_lpm_delete_all(struct rte_lpm *lpm);
+
+/**
+ * Lookup an IP into the LPM table.
+ *
+ * @param lpm
+ * LPM object handle
+ * @param ip
+ * IP to be looked up in the LPM table
+ * @param next_hop
+ * Next hop of the most specific rule found for IP (valid on lookup hit only)
+ * @return
+ * -EINVAL for incorrect arguments, -ENOENT on lookup miss, 0 on lookup hit
+ */
+static inline int
+rte_lpm_lookup(struct rte_lpm *lpm, uint32_t ip, uint32_t *next_hop)
+{
+ unsigned tbl24_index = (ip >> 8);
+ uint32_t tbl_entry;
+ const uint32_t *ptbl;
+
+ /* DEBUG: Check user input arguments. */
+ RTE_LPM_RETURN_IF_TRUE(((lpm == NULL) || (next_hop == NULL)), -EINVAL);
+
+ /* Copy tbl24 entry */
+ ptbl = (const uint32_t *)(&lpm->tbl24[tbl24_index]);
+ tbl_entry = *ptbl;
+
+ /* Memory ordering is not required in lookup. Because dataflow
+ * dependency exists, compiler or HW won't be able to re-order
+ * the operations.
+ */
+ /* Copy tbl8 entry (only if needed) */
+ if (unlikely((tbl_entry & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+ RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
+
+ unsigned tbl8_index = (uint8_t)ip +
+ (((uint32_t)tbl_entry & 0x00FFFFFF) *
+ RTE_LPM_TBL8_GROUP_NUM_ENTRIES);
+
+ ptbl = (const uint32_t *)&lpm->tbl8[tbl8_index];
+ tbl_entry = *ptbl;
+ }
+
+ *next_hop = ((uint32_t)tbl_entry & 0x00FFFFFF);
+ return (tbl_entry & RTE_LPM_LOOKUP_SUCCESS) ? 0 : -ENOENT;
+}
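+
+/*
+ * Usage sketch (illustrative, not upstream DPDK): a minimal
+ * create/add/lookup sequence. The name, addresses, nexthop index and the
+ * use_nexthop() consumer are hypothetical; error handling is elided:
+ *
+ *	struct rte_lpm_config cfg = {
+ *		.max_rules = 1024,
+ *		.number_tbl8s = 256,
+ *	};
+ *	struct rte_lpm *lpm = rte_lpm_create("example", -1, &cfg);
+ *	uint32_t nh;
+ *
+ *	rte_lpm_add(lpm, 0x0a000000, 8, 1);	// 10.0.0.0/8 -> nexthop idx 1
+ *	if (rte_lpm_lookup(lpm, 0x0a010203, &nh) == 0)	// 10.1.2.3
+ *		use_nexthop(nh);
+ */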
+
+/**
+ * Lookup multiple IP addresses in an LPM table. This may be implemented as a
+ * macro, so the address of the function should not be used.
+ *
+ * @param lpm
+ * LPM object handle
+ * @param ips
+ * Array of IPs to be looked up in the LPM table
+ * @param next_hops
+ * Next hop of the most specific rule found for IP (valid on lookup hit only).
+ *   This is an array of 32-bit values. The most significant byte in each
+ *   value indicates whether the lookup was successful (bitmask
+ *   RTE_LPM_LOOKUP_SUCCESS is set). The 24 least significant bits hold the
+ *   actual next hop.
+ * @param n
+ * Number of elements in ips (and next_hops) array to lookup. This should be a
+ * compile time constant, and divisible by 8 for best performance.
+ * @return
+ * -EINVAL for incorrect arguments, otherwise 0
+ */
+#define rte_lpm_lookup_bulk(lpm, ips, next_hops, n) \
+ rte_lpm_lookup_bulk_func(lpm, ips, next_hops, n)
+
+static inline int
+rte_lpm_lookup_bulk_func(const struct rte_lpm *lpm, const uint32_t *ips,
+ uint32_t *next_hops, const unsigned n)
+{
+ unsigned i;
+ unsigned tbl24_indexes[n];
+ const uint32_t *ptbl;
+
+ /* DEBUG: Check user input arguments. */
+ RTE_LPM_RETURN_IF_TRUE(((lpm == NULL) || (ips == NULL) ||
+ (next_hops == NULL)), -EINVAL);
+
+ for (i = 0; i < n; i++) {
+ tbl24_indexes[i] = ips[i] >> 8;
+ }
+
+ for (i = 0; i < n; i++) {
+ /* Simply copy tbl24 entry to output */
+ ptbl = (const uint32_t *)&lpm->tbl24[tbl24_indexes[i]];
+ next_hops[i] = *ptbl;
+
+ /* Overwrite output with tbl8 entry if needed */
+ if (unlikely((next_hops[i] & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+ RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
+
+ unsigned tbl8_index = (uint8_t)ips[i] +
+ (((uint32_t)next_hops[i] & 0x00FFFFFF) *
+ RTE_LPM_TBL8_GROUP_NUM_ENTRIES);
+
+ ptbl = (const uint32_t *)&lpm->tbl8[tbl8_index];
+ next_hops[i] = *ptbl;
+ }
+ }
+ return 0;
+}
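+
+/*
+ * Usage sketch (illustrative, not upstream DPDK): bulk lookup over a
+ * fixed-size batch; each result word carries RTE_LPM_LOOKUP_SUCCESS on a
+ * hit and the next hop in its low 24 bits. ips[] and use_nexthop() are
+ * hypothetical:
+ *
+ *	uint32_t nhs[8];
+ *	unsigned i;
+ *
+ *	rte_lpm_lookup_bulk(lpm, ips, nhs, 8);
+ *	for (i = 0; i < 8; i++)
+ *		if (nhs[i] & RTE_LPM_LOOKUP_SUCCESS)
+ *			use_nexthop(nhs[i] & 0x00FFFFFF);
+ */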
+
+/* Mask four results. */
+#define RTE_LPM_MASKX4_RES UINT64_C(0x00ffffff00ffffff)
+
+/**
+ * Lookup four IP addresses in an LPM table.
+ *
+ * @param lpm
+ * LPM object handle
+ * @param ip
+ * Four IPs to be looked up in the LPM table
+ * @param hop
+ * Next hop of the most specific rule found for IP (valid on lookup hit only).
+ *   This is a 4-element array of 32-bit values.
+ *   If the lookup was successful for the given IP, then the 24 least
+ *   significant bits of the corresponding element hold the actual next hop
+ *   and the remaining bits are zero.
+ *   If the lookup for the given IP failed, then the corresponding element
+ *   contains the default value, see the description of the next parameter.
+ * @param defv
+ * Default value to populate into corresponding element of hop[] array,
+ * if lookup would fail.
+ */
+#if 0
+static inline void
+rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
+ uint32_t defv);
+
+#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64)
+#include "rte_lpm_neon.h"
+#elif defined(RTE_ARCH_PPC_64)
+#include "rte_lpm_altivec.h"
+#else
+#include "rte_lpm_sse.h"
+#endif
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_LPM_H_ */
Index: sys/contrib/dpdk_rte_lpm/rte_lpm.c
===================================================================
--- /dev/null
+++ sys/contrib/dpdk_rte_lpm/rte_lpm.c
@@ -0,0 +1,1107 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#include <sys/param.h>
+#include <sys/ctype.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/kernel.h>
+
+int errno = 0, rte_errno = 0;
+
+#if 0
+#include <string.h>
+#include <stdint.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <sys/queue.h>
+
+#include <rte_log.h>
+#include <rte_branch_prediction.h>
+#include <rte_common.h>
+#include <rte_memory.h> /* for definition of RTE_CACHE_LINE_SIZE */
+#include <rte_malloc.h>
+#include <rte_eal.h>
+#include <rte_eal_memconfig.h>
+#include <rte_per_lcore.h>
+#include <rte_string_fns.h>
+#include <rte_errno.h>
+#include <rte_rwlock.h>
+#include <rte_spinlock.h>
+#include <rte_tailq.h>
+#endif
+
+#include "rte_shim.h"
+#include "rte_lpm.h"
+
+#if 0
+TAILQ_HEAD(rte_lpm_list, rte_tailq_entry);
+
+static struct rte_tailq_elem rte_lpm_tailq = {
+ .name = "RTE_LPM",
+};
+EAL_REGISTER_TAILQ(rte_lpm_tailq)
+#endif
+
+#define MAX_DEPTH_TBL24 24
+
+enum valid_flag {
+ INVALID = 0,
+ VALID
+};
+
+/* Macro to enable/disable run-time checks. */
+#if defined(RTE_LIBRTE_LPM_DEBUG)
+#include <rte_debug.h>
+#define VERIFY_DEPTH(depth) do { \
+ if ((depth == 0) || (depth > RTE_LPM_MAX_DEPTH)) \
+ rte_panic("LPM: Invalid depth (%u) at line %d", \
+ (unsigned)(depth), __LINE__); \
+} while (0)
+#else
+#define VERIFY_DEPTH(depth)
+#endif
+
+/*
+ * Converts a given depth value to its corresponding mask value.
+ *
+ * depth (IN) : range = 1 - 32
+ * mask (OUT) : 32bit mask
+ */
+static uint32_t __attribute__((pure))
+depth_to_mask(uint8_t depth)
+{
+ VERIFY_DEPTH(depth);
+
+ /* To calculate a mask start with a 1 on the left hand side and right
+ * shift while populating the left hand side with 1's
+ */
+ return (int)0x80000000 >> (depth - 1);
+}
+
+/*
+ * Converts given depth value to its corresponding range value.
+ */
+static uint32_t __attribute__((pure))
+depth_to_range(uint8_t depth)
+{
+ VERIFY_DEPTH(depth);
+
+ /*
+ * Calculate tbl24 range. (Note: 2^depth = 1 << depth)
+ */
+ if (depth <= MAX_DEPTH_TBL24)
+ return 1 << (MAX_DEPTH_TBL24 - depth);
+
+ /* Else if depth is greater than 24 */
+ return 1 << (RTE_LPM_MAX_DEPTH - depth);
+}
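+
+/*
+ * Editorial worked example for the two helpers above: depth 8 (a /8 route)
+ * yields mask 0xFF000000 and a tbl24 range of 1 << (24 - 8) = 65536
+ * entries, while depth 28 yields mask 0xFFFFFFF0 and a tbl8 range of
+ * 1 << (32 - 28) = 16 entries.
+ */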
+
+#if 0
+/*
+ * Find an existing lpm table and return a pointer to it.
+ */
+struct rte_lpm *
+rte_lpm_find_existing(const char *name)
+{
+ struct rte_lpm *l = NULL;
+ struct rte_tailq_entry *te;
+ struct rte_lpm_list *lpm_list;
+
+ lpm_list = RTE_TAILQ_CAST(rte_lpm_tailq.head, rte_lpm_list);
+
+ rte_mcfg_tailq_read_lock();
+ TAILQ_FOREACH(te, lpm_list, next) {
+ l = te->data;
+ if (strncmp(name, l->name, RTE_LPM_NAMESIZE) == 0)
+ break;
+ }
+ rte_mcfg_tailq_read_unlock();
+
+ if (te == NULL) {
+ rte_errno = ENOENT;
+ return NULL;
+ }
+
+ return l;
+}
+#endif
+
+/*
+ * Allocates memory for LPM object
+ */
+struct rte_lpm *
+rte_lpm_create(const char *name, int socket_id,
+ const struct rte_lpm_config *config)
+{
+ char mem_name[RTE_LPM_NAMESIZE];
+ struct rte_lpm *lpm = NULL;
+ //struct rte_tailq_entry *te;
+ uint32_t mem_size, rules_size, tbl8s_size;
+ //struct rte_lpm_list *lpm_list;
+
+ //lpm_list = RTE_TAILQ_CAST(rte_lpm_tailq.head, rte_lpm_list);
+
+ RTE_BUILD_BUG_ON(sizeof(struct rte_lpm_tbl_entry) != 4);
+
+ /* Check user arguments. */
+ if ((name == NULL) || (socket_id < -1) || (config->max_rules == 0)
+ || config->number_tbl8s > RTE_LPM_MAX_TBL8_NUM_GROUPS) {
+ rte_errno = EINVAL;
+ return NULL;
+ }
+
+ snprintf(mem_name, sizeof(mem_name), "LPM_%s", name);
+
+ /* Determine the amount of memory to allocate. */
+ mem_size = sizeof(*lpm);
+ rules_size = sizeof(struct rte_lpm_rule) * config->max_rules;
+ tbl8s_size = (sizeof(struct rte_lpm_tbl_entry) *
+ RTE_LPM_TBL8_GROUP_NUM_ENTRIES * config->number_tbl8s);
+
+#if 0
+ rte_mcfg_tailq_write_lock();
+
+ /* guarantee there's no existing */
+ TAILQ_FOREACH(te, lpm_list, next) {
+ lpm = te->data;
+ if (strncmp(name, lpm->name, RTE_LPM_NAMESIZE) == 0)
+ break;
+ }
+
+ if (te != NULL) {
+ lpm = NULL;
+ rte_errno = EEXIST;
+ goto exit;
+ }
+
+ /* allocate tailq entry */
+ te = rte_zmalloc("LPM_TAILQ_ENTRY", sizeof(*te), 0);
+ if (te == NULL) {
+ RTE_LOG(ERR, LPM, "Failed to allocate tailq entry\n");
+ rte_errno = ENOMEM;
+ goto exit;
+ }
+#endif
+
+ /* Allocate memory to store the LPM data structures. */
+ lpm = rte_zmalloc_socket(mem_name, mem_size,
+ RTE_CACHE_LINE_SIZE, socket_id);
+ if (lpm == NULL) {
+ RTE_LOG(ERR, LPM, "LPM memory allocation failed\n");
+ //rte_free(te);
+ rte_errno = ENOMEM;
+ goto exit;
+ }
+
+ lpm->rules_tbl = rte_zmalloc_socket(NULL,
+ (size_t)rules_size, RTE_CACHE_LINE_SIZE, socket_id);
+
+ if (lpm->rules_tbl == NULL) {
+ RTE_LOG(ERR, LPM, "LPM rules_tbl memory allocation failed\n");
+ rte_free(lpm);
+ lpm = NULL;
+ //rte_free(te);
+ rte_errno = ENOMEM;
+ goto exit;
+ }
+
+ lpm->tbl8 = rte_zmalloc_socket(NULL,
+ (size_t)tbl8s_size, RTE_CACHE_LINE_SIZE, socket_id);
+
+ if (lpm->tbl8 == NULL) {
+ RTE_LOG(ERR, LPM, "LPM tbl8 memory allocation failed\n");
+ rte_free(lpm->rules_tbl);
+ rte_free(lpm);
+ lpm = NULL;
+ //rte_free(te);
+ rte_errno = ENOMEM;
+ goto exit;
+ }
+
+ /* Save user arguments. */
+ lpm->max_rules = config->max_rules;
+ lpm->number_tbl8s = config->number_tbl8s;
+ strlcpy(lpm->name, name, sizeof(lpm->name));
+
+ //te->data = lpm;
+
+ //TAILQ_INSERT_TAIL(lpm_list, te, next);
+
+exit:
+	//rte_mcfg_tailq_write_unlock(); /* lock/tailq handling is #if 0'd above */
+
+ return lpm;
+}
+
+/*
+ * Deallocates memory for given LPM table.
+ */
+void
+rte_lpm_free(struct rte_lpm *lpm)
+{
+#if 0
+ struct rte_lpm_list *lpm_list;
+ struct rte_tailq_entry *te;
+
+ /* Check user arguments. */
+ if (lpm == NULL)
+ return;
+
+ lpm_list = RTE_TAILQ_CAST(rte_lpm_tailq.head, rte_lpm_list);
+
+ rte_mcfg_tailq_write_lock();
+
+ /* find our tailq entry */
+ TAILQ_FOREACH(te, lpm_list, next) {
+ if (te->data == (void *) lpm)
+ break;
+ }
+ if (te != NULL)
+ TAILQ_REMOVE(lpm_list, te, next);
+
+ rte_mcfg_tailq_write_unlock();
+#endif
+
+ rte_free(lpm->tbl8);
+ rte_free(lpm->rules_tbl);
+ rte_free(lpm);
+ //rte_free(te);
+}
+
+#if 0
+/*
+ * Adds a rule to the rule table.
+ *
+ * NOTE: The rule table is split into 32 groups. Each group contains rules that
+ * apply to a specific prefix depth (i.e. group 1 contains rules that apply to
+ * prefixes with a depth of 1 etc.). In the following code (depth - 1) is used
+ * to refer to depth 1 because even though the depth range is 1 - 32, depths
+ * are stored in the rule table from 0 - 31.
+ * NOTE: Valid range for depth parameter is 1 .. 32 inclusive.
+ */
+static int32_t
+rule_add(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth,
+ uint32_t next_hop)
+{
+ uint32_t rule_gindex, rule_index, last_rule;
+ int i;
+
+ VERIFY_DEPTH(depth);
+
+ /* Scan through rule group to see if rule already exists. */
+ if (lpm->rule_info[depth - 1].used_rules > 0) {
+
+ /* rule_gindex stands for rule group index. */
+ rule_gindex = lpm->rule_info[depth - 1].first_rule;
+ /* Initialise rule_index to point to start of rule group. */
+ rule_index = rule_gindex;
+ /* Last rule = Last used rule in this rule group. */
+ last_rule = rule_gindex + lpm->rule_info[depth - 1].used_rules;
+
+ for (; rule_index < last_rule; rule_index++) {
+
+ /* If rule already exists update next hop and return. */
+ if (lpm->rules_tbl[rule_index].ip == ip_masked) {
+
+ if (lpm->rules_tbl[rule_index].next_hop
+ == next_hop)
+ return -EEXIST;
+ lpm->rules_tbl[rule_index].next_hop = next_hop;
+
+ return rule_index;
+ }
+ }
+
+ if (rule_index == lpm->max_rules)
+ return -ENOSPC;
+ } else {
+ /* Calculate the position in which the rule will be stored. */
+ rule_index = 0;
+
+ for (i = depth - 1; i > 0; i--) {
+ if (lpm->rule_info[i - 1].used_rules > 0) {
+ rule_index = lpm->rule_info[i - 1].first_rule
+ + lpm->rule_info[i - 1].used_rules;
+ break;
+ }
+ }
+ if (rule_index == lpm->max_rules)
+ return -ENOSPC;
+
+ lpm->rule_info[depth - 1].first_rule = rule_index;
+ }
+
+ /* Make room for the new rule in the array. */
+ for (i = RTE_LPM_MAX_DEPTH; i > depth; i--) {
+ if (lpm->rule_info[i - 1].first_rule
+ + lpm->rule_info[i - 1].used_rules == lpm->max_rules)
+ return -ENOSPC;
+
+ if (lpm->rule_info[i - 1].used_rules > 0) {
+ lpm->rules_tbl[lpm->rule_info[i - 1].first_rule
+ + lpm->rule_info[i - 1].used_rules]
+ = lpm->rules_tbl[lpm->rule_info[i - 1].first_rule];
+ lpm->rule_info[i - 1].first_rule++;
+ }
+ }
+
+ /* Add the new rule. */
+ lpm->rules_tbl[rule_index].ip = ip_masked;
+ lpm->rules_tbl[rule_index].next_hop = next_hop;
+
+ /* Increment the used rules counter for this rule group. */
+ lpm->rule_info[depth - 1].used_rules++;
+
+ return rule_index;
+}
+
+/*
+ * Delete a rule from the rule table.
+ * NOTE: Valid range for depth parameter is 1 .. 32 inclusive.
+ */
+static void
+rule_delete(struct rte_lpm *lpm, int32_t rule_index, uint8_t depth)
+{
+ int i;
+
+ VERIFY_DEPTH(depth);
+
+ lpm->rules_tbl[rule_index] =
+ lpm->rules_tbl[lpm->rule_info[depth - 1].first_rule
+ + lpm->rule_info[depth - 1].used_rules - 1];
+
+ for (i = depth; i < RTE_LPM_MAX_DEPTH; i++) {
+ if (lpm->rule_info[i].used_rules > 0) {
+ lpm->rules_tbl[lpm->rule_info[i].first_rule - 1] =
+ lpm->rules_tbl[lpm->rule_info[i].first_rule
+ + lpm->rule_info[i].used_rules - 1];
+ lpm->rule_info[i].first_rule--;
+ }
+ }
+
+ lpm->rule_info[depth - 1].used_rules--;
+}
+
+/*
+ * Finds a rule in rule table.
+ * NOTE: Valid range for depth parameter is 1 .. 32 inclusive.
+ */
+static int32_t
+rule_find(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth)
+{
+ uint32_t rule_gindex, last_rule, rule_index;
+
+ VERIFY_DEPTH(depth);
+
+ rule_gindex = lpm->rule_info[depth - 1].first_rule;
+ last_rule = rule_gindex + lpm->rule_info[depth - 1].used_rules;
+
+ /* Scan used rules at given depth to find rule. */
+ for (rule_index = rule_gindex; rule_index < last_rule; rule_index++) {
+ /* If rule is found return the rule index. */
+ if (lpm->rules_tbl[rule_index].ip == ip_masked)
+ return rule_index;
+ }
+
+ /* If rule is not found return -EINVAL. */
+ return -EINVAL;
+}
+#endif
+
+/*
+ * Find, clean and allocate a tbl8.
+ */
+static int32_t
+tbl8_alloc(struct rte_lpm_tbl_entry *tbl8, uint32_t number_tbl8s)
+{
+ uint32_t group_idx; /* tbl8 group index. */
+ struct rte_lpm_tbl_entry *tbl8_entry;
+
+ /* Scan through tbl8 to find a free (i.e. INVALID) tbl8 group. */
+ for (group_idx = 0; group_idx < number_tbl8s; group_idx++) {
+ tbl8_entry = &tbl8[group_idx * RTE_LPM_TBL8_GROUP_NUM_ENTRIES];
+ /* If a free tbl8 group is found clean it and set as VALID. */
+ if (!tbl8_entry->valid_group) {
+ struct rte_lpm_tbl_entry new_tbl8_entry = {
+ .next_hop = 0,
+ .valid = INVALID,
+ .depth = 0,
+ .valid_group = VALID,
+ };
+
+ memset(&tbl8_entry[0], 0,
+ RTE_LPM_TBL8_GROUP_NUM_ENTRIES *
+ sizeof(tbl8_entry[0]));
+
+ __atomic_store(tbl8_entry, &new_tbl8_entry,
+ __ATOMIC_RELAXED);
+
+ /* Return group index for allocated tbl8 group. */
+ return group_idx;
+ }
+ }
+
+ /* If there are no tbl8 groups free then return error. */
+ return -ENOSPC;
+}
+
+static void
+tbl8_free(struct rte_lpm_tbl_entry *tbl8, uint32_t tbl8_group_start)
+{
+	/* Set tbl8 group invalid. */
+ struct rte_lpm_tbl_entry zero_tbl8_entry = {0};
+
+ __atomic_store(&tbl8[tbl8_group_start], &zero_tbl8_entry,
+ __ATOMIC_RELAXED);
+}
+
+static __rte_noinline int32_t
+add_depth_small(struct rte_lpm *lpm, uint32_t ip, uint8_t depth,
+ uint32_t next_hop)
+{
+#define group_idx next_hop
+ uint32_t tbl24_index, tbl24_range, tbl8_index, tbl8_group_end, i, j;
+
+ /* Calculate the index into Table24. */
+ tbl24_index = ip >> 8;
+ tbl24_range = depth_to_range(depth);
+
+ for (i = tbl24_index; i < (tbl24_index + tbl24_range); i++) {
+ /*
+ * For invalid OR valid and non-extended tbl 24 entries set
+ * entry.
+ */
+ if (!lpm->tbl24[i].valid || (lpm->tbl24[i].valid_group == 0 &&
+ lpm->tbl24[i].depth <= depth)) {
+
+ struct rte_lpm_tbl_entry new_tbl24_entry = {
+ .next_hop = next_hop,
+ .valid = VALID,
+ .valid_group = 0,
+ .depth = depth,
+ };
+
+ /* Setting tbl24 entry in one go to avoid race
+ * conditions
+ */
+ __atomic_store(&lpm->tbl24[i], &new_tbl24_entry,
+ __ATOMIC_RELEASE);
+
+ continue;
+ }
+
+ if (lpm->tbl24[i].valid_group == 1) {
+ /* If tbl24 entry is valid and extended calculate the
+ * index into tbl8.
+ */
+ tbl8_index = lpm->tbl24[i].group_idx *
+ RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+ tbl8_group_end = tbl8_index +
+ RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+
+ for (j = tbl8_index; j < tbl8_group_end; j++) {
+ if (!lpm->tbl8[j].valid ||
+ lpm->tbl8[j].depth <= depth) {
+ struct rte_lpm_tbl_entry
+ new_tbl8_entry = {
+ .valid = VALID,
+ .valid_group = VALID,
+ .depth = depth,
+ .next_hop = next_hop,
+ };
+
+ /*
+ * Setting tbl8 entry in one go to avoid
+ * race conditions
+ */
+ __atomic_store(&lpm->tbl8[j],
+ &new_tbl8_entry,
+ __ATOMIC_RELAXED);
+
+ continue;
+ }
+ }
+ }
+ }
+#undef group_idx
+ return 0;
+}
+
+static __rte_noinline int32_t
+add_depth_big(struct rte_lpm *lpm, uint32_t ip_masked, uint8_t depth,
+ uint32_t next_hop)
+{
+#define group_idx next_hop
+ uint32_t tbl24_index;
+ int32_t tbl8_group_index, tbl8_group_start, tbl8_group_end, tbl8_index,
+ tbl8_range, i;
+
+ tbl24_index = (ip_masked >> 8);
+ tbl8_range = depth_to_range(depth);
+
+ if (!lpm->tbl24[tbl24_index].valid) {
+ /* Search for a free tbl8 group. */
+ tbl8_group_index = tbl8_alloc(lpm->tbl8, lpm->number_tbl8s);
+
+ /* Check tbl8 allocation was successful. */
+ if (tbl8_group_index < 0) {
+ return tbl8_group_index;
+ }
+
+ /* Find index into tbl8 and range. */
+ tbl8_index = (tbl8_group_index *
+ RTE_LPM_TBL8_GROUP_NUM_ENTRIES) +
+ (ip_masked & 0xFF);
+
+ /* Set tbl8 entry. */
+ for (i = tbl8_index; i < (tbl8_index + tbl8_range); i++) {
+ struct rte_lpm_tbl_entry new_tbl8_entry = {
+ .valid = VALID,
+ .depth = depth,
+ .valid_group = lpm->tbl8[i].valid_group,
+ .next_hop = next_hop,
+ };
+ __atomic_store(&lpm->tbl8[i], &new_tbl8_entry,
+ __ATOMIC_RELAXED);
+ }
+
+ /*
+ * Update tbl24 entry to point to new tbl8 entry. Note: The
+ * ext_flag and tbl8_index need to be updated simultaneously,
+ * so assign whole structure in one go
+ */
+
+ struct rte_lpm_tbl_entry new_tbl24_entry = {
+ .group_idx = tbl8_group_index,
+ .valid = VALID,
+ .valid_group = 1,
+ .depth = 0,
+ };
+
+ /* The tbl24 entry must be written only after the
+ * tbl8 entries are written.
+ */
+ __atomic_store(&lpm->tbl24[tbl24_index], &new_tbl24_entry,
+ __ATOMIC_RELEASE);
+
+	} /* If valid entry but not extended, calculate the index into tbl8. */
+ else if (lpm->tbl24[tbl24_index].valid_group == 0) {
+ /* Search for free tbl8 group. */
+ tbl8_group_index = tbl8_alloc(lpm->tbl8, lpm->number_tbl8s);
+
+ if (tbl8_group_index < 0) {
+ return tbl8_group_index;
+ }
+
+ tbl8_group_start = tbl8_group_index *
+ RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+ tbl8_group_end = tbl8_group_start +
+ RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+
+ /* Populate new tbl8 with tbl24 value. */
+ for (i = tbl8_group_start; i < tbl8_group_end; i++) {
+ struct rte_lpm_tbl_entry new_tbl8_entry = {
+ .valid = VALID,
+ .depth = lpm->tbl24[tbl24_index].depth,
+ .valid_group = lpm->tbl8[i].valid_group,
+ .next_hop = lpm->tbl24[tbl24_index].next_hop,
+ };
+ __atomic_store(&lpm->tbl8[i], &new_tbl8_entry,
+ __ATOMIC_RELAXED);
+ }
+
+ tbl8_index = tbl8_group_start + (ip_masked & 0xFF);
+
+ /* Insert new rule into the tbl8 entry. */
+ for (i = tbl8_index; i < tbl8_index + tbl8_range; i++) {
+ struct rte_lpm_tbl_entry new_tbl8_entry = {
+ .valid = VALID,
+ .depth = depth,
+ .valid_group = lpm->tbl8[i].valid_group,
+ .next_hop = next_hop,
+ };
+ __atomic_store(&lpm->tbl8[i], &new_tbl8_entry,
+ __ATOMIC_RELAXED);
+ }
+
+ /*
+ * Update tbl24 entry to point to new tbl8 entry. Note: The
+ * ext_flag and tbl8_index need to be updated simultaneously,
+ * so assign whole structure in one go.
+ */
+
+ struct rte_lpm_tbl_entry new_tbl24_entry = {
+ .group_idx = tbl8_group_index,
+ .valid = VALID,
+ .valid_group = 1,
+ .depth = 0,
+ };
+
+ /* The tbl24 entry must be written only after the
+ * tbl8 entries are written.
+ */
+ __atomic_store(&lpm->tbl24[tbl24_index], &new_tbl24_entry,
+ __ATOMIC_RELEASE);
+
+ } else { /*
+ * If it is valid, extended entry calculate the index into tbl8.
+ */
+ tbl8_group_index = lpm->tbl24[tbl24_index].group_idx;
+ tbl8_group_start = tbl8_group_index *
+ RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+ tbl8_index = tbl8_group_start + (ip_masked & 0xFF);
+
+ for (i = tbl8_index; i < (tbl8_index + tbl8_range); i++) {
+
+ if (!lpm->tbl8[i].valid ||
+ lpm->tbl8[i].depth <= depth) {
+ struct rte_lpm_tbl_entry new_tbl8_entry = {
+ .valid = VALID,
+ .depth = depth,
+ .next_hop = next_hop,
+ .valid_group = lpm->tbl8[i].valid_group,
+ };
+
+ /*
+ * Setting tbl8 entry in one go to avoid race
+ * condition
+ */
+ __atomic_store(&lpm->tbl8[i], &new_tbl8_entry,
+ __ATOMIC_RELAXED);
+
+ continue;
+ }
+ }
+ }
+#undef group_idx
+ return 0;
+}
+
+/*
+ * Add a route
+ */
+int
+rte_lpm_add(struct rte_lpm *lpm, uint32_t ip, uint8_t depth,
+ uint32_t next_hop)
+{
+ int32_t status = 0;
+ uint32_t ip_masked;
+
+ /* Check user arguments. */
+ if ((lpm == NULL) || (depth < 1) || (depth > RTE_LPM_MAX_DEPTH))
+ return -EINVAL;
+
+ ip_masked = ip & depth_to_mask(depth);
+
+#if 0
+ /* Add the rule to the rule table. */
+ rule_index = rule_add(lpm, ip_masked, depth, next_hop);
+
+	/* Skip table entries update if the rule is the same as
+ * the rule in the rules table.
+ */
+ if (rule_index == -EEXIST)
+ return 0;
+
+	/* If there is no space available for the new rule, return error. */
+ if (rule_index < 0) {
+ return rule_index;
+ }
+#endif
+
+ if (depth <= MAX_DEPTH_TBL24) {
+ status = add_depth_small(lpm, ip_masked, depth, next_hop);
+	} else { /* If depth > MAX_DEPTH_TBL24 */
+ status = add_depth_big(lpm, ip_masked, depth, next_hop);
+
+ /*
+ * If add fails due to exhaustion of tbl8 extensions delete
+ * rule that was added to rule table.
+ */
+ if (status < 0) {
+ //rule_delete(lpm, rule_index, depth);
+
+ return status;
+ }
+ }
+
+ return 0;
+}
+
+#if 0
+/*
+ * Look for a rule in the high-level rules table
+ */
+int
+rte_lpm_is_rule_present(struct rte_lpm *lpm, uint32_t ip, uint8_t depth,
+uint32_t *next_hop)
+{
+ uint32_t ip_masked;
+ int32_t rule_index;
+
+ /* Check user arguments. */
+ if ((lpm == NULL) ||
+ (next_hop == NULL) ||
+ (depth < 1) || (depth > RTE_LPM_MAX_DEPTH))
+ return -EINVAL;
+
+ /* Look for the rule using rule_find. */
+ ip_masked = ip & depth_to_mask(depth);
+ rule_index = rule_find(lpm, ip_masked, depth);
+
+ if (rule_index >= 0) {
+ *next_hop = lpm->rules_tbl[rule_index].next_hop;
+ return 1;
+ }
+
+ /* If rule is not found return 0. */
+ return 0;
+}
+
+static int32_t
+find_previous_rule(struct rte_lpm *lpm, uint32_t ip, uint8_t depth,
+ uint8_t *sub_rule_depth)
+{
+ int32_t rule_index;
+ uint32_t ip_masked;
+ uint8_t prev_depth;
+
+ for (prev_depth = (uint8_t)(depth - 1); prev_depth > 0; prev_depth--) {
+ ip_masked = ip & depth_to_mask(prev_depth);
+
+ rule_index = rule_find(lpm, ip_masked, prev_depth);
+
+ if (rule_index >= 0) {
+ *sub_rule_depth = prev_depth;
+ return rule_index;
+ }
+ }
+
+ return -1;
+}
+#endif
+
+static int32_t
+delete_depth_small(struct rte_lpm *lpm, uint32_t ip_masked,
+ uint8_t depth, uint32_t sub_rule_nhop, uint8_t sub_rule_depth)
+{
+#define group_idx next_hop
+ uint32_t tbl24_range, tbl24_index, tbl8_group_index, tbl8_index, i, j;
+
+ /* Calculate the range and index into Table24. */
+ tbl24_range = depth_to_range(depth);
+ tbl24_index = (ip_masked >> 8);
+ struct rte_lpm_tbl_entry zero_tbl24_entry = {0};
+
+ /*
+	 * Firstly check sub_rule_nhop. A value of 0 indicates that no
+	 * replacement rule exists, while a non-zero value is the nexthop of the
+	 * replacement rule.
+ */
+ if (sub_rule_nhop == 0) {
+ /*
+ * If no replacement rule exists then invalidate entries
+ * associated with this rule.
+ */
+ for (i = tbl24_index; i < (tbl24_index + tbl24_range); i++) {
+
+ if (lpm->tbl24[i].valid_group == 0 &&
+ lpm->tbl24[i].depth <= depth) {
+ __atomic_store(&lpm->tbl24[i],
+ &zero_tbl24_entry, __ATOMIC_RELEASE);
+ } else if (lpm->tbl24[i].valid_group == 1) {
+ /*
+ * If TBL24 entry is extended, then there has
+ * to be a rule with depth >= 25 in the
+ * associated TBL8 group.
+ */
+
+ tbl8_group_index = lpm->tbl24[i].group_idx;
+ tbl8_index = tbl8_group_index *
+ RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+
+ for (j = tbl8_index; j < (tbl8_index +
+ RTE_LPM_TBL8_GROUP_NUM_ENTRIES); j++) {
+
+ if (lpm->tbl8[j].depth <= depth)
+ lpm->tbl8[j].valid = INVALID;
+ }
+ }
+ }
+ } else {
+ /*
+ * If a replacement rule exists then modify entries
+ * associated with this rule.
+ */
+
+ struct rte_lpm_tbl_entry new_tbl24_entry = {
+ .next_hop = sub_rule_nhop,
+ .valid = VALID,
+ .valid_group = 0,
+ .depth = sub_rule_depth,
+ };
+
+ struct rte_lpm_tbl_entry new_tbl8_entry = {
+ .valid = VALID,
+ .valid_group = VALID,
+ .depth = sub_rule_depth,
+ .next_hop = sub_rule_nhop,
+ };
+
+ for (i = tbl24_index; i < (tbl24_index + tbl24_range); i++) {
+
+ if (lpm->tbl24[i].valid_group == 0 &&
+ lpm->tbl24[i].depth <= depth) {
+ __atomic_store(&lpm->tbl24[i], &new_tbl24_entry,
+ __ATOMIC_RELEASE);
+ } else if (lpm->tbl24[i].valid_group == 1) {
+ /*
+ * If TBL24 entry is extended, then there has
+ * to be a rule with depth >= 25 in the
+ * associated TBL8 group.
+ */
+
+ tbl8_group_index = lpm->tbl24[i].group_idx;
+ tbl8_index = tbl8_group_index *
+ RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+
+ for (j = tbl8_index; j < (tbl8_index +
+ RTE_LPM_TBL8_GROUP_NUM_ENTRIES); j++) {
+
+ if (lpm->tbl8[j].depth <= depth)
+ __atomic_store(&lpm->tbl8[j],
+ &new_tbl8_entry,
+ __ATOMIC_RELAXED);
+ }
+ }
+ }
+ }
+#undef group_idx
+ return 0;
+}
+
+/*
+ * Checks if table 8 group can be recycled.
+ *
+ * Return of -EEXIST means tbl8 is in use and thus can not be recycled.
+ * Return of -EINVAL means tbl8 is empty and thus can be recycled.
+ * Return of value > -1 means tbl8 is in use but has all the same values and
+ * thus can be recycled
+ */
+static int32_t
+tbl8_recycle_check(struct rte_lpm_tbl_entry *tbl8,
+ uint32_t tbl8_group_start)
+{
+ uint32_t tbl8_group_end, i;
+ tbl8_group_end = tbl8_group_start + RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+
+ /*
+ * Check the first entry of the given tbl8. If it is invalid we know
+ * this tbl8 does not contain any rule with a depth < RTE_LPM_MAX_DEPTH
+ * (As they would affect all entries in a tbl8) and thus this table
+ * can not be recycled.
+ */
+ if (tbl8[tbl8_group_start].valid) {
+ /*
+ * If first entry is valid check if the depth is less than 24
+ * and if so check the rest of the entries to verify that they
+ * are all of this depth.
+ */
+ if (tbl8[tbl8_group_start].depth <= MAX_DEPTH_TBL24) {
+ for (i = (tbl8_group_start + 1); i < tbl8_group_end;
+ i++) {
+
+ if (tbl8[i].depth !=
+ tbl8[tbl8_group_start].depth) {
+
+ return -EEXIST;
+ }
+ }
+			/* If all entries are the same, return the tbl8 index. */
+ return tbl8_group_start;
+ }
+
+ return -EEXIST;
+ }
+ /*
+ * If the first entry is invalid check if the rest of the entries in
+ * the tbl8 are invalid.
+ */
+ for (i = (tbl8_group_start + 1); i < tbl8_group_end; i++) {
+ if (tbl8[i].valid)
+ return -EEXIST;
+ }
+ /* If no valid entries are found then return -EINVAL. */
+ return -EINVAL;
+}
+
+static int32_t
+delete_depth_big(struct rte_lpm *lpm, uint32_t ip_masked,
+ uint8_t depth, uint32_t sub_rule_nhop, uint8_t sub_rule_depth)
+{
+#define group_idx next_hop
+ uint32_t tbl24_index, tbl8_group_index, tbl8_group_start, tbl8_index,
+ tbl8_range, i;
+ int32_t tbl8_recycle_index;
+
+ /*
+ * Calculate the index into tbl24 and range. Note: All depths larger
+ * than MAX_DEPTH_TBL24 are associated with only one tbl24 entry.
+ */
+ tbl24_index = ip_masked >> 8;
+
+ /* Calculate the index into tbl8 and range. */
+ tbl8_group_index = lpm->tbl24[tbl24_index].group_idx;
+ tbl8_group_start = tbl8_group_index * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+ tbl8_index = tbl8_group_start + (ip_masked & 0xFF);
+ tbl8_range = depth_to_range(depth);
+
+ if (sub_rule_nhop == 0) {
+ /*
+ * Loop through the range of entries on tbl8 for which the
+ * rule_to_delete must be removed or modified.
+ */
+ for (i = tbl8_index; i < (tbl8_index + tbl8_range); i++) {
+ if (lpm->tbl8[i].depth <= depth)
+ lpm->tbl8[i].valid = INVALID;
+ }
+ } else {
+ /* Set new tbl8 entry. */
+ struct rte_lpm_tbl_entry new_tbl8_entry = {
+ .valid = VALID,
+ .depth = sub_rule_depth,
+ .valid_group = lpm->tbl8[tbl8_group_start].valid_group,
+ .next_hop = sub_rule_nhop,
+ };
+
+ /*
+ * Loop through the range of entries on tbl8 for which the
+ * rule_to_delete must be modified.
+ */
+ for (i = tbl8_index; i < (tbl8_index + tbl8_range); i++) {
+ if (lpm->tbl8[i].depth <= depth)
+ __atomic_store(&lpm->tbl8[i], &new_tbl8_entry,
+ __ATOMIC_RELAXED);
+ }
+ }
+
+ /*
+ * Check if there are any valid entries in this tbl8 group. If all
+ * tbl8 entries are invalid we can free the tbl8 and invalidate the
+ * associated tbl24 entry.
+ */
+
+ tbl8_recycle_index = tbl8_recycle_check(lpm->tbl8, tbl8_group_start);
+
+ if (tbl8_recycle_index == -EINVAL) {
+ /* Set tbl24 before freeing tbl8 to avoid race condition.
+ * Prevent the free of the tbl8 group from hoisting.
+ */
+ lpm->tbl24[tbl24_index].valid = 0;
+ __atomic_thread_fence(__ATOMIC_RELEASE);
+ tbl8_free(lpm->tbl8, tbl8_group_start);
+ } else if (tbl8_recycle_index > -1) {
+ /* Update tbl24 entry. */
+ struct rte_lpm_tbl_entry new_tbl24_entry = {
+ .next_hop = lpm->tbl8[tbl8_recycle_index].next_hop,
+ .valid = VALID,
+ .valid_group = 0,
+ .depth = lpm->tbl8[tbl8_recycle_index].depth,
+ };
+
+ /* Set tbl24 before freeing tbl8 to avoid race condition.
+ * Prevent the free of the tbl8 group from hoisting.
+ */
+ __atomic_store(&lpm->tbl24[tbl24_index], &new_tbl24_entry,
+ __ATOMIC_RELAXED);
+ __atomic_thread_fence(__ATOMIC_RELEASE);
+ tbl8_free(lpm->tbl8, tbl8_group_start);
+ }
+#undef group_idx
+ return 0;
+}
+
+/*
+ * Deletes a rule
+ */
+int
+rte_lpm_delete(struct rte_lpm *lpm, uint32_t ip, uint8_t depth,
+ uint8_t sub_rule_depth, uint32_t sub_rule_nhop)
+{
+ //int32_t rule_to_delete_index;
+ uint32_t ip_masked;
+ //uint8_t sub_rule_depth;
+ /*
+	 * Check input arguments. Note: the IP address is an unsigned 32-bit
+	 * quantity, so any value is valid and it need not be range-checked.
+ */
+ if ((lpm == NULL) || (depth < 1) || (depth > RTE_LPM_MAX_DEPTH)) {
+ return -EINVAL;
+ }
+
+ ip_masked = ip & depth_to_mask(depth);
+
+#if 0
+ /*
+ * Find the index of the input rule, that needs to be deleted, in the
+ * rule table.
+ */
+ rule_to_delete_index = rule_find(lpm, ip_masked, depth);
+
+ /*
+ * Check if rule_to_delete_index was found. If no rule was found the
+ * function rule_find returns -EINVAL.
+ */
+ if (rule_to_delete_index < 0)
+ return -EINVAL;
+
+ /* Delete the rule from the rule table. */
+ rule_delete(lpm, rule_to_delete_index, depth);
+#endif
+
+ /*
+	 * Find a rule to replace the rule_to_delete. If no such rule exists,
+	 * the table entries associated with this rule are invalidated instead
+	 * of being replaced.
+ */
+ //sub_rule_depth = *psub_rule_depth;
+ //sub_rule_index = find_previous_rule(lpm, ip, depth, &sub_rule_depth);
+
+ /*
+	 * If the input depth value is at most MAX_DEPTH_TBL24 (24) use
+	 * delete_depth_small, otherwise use delete_depth_big.
+ */
+ if (depth <= MAX_DEPTH_TBL24) {
+ return delete_depth_small(lpm, ip_masked, depth,
+ sub_rule_nhop, sub_rule_depth);
+ } else { /* If depth > MAX_DEPTH_TBL24 */
+ return delete_depth_big(lpm, ip_masked, depth, sub_rule_nhop,
+ sub_rule_depth);
+ }
+}
+
+/*
+ * Delete all rules from the LPM table.
+ */
+void
+rte_lpm_delete_all(struct rte_lpm *lpm)
+{
+ /* Zero rule information. */
+ memset(lpm->rule_info, 0, sizeof(lpm->rule_info));
+
+ /* Zero tbl24. */
+ memset(lpm->tbl24, 0, sizeof(lpm->tbl24));
+
+ /* Zero tbl8. */
+ memset(lpm->tbl8, 0, sizeof(lpm->tbl8[0])
+ * RTE_LPM_TBL8_GROUP_NUM_ENTRIES * lpm->number_tbl8s);
+
+	/* Delete all rules from the rules table. */
+ memset(lpm->rules_tbl, 0, sizeof(lpm->rules_tbl[0]) * lpm->max_rules);
+}
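
For reference, the DIR-24-8 index arithmetic that delete_depth_big() above relies on can be sanity-checked in isolation. The sketch below is illustrative only (it is not part of the diff) and assumes a host-byte-order IPv4 address:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* 192.0.2.130 in host byte order */
	uint32_t ip = 0xC0000282;

	/* The first 24 bits select the tbl24 slot... */
	uint32_t tbl24_index = ip >> 8;		/* 0xC00002 */
	/* ...and the last byte selects the entry inside a tbl8 group. */
	uint32_t tbl8_offset = ip & 0xFF;	/* 0x82 */

	printf("tbl24_index=0x%06X tbl8_offset=0x%02X\n",
	    tbl24_index, tbl8_offset);
	return (0);
}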
Index: sys/contrib/dpdk_rte_lpm/rte_lpm6.h
===================================================================
--- /dev/null
+++ sys/contrib/dpdk_rte_lpm/rte_lpm6.h
@@ -0,0 +1,209 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+#ifndef _RTE_LPM6_H_
+#define _RTE_LPM6_H_
+
+/**
+ * @file
+ * RTE Longest Prefix Match for IPv6 (LPM6)
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define RTE_LPM6_MAX_DEPTH 128
+#define RTE_LPM6_IPV6_ADDR_SIZE 16
+/** Max number of characters in LPM name. */
+#define RTE_LPM6_NAMESIZE 32
+
+/** LPM structure. */
+struct rte_lpm6;
+
+struct nhop_object;
+struct rte_lpm6_external {
+ struct nhop_object **nh_idx; /**< # -> idx mappings */
+ uint32_t default_idx; /* nhop index of default route */
+ uint32_t fibnum; /* fib index */
+};
+
+/** LPM configuration structure. */
+struct rte_lpm6_config {
+ uint32_t max_rules; /**< Max number of rules. */
+ uint32_t number_tbl8s; /**< Number of tbl8s to allocate. */
+ int flags; /**< This field is currently unused. */
+};
+
+#define RTE_LPM6_RULE_SIZE 32
+struct rte_lpm6_rule *fill_rule6(char *buffer, const uint8_t *ip,
+ uint8_t depth, uint32_t next_hop);
+/**
+ * Create an LPM object.
+ *
+ * @param name
+ * LPM object name
+ * @param socket_id
+ * NUMA socket ID for LPM table memory allocation
+ * @param config
+ * Structure containing the configuration
+ * @return
+ * Handle to LPM object on success, NULL otherwise with rte_errno set
+ *    to an appropriate value. Possible rte_errno values include:
+ * - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure
+ * - E_RTE_SECONDARY - function was called from a secondary process instance
+ * - EINVAL - invalid parameter passed to function
+ * - ENOSPC - the maximum number of memzones has already been allocated
+ * - EEXIST - a memzone with the same name already exists
+ * - ENOMEM - no appropriate memory area found in which to create memzone
+ */
+struct rte_lpm6 *
+rte_lpm6_create(const char *name, int socket_id,
+ const struct rte_lpm6_config *config);
+
+/**
+ * Find an existing LPM object and return a pointer to it.
+ *
+ * @param name
+ * Name of the lpm object as passed to rte_lpm6_create()
+ * @return
+ * Pointer to lpm object or NULL if object not found with rte_errno
+ * set appropriately. Possible rte_errno values include:
+ * - ENOENT - required entry not available to return.
+ */
+struct rte_lpm6 *
+rte_lpm6_find_existing(const char *name);
+
+/**
+ * Free an LPM object.
+ *
+ * @param lpm
+ * LPM object handle
+ * @return
+ * None
+ */
+void
+rte_lpm6_free(struct rte_lpm6 *lpm);
+
+/**
+ * Add a rule to the LPM table.
+ *
+ * @param lpm
+ * LPM object handle
+ * @param ip
+ * IP of the rule to be added to the LPM table
+ * @param depth
+ * Depth of the rule to be added to the LPM table
+ * @param next_hop
+ * Next hop of the rule to be added to the LPM table
+ * @return
+ * 0 on success, negative value otherwise
+ */
+int
+rte_lpm6_add(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth,
+ uint32_t next_hop, int is_new_rule);
+
+/**
+ * Check if a rule is present in the LPM table,
+ * and provide its next hop if it is.
+ *
+ * @param lpm
+ * LPM object handle
+ * @param ip
+ * IP of the rule to be searched
+ * @param depth
+ *   Depth of the rule to be searched
+ * @param next_hop
+ * Next hop of the rule (valid only if it is found)
+ * @return
+ * 1 if the rule exists, 0 if it does not, a negative value on failure
+ */
+int
+rte_lpm6_is_rule_present(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth,
+ uint32_t *next_hop);
+
+/**
+ * Delete a rule from the LPM table.
+ *
+ * @param lpm
+ * LPM object handle
+ * @param ip
+ * IP of the rule to be deleted from the LPM table
+ * @param depth
+ * Depth of the rule to be deleted from the LPM table
+ * @return
+ * 0 on success, negative value otherwise
+ */
+int
+rte_lpm6_delete(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth,
+ struct rte_lpm6_rule *lsp_rule);
+
+/**
+ * Delete a set of rules from the LPM table.
+ *
+ * @param lpm
+ * LPM object handle
+ * @param ips
+ * Array of IPs to be deleted from the LPM table
+ * @param depths
+ * Array of depths of the rules to be deleted from the LPM table
+ * @param n
+ * Number of rules to be deleted from the LPM table
+ * @return
+ * 0 on success, negative value otherwise.
+ */
+int
+rte_lpm6_delete_bulk_func(struct rte_lpm6 *lpm,
+ uint8_t ips[][RTE_LPM6_IPV6_ADDR_SIZE], uint8_t *depths, unsigned n);
+
+/**
+ * Delete all rules from the LPM table.
+ *
+ * @param lpm
+ * LPM object handle
+ */
+void
+rte_lpm6_delete_all(struct rte_lpm6 *lpm);
+
+/**
+ * Lookup an IP into the LPM table.
+ *
+ * @param lpm
+ * LPM object handle
+ * @param ip
+ * IP to be looked up in the LPM table
+ * @param next_hop
+ * Next hop of the most specific rule found for IP (valid on lookup hit only)
+ * @return
+ * -EINVAL for incorrect arguments, -ENOENT on lookup miss, 0 on lookup hit
+ */
+int
+rte_lpm6_lookup(const struct rte_lpm6 *lpm, const uint8_t *ip, uint32_t *next_hop);
+
+/**
+ * Lookup multiple IP addresses in an LPM table.
+ *
+ * @param lpm
+ * LPM object handle
+ * @param ips
+ * Array of IPs to be looked up in the LPM table
+ * @param next_hops
+ * Next hop of the most specific rule found for IP (valid on lookup hit only).
+ *   This is an array of four-byte values. The next hop will be stored at
+ *   each position on success; otherwise the position will be set to -1.
+ * @param n
+ * Number of elements in ips (and next_hops) array to lookup.
+ * @return
+ * -EINVAL for incorrect arguments, otherwise 0
+ */
+int
+rte_lpm6_lookup_bulk_func(const struct rte_lpm6 *lpm,
+ uint8_t ips[][RTE_LPM6_IPV6_ADDR_SIZE],
+ int32_t *next_hops, unsigned int n);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
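
A minimal, illustrative exercise of the API declared above (not part of the diff), assuming the kernel-side environment provided by rte_shim.h. Note that this port adds an is_new_rule argument to rte_lpm6_add(); error handling is abbreviated:

#include "rte_lpm6.h"

static void
lpm6_usage_sketch(void)
{
	struct rte_lpm6_config cfg = {
		.max_rules = 1024,
		.number_tbl8s = 4096,
		.flags = 0,
	};
	struct rte_lpm6 *lpm;
	uint8_t prefix[RTE_LPM6_IPV6_ADDR_SIZE] = { 0x20, 0x01, 0x0d, 0xb8 };
	uint8_t addr[RTE_LPM6_IPV6_ADDR_SIZE] = {
		0x20, 0x01, 0x0d, 0xb8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 };
	uint32_t nhop_idx;

	lpm = rte_lpm6_create("example", -1, &cfg);
	if (lpm == NULL)
		return;

	/* Insert 2001:db8::/32 pointing at next-hop index 1. */
	rte_lpm6_add(lpm, prefix, 32, 1, 1 /* is_new_rule */);

	/* A lookup inside the prefix hits and yields nhop_idx == 1. */
	if (rte_lpm6_lookup(lpm, addr, &nhop_idx) == 0)
		; /* use nhop_idx */

	rte_lpm6_free(lpm);
}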
Index: sys/contrib/dpdk_rte_lpm/rte_lpm6.c
===================================================================
--- /dev/null
+++ sys/contrib/dpdk_rte_lpm/rte_lpm6.c
@@ -0,0 +1,1415 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#include <sys/param.h>
+#include <sys/ctype.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/kernel.h>
+
+//#include <netinet6/rte_tailq.h>
+int errno = 0, rte_errno = 0;
+
+#include "rte_shim.h"
+#include "rte_lpm6.h"
+
+#define RTE_LPM6_TBL24_NUM_ENTRIES (1 << 24)
+#define RTE_LPM6_TBL8_GROUP_NUM_ENTRIES 256
+#define RTE_LPM6_TBL8_MAX_NUM_GROUPS (1 << 21)
+
+#define RTE_LPM6_VALID_EXT_ENTRY_BITMASK 0xA0000000
+#define RTE_LPM6_LOOKUP_SUCCESS 0x20000000
+#define RTE_LPM6_TBL8_BITMASK 0x001FFFFF
+
+#define ADD_FIRST_BYTE 3
+#define LOOKUP_FIRST_BYTE 4
+#define BYTE_SIZE 8
+#define BYTES2_SIZE 16
+
+#define RULE_HASH_TABLE_EXTRA_SPACE 64
+#define TBL24_IND UINT32_MAX
+
+#define lpm6_tbl8_gindex next_hop
+
+/** Flags for setting an entry as valid/invalid. */
+enum valid_flag {
+ INVALID = 0,
+ VALID
+};
+
+#if 0
+TAILQ_HEAD(rte_lpm6_list, rte_tailq_entry);
+
+static struct rte_tailq_elem rte_lpm6_tailq = {
+ .name = "RTE_LPM6",
+};
+EAL_REGISTER_TAILQ(rte_lpm6_tailq)
+#endif
+
+/** Tbl entry structure. It is the same for both tbl24 and tbl8 */
+struct rte_lpm6_tbl_entry {
+ uint32_t next_hop: 21; /**< Next hop / next table to be checked. */
+ uint32_t depth :8; /**< Rule depth. */
+
+ /* Flags. */
+ uint32_t valid :1; /**< Validation flag. */
+ uint32_t valid_group :1; /**< Group validation flag. */
+ uint32_t ext_entry :1; /**< External entry. */
+};
+
+/** Rules tbl entry structure. */
+struct rte_lpm6_rule {
+ uint8_t ip[RTE_LPM6_IPV6_ADDR_SIZE]; /**< Rule IP address. */
+ uint32_t next_hop; /**< Rule next hop. */
+ uint8_t depth; /**< Rule depth. */
+};
+
+/** Rules tbl entry key. */
+struct rte_lpm6_rule_key {
+ uint8_t ip[RTE_LPM6_IPV6_ADDR_SIZE]; /**< Rule IP address. */
+ uint8_t depth; /**< Rule depth. */
+};
+
+/* Header of tbl8 */
+struct rte_lpm_tbl8_hdr {
+ uint32_t owner_tbl_ind; /**< owner table: TBL24_IND if owner is tbl24,
+ * otherwise index of tbl8
+ */
+ uint32_t owner_entry_ind; /**< index of the owner table entry where
+ * pointer to the tbl8 is stored
+ */
+ uint32_t ref_cnt; /**< table reference counter */
+};
+
+/** LPM6 structure. */
+struct rte_lpm6 {
+ struct rte_lpm6_external ext; /* Storage used by the algo wrapper */
+ /* LPM metadata. */
+ char name[RTE_LPM6_NAMESIZE]; /**< Name of the lpm. */
+ uint32_t max_rules; /**< Max number of rules. */
+ uint32_t used_rules; /**< Used rules so far. */
+ uint32_t number_tbl8s; /**< Number of tbl8s to allocate. */
+
+ /* LPM Tables. */
+ //struct rte_hash *rules_tbl; /**< LPM rules. */
+ struct rte_lpm6_tbl_entry tbl24[RTE_LPM6_TBL24_NUM_ENTRIES]
+ __rte_cache_aligned; /**< LPM tbl24 table. */
+
+ uint32_t *tbl8_pool; /**< pool of indexes of free tbl8s */
+ uint32_t tbl8_pool_pos; /**< current position in the tbl8 pool */
+
+ struct rte_lpm_tbl8_hdr *tbl8_hdrs; /* array of tbl8 headers */
+
+ struct rte_lpm6_tbl_entry tbl8[0]
+ __rte_cache_aligned; /**< LPM tbl8 table. */
+};
+
+/*
+ * Takes an array of uint8_t (IPv6 address) and masks it using the depth.
+ * It leaves the most significant 'depth' bits untouched
+ * and sets the rest to 0.
+ */
+static inline void
+ip6_mask_addr(uint8_t *ip, uint8_t depth)
+{
+ int16_t part_depth, mask;
+ int i;
+
+ part_depth = depth;
+
+ for (i = 0; i < RTE_LPM6_IPV6_ADDR_SIZE; i++) {
+ if (part_depth < BYTE_SIZE && part_depth >= 0) {
+ mask = (uint16_t)(~(UINT8_MAX >> part_depth));
+ ip[i] = (uint8_t)(ip[i] & mask);
+ } else if (part_depth < 0)
+ ip[i] = 0;
+
+ part_depth -= BYTE_SIZE;
+ }
+}
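
A quick worked example of the masking loop (illustrative, not part of the diff): for depth 36, bytes 0..3 still have part_depth >= 8 and are left intact, byte 4 sees part_depth == 4 and is masked with ~(UINT8_MAX >> 4) == 0xF0, and bytes 5..15 have part_depth < 0 and are zeroed:

	uint8_t ip[RTE_LPM6_IPV6_ADDR_SIZE] =
	    { 0x20, 0x01, 0x0d, 0xb8, 0x1f, 0xff };	/* 2001:db8:1fff:: */

	ip6_mask_addr(ip, 36);
	/* ip is now 20 01 0d b8 10 00 ... 00, i.e. 2001:db8:1000:: */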
+
+/* copy ipv6 address */
+static inline void
+ip6_copy_addr(uint8_t *dst, const uint8_t *src)
+{
+ rte_memcpy(dst, src, RTE_LPM6_IPV6_ADDR_SIZE);
+}
+
+#if 0
+/*
+ * LPM6 rule hash function
+ *
+ * It's used as a hash function for the rte_hash
+ * containing rules
+ */
+static inline uint32_t
+rule_hash(const void *data, __rte_unused uint32_t data_len,
+ uint32_t init_val)
+{
+ return rte_jhash(data, sizeof(struct rte_lpm6_rule_key), init_val);
+}
+#endif
+
+/*
+ * Init pool of free tbl8 indexes
+ */
+static void
+tbl8_pool_init(struct rte_lpm6 *lpm)
+{
+ uint32_t i;
+
+ /* put entire range of indexes to the tbl8 pool */
+ for (i = 0; i < lpm->number_tbl8s; i++)
+ lpm->tbl8_pool[i] = i;
+
+ lpm->tbl8_pool_pos = 0;
+}
+
+/*
+ * Get an index of a free tbl8 from the pool
+ */
+static inline uint32_t
+tbl8_get(struct rte_lpm6 *lpm, uint32_t *tbl8_ind)
+{
+ if (lpm->tbl8_pool_pos == lpm->number_tbl8s)
+ /* no more free tbl8 */
+ return -ENOSPC;
+
+ /* next index */
+ *tbl8_ind = lpm->tbl8_pool[lpm->tbl8_pool_pos++];
+ return 0;
+}
+
+/*
+ * Put an index of a free tbl8 back to the pool
+ */
+static inline uint32_t
+tbl8_put(struct rte_lpm6 *lpm, uint32_t tbl8_ind)
+{
+ if (lpm->tbl8_pool_pos == 0)
+ /* pool is full */
+ return -ENOSPC;
+
+ lpm->tbl8_pool[--lpm->tbl8_pool_pos] = tbl8_ind;
+ return 0;
+}
+
+/*
+ * Returns number of tbl8s available in the pool
+ */
+static inline uint32_t
+tbl8_available(struct rte_lpm6 *lpm)
+{
+ return lpm->number_tbl8s - lpm->tbl8_pool_pos;
+}
+
+#if 0
+/*
+ * Init a rule key.
+ * note that ip must be already masked
+ */
+static inline void
+rule_key_init(struct rte_lpm6_rule_key *key, uint8_t *ip, uint8_t depth)
+{
+ ip6_copy_addr(key->ip, ip);
+ key->depth = depth;
+}
+
+/*
+ * Rebuild the entire LPM tree by reinserting all rules
+ */
+static void
+rebuild_lpm(struct rte_lpm6 *lpm)
+{
+ uint64_t next_hop;
+ struct rte_lpm6_rule_key *rule_key;
+ uint32_t iter = 0;
+
+ while (rte_hash_iterate(lpm->rules_tbl, (void *) &rule_key,
+ (void **) &next_hop, &iter) >= 0)
+ rte_lpm6_add(lpm, rule_key->ip, rule_key->depth,
+ (uint32_t) next_hop);
+}
+#endif
+
+/*
+ * Allocates memory for LPM object
+ */
+struct rte_lpm6 *
+rte_lpm6_create(const char *name, int socket_id,
+ const struct rte_lpm6_config *config)
+{
+ char mem_name[RTE_LPM6_NAMESIZE];
+ struct rte_lpm6 *lpm = NULL;
+ //struct rte_tailq_entry *te;
+ uint64_t mem_size;
+ //struct rte_lpm6_list *lpm_list;
+ //struct rte_hash *rules_tbl = NULL;
+ uint32_t *tbl8_pool = NULL;
+ struct rte_lpm_tbl8_hdr *tbl8_hdrs = NULL;
+
+ //lpm_list = RTE_TAILQ_CAST(rte_lpm6_tailq.head, rte_lpm6_list);
+
+ RTE_BUILD_BUG_ON(sizeof(struct rte_lpm6_tbl_entry) != sizeof(uint32_t));
+
+ /* Check user arguments. */
+ if ((name == NULL) || (socket_id < -1) || (config == NULL) ||
+ config->number_tbl8s > RTE_LPM6_TBL8_MAX_NUM_GROUPS) {
+ rte_errno = EINVAL;
+ return NULL;
+ }
+
+#if 0
+ /* create rules hash table */
+ snprintf(mem_name, sizeof(mem_name), "LRH_%s", name);
+ struct rte_hash_parameters rule_hash_tbl_params = {
+ .entries = config->max_rules * 1.2 +
+ RULE_HASH_TABLE_EXTRA_SPACE,
+ .key_len = sizeof(struct rte_lpm6_rule_key),
+ .hash_func = rule_hash,
+ .hash_func_init_val = 0,
+ .name = mem_name,
+ .reserved = 0,
+ .socket_id = socket_id,
+ .extra_flag = 0
+ };
+
+ rules_tbl = rte_hash_create(&rule_hash_tbl_params);
+ if (rules_tbl == NULL) {
+ RTE_LOG(ERR, LPM, "LPM rules hash table allocation failed: %s (%d)",
+ rte_strerror(rte_errno), rte_errno);
+ goto fail_wo_unlock;
+ }
+#endif
+
+ /* allocate tbl8 indexes pool */
+ tbl8_pool = rte_malloc(NULL,
+ sizeof(uint32_t) * config->number_tbl8s,
+ RTE_CACHE_LINE_SIZE);
+ if (tbl8_pool == NULL) {
+ RTE_LOG(ERR, LPM, "LPM tbl8 pool allocation failed: %s (%d)",
+ rte_strerror(rte_errno), rte_errno);
+ rte_errno = ENOMEM;
+ goto fail_wo_unlock;
+ }
+
+ /* allocate tbl8 headers */
+ tbl8_hdrs = rte_malloc(NULL,
+ sizeof(struct rte_lpm_tbl8_hdr) * config->number_tbl8s,
+ RTE_CACHE_LINE_SIZE);
+ if (tbl8_hdrs == NULL) {
+ RTE_LOG(ERR, LPM, "LPM tbl8 headers allocation failed: %s (%d)",
+ rte_strerror(rte_errno), rte_errno);
+ rte_errno = ENOMEM;
+ goto fail_wo_unlock;
+ }
+
+ snprintf(mem_name, sizeof(mem_name), "LPM_%s", name);
+
+ /* Determine the amount of memory to allocate. */
+ mem_size = sizeof(*lpm) + (sizeof(lpm->tbl8[0]) *
+ RTE_LPM6_TBL8_GROUP_NUM_ENTRIES * config->number_tbl8s);
+
+#if 0
+ rte_mcfg_tailq_write_lock();
+
+ /* Guarantee there's no existing */
+ TAILQ_FOREACH(te, lpm_list, next) {
+ lpm = (struct rte_lpm6 *) te->data;
+ if (strncmp(name, lpm->name, RTE_LPM6_NAMESIZE) == 0)
+ break;
+ }
+ lpm = NULL;
+ if (te != NULL) {
+ rte_errno = EEXIST;
+ goto fail;
+ }
+
+ /* allocate tailq entry */
+ te = rte_zmalloc("LPM6_TAILQ_ENTRY", sizeof(*te), 0);
+ if (te == NULL) {
+ RTE_LOG(ERR, LPM, "Failed to allocate tailq entry!\n");
+ rte_errno = ENOMEM;
+ goto fail;
+ }
+#endif
+
+ /* Allocate memory to store the LPM data structures. */
+ lpm = rte_zmalloc_socket(mem_name, (size_t)mem_size,
+ RTE_CACHE_LINE_SIZE, socket_id);
+
+ if (lpm == NULL) {
+ RTE_LOG(ERR, LPM, "LPM memory allocation failed\n");
+ //rte_free(te);
+ rte_errno = ENOMEM;
+ goto fail;
+ }
+
+ /* Save user arguments. */
+ //lpm->max_rules = config->max_rules;
+ lpm->number_tbl8s = config->number_tbl8s;
+ strlcpy(lpm->name, name, sizeof(lpm->name));
+ //lpm->rules_tbl = rules_tbl;
+ lpm->tbl8_pool = tbl8_pool;
+ lpm->tbl8_hdrs = tbl8_hdrs;
+
+ /* init the stack */
+ tbl8_pool_init(lpm);
+
+ //te->data = (void *) lpm;
+
+ //TAILQ_INSERT_TAIL(lpm_list, te, next);
+ rte_mcfg_tailq_write_unlock();
+ return lpm;
+
+fail:
+ rte_mcfg_tailq_write_unlock();
+
+fail_wo_unlock:
+ rte_free(tbl8_hdrs);
+ rte_free(tbl8_pool);
+ //rte_hash_free(rules_tbl);
+
+ return NULL;
+}
+
+#if 0
+/*
+ * Find an existing lpm table and return a pointer to it.
+ */
+struct rte_lpm6 *
+rte_lpm6_find_existing(const char *name)
+{
+ struct rte_lpm6 *l = NULL;
+ struct rte_tailq_entry *te;
+ struct rte_lpm6_list *lpm_list;
+
+ lpm_list = RTE_TAILQ_CAST(rte_lpm6_tailq.head, rte_lpm6_list);
+
+ rte_mcfg_tailq_read_lock();
+ TAILQ_FOREACH(te, lpm_list, next) {
+ l = (struct rte_lpm6 *) te->data;
+ if (strncmp(name, l->name, RTE_LPM6_NAMESIZE) == 0)
+ break;
+ }
+ rte_mcfg_tailq_read_unlock();
+
+ if (te == NULL) {
+ rte_errno = ENOENT;
+ return NULL;
+ }
+
+ return l;
+}
+#endif
+
+/*
+ * Deallocates memory for given LPM table.
+ */
+void
+rte_lpm6_free(struct rte_lpm6 *lpm)
+{
+#if 0
+ struct rte_lpm6_list *lpm_list;
+ struct rte_tailq_entry *te;
+
+ /* Check user arguments. */
+ if (lpm == NULL)
+ return;
+
+ lpm_list = RTE_TAILQ_CAST(rte_lpm6_tailq.head, rte_lpm6_list);
+
+ rte_mcfg_tailq_write_lock();
+
+ /* find our tailq entry */
+ TAILQ_FOREACH(te, lpm_list, next) {
+ if (te->data == (void *) lpm)
+ break;
+ }
+
+ if (te != NULL)
+ TAILQ_REMOVE(lpm_list, te, next);
+
+ rte_mcfg_tailq_write_unlock();
+#endif
+
+ rte_free(lpm->tbl8_hdrs);
+ rte_free(lpm->tbl8_pool);
+ //rte_hash_free(lpm->rules_tbl);
+ rte_free(lpm);
+ //rte_free(te);
+}
+
+#if 0
+/* Find a rule */
+static inline int
+rule_find_with_key(struct rte_lpm6 *lpm,
+ const struct rte_lpm6_rule_key *rule_key,
+ uint32_t *next_hop)
+{
+ uint64_t hash_val;
+ int ret;
+
+ /* lookup for a rule */
+ ret = rte_hash_lookup_data(lpm->rules_tbl, (const void *) rule_key,
+ (void **) &hash_val);
+ if (ret >= 0) {
+ *next_hop = (uint32_t) hash_val;
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Find a rule */
+static int
+rule_find(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth,
+ uint32_t *next_hop)
+{
+ struct rte_lpm6_rule_key rule_key;
+
+ /* init a rule key */
+ rule_key_init(&rule_key, ip, depth);
+
+ return rule_find_with_key(lpm, &rule_key, next_hop);
+}
+
+/*
+ * Checks if a rule already exists in the rules table and updates
+ * the nexthop if so. Otherwise it adds a new rule if enough space is available.
+ *
+ * Returns:
+ * 0 - next hop of existed rule is updated
+ * 1 - new rule successfully added
+ * <0 - error
+ */
+static inline int
+rule_add(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth, uint32_t next_hop)
+{
+ int ret, rule_exist;
+ struct rte_lpm6_rule_key rule_key;
+ uint32_t unused;
+
+ /* init a rule key */
+ rule_key_init(&rule_key, ip, depth);
+
+ /* Scan through rule list to see if rule already exists. */
+ rule_exist = rule_find_with_key(lpm, &rule_key, &unused);
+
+ /*
+ * If rule does not exist check if there is space to add a new rule to
+ * this rule group. If there is no space return error.
+ */
+ if (!rule_exist && lpm->used_rules == lpm->max_rules)
+ return -ENOSPC;
+
+ /* add the rule or update rules next hop */
+ ret = rte_hash_add_key_data(lpm->rules_tbl, &rule_key,
+ (void *)(uintptr_t) next_hop);
+ if (ret < 0)
+ return ret;
+
+ /* Increment the used rules counter for this rule group. */
+ if (!rule_exist) {
+ lpm->used_rules++;
+ return 1;
+ }
+
+ return 0;
+}
+#endif
+
+/*
+ * Function that expands a rule across the data structure when a less-generic
+ * one has been added before. It ensures that every possible combination of bits
+ * in the IP address returns a match.
+ */
+static void
+expand_rule(struct rte_lpm6 *lpm, uint32_t tbl8_gindex, uint8_t old_depth,
+ uint8_t new_depth, uint32_t next_hop, uint8_t valid)
+{
+ uint32_t tbl8_group_end, tbl8_gindex_next, j;
+
+ tbl8_group_end = tbl8_gindex + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES;
+
+ struct rte_lpm6_tbl_entry new_tbl8_entry = {
+ .valid = valid,
+ .valid_group = valid,
+ .depth = new_depth,
+ .next_hop = next_hop,
+ .ext_entry = 0,
+ };
+
+ for (j = tbl8_gindex; j < tbl8_group_end; j++) {
+ if (!lpm->tbl8[j].valid || (lpm->tbl8[j].ext_entry == 0
+ && lpm->tbl8[j].depth <= old_depth)) {
+
+ lpm->tbl8[j] = new_tbl8_entry;
+
+ } else if (lpm->tbl8[j].ext_entry == 1) {
+
+ tbl8_gindex_next = lpm->tbl8[j].lpm6_tbl8_gindex
+ * RTE_LPM6_TBL8_GROUP_NUM_ENTRIES;
+ expand_rule(lpm, tbl8_gindex_next, old_depth, new_depth,
+ next_hop, valid);
+ }
+ }
+}
+
+/*
+ * Init a tbl8 header
+ */
+static inline void
+init_tbl8_header(struct rte_lpm6 *lpm, uint32_t tbl_ind,
+ uint32_t owner_tbl_ind, uint32_t owner_entry_ind)
+{
+ struct rte_lpm_tbl8_hdr *tbl_hdr = &lpm->tbl8_hdrs[tbl_ind];
+ tbl_hdr->owner_tbl_ind = owner_tbl_ind;
+ tbl_hdr->owner_entry_ind = owner_entry_ind;
+ tbl_hdr->ref_cnt = 0;
+}
+
+/*
+ * Calculate index to the table based on the number and position
+ * of the bytes being inspected in this step.
+ */
+static uint32_t
+get_bitshift(const uint8_t *ip, uint8_t first_byte, uint8_t bytes)
+{
+ uint32_t entry_ind, i;
+ int8_t bitshift;
+
+ entry_ind = 0;
+ for (i = first_byte; i < (uint32_t)(first_byte + bytes); i++) {
+ bitshift = (int8_t)((bytes - i)*BYTE_SIZE);
+
+ if (bitshift < 0)
+ bitshift = 0;
+ entry_ind = entry_ind | ip[i-1] << bitshift;
+ }
+
+ return entry_ind;
+}
+
+/*
+ * Simulate one step of adding a new route to the LPM, counting the
+ * number of new tables that will be needed.
+ *
+ * It returns 0 when the step completes, or 1 if
+ * the process needs to be continued by calling the function again.
+ */
+static inline int
+simulate_add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl,
+ struct rte_lpm6_tbl_entry **next_tbl, const uint8_t *ip,
+ uint8_t bytes, uint8_t first_byte, uint8_t depth,
+ uint32_t *need_tbl_nb)
+{
+ uint32_t entry_ind;
+ uint8_t bits_covered;
+ uint32_t next_tbl_ind;
+
+ /*
+ * Calculate index to the table based on the number and position
+ * of the bytes being inspected in this step.
+ */
+ entry_ind = get_bitshift(ip, first_byte, bytes);
+
+ /* Number of bits covered in this step */
+ bits_covered = (uint8_t)((bytes+first_byte-1)*BYTE_SIZE);
+
+ if (depth <= bits_covered) {
+ *need_tbl_nb = 0;
+ return 0;
+ }
+
+ if (tbl[entry_ind].valid == 0 || tbl[entry_ind].ext_entry == 0) {
+ /* from this point on a new table is needed on each level
+ * that is not covered yet
+ */
+ depth -= bits_covered;
+ uint32_t cnt = depth >> 3; /* depth / BYTE_SIZE */
+ if (depth & 7) /* 0b00000111 */
+ /* if depth % 8 > 0 then one more table is needed
+ * for those last bits
+ */
+ cnt++;
+
+ *need_tbl_nb = cnt;
+ return 0;
+ }
+
+ next_tbl_ind = tbl[entry_ind].lpm6_tbl8_gindex;
+ *next_tbl = &(lpm->tbl8[next_tbl_ind *
+ RTE_LPM6_TBL8_GROUP_NUM_ENTRIES]);
+ *need_tbl_nb = 0;
+ return 1;
+}
+
+/*
+ * Partially adds a new route to the data structure (tbl24+tbl8s).
+ * It returns 0 on success, a negative number on failure, or 1 if
+ * the process needs to be continued by calling the function again.
+ */
+static inline int
+add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl,
+ uint32_t tbl_ind, struct rte_lpm6_tbl_entry **next_tbl,
+ uint32_t *next_tbl_ind, uint8_t *ip, uint8_t bytes,
+ uint8_t first_byte, uint8_t depth, uint32_t next_hop,
+ uint8_t is_new_rule)
+{
+ uint32_t entry_ind, tbl_range, tbl8_group_start, tbl8_group_end, i;
+ uint32_t tbl8_gindex;
+ uint8_t bits_covered;
+ int ret;
+
+ /*
+ * Calculate index to the table based on the number and position
+ * of the bytes being inspected in this step.
+ */
+ entry_ind = get_bitshift(ip, first_byte, bytes);
+
+ /* Number of bits covered in this step */
+ bits_covered = (uint8_t)((bytes+first_byte-1)*BYTE_SIZE);
+
+ /*
+	 * If the depth does not exceed this number (i.e. this is the last step)
+ * expand the rule across the relevant positions in the table.
+ */
+ if (depth <= bits_covered) {
+ tbl_range = 1 << (bits_covered - depth);
+
+ for (i = entry_ind; i < (entry_ind + tbl_range); i++) {
+ if (!tbl[i].valid || (tbl[i].ext_entry == 0 &&
+ tbl[i].depth <= depth)) {
+
+ struct rte_lpm6_tbl_entry new_tbl_entry = {
+ .next_hop = next_hop,
+ .depth = depth,
+ .valid = VALID,
+ .valid_group = VALID,
+ .ext_entry = 0,
+ };
+
+ tbl[i] = new_tbl_entry;
+
+ } else if (tbl[i].ext_entry == 1) {
+
+ /*
+ * If tbl entry is valid and extended calculate the index
+ * into next tbl8 and expand the rule across the data structure.
+ */
+ tbl8_gindex = tbl[i].lpm6_tbl8_gindex *
+ RTE_LPM6_TBL8_GROUP_NUM_ENTRIES;
+ expand_rule(lpm, tbl8_gindex, depth, depth,
+ next_hop, VALID);
+ }
+ }
+
+ /* update tbl8 rule reference counter */
+ if (tbl_ind != TBL24_IND && is_new_rule)
+ lpm->tbl8_hdrs[tbl_ind].ref_cnt++;
+
+ return 0;
+ }
+ /*
+ * If this is not the last step just fill one position
+ * and calculate the index to the next table.
+ */
+ else {
+ /* If it's invalid a new tbl8 is needed */
+ if (!tbl[entry_ind].valid) {
+ /* get a new table */
+ ret = tbl8_get(lpm, &tbl8_gindex);
+ if (ret != 0)
+ return -ENOSPC;
+
+ /* invalidate all new tbl8 entries */
+ tbl8_group_start = tbl8_gindex *
+ RTE_LPM6_TBL8_GROUP_NUM_ENTRIES;
+ memset(&lpm->tbl8[tbl8_group_start], 0,
+ RTE_LPM6_TBL8_GROUP_NUM_ENTRIES *
+ sizeof(struct rte_lpm6_tbl_entry));
+
+ /* init the new table's header:
+ * save the reference to the owner table
+ */
+ init_tbl8_header(lpm, tbl8_gindex, tbl_ind, entry_ind);
+
+ /* reference to a new tbl8 */
+ struct rte_lpm6_tbl_entry new_tbl_entry = {
+ .lpm6_tbl8_gindex = tbl8_gindex,
+ .depth = 0,
+ .valid = VALID,
+ .valid_group = VALID,
+ .ext_entry = 1,
+ };
+
+ tbl[entry_ind] = new_tbl_entry;
+
+ /* update the current table's reference counter */
+ if (tbl_ind != TBL24_IND)
+ lpm->tbl8_hdrs[tbl_ind].ref_cnt++;
+ }
+ /*
+ * If it's valid but not extended the rule that was stored
+ * here needs to be moved to the next table.
+ */
+ else if (tbl[entry_ind].ext_entry == 0) {
+ /* get a new tbl8 index */
+ ret = tbl8_get(lpm, &tbl8_gindex);
+ if (ret != 0)
+ return -ENOSPC;
+
+ tbl8_group_start = tbl8_gindex *
+ RTE_LPM6_TBL8_GROUP_NUM_ENTRIES;
+ tbl8_group_end = tbl8_group_start +
+ RTE_LPM6_TBL8_GROUP_NUM_ENTRIES;
+
+ struct rte_lpm6_tbl_entry tbl_entry = {
+ .next_hop = tbl[entry_ind].next_hop,
+ .depth = tbl[entry_ind].depth,
+ .valid = VALID,
+ .valid_group = VALID,
+ .ext_entry = 0
+ };
+
+ /* Populate new tbl8 with tbl value. */
+ for (i = tbl8_group_start; i < tbl8_group_end; i++)
+ lpm->tbl8[i] = tbl_entry;
+
+ /* init the new table's header:
+ * save the reference to the owner table
+ */
+ init_tbl8_header(lpm, tbl8_gindex, tbl_ind, entry_ind);
+
+ /*
+ * Update tbl entry to point to new tbl8 entry. Note: The
+ * ext_flag and tbl8_index need to be updated simultaneously,
+ * so assign whole structure in one go.
+ */
+ struct rte_lpm6_tbl_entry new_tbl_entry = {
+ .lpm6_tbl8_gindex = tbl8_gindex,
+ .depth = 0,
+ .valid = VALID,
+ .valid_group = VALID,
+ .ext_entry = 1,
+ };
+
+ tbl[entry_ind] = new_tbl_entry;
+
+ /* update the current table's reference counter */
+ if (tbl_ind != TBL24_IND)
+ lpm->tbl8_hdrs[tbl_ind].ref_cnt++;
+ }
+
+ *next_tbl_ind = tbl[entry_ind].lpm6_tbl8_gindex;
+ *next_tbl = &(lpm->tbl8[*next_tbl_ind *
+ RTE_LPM6_TBL8_GROUP_NUM_ENTRIES]);
+ }
+
+ return 1;
+}
+
+/*
+ * Simulate adding a route to the LPM; rte_lpm6_add() runs this pass
+ * first so that an out-of-space failure leaves the tables untouched.
+ *
+ * Returns 0 on success,
+ * or -ENOSPC if not enough tbl8s are left.
+ */
+static int
+simulate_add(struct rte_lpm6 *lpm, const uint8_t *masked_ip, uint8_t depth)
+{
+ struct rte_lpm6_tbl_entry *tbl;
+ struct rte_lpm6_tbl_entry *tbl_next = NULL;
+ int ret, i;
+
+ /* number of new tables needed for a step */
+ uint32_t need_tbl_nb;
+ /* total number of new tables needed */
+ uint32_t total_need_tbl_nb;
+
+ /* Inspect the first three bytes through tbl24 on the first step. */
+ ret = simulate_add_step(lpm, lpm->tbl24, &tbl_next, masked_ip,
+ ADD_FIRST_BYTE, 1, depth, &need_tbl_nb);
+ total_need_tbl_nb = need_tbl_nb;
+ /*
+ * Inspect one by one the rest of the bytes until
+ * the process is completed.
+ */
+ for (i = ADD_FIRST_BYTE; i < RTE_LPM6_IPV6_ADDR_SIZE && ret == 1; i++) {
+ tbl = tbl_next;
+ ret = simulate_add_step(lpm, tbl, &tbl_next, masked_ip, 1,
+ (uint8_t)(i + 1), depth, &need_tbl_nb);
+ total_need_tbl_nb += need_tbl_nb;
+ }
+
+ if (tbl8_available(lpm) < total_need_tbl_nb)
+ /* not enough tbl8 to add a rule */
+ return -ENOSPC;
+
+ return 0;
+}
+
+/*
+ * Add a route
+ */
+int
+rte_lpm6_add(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth,
+ uint32_t next_hop, int is_new_rule)
+{
+ struct rte_lpm6_tbl_entry *tbl;
+ struct rte_lpm6_tbl_entry *tbl_next = NULL;
+ /* init to avoid compiler warning */
+ uint32_t tbl_next_num = 123456;
+ int status;
+ uint8_t masked_ip[RTE_LPM6_IPV6_ADDR_SIZE];
+ int i;
+
+ /* Check user arguments. */
+ if ((lpm == NULL) || (depth < 1) || (depth > RTE_LPM6_MAX_DEPTH))
+ return -EINVAL;
+
+ /* Copy the IP and mask it to avoid modifying user's input data. */
+ ip6_copy_addr(masked_ip, ip);
+ ip6_mask_addr(masked_ip, depth);
+
+ /* Simulate adding a new route */
+ int ret = simulate_add(lpm, masked_ip, depth);
+ if (ret < 0)
+ return ret;
+
+#if 0
+ /* Add the rule to the rule table. */
+ int is_new_rule = rule_add(lpm, masked_ip, depth, next_hop);
+ /* If there is no space available for new rule return error. */
+ if (is_new_rule < 0)
+ return is_new_rule;
+#endif
+
+ /* Inspect the first three bytes through tbl24 on the first step. */
+ tbl = lpm->tbl24;
+ status = add_step(lpm, tbl, TBL24_IND, &tbl_next, &tbl_next_num,
+ masked_ip, ADD_FIRST_BYTE, 1, depth, next_hop,
+ is_new_rule);
+ assert(status >= 0);
+
+ /*
+ * Inspect one by one the rest of the bytes until
+ * the process is completed.
+ */
+ for (i = ADD_FIRST_BYTE; i < RTE_LPM6_IPV6_ADDR_SIZE && status == 1; i++) {
+ tbl = tbl_next;
+ status = add_step(lpm, tbl, tbl_next_num, &tbl_next,
+ &tbl_next_num, masked_ip, 1, (uint8_t)(i + 1),
+ depth, next_hop, is_new_rule);
+ assert(status >= 0);
+ }
+
+ return status;
+}
+
+/*
+ * Takes a pointer to a table entry and inspects one level.
+ * The function returns 0 on lookup success, -ENOENT if no match was found,
+ * or 1 if the process needs to be continued by calling the function again.
+ */
+static inline int
+lookup_step(const struct rte_lpm6 *lpm, const struct rte_lpm6_tbl_entry *tbl,
+ const struct rte_lpm6_tbl_entry **tbl_next, const uint8_t *ip,
+ uint8_t first_byte, uint32_t *next_hop)
+{
+ uint32_t tbl8_index, tbl_entry;
+
+ /* Take the integer value from the pointer. */
+ tbl_entry = *(const uint32_t *)tbl;
+
+ /* If it is valid and extended we calculate the new pointer to return. */
+ if ((tbl_entry & RTE_LPM6_VALID_EXT_ENTRY_BITMASK) ==
+ RTE_LPM6_VALID_EXT_ENTRY_BITMASK) {
+
+ tbl8_index = ip[first_byte-1] +
+ ((tbl_entry & RTE_LPM6_TBL8_BITMASK) *
+ RTE_LPM6_TBL8_GROUP_NUM_ENTRIES);
+
+ *tbl_next = &lpm->tbl8[tbl8_index];
+
+ return 1;
+ } else {
+ /* If not extended then we can have a match. */
+ *next_hop = ((uint32_t)tbl_entry & RTE_LPM6_TBL8_BITMASK);
+ return (tbl_entry & RTE_LPM6_LOOKUP_SUCCESS) ? 0 : -ENOENT;
+ }
+}
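
The constants tested here mirror the bit layout of struct rte_lpm6_tbl_entry when the entry is read as a single 32-bit word: next_hop occupies bits 0..20 (RTE_LPM6_TBL8_BITMASK), depth bits 21..28, valid is bit 29 (RTE_LPM6_LOOKUP_SUCCESS), valid_group bit 30, and ext_entry bit 31, so RTE_LPM6_VALID_EXT_ENTRY_BITMASK tests valid and ext_entry together. A small illustrative check of the constant values (not part of the diff; the bitfield correspondence assumes the usual low-to-high member layout):

	_Static_assert(RTE_LPM6_TBL8_BITMASK == ((1u << 21) - 1),
	    "next_hop spans bits 0..20");
	_Static_assert(RTE_LPM6_LOOKUP_SUCCESS == (1u << 29),
	    "valid flag is bit 29");
	_Static_assert(RTE_LPM6_VALID_EXT_ENTRY_BITMASK ==
	    ((1u << 31) | (1u << 29)),
	    "mask covers the ext_entry and valid bits");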
+
+/*
+ * Looks up an IP
+ */
+int
+rte_lpm6_lookup(const struct rte_lpm6 *lpm, const uint8_t *ip,
+ uint32_t *next_hop)
+{
+ const struct rte_lpm6_tbl_entry *tbl;
+ const struct rte_lpm6_tbl_entry *tbl_next = NULL;
+ int status;
+ uint8_t first_byte;
+ uint32_t tbl24_index;
+
+ /* DEBUG: Check user input arguments. */
+ if ((lpm == NULL) || (ip == NULL) || (next_hop == NULL))
+ return -EINVAL;
+
+ first_byte = LOOKUP_FIRST_BYTE;
+ tbl24_index = (ip[0] << BYTES2_SIZE) | (ip[1] << BYTE_SIZE) | ip[2];
+
+ /* Calculate pointer to the first entry to be inspected */
+ tbl = &lpm->tbl24[tbl24_index];
+
+ do {
+ /* Continue inspecting following levels until success or failure */
+ status = lookup_step(lpm, tbl, &tbl_next, ip, first_byte++, next_hop);
+ tbl = tbl_next;
+ } while (status == 1);
+
+ return status;
+}
+
+/*
+ * Looks up a group of IP addresses
+ */
+int
+rte_lpm6_lookup_bulk_func(const struct rte_lpm6 *lpm,
+ uint8_t ips[][RTE_LPM6_IPV6_ADDR_SIZE],
+ int32_t *next_hops, unsigned int n)
+{
+ unsigned int i;
+ const struct rte_lpm6_tbl_entry *tbl;
+ const struct rte_lpm6_tbl_entry *tbl_next = NULL;
+ uint32_t tbl24_index, next_hop;
+ uint8_t first_byte;
+ int status;
+
+ /* DEBUG: Check user input arguments. */
+ if ((lpm == NULL) || (ips == NULL) || (next_hops == NULL))
+ return -EINVAL;
+
+ for (i = 0; i < n; i++) {
+ first_byte = LOOKUP_FIRST_BYTE;
+ tbl24_index = (ips[i][0] << BYTES2_SIZE) |
+ (ips[i][1] << BYTE_SIZE) | ips[i][2];
+
+ /* Calculate pointer to the first entry to be inspected */
+ tbl = &lpm->tbl24[tbl24_index];
+
+ do {
+ /* Continue inspecting following levels
+ * until success or failure
+ */
+ status = lookup_step(lpm, tbl, &tbl_next, ips[i],
+ first_byte++, &next_hop);
+ tbl = tbl_next;
+ } while (status == 1);
+
+ if (status < 0)
+ next_hops[i] = -1;
+ else
+ next_hops[i] = (int32_t)next_hop;
+ }
+
+ return 0;
+}
+
+struct rte_lpm6_rule *
+fill_rule6(char *buffer, const uint8_t *ip, uint8_t depth, uint32_t next_hop)
+{
+ struct rte_lpm6_rule *rule = (struct rte_lpm6_rule *)buffer;
+
+ ip6_copy_addr((uint8_t *)&rule->ip, ip);
+ rule->depth = depth;
+ rule->next_hop = next_hop;
+
+ return (rule);
+}
+
+#if 0
+/*
+ * Look for a rule in the high-level rules table
+ */
+int
+rte_lpm6_is_rule_present(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth,
+ uint32_t *next_hop)
+{
+ uint8_t masked_ip[RTE_LPM6_IPV6_ADDR_SIZE];
+
+ /* Check user arguments. */
+ if ((lpm == NULL) || next_hop == NULL || ip == NULL ||
+ (depth < 1) || (depth > RTE_LPM6_MAX_DEPTH))
+ return -EINVAL;
+
+ /* Copy the IP and mask it to avoid modifying user's input data. */
+ ip6_copy_addr(masked_ip, ip);
+ ip6_mask_addr(masked_ip, depth);
+
+ return rule_find(lpm, masked_ip, depth, next_hop);
+}
+
+/*
+ * Delete a rule from the rule table.
+ * NOTE: Valid range for depth parameter is 1 .. 128 inclusive.
+ * return
+ * 0 on success
+ * <0 on failure
+ */
+static inline int
+rule_delete(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth)
+{
+ int ret;
+ struct rte_lpm6_rule_key rule_key;
+
+ /* init rule key */
+ rule_key_init(&rule_key, ip, depth);
+
+ /* delete the rule */
+ ret = rte_hash_del_key(lpm->rules_tbl, (void *) &rule_key);
+ if (ret >= 0)
+ lpm->used_rules--;
+
+ return ret;
+}
+
+/*
+ * Deletes a group of rules
+ *
+ * Note that the function rebuilds the lpm table,
+ * rather than doing incremental updates like
+ * the regular delete function
+ */
+int
+rte_lpm6_delete_bulk_func(struct rte_lpm6 *lpm,
+ uint8_t ips[][RTE_LPM6_IPV6_ADDR_SIZE], uint8_t *depths,
+ unsigned n)
+{
+ uint8_t masked_ip[RTE_LPM6_IPV6_ADDR_SIZE];
+ unsigned i;
+
+ /* Check input arguments. */
+ if ((lpm == NULL) || (ips == NULL) || (depths == NULL))
+ return -EINVAL;
+
+ for (i = 0; i < n; i++) {
+ ip6_copy_addr(masked_ip, ips[i]);
+ ip6_mask_addr(masked_ip, depths[i]);
+ rule_delete(lpm, masked_ip, depths[i]);
+ }
+
+ /*
+	 * Set all the table entries to 0 (i.e. delete every rule
+	 * from the data structure).
+ */
+ memset(lpm->tbl24, 0, sizeof(lpm->tbl24));
+ memset(lpm->tbl8, 0, sizeof(lpm->tbl8[0])
+ * RTE_LPM6_TBL8_GROUP_NUM_ENTRIES * lpm->number_tbl8s);
+ tbl8_pool_init(lpm);
+
+ /*
+ * Add every rule again (except for the ones that were removed from
+ * the rules table).
+ */
+ rebuild_lpm(lpm);
+
+ return 0;
+}
+
+/*
+ * Delete all rules from the LPM table.
+ */
+void
+rte_lpm6_delete_all(struct rte_lpm6 *lpm)
+{
+ /* Zero used rules counter. */
+ lpm->used_rules = 0;
+
+ /* Zero tbl24. */
+ memset(lpm->tbl24, 0, sizeof(lpm->tbl24));
+
+ /* Zero tbl8. */
+ memset(lpm->tbl8, 0, sizeof(lpm->tbl8[0]) *
+ RTE_LPM6_TBL8_GROUP_NUM_ENTRIES * lpm->number_tbl8s);
+
+ /* init pool of free tbl8 indexes */
+ tbl8_pool_init(lpm);
+
+	/* Delete all rules from the rules table. */
+ rte_hash_reset(lpm->rules_tbl);
+}
+#endif
+
+/*
+ * Convert a depth to a one-byte-long mask.
+ * Example: a depth of 4 is converted to 0xF0.
+ */
+static uint8_t __attribute__((pure))
+depth_to_mask_1b(uint8_t depth)
+{
+ /* To calculate a mask start with a 1 on the left hand side and right
+ * shift while populating the left hand side with 1's
+ */
+ return (signed char)0x80 >> (depth - 1);
+}
+
+#if 0
+/*
+ * Find a less specific rule
+ */
+static int
+rule_find_less_specific(struct rte_lpm6 *lpm, uint8_t *ip, uint8_t depth,
+ struct rte_lpm6_rule *rule)
+{
+ int ret;
+ uint32_t next_hop;
+ uint8_t mask;
+ struct rte_lpm6_rule_key rule_key;
+
+ if (depth == 1)
+ return 0;
+
+ rule_key_init(&rule_key, ip, depth);
+
+ while (depth > 1) {
+ depth--;
+
+ /* each iteration zero one more bit of the key */
+ mask = depth & 7; /* depth % BYTE_SIZE */
+ if (mask > 0)
+ mask = depth_to_mask_1b(mask);
+
+ rule_key.depth = depth;
+ rule_key.ip[depth >> 3] &= mask;
+
+ ret = rule_find_with_key(lpm, &rule_key, &next_hop);
+ if (ret) {
+ rule->depth = depth;
+ ip6_copy_addr(rule->ip, rule_key.ip);
+ rule->next_hop = next_hop;
+ return 1;
+ }
+ }
+
+ return 0;
+}
+#endif
+
+/*
+ * Find range of tbl8 cells occupied by a rule
+ */
+static void
+rule_find_range(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth,
+ struct rte_lpm6_tbl_entry **from,
+ struct rte_lpm6_tbl_entry **to,
+ uint32_t *out_tbl_ind)
+{
+ uint32_t ind;
+ uint32_t first_3bytes = (uint32_t)ip[0] << 16 | ip[1] << 8 | ip[2];
+
+ if (depth <= 24) {
+ /* rule is within the top level */
+ ind = first_3bytes;
+ *from = &lpm->tbl24[ind];
+ ind += (1 << (24 - depth)) - 1;
+ *to = &lpm->tbl24[ind];
+ *out_tbl_ind = TBL24_IND;
+ } else {
+ /* top level entry */
+ struct rte_lpm6_tbl_entry *tbl = &lpm->tbl24[first_3bytes];
+ assert(tbl->ext_entry == 1);
+ /* first tbl8 */
+ uint32_t tbl_ind = tbl->lpm6_tbl8_gindex;
+ tbl = &lpm->tbl8[tbl_ind *
+ RTE_LPM6_TBL8_GROUP_NUM_ENTRIES];
+ /* current ip byte, the top level is already behind */
+ uint8_t byte = 3;
+ /* minus top level */
+ depth -= 24;
+
+ /* iterate through levels (tbl8s)
+ * until we reach the last one
+ */
+ while (depth > 8) {
+ tbl += ip[byte];
+ assert(tbl->ext_entry == 1);
+ /* go to the next level/tbl8 */
+ tbl_ind = tbl->lpm6_tbl8_gindex;
+ tbl = &lpm->tbl8[tbl_ind *
+ RTE_LPM6_TBL8_GROUP_NUM_ENTRIES];
+ byte += 1;
+ depth -= 8;
+ }
+
+ /* last level/tbl8 */
+ ind = ip[byte] & depth_to_mask_1b(depth);
+ *from = &tbl[ind];
+ ind += (1 << (8 - depth)) - 1;
+ *to = &tbl[ind];
+ *out_tbl_ind = tbl_ind;
+ }
+}
+
+/*
+ * Remove a table from the LPM tree
+ */
+static void
+remove_tbl(struct rte_lpm6 *lpm, struct rte_lpm_tbl8_hdr *tbl_hdr,
+ uint32_t tbl_ind, struct rte_lpm6_rule *lsp_rule)
+{
+ struct rte_lpm6_tbl_entry *owner_entry;
+
+ if (tbl_hdr->owner_tbl_ind == TBL24_IND)
+ owner_entry = &lpm->tbl24[tbl_hdr->owner_entry_ind];
+ else {
+ uint32_t owner_tbl_ind = tbl_hdr->owner_tbl_ind;
+ owner_entry = &lpm->tbl8[
+ owner_tbl_ind * RTE_LPM6_TBL8_GROUP_NUM_ENTRIES +
+ tbl_hdr->owner_entry_ind];
+
+ struct rte_lpm_tbl8_hdr *owner_tbl_hdr =
+ &lpm->tbl8_hdrs[owner_tbl_ind];
+ if (--owner_tbl_hdr->ref_cnt == 0)
+ remove_tbl(lpm, owner_tbl_hdr, owner_tbl_ind, lsp_rule);
+ }
+
+ assert(owner_entry->ext_entry == 1);
+
+ /* unlink the table */
+ if (lsp_rule != NULL) {
+ struct rte_lpm6_tbl_entry new_tbl_entry = {
+ .next_hop = lsp_rule->next_hop,
+ .depth = lsp_rule->depth,
+ .valid = VALID,
+ .valid_group = VALID,
+ .ext_entry = 0
+ };
+
+ *owner_entry = new_tbl_entry;
+ } else {
+ struct rte_lpm6_tbl_entry new_tbl_entry = {
+ .next_hop = 0,
+ .depth = 0,
+ .valid = INVALID,
+ .valid_group = INVALID,
+ .ext_entry = 0
+ };
+
+ *owner_entry = new_tbl_entry;
+ }
+
+ /* return the table to the pool */
+ tbl8_put(lpm, tbl_ind);
+}
+
+/*
+ * Deletes a rule
+ */
+int
+rte_lpm6_delete(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth,
+ struct rte_lpm6_rule *lsp_rule)
+{
+ uint8_t masked_ip[RTE_LPM6_IPV6_ADDR_SIZE];
+ //struct rte_lpm6_rule lsp_rule_obj;
+ //struct rte_lpm6_rule *lsp_rule;
+ //int ret;
+ uint32_t tbl_ind;
+ struct rte_lpm6_tbl_entry *from, *to;
+
+ /* Check input arguments. */
+ if ((lpm == NULL) || (depth < 1) || (depth > RTE_LPM6_MAX_DEPTH))
+ return -EINVAL;
+
+ /* Copy the IP and mask it to avoid modifying user's input data. */
+ ip6_copy_addr(masked_ip, ip);
+ ip6_mask_addr(masked_ip, depth);
+
+#if 0
+ /* Delete the rule from the rule table. */
+ ret = rule_delete(lpm, masked_ip, depth);
+ if (ret < 0)
+ return -ENOENT;
+#endif
+
+ /* find rule cells */
+ rule_find_range(lpm, masked_ip, depth, &from, &to, &tbl_ind);
+
+#if 0
+ /* find a less specific rule (a rule with smaller depth)
+ * note: masked_ip will be modified, don't use it anymore
+ */
+ ret = rule_find_less_specific(lpm, masked_ip, depth,
+ &lsp_rule_obj);
+ lsp_rule = ret ? &lsp_rule_obj : NULL;
+#endif
+ /* decrement the table rule counter,
+ * note that tbl24 doesn't have a header
+ */
+ if (tbl_ind != TBL24_IND) {
+ struct rte_lpm_tbl8_hdr *tbl_hdr = &lpm->tbl8_hdrs[tbl_ind];
+ if (--tbl_hdr->ref_cnt == 0) {
+ /* remove the table */
+ remove_tbl(lpm, tbl_hdr, tbl_ind, lsp_rule);
+ return 0;
+ }
+ }
+
+ /* iterate rule cells */
+ for (; from <= to; from++)
+ if (from->ext_entry == 1) {
+ /* reference to a more specific space
+ * of the prefix/rule. Entries in a more
+ * specific space that are not used by
+ * a more specific prefix must be occupied
+ * by the prefix
+ */
+ if (lsp_rule != NULL)
+ expand_rule(lpm,
+ from->lpm6_tbl8_gindex *
+ RTE_LPM6_TBL8_GROUP_NUM_ENTRIES,
+ depth, lsp_rule->depth,
+ lsp_rule->next_hop, VALID);
+ else
+ /* since the prefix has no less specific prefix,
+ * its more specific space must be invalidated
+ */
+ expand_rule(lpm,
+ from->lpm6_tbl8_gindex *
+ RTE_LPM6_TBL8_GROUP_NUM_ENTRIES,
+ depth, 0, 0, INVALID);
+ } else if (from->depth == depth) {
+ /* entry is not a reference and belongs to the prefix */
+ if (lsp_rule != NULL) {
+ struct rte_lpm6_tbl_entry new_tbl_entry = {
+ .next_hop = lsp_rule->next_hop,
+ .depth = lsp_rule->depth,
+ .valid = VALID,
+ .valid_group = VALID,
+ .ext_entry = 0
+ };
+
+ *from = new_tbl_entry;
+ } else {
+ struct rte_lpm6_tbl_entry new_tbl_entry = {
+ .next_hop = 0,
+ .depth = 0,
+ .valid = INVALID,
+ .valid_group = INVALID,
+ .ext_entry = 0
+ };
+
+ *from = new_tbl_entry;
+ }
+ }
+
+ return 0;
+}
Index: sys/contrib/dpdk_rte_lpm/rte_shim.h
===================================================================
--- /dev/null
+++ sys/contrib/dpdk_rte_lpm/rte_shim.h
@@ -0,0 +1,31 @@
+#ifndef _RTE_SHIM_H_
+#define _RTE_SHIM_H_
+
+#define rte_malloc(_type, _size, _align) malloc(_size, M_TEMP, M_NOWAIT)
+#define rte_free(_ptr) free(_ptr, M_TEMP)
+#define rte_zmalloc(_type, _size, _align) malloc(_size, M_TEMP, M_NOWAIT | M_ZERO)
+#define rte_zmalloc_socket(_type, _size, _align, _s) malloc(_size, M_TEMP, M_NOWAIT | M_ZERO)
+
+#define rte_mcfg_tailq_write_unlock()
+#define rte_mcfg_tailq_write_lock()
+
+#define RTE_CACHE_LINE_SIZE CACHE_LINE_SIZE
+#define strtoull strtoul
+#define assert(_s) KASSERT((_s), ("DPDK: assert failed"))
+#define rte_memcpy memcpy
+#define rte_strerror(_err) "strerror_not_implemented"
+#define RTE_LOG(_sev, _sub, _fmt, ...) printf("DPDK::" #_sev "::" #_sub " %s: " _fmt, __func__ , ## __VA_ARGS__)
+
+#include "sys/endian.h"
+#define RTE_BYTE_ORDER BYTE_ORDER
+#define RTE_LITTLE_ENDIAN LITTLE_ENDIAN
+#define RTE_BIG_ENDIAN BIG_ENDIAN
+
+#include "sys/limits.h" // CHAR_BIT
+#define rte_le_to_cpu_32 le32toh
+
+#include "rte_jhash.h"
+#include "rte_common.h"
+
+
+#endif
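
Illustrative only (not part of the diff): the shim above maps DPDK's allocator and logging onto kernel primitives, so a DPDK-style call in rte_lpm6.c such as

	tbl8_pool = rte_malloc(NULL, sizeof(uint32_t) * n, RTE_CACHE_LINE_SIZE);

expands to the malloc(9) form below; note the alignment argument is dropped and M_NOWAIT means the call may return NULL:

	tbl8_pool = malloc(sizeof(uint32_t) * n, M_TEMP, M_NOWAIT);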
Index: sys/contrib/dpdk_rte_lpm/rte_tailq.h
===================================================================
--- /dev/null
+++ sys/contrib/dpdk_rte_lpm/rte_tailq.h
@@ -0,0 +1,140 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#ifndef _RTE_TAILQ_H_
+#define _RTE_TAILQ_H_
+
+/**
+ * @file
+ * Defines the rte_tailq APIs, for internal use only.
+ *
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/queue.h>
+//#include <stdio.h>
+#include <netinet6/rte_debug.h>
+
+/** dummy structure type used by the rte_tailq APIs */
+struct rte_tailq_entry {
+ TAILQ_ENTRY(rte_tailq_entry) next; /**< Pointer entries for a tailq list */
+ void *data; /**< Pointer to the data referenced by this tailq entry */
+};
+/** dummy */
+TAILQ_HEAD(rte_tailq_entry_head, rte_tailq_entry);
+
+#define RTE_TAILQ_NAMESIZE 32
+
+/**
+ * The structure defining a tailq header entry for storing
+ * in the rte_config structure in shared memory. Each tailq
+ * is identified by name.
+ * Any library storing a set of objects (e.g. rings, mempools, hash tables)
+ * is recommended to use an entry here, so as to make it easy for
+ * a multi-process app to find already-created elements in shared memory.
+ */
+struct rte_tailq_head {
+ struct rte_tailq_entry_head tailq_head; /**< NOTE: must be first element */
+ char name[RTE_TAILQ_NAMESIZE];
+};
+
+struct rte_tailq_elem {
+ /**
+ * Reference to head in shared mem, updated at init time by
+ * rte_eal_tailqs_init()
+ */
+ struct rte_tailq_head *head;
+ TAILQ_ENTRY(rte_tailq_elem) next;
+ const char name[RTE_TAILQ_NAMESIZE];
+};
+
+/**
+ * Return the first tailq entry cast to the right struct.
+ */
+#define RTE_TAILQ_CAST(tailq_entry, struct_name) \
+ (struct struct_name *)&(tailq_entry)->tailq_head
+
+/**
+ * Utility macro to make looking up a tailqueue for a particular struct easier.
+ *
+ * @param name
+ * The name of tailq
+ *
+ * @param struct_name
+ * The name of the list type we are using. (Generally this is the same as the
+ * first parameter passed to TAILQ_HEAD macro)
+ *
+ * @return
+ * The return value from rte_eal_tailq_lookup, typecast to the appropriate
+ * structure pointer type.
+ * NULL on error, since the tailq_head is the first
+ * element in the rte_tailq_head structure.
+ */
+#define RTE_TAILQ_LOOKUP(name, struct_name) \
+ RTE_TAILQ_CAST(rte_eal_tailq_lookup(name), struct_name)
+
+/**
+ * Dump tail queues to a file.
+ *
+ * @param f
+ * A pointer to a file for output
+ */
+//void rte_dump_tailq(FILE *f);
+
+/**
+ * Lookup for a tail queue.
+ *
+ * Get a pointer to a tail queue header of a tail
+ * queue identified by the name given as an argument.
+ * Note: this function is not multi-thread safe, and should only be called from
+ * a single thread at a time
+ *
+ * @param name
+ * The name of the queue.
+ * @return
+ * A pointer to the tail queue head structure.
+ */
+struct rte_tailq_head *rte_eal_tailq_lookup(const char *name);
+
+/**
+ * Register a tail queue.
+ *
+ * Register a tail queue from shared memory.
+ * This function is mainly used by EAL_REGISTER_TAILQ macro which is used to
+ * register tailq from the different dpdk libraries. Since this macro is a
+ * constructor, the function has no access to dpdk shared memory, so the
+ * registered tailq cannot be used before a call to rte_eal_init(), which calls
+ * rte_eal_tailqs_init().
+ *
+ * @param t
+ * The tailq element which contains the name of the tailq you want to
+ * create (/retrieve when in secondary process).
+ * @return
+ * 0 on success or -1 in case of an error.
+ */
+int rte_eal_tailq_register(struct rte_tailq_elem *t);
+
+#define EAL_REGISTER_TAILQ(t) \
+RTE_INIT(tailqinitfn_ ##t) \
+{ \
+ if (rte_eal_tailq_register(&t) < 0) \
+ rte_panic("Cannot initialize tailq: %s\n", t.name); \
+}
+
+/* This macro permits safely removing and freeing var within the loop. */
+#ifndef TAILQ_FOREACH_SAFE
+#define TAILQ_FOREACH_SAFE(var, head, field, tvar) \
+ for ((var) = TAILQ_FIRST((head)); \
+ (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \
+ (var) = (tvar))
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_TAILQ_H_ */
Index: sys/modules/dpdk_lpm4/Makefile
===================================================================
--- /dev/null
+++ sys/modules/dpdk_lpm4/Makefile
@@ -0,0 +1,12 @@
+# $FreeBSD$
+
+SYSDIR?=${SRCTOP}/sys
+.include "${SYSDIR}/conf/kern.opts.mk"
+
+.PATH: ${SYSDIR}/contrib/dpdk_rte_lpm
+
+KMOD= dpdk_lpm4
+SRCS= opt_inet.h
+SRCS.INET=dpdk_lpm.c rte_lpm.c
+
+.include <bsd.kmod.mk>
Index: sys/modules/dpdk_lpm6/Makefile
===================================================================
--- /dev/null
+++ sys/modules/dpdk_lpm6/Makefile
@@ -0,0 +1,12 @@
+# $FreeBSD$
+
+SYSDIR?=${SRCTOP}/sys
+.include "${SYSDIR}/conf/kern.opts.mk"
+
+.PATH: ${SYSDIR}/contrib/dpdk_rte_lpm
+
+KMOD= dpdk_lpm6
+SRCS= opt_inet6.h
+SRCS.INET6=dpdk_lpm6.c rte_lpm6.c
+
+.include <bsd.kmod.mk>
Index: sys/net/route.h
===================================================================
--- sys/net/route.h
+++ sys/net/route.h
@@ -230,6 +230,7 @@
/* Control plane route request flags */
#define NHR_COPY 0x100 /* Copy rte data */
+#define NHR_UNLOCKED 0x200 /* Do not lock table */
/*
* Routing statistics.
@@ -454,6 +455,8 @@
/* New API */
struct nhop_object *rib_lookup(uint32_t fibnum, const struct sockaddr *dst,
uint32_t flags, uint32_t flowid);
+struct rib_rtable_info;
+bool rib_get_rtable_info(uint32_t fibnum, int family, struct rib_rtable_info *info);
#endif
#endif
Index: sys/net/route.c
===================================================================
--- sys/net/route.c
+++ sys/net/route.c
@@ -155,6 +155,12 @@
rt_table_destroy(struct rib_head *rh)
{
+ RIB_WLOCK(rh);
+ rh->rib_dying = true;
+ RIB_WUNLOCK(rh);
+
+ fib_destroy_rib(rh);
+
tmproutes_destroy(rh);
rn_walktree(&rh->rmhead.head, rt_freeentry, &rh->rmhead.head);
Index: sys/net/route/nhgrp_ctl.c
===================================================================
--- sys/net/route/nhgrp_ctl.c
+++ sys/net/route/nhgrp_ctl.c
@@ -293,6 +293,17 @@
return (nhg_priv);
}
+void
+nhgrp_ref_object(struct nhgrp_object *nhg)
+{
+ struct nhgrp_priv *nhg_priv;
+ u_int old;
+
+ nhg_priv = NHGRP_PRIV(nhg);
+ old = refcount_acquire(&nhg_priv->nhg_refcount);
+ KASSERT(old > 0, ("%s: nhgrp object %p has 0 refs", __func__, nhg));
+}
+
void
nhgrp_free(struct nhgrp_object *nhg)
{
@@ -753,6 +764,21 @@
return (error);
}
+uint32_t
+nhgrp_get_count(struct rib_head *rh)
+{
+ struct nh_control *ctl;
+ uint32_t count;
+
+ ctl = rh->nh_control;
+
+ NHOPS_RLOCK(ctl);
+ count = ctl->gr_head.items_count;
+ NHOPS_RUNLOCK(ctl);
+
+ return (count);
+}
+
uint32_t
nhgrp_get_idx(const struct nhgrp_object *nhg)
{
Index: sys/net/route/nhop_ctl.c
===================================================================
--- sys/net/route/nhop_ctl.c
+++ sys/net/route/nhop_ctl.c
@@ -690,6 +690,19 @@
&nh_priv->nh_epoch_ctx);
}
+void
+nhop_ref_any(struct nhop_object *nh)
+{
+#ifdef ROUTE_MPATH
+ if (!NH_IS_NHGRP(nh))
+ nhop_ref_object(nh);
+ else
+ nhgrp_ref_object((struct nhgrp_object *)nh);
+#else
+ nhop_ref_object(nh);
+#endif
+}
+
void
nhop_free_any(struct nhop_object *nh)
{
@@ -852,6 +865,21 @@
return (error);
}
+uint32_t
+nhops_get_count(struct rib_head *rh)
+{
+ struct nh_control *ctl;
+ uint32_t count;
+
+ ctl = rh->nh_control;
+
+ NHOPS_RLOCK(ctl);
+ count = ctl->nh_head.items_count;
+ NHOPS_RUNLOCK(ctl);
+
+ return (count);
+}
+
int
nhops_dump_sysctl(struct rib_head *rh, struct sysctl_req *w)
{
Index: sys/net/route/route_algo.h
===================================================================
--- /dev/null
+++ sys/net/route/route_algo.h
@@ -0,0 +1,110 @@
+/*-
+ * Copyright (c) 2020
+ * Alexander V. Chernikov <melifaro@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+
+struct fib_data;
+struct fib_dp;
+enum flm_op_result {
+ FLM_SUCCESS, /* No errors, operation successful */
+ FLM_REBUILD, /* Operation cannot be completed, schedule algorithm rebuild */
+ FLM_ERROR, /* Operation failed, this algo cannot be used */
+};
+
+struct rib_rtable_info {
+ uint32_t num_prefixes;
+ uint32_t num_nhops;
+ uint32_t num_nhgrp;
+};
+
+struct flm_lookup_key {
+ union {
+ const struct in6_addr *addr6;
+ struct in_addr addr4;
+ };
+};
+
+typedef struct nhop_object *flm_lookup_t(void *algo_data,
+ const struct flm_lookup_key key, uint32_t scopeid);
+typedef enum flm_op_result flm_init_t (uint32_t fibnum, struct fib_data *fd,
+ void *_old_data, void **new_data);
+typedef void flm_destroy_t(void *data);
+typedef enum flm_op_result flm_dump_t(struct rtentry *rt, void *data);
+typedef enum flm_op_result flm_dump_end_t(void *data, struct fib_dp *dp);
+typedef enum flm_op_result flm_change_t(struct rib_head *rnh,
+ struct rib_cmd_info *rc, void *data);
+typedef uint8_t flm_get_pref_t(const struct rib_rtable_info *rinfo);
+
+#define FIB_M_NEED_NHOPS 0x01 /* need nexthop index map allocation */
+#define FIB_M_NO_CALLOUT 0x02 /* does not need callouts */
+
+struct fib_lookup_module {
+ char *flm_name; /* algo name */
+ int flm_family; /* address family this module supports */
+ int flm_refcount; /* # of references */
+ uint32_t flm_flags; /* flags */
+ flm_init_t *flm_init_cb; /* instance init */
+ flm_destroy_t *flm_destroy_cb; /* destroy instance */
+ flm_change_t *flm_change_rib_item_cb;/* routing table change hook */
+ flm_dump_t *flm_dump_rib_item_cb; /* routing table dump cb */
+ flm_dump_end_t *flm_dump_end_cb; /* end of dump */
+ flm_lookup_t *flm_lookup; /* lookup function */
+ flm_get_pref_t *flm_get_pref; /* get algo preference */
+ TAILQ_ENTRY(fib_lookup_module) entries;
+};
+
+/* Datapath lookup data */
+struct fib_dp {
+ flm_lookup_t *f;
+ void *arg;
+};
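+
+/*
+ * Example (illustrative sketch): a datapath consumer resolves the
+ * per-fib entry and invokes the lookup callback (cf. fib4_lookup()
+ * in in_fib.c below):
+ *
+ *	struct fib_dp *dp = &V_inet_dp[fibnum];
+ *	struct flm_lookup_key key = { .addr4 = dst };
+ *	struct nhop_object *nh = dp->f(dp->arg, key, scopeid);
+ */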
+
+VNET_DECLARE(struct fib_dp *, inet_dp);
+#define V_inet_dp VNET(inet_dp)
+VNET_DECLARE(struct fib_dp *, inet6_dp);
+#define V_inet6_dp VNET(inet6_dp)
+
+int fib_module_init(struct fib_lookup_module *flm, uint32_t fibnum,
+ int family);
+int fib_module_clone(const struct fib_lookup_module *flm_orig,
+ struct fib_lookup_module *flm, bool waitok);
+int fib_module_dumptree(struct fib_lookup_module *flm,
+ enum rib_subscription_type subscription_type);
+int fib_module_register(struct fib_lookup_module *flm);
+int fib_module_unregister(struct fib_lookup_module *flm);
+
+uint32_t fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh);
+void fib_free_nhop_idx(struct fib_data *fd, uint32_t idx);
+void fib_free_nhop(struct fib_data *fd, struct nhop_object *nh);
+struct nhop_object **fib_get_nhop_array(struct fib_data *fd);
+void fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo);
+struct rib_head *fib_get_rh(struct fib_data *fd);
+
+#endif /* _NET_ROUTE_ROUTE_ALGO_H_ */
Index: sys/net/route/route_algo.c
===================================================================
--- /dev/null
+++ sys/net/route/route_algo.c
@@ -0,0 +1,1198 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#define RTDEBUG
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_route.h"
+
+#include <sys/param.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#include <sys/sbuf.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/queue.h>
+#include <net/vnet.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#ifdef INET6
+#include <netinet/ip6.h>
+#include <netinet6/ip6_var.h>
+#endif
+
+#include <net/route.h>
+#include <net/route/nhop.h>
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <net/route/route_algo.h>
+
+/*
+ * Route lookup framework.
+ *
+ * flm - fib lookup modules - kernel modules implementing a particular algo
+ * fd - fib data - instance of an flm bound to a specific routing table
+ *
+ * For each supported address family there is an allocated array of fib_dp
+ * structures, indexed by fib number. Each array entry contains a callback
+ * function and its argument. This function will be called with a
+ * family-specific lookup key, scope and the provided argument. The array
+ * gets re-created every time a new algo instance is created. Please take
+ * a look at the replace_rtables_family() function for more details.
+ *
+ * The control plane sets up and updates the necessary dataplane structures:
+ * 1) nexthop abstraction -> the module has to deal with indices, refcounting,
+ *    nexthop groups etc
+ * 2) sync with the route tables
+ * 3) dataplane attachment points
+ * 4) fail early. Some algorithms are immutable, so any change leads to a
+ *    rebuild. Others are mutable to some extent, so the module is built over
+ *    common setup/teardown instances, making error handling easier.
+ * 5) preference.
+ */
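+
+/*
+ * Example (sketch; see in_fib_algo.c below for complete modules): an
+ * algorithm module describes itself with a struct fib_lookup_module and
+ * registers it at boot. The example_* callbacks are hypothetical names.
+ *
+ *	static struct fib_lookup_module flm_example = {
+ *		.flm_name = "example",
+ *		.flm_family = AF_INET,
+ *		.flm_init_cb = example_init,
+ *		.flm_destroy_cb = example_destroy,
+ *		.flm_dump_rib_item_cb = example_add_route_cb,
+ *		.flm_dump_end_cb = example_end_dump,
+ *		.flm_change_rib_item_cb = example_change_cb,
+ *		.flm_get_pref = example_get_pref,
+ *	};
+ *
+ *	fib_module_register(&flm_example);
+ */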
+
+SYSCTL_DECL(_net_route);
+SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+ "Route algorithm lookups");
+
+SYSCTL_NODE(_net_route_algo, OID_AUTO, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+ "IPv6 algorithm lookups");
+SYSCTL_NODE(_net_route_algo, OID_AUTO, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+ "IPv4 algorithm lookups");
+
+struct nhop_ref_table {
+ uint32_t count;
+ int32_t refcnt[0];
+};
+
+struct fib_data {
+ uint32_t number_nhops; /* current # of nhops */
+ uint32_t number_records; /* current # of routes */
+ uint8_t hit_nhops; /* true if out of nhop limit */
+	uint8_t init_done; /* true if init is completed */
+	uint32_t fd_dead:1; /* Scheduled for deletion */
+	uint32_t fd_linked:1; /* true if linked */
+	uint32_t fd_need_rebuild:1; /* true if rebuild scheduled */
+	uint32_t fd_force_eval:1;/* true if forced re-evaluation scheduled */
+ uint8_t fd_family; /* family */
+ uint32_t fd_fibnum; /* fibnum */
+ uint32_t fd_failed_rebuilds; /* stat: failed rebuilds */
+ struct callout fd_callout; /* rebuild callout */
+ void *fd_algo_data; /* algorithm data */
+ struct nhop_object **nh_idx; /* nhop idx->ptr array */
+ struct nhop_ref_table *nh_ref_table; /* array with # of nhop references */
+ struct rib_head *fd_rh; /* RIB table we're attached to */
+ struct rib_subscription *fd_rs; /* storing table subscription */
+ struct fib_algo_calldata *fa;
+ struct fib_dp fd_dp; /* fib datapath data */
+	struct vnet *fd_vnet; /* vnet this instance belongs to */
+ struct epoch_context fd_epoch_ctx;
+ uint64_t gencnt;
+ struct fib_lookup_module *fd_flm;
+ uint32_t fd_num_changes; /* number of changes since last callout */
+ TAILQ_ENTRY(fib_data) entries; /* list of all fds in vnet */
+};
+
+static void rebuild_callout(void *_data);
+static void destroy_instance_epoch(epoch_context_t ctx);
+static enum flm_op_result switch_algo(struct fib_data *fd);
+static struct fib_lookup_module *find_algo(const char *algo_name, int family);
+
+static struct fib_lookup_module *fib_check_best_algo(struct rib_head *rh,
+ struct fib_lookup_module *orig_flm);
+
+struct mtx fib_mtx;
+#define MOD_LOCK() mtx_lock(&fib_mtx)
+#define MOD_UNLOCK() mtx_unlock(&fib_mtx)
+
+
+/* Algorithm has to be this percent better than the current to switch */
+#define BEST_DIFF_PERCENT (5 * 256 / 100)
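+/*
+ * Example: preferences are scaled 0..255, so the threshold equals
+ * (5 * 256 / 100) == 12; with the current algo at preference 200, a
+ * candidate needs preference > 212 to trigger a switch (see
+ * fib_check_best_algo()).
+ */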
+/* Schedule algo re-evaluation X seconds after a change */
+#define ALGO_EVAL_DELAY_MS 30000
+/* Force algo re-evaluation after X changes */
+#define ALGO_EVAL_NUM_ROUTES 100
+/* Try to setup algorithm X times */
+#define FIB_MAX_TRIES 32
+/* Max amount of supported nexthops */
+#define FIB_MAX_NHOPS 262144
+#define FIB_CALLOUT_DELAY_MS 50
+
+
+/* TODO: per-VNET */
+static TAILQ_HEAD(fib_data_head, fib_data) fib_data_list = TAILQ_HEAD_INITIALIZER(fib_data_list);
+
+struct fib_dp_header {
+ struct epoch_context ffi_epoch_ctx;
+ uint32_t ffi_num_tables;
+ struct fib_dp ffi_idx[0];
+};
+
+static TAILQ_HEAD(, fib_lookup_module) all_algo_list;
+
+#ifdef RTDEBUG
+#define RH_PRINTF(_rh, _fmt, ...) printf("[rt_algo] %s.%u %s: " _fmt "\n", \
+ print_family(_rh->rib_family), _rh->rib_fibnum, __func__ , ## __VA_ARGS__)
+#define RH_PRINTF_RAW(_fmt, ...) printf("[rt_algo] %s: " _fmt "\n", __func__ , ## __VA_ARGS__)
+#define FD_PRINTF(fd, _fmt, ...) printf("[rt_algo] %s.%u (%s) %s: " _fmt "\n",\
+ print_family(fd->fd_family), fd->fd_fibnum, fd->fd_flm->flm_name, __func__, \
+ ##__VA_ARGS__)
+#else
+#define FD_PRINTF(fd, _fmt, ...)
+#define RH_PRINTF(_rh, _fmt, ...)
+#define RH_PRINTF_RAW(_fmt, ...)
+#endif
+
+static const char *
+print_family(int family)
+{
+ if (family == AF_INET)
+ return ("inet");
+ else if (family == AF_INET6)
+ return ("inet6");
+ else
+ return ("unknown");
+}
+
+static int
+print_algos(struct sysctl_req *req, int family)
+{
+ struct fib_lookup_module *flm;
+ struct sbuf sbuf;
+ int error, count = 0;
+
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error == 0) {
+ sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
+ TAILQ_FOREACH(flm, &all_algo_list, entries) {
+ if (flm->flm_family == family) {
+ if (count++ > 0)
+ sbuf_cat(&sbuf, ", ");
+ sbuf_cat(&sbuf, flm->flm_name);
+ }
+ }
+ error = sbuf_finish(&sbuf);
+ sbuf_delete(&sbuf);
+ }
+ return (error);
+}
+
+static int
+print_algos_inet6(SYSCTL_HANDLER_ARGS)
+{
+
+ return (print_algos(req, AF_INET6));
+}
+SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo_list,
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
+ print_algos_inet6, "A", "List of algos");
+
+static int
+print_algos_inet(SYSCTL_HANDLER_ARGS)
+{
+
+ return (print_algos(req, AF_INET));
+}
+SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo_list,
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
+ print_algos_inet, "A", "List of algos");
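+
+/*
+ * Example: with the two IPv4 modules registered in in_fib_algo.c,
+ * `sysctl net.route.algo.inet.algo_list` prints
+ * "radix4_lockless, radix4" (registration order).
+ */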
+
+
+static struct fib_lookup_module *
+find_algo(const char *algo_name, int family)
+{
+ struct fib_lookup_module *flm;
+
+ TAILQ_FOREACH(flm, &all_algo_list, entries) {
+ if ((strcmp(flm->flm_name, algo_name) == 0) &&
+ (family == flm->flm_family))
+ return (flm);
+ }
+
+ return (NULL);
+}
+
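+/*
+ * Computes the rebuild callout delay with exponential backoff:
+ * e.g. after 3 failed rebuilds the delay is (1 << 3) * 50 == 400ms,
+ * with the shift capped at 10 (51200ms).
+ */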
+static uint32_t
+callout_calc_delay(struct fib_data *fd)
+{
+ uint32_t shift;
+
+ if (fd->fd_failed_rebuilds > 10)
+ shift = 10;
+ else
+ shift = fd->fd_failed_rebuilds;
+
+ return ((1 << shift) * FIB_CALLOUT_DELAY_MS);
+}
+
+static void
+schedule_callout(struct fib_data *fd, int delay_ms)
+{
+
+ callout_reset_sbt(&fd->fd_callout, 0, SBT_1MS * delay_ms,
+ rebuild_callout, fd, 0);
+}
+
+static void
+schedule_algo_eval(struct fib_data *fd)
+{
+
+ if (fd->fd_num_changes++ == 0) {
+ /* Start callout to consider switch */
+ MOD_LOCK();
+ if (!callout_pending(&fd->fd_callout))
+ schedule_callout(fd, ALGO_EVAL_DELAY_MS);
+ MOD_UNLOCK();
+ } else if (fd->fd_num_changes > ALGO_EVAL_NUM_ROUTES && !fd->fd_force_eval) {
+ /* Reset callout to exec immediately */
+ MOD_LOCK();
+ if (!fd->fd_need_rebuild) {
+ fd->fd_force_eval = true;
+ schedule_callout(fd, 1);
+ }
+ MOD_UNLOCK();
+ }
+}
+
+/*
+ * rib subscription handler
+ */
+static void
+handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
+ void *_data)
+{
+ struct fib_data *fd = (struct fib_data *)_data;
+ enum flm_op_result result;
+
+ RIB_WLOCK_ASSERT(rnh);
+
+ if (!fd->init_done)
+ return;
+
+ schedule_algo_eval(fd);
+
+ result = fd->fd_flm->flm_change_rib_item_cb(rnh, rc, fd->fd_algo_data);
+
+ switch (result) {
+ case FLM_SUCCESS:
+ break;
+ case FLM_REBUILD:
+ /*
+ * Algo reported inability to handle,
+ * schedule algo rebuild.
+ */
+ MOD_LOCK();
+ if (!fd->fd_need_rebuild) {
+ fd->fd_need_rebuild = true;
+ /*
+ * Potentially rewrites pending callout
+ * to re-evaluate algo.
+ */
+ FD_PRINTF(fd, "Scheduling rebuilt");
+ schedule_callout(fd, callout_calc_delay(fd));
+ }
+ MOD_UNLOCK();
+ break;
+ default:
+ /*
+ * Algo reported a non-recoverable error.
+ * Remove and switch to radix?
+ */
+ FD_PRINTF(fd, "algo reported non-recoverable error");
+ // TODO: switch to radix
+ }
+}
+
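+/*
+ * Sizes the nexthop index table for a new instance: start at 16 entries
+ * and double the previous size each time the old instance ran out of
+ * slots while below FIB_MAX_NHOPS.
+ */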
+static void
+estimate_scale(const struct fib_data *old_fd, struct fib_data *fd)
+{
+
+ if (old_fd == NULL) {
+ fd->number_nhops = 16;
+ return;
+ }
+
+ if (old_fd->hit_nhops && old_fd->number_nhops < FIB_MAX_NHOPS)
+ fd->number_nhops = 2 * old_fd->number_nhops;
+ else
+ fd->number_nhops = old_fd->number_nhops;
+}
+
+struct walk_cbdata {
+ struct fib_data *fd;
+ flm_dump_t *func;
+ enum flm_op_result result;
+};
+
+static void
+sync_algo_end_cb(struct rib_head *rnh, enum rib_walk_hook stage, void *_data)
+{
+ struct walk_cbdata *w = (struct walk_cbdata *)_data;
+ struct fib_data *fd = w->fd;
+
+ if (rnh->rib_dying) {
+ w->result = FLM_ERROR;
+ return;
+ }
+
+ if (stage != RIB_WALK_HOOK_POST || w->result != FLM_SUCCESS)
+ return;
+
+ if (fd->hit_nhops) {
+ FD_PRINTF(fd, "ran out of nexthops at %u nhops",
+ fd->nh_ref_table->count);
+ w->result = FLM_REBUILD;
+ return;
+ }
+
+ w->result = fd->fd_flm->flm_dump_end_cb(fd->fd_algo_data, &fd->fd_dp);
+
+ if (w->result == FLM_SUCCESS) {
+ /* Mark init as done to allow routing updates */
+ fd->init_done = 1;
+ }
+}
+
+static int
+sync_algo_cb(struct rtentry *rt, void *_data)
+{
+ struct walk_cbdata *w = (struct walk_cbdata *)_data;
+ enum flm_op_result result;
+
+ if (w->result == FLM_SUCCESS && w->func) {
+ result = w->func(rt, w->fd->fd_algo_data);
+ if (result != FLM_SUCCESS)
+ w->result = result;
+ }
+
+ return (0);
+}
+
+static enum flm_op_result
+sync_algo(struct fib_data *fd)
+{
+ struct walk_cbdata w;
+
+ w.fd = fd;
+ w.func = fd->fd_flm->flm_dump_rib_item_cb;
+ w.result = FLM_SUCCESS;
+
+ rib_walk_ext_internal(fd->fd_rh, true, sync_algo_cb, sync_algo_end_cb, &w);
+
+ FD_PRINTF(fd, "initial dump completed.");
+
+ return (w.result);
+}
+
+/*
+ * Assume already unlinked from datapath
+ */
+static int
+schedule_destroy_instance(struct fib_data *fd, bool in_callout)
+{
+ bool is_dead;
+
+ NET_EPOCH_ASSERT();
+
+ MOD_LOCK();
+ is_dead = fd->fd_dead;
+ if (!is_dead)
+ fd->fd_dead = true;
+ if (fd->fd_linked) {
+ TAILQ_REMOVE(&fib_data_list, fd, entries);
+ fd->fd_linked = false;
+ }
+ MOD_UNLOCK();
+ if (is_dead)
+ return (0);
+
+ FD_PRINTF(fd, "DETACH");
+
+ if (fd->fd_rs != NULL)
+ rib_unsibscribe(fd->fd_rs);
+
+ /*
+ * After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls
+ * will be executed, hence no _new_ callout schedules will happen.
+ *
+	 * There are two possible scenarios here:
+ * 1) we're running inside a callout when we're deleting ourselves
+ * due to migration to a newer fd
+ * 2) we're running from rt_table_destroy() and callout is scheduled
+ * for execution OR is executing
+ *
+ * For (2) we need to wait for the callout termination, as the routing table
+ * will be destroyed after this function returns.
+ * For (1) we cannot call drain, but can ensure that this is the last invocation.
+ */
+
+ if (in_callout)
+ callout_stop(&fd->fd_callout);
+ else
+ callout_drain(&fd->fd_callout);
+
+ /*
+	 * At this point, no other pending work is scheduled.
+ */
+ FD_PRINTF(fd, "destroying old instance");
+ epoch_call(net_epoch_preempt, destroy_instance_epoch,
+ &fd->fd_epoch_ctx);
+
+ return (0);
+}
+
+void
+fib_destroy_rib(struct rib_head *rh)
+{
+ struct fib_data_head tmp_head = TAILQ_HEAD_INITIALIZER(tmp_head);
+ struct fib_data *fd, *fd_tmp;
+
+	/*
+	 * At this point the rib_dying flag is set on @rh, so all new fd's
+	 * will fail at the sync_algo() stage and nothing new will be added
+	 * to the list.
+	 */
+ MOD_LOCK();
+ TAILQ_FOREACH_SAFE(fd, &fib_data_list, entries, fd_tmp) {
+ if (fd->fd_rh == rh) {
+ TAILQ_REMOVE(&fib_data_list, fd, entries);
+ fd->fd_linked = false;
+ TAILQ_INSERT_TAIL(&tmp_head, fd, entries);
+ }
+ }
+ MOD_UNLOCK();
+
+ /* Pass 2: remove each entry */
+ TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) {
+ schedule_destroy_instance(fd, false);
+ }
+}
+
+static void
+destroy_instance(struct fib_data *fd)
+{
+
+ FD_PRINTF(fd, "destroy fd %p", fd);
+
+ /* Call destroy callback first */
+ if (fd->fd_algo_data != NULL)
+ fd->fd_flm->flm_destroy_cb(fd->fd_algo_data);
+
+ /* Nhop table */
+ if (fd->nh_idx != NULL) {
+ for (int i = 0; i < fd->number_nhops; i++) {
+ if (fd->nh_idx[i] != NULL) {
+ FD_PRINTF(fd, " FREE nhop %d %p", i, fd->nh_idx[i]);
+ nhop_free_any(fd->nh_idx[i]);
+ }
+ }
+ free(fd->nh_idx, M_RTABLE);
+ }
+ if (fd->nh_ref_table != NULL)
+ free(fd->nh_ref_table, M_RTABLE);
+
+ MOD_LOCK();
+ fd->fd_flm->flm_refcount--;
+ MOD_UNLOCK();
+
+ free(fd, M_RTABLE);
+}
+
+/*
+ * Epoch callback indicating fd is safe to destroy
+ */
+static void
+destroy_instance_epoch(epoch_context_t ctx)
+{
+ struct fib_data *fd;
+
+ fd = __containerof(ctx, struct fib_data, fd_epoch_ctx);
+
+ destroy_instance(fd);
+}
+
+static enum flm_op_result
+try_setup_instance(struct fib_lookup_module *flm, struct rib_head *rh,
+ struct fib_data *old_fd, struct fib_data **pfd)
+{
+ struct fib_data *fd;
+ size_t size;
+ enum flm_op_result result;
+
+ /* Allocate */
+ fd = malloc(sizeof(struct fib_data), M_RTABLE, M_NOWAIT | M_ZERO);
+ if (fd == NULL) {
+ *pfd = NULL;
+ return (FLM_REBUILD);
+ }
+ *pfd = fd;
+
+ estimate_scale(old_fd, fd);
+
+ fd->fd_rh = rh;
+ fd->fd_family = rh->rib_family;
+ fd->fd_fibnum = rh->rib_fibnum;
+ callout_init(&fd->fd_callout, 1);
+ fd->fd_vnet = curvnet;
+ fd->fd_flm = flm;
+
+ /* Allocate nhidx -> nhop_ptr table */
+ size = fd->number_nhops * sizeof(void *);
+ //FD_PRINTF(fd, "malloc(%lu)", size);
+ fd->nh_idx = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
+ if (fd->nh_idx == NULL) {
+ FD_PRINTF(fd, "Unable to allocate nhop table idx (sz:%zu)", size);
+ return (FLM_REBUILD);
+ }
+
+ /* Allocate nhop index refcount table */
+ size = sizeof(struct nhop_ref_table);
+ size += fd->number_nhops * sizeof(uint32_t);
+ //FD_PRINTF(fd, "malloc(%lu)", size);
+ fd->nh_ref_table = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
+ if (fd->nh_ref_table == NULL) {
+ FD_PRINTF(fd, "Unable to allocate nhop refcount table (sz:%zu)", size);
+ return (FLM_REBUILD);
+ }
+
+ /* Okay, we're ready for algo init */
+ void *old_algo_data = (old_fd != NULL) ? old_fd->fd_algo_data : NULL;
+ result = flm->flm_init_cb(fd->fd_fibnum, fd, old_algo_data, &fd->fd_algo_data);
+ if (result != FLM_SUCCESS)
+ return (result);
+
+ /* Try to subscribe */
+ if (flm->flm_change_rib_item_cb != NULL) {
+ fd->fd_rs = rib_subscribe_internal(fd->fd_rh,
+ handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE, 0);
+ if (fd->fd_rs == NULL)
+ return (FLM_REBUILD);
+ }
+
+ /* Dump */
+ result = sync_algo(fd);
+ if (result != FLM_SUCCESS)
+ return (result);
+ FD_PRINTF(fd, "DUMP completed successfully.");
+
+ MOD_LOCK();
+ TAILQ_INSERT_TAIL(&fib_data_list, fd, entries);
+ fd->fd_linked = true;
+ MOD_UNLOCK();
+
+ return (FLM_SUCCESS);
+}
+
+/*
+ * Sets up algo @flm for table @rh and links it to the datapath.
+ *
+ */
+static enum flm_op_result
+setup_instance(struct fib_lookup_module *flm, struct rib_head *rh,
+ struct fib_data *orig_fd, struct fib_data **pfd, bool attach)
+{
+ struct fib_data *prev_fd, *new_fd;
+ struct epoch_tracker et;
+ enum flm_op_result result;
+
+ prev_fd = orig_fd;
+ new_fd = NULL;
+ for (int i = 0; i < FIB_MAX_TRIES; i++) {
+ NET_EPOCH_ENTER(et);
+ result = try_setup_instance(flm, rh, prev_fd, &new_fd);
+
+ if ((result == FLM_SUCCESS) && attach)
+ result = switch_algo(new_fd);
+
+ if ((prev_fd != NULL) && (prev_fd != orig_fd)) {
+ schedule_destroy_instance(prev_fd, false);
+ prev_fd = NULL;
+ }
+ NET_EPOCH_EXIT(et);
+
+ RH_PRINTF(rh, "try %d: fib algo result: %d", i, result);
+
+ if (result == FLM_REBUILD) {
+ prev_fd = new_fd;
+ new_fd = NULL;
+ continue;
+ }
+
+ break;
+ }
+
+ if (result != FLM_SUCCESS) {
+ /* update failure count */
+ MOD_LOCK();
+ if (orig_fd != NULL)
+ orig_fd->fd_failed_rebuilds++;
+ MOD_UNLOCK();
+
+ NET_EPOCH_ENTER(et);
+ if ((prev_fd != NULL) && (prev_fd != orig_fd))
+ schedule_destroy_instance(prev_fd, false);
+ if (new_fd != NULL) {
+ schedule_destroy_instance(new_fd, false);
+ new_fd = NULL;
+ }
+ NET_EPOCH_EXIT(et);
+ }
+
+ *pfd = new_fd;
+ return (result);
+}
+
+static void
+rebuild_callout(void *_data)
+{
+ struct fib_data *fd, *fd_new;
+ struct fib_lookup_module *flm_new;
+ struct epoch_tracker et;
+ enum flm_op_result result;
+ bool need_rebuild = false;
+
+ fd = (struct fib_data *)_data;
+
+ MOD_LOCK();
+ need_rebuild = fd->fd_need_rebuild;
+ fd->fd_need_rebuild = false;
+ fd->fd_force_eval = false;
+ fd->fd_num_changes = 0;
+ MOD_UNLOCK();
+
+ CURVNET_SET(fd->fd_vnet);
+
+ /* First, check if we're still OK to use this algo */
+ flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm);
+ if ((flm_new == NULL) && (!need_rebuild)) {
+ /* Keep existing algo, no need to rebuild. */
+ CURVNET_RESTORE();
+ return;
+ }
+
+	struct fib_lookup_module *flm = (flm_new != NULL) ? flm_new : fd->fd_flm;
+	struct fib_data *fd_tmp = (flm_new == NULL) ? fd : NULL;
+	result = setup_instance(flm, fd->fd_rh, fd_tmp, &fd_new, true);
+ if (result != FLM_SUCCESS) {
+ FD_PRINTF(fd, "table rebuild failed");
+ CURVNET_RESTORE();
+ return;
+ }
+ FD_PRINTF(fd_new, "switched to new instance");
+
+ /* Remove old */
+ if (fd != NULL) {
+ NET_EPOCH_ENTER(et);
+ schedule_destroy_instance(fd, true);
+ NET_EPOCH_EXIT(et);
+ }
+
+ CURVNET_RESTORE();
+}
+
+static int
+set_algo_sysctl_handler(SYSCTL_HANDLER_ARGS)
+{
+ int error = 0;
+#if 0
+ struct epoch_tracker et;
+ struct fib_lookup_module *flm;
+ struct fib_data *old_fd, *fd;
+ char old_algo_name[32], algo_name[32];
+ uint32_t fibnum;
+ int error;
+
+ fibnum = RT_DEFAULT_FIB;
+
+ if (old_fd == NULL) {
+ strlcpy(old_algo_name, "radix", sizeof(old_algo_name));
+ } else {
+ strlcpy(old_algo_name, fd_ptr->fd_flm->flm_name,
+ sizeof(old_algo_name));
+ }
+ strlcpy(algo_name, old_algo_name, sizeof(algo_name));
+ error = sysctl_handle_string(oidp, algo_name, sizeof(algo_name), req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+
+ if (strcmp(algo_name, old_algo_name) == 0)
+ return (0);
+
+ if (strcmp(algo_name, "radix") == 0) {
+ /* teardown old one */
+ NET_EPOCH_ENTER(et);
+ MOD_LOCK();
+ old_fd = fd_ptr;
+ fd_ptr = NULL;
+ MOD_UNLOCK();
+
+ if (old_fd != NULL)
+ schedule_destroy_instance(old_fd);
+ NET_EPOCH_EXIT(et);
+ return (0);
+ }
+
+ MOD_LOCK();
+ flm = find_algo(algo_name, AF_INET6);
+ if (flm != NULL)
+ flm->flm_refcount++;
+ MOD_UNLOCK();
+
+ if (flm == NULL) {
+ DPRINTF("unable to find algo %s", algo_name);
+ return (ESRCH);
+ }
+ DPRINTF("inet6.%u: requested fib algo %s", fibnum, algo_name);
+
+ fd = setup_instance(flm, fibnum, NULL, &error);
+
+ if (error != 0) {
+ MOD_LOCK();
+ flm->flm_refcount--;
+ MOD_UNLOCK();
+ return (error);
+ }
+
+ MOD_LOCK();
+ old_fd = fd_ptr;
+ fd_ptr = fd;
+ MOD_UNLOCK();
+
+ /* Remove old */
+ NET_EPOCH_ENTER(et);
+ if (old_fd != NULL) {
+ error = schedule_destroy_instance(old_fd);
+ }
+ NET_EPOCH_EXIT(et);
+#endif
+
+	/* Drain callbacks so the user can unload the module afterwards if desired */
+ epoch_drain_callbacks(net_epoch_preempt);
+
+ return (error);
+}
+SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo,
+ CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
+ set_algo_sysctl_handler, "A",
+ "Set");
+
+static void
+destroy_fdh_epoch(epoch_context_t ctx)
+{
+ struct fib_dp_header *ffi;
+
+ ffi = __containerof(ctx, struct fib_dp_header, ffi_epoch_ctx);
+ free(ffi, M_RTABLE);
+}
+
+static struct fib_dp_header *
+alloc_fib_dp_array(uint32_t num_tables, bool waitok)
+{
+ size_t sz;
+ struct fib_dp_header *ffi;
+
+ sz = sizeof(struct fib_dp_header);
+ sz += sizeof(struct fib_dp) * num_tables;
+ ffi = malloc(sz, M_RTABLE, (waitok ? M_WAITOK : M_NOWAIT) | M_ZERO);
+ if (ffi != NULL)
+ ffi->ffi_num_tables = num_tables;
+ return (ffi);
+}
+
+static struct fib_dp_header *
+get_fib_dp_header(struct fib_dp *dp)
+{
+
+ return (__containerof((void *)dp, struct fib_dp_header, ffi_idx));
+}
+
+/*
+ * Replace per-family index pool @pdp with a new one which
+ * contains updated callback/algo data from @fd.
+ * Returns 0 on success.
+ */
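+/*
+ * Note: this is the usual epoch-based replacement pattern: allocate a
+ * copy, update the single slot, publish the new pointer after a release
+ * fence, and epoch_call() the old array for freeing, so datapath readers
+ * running inside the net epoch never observe a torn update.
+ */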
+static enum flm_op_result
+replace_rtables_family(struct fib_dp **pdp, struct fib_data *fd)
+{
+ struct fib_dp_header *new_ffi, *old_ffi;
+
+ NET_EPOCH_ASSERT();
+
+ FD_PRINTF(fd, "[vnet %p] replace with f:%p arg:%p", curvnet, fd->fd_dp.f, fd->fd_dp.arg);
+
+ MOD_LOCK();
+ old_ffi = get_fib_dp_header(*pdp);
+ new_ffi = alloc_fib_dp_array(old_ffi->ffi_num_tables, false);
+ FD_PRINTF(fd, "OLD FFI: %p NEW FFI: %p", old_ffi, new_ffi);
+ if (new_ffi == NULL) {
+ MOD_UNLOCK();
+ FD_PRINTF(fd, "error attaching datapath");
+ return (FLM_REBUILD);
+ }
+
+ memcpy(&new_ffi->ffi_idx[0], &old_ffi->ffi_idx[0],
+ old_ffi->ffi_num_tables * sizeof(struct fib_dp));
+ /* Update relevant data structure for @fd */
+ new_ffi->ffi_idx[fd->fd_fibnum] = fd->fd_dp;
+
+ /* Ensure memcpy() writes have completed */
+ atomic_thread_fence_rel();
+ /* Set new datapath pointer */
+ *pdp = &new_ffi->ffi_idx[0];
+ MOD_UNLOCK();
+ FD_PRINTF(fd, "update %p -> %p", old_ffi, new_ffi);
+
+ epoch_call(net_epoch_preempt, destroy_fdh_epoch,
+ &old_ffi->ffi_epoch_ctx);
+
+ return (FLM_SUCCESS);
+}
+
+static struct fib_dp **
+get_family_ptr(int family)
+{
+ switch (family) {
+ case AF_INET:
+ return (&V_inet_dp);
+ case AF_INET6:
+ return (&V_inet6_dp);
+ }
+ return (NULL);
+}
+
+static enum flm_op_result
+switch_algo(struct fib_data *fd)
+{
+ struct fib_dp **pdp;
+
+ pdp = get_family_ptr(fd->fd_family);
+ return (replace_rtables_family(pdp, fd));
+}
+
+/*
+ * Grow datapath pointers array.
+ * Called from sysctl handler on growing number of routing tables.
+ */
+static void
+grow_rtables_family(struct fib_dp **pdp, uint32_t new_num_tables)
+{
+ struct fib_dp_header *new_fdh, *old_fdh = NULL;
+
+ new_fdh = alloc_fib_dp_array(new_num_tables, true);
+
+ MOD_LOCK();
+ if (*pdp != NULL) {
+ old_fdh = get_fib_dp_header(*pdp);
+ memcpy(&new_fdh->ffi_idx[0], &old_fdh->ffi_idx[0],
+ old_fdh->ffi_num_tables * sizeof(struct fib_dp));
+ }
+
+ /* Wait till all writes completed */
+ atomic_thread_fence_rel();
+
+ *pdp = &new_fdh->ffi_idx[0];
+ MOD_UNLOCK();
+
+ if (old_fdh != NULL)
+ epoch_call(net_epoch_preempt, destroy_fdh_epoch,
+ &old_fdh->ffi_epoch_ctx);
+}
+
+/*
+ * Grows per-AF arrays of datapath pointers for each supported family.
+ * Called from fibs resize sysctl handler.
+ */
+void
+fib_grow_rtables(uint32_t new_num_tables)
+{
+
+ grow_rtables_family(get_family_ptr(AF_INET), new_num_tables);
+ grow_rtables_family(get_family_ptr(AF_INET6), new_num_tables);
+}
+
+void
+fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo)
+{
+
+ bzero(rinfo, sizeof(struct rib_rtable_info));
+ rinfo->num_prefixes = rh->rnh_prefixes;
+ rinfo->num_nhops = nhops_get_count(rh);
+ rinfo->num_nhgrp = nhgrp_get_count(rh);
+}
+
+struct rib_head *
+fib_get_rh(struct fib_data *fd)
+{
+
+ return (fd->fd_rh);
+}
+
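+/*
+ * Maps nhop/nhgrp private indices into a single index space.
+ * Example: with ROUTE_MPATH, nexthops occupy even slots and nexthop
+ * groups odd ones: nhop idx 3 -> 6, nhgrp idx 3 -> 5.
+ */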
+static uint32_t
+get_nhop_idx(struct nhop_object *nh)
+{
+#ifdef ROUTE_MPATH
+ if (NH_IS_NHGRP(nh))
+ return (nhgrp_get_idx((struct nhgrp_object *)nh) * 2 - 1);
+ else
+ return (nhop_get_idx(nh) * 2);
+#else
+ return (nhop_get_idx(nh));
+#endif
+}
+
+
+uint32_t
+fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh)
+{
+ uint32_t idx = get_nhop_idx(nh);
+
+ if (idx >= fd->number_nhops) {
+ fd->hit_nhops = 1;
+ return (0);
+ }
+
+ if (fd->nh_idx[idx] == NULL) {
+ nhop_ref_any(nh);
+ fd->nh_idx[idx] = nh;
+ fd->nh_ref_table->count++;
+ FD_PRINTF(fd, " REF nhop %u %p", idx, fd->nh_idx[idx]);
+ }
+ fd->nh_ref_table->refcnt[idx]++;
+
+ return (idx);
+}
+
+struct nhop_release_data {
+ struct nhop_object *nh;
+ struct epoch_context ctx;
+};
+
+static void
+release_nhop_epoch(epoch_context_t ctx)
+{
+ struct nhop_release_data *nrd;
+
+ nrd = __containerof(ctx, struct nhop_release_data, ctx);
+ nhop_free_any(nrd->nh);
+ free(nrd, M_RTABLE);
+}
+
+static void
+fib_schedule_release_nhop(struct fib_data *fd, struct nhop_object *nh)
+{
+ struct nhop_release_data *nrd;
+
+ nrd = malloc(sizeof(struct nhop_release_data), M_RTABLE, M_NOWAIT | M_ZERO);
+ if (nrd != NULL) {
+ nrd->nh = nh;
+ epoch_call(net_epoch_preempt, release_nhop_epoch, &nrd->ctx);
+ } else {
+		/*
+		 * Unable to allocate memory. Leak the nexthop to maintain the
+		 * guarantee that each nhop referenced from the datapath stays
+		 * alive.
+		 */
+ FD_PRINTF(fd, "unable to allocate structure for nhop %p deletion", nh);
+ }
+}
+
+void
+fib_free_nhop_idx(struct fib_data *fd, uint32_t idx)
+{
+
+ KASSERT((idx < fd->number_nhops), ("invalid nhop index"));
+
+ fd->nh_ref_table->refcnt[idx]--;
+ if (fd->nh_ref_table->refcnt[idx] == 0) {
+ FD_PRINTF(fd, " FREE nhop %d %p", idx, fd->nh_idx[idx]);
+ fib_schedule_release_nhop(fd, fd->nh_idx[idx]);
+ }
+}
+
+void
+fib_free_nhop(struct fib_data *fd, struct nhop_object *nh)
+{
+
+ fib_free_nhop_idx(fd, get_nhop_idx(nh));
+}
+
+struct nhop_object **
+fib_get_nhop_array(struct fib_data *fd)
+{
+
+ return (fd->nh_idx);
+}
+
+static struct fib_lookup_module *
+fib_check_best_algo(struct rib_head *rh, struct fib_lookup_module *orig_flm)
+{
+ uint8_t preference, curr_preference = 0, best_preference = 0;
+ struct fib_lookup_module *flm, *best_flm = NULL;
+ struct rib_rtable_info rinfo;
+ int candidate_algos = 0;
+
+ fib_get_rtable_info(rh, &rinfo);
+
+ MOD_LOCK();
+ TAILQ_FOREACH(flm, &all_algo_list, entries) {
+ if (flm->flm_family != rh->rib_family)
+ continue;
+ candidate_algos++;
+ preference = flm->flm_get_pref(&rinfo);
+ if (preference > best_preference) {
+ best_preference = preference;
+ best_flm = flm;
+ }
+ if (flm == orig_flm)
+ curr_preference = preference;
+ }
+ if (best_flm != NULL && best_flm != orig_flm) {
+		/* Switch only if the candidate is sufficiently better */
+ if (curr_preference + BEST_DIFF_PERCENT < best_preference)
+ best_flm->flm_refcount++;
+ else
+ best_flm = NULL;
+ } else
+ best_flm = NULL;
+ MOD_UNLOCK();
+
+ RH_PRINTF(rh, "candidate_algos: %d, curr: %s(%d) result: %s(%d)",
+ candidate_algos, orig_flm ? orig_flm->flm_name : "NULL", curr_preference,
+ best_flm ? best_flm->flm_name : "NULL", best_preference);
+
+ return (best_flm);
+}
+
+/*
+ * Called when new route table is created.
+ * Selects, allocates and attaches fib algo for the table.
+ */
+int
+fib_select_algo_initial(struct rib_head *rh)
+{
+ struct fib_lookup_module *flm;
+ struct fib_data *fd = NULL;
+ enum flm_op_result result;
+
+ flm = fib_check_best_algo(rh, NULL);
+ if (flm == NULL) {
+ RH_PRINTF(rh, "no algo selected");
+ return (ENOENT);
+ }
+ RH_PRINTF(rh, "selected algo %s", flm->flm_name);
+
+ result = setup_instance(flm, rh, NULL, &fd, false);
+ RH_PRINTF(rh, "result=%d fd=%p", result, fd);
+ if (result == FLM_SUCCESS) {
+ /*
+ * Attach datapath directly to avoid N reallocations
+ * during fib growth
+ */
+ struct fib_dp_header *fdp;
+ struct fib_dp **pdp;
+
+ pdp = get_family_ptr(rh->rib_family);
+ if (pdp != NULL) {
+ fdp = get_fib_dp_header(*pdp);
+ fdp->ffi_idx[fd->fd_fibnum] = fd->fd_dp;
+ FD_PRINTF(fd, "datapath attached");
+ }
+ }
+
+	return ((result == FLM_SUCCESS) ? 0 : EINVAL);
+}
+
+int
+fib_module_register(struct fib_lookup_module *flm)
+{
+
+ MOD_LOCK();
+ RH_PRINTF_RAW("linking %s (%p)", flm->flm_name, flm);
+ TAILQ_INSERT_TAIL(&all_algo_list, flm, entries);
+ MOD_UNLOCK();
+
+ return (0);
+}
+
+int
+fib_module_unregister(struct fib_lookup_module *flm)
+{
+ MOD_LOCK();
+ if (flm->flm_refcount > 0) {
+ MOD_UNLOCK();
+ return (EBUSY);
+ }
+ RH_PRINTF_RAW("unlinking %s (%p)", flm->flm_name, flm);
+ TAILQ_REMOVE(&all_algo_list, flm, entries);
+ MOD_UNLOCK();
+
+ return (0);
+}
+
+int
+fib_module_clone(const struct fib_lookup_module *flm_orig,
+ struct fib_lookup_module *flm, bool waitok)
+{
+
+ return (0);
+}
+
+int
+fib_module_dumptree(struct fib_lookup_module *flm,
+ enum rib_subscription_type subscription_type)
+{
+
+ return (0);
+}
+
+static void
+fib_algo_init(void)
+{
+
+ mtx_init(&fib_mtx, "algo list mutex", NULL, MTX_DEF);
+ TAILQ_INIT(&all_algo_list);
+}
+SYSINIT(fib_algo_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND, fib_algo_init, NULL);
+
Index: sys/net/route/route_ctl.h
===================================================================
--- sys/net/route/route_ctl.h
+++ sys/net/route/route_ctl.h
@@ -72,6 +72,8 @@
void *arg);
void rib_walk_ext(uint32_t fibnum, int af, bool wlock, rib_walktree_f_t *wa_f,
rib_walk_hook_f_t *hook_f, void *arg);
+void rib_walk_ext_internal(struct rib_head *rnh, bool wlock,
+ rib_walktree_f_t *wa_f, rib_walk_hook_f_t *hook_f, void *arg);
void rib_walk_del(u_int fibnum, int family, rib_filter_f_t *filter_f,
void *arg, bool report);
@@ -87,6 +89,10 @@
const struct rtentry *rib_lookup_lpm(uint32_t fibnum, int family,
const struct sockaddr *dst, struct route_nhop_data *rnd);
+/* Nhops */
+void nhop_ref_any(struct nhop_object *nh);
+void nhop_free_any(struct nhop_object *nh);
+
/* Multipath */
struct nhgrp_object;
struct weightened_nhop;
@@ -109,6 +115,6 @@
struct rib_subscription *rib_subscribe_internal(struct rib_head *rnh,
rib_subscription_cb_t *f, void *arg, enum rib_subscription_type type,
bool waitok);
-int rib_unsibscribe(uint32_t fibnum, int family, struct rib_subscription *rs);
+void rib_unsibscribe(struct rib_subscription *rs);
#endif
Index: sys/net/route/route_ctl.c
===================================================================
--- sys/net/route/route_ctl.c
+++ sys/net/route/route_ctl.c
@@ -70,6 +70,7 @@
CK_STAILQ_ENTRY(rib_subscription) next;
rib_subscription_cb_t *func;
void *arg;
+ struct rib_head *rnh;
enum rib_subscription_type type;
struct epoch_context epoch_ctx;
};
@@ -669,6 +670,8 @@
/* Finalize notification */
rnh->rnh_gen++;
+ rnh->rnh_prefixes--;
+
rc->rc_cmd = RTM_DELETE;
rc->rc_rt = rt;
rc->rc_nh_old = rt->rt_nhop;
@@ -929,6 +932,7 @@
/* Finalize notification */
rnh->rnh_gen++;
+ rnh->rnh_prefixes++;
rc->rc_cmd = RTM_ADD;
rc->rc_rt = rt;
@@ -984,6 +988,8 @@
/* Finalize notification */
rnh->rnh_gen++;
+ if (rnd->rnd_nhop == NULL)
+ rnh->rnh_prefixes--;
rc->rc_cmd = (rnd->rnd_nhop != NULL) ? RTM_CHANGE : RTM_DELETE;
rc->rc_rt = rt;
@@ -1222,7 +1228,7 @@
enum rib_subscription_type type, bool waitok)
{
struct rib_subscription *rs;
- int flags = M_ZERO | (waitok ? M_WAITOK : 0);
+ int flags = M_ZERO | (waitok ? M_WAITOK : M_NOWAIT);
rs = malloc(sizeof(struct rib_subscription), M_RTABLE, flags);
if (rs == NULL)
@@ -1246,22 +1252,14 @@
enum rib_subscription_type type, bool waitok)
{
struct rib_head *rnh;
- struct rib_subscription *rs;
struct epoch_tracker et;
- if ((rs = allocate_subscription(f, arg, type, waitok)) == NULL)
- return (NULL);
-
NET_EPOCH_ENTER(et);
KASSERT((fibnum < rt_numfibs), ("%s: bad fibnum", __func__));
rnh = rt_tables_get_rnh(fibnum, family);
-
- RIB_WLOCK(rnh);
- CK_STAILQ_INSERT_TAIL(&rnh->rnh_subscribers, rs, next);
- RIB_WUNLOCK(rnh);
NET_EPOCH_EXIT(et);
- return (rs);
+ return (rib_subscribe_internal(rnh, f, arg, type, waitok));
}
struct rib_subscription *
@@ -1273,6 +1271,7 @@
if ((rs = allocate_subscription(f, arg, type, waitok)) == NULL)
return (NULL);
+ rs->rnh = rnh;
NET_EPOCH_ENTER(et);
RIB_WLOCK(rnh);
@@ -1284,23 +1283,15 @@
}
/*
- * Remove rtable subscription @rs from the table specified by @fibnum
- * and @family.
+ * Remove rtable subscription @rs from the routing table.
* Needs to be run in network epoch.
- *
- * Returns 0 on success.
*/
-int
-rib_unsibscribe(uint32_t fibnum, int family, struct rib_subscription *rs)
+void
+rib_unsibscribe(struct rib_subscription *rs)
{
- struct rib_head *rnh;
+ struct rib_head *rnh = rs->rnh;
NET_EPOCH_ASSERT();
- KASSERT((fibnum < rt_numfibs), ("%s: bad fibnum", __func__));
- rnh = rt_tables_get_rnh(fibnum, family);
-
- if (rnh == NULL)
- return (ENOENT);
RIB_WLOCK(rnh);
CK_STAILQ_REMOVE(&rnh->rnh_subscribers, rs, rib_subscription, next);
@@ -1308,8 +1299,6 @@
epoch_call(net_epoch_preempt, destroy_subscription_epoch,
&rs->epoch_ctx);
-
- return (0);
}
/*
Index: sys/net/route/route_helpers.c
===================================================================
--- sys/net/route/route_helpers.c
+++ sys/net/route/route_helpers.c
@@ -77,14 +77,10 @@
* Table is traversed under read lock unless @wlock is set.
*/
void
-rib_walk_ext(uint32_t fibnum, int family, bool wlock, rib_walktree_f_t *wa_f,
+rib_walk_ext_internal(struct rib_head *rnh, bool wlock, rib_walktree_f_t *wa_f,
rib_walk_hook_f_t *hook_f, void *arg)
{
RIB_RLOCK_TRACKER;
- struct rib_head *rnh;
-
- if ((rnh = rt_tables_get_rnh(fibnum, family)) == NULL)
- return;
if (wlock)
RIB_WLOCK(rnh);
@@ -101,6 +97,16 @@
RIB_RUNLOCK(rnh);
}
+void
+rib_walk_ext(uint32_t fibnum, int family, bool wlock, rib_walktree_f_t *wa_f,
+ rib_walk_hook_f_t *hook_f, void *arg)
+{
+ struct rib_head *rnh;
+
+ if ((rnh = rt_tables_get_rnh(fibnum, family)) != NULL)
+ rib_walk_ext_internal(rnh, wlock, wa_f, hook_f, arg);
+}
+
/*
* Calls @wa_f with @arg for each entry in the table specified by
* @af and @fibnum.
Index: sys/net/route/route_tables.c
===================================================================
--- sys/net/route/route_tables.c
+++ sys/net/route/route_tables.c
@@ -171,7 +171,7 @@
grow_rtables(uint32_t num_tables)
{
struct domain *dom;
- struct rib_head **prnh;
+ struct rib_head **prnh, *rh;
struct rib_head **new_rt_tables, **old_rt_tables;
int family;
@@ -188,6 +188,8 @@
"by default. Consider tuning %s if needed\n",
"net.add_addr_allfibs");
+ fib_grow_rtables(num_tables);
+
/*
* Current rt_tables layout:
* fib0[af0, af1, af2, .., AF_MAX]fib1[af0, af1, af2, .., Af_MAX]..
@@ -206,10 +208,16 @@
prnh = &new_rt_tables[i * (AF_MAX + 1) + family];
if (*prnh != NULL)
continue;
- *prnh = dom->dom_rtattach(i);
- if (*prnh == NULL)
- log(LOG_ERR, "unable to create routing tables for domain %d\n",
- dom->dom_family);
+			rh = dom->dom_rtattach(i);
+			if (rh == NULL) {
+				log(LOG_ERR, "unable to create routing table for %d.%d\n",
+				    dom->dom_family, i);
+				continue;
+			}
+			if (fib_select_algo_initial(rh) != 0) {
+ log(LOG_ERR, "unable to select algo for table %d.%d\n",
+ dom->dom_family, i);
+ // TODO: detach table
+ }
+ *prnh = rh;
}
}
Index: sys/net/route/route_var.h
===================================================================
--- sys/net/route/route_var.h
+++ sys/net/route/route_var.h
@@ -68,8 +68,10 @@
struct vnet *rib_vnet; /* vnet pointer */
int rib_family; /* AF of the rtable */
u_int rib_fibnum; /* fib number */
+ bool rib_dying; /* rib is detaching */
struct callout expire_callout; /* Callout for expiring dynamic routes */
time_t next_expire; /* Next expire run ts */
+ uint32_t rnh_prefixes; /* Number of prefixes */
struct nh_control *nh_control; /* nexthop subsystem data */
CK_STAILQ_HEAD(, rib_subscription) rnh_subscribers;/* notification subscribers */
};
@@ -241,7 +243,6 @@
void nhops_destroy_rib(struct rib_head *rh);
void nhop_ref_object(struct nhop_object *nh);
int nhop_try_ref_object(struct nhop_object *nh);
-void nhop_free_any(struct nhop_object *nh);
void nhop_set_type(struct nhop_object *nh, enum nhop_type nh_type);
void nhop_set_rtflags(struct nhop_object *nh, int rt_flags);
@@ -253,6 +254,7 @@
void nhops_update_ifmtu(struct rib_head *rh, struct ifnet *ifp, uint32_t mtu);
int nhops_dump_sysctl(struct rib_head *rh, struct sysctl_req *w);
+uint32_t nhops_get_count(struct rib_head *rh);
/* MULTIPATH */
#define MPF_MULTIPATH 0x08 /* need to be consistent with NHF_MULTIPATH */
@@ -295,6 +297,7 @@
/* nhgrp_ctl.c */
int nhgrp_dump_sysctl(struct rib_head *rh, struct sysctl_req *w);
+uint32_t nhgrp_get_count(struct rib_head *rh);
int nhgrp_get_group(struct rib_head *rh, struct weightened_nhop *wn,
int num_nhops, struct route_nhop_data *rnd);
@@ -306,7 +309,14 @@
struct route_nhop_data *rnd_new);
uint32_t nhgrp_get_idx(const struct nhgrp_object *nhg);
+void nhgrp_ref_object(struct nhgrp_object *nhg);
void nhgrp_free(struct nhgrp_object *nhg);
+
+/* route_algo.c */
+void fib_grow_rtables(uint32_t new_num_tables);
+int fib_select_algo_initial(struct rib_head *rh);
+void fib_destroy_rib(struct rib_head *rh);
/* Entropy data used for outbound hashing */
#define MPATH_ENTROPY_KEY_LEN 40
Index: sys/netinet/in_fib.h
===================================================================
--- sys/netinet/in_fib.h
+++ sys/netinet/in_fib.h
@@ -45,10 +45,15 @@
struct sockaddr_in ro_dst4;
};
+struct rtentry;
+struct route_nhop_data;
+
struct nhop_object *fib4_lookup(uint32_t fibnum, struct in_addr dst,
uint32_t scopeid, uint32_t flags, uint32_t flowid);
int fib4_check_urpf(uint32_t fibnum, struct in_addr dst, uint32_t scopeid,
uint32_t flags, const struct ifnet *src_if);
+struct rtentry *fib4_lookup_rt(uint32_t fibnum, struct in_addr dst, uint32_t scopeid,
+ uint32_t flags, struct route_nhop_data *nrd);
struct nhop_object *fib4_lookup_debugnet(uint32_t fibnum, struct in_addr dst,
uint32_t scopeid, uint32_t flags);
uint32_t fib4_calc_software_hash(struct in_addr src, struct in_addr dst,
Index: sys/netinet/in_fib.c
===================================================================
--- sys/netinet/in_fib.c
+++ sys/netinet/in_fib.c
@@ -49,6 +49,7 @@
#include <net/route.h>
#include <net/route/route_ctl.h>
#include <net/route/route_var.h>
+#include <net/route/route_algo.h>
#include <net/route/nhop.h>
#include <net/toeplitz.h>
#include <net/vnet.h>
@@ -63,6 +64,10 @@
/* Assert 'struct route_in' is compatible with 'struct route' */
CHK_STRUCT_ROUTE_COMPAT(struct route_in, ro_dst4);
+#ifdef ROUTE_ALGO
+VNET_DEFINE(struct fib_dp *, inet_dp);
+#endif
+
#ifdef ROUTE_MPATH
struct _hash_5tuple_ipv4 {
struct in_addr src;
@@ -75,7 +80,6 @@
_Static_assert(sizeof(struct _hash_5tuple_ipv4) == 16,
"_hash_5tuple_ipv4 size is wrong");
-
uint32_t
fib4_calc_software_hash(struct in_addr src, struct in_addr dst,
unsigned short src_port, unsigned short dst_port, char proto,
@@ -104,6 +108,29 @@
* one needs to pass NHR_REF as a flag. This will return referenced
* nexthop.
*/
+#ifdef ROUTE_ALGO
+struct nhop_object *
+fib4_lookup(uint32_t fibnum, struct in_addr dst, uint32_t scopeid,
+ uint32_t flags, uint32_t flowid)
+{
+ struct nhop_object *nh;
+ struct fib_dp *dp = &V_inet_dp[fibnum];
+ struct flm_lookup_key key = {.addr4 = dst };
+
+ nh = dp->f(dp->arg, key, scopeid);
+ if (nh != NULL) {
+ nh = nhop_select(nh, flowid);
+ /* Ensure route & ifp is UP */
+ if (RT_LINK_IS_UP(nh->nh_ifp)) {
+ if (flags & NHR_REF)
+ nhop_ref_object(nh);
+ return (nh);
+ }
+ }
+ RTSTAT_INC(rts_unreach);
+ return (NULL);
+}
+#else
struct nhop_object *
fib4_lookup(uint32_t fibnum, struct in_addr dst, uint32_t scopeid,
uint32_t flags, uint32_t flowid)
@@ -143,6 +170,7 @@
RTSTAT_INC(rts_unreach);
return (NULL);
}
+#endif
inline static int
check_urpf_nhop(const struct nhop_object *nh, uint32_t flags,
@@ -181,28 +209,19 @@
return (check_urpf_nhop(nh, flags, src_if));
}
-/*
- * Performs reverse path forwarding lookup.
- * If @src_if is non-zero, verifies that at least 1 path goes via
- * this interface.
- * If @src_if is zero, verifies that route exist.
- * if @flags contains NHR_NOTDEFAULT, do not consider default route.
- *
- * Returns 1 if route matching conditions is found, 0 otherwise.
- */
-int
-fib4_check_urpf(uint32_t fibnum, struct in_addr dst, uint32_t scopeid,
- uint32_t flags, const struct ifnet *src_if)
+#ifndef ROUTE_ALGO
+static struct nhop_object *
+lookup_nhop(uint32_t fibnum, struct in_addr dst, uint32_t scopeid)
{
RIB_RLOCK_TRACKER;
struct rib_head *rh;
struct radix_node *rn;
- int ret;
+ struct nhop_object *nh;
KASSERT((fibnum < rt_numfibs), ("fib4_check_urpf: bad fibnum"));
rh = rt_tables_get_rnh(fibnum, AF_INET);
if (rh == NULL)
- return (0);
+ return (NULL);
/* Prepare lookup key */
struct sockaddr_in sin4;
@@ -210,49 +229,94 @@
sin4.sin_len = sizeof(struct sockaddr_in);
sin4.sin_addr = dst;
+ nh = NULL;
RIB_RLOCK(rh);
rn = rh->rnh_matchaddr((void *)&sin4, &rh->head);
- if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- ret = check_urpf(RNTORT(rn)->rt_nhop, flags, src_if);
- RIB_RUNLOCK(rh);
- return (ret);
- }
+ if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0))
+ nh = RNTORT(rn)->rt_nhop;
RIB_RUNLOCK(rh);
+ return (nh);
+}
+#endif
+
+/*
+ * Performs reverse path forwarding lookup.
+ * If @src_if is non-zero, verifies that at least 1 path goes via
+ * this interface.
+ * If @src_if is zero, verifies that route exist.
+ * if @flags contains NHR_NOTDEFAULT, do not consider default route.
+ *
+ * Returns 1 if route matching conditions is found, 0 otherwise.
+ */
+int
+fib4_check_urpf(uint32_t fibnum, struct in_addr dst, uint32_t scopeid,
+ uint32_t flags, const struct ifnet *src_if)
+{
+ struct nhop_object *nh;
+#ifdef ROUTE_ALGO
+ struct fib_dp *dp = &V_inet_dp[fibnum];
+ struct flm_lookup_key key = {.addr4 = dst };
+
+ nh = dp->f(dp->arg, key, scopeid);
+#else
+ nh = lookup_nhop(fibnum, dst, scopeid);
+#endif
+ if (nh != NULL)
+ return (check_urpf(nh, flags, src_if));
+
return (0);
}
-struct nhop_object *
-fib4_lookup_debugnet(uint32_t fibnum, struct in_addr dst, uint32_t scopeid,
- uint32_t flags)
+struct rtentry *
+fib4_lookup_rt(uint32_t fibnum, struct in_addr dst, uint32_t scopeid,
+ uint32_t flags, struct route_nhop_data *rnd)
{
+ RIB_RLOCK_TRACKER;
struct rib_head *rh;
struct radix_node *rn;
- struct nhop_object *nh;
+ struct rtentry *rt;
- KASSERT((fibnum < rt_numfibs), ("fib4_lookup_debugnet: bad fibnum"));
+ KASSERT((fibnum < rt_numfibs), ("fib4_lookup_rt: bad fibnum"));
rh = rt_tables_get_rnh(fibnum, AF_INET);
if (rh == NULL)
return (NULL);
/* Prepare lookup key */
- struct sockaddr_in sin4;
- memset(&sin4, 0, sizeof(sin4));
- sin4.sin_family = AF_INET;
- sin4.sin_len = sizeof(struct sockaddr_in);
- sin4.sin_addr = dst;
-
- nh = NULL;
- /* unlocked lookup */
+ struct sockaddr_in sin4 = {
+ .sin_family = AF_INET,
+ .sin_len = sizeof(struct sockaddr_in),
+ .sin_addr = dst,
+ };
+
+ rt = NULL;
+ if (!(flags & NHR_UNLOCKED))
+ RIB_RLOCK(rh);
rn = rh->rnh_matchaddr((void *)&sin4, &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- nh = nhop_select((RNTORT(rn))->rt_nhop, 0);
+ rt = (struct rtentry *)rn;
+ rnd->rnd_nhop = rt->rt_nhop;
+ rnd->rnd_weight = rt->rt_weight;
+ }
+ if (!(flags & NHR_UNLOCKED))
+ RIB_RUNLOCK(rh);
+
+ return (rt);
+}
+
+struct nhop_object *
+fib4_lookup_debugnet(uint32_t fibnum, struct in_addr dst, uint32_t scopeid,
+ uint32_t flags)
+{
+ struct rtentry *rt;
+ struct route_nhop_data rnd;
+
+ rt = fib4_lookup_rt(fibnum, dst, scopeid, NHR_UNLOCKED, &rnd);
+ if (rt != NULL) {
+ struct nhop_object *nh = nhop_select(rnd.rnd_nhop, 0);
/* Ensure route & ifp is UP */
- if (RT_LINK_IS_UP(nh->nh_ifp)) {
- if (flags & NHR_REF)
- nhop_ref_object(nh);
+ if (RT_LINK_IS_UP(nh->nh_ifp))
return (nh);
- }
}
return (NULL);
Index: sys/netinet/in_fib_algo.c
===================================================================
--- /dev/null
+++ sys/netinet/in_fib_algo.c
@@ -0,0 +1,315 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/malloc.h>
+#include <sys/priv.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <net/vnet.h>
+
+#include <net/if.h>
+#include <netinet/in.h>
+
+#include <net/route.h>
+#include <net/route/nhop.h>
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <net/route/route_algo.h>
+
+
+#define KEY_LEN_INET (offsetof(struct sockaddr_in, sin_addr) + sizeof(in_addr_t))
+#define OFF_LEN_INET (8 * offsetof(struct sockaddr_in, sin_addr))
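+/* e.g. KEY_LEN_INET == 8: sin_addr starts at offset 4, plus 4 address bytes */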
+struct radix4_addr_entry {
+ struct radix_node rn[2];
+ struct sockaddr_in addr;
+ struct nhop_object *nhop;
+};
+#define LRADIX4_ITEM_SZ roundup2(sizeof(struct radix4_addr_entry), 64)
+
+struct lradix4_data {
+ struct radix_node_head *rnh;
+ struct fib_data *fd;
+ void *mem;
+ uint32_t alloc_items;
+ uint32_t num_items;
+};
+
+static struct nhop_object *
+lradix4_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid)
+{
+ struct radix_node_head *rnh = (struct radix_node_head *)algo_data;
+ struct radix4_addr_entry *ent;
+ struct sockaddr_in addr4 = {
+ .sin_len = KEY_LEN_INET,
+ .sin_addr = key.addr4,
+ };
+ ent = (struct radix4_addr_entry *)(rnh->rnh_matchaddr(&addr4, &rnh->rh));
+ if (ent != NULL)
+ return (ent->nhop);
+ return (NULL);
+}
+
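+/*
+ * Preference decays roughly linearly from 255 as the table grows:
+ * e.g. 10k prefixes -> 255 - 10000/394 == 230; above 100k it is 1.
+ */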
+static uint8_t
+lradix4_get_pref(const struct rib_rtable_info *rinfo)
+{
+
+ if (rinfo->num_prefixes < 10)
+ return (255);
+ else if (rinfo->num_prefixes < 100000)
+ return (255 - rinfo->num_prefixes / 394);
+ else
+ return (1);
+}
+
+static enum flm_op_result
+lradix4_init(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **_data)
+{
+ struct lradix4_data *lr;
+ struct rib_rtable_info rinfo;
+ uint32_t count;
+
+	lr = malloc(sizeof(struct lradix4_data), M_RTABLE, M_NOWAIT | M_ZERO);
+	if (lr == NULL || !rn_inithead((void **)&lr->rnh, OFF_LEN_INET)) {
+		free(lr, M_RTABLE);
+		return (FLM_REBUILD);
+	}
+	fib_get_rtable_info(fib_get_rh(fd), &rinfo);
+
+	count = rinfo.num_prefixes * 11 / 10;
+	// XXX: alignment!
+	lr->mem = malloc(count * LRADIX4_ITEM_SZ, M_RTABLE, M_NOWAIT | M_ZERO);
+	if (lr->mem == NULL) {
+		rn_detachhead((void **)&lr->rnh);
+		free(lr, M_RTABLE);
+		return (FLM_REBUILD);
+	}
+ lr->alloc_items = count;
+ lr->fd = fd;
+
+ *_data = lr;
+
+ return (FLM_SUCCESS);
+}
+
+static void
+lradix4_destroy(void *_data)
+{
+ struct lradix4_data *lr = (struct lradix4_data *)_data;
+
+ if (lr->rnh != NULL)
+ rn_detachhead((void **)&lr->rnh);
+ if (lr->mem != NULL)
+ free(lr->mem, M_RTABLE);
+ free(lr, M_RTABLE);
+}
+
+static enum flm_op_result
+lradix4_add_route_cb(struct rtentry *rt, void *_data)
+{
+ struct lradix4_data *lr = (struct lradix4_data *)_data;
+ struct radix4_addr_entry *ae;
+ struct sockaddr_in *rt_dst, *rt_mask, mask;
+ struct radix_node *rn;
+
+ if (fib_get_nhop_idx(lr->fd, rt->rt_nhop) == 0)
+ return (FLM_REBUILD);
+
+ if (lr->num_items >= lr->alloc_items)
+ return (FLM_REBUILD);
+
+ ae = (struct radix4_addr_entry *)((char *)lr->mem + lr->num_items * LRADIX4_ITEM_SZ);
+ lr->num_items++;
+
+ ae->nhop = rt->rt_nhop;
+
+ rt_dst = (struct sockaddr_in *)rt_key(rt);
+ rt_mask = (struct sockaddr_in *)rt_mask(rt);
+
+ ae->addr.sin_len = KEY_LEN_INET;
+ ae->addr.sin_addr = rt_dst->sin_addr;
+
+ if (rt_mask != NULL) {
+ bzero(&mask, sizeof(mask));
+ mask.sin_len = KEY_LEN_INET;
+ mask.sin_addr = rt_mask->sin_addr;
+ rt_mask = &mask;
+ }
+
+ rn = lr->rnh->rnh_addaddr((struct sockaddr *)&ae->addr,
+ (struct sockaddr *)rt_mask, &lr->rnh->rh, ae->rn);
+ if (rn == NULL)
+ return (FLM_REBUILD);
+
+ return (FLM_SUCCESS);
+}
+
+static enum flm_op_result
+lradix4_end_dump(void *_data, struct fib_dp *dp)
+{
+ struct lradix4_data *lr = (struct lradix4_data *)_data;
+
+ dp->f = lradix4_lookup;
+ dp->arg = lr->rnh;
+
+ return (FLM_SUCCESS);
+}
+
+static enum flm_op_result
+lradix4_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
+ void *_data)
+{
+
+ return (FLM_REBUILD);
+}
+
+struct fib_lookup_module flm_radix4_lockless = {
+ .flm_name = "radix4_lockless",
+ .flm_family = AF_INET,
+ .flm_init_cb = lradix4_init,
+ .flm_destroy_cb = lradix4_destroy,
+ .flm_dump_rib_item_cb = lradix4_add_route_cb,
+ .flm_dump_end_cb = lradix4_end_dump,
+ .flm_change_rib_item_cb = lradix4_change_cb,
+ .flm_get_pref = lradix4_get_pref,
+};
+
+
+struct radix4_data {
+ struct fib_data *fd;
+ struct rib_head *rh;
+};
+
+static struct nhop_object *
+radix4_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid)
+{
+ RIB_RLOCK_TRACKER;
+ struct rib_head *rh = (struct rib_head *)algo_data;
+ struct radix_node *rn;
+ struct nhop_object *nh;
+
+ /* Prepare lookup key */
+ struct sockaddr_in sin4 = {
+ .sin_family = AF_INET,
+ .sin_len = sizeof(struct sockaddr_in),
+ .sin_addr = key.addr4,
+ };
+
+ nh = NULL;
+ RIB_RLOCK(rh);
+ rn = rh->rnh_matchaddr((void *)&sin4, &rh->head);
+ if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0))
+ nh = (RNTORT(rn))->rt_nhop;
+ RIB_RUNLOCK(rh);
+
+ return (nh);
+}
+
+static uint8_t
+radix4_get_pref(const struct rib_rtable_info *rinfo)
+{
+
+ return (50);
+}
+
+static enum flm_op_result
+radix4_init(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **_data)
+{
+ struct radix4_data *r4;
+
+ r4 = malloc(sizeof(struct radix4_data), M_RTABLE, M_NOWAIT | M_ZERO);
+ if (r4 == NULL)
+ return (FLM_REBUILD);
+ r4->fd = fd;
+	r4->rh = fib_get_rh(fd);
+	if (r4->rh == NULL) {
+		free(r4, M_RTABLE);
+		return (FLM_ERROR);
+	}
+
+ *_data = r4;
+
+ return (FLM_SUCCESS);
+}
+
+static void
+radix4_destroy(void *_data)
+{
+
+ free(_data, M_RTABLE);
+}
+
+static enum flm_op_result
+radix4_end_dump(void *_data, struct fib_dp *dp)
+{
+ struct radix4_data *r4 = (struct radix4_data *)_data;
+
+ dp->f = radix4_lookup;
+ dp->arg = r4->rh;
+
+ return (FLM_SUCCESS);
+}
+
+static enum flm_op_result
+radix4_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
+ void *_data)
+{
+ struct radix4_data *r4 = (struct radix4_data *)_data;
+
+ /*
+ * Grab additional reference for each nexthop to maintain guarantee
+ * that we have non-zero # of reference for each nexthop in radix in
+ * the epoch.
+ */
+ if (rc->rc_nh_new != NULL) {
+ if (fib_get_nhop_idx(r4->fd, rc->rc_nh_new) == 0)
+ return (FLM_REBUILD);
+ }
+ if (rc->rc_nh_old != NULL)
+ fib_free_nhop(r4->fd, rc->rc_nh_old);
+
+ return (FLM_SUCCESS);
+}
+
+struct fib_lookup_module flm_radix4 = {
+ .flm_name = "radix4",
+ .flm_family = AF_INET,
+ .flm_init_cb = radix4_init,
+ .flm_destroy_cb = radix4_destroy,
+ .flm_dump_end_cb = radix4_end_dump,
+ .flm_change_rib_item_cb = radix4_change_cb,
+ .flm_get_pref = radix4_get_pref,
+};
+
+static void
+fib4_algo_init(void)
+{
+
+ fib_module_register(&flm_radix4_lockless);
+ fib_module_register(&flm_radix4);
+}
+SYSINIT(fib4_algo_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, fib4_algo_init, NULL);
Index: sys/netinet6/in6_fib.h
===================================================================
--- sys/netinet6/in6_fib.h
+++ sys/netinet6/in6_fib.h
@@ -32,11 +32,16 @@
#ifndef _NETINET6_IN6_FIB_H_
#define _NETINET6_IN6_FIB_H_
+struct rtentry;
+struct route_nhop_data;
+
struct nhop_object *fib6_lookup(uint32_t fibnum,
const struct in6_addr *dst6, uint32_t scopeid, uint32_t flags,
uint32_t flowid);
int fib6_check_urpf(uint32_t fibnum, const struct in6_addr *dst6,
uint32_t scopeid, uint32_t flags, const struct ifnet *src_if);
+struct rtentry *fib6_lookup_rt(uint32_t fibnum, const struct in6_addr *dst6,
+ uint32_t scopeid, uint32_t flags, struct route_nhop_data *rnd);
struct nhop_object *fib6_lookup_debugnet(uint32_t fibnum,
const struct in6_addr *dst6, uint32_t scopeid, uint32_t flags);
uint32_t fib6_calc_software_hash(const struct in6_addr *src,
Index: sys/netinet6/in6_fib.c
===================================================================
--- sys/netinet6/in6_fib.c
+++ sys/netinet6/in6_fib.c
@@ -50,6 +50,7 @@
#include <net/route.h>
#include <net/route/route_ctl.h>
#include <net/route/route_var.h>
+#include <net/route/route_algo.h>
#include <net/route/nhop.h>
#include <net/toeplitz.h>
#include <net/vnet.h>
@@ -69,6 +70,10 @@
CHK_STRUCT_ROUTE_COMPAT(struct route_in6, ro_dst);
+#ifdef ROUTE_ALGO
+VNET_DEFINE(struct fib_dp *, inet6_dp);
+#endif
+
#ifdef ROUTE_MPATH
struct _hash_5tuple_ipv6 {
struct in6_addr src;
@@ -81,6 +86,7 @@
_Static_assert(sizeof(struct _hash_5tuple_ipv6) == 40,
"_hash_5tuple_ipv6 size is wrong");
+
uint32_t
fib6_calc_software_hash(const struct in6_addr *src, const struct in6_addr *dst,
unsigned short src_port, unsigned short dst_port, char proto,
@@ -111,6 +117,29 @@
* one needs to pass NHR_REF as a flag. This will return referenced
* nexthop.
*/
+#ifdef ROUTE_ALGO
+struct nhop_object *
+fib6_lookup(uint32_t fibnum, const struct in6_addr *dst6,
+ uint32_t scopeid, uint32_t flags, uint32_t flowid)
+{
+ struct nhop_object *nh;
+ struct fib_dp *dp = &V_inet6_dp[fibnum];
+ struct flm_lookup_key key = {.addr6 = dst6 };
+
+ nh = dp->f(dp->arg, key, scopeid);
+ if (nh != NULL) {
+ nh = nhop_select(nh, flowid);
+ /* Ensure route & ifp is UP */
+ if (RT_LINK_IS_UP(nh->nh_ifp)) {
+ if (flags & NHR_REF)
+ nhop_ref_object(nh);
+ return (nh);
+ }
+ }
+ RTSTAT_INC(rts_unreach);
+ return (NULL);
+}
+#else
struct nhop_object *
fib6_lookup(uint32_t fibnum, const struct in6_addr *dst6,
uint32_t scopeid, uint32_t flags, uint32_t flowid)
@@ -119,7 +148,6 @@
struct rib_head *rh;
struct radix_node *rn;
struct nhop_object *nh;
- struct sockaddr_in6 sin6;
KASSERT((fibnum < rt_numfibs), ("fib6_lookup: bad fibnum"));
rh = rt_tables_get_rnh(fibnum, AF_INET6);
@@ -127,11 +155,10 @@
return (NULL);
/* TODO: radix changes */
- //addr = *dst6;
- /* Prepare lookup key */
- memset(&sin6, 0, sizeof(sin6));
- sin6.sin6_len = sizeof(struct sockaddr_in6);
- sin6.sin6_addr = *dst6;
+ struct sockaddr_in6 sin6 = {
+ .sin6_len = sizeof(struct sockaddr_in6),
+ .sin6_addr = *dst6,
+ };
/* Assume scopeid is valid and embed it directly */
if (IN6_IS_SCOPE_LINKLOCAL(dst6))
@@ -154,6 +181,7 @@
RTSTAT_INC(rts_unreach);
return (NULL);
}
+#endif
inline static int
check_urpf_nhop(const struct nhop_object *nh, uint32_t flags,
@@ -192,60 +220,77 @@
return (check_urpf_nhop(nh, flags, src_if));
}
-/*
- * Performs reverse path forwarding lookup.
- * If @src_if is non-zero, verifies that at least 1 path goes via
- * this interface.
- * If @src_if is zero, verifies that route exist.
- * if @flags contains NHR_NOTDEFAULT, do not consider default route.
- *
- * Returns 1 if route matching conditions is found, 0 otherwise.
- */
-int
-fib6_check_urpf(uint32_t fibnum, const struct in6_addr *dst6,
- uint32_t scopeid, uint32_t flags, const struct ifnet *src_if)
+#ifndef ROUTE_ALGO
+static struct nhop_object *
+lookup_nhop(uint32_t fibnum, const struct in6_addr *dst6,
+ uint32_t scopeid)
{
RIB_RLOCK_TRACKER;
struct rib_head *rh;
struct radix_node *rn;
- struct sockaddr_in6 sin6;
- int ret;
+ struct nhop_object *nh;
KASSERT((fibnum < rt_numfibs), ("fib6_check_urpf: bad fibnum"));
rh = rt_tables_get_rnh(fibnum, AF_INET6);
if (rh == NULL)
- return (0);
+ return (NULL);
- /* TODO: radix changes */
/* Prepare lookup key */
- memset(&sin6, 0, sizeof(sin6));
- sin6.sin6_len = sizeof(struct sockaddr_in6);
- sin6.sin6_addr = *dst6;
+ struct sockaddr_in6 sin6 = {
+ .sin6_len = sizeof(struct sockaddr_in6),
+ .sin6_addr = *dst6,
+ };
/* Assume scopeid is valid and embed it directly */
if (IN6_IS_SCOPE_LINKLOCAL(dst6))
sin6.sin6_addr.s6_addr16[1] = htons(scopeid & 0xffff);
+ nh = NULL;
RIB_RLOCK(rh);
rn = rh->rnh_matchaddr((void *)&sin6, &rh->head);
- if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- ret = check_urpf(RNTORT(rn)->rt_nhop, flags, src_if);
- RIB_RUNLOCK(rh);
- return (ret);
- }
+ if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0))
+ nh = RNTORT(rn)->rt_nhop;
RIB_RUNLOCK(rh);
+ return (nh);
+}
+#endif
+
+/*
+ * Performs a reverse path forwarding lookup.
+ * If @src_if is non-NULL, verifies that at least one path goes via
+ * this interface.
+ * If @src_if is NULL, verifies that a route exists.
+ * If @flags contains NHR_NOTDEFAULT, the default route is not considered.
+ *
+ * Returns 1 if a route matching the conditions is found, 0 otherwise.
+ */
+int
+fib6_check_urpf(uint32_t fibnum, const struct in6_addr *dst6,
+ uint32_t scopeid, uint32_t flags, const struct ifnet *src_if)
+{
+ struct nhop_object *nh;
+#ifdef ROUTE_ALGO
+ struct fib_dp *dp = &V_inet6_dp[fibnum];
+ struct flm_lookup_key key = {.addr6 = dst6 };
+
+ nh = dp->f(dp->arg, key, scopeid);
+#else
+ nh = lookup_nhop(fibnum, dst6, scopeid);
+#endif
+ if (nh != NULL)
+ return (check_urpf(nh, flags, src_if));
return (0);
}
-struct nhop_object *
-fib6_lookup_debugnet(uint32_t fibnum, const struct in6_addr *dst6,
- uint32_t scopeid, uint32_t flags)
+struct rtentry *
+fib6_lookup_rt(uint32_t fibnum, const struct in6_addr *dst6,
+ uint32_t scopeid, uint32_t flags, struct route_nhop_data *rnd)
{
+ RIB_RLOCK_TRACKER;
struct rib_head *rh;
struct radix_node *rn;
- struct nhop_object *nh;
- struct sockaddr_in6 sin6;
+ struct rtentry *rt;
KASSERT((fibnum < rt_numfibs), ("fib6_lookup: bad fibnum"));
rh = rt_tables_get_rnh(fibnum, AF_INET6);
@@ -253,25 +296,43 @@
return (NULL);
/* TODO: radix changes */
- //addr = *dst6;
- /* Prepare lookup key */
- memset(&sin6, 0, sizeof(sin6));
- sin6.sin6_len = sizeof(struct sockaddr_in6);
- sin6.sin6_addr = *dst6;
+ struct sockaddr_in6 sin6 = {
+ .sin6_len = sizeof(struct sockaddr_in6),
+ .sin6_addr = *dst6,
+ };
/* Assume scopeid is valid and embed it directly */
if (IN6_IS_SCOPE_LINKLOCAL(dst6))
sin6.sin6_addr.s6_addr16[1] = htons(scopeid & 0xffff);
+ rt = NULL;
+ if (!(flags & NHR_UNLOCKED))
+ RIB_RLOCK(rh);
rn = rh->rnh_matchaddr((void *)&sin6, &rh->head);
if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
- nh = nhop_select((RNTORT(rn))->rt_nhop, 0);
+ rt = (struct rtentry *)rn;
+ rnd->rnd_nhop = rt->rt_nhop;
+ rnd->rnd_weight = rt->rt_weight;
+ }
+ if (!(flags & NHR_UNLOCKED))
+ RIB_RUNLOCK(rh);
+
+ return (rt);
+}
+
+struct nhop_object *
+fib6_lookup_debugnet(uint32_t fibnum, const struct in6_addr *dst6,
+ uint32_t scopeid, uint32_t flags)
+{
+ struct rtentry *rt;
+ struct route_nhop_data rnd;
+
+ rt = fib6_lookup_rt(fibnum, dst6, scopeid, NHR_UNLOCKED, &rnd);
+ if (rt != NULL) {
+ struct nhop_object *nh = nhop_select(rnd.rnd_nhop, 0);
/* Ensure route & ifp is UP */
- if (RT_LINK_IS_UP(nh->nh_ifp)) {
- if (flags & NHR_REF)
- nhop_ref_object(nh);
+ if (RT_LINK_IS_UP(nh->nh_ifp))
return (nh);
- }
}
return (NULL);
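With ROUTE_ALGO enabled, the fast path of fib6_lookup() above collapses into a single indirect call through a per-fib datapath descriptor: V_inet6_dp is a per-VNET array indexed by fibnum. The descriptor's definition is not part of this hunk (it lives in net/route/route_algo.h), so the following shape is only inferred from its use here:

	/* Inferred from usage; see net/route/route_algo.h for the real one. */
	struct fib_dp {
		struct nhop_object *(*f)(void *arg,
		    const struct flm_lookup_key key, uint32_t scopeid);
		void *arg;	/* algorithm state, e.g. a radix head */
	};

	/* Hot path, as in fib6_lookup() above: */
	struct fib_dp *dp = &V_inet6_dp[fibnum];
	struct flm_lookup_key key = { .addr6 = dst6 };
	nh = dp->f(dp->arg, key, scopeid);

Switching algorithms then presumably amounts to republishing f and arg for that fib, which is why each module hands both back from its flm_dump_end_cb.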
Index: sys/netinet6/in6_fib_algo.c
===================================================================
--- /dev/null
+++ sys/netinet6/in6_fib_algo.c
@@ -0,0 +1,346 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <net/vnet.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip6.h>
+#include <netinet6/ip6_var.h>
+
+#include <net/route.h>
+#include <net/route/nhop.h>
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <net/route/route_algo.h>
+
+#define KEY_LEN_INET6 (offsetof(struct sa_in6, sin6_addr) + sizeof(struct in6_addr))
+#define OFF_LEN_INET6 (8 * offsetof(struct sa_in6, sin6_addr))
+struct sa_in6 {
+ uint8_t sin6_len;
+ uint8_t sin6_family;
+	uint8_t		pad[6];	/* keep sin6_addr at the sockaddr_in6 offset */
+ struct in6_addr sin6_addr;
+};
+struct radix6_addr_entry {
+ struct radix_node rn[2];
+ struct sa_in6 addr;
+ struct nhop_object *nhop;
+};
+#define LRADIX6_ITEM_SZ roundup2(sizeof(struct radix6_addr_entry), 64)
+
+struct lradix6_data {
+ struct radix_node_head *rnh;
+ struct fib_data *fd;
+ void *mem;
+ uint32_t alloc_items;
+ uint32_t num_items;
+};
+
+static struct nhop_object *
+lradix6_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid)
+{
+ struct radix_node_head *rnh = (struct radix_node_head *)algo_data;
+ struct radix6_addr_entry *ent;
+ struct sockaddr_in6 addr6 = {
+ .sin6_len = KEY_LEN_INET6,
+ .sin6_addr = *key.addr6,
+ };
+ ent = (struct radix6_addr_entry *)(rnh->rnh_matchaddr(&addr6, &rnh->rh));
+ if (ent != NULL)
+ return (ent->nhop);
+ return (NULL);
+}
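A subtlety in lradix6_lookup(): the key is a full sockaddr_in6, but its sin6_len is set to KEY_LEN_INET6, so the radix comparison stops at the end of sin6_addr and ignores sin6_scope_id, while OFF_LEN_INET6 starts the bit comparison at the address itself. This only works because struct sa_in6 keeps sin6_addr at the same offset (8 bytes) as struct sockaddr_in6, which is what the pad[6] above provides. A compile-time check would make that dependency explicit; a suggested addition, not part of the diff:

	_Static_assert(offsetof(struct sa_in6, sin6_addr) ==
	    offsetof(struct sockaddr_in6, sin6_addr),
	    "sa_in6 and sockaddr_in6 must agree on the address offset");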
+
+static uint8_t
+lradix6_get_pref(const struct rib_rtable_info *rinfo)
+{
+
+ if (rinfo->num_prefixes < 10)
+ return (255);
+ else if (rinfo->num_prefixes < 100000)
+ return (255 - rinfo->num_prefixes / 394);
+ else
+ return (1);
+}
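To make the preference curve concrete: lradix6_get_pref() returns the maximum of 255 below 10 prefixes and then decays linearly with table size. For example, 1,000 prefixes yield 255 - 1000/394 = 253 and 50,000 yield 255 - 126 = 129, bottoming out at 2 just under the 100,000-prefix cutoff (99,999 / 394 = 253 in integer division); beyond that the module reports the floor value of 1.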
+
+static enum flm_op_result
+lradix6_init(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **_data)
+{
+ struct lradix6_data *lr;
+ struct rib_rtable_info rinfo;
+ uint32_t count;
+
+	lr = malloc(sizeof(struct lradix6_data), M_RTABLE, M_NOWAIT | M_ZERO);
+	if (lr == NULL)
+		return (FLM_REBUILD);
+	if (!rn_inithead((void **)&lr->rnh, OFF_LEN_INET6)) {
+		free(lr, M_RTABLE);
+		return (FLM_REBUILD);
+	}
+ fib_get_rtable_info(fib_get_rh(fd), &rinfo);
+
+ count = rinfo.num_prefixes * 11 / 10;
+	/* XXX: alignment! base address may not be cacheline-aligned */
+	lr->mem = malloc(count * LRADIX6_ITEM_SZ, M_RTABLE, M_NOWAIT | M_ZERO);
+	if (lr->mem == NULL) {
+		rn_detachhead((void **)&lr->rnh);
+		free(lr, M_RTABLE);
+		return (FLM_REBUILD);
+	}
+ lr->alloc_items = count;
+ lr->fd = fd;
+
+ *_data = lr;
+
+ return (FLM_SUCCESS);
+}
+
+static void
+lradix6_destroy(void *_data)
+{
+ struct lradix6_data *lr = (struct lradix6_data *)_data;
+
+ if (lr->rnh != NULL)
+ rn_detachhead((void **)&lr->rnh);
+ if (lr->mem != NULL)
+ free(lr->mem, M_RTABLE);
+ free(lr, M_RTABLE);
+}
+
+static enum flm_op_result
+lradix6_add_route_cb(struct rtentry *rt, void *_data)
+{
+ struct lradix6_data *lr = (struct lradix6_data *)_data;
+ struct radix6_addr_entry *ae;
+ struct sockaddr_in6 *rt_dst, *rt_mask;
+ struct sa_in6 mask;
+ struct radix_node *rn;
+
+ if (fib_get_nhop_idx(lr->fd, rt->rt_nhop) == 0)
+ return (FLM_REBUILD);
+
+ if (lr->num_items >= lr->alloc_items)
+ return (FLM_REBUILD);
+
+ ae = (struct radix6_addr_entry *)((char *)lr->mem + lr->num_items * LRADIX6_ITEM_SZ);
+ lr->num_items++;
+
+ ae->nhop = rt->rt_nhop;
+
+ rt_dst = (struct sockaddr_in6 *)rt_key(rt);
+ rt_mask = (struct sockaddr_in6 *)rt_mask(rt);
+
+ ae->addr.sin6_len = KEY_LEN_INET6;
+ ae->addr.sin6_addr = rt_dst->sin6_addr;
+
+ if (rt_mask != NULL) {
+ bzero(&mask, sizeof(mask));
+ mask.sin6_len = KEY_LEN_INET6;
+ mask.sin6_addr = rt_mask->sin6_addr;
+ rt_mask = (struct sockaddr_in6 *)&mask;
+ }
+
+ rn = lr->rnh->rnh_addaddr((struct sockaddr *)&ae->addr,
+ (struct sockaddr *)rt_mask, &lr->rnh->rh, ae->rn);
+ if (rn == NULL)
+ return (FLM_REBUILD);
+
+ return (FLM_SUCCESS);
+}
+
+static enum flm_op_result
+lradix6_end_dump(void *_data, struct fib_dp *dp)
+{
+ struct lradix6_data *lr = (struct lradix6_data *)_data;
+
+ dp->f = lradix6_lookup;
+ dp->arg = lr->rnh;
+
+ return (FLM_SUCCESS);
+}
+
+static enum flm_op_result
+lradix6_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
+ void *_data)
+{
+
+ return (FLM_REBUILD);
+}
+
+struct fib_lookup_module flm_radix6_lockless = {
+ .flm_name = "radix6_lockless",
+ .flm_family = AF_INET6,
+ .flm_init_cb = lradix6_init,
+ .flm_destroy_cb = lradix6_destroy,
+ .flm_dump_rib_item_cb = lradix6_add_route_cb,
+ .flm_dump_end_cb = lradix6_end_dump,
+ .flm_change_rib_item_cb = lradix6_change_cb,
+ .flm_get_pref = lradix6_get_pref,
+};
+
+
+struct radix6_data {
+ struct fib_data *fd;
+ struct rib_head *rh;
+};
+
+static struct nhop_object *
+radix6_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid)
+{
+ RIB_RLOCK_TRACKER;
+ struct rib_head *rh = (struct rib_head *)algo_data;
+ struct radix_node *rn;
+ struct nhop_object *nh;
+
+ /* Prepare lookup key */
+ struct sockaddr_in6 sin6 = {
+ .sin6_family = AF_INET6,
+ .sin6_len = sizeof(struct sockaddr_in6),
+ .sin6_addr = *key.addr6,
+ };
+ if (IN6_IS_SCOPE_LINKLOCAL(key.addr6))
+ sin6.sin6_addr.s6_addr16[1] = htons(scopeid & 0xffff);
+
+ nh = NULL;
+ RIB_RLOCK(rh);
+ rn = rh->rnh_matchaddr((void *)&sin6, &rh->head);
+ if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0))
+ nh = (RNTORT(rn))->rt_nhop;
+ RIB_RUNLOCK(rh);
+
+ return (nh);
+}
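radix6_lookup() serves lookups from the live RIB under its read lock, so it must present link-local destinations in the same KAME-embedded form the kernel stores: the second 16-bit word of a link-local address, always zero on the wire, carries the zone ID inside the kernel. As a purely illustrative example, a lookup of fe80::1 with scopeid 2 is matched against the embedded key fe80:2::1.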
+
+static uint8_t
+radix6_get_pref(const struct rib_rtable_info *rinfo)
+{
+
+ return (50);
+}
+
+static enum flm_op_result
+radix6_init(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **_data)
+{
+ struct radix6_data *r6;
+
+ r6 = malloc(sizeof(struct radix6_data), M_RTABLE, M_NOWAIT | M_ZERO);
+ if (r6 == NULL)
+ return (FLM_REBUILD);
+ r6->fd = fd;
+	r6->rh = fib_get_rh(fd);
+	if (r6->rh == NULL) {
+		free(r6, M_RTABLE);
+		return (FLM_ERROR);
+	}
+
+ *_data = r6;
+
+ return (FLM_SUCCESS);
+}
+
+static void
+radix6_destroy(void *_data)
+{
+
+ free(_data, M_RTABLE);
+}
+
+static enum flm_op_result
+radix6_end_dump(void *_data, struct fib_dp *dp)
+{
+ struct radix6_data *r6 = (struct radix6_data *)_data;
+
+ dp->f = radix6_lookup;
+ dp->arg = r6->rh;
+
+ return (FLM_SUCCESS);
+}
+
+static enum flm_op_result
+radix6_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
+ void *_data)
+{
+ struct radix6_data *r6 = (struct radix6_data *)_data;
+
+	/*
+	 * Grab an additional reference for the new nexthop to guarantee
+	 * that every nexthop in the radix keeps a non-zero reference
+	 * count for the duration of the epoch.
+	 */
+ if (rc->rc_nh_new != NULL) {
+ if (fib_get_nhop_idx(r6->fd, rc->rc_nh_new) == 0)
+ return (FLM_REBUILD);
+ }
+ if (rc->rc_nh_old != NULL)
+ fib_free_nhop(r6->fd, rc->rc_nh_old);
+
+ return (FLM_SUCCESS);
+}
+
+struct fib_lookup_module flm_radix6 = {
+ .flm_name = "radix6",
+ .flm_family = AF_INET6,
+ .flm_init_cb = radix6_init,
+ .flm_destroy_cb = radix6_destroy,
+ .flm_dump_end_cb = radix6_end_dump,
+ .flm_change_rib_item_cb = radix6_change_cb,
+ .flm_get_pref = radix6_get_pref,
+};
+
+static void
+fib6_algo_init(void)
+{
+
+ fib_module_register(&flm_radix6_lockless);
+ fib_module_register(&flm_radix6);
+}
+SYSINIT(fib6_algo_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, fib6_algo_init, NULL);
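Taken together, the callbacks imply a rebuild lifecycle along the following lines. This is an inference from the module code in this diff, not the literal framework control flow in net/route/route_algo.c, and the iteration helper named here is invented:

	/* Inferred sequence, for orientation only. */
	flm->flm_init_cb(fibnum, fd, old_data, &data);
	/* Walk the RIB; any FLM_REBUILD return restarts the dump. */
	foreach_rtentry(rh, flm->flm_dump_rib_item_cb, data);
	flm->flm_dump_end_cb(data, &dp);	/* module publishes f + arg */
	/* dp is installed for the fib; the previous instance is torn
	   down via flm_destroy_cb once no lookups can reference it. */

The two IPv6 modules sit at opposite ends of the update spectrum: radix6 answers lookups from the live RIB, needs no dump step (it omits flm_dump_rib_item_cb entirely), and merely keeps nexthop references in sync in radix6_change_cb(); radix6_lockless builds an immutable private copy, so lradix6_change_cb() simply requests a full rebuild on any RIB change, trading update cost for lock-free reads.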