Page MenuHomeFreeBSD

D36275.diff
No OneTemporary

D36275.diff

diff --git a/share/man/man4/inet.4 b/share/man/man4/inet.4
--- a/share/man/man4/inet.4
+++ b/share/man/man4/inet.4
@@ -28,7 +28,7 @@
.\" From: @(#)inet.4 8.1 (Berkeley) 6/5/93
.\" $FreeBSD$
.\"
-.Dd August 1, 2022
+.Dd September 8, 2022
.Dt INET 4
.Os
.Sh NAME
@@ -186,6 +186,8 @@
.It Va fragpackets
Integer: Current number of IPv4 fragment reassembly queue entries
for the VNET (read-only).
+.It Va fragttl
+Integer: time to live for IPv4 packet fragments in the per-VNET reassembly queue.
.It Va loopback_prefixlen
Integer: prefix length of the address space reserved for loopback purposes.
The default is 8, meaning that 127.0.0.0/8 is reserved for loopback,
diff --git a/sys/netinet/ip.h b/sys/netinet/ip.h
--- a/sys/netinet/ip.h
+++ b/sys/netinet/ip.h
@@ -210,7 +210,6 @@
*/
#define MAXTTL 255 /* maximum time to live (seconds) */
#define IPDEFTTL 64 /* default ttl, from RFC 1340 */
-#define IPFRAGTTL 60 /* time to live for frags, slowhz */
#define IPTTLDEC 1 /* subtracted when forwarding */
#define IP_MSS 576 /* default maximum segment size */
diff --git a/sys/netinet/ip_reass.c b/sys/netinet/ip_reass.c
--- a/sys/netinet/ip_reass.c
+++ b/sys/netinet/ip_reass.c
@@ -75,6 +75,10 @@
struct ipqbucket {
TAILQ_HEAD(ipqhead, ipq) head;
struct mtx lock;
+ struct callout timer;
+#ifdef VIMAGE
+ struct vnet *vnet;
+#endif
int count;
};
@@ -87,6 +91,7 @@
#define IPQ_TRYLOCK(i) mtx_trylock(&V_ipq[i].lock)
#define IPQ_UNLOCK(i) mtx_unlock(&V_ipq[i].lock)
#define IPQ_LOCK_ASSERT(i) mtx_assert(&V_ipq[i].lock, MA_OWNED)
+#define IPQ_BUCKET_LOCK_ASSERT(b) mtx_assert(&(b)->lock, MA_OWNED)
VNET_DEFINE_STATIC(int, ipreass_maxbucketsize);
#define V_ipreass_maxbucketsize VNET(ipreass_maxbucketsize)
@@ -98,10 +103,13 @@
#endif
static int sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS);
static int sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS);
+static int sysctl_fragttl(SYSCTL_HANDLER_ARGS);
static void ipreass_zone_change(void *);
static void ipreass_drain_tomax(void);
static void ipq_free(struct ipqbucket *, struct ipq *);
static struct ipq * ipq_reuse(int);
+static void ipreass_callout(void *);
+static void ipreass_reschedule(struct ipqbucket *);
static inline void
ipq_timeout(struct ipqbucket *bucket, struct ipq *fp)
@@ -117,6 +125,7 @@
IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
ipq_free(bucket, fp);
+ ipreass_reschedule(bucket);
}
/*
@@ -167,9 +176,11 @@
sysctl_maxfragbucketsize, "I",
"Maximum number of IPv4 fragment reassembly queue entries per bucket");
-static u_int ipfragttl = IPFRAGTTL / 2;
-SYSCTL_UINT(_net_inet_ip, OID_AUTO, fragttl, CTLFLAG_RD, &ipfragttl,
- IPFRAGTTL / 2, "IP fragment life time on reassembly queue");
+VNET_DEFINE_STATIC(u_int, ipfragttl) = 30;
+#define V_ipfragttl VNET(ipfragttl)
+SYSCTL_PROC(_net_inet_ip, OID_AUTO, fragttl, CTLTYPE_INT | CTLFLAG_RW |
+ CTLFLAG_MPSAFE | CTLFLAG_VNET, NULL, 0, sysctl_fragttl, "IU",
+ "IP fragment life time on reassembly queue (seconds)");
/*
* Take incoming datagram fragment and try to reassemble it into
@@ -311,7 +322,7 @@
V_ipq[hash].count++;
fp->ipq_nfrags = 1;
atomic_add_int(&nfrags, 1);
- fp->ipq_ttl = IPFRAGTTL;
+ fp->ipq_expire = time_uptime + V_ipfragttl;
fp->ipq_p = ip->ip_p;
fp->ipq_id = ip->ip_id;
fp->ipq_src = ip->ip_src;
@@ -322,6 +333,12 @@
else
fp->ipq_maxoff = ntohs(ip->ip_off) + ntohs(ip->ip_len);
m->m_nextpkt = NULL;
+ if (fp == TAILQ_LAST(head, ipqhead))
+ callout_reset_sbt(&V_ipq[hash].timer,
+ SBT_1S * V_ipfragttl, SBT_1S, ipreass_callout,
+ &V_ipq[hash], 0);
+ else
+ MPASS(callout_active(&V_ipq[hash].timer));
goto done;
} else {
/*
@@ -509,6 +526,7 @@
m->m_pkthdr.rcvif = srcifp;
}
IPSTAT_INC(ips_reassembled);
+ ipreass_reschedule(&V_ipq[hash]);
IPQ_UNLOCK(hash);
#ifdef RSS
@@ -560,44 +578,48 @@
}
/*
- * If a timer expires on a reassembly queue, discard it.
+ * Timer expired on a bucket.  Time out each ipq at the tail whose ipq_expire
+ * is not after time_uptime; there should be at least one such ipq.
*/
-static struct callout ipreass_callout;
static void
-ipreass_slowtimo(void *arg __unused)
+ipreass_callout(void *arg)
{
- VNET_ITERATOR_DECL(vnet_iter);
- struct ipq *fp, *tmp;
+ struct ipqbucket *bucket = arg;
+ struct ipq *fp;
- if (atomic_load_int(&nfrags) == 0)
- return;
+ IPQ_BUCKET_LOCK_ASSERT(bucket);
+ MPASS(atomic_load_int(&nfrags) > 0);
- VNET_FOREACH(vnet_iter) {
- CURVNET_SET(vnet_iter);
- for (int i = 0; i < IPREASS_NHASH; i++) {
- if (TAILQ_EMPTY(&V_ipq[i].head))
- continue;
- IPQ_LOCK(i);
- TAILQ_FOREACH_SAFE(fp, &V_ipq[i].head, ipq_list, tmp)
- if (--fp->ipq_ttl == 0)
- ipq_timeout(&V_ipq[i], fp);
- IPQ_UNLOCK(i);
- }
- CURVNET_RESTORE();
- }
- VNET_LIST_RUNLOCK_NOSLEEP();
+ CURVNET_SET(bucket->vnet);
+ fp = TAILQ_LAST(&bucket->head, ipqhead);
+ KASSERT(fp != NULL && fp->ipq_expire <= time_uptime,
+ ("%s: stray callout on bucket %p", __func__, bucket));
- callout_reset_sbt(&ipreass_callout, SBT_1MS * 500, SBT_1MS * 10,
- ipreass_slowtimo, NULL, 0);
+ while (fp != NULL && fp->ipq_expire <= time_uptime) {
+ ipq_timeout(bucket, fp);
+ fp = TAILQ_LAST(&bucket->head, ipqhead);
+ }
+ ipreass_reschedule(bucket);
+ CURVNET_RESTORE();
}
static void
-ipreass_timer_init(void *arg __unused)
+ipreass_reschedule(struct ipqbucket *bucket)
{
+ struct ipq *fp;
- callout_init(&ipreass_callout, 1);
- callout_reset_sbt(&ipreass_callout, SBT_1MS * 500, SBT_1MS * 10,
- ipreass_slowtimo, NULL, 0);
+ IPQ_BUCKET_LOCK_ASSERT(bucket);
+
+ if ((fp = TAILQ_LAST(&bucket->head, ipqhead)) != NULL) {
+ time_t t;
+
+ /* Protect against time_uptime tick. */
+ t = fp->ipq_expire - time_uptime;
+ t = (t > 0) ? t : 1;
+ callout_reset_sbt(&bucket->timer, SBT_1S * t, SBT_1S,
+ ipreass_callout, bucket, 0);
+ } else
+ callout_stop(&bucket->timer);
}
static void
@@ -614,7 +636,6 @@
IPQ_UNLOCK(i);
}
}
-SYSINIT(ipreass, SI_SUB_VNET_DONE, SI_ORDER_ANY, ipreass_timer_init, NULL);
/*
* Drain off all datagram fragments.
@@ -644,7 +665,11 @@
TAILQ_INIT(&V_ipq[i].head);
mtx_init(&V_ipq[i].lock, "IP reassembly", NULL,
MTX_DEF | MTX_DUPOK);
+ callout_init_mtx(&V_ipq[i].timer, &V_ipq[i].lock, 0);
V_ipq[i].count = 0;
+#ifdef VIMAGE
+ V_ipq[i].vnet = curvnet;
+#endif
}
V_ipq_hashseed = arc4random();
V_maxfragsperpacket = 16;
@@ -745,6 +770,7 @@
while (V_ipq[i].count > V_ipreass_maxbucketsize &&
(fp = TAILQ_LAST(&V_ipq[i].head, ipqhead)) != NULL)
ipq_timeout(&V_ipq[i], fp);
+ ipreass_reschedule(&V_ipq[i]);
IPQ_UNLOCK(i);
}
@@ -759,8 +785,10 @@
for (int i = 0; i < IPREASS_NHASH; i++) {
IPQ_LOCK(i);
fp = TAILQ_LAST(&V_ipq[i].head, ipqhead);
- if (fp != NULL)
+ if (fp != NULL) {
ipq_timeout(&V_ipq[i], fp);
+ ipreass_reschedule(&V_ipq[i]);
+ }
IPQ_UNLOCK(i);
}
}
@@ -854,6 +882,7 @@
}
TAILQ_REMOVE(&V_ipq[bucket].head, fp, ipq_list);
V_ipq[bucket].count--;
+ ipreass_reschedule(&V_ipq[bucket]);
if (bucket != start)
IPQ_UNLOCK(bucket);
break;
@@ -902,3 +931,24 @@
ipreass_drain_tomax();
return (0);
}
+
+/*
+ * Get or set the IP fragment time to live.
+ */
+static int
+sysctl_fragttl(SYSCTL_HANDLER_ARGS)
+{
+ u_int ttl;
+ int error;
+
+ ttl = V_ipfragttl;
+ error = sysctl_handle_int(oidp, &ttl, 0, req);
+ if (error || !req->newptr)
+ return (error);
+
+ if (ttl < 1 || ttl > MAXTTL)
+ return (EINVAL);
+
+ atomic_store_int(&V_ipfragttl, ttl);
+ return (0);
+}
diff --git a/sys/netinet/ip_var.h b/sys/netinet/ip_var.h
--- a/sys/netinet/ip_var.h
+++ b/sys/netinet/ip_var.h
@@ -56,18 +56,18 @@
/*
* Ip reassembly queue structure. Each fragment
* being reassembled is attached to one of these structures.
- * They are timed out after ipq_ttl drops to 0, and may also
- * be reclaimed if memory becomes tight.
+ * They are timed out after net.inet.ip.fragttl seconds, and may also be
+ * reclaimed if memory becomes tight.
*/
struct ipq {
TAILQ_ENTRY(ipq) ipq_list; /* to other reass headers */
- u_char ipq_ttl; /* time for reass q to live */
+ time_t ipq_expire; /* time_uptime when ipq expires */
+ u_char ipq_nfrags; /* # frags in this packet */
u_char ipq_p; /* protocol of this fragment */
u_short ipq_id; /* sequence id for reassembly */
int ipq_maxoff; /* total length of packet */
struct mbuf *ipq_frags; /* to ip headers of fragments */
struct in_addr ipq_src,ipq_dst;
- u_char ipq_nfrags; /* # frags in this packet */
struct label *ipq_label; /* MAC label */
};
#endif /* _KERNEL */

File Metadata

Mime Type
text/plain
Expires
Sun, Nov 17, 1:35 AM (20 h, 52 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
14668797
Default Alt Text
D36275.diff (8 KB)

Event Timeline