Page MenuHomeFreeBSD

D36303.diff
No OneTemporary

D36303.diff

diff --git a/share/man/man4/tcp.4 b/share/man/man4/tcp.4
--- a/share/man/man4/tcp.4
+++ b/share/man/man4/tcp.4
@@ -31,7 +31,7 @@
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
-.Dd November 30, 2023
+.Dd January 17, 2023
.Dt TCP 4
.Os
.Sh NAME
@@ -504,6 +504,9 @@
specific connection.
This is needed to help with connection establishment
when a broken firewall is in the network path.
+.It Va ecn.option
+Report back to the peer the number of bytes received with each ECN marking,
+using the Accurate ECN (AccECN) TCP option on outgoing packets.
+Disabled by default.
.It Va fast_finwait2_recycle
Recycle
.Tn TCP
diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h
--- a/sys/netinet/tcp.h
+++ b/sys/netinet/tcp.h
@@ -121,6 +121,10 @@
#define TCPOLEN_SIGNATURE 18
#define TCPOPT_FAST_OPEN 34
#define TCPOLEN_FAST_OPEN_EMPTY 2
+#define TCPOPT_ACCECN0 0xAC /* AccECN option, counters ordered E0, CE, E1 */
+#define TCPOPT_ACCECN1 0xAE /* AccECN option, counters ordered E1, CE, E0 */
+#define TCPOLEN_ACCECN_EMPTY 2 /* AccECN option len with no counters */
+#define TCPOLEN_ACCECN_COUNTER 3 /* len of one 24-bit AccECN counter */
#define MAX_TCPOPTLEN 40 /* Absolute maximum TCP options len */
@@ -431,12 +435,12 @@
/* Accurate ECN counters. */
u_int32_t tcpi_delivered_ce;
u_int32_t tcpi_received_ce; /* # of CE marks received */
- u_int32_t __tcpi_delivered_e1_bytes;
- u_int32_t __tcpi_delivered_e0_bytes;
- u_int32_t __tcpi_delivered_ce_bytes;
- u_int32_t __tcpi_received_e1_bytes;
- u_int32_t __tcpi_received_e0_bytes;
- u_int32_t __tcpi_received_ce_bytes;
+ u_int32_t tcpi_delivered_e1_bytes;
+ u_int32_t tcpi_delivered_e0_bytes;
+ u_int32_t tcpi_delivered_ce_bytes;
+ u_int32_t tcpi_received_e1_bytes;
+ u_int32_t tcpi_received_e0_bytes;
+ u_int32_t tcpi_received_ce_bytes;
u_int32_t tcpi_total_tlp; /* tail loss probes sent */
u_int64_t tcpi_total_tlp_bytes; /* tail loss probe bytes sent */
diff --git a/sys/netinet/tcp_ecn.h b/sys/netinet/tcp_ecn.h
--- a/sys/netinet/tcp_ecn.h
+++ b/sys/netinet/tcp_ecn.h
@@ -49,6 +49,24 @@
int tcp_ecn_syncache_add(uint16_t, int);
uint16_t tcp_ecn_syncache_respond(uint16_t, struct syncache *);
+static inline void hton24(u_char **p, uint32_t v) /* store the low 24 bits of v at *p, most significant byte first, and advance *p by 3 */
+{
+ *(*p)++ = (u_char)(v >> 16); /* bits 23..16; anything above bit 23 is dropped */
+ *(*p)++ = (u_char)(v >> 8); /* bits 15..8 */
+ *(*p)++ = (u_char)(v); /* bits 7..0 */
+}
+
+static inline uint32_t ntoh24(u_char *p) /* read three bytes at p, most significant first, as a 24-bit value; unlike hton24() the pointer is not advanced */
+{
+ uint32_t v;
+
+ v = (uint32_t)(p[0] << 16); /* bits 23..16 */
+ v |= (uint32_t)(p[1] << 8); /* bits 15..8 */
+ v |= (uint32_t)(p[2] << 0); /* bits 7..0 */
+ return v;
+}
+
+
#endif /* _KERNEL */
#endif /* _NETINET_TCP_ECN_H_ */
diff --git a/sys/netinet/tcp_ecn.c b/sys/netinet/tcp_ecn.c
--- a/sys/netinet/tcp_ecn.c
+++ b/sys/netinet/tcp_ecn.c
@@ -113,6 +113,11 @@
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ecn_maxretries), 0,
"Max retries before giving up on ECN");
+VNET_DEFINE(int, tcp_ecn_option) = 0; /* use of the AccECN TCP option; off by default */
+SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, option,
+ CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ecn_option), 0,
+ "Use AccECN TCP option"); /* per-VNET, read-write; described in tcp(4) as ecn.option */
+
/*
* Process incoming SYN,ACK packet
*/
@@ -156,7 +161,9 @@
case (0|TH_CWR|0):
tp->t_flags2 |= TF2_ACE_PERMIT;
tp->t_flags2 &= ~TF2_ECN_PERMIT;
- tp->t_scep = 5;
+ if (V_tcp_ecn_option)
+ tp->t_flags |= TF_ACCECN_OPT;
+ tp->t_ae.scep = 5;
TCPSTAT_INC(tcps_ecn_shs);
TCPSTAT_INC(tcps_ace_nect);
break;
@@ -164,7 +171,9 @@
case (TH_AE|0|0):
tp->t_flags2 |= TF2_ACE_PERMIT;
tp->t_flags2 &= ~TF2_ECN_PERMIT;
- tp->t_scep = 5;
+ if (V_tcp_ecn_option)
+ tp->t_flags |= TF_ACCECN_OPT;
+ tp->t_ae.scep = 5;
TCPSTAT_INC(tcps_ecn_shs);
TCPSTAT_INC(tcps_ace_ect0);
break;
@@ -172,7 +181,9 @@
case (0|TH_CWR|TH_ECE):
tp->t_flags2 |= TF2_ACE_PERMIT;
tp->t_flags2 &= ~TF2_ECN_PERMIT;
- tp->t_scep = 5;
+ if (V_tcp_ecn_option)
+ tp->t_flags |= TF_ACCECN_OPT;
+ tp->t_ae.scep = 5;
TCPSTAT_INC(tcps_ecn_shs);
TCPSTAT_INC(tcps_ace_ect1);
break;
@@ -180,7 +191,9 @@
case (TH_AE|TH_CWR|0):
tp->t_flags2 |= TF2_ACE_PERMIT;
tp->t_flags2 &= ~TF2_ECN_PERMIT;
- tp->t_scep = 6;
+ if (V_tcp_ecn_option)
+ tp->t_flags |= TF_ACCECN_OPT;
+ tp->t_ae.scep = 6;
/*
* reduce the IW to 2 MSS (to
* account for delayed acks) if
@@ -203,16 +216,16 @@
*/
switch (iptos & IPTOS_ECN_MASK) {
case (IPTOS_ECN_NOTECT):
- tp->t_rcep = 0b010;
+ tp->t_ae.rcep = 0b010;
break;
case (IPTOS_ECN_ECT0):
- tp->t_rcep = 0b100;
+ tp->t_ae.rcep = 0b100;
break;
case (IPTOS_ECN_ECT1):
- tp->t_rcep = 0b011;
+ tp->t_ae.rcep = 0b011;
break;
case (IPTOS_ECN_CE):
- tp->t_rcep = 0b110;
+ tp->t_ae.rcep = 0b110;
break;
}
break;
@@ -259,6 +272,8 @@
case (TH_AE|TH_CWR|TH_ECE):
tp->t_flags2 |= TF2_ACE_PERMIT;
tp->t_flags2 &= ~TF2_ECN_PERMIT;
+ if (V_tcp_ecn_option)
+ tp->t_flags |= TF_ACCECN_OPT;
TCPSTAT_INC(tcps_ecn_shs);
/*
* Set the AccECN Codepoints on
@@ -269,16 +284,16 @@
*/
switch (iptos & IPTOS_ECN_MASK) {
case (IPTOS_ECN_NOTECT):
- tp->t_rcep = 0b010;
+ tp->t_ae.rcep = 0b010;
break;
case (IPTOS_ECN_ECT0):
- tp->t_rcep = 0b100;
+ tp->t_ae.rcep = 0b100;
break;
case (IPTOS_ECN_ECT1):
- tp->t_rcep = 0b011;
+ tp->t_ae.rcep = 0b011;
break;
case (IPTOS_ECN_CE):
- tp->t_rcep = 0b110;
+ tp->t_ae.rcep = 0b110;
break;
}
break;
@@ -306,18 +321,31 @@
TCPSTAT_INC(tcps_ecn_rcvect1);
break;
}
-
if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) {
if (tp->t_flags2 & TF2_ACE_PERMIT) {
- if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
- tp->t_rcep += 1;
+ switch (iptos & IPTOS_ECN_MASK) {
+ case IPTOS_ECN_CE:
+ tp->t_flags2 |= TF2_ACO_CE;
+ tp->t_ae.rceb += tlen;
+ tp->t_ae.rcep++;
+ break;
+ case IPTOS_ECN_ECT0:
+ tp->t_flags2 |= TF2_ACO_E0;
+ tp->t_ae.re0b += tlen;
+ break;
+ case IPTOS_ECN_ECT1:
+ tp->t_flags2 |= TF2_ACO_E1;
+ tp->t_ae.re1b += tlen;
+ break;
+ }
if (tp->t_flags2 & TF2_ECN_PERMIT) {
delta_cep = (tcp_ecn_get_ace(thflags) + 8 -
- (tp->t_scep & 7)) & 7;
+ (tp->t_ae.scep & 7)) & 7;
if (delta_cep < pkts)
delta_cep = pkts -
((pkts - delta_cep) & 7);
- tp->t_scep += delta_cep;
+ tp->t_ae.scep += delta_cep;
+ tp->t_ae.dcep = delta_cep;
} else {
/*
* process the final ACK of the 3WHS
@@ -332,16 +360,16 @@
/* FALLTHROUGH */
case 0b100:
/* ECT0 SYN or SYN,ACK */
- tp->t_scep = 5;
+ tp->t_ae.scep = 5;
break;
case 0b110:
/* CE SYN or SYN,ACK */
- tp->t_scep = 6;
+ tp->t_ae.scep = 6;
tp->snd_cwnd = 2 * tcp_maxseg(tp);
break;
default:
/* mangled AccECN handshake */
- tp->t_scep = 5;
+ tp->t_ae.scep = 5;
break;
}
tp->t_flags2 |= TF2_ECN_PERMIT;
@@ -350,7 +378,7 @@
/* RFC3168 ECN handling */
if ((thflags & (TH_SYN | TH_ECE)) == TH_ECE) {
delta_cep = 1;
- tp->t_scep++;
+ tp->t_ae.scep++;
}
if (thflags & TH_CWR) {
tp->t_flags2 &= ~TF2_ECN_SND_ECE;
@@ -429,16 +457,16 @@
* Reply with proper ECN notifications.
*/
if (tp->t_flags2 & TF2_ACE_PERMIT) {
- tcp_ecn_set_ace(thflags, tp->t_rcep);
+ tcp_ecn_set_ace(thflags, tp->t_ae.rcep);
if (!(tp->t_flags2 & TF2_ECN_PERMIT)) {
/*
* here we process the final
* ACK of the 3WHS
*/
- if (tp->t_rcep == 0b110) {
- tp->t_rcep = 6;
+ if (tp->t_ae.rcep == 0b110) {
+ tp->t_ae.rcep = 6;
} else {
- tp->t_rcep = 5;
+ tp->t_ae.rcep = 5;
}
tp->t_flags2 |= TF2_ECN_PERMIT;
}
@@ -451,7 +479,6 @@
if (tp->t_flags2 & TF2_ECN_SND_ECE)
*thflags |= TH_ECE;
}
-
return ipecn;
}
@@ -473,13 +500,17 @@
/* FALLTHROUGH */
case SCF_ACE_1:
tp->t_flags2 |= TF2_ACE_PERMIT;
- tp->t_scep = 5;
- tp->t_rcep = 5;
+ if (V_tcp_ecn_option)
+ tp->t_flags |= TF_ACCECN_OPT;
+ tp->t_ae.scep = 5;
+ tp->t_ae.rcep = 5;
break;
case SCF_ACE_CE:
tp->t_flags2 |= TF2_ACE_PERMIT;
- tp->t_scep = 6;
- tp->t_rcep = 6;
+ if (V_tcp_ecn_option)
+ tp->t_flags |= TF_ACCECN_OPT;
+ tp->t_ae.scep = 6;
+ tp->t_ae.rcep = 6;
break;
}
}
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -997,6 +997,8 @@
}
tp = intotcpcb(inp);
+ to.to_ae = &tp->t_ae;
+
switch (tp->t_state) {
case TCPS_TIME_WAIT:
/*
@@ -1523,7 +1525,7 @@
int acked, ourfinisacked, needoutput = 0;
sackstatus_t sack_changed;
int rstreason, todrop, win, incforsyn = 0;
- uint32_t tiwin;
+ uint32_t tiwin, old_sceb;
uint16_t nsegs;
char *s;
struct inpcb *inp = tptoinpcb(tp);
@@ -1537,6 +1539,7 @@
thflags = tcp_get_flags(th);
tp->sackhint.last_sack_ack = 0;
sack_changed = SACK_NOCHANGE;
+ to.to_ae = &tp->t_ae;
nsegs = max(1, m->m_pkthdr.lro_nsegs);
NET_EPOCH_ASSERT();
@@ -1608,9 +1611,15 @@
/*
* Parse options on any incoming segment.
*/
+ old_sceb = tp->t_ae.sceb;
tcp_dooptions(&to, (u_char *)(th + 1),
(th->th_off << 2) - sizeof(struct tcphdr),
(thflags & TH_SYN) ? TO_SYN : 0);
+ if ((to.to_flags & TOF_ACCE_CE) &&
+ (tp->t_ae.dcep != 0) &&
+ ((tp->t_ae.sceb - old_sceb) == 0))
+ tp->t_ae.scep -= tp->t_ae.dcep;
+
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
if ((tp->t_flags & TF_SIGNATURE) != 0 &&
@@ -3463,7 +3472,7 @@
void
tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags)
{
- int opt, optlen;
+ int opt, optlen, tmp;
to->to_flags = 0;
for (; cnt > 0; cnt -= optlen, cp += optlen) {
@@ -3556,6 +3565,48 @@
to->to_tfo_len = optlen - 2;
to->to_tfo_cookie = to->to_tfo_len ? cp + 2 : NULL;
break;
+ case TCPOPT_ACCECN0:
+ case TCPOPT_ACCECN1:
+ to->to_flags |= TOF_ACCECNOPT;
+ if (optlen >= (TCPOLEN_ACCECN_EMPTY +
+ 1 * TCPOLEN_ACCECN_COUNTER)) {
+ tmp = ntoh24(cp + TCPOLEN_ACCECN_EMPTY + 0);
+ if (opt == TCPOPT_ACCECN0) {
+ to->to_flags |= TOF_ACCE_E0;
+ tmp -= (to->to_ae->se0b & 0xFFFFFF);
+ if (tmp > 0)
+ to->to_ae->se0b += tmp;
+ } else {
+ to->to_flags |= TOF_ACCE_E1;
+ tmp -= (to->to_ae->se1b & 0xFFFFFF);
+ if (tmp > 0)
+ to->to_ae->se1b += tmp;
+ }
+ }
+ if (optlen >= (TCPOLEN_ACCECN_EMPTY +
+ 2 * TCPOLEN_ACCECN_COUNTER)) {
+ to->to_flags |= TOF_ACCE_CE;
+ tmp = ntoh24(cp + TCPOLEN_ACCECN_EMPTY + 3);
+ tmp -= (to->to_ae->sceb & 0xFFFFFF);
+ if (tmp > 0)
+ to->to_ae->sceb += tmp;
+ }
+ if (optlen >= (TCPOLEN_ACCECN_EMPTY +
+ 3 * TCPOLEN_ACCECN_COUNTER)) {
+ tmp = ntoh24(cp + TCPOLEN_ACCECN_EMPTY + 6);
+ if (opt == TCPOPT_ACCECN0) {
+ to->to_flags |= TOF_ACCE_E1;
+ tmp -= (to->to_ae->se1b & 0xFFFFFF);
+ if (tmp > 0)
+ to->to_ae->se1b += tmp;
+ } else {
+ to->to_flags |= TOF_ACCE_E0;
+ tmp -= (to->to_ae->se0b & 0xFFFFFF);
+ if (tmp > 0)
+ to->to_ae->se0b += tmp;
+ }
+ }
+ break;
default:
continue;
}
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -589,10 +589,14 @@
* Note: this may not work when tcp headers change
* very dynamically in the future.
*/
- if ((((tp->t_flags & TF_SIGNATURE) ?
+ if ((min(TCP_MAXOLEN,
+ (((tp->t_flags & TF_SIGNATURE) ?
PADTCPOLEN(TCPOLEN_SIGNATURE) : 0) +
((tp->t_flags & TF_RCVD_TSTMP) ?
PADTCPOLEN(TCPOLEN_TIMESTAMP) : 0) +
+ ((tp->t_flags & TF_ACCECN_OPT) ?
+ PADTCPOLEN(TCPOLEN_ACCECN_EMPTY +
+ 3 * TCPOLEN_ACCECN_COUNTER) : 0))) +
len) >= tp->t_maxseg)
goto send;
/*
@@ -868,9 +872,32 @@
if (tp->t_flags & TF_SIGNATURE)
to.to_flags |= TOF_SIGNATURE;
#endif /* TCP_SIGNATURE */
-
+ /*
+ * AccECN option
+ * Don't send on <SYN>, only on <SYN,ACK> or
+ * when doing an AccECN session
+ */
+ if (tp->t_flags & TF_ACCECN_OPT) {
+ to.to_flags |= TOF_ACCECNOPT;
+ to.to_ae = &tp->t_ae;
+ to.to_flags |= ((tp->t_flags2 & TF2_ACO_E0) ? TOF_ACCE_E0 : 0) |
+ ((tp->t_flags2 & TF2_ACO_E1) ? TOF_ACCE_E1 : 0) |
+ ((tp->t_flags2 & TF2_ACO_CE) ? TOF_ACCE_CE : 0);
+ if (flags & TH_SYN)
+ to.to_flags |= TOF_ACCE_SYN;
+ if (tp->t_flags & TF_ACKNOW)
+ to.to_flags |= TOF_ACCE_ACKNOW;
+ }
/* Processing the options. */
hdrlen += optlen = tcp_addoptions(&to, opt);
+ if (to.to_flags & TOF_ACCECNOPT) {
+ if ((to.to_flags & TOF_ACCE_E0) == 0)
+ tp->t_flags2 &= ~TF2_ACO_E0;
+ if ((to.to_flags & TOF_ACCE_E1) == 0)
+ tp->t_flags2 &= ~TF2_ACO_E1;
+ if ((to.to_flags & TOF_ACCE_CE) == 0)
+ tp->t_flags2 &= ~TF2_ACO_CE;
+ }
/*
* If we wanted a TFO option to be added, but it was unable
* to fit, ensure no data is sent.
@@ -1909,6 +1936,78 @@
optlen += total_len;
break;
}
+ case TOF_ACCECNOPT:
+ {
+ int tmp = 0;
+ int max_len = TCP_MAXOLEN - optlen;
+ if (max_len < TCPOLEN_ACCECN_EMPTY) {
+ to->to_flags &= ~TOF_ACCECNOPT;
+ continue;
+ }
+ if (max_len < (TCPOLEN_ACCECN_EMPTY +
+ 1 * TCPOLEN_ACCECN_COUNTER)) {
+ if (to->to_flags & TOF_ACCE_SYN) {
+ *optp++ = TCPOPT_ACCECN0;
+ optlen += TCPOLEN_ACCECN_EMPTY;
+ *optp++ = TCPOLEN_ACCECN_EMPTY;
+ continue;
+ } else {
+ to->to_flags &= ~TOF_ACCECNOPT;
+ continue;
+ }
+ }
+ *optp++ = (to->to_flags & TOF_ACCE_E1) ?
+ TCPOPT_ACCECN1 : TCPOPT_ACCECN0;
+ if (max_len >= (TCPOLEN_ACCECN_EMPTY +
+ 3 * TCPOLEN_ACCECN_COUNTER)) {
+ tmp = TCPOLEN_ACCECN_EMPTY +
+ 3 * TCPOLEN_ACCECN_COUNTER;
+ } else
+ if (max_len >= (TCPOLEN_ACCECN_EMPTY +
+ 2 * TCPOLEN_ACCECN_COUNTER)) {
+ tmp = TCPOLEN_ACCECN_EMPTY +
+ 2 * TCPOLEN_ACCECN_COUNTER;
+ } else
+ if (max_len >= (TCPOLEN_ACCECN_EMPTY +
+ 1 * TCPOLEN_ACCECN_COUNTER)) {
+ tmp = TCPOLEN_ACCECN_EMPTY +
+ 1 * TCPOLEN_ACCECN_COUNTER;
+ }
+ *optp++ = tmp;
+ optlen += tmp;
+ if (to->to_flags & TOF_ACCE_E1) {
+ hton24(&optp, to->to_ae->re1b);
+ } else {
+ hton24(&optp, to->to_ae->re0b);
+ to->to_flags &= ~TOF_ACCE_E0;
+ }
+ if (max_len < (TCPOLEN_ACCECN_EMPTY +
+ 2 * TCPOLEN_ACCECN_COUNTER)) {
+ to->to_flags &= ~TOF_ACCE_E1;
+ continue;
+ }
+ hton24(&optp, to->to_ae->rceb);
+ to->to_flags &= ~TOF_ACCE_CE;
+ if (max_len < (TCPOLEN_ACCECN_EMPTY +
+ 3 * TCPOLEN_ACCECN_COUNTER)) {
+ to->to_flags &= ~TOF_ACCE_E1;
+ continue;
+ }
+ /*
+ * TCP option sufficient to hold full AccECN option
+ * but only send changed counters normally,
+ * full counters on ACKNOW
+ */
+ if (to->to_flags & TOF_ACCE_E1) {
+ hton24(&optp, to->to_ae->re0b);
+ to->to_flags &= ~TOF_ACCE_E0;
+ to->to_flags &= ~TOF_ACCE_E1;
+ continue;
+ } else {
+ hton24(&optp, to->to_ae->re1b);
+ continue;
+ }
+ }
default:
panic("%s: unknown TCP option type", __func__);
break;
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -1805,7 +1805,6 @@
#ifdef INVARIANTS
int thflags = tcp_get_flags(th);
#endif
-
KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
NET_EPOCH_ASSERT();
@@ -2013,9 +2012,24 @@
if (tp->t_flags & TF_SIGNATURE)
to.to_flags |= TOF_SIGNATURE;
#endif
+ /* AccECN option */
+ if (tp->t_flags & TF_ACCECN_OPT) {
+ to.to_flags |= TOF_ACCECNOPT;
+ to.to_ae = &tp->t_ae;
+ to.to_flags |= ((tp->t_flags2 & TF2_ACO_E0) ? TOF_ACCE_E0 : 0) |
+ ((tp->t_flags2 & TF2_ACO_E1) ? TOF_ACCE_E1 : 0) |
+ ((tp->t_flags2 & TF2_ACO_CE) ? TOF_ACCE_CE : 0);
+ }
/* Add the options. */
tlen += optlen = tcp_addoptions(&to, optp);
-
+ if (to.to_flags & TOF_ACCECNOPT) {
+ if ((to.to_flags & TOF_ACCE_E0) == 0)
+ tp->t_flags2 &= ~TF2_ACO_E0;
+ if ((to.to_flags & TOF_ACCE_E1) == 0)
+ tp->t_flags2 &= ~TF2_ACO_E1;
+ if ((to.to_flags & TOF_ACCE_CE) == 0)
+ tp->t_flags2 &= ~TF2_ACO_CE;
+ }
/* Update m_len in the correct mbuf. */
optm->m_len += optlen;
} else
@@ -2330,6 +2344,14 @@
tcp_log_tcpcbinit(tp);
#endif
tp->t_pacing_rate = -1;
+ if (V_tcp_do_lrd)
+ tp->t_flags |= TF_LRD;
+ tp->t_ae.re0b = 1;
+ tp->t_ae.re1b = 1;
+ tp->t_ae.rceb = 0;
+ tp->t_ae.se0b = 1;
+ tp->t_ae.se1b = 1;
+ tp->t_ae.sceb = 0;
if (tp->t_fb->tfb_tcp_fb_init) {
if ((*tp->t_fb->tfb_tcp_fb_init)(tp, &tp->t_fb_ptr)) {
refcount_release(&tp->t_fb->tfb_refcnt);
diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c
--- a/sys/netinet/tcp_syncache.c
+++ b/sys/netinet/tcp_syncache.c
@@ -1810,6 +1810,7 @@
#ifdef INET6
struct ip6_hdr *ip6 = NULL;
#endif
+ struct accecn ae;
NET_EPOCH_ASSERT();
@@ -1949,6 +1950,20 @@
/* don't send cookie again when retransmitting response */
sc->sc_tfo_cookie = NULL;
}
+ if (V_tcp_ecn_option)
+ to.to_flags |= TOF_ACCE_SYN;
+ }
+ if (V_tcp_ecn_option &&
+ (sc->sc_flags & SCF_ECN_MASK) &&
+ ((sc->sc_flags & SCF_ECN_MASK) != SCF_ECN)) {
+ to.to_flags |= TOF_ACCECNOPT;
+ to.to_flags |= TOF_ACCE_E0 |
+ TOF_ACCE_E1 |
+ TOF_ACCE_CE;
+ ae.re0b = 1;
+ ae.re1b = 1;
+ ae.rceb = 0;
+ to.to_ae = &ae;
}
if (sc->sc_flags & SCF_TIMESTAMP) {
to.to_tsval = sc->sc_tsoff + tcp_ts_getticks();
diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c
--- a/sys/netinet/tcp_timer.c
+++ b/sys/netinet/tcp_timer.c
@@ -789,6 +789,15 @@
#endif
in_losing(inp);
}
+ /*
+ * Disable AccECN option when
+ * retransmitting after multiple
+ * timeouts.
+ */
+ if ((tp->t_rxtshift >= V_tcp_ecn_maxretries) &&
+ (tp->t_flags2 & TF2_ACE_PERMIT) &&
+ (tp->t_flags & TF_ACCECN_OPT))
+ tp->t_flags &= ~TF_ACCECN_OPT;
tp->snd_nxt = tp->snd_una;
tp->snd_recover = tp->snd_max;
/*
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -1611,15 +1611,23 @@
* AccECN related counters.
*/
if ((tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) ==
- (TF2_ECN_PERMIT | TF2_ACE_PERMIT))
+ (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) {
/*
* Internal counter starts at 5 for AccECN
* but 0 for RFC3168 ECN.
*/
- ti->tcpi_delivered_ce = tp->t_scep - 5;
- else
- ti->tcpi_delivered_ce = tp->t_scep;
- ti->tcpi_received_ce = tp->t_rcep;
+ ti->tcpi_delivered_ce = tp->t_ae.scep - 5;
+ ti->tcpi_received_ce = tp->t_ae.rcep - 5;
+ } else {
+ ti->tcpi_delivered_ce = tp->t_ae.scep;
+ ti->tcpi_received_ce = tp->t_ae.rcep;
+ }
+ ti->tcpi_received_e0_bytes = tp->t_ae.re0b - 1;
+ ti->tcpi_received_e1_bytes = tp->t_ae.re1b - 1;
+ ti->tcpi_received_ce_bytes = tp->t_ae.rceb;
+ ti->tcpi_delivered_e0_bytes = tp->t_ae.se0b - 1;
+ ti->tcpi_delivered_e1_bytes = tp->t_ae.se1b - 1;
+ ti->tcpi_delivered_ce_bytes = tp->t_ae.sceb;
}
/*
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -129,6 +129,18 @@
int32_t lost_bytes; /* number of rfc6675 IsLost() bytes */
};
+struct accecn { /* AccECN packet and byte counters: r* count what was received, s* track what the peer reported back */
+ uint32_t rcep; /* Number of received CE marked pkts */
+ uint32_t scep; /* Synced number of delivered CE pkts */
+ uint32_t dcep; /* last delta added to scep; subtracted again if the AccECN option shows no CE byte change */
+ uint32_t re0b; /* Number of received ECT0 marked data bytes; starts at 1 */
+ uint32_t re1b; /* Number of received ECT1 marked data bytes; starts at 1 */
+ uint32_t rceb; /* Number of received CE marked data bytes; starts at 0 */
+ uint32_t se0b; /* Synced number of delivered ECT0 bytes; starts at 1 */
+ uint32_t se1b; /* Synced number of delivered ECT1 bytes; starts at 1 */
+ uint32_t sceb; /* Synced number of delivered CE bytes; starts at 0 */
+};
+
#define SEGQ_EMPTY(tp) TAILQ_EMPTY(&(tp)->t_segq)
STAILQ_HEAD(tcp_log_stailq, tcp_log_mem);
@@ -431,8 +443,7 @@
int t_dupacks; /* consecutive dup acks recd */
int t_lognum; /* Number of log entries */
int t_loglimit; /* Maximum number of log entries */
- uint32_t t_rcep; /* Number of received CE marked pkts */
- uint32_t t_scep; /* Synced number of delivered CE pkts */
+ struct accecn t_ae; /* AccECN related byte counters */
int64_t t_pacing_rate; /* bytes / sec, -1 => unlimited */
struct tcp_log_stailq t_logs; /* Log buffer */
struct tcp_log_id_node *t_lin;
@@ -788,7 +799,7 @@
#define TF_TSO 0x01000000 /* TSO enabled on this connection */
#define TF_TOE 0x02000000 /* this connection is offloaded */
#define TF_CLOSED 0x04000000 /* close(2) called on socket */
-#define TF_UNUSED1 0x08000000 /* unused */
+#define TF_ACCECN_OPT 0x08000000 /* AccECN is using TCP options */
#define TF_LRD 0x10000000 /* Lost Retransmission Detection */
#define TF_CONGRECOVERY 0x20000000 /* congestion recovery mode */
#define TF_WASCRECOVERY 0x40000000 /* was in congestion recovery */
@@ -843,7 +854,9 @@
#define TF2_MBUF_QUEUE_READY 0x00020000 /* Inputs can be queued */
#define TF2_DONT_SACK_QUEUE 0x00040000 /* Don't wake on sack */
#define TF2_CANNOT_DO_ECN 0x00080000 /* The stack does not do ECN */
-
+#define TF2_ACO_E0 0x00100000 /* EE0 counter changed */
+#define TF2_ACO_E1 0x00200000 /* EE1 counter changed */
+#define TF2_ACO_CE 0x00400000 /* ECE counter changed */
/*
* Structure to hold TCP options that are only used during segment
* processing (in tcp_input), but not held in the tcpcb.
@@ -854,14 +867,21 @@
*/
struct tcpopt {
u_int32_t to_flags; /* which options are present */
-#define TOF_MSS 0x0001 /* maximum segment size */
-#define TOF_SCALE 0x0002 /* window scaling */
-#define TOF_SACKPERM 0x0004 /* SACK permitted */
-#define TOF_TS 0x0010 /* timestamp */
-#define TOF_SIGNATURE 0x0040 /* TCP-MD5 signature option (RFC2385) */
-#define TOF_SACK 0x0080 /* Peer sent SACK option */
-#define TOF_FASTOPEN 0x0100 /* TCP Fast Open (TFO) cookie */
-#define TOF_MAXOPT 0x0200
+#define TOF_MSS 0x00000001 /* maximum segment size */
+#define TOF_SCALE 0x00000002 /* window scaling */
+#define TOF_SACKPERM 0x00000004 /* SACK permitted */
+#define TOF_TS 0x00000010 /* timestamp */
+#define TOF_SIGNATURE 0x00000040 /* TCP-MD5 signature option (RFC2385) */
+#define TOF_SACK 0x00000080 /* Peer sent SACK option */
+#define TOF_FASTOPEN 0x00000100 /* TCP Fast Open (TFO) cookie */
+#define TOF_ACCECNOPT 0x00000200 /* AccECN Option */
+#define TOF_MAXOPT 0x00000400
+ /* Keep internal flags above TOF_MAXOPT */
+#define TOF_ACCE_SYN 0x80000000 /* send empty option */
+#define TOF_ACCE_CE 0x40000000 /* CE counter changed */
+#define TOF_ACCE_E0 0x20000000 /* E0 counter changed */
+#define TOF_ACCE_E1 0x10000000 /* E1 counter changed */
+#define TOF_ACCE_ACKNOW 0x08000000 /* send full option */
u_int32_t to_tsval; /* new timestamp */
u_int32_t to_tsecr; /* reflected timestamp */
u_char *to_sacks; /* pointer to the first SACK blocks */
@@ -871,7 +891,8 @@
u_int8_t to_wscale; /* window scaling */
u_int8_t to_nsacks; /* number of SACK blocks */
u_int8_t to_tfo_len; /* TFO cookie length */
- u_int32_t to_spare; /* UTO */
+ struct accecn *to_ae; /* pointer to AccECN byte counters */
+ u_int32_t to_spare; /* UTO */
};
/*
@@ -1273,6 +1294,7 @@
VNET_DECLARE(int, tcp_do_sack);
VNET_DECLARE(int, tcp_do_tso);
VNET_DECLARE(int, tcp_ecn_maxretries);
+VNET_DECLARE(int, tcp_ecn_option);
VNET_DECLARE(int, tcp_initcwnd_segments);
VNET_DECLARE(int, tcp_insecure_rst);
VNET_DECLARE(int, tcp_insecure_syn);
@@ -1319,6 +1341,7 @@
#define V_tcp_do_sack VNET(tcp_do_sack)
#define V_tcp_do_tso VNET(tcp_do_tso)
#define V_tcp_ecn_maxretries VNET(tcp_ecn_maxretries)
+#define V_tcp_ecn_option VNET(tcp_ecn_option)
#define V_tcp_initcwnd_segments VNET(tcp_initcwnd_segments)
#define V_tcp_insecure_rst VNET(tcp_insecure_rst)
#define V_tcp_insecure_syn VNET(tcp_insecure_syn)

File Metadata

Mime Type
text/plain
Expires
Tue, Jan 28, 12:48 AM (10 h, 17 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
16228946
Default Alt Text
D36303.diff (22 KB)

Event Timeline