D30036.diff

Index: sys/netinet/cc/cc.h
===================================================================
--- sys/netinet/cc/cc.h
+++ sys/netinet/cc/cc.h
@@ -91,15 +91,20 @@
struct sctp_nets *sctp;
} ccvc;
uint16_t nsegs; /* # segments coalesced into current chain. */
+ uint8_t labc; /* Don't use system abc, use the passed-in value */
};
/* cc_var flags. */
#define CCF_ABC_SENTAWND 0x0001 /* ABC counted cwnd worth of bytes? */
#define CCF_CWND_LIMITED 0x0002 /* Are we currently cwnd limited? */
-#define CCF_UNUSED1 0x0004 /* unused */
+#define CCF_USE_LOCAL_ABC 0x0004 /* Don't use the system l_abc value */
#define CCF_ACKNOW 0x0008 /* Will this ack be sent now? */
#define CCF_IPHDR_CE 0x0010 /* Does this packet set CE bit? */
#define CCF_TCPHDR_CWR 0x0020 /* Does this packet set CWR bit? */
+#define CCF_MAX_CWND 0x0040 /* Have we reached maximum cwnd? */
+#define CCF_CHG_MAX_CWND 0x0080 /* Cubic max_cwnd changed, for K */
+#define CCF_USR_IWND 0x0100 /* User specified initial window */
+#define CCF_USR_IWND_INIT_NSEG 0x0200 /* Convert segs to bytes on conn init */
/* ACK types passed to the ack_received() hook. */
#define CC_ACK 0x0001 /* Regular in sequence ACK. */
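
Illustrative sketch (not part of the patch): the new labc field is only honored when CCF_USE_LOCAL_ABC is set, as the cc_newreno.c hunk below shows, so a stack that wants a per-call ABC limit would do roughly the following; the value 6 and the caller context are assumptions.

	/* Hypothetical caller inside a TCP stack's ACK processing. */
	tp->ccv->labc = 6;			/* override L instead of V_tcp_abc_l_var */
	tp->ccv->flags |= CCF_USE_LOCAL_ABC;
	CC_ALGO(tp)->ack_received(tp->ccv, CC_ACK);
	tp->ccv->flags &= ~CCF_USE_LOCAL_ABC;	/* back to the global sysctl value */
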
Index: sys/netinet/cc/cc_newreno.h
===================================================================
--- sys/netinet/cc/cc_newreno.h
+++ sys/netinet/cc/cc_newreno.h
@@ -31,12 +31,17 @@
#define CCALGONAME_NEWRENO "newreno"
+struct newreno {
+ uint32_t beta;
+ uint32_t beta_ecn;
+ uint32_t newreno_flags;
+};
+
struct cc_newreno_opts {
- int name;
+ int name;
uint32_t val;
};
-#define CC_NEWRENO_BETA 1
-#define CC_NEWRENO_BETA_ECN 2
-
+#define CC_NEWRENO_BETA 1 /* Beta for normal DUP-ACK/Sack recovery */
+#define CC_NEWRENO_BETA_ECN 2 /* ECN beta for ABE */
#endif /* _CC_NEWRENO_H */
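
For context, struct cc_newreno_opts is the payload of the generic TCP_CCALGOOPT socket option. A minimal userspace sketch, assuming the socket already runs newreno (e.g. selected via TCP_CONGESTION) and that the cc_newreno.h definitions are visible to the build; note from the cc_newreno.c hunk below that CC_NEWRENO_BETA_ECN is still rejected with EACCES unless ABE or the stack-set newreno_flags allow it.

	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <netinet/cc/cc_newreno.h>

	/* Set the ECN (ABE) backoff to 70% on an existing newreno connection. */
	static int
	set_newreno_beta_ecn(int fd)
	{
		struct cc_newreno_opts opt = {
			.name = CC_NEWRENO_BETA_ECN,
			.val = 70,
		};

		return (setsockopt(fd, IPPROTO_TCP, TCP_CCALGOOPT, &opt, sizeof(opt)));
	}
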
Index: sys/netinet/cc/cc_newreno.c
===================================================================
--- sys/netinet/cc/cc_newreno.c
+++ sys/netinet/cc/cc_newreno.c
@@ -86,8 +86,8 @@
static void newreno_post_recovery(struct cc_var *ccv);
static int newreno_ctl_output(struct cc_var *ccv, struct sockopt *sopt, void *buf);
-VNET_DEFINE_STATIC(uint32_t, newreno_beta) = 50;
-VNET_DEFINE_STATIC(uint32_t, newreno_beta_ecn) = 80;
+VNET_DEFINE(uint32_t, newreno_beta) = 50;
+VNET_DEFINE(uint32_t, newreno_beta_ecn) = 80;
#define V_newreno_beta VNET(newreno_beta)
#define V_newreno_beta_ecn VNET(newreno_beta_ecn)
@@ -101,11 +101,6 @@
.ctl_output = newreno_ctl_output,
};
-struct newreno {
- uint32_t beta;
- uint32_t beta_ecn;
-};
-
static inline struct newreno *
newreno_malloc(struct cc_var *ccv)
{
@@ -182,9 +177,15 @@
* XXXLAS: Find a way to signal SS after RTO that
* doesn't rely on tcpcb vars.
*/
+ uint16_t abc_val;
+
+ if (ccv->flags & CCF_USE_LOCAL_ABC)
+ abc_val = ccv->labc;
+ else
+ abc_val = V_tcp_abc_l_var;
if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max))
incr = min(ccv->bytes_this_ack,
- ccv->nsegs * V_tcp_abc_l_var *
+ ccv->nsegs * abc_val *
CCV(ccv, t_maxseg));
else
incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg));
@@ -237,11 +238,19 @@
u_int mss;
cwin = CCV(ccv, snd_cwnd);
- mss = tcp_maxseg(ccv->ccvc.tcp);
+ mss = tcp_fixed_maxseg(ccv->ccvc.tcp);
nreno = ccv->cc_data;
beta = (nreno == NULL) ? V_newreno_beta : nreno->beta;
beta_ecn = (nreno == NULL) ? V_newreno_beta_ecn : nreno->beta_ecn;
- if (V_cc_do_abe && type == CC_ECN)
+
+ /*
+ * Note that we only change the backoff for ECN if the
+ * global sysctl V_cc_do_abe is set <or> the stack itself
+ * has set a flag in our newreno_flags (due to pacing) telling
+ * us to use the lower valued back-off.
+ */
+ if (V_cc_do_abe ||
+ (nreno && (nreno->newreno_flags & CC_NEWRENO_BETA_ECN) && (type == CC_ECN)))
factor = beta_ecn;
else
factor = beta;
@@ -260,8 +269,7 @@
V_cc_do_abe && V_cc_abe_frlossreduce)) {
CCV(ccv, snd_ssthresh) =
((uint64_t)CCV(ccv, snd_ssthresh) *
- (uint64_t)beta) /
- (100ULL * (uint64_t)beta_ecn);
+ (uint64_t)beta) / (uint64_t)beta_ecn;
}
if (!IN_CONGRECOVERY(CCV(ccv, t_flags)))
CCV(ccv, snd_ssthresh) = cwin;
@@ -344,7 +352,7 @@
nreno->beta = opt->val;
break;
case CC_NEWRENO_BETA_ECN:
- if (!V_cc_do_abe)
+ if ((!V_cc_do_abe) && ((nreno->newreno_flags & CC_NEWRENO_BETA_ECN) == 0))
return (EACCES);
nreno->beta_ecn = opt->val;
break;
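
To make the beta arithmetic concrete (a worked sketch using the defaults shown above, not code from the patch): beta = 50 halves the window on loss, beta_ecn = 80 backs off to 80% on an ECN signal, and the corrected combined-reduction step now scales ssthresh by beta/beta_ecn instead of beta/(100 * beta_ecn).

	uint64_t cwin = 100000;		/* bytes outstanding at the congestion signal */
	uint32_t beta = 50, beta_ecn = 80;

	uint64_t ss_loss = (cwin * beta) / 100;		/* 50000: normal loss backoff */
	uint64_t ss_ecn = (cwin * beta_ecn) / 100;	/* 80000: ABE backoff on ECN */
	uint64_t ss_both = (ss_loss * beta) / beta_ecn;	/* 31250: fixed combined step */
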
Index: sys/netinet/tcp.h
===================================================================
--- sys/netinet/tcp.h
+++ sys/netinet/tcp.h
@@ -181,13 +181,24 @@
#define TCP_TXTLS_MODE 40 /* Transmit TLS mode */
#define TCP_RXTLS_ENABLE 41 /* TLS framing and encryption for receive */
#define TCP_RXTLS_MODE 42 /* Receive TLS mode */
+#define TCP_IWND_NB 43 /* Override initial window (units: bytes) */
+#define TCP_IWND_NSEG 44 /* Override initial window (units: MSS segs) */
+#define TCP_LOGID_CNT 46 /* get number of connections with the same ID */
+#define TCP_LOG_TAG 47 /* configure tag for grouping logs */
+#define TCP_USER_LOG 48 /* userspace log event */
#define TCP_CONGESTION 64 /* get/set congestion control algorithm */
#define TCP_CCALGOOPT 65 /* get/set cc algorithm specific options */
+#define TCP_MAXUNACKTIME 68 /* maximum time without making progress (sec) */
+#define TCP_MAXPEAKRATE 69 /* maximum peak rate allowed (kbps) */
+#define TCP_IDLE_REDUCE 70 /* Reduce cwnd on idle input */
#define TCP_REMOTE_UDP_ENCAPS_PORT 71 /* Enable TCP over UDP tunneling via the specified port */
#define TCP_DELACK 72 /* socket option for delayed ack */
#define TCP_FIN_IS_RST 73 /* A fin from the peer is treated has a RST */
#define TCP_LOG_LIMIT 74 /* Limit to number of records in tcp-log */
#define TCP_SHARED_CWND_ALLOWED 75 /* Use of a shared cwnd is allowed */
+#define TCP_PROC_ACCOUNTING 76 /* Do accounting on tcp cpu usage and counts */
+#define TCP_USE_CMP_ACKS 77 /* The transport can handle the Compressed mbuf acks */
+#define TCP_PERF_INFO 78 /* retrieve accounting counters */
#define TCP_KEEPINIT 128 /* N, time to establish connection */
#define TCP_KEEPIDLE 256 /* L,N,X start keeplives after this period */
#define TCP_KEEPINTVL 512 /* L,N interval between keepalives */
@@ -201,7 +212,7 @@
#define TCP_RACK_MBUF_QUEUE 1050 /* Do we allow mbuf queuing if supported */
#define TCP_RACK_PROP 1051 /* RACK proportional rate reduction (bool) */
#define TCP_RACK_TLP_REDUCE 1052 /* RACK TLP cwnd reduction (bool) */
-#define TCP_RACK_PACE_REDUCE 1053 /* RACK Pacing reduction factor (divisor) */
+#define TCP_RACK_PACE_REDUCE 1053 /* RACK Pacingv reduction factor (divisor) */
#define TCP_RACK_PACE_MAX_SEG 1054 /* Max TSO size we will send */
#define TCP_RACK_PACE_ALWAYS 1055 /* Use the always pace method */
#define TCP_RACK_PROP_RATE 1056 /* The proportional reduction rate */
@@ -284,6 +295,16 @@
#define TCP_RACK_PACE_TO_FILL 1127 /* If we are not in recovery, always pace to fill the cwnd in 1 RTT */
#define TCP_SHARED_CWND_TIME_LIMIT 1128 /* we should limit to low time values the scwnd life */
#define TCP_RACK_PROFILE 1129 /* Select a profile that sets multiple options */
+#define TCP_HDWR_RATE_CAP 1130 /* Allow hardware rates to cap pacing rate */
+#define TCP_PACING_RATE_CAP 1131 /* Highest rate allowed in pacing in bytes per second (uint64_t) */
+#define TCP_HDWR_UP_ONLY 1132 /* Allow the pacing rate to climb but not descend (with the exception of fill-cw) */
+#define TCP_RACK_ABC_VAL 1133 /* Set a local ABC value different than the system default */
+#define TCP_REC_ABC_VAL 1134 /* Do we use the ABC value for recovery or the override one from sysctl */
+#define TCP_RACK_MEASURE_CNT 1135 /* How many measurements are required in GP pacing */
+#define TCP_DEFER_OPTIONS 1136 /* Defer options until the proper number of measurements occur, does not defer TCP_RACK_MEASURE_CNT */
+#define TCP_FAST_RSM_HACK 1137 /* Do we do the broken thing where we don't twiddle the TLP bits properly in fast_rsm_output? */
+#define TCP_RACK_PACING_BETA 1138 /* Changing the beta for pacing */
+#define TCP_RACK_PACING_BETA_ECN 1139 /* Changing the beta for ecn with pacing */
/* Start of reserved space for third-party user-settable options. */
#define TCP_VENDOR SO_VENDOR
@@ -295,6 +316,7 @@
#define TCPI_OPT_WSCALE 0x04
#define TCPI_OPT_ECN 0x08
#define TCPI_OPT_TOE 0x10
+#define TCPI_OPT_TFO 0x20
/* Maximum length of log ID. */
#define TCP_LOG_ID_LEN 64
Index: sys/netinet/tcp_accounting.h
===================================================================
--- /dev/null
+++ sys/netinet/tcp_accounting.h
@@ -0,0 +1,39 @@
+#ifndef __tcp_accounting_h__
+#define __tcp_accounting_h__
+/*
+ * Return values from tcp_do_ack_accounting
+ * and indexes into the tcp_proc_time[]
+ * array.
+ */
+#define ACK_BEHIND 0
+#define ACK_SACK 1
+#define ACK_CUMACK 2
+#define ACK_CUMACK_SACK 3
+#define ACK_DUPACK 4
+#define ACK_RWND 5
+/* Added values for tracking output too */
+#define SND_BLOCKED 6
+#define SND_LIMITED 7
+#define SND_OUT_DATA 8
+#define SND_OUT_ACK 9
+#define SND_OUT_FAIL 10
+/* We also count in the counts array two added (MSS sent and ACKS In) */
+#define CNT_OF_MSS_OUT 11
+#define CNT_OF_ACKS_IN 12
+
+/* for the tcpcb we add two more cycle counters */
+#define CYC_HANDLE_MAP 11
+#define CYC_HANDLE_ACK 12
+
+/* Should the tp->xxx arrays be alloc'ed? */
+/* #define TCP_NUM_PROC_COUNTERS 11 defined in tcp_var.h */
+/* #define TCP_NUM_CNT_COUNTERS 13 defined in tcp_var.h */
+
+#ifdef _KERNEL
+#ifdef TCP_ACCOUNTING
+extern counter_u64_t tcp_cnt_counters[TCP_NUM_CNT_COUNTERS];
+extern counter_u64_t tcp_proc_time[TCP_NUM_PROC_COUNTERS];
+#endif
+#endif
+
+#endif
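
A hedged sketch of how a stack built with the TCP_ACCOUNTING option is expected to use these indices (the classification below is illustrative only; the real stack also distinguishes SACK and window-update cases):

	#ifdef TCP_ACCOUNTING
		/* Illustrative classification of one inbound ACK. */
		counter_u64_add(tcp_cnt_counters[CNT_OF_ACKS_IN], 1);
		if (SEQ_GT(th->th_ack, tp->snd_una))
			counter_u64_add(tcp_cnt_counters[ACK_CUMACK], 1);
		else if (SEQ_LT(th->th_ack, tp->snd_una))
			counter_u64_add(tcp_cnt_counters[ACK_BEHIND], 1);
		else
			counter_u64_add(tcp_cnt_counters[ACK_DUPACK], 1);
	#endif
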
Index: sys/netinet/tcp_input.c
===================================================================
--- sys/netinet/tcp_input.c
+++ sys/netinet/tcp_input.c
@@ -526,7 +526,7 @@
(V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))
void inline
-cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos)
+cc_ecnpkt_handler_flags(struct tcpcb *tp, uint16_t flags, uint8_t iptos)
{
INP_WLOCK_ASSERT(tp->t_inpcb);
@@ -544,7 +544,7 @@
break;
}
- if (th->th_flags & TH_CWR)
+ if (flags & TH_CWR)
tp->ccv->flags |= CCF_TCPHDR_CWR;
else
tp->ccv->flags &= ~CCF_TCPHDR_CWR;
@@ -558,6 +558,12 @@
}
}
+void inline
+cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos)
+{
+ cc_ecnpkt_handler_flags(tp, th->th_flags, iptos);
+}
+
/*
* TCP input handling is split into multiple parts:
* tcp6_input is a thin wrapper around tcp_input for the extended
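
The split exists so that callers which no longer hold a struct tcphdr (for example, paths working from pre-parsed or compressed ACK entries) can still drive the ECN handling; an illustrative call site, with the local variables assumed rather than taken from this patch:

	/* Flags and TOS byte were saved when the header was parsed. */
	uint16_t thflags = TH_ACK | TH_CWR;
	uint8_t iptos = IPTOS_ECN_CE;

	cc_ecnpkt_handler_flags(tp, thflags, iptos);
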
Index: sys/netinet/tcp_log_buf.h
===================================================================
--- sys/netinet/tcp_log_buf.h
+++ sys/netinet/tcp_log_buf.h
@@ -174,7 +174,7 @@
TCP_LOG_IN = 1, /* Incoming packet 1 */
TCP_LOG_OUT, /* Transmit (without other event) 2 */
TCP_LOG_RTO, /* Retransmit timeout 3 */
- TCP_LOG_TF_ACK, /* Transmit due to TF_ACK 4 */
+ TCP_LOG_SB_WAKE, /* Awaken socket buffer 4 */
TCP_LOG_BAD_RETRAN, /* Detected bad retransmission 5 */
TCP_LOG_PRR, /* Doing PRR 6 */
TCP_LOG_REORDER, /* Detected reorder 7 */
@@ -200,7 +200,7 @@
BBR_LOG_DOSEG_DONE, /* hpts do_segment completes 27 */
BBR_LOG_EXIT_GAIN, /* hpts do_segment completes 28 */
BBR_LOG_THRESH_CALC, /* Doing threshold calculation 29 */
- BBR_LOG_EXTRACWNDGAIN, /* Removed 30 */
+ TCP_LOG_MAPCHG, /* Map Changes to the sendmap 30 */
TCP_LOG_USERSEND, /* User level sends data 31 */
BBR_RSM_CLEARED, /* RSM cleared of ACK flags 32 */
BBR_LOG_STATE_TARGET, /* Log of target at state 33 */
@@ -232,7 +232,9 @@
TCP_LOG_USER_EVENT, /* User space event data 59 */
TCP_LOG_SENDFILE, /* sendfile() logging for TCP connections 60 */
TCP_LOG_HTTP_T, /* logging of http request tracking 61 */
- TCP_LOG_END /* End (keep at end) 62 */
+ TCP_LOG_ACCOUNTING, /* Log of TCP Accounting data 62 */
+ TCP_LOG_FSB, /* FSB information 63 */
+ TCP_LOG_END /* End (keep at end) 64 */
};
enum tcp_log_states {
Index: sys/netinet/tcp_ratelimit.h
===================================================================
--- sys/netinet/tcp_ratelimit.h
+++ sys/netinet/tcp_ratelimit.h
@@ -43,7 +43,9 @@
struct tcp_hwrate_limit_table {
const struct tcp_rate_set *ptbl; /* Pointer to parent table */
struct m_snd_tag *tag; /* Send tag if needed (chelsio) */
- uint64_t rate; /* Rate we get in Bytes per second (Bps) */
+ long rate; /* Rate we get in Bytes per second (Bps) */
+ long using; /* How many flows are using this hdwr rate. */
+ long rs_num_enobufs;
uint32_t time_between; /* Time-Gap between packets at this rate */
uint32_t flags;
};
Index: sys/netinet/tcp_ratelimit.c
===================================================================
--- sys/netinet/tcp_ratelimit.c
+++ sys/netinet/tcp_ratelimit.c
@@ -367,11 +367,22 @@
OID_AUTO, "pacetime", CTLFLAG_RD,
&rs->rs_rlt[i].time_between, 0,
"Time hardware inserts between 1500 byte sends");
- SYSCTL_ADD_U64(&rs->sysctl_ctx,
+ SYSCTL_ADD_LONG(&rs->sysctl_ctx,
SYSCTL_CHILDREN(rl_rate_num),
OID_AUTO, "rate", CTLFLAG_RD,
- &rs->rs_rlt[i].rate, 0,
+ &rs->rs_rlt[i].rate,
"Rate in bytes per second");
+ SYSCTL_ADD_LONG(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_rate_num),
+ OID_AUTO, "using", CTLFLAG_RD,
+ &rs->rs_rlt[i].using,
+ "Number of flows using");
+ SYSCTL_ADD_LONG(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_rate_num),
+ OID_AUTO, "enobufs", CTLFLAG_RD,
+ &rs->rs_rlt[i].rs_num_enobufs,
+ "Number of enobufs logged on this rate");
+
}
}
#endif
@@ -667,6 +678,8 @@
*/
rs->rs_rlt[i].ptbl = rs;
rs->rs_rlt[i].tag = NULL;
+ rs->rs_rlt[i].using = 0;
+ rs->rs_rlt[i].rs_num_enobufs = 0;
/*
* Calculate the time between.
*/
@@ -1063,16 +1076,28 @@
static void
rl_increment_using(const struct tcp_hwrate_limit_table *rte)
{
+ struct tcp_hwrate_limit_table *decon_rte;
+
+ decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
+ atomic_add_long(&decon_rte->using, 1);
}
static void
rl_decrement_using(const struct tcp_hwrate_limit_table *rte)
{
+ struct tcp_hwrate_limit_table *decon_rte;
+
+ decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
+ atomic_subtract_long(&decon_rte->using, 1);
}
void
tcp_rl_log_enobuf(const struct tcp_hwrate_limit_table *rte)
{
+ struct tcp_hwrate_limit_table *decon_rte;
+
+ decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
+ atomic_add_long(&decon_rte->rs_num_enobufs, 1);
}
/*
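
The __DECONST dance above is needed because consumers only ever hold the rate rows through const pointers; a hedged sketch of the send-path usage (the error handling and the crte variable are assumptions, not part of this patch):

	/* crte is the const rate row bound to this connection's send tag. */
	if (error == ENOBUFS && crte != NULL)
		tcp_rl_log_enobuf(crte);	/* atomically bumps rs_num_enobufs */
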
Index: sys/netinet/tcp_sack.c
===================================================================
--- sys/netinet/tcp_sack.c
+++ sys/netinet/tcp_sack.c
@@ -156,6 +156,17 @@
&VNET_NAME(tcp_sack_globalholes), 0,
"Global number of TCP SACK holes currently allocated");
+int
+tcp_dsack_block_exists(struct tcpcb *tp)
+{
+ /* Return true if a DSACK block exists */
+ if (tp->rcv_numsacks == 0)
+ return (0);
+ if (SEQ_LEQ(tp->sackblks[0].end, tp->rcv_nxt))
+ return(1);
+ return (0);
+}
+
/*
* This function will find overlaps with the currently stored sackblocks
* and add any overlap as a dsack block upfront
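
A hedged example of the sort of check the new helper enables in a caller (forcing an immediate ACK here is illustrative, not something this patch mandates):

	/* If the peer should learn about a duplicate via DSACK, don't delay the ACK. */
	if (tcp_dsack_block_exists(tp))
		tp->t_flags |= TF_ACKNOW;
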
Index: sys/netinet/tcp_stacks/bbr.c
===================================================================
--- sys/netinet/tcp_stacks/bbr.c
+++ sys/netinet/tcp_stacks/bbr.c
@@ -3930,6 +3930,9 @@
struct tcp_bbr *bbr;
INP_WLOCK_ASSERT(tp->t_inpcb);
+#ifdef STATS
+ stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type);
+#endif
bbr = (struct tcp_bbr *)tp->t_fb_ptr;
switch (type) {
case CC_NDUPACK:
@@ -4403,6 +4406,7 @@
nrsm->r_start = start;
nrsm->r_end = rsm->r_end;
nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
+ nrsm->r_rtt_not_allowed = rsm->r_rtt_not_allowed;
nrsm->r_flags = rsm->r_flags;
/* We don't transfer forward the SYN flag */
nrsm->r_flags &= ~BBR_HAS_SYN;
@@ -6429,65 +6433,6 @@
bbr->r_ctl.bbr_smallest_srtt_this_state = rtt;
}
-static void
-bbr_earlier_retran(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm,
- uint32_t t, uint32_t cts, int ack_type)
-{
- /*
- * For this RSM, we acknowledged the data from a previous
- * transmission, not the last one we made. This means we did a false
- * retransmit.
- */
- if (rsm->r_flags & BBR_HAS_FIN) {
- /*
- * The sending of the FIN often is multiple sent when we
- * have everything outstanding ack'd. We ignore this case
- * since its over now.
- */
- return;
- }
- if (rsm->r_flags & BBR_TLP) {
- /*
- * We expect TLP's to have this occur often
- */
- bbr->rc_tlp_rtx_out = 0;
- return;
- }
- if (ack_type != BBR_CUM_ACKED) {
- /*
- * If it was not a cum-ack we
- * don't really know for sure since
- * the timestamp could be from some
- * other transmission.
- */
- return;
- }
-
- if (rsm->r_flags & BBR_WAS_SACKPASS) {
- /*
- * We retransmitted based on a sack and the earlier
- * retransmission ack'd it - re-ordering is occuring.
- */
- BBR_STAT_INC(bbr_reorder_seen);
- bbr->r_ctl.rc_reorder_ts = cts;
- }
- /* Back down the loss count */
- if (rsm->r_flags & BBR_MARKED_LOST) {
- bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start;
- bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start;
- rsm->r_flags &= ~BBR_MARKED_LOST;
- if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost))
- /* LT sampling also needs adjustment */
- bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost;
- }
- /***** RRS HERE ************************/
- /* Do we need to do this??? */
- /* bbr_reset_lt_bw_sampling(bbr, cts); */
- /***** RRS HERE ************************/
- BBR_STAT_INC(bbr_badfr);
- BBR_STAT_ADD(bbr_badfr_bytes, (rsm->r_end - rsm->r_start));
-}
-
static void
bbr_set_reduced_rtt(struct tcp_bbr *bbr, uint32_t cts, uint32_t line)
{
@@ -6869,6 +6814,10 @@
/* Already done */
return (0);
}
+ if (rsm->r_rtt_not_allowed) {
+ /* Not allowed */
+ return (0);
+ }
if (rsm->r_rtr_cnt == 1) {
/*
* Only one transmit. Hopefully the normal case.
@@ -6926,7 +6875,7 @@
rsm->r_tim_lastsent[i], ack_type, to);
if ((i + 1) < rsm->r_rtr_cnt) {
/* Likely */
- bbr_earlier_retran(tp, bbr, rsm, t, cts, ack_type);
+ return (0);
} else if (rsm->r_flags & BBR_TLP) {
bbr->rc_tlp_rtx_out = 0;
}
@@ -6974,7 +6923,7 @@
t = 1;
bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, uts, BBR_RTT_BY_EARLIER_RET,
rsm->r_tim_lastsent[i], ack_type, to);
- bbr_earlier_retran(tp, bbr, rsm, t, cts, ack_type);
+ return (0);
} else {
/*
* Too many prior transmissions, just
@@ -10207,7 +10156,7 @@
tp->t_fb_ptr = NULL;
return (ENOMEM);
}
- rsm->r_flags = BBR_OVERMAX;
+ rsm->r_rtt_not_allowed = 1;
rsm->r_tim_lastsent[0] = cts;
rsm->r_rtr_cnt = 1;
rsm->r_rtr_bytes = 0;
@@ -10320,6 +10269,10 @@
counter_u64_add(bbr_flows_whdwr_pacing, -1);
else
counter_u64_add(bbr_flows_nohdwr_pacing, -1);
+ if (bbr->r_ctl.crte != NULL) {
+ tcp_rel_pacing_rate(bbr->r_ctl.crte, tp);
+ bbr->r_ctl.crte = NULL;
+ }
rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
while (rsm) {
TAILQ_REMOVE(&bbr->r_ctl.rc_map, rsm, r_next);
@@ -13463,15 +13416,6 @@
th->th_seq = htonl(tp->snd_max);
bbr_seq = tp->snd_max;
}
- } else if (flags & TH_RST) {
- /*
- * For a Reset send the last cum ack in sequence
- * (this like any other choice may still generate a
- * challenge ack, if a ack-update packet is in
- * flight).
- */
- th->th_seq = htonl(tp->snd_una);
- bbr_seq = tp->snd_una;
} else {
/*
* len == 0 and not persist we use snd_max, sending
@@ -14536,9 +14480,9 @@
} else {
bbr->bbr_hdw_pace_ena = 0;
#ifdef RATELIMIT
- if (bbr->bbr_hdrw_pacing) {
- bbr->bbr_hdrw_pacing = 0;
- in_pcbdetach_txrtlmt(bbr->rc_inp);
+ if (bbr->r_ctl.crte != NULL) {
+ tcp_rel_pacing_rate(bbr->r_ctl.crte, tp);
+ bbr->r_ctl.crte = NULL;
}
#endif
}
Index: sys/netinet/tcp_stacks/rack.c
===================================================================
--- sys/netinet/tcp_stacks/rack.c
+++ sys/netinet/tcp_stacks/rack.c
@@ -63,7 +63,10 @@
#include <sys/kthread.h>
#include <sys/kern_prefetch.h>
#include <sys/protosw.h>
-
+#ifdef TCP_ACCOUNTING
+#include <sys/sched.h>
+#include <machine/cpu.h>
+#endif
#include <vm/uma.h>
#include <net/route.h>
@@ -91,8 +94,10 @@
#include <netinet/tcp_var.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_ratelimit.h>
+#include <netinet/tcp_accounting.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
+#include <netinet/cc/cc_newreno.h>
#include <netinet/tcp_fastopen.h>
#include <netinet/tcp_lro.h>
#ifdef NETFLIX_SHARED_CWND
@@ -133,6 +138,15 @@
#define TICKS2SBT(__t) (tick_sbt * ((sbintime_t)(__t)))
#endif
+VNET_DECLARE(uint32_t, newreno_beta);
+VNET_DECLARE(uint32_t, newreno_beta_ecn);
+#define V_newreno_beta VNET(newreno_beta)
+#define V_newreno_beta_ecn VNET(newreno_beta_ecn)
+
+
+MALLOC_DEFINE(M_TCPFSB, "tcp_fsb", "TCP fast send block");
+MALLOC_DEFINE(M_TCPDO, "tcp_do", "TCP deferred options");
+
struct sysctl_ctx_list rack_sysctl_ctx;
struct sysctl_oid *rack_sysctl_root;
@@ -175,30 +189,51 @@
static int32_t rack_tlp_limit = 2; /* No more than 2 TLPs w-out new data */
static int32_t rack_tlp_use_greater = 1;
static int32_t rack_reorder_thresh = 2;
-static int32_t rack_reorder_fade = 60000; /* 0 - never fade, def 60,000
+static int32_t rack_reorder_fade = 60000000; /* 0 - never fade, def 60,000,000
* - 60 seconds */
+static uint8_t rack_req_measurements = 1;
/* Attack threshold detections */
static uint32_t rack_highest_sack_thresh_seen = 0;
static uint32_t rack_highest_move_thresh_seen = 0;
-
-static int32_t rack_pkt_delay = 1;
-static int32_t rack_early_recovery = 1;
+static int32_t rack_enable_hw_pacing = 0; /* Due to CCSP keep it off by default */
+static int32_t rack_hw_pace_extra_slots = 2; /* 2 extra MSS time betweens */
+static int32_t rack_hw_rate_caps = 1; /* 1; */
+static int32_t rack_hw_rate_min = 0; /* 1500000;*/
+static int32_t rack_hw_rate_to_low = 0; /* 1200000; */
+static int32_t rack_hw_up_only = 1;
+static int32_t rack_stats_gets_ms_rtt = 1;
+static int32_t rack_prr_addbackmax = 2;
+
+static int32_t rack_pkt_delay = 1000;
static int32_t rack_send_a_lot_in_prr = 1;
-static int32_t rack_min_to = 1; /* Number of ms minimum timeout */
+static int32_t rack_min_to = 1000; /* Number of microsecond min timeout */
static int32_t rack_verbose_logging = 0;
static int32_t rack_ignore_data_after_close = 1;
-static int32_t rack_enable_shared_cwnd = 0;
+static int32_t rack_enable_shared_cwnd = 1;
+static int32_t rack_use_cmp_acks = 1;
+static int32_t rack_use_fsb = 1;
+static int32_t rack_use_rfo = 1;
+static int32_t rack_use_rsm_rfo = 1;
+static int32_t rack_max_abc_post_recovery = 2;
+static int32_t rack_client_low_buf = 0;
+#ifdef TCP_ACCOUNTING
+static int32_t rack_tcp_accounting = 0;
+#endif
static int32_t rack_limits_scwnd = 1;
static int32_t rack_enable_mqueue_for_nonpaced = 0;
static int32_t rack_disable_prr = 0;
static int32_t use_rack_rr = 1;
static int32_t rack_non_rxt_use_cr = 0; /* does a non-rxt in recovery use the configured rate (ss/ca)? */
-static int32_t rack_persist_min = 250; /* 250ms */
-static int32_t rack_persist_max = 2000; /* 2 Second */
-static int32_t rack_sack_not_required = 0; /* set to one to allow non-sack to use rack */
-static int32_t rack_default_init_window = 0; /* Use system default */
+static int32_t rack_persist_min = 250000; /* 250usec */
+static int32_t rack_persist_max = 2000000; /* 2 Second in usec's */
+static int32_t rack_sack_not_required = 1; /* set to one to allow non-sack to use rack */
+static int32_t rack_default_init_window = 0; /* Use system default */
static int32_t rack_limit_time_with_srtt = 0;
-static int32_t rack_hw_pace_adjust = 0;
+static int32_t rack_autosndbuf_inc = 20; /* In percentage form */
+static int32_t rack_enobuf_hw_boost_mult = 2; /* How many times the hw rate we boost slot using time_between */
+static int32_t rack_enobuf_hw_max = 12000; /* 12 ms in usecs */
+static int32_t rack_enobuf_hw_min = 10000; /* 10 ms in usecs */
+static int32_t rack_hw_rwnd_factor = 2; /* How many max_segs the rwnd must be before we hold off sending */
/*
* Currently regular tcp has a rto_min of 30ms
* the backoff goes 12 times so that ends up
@@ -209,23 +244,21 @@
static uint32_t rack_goal_bdp = 2;
static uint32_t rack_min_srtts = 1;
static uint32_t rack_min_measure_usec = 0;
-static int32_t rack_tlp_min = 10;
-static int32_t rack_rto_min = 30; /* 30ms same as main freebsd */
-static int32_t rack_rto_max = 4000; /* 4 seconds */
+static int32_t rack_tlp_min = 10000; /* 10ms */
+static int32_t rack_rto_min = 30000; /* 30,000 usec same as main freebsd */
+static int32_t rack_rto_max = 4000000; /* 4 seconds in usec's */
static const int32_t rack_free_cache = 2;
static int32_t rack_hptsi_segments = 40;
static int32_t rack_rate_sample_method = USE_RTT_LOW;
static int32_t rack_pace_every_seg = 0;
-static int32_t rack_delayed_ack_time = 200; /* 200ms */
+static int32_t rack_delayed_ack_time = 40000; /* 40ms in usecs */
static int32_t rack_slot_reduction = 4;
static int32_t rack_wma_divisor = 8; /* For WMA calculation */
static int32_t rack_cwnd_block_ends_measure = 0;
static int32_t rack_rwnd_block_ends_measure = 0;
+static int32_t rack_def_profile = 0;
static int32_t rack_lower_cwnd_at_tlp = 0;
-static int32_t rack_use_proportional_reduce = 0;
-static int32_t rack_proportional_rate = 10;
-static int32_t rack_tlp_max_resend = 2;
static int32_t rack_limited_retran = 0;
static int32_t rack_always_send_oldest = 0;
static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;
@@ -247,13 +280,13 @@
static uint32_t rack_probertt_use_min_rtt_exit = 0;
static uint32_t rack_probe_rtt_sets_cwnd = 0;
static uint32_t rack_probe_rtt_safety_val = 2000000; /* No more than 2 sec in probe-rtt */
-static uint32_t rack_time_between_probertt = 9600000; /* 9.6 sec in us */
+static uint32_t rack_time_between_probertt = 9600000; /* 9.6 sec in usecs */
static uint32_t rack_probertt_gpsrtt_cnt_mul = 0; /* How many srtt periods does probe-rtt last top fraction */
-static uint32_t rack_probertt_gpsrtt_cnt_div = 0; /* How many srtt periods does probe-rtt last bottom fraction */
-static uint32_t rack_min_probertt_hold = 200000; /* Equal to delayed ack time */
+static uint32_t rack_probertt_gpsrtt_cnt_div = 0; /* How many srtt periods does probe-rtt last bottom fraction */
+static uint32_t rack_min_probertt_hold = 40000; /* Equal to delayed ack time */
static uint32_t rack_probertt_filter_life = 10000000;
static uint32_t rack_probertt_lower_within = 10;
-static uint32_t rack_min_rtt_movement = 250; /* Must move at least 250 useconds to count as a lowering */
+static uint32_t rack_min_rtt_movement = 250000; /* Must move at least 250ms (in microseconds) to count as a lowering */
static int32_t rack_pace_one_seg = 0; /* Shall we pace for less than 1.4Meg 1MSS at a time */
static int32_t rack_probertt_clear_is = 1;
static int32_t rack_max_drain_hbp = 1; /* Extra drain times gpsrtt for highly buffered paths */
@@ -264,7 +297,7 @@
/* Timely information */
/* Combine these two gives the range of 'no change' to bw */
-/* ie the up/down provide the upper and lower bound */
+/* ie the up/down provide the upper and lower bound */
static int32_t rack_gp_per_bw_mul_up = 2; /* 2% */
static int32_t rack_gp_per_bw_mul_down = 4; /* 4% */
static int32_t rack_gp_rtt_maxmul = 3; /* 3 x maxmin */
@@ -286,6 +319,7 @@
static int32_t rack_timely_no_stopping = 0;
static int32_t rack_down_raise_thresh = 100;
static int32_t rack_req_segs = 1;
+static uint64_t rack_bw_rate_cap = 0;
/* Weird delayed ack mode */
static int32_t rack_use_imac_dack = 0;
@@ -301,9 +335,14 @@
counter_u64_t rack_calc_zero;
counter_u64_t rack_calc_nonzero;
counter_u64_t rack_saw_enobuf;
+counter_u64_t rack_saw_enobuf_hw;
counter_u64_t rack_saw_enetunreach;
counter_u64_t rack_per_timer_hole;
-
+counter_u64_t rack_large_ackcmp;
+counter_u64_t rack_small_ackcmp;
+#ifdef INVARIANTS
+counter_u64_t rack_adjust_map_bw;
+#endif
/* Tail loss probe counters */
counter_u64_t rack_tlp_tot;
counter_u64_t rack_tlp_newdata;
@@ -313,6 +352,7 @@
counter_u64_t rack_to_tot;
counter_u64_t rack_to_arm_rack;
counter_u64_t rack_to_arm_tlp;
+counter_u64_t rack_hot_alloc;
counter_u64_t rack_to_alloc;
counter_u64_t rack_to_alloc_hard;
counter_u64_t rack_to_alloc_emerg;
@@ -320,6 +360,17 @@
counter_u64_t rack_alloc_limited_conns;
counter_u64_t rack_split_limited;
+#define MAX_NUM_OF_CNTS 13
+counter_u64_t rack_proc_comp_ack[MAX_NUM_OF_CNTS];
+counter_u64_t rack_multi_single_eq;
+counter_u64_t rack_proc_non_comp_ack;
+
+counter_u64_t rack_fto_send;
+counter_u64_t rack_fto_rsm_send;
+counter_u64_t rack_nfto_resend;
+counter_u64_t rack_non_fto_send;
+counter_u64_t rack_extended_rfo;
+
counter_u64_t rack_sack_proc_all;
counter_u64_t rack_sack_proc_short;
counter_u64_t rack_sack_proc_restart;
@@ -342,6 +393,10 @@
counter_u64_t rack_collapsed_win;
counter_u64_t rack_tlp_does_nada;
counter_u64_t rack_try_scwnd;
+counter_u64_t rack_hw_pace_init_fail;
+counter_u64_t rack_hw_pace_lost;
+counter_u64_t rack_sbsndptr_right;
+counter_u64_t rack_sbsndptr_wrong;
/* Temp CPU counters */
counter_u64_t rack_find_high;
@@ -350,6 +405,17 @@
counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];
+
+#define RACK_REXMTVAL(tp) max(rack_rto_min, ((tp)->t_srtt + ((tp)->t_rttvar << 2)))
+
+#define RACK_TCPT_RANGESET(tv, value, tvmin, tvmax) do { \
+ (tv) = (value) + TICKS_2_USEC(tcp_rexmit_slop); \
+ if ((u_long)(tv) < (u_long)(tvmin)) \
+ (tv) = (tvmin); \
+ if ((u_long)(tv) > (u_long)(tvmax)) \
+ (tv) = (tvmax); \
+} while (0)
+
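
An illustrative use of the two helper macros above when arming a retransmit timer; the variable name is assumed, and note that in this stack t_srtt/t_rttvar are now kept in microseconds:

	uint32_t to;

	/* to = RTO estimate plus the rexmit slop, clamped to [rack_rto_min, rack_rto_max]. */
	RACK_TCPT_RANGESET(to, RACK_REXMTVAL(tp), rack_rto_min, rack_rto_max);
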
static void
rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line);
@@ -363,7 +429,7 @@
uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static void
rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack,
- struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery);
+ uint32_t th_ack, uint16_t nsegs, uint16_t type, int32_t recovery);
static struct rack_sendmap *rack_alloc(struct tcp_rack *rack);
static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack,
uint8_t limit_type);
@@ -371,24 +437,21 @@
rack_check_recovery_mode(struct tcpcb *tp,
uint32_t tsused);
static void
-rack_cong_signal(struct tcpcb *tp, struct tcphdr *th,
- uint32_t type);
+rack_cong_signal(struct tcpcb *tp,
+ uint32_t type, uint32_t ack);
static void rack_counter_destroy(void);
static int
rack_ctloutput(struct socket *so, struct sockopt *sopt,
struct inpcb *inp, struct tcpcb *tp);
static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
static void
-rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line);
+rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override);
static void
rack_do_segment(struct mbuf *m, struct tcphdr *th,
struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
uint8_t iptos);
static void rack_dtor(void *mem, int32_t size, void *arg);
static void
-rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
- uint32_t t, uint32_t cts);
-static void
rack_log_alt_to_to_cancel(struct tcp_rack *rack,
uint32_t flex1, uint32_t flex2,
uint32_t flex3, uint32_t flex4,
@@ -416,11 +479,12 @@
static void rack_init_sysctls(void);
static void
rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
- struct tcphdr *th);
+ struct tcphdr *th, int entered_rec, int dup_ack_struck);
static void
rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
- uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
- uint8_t pass, struct rack_sendmap *hintrsm, uint32_t us_cts);
+ uint32_t seq_out, uint8_t th_flags, int32_t err, uint64_t ts,
+ struct rack_sendmap *hintrsm, uint16_t add_flags, struct mbuf *s_mb, uint32_t s_moff);
+
static void
rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
struct rack_sendmap *rsm);
@@ -431,7 +495,7 @@
rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
uint32_t cts, int *moved_two);
-static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th);
+static void rack_post_recovery(struct tcpcb *tp, uint32_t th_seq);
static void rack_remxt_tmr(struct tcpcb *tp);
static int
rack_set_sockopt(struct socket *so, struct sockopt *sopt,
@@ -446,10 +510,10 @@
static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type);
static uint32_t
rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
- struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp);
+ struct rack_sendmap *rsm, uint64_t ts, int32_t * lenp, uint16_t add_flag);
static void
rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
- struct rack_sendmap *rsm, uint32_t ts);
+ struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag);
static int
rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack);
@@ -496,15 +560,182 @@
static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt,
uint32_t len, uint32_t us_tim, int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt);
static void
- tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th);
+ tcp_rack_partialack(struct tcpcb *tp);
+static int
+rack_set_profile(struct tcp_rack *rack, int prof);
+static void
+rack_apply_deferred_options(struct tcp_rack *rack);
int32_t rack_clear_counter=0;
+static void
+rack_set_cc_pacing(struct tcp_rack *rack)
+{
+ struct sockopt sopt;
+ struct cc_newreno_opts opt;
+ struct newreno old, *ptr;
+ struct tcpcb *tp;
+ int error;
+
+ if (rack->rc_pacing_cc_set)
+ return;
+
+ tp = rack->rc_tp;
+ if (tp->cc_algo == NULL) {
+ /* Tcb is leaving */
+ printf("No cc algorithm?\n");
+ return;
+ }
+ rack->rc_pacing_cc_set = 1;
+ if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) {
+ /* Not new-reno we can't play games with beta! */
+ printf("cc_algo:%s is not NEWRENO:%s\n",
+ tp->cc_algo->name, CCALGONAME_NEWRENO);
+ goto out;
+ }
+ ptr = ((struct newreno *)tp->ccv->cc_data);
+ if (CC_ALGO(tp)->ctl_output == NULL) {
+ /* Huh, why does new_reno no longer have a set function? */
+ printf("no ctl_output for algo:%s\n", tp->cc_algo->name);
+ goto out;
+ }
+ if (ptr == NULL) {
+ /* Just the default values */
+ old.beta = V_newreno_beta_ecn;
+ old.beta_ecn = V_newreno_beta_ecn;
+ old.newreno_flags = 0;
+ } else {
+ old.beta = ptr->beta;
+ old.beta_ecn = ptr->beta_ecn;
+ old.newreno_flags = ptr->newreno_flags;
+ }
+ sopt.sopt_valsize = sizeof(struct cc_newreno_opts);
+ sopt.sopt_dir = SOPT_SET;
+ opt.name = CC_NEWRENO_BETA;
+ opt.val = rack->r_ctl.rc_saved_beta.beta;
+ error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
+ if (error) {
+ printf("Error returned by ctl_output %d\n", error);
+ goto out;
+ }
+ /*
+ * Hack alert we need to set in our newreno_flags
+ * so that Abe behavior is also applied.
+ */
+ ((struct newreno *)tp->ccv->cc_data)->newreno_flags = CC_NEWRENO_BETA_ECN;
+ opt.name = CC_NEWRENO_BETA_ECN;
+ opt.val = rack->r_ctl.rc_saved_beta.beta_ecn;
+ error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
+ if (error) {
+ printf("Error returned by ctl_output %d\n", error);
+ goto out;
+ }
+ /* Save off the original values for restoral */
+ memcpy(&rack->r_ctl.rc_saved_beta, &old, sizeof(struct newreno));
+out:
+ if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ ptr = ((struct newreno *)tp->ccv->cc_data);
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ if (ptr) {
+ log.u_bbr.flex1 = ptr->beta;
+ log.u_bbr.flex2 = ptr->beta_ecn;
+ log.u_bbr.flex3 = ptr->newreno_flags;
+ }
+ log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta.beta;
+ log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta.beta_ecn;
+ log.u_bbr.flex6 = rack->r_ctl.rc_saved_beta.newreno_flags;
+ log.u_bbr.flex7 = rack->gp_ready;
+ log.u_bbr.flex7 <<= 1;
+ log.u_bbr.flex7 |= rack->use_fixed_rate;
+ log.u_bbr.flex7 <<= 1;
+ log.u_bbr.flex7 |= rack->rc_pacing_cc_set;
+ log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
+ log.u_bbr.flex8 = 3;
+ tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, error,
+ 0, &log, false, NULL, NULL, 0, &tv);
+ }
+}
+
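
rack_set_cc_pacing() above is reached indirectly through the new TCP_RACK_PACING_BETA / TCP_RACK_PACING_BETA_ECN options added to tcp.h earlier in this diff. A minimal userspace sketch; the int-sized payload and the example values are assumptions, not defaults from this patch:

	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <netinet/tcp.h>

	/* Ask the RACK stack to pace with a 60% beta and a 75% ECN beta. */
	static int
	set_rack_pacing_betas(int fd)
	{
		int beta = 60, beta_ecn = 75;

		if (setsockopt(fd, IPPROTO_TCP, TCP_RACK_PACING_BETA,
		    &beta, sizeof(beta)) == -1)
			return (-1);
		return (setsockopt(fd, IPPROTO_TCP, TCP_RACK_PACING_BETA_ECN,
		    &beta_ecn, sizeof(beta_ecn)));
	}
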
+static void
+rack_undo_cc_pacing(struct tcp_rack *rack)
+{
+ struct newreno old, *ptr;
+ struct tcpcb *tp;
+
+ if (rack->rc_pacing_cc_set == 0)
+ return;
+ tp = rack->rc_tp;
+ rack->rc_pacing_cc_set = 0;
+ if (tp->cc_algo == NULL)
+ /* Tcb is leaving */
+ return;
+ if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) {
+ /* Not new-reno nothing to do! */
+ return;
+ }
+ ptr = ((struct newreno *)tp->ccv->cc_data);
+ if (ptr == NULL) {
+ /*
+ * This happens at rack_fini() if the
+ * cc module gets freed on us. In that
+ * case we lose our "new" settings but
+ * that's ok, since the tcb is going away anyway.
+ */
+ return;
+ }
+ /* Grab out our set values */
+ memcpy(&old, ptr, sizeof(struct newreno));
+ /* Copy back in the original values */
+ memcpy(ptr, &rack->r_ctl.rc_saved_beta, sizeof(struct newreno));
+ /* Now save back the values we had set in (for when pacing is restored) */
+ memcpy(&rack->r_ctl.rc_saved_beta, &old, sizeof(struct newreno));
+ if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ ptr = ((struct newreno *)tp->ccv->cc_data);
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.flex1 = ptr->beta;
+ log.u_bbr.flex2 = ptr->beta_ecn;
+ log.u_bbr.flex3 = ptr->newreno_flags;
+ log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta.beta;
+ log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta.beta_ecn;
+ log.u_bbr.flex6 = rack->r_ctl.rc_saved_beta.newreno_flags;
+ log.u_bbr.flex7 = rack->gp_ready;
+ log.u_bbr.flex7 <<= 1;
+ log.u_bbr.flex7 |= rack->use_fixed_rate;
+ log.u_bbr.flex7 <<= 1;
+ log.u_bbr.flex7 |= rack->rc_pacing_cc_set;
+ log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
+ log.u_bbr.flex8 = 4;
+ tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
+ 0, &log, false, NULL, NULL, 0, &tv);
+ }
+}
+
+#ifdef NETFLIX_PEAKRATE
+static inline void
+rack_update_peakrate_thr(struct tcpcb *tp)
+{
+ /* Keep in mind that t_maxpeakrate is in B/s. */
+ uint64_t peak;
+ peak = uqmax((tp->t_maxseg * 2),
+ (((uint64_t)tp->t_maxpeakrate * (uint64_t)(tp->t_srtt)) / (uint64_t)HPTS_USEC_IN_SEC));
+ tp->t_peakrate_thr = (uint32_t)uqmin(peak, UINT32_MAX);
+}
+#endif
+
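
As a worked example of the threshold above (numbers assumed): with t_maxpeakrate = 1,250,000 B/s (10 Mb/s) and a 40,000 usec srtt, peak = 1,250,000 * 40,000 / 1,000,000 = 50,000 bytes, i.e. roughly one smoothed RTT's worth of data at the configured peak rate; the uqmax() against 2 * t_maxseg keeps the threshold from collapsing below two segments on very short RTTs.
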
static int
sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
{
uint32_t stat;
int32_t error;
+ int i;
error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
if (error || req->newptr == NULL)
@@ -536,17 +767,37 @@
counter_u64_zero(rack_calc_nonzero);
counter_u64_zero(rack_unpaced_segments);
counter_u64_zero(rack_saw_enobuf);
+ counter_u64_zero(rack_saw_enobuf_hw);
counter_u64_zero(rack_saw_enetunreach);
counter_u64_zero(rack_per_timer_hole);
+ counter_u64_zero(rack_large_ackcmp);
+ counter_u64_zero(rack_small_ackcmp);
+#ifdef INVARIANTS
+ counter_u64_zero(rack_adjust_map_bw);
+#endif
counter_u64_zero(rack_to_alloc_hard);
counter_u64_zero(rack_to_alloc_emerg);
counter_u64_zero(rack_sack_proc_all);
+ counter_u64_zero(rack_fto_send);
+ counter_u64_zero(rack_fto_rsm_send);
+ counter_u64_zero(rack_extended_rfo);
+ counter_u64_zero(rack_hw_pace_init_fail);
+ counter_u64_zero(rack_hw_pace_lost);
+ counter_u64_zero(rack_sbsndptr_wrong);
+ counter_u64_zero(rack_sbsndptr_right);
+ counter_u64_zero(rack_non_fto_send);
+ counter_u64_zero(rack_nfto_resend);
counter_u64_zero(rack_sack_proc_short);
counter_u64_zero(rack_sack_proc_restart);
counter_u64_zero(rack_to_alloc);
counter_u64_zero(rack_to_alloc_limited);
counter_u64_zero(rack_alloc_limited_conns);
counter_u64_zero(rack_split_limited);
+ for (i = 0; i < MAX_NUM_OF_CNTS; i++) {
+ counter_u64_zero(rack_proc_comp_ack[i]);
+ }
+ counter_u64_zero(rack_multi_single_eq);
+ counter_u64_zero(rack_proc_non_comp_ack);
counter_u64_zero(rack_find_high);
counter_u64_zero(rack_sack_attacks_detected);
counter_u64_zero(rack_sack_attacks_reversed);
@@ -574,6 +825,7 @@
static void
rack_init_sysctls(void)
{
+ int i;
struct sysctl_oid *rack_counters;
struct sysctl_oid *rack_attack;
struct sysctl_oid *rack_pacing;
@@ -583,6 +835,7 @@
struct sysctl_oid *rack_misc;
struct sysctl_oid *rack_measure;
struct sysctl_oid *rack_probertt;
+ struct sysctl_oid *rack_hw_pacing;
rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
@@ -740,11 +993,6 @@
OID_AUTO, "init_win", CTLFLAG_RW,
&rack_default_init_window, 0,
"Do we have a rack initial window 0 = system default");
- SYSCTL_ADD_U32(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_pacing),
- OID_AUTO, "hw_pacing_adjust", CTLFLAG_RW,
- &rack_hw_pace_adjust, 0,
- "What percentage do we raise the MSS by (11 = 1.1%)");
SYSCTL_ADD_U16(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_pacing),
OID_AUTO, "gp_per_ss", CTLFLAG_RW,
@@ -775,7 +1023,73 @@
OID_AUTO, "use_pacing", CTLFLAG_RW,
&rack_pace_every_seg, 0,
"If set we use pacing, if clear we use only the original burst mitigation");
-
+ SYSCTL_ADD_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_pacing),
+ OID_AUTO, "rate_cap", CTLFLAG_RW,
+ &rack_bw_rate_cap, 0,
+ "If set we apply this value to the absolute rate cap used by pacing");
+ SYSCTL_ADD_U8(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "req_measure_cnt", CTLFLAG_RW,
+ &rack_req_measurements, 1,
+ "If doing dynamic pacing, how many measurements must be in before we start pacing?");
+ /* Hardware pacing */
+ rack_hw_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO,
+ "hdwr_pacing",
+ CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+ "Pacing related Controls");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_hw_pacing),
+ OID_AUTO, "rwnd_factor", CTLFLAG_RW,
+ &rack_hw_rwnd_factor, 2,
+ "How many times does snd_wnd need to be bigger than pace_max_seg so we will hold off and get more acks?");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_hw_pacing),
+ OID_AUTO, "pace_enobuf_mult", CTLFLAG_RW,
+ &rack_enobuf_hw_boost_mult, 2,
+ "By how many time_betweens should we boost the pacing time if we see a ENOBUFS?");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_hw_pacing),
+ OID_AUTO, "pace_enobuf_max", CTLFLAG_RW,
+ &rack_enobuf_hw_max, 2,
+ "What is the max boost the pacing time if we see a ENOBUFS?");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_hw_pacing),
+ OID_AUTO, "pace_enobuf_min", CTLFLAG_RW,
+ &rack_enobuf_hw_min, 2,
+ "What is the min boost the pacing time if we see a ENOBUFS?");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_hw_pacing),
+ OID_AUTO, "enable", CTLFLAG_RW,
+ &rack_enable_hw_pacing, 0,
+ "Should RACK attempt to use hw pacing?");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_hw_pacing),
+ OID_AUTO, "rate_cap", CTLFLAG_RW,
+ &rack_hw_rate_caps, 1,
+ "Does the highest hardware pacing rate cap the rate we will send at??");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_hw_pacing),
+ OID_AUTO, "rate_min", CTLFLAG_RW,
+ &rack_hw_rate_min, 0,
+ "Do we need a minimum estimate of this many bytes per second in order to engage hw pacing?");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_hw_pacing),
+ OID_AUTO, "rate_to_low", CTLFLAG_RW,
+ &rack_hw_rate_to_low, 0,
+ "If we fall below this rate, dis-engage hw pacing?");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_hw_pacing),
+ OID_AUTO, "up_only", CTLFLAG_RW,
+ &rack_hw_up_only, 1,
+ "Do we allow hw pacing to lower the rate selected?");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_hw_pacing),
+ OID_AUTO, "extra_mss_precise", CTLFLAG_RW,
+ &rack_hw_pace_extra_slots, 2,
+ "If the rates between software and hardware match precisely how many extra time_betweens do we get?");
rack_timely = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
OID_AUTO,
@@ -901,6 +1215,11 @@
OID_AUTO, "use_rrr", CTLFLAG_RW,
&use_rack_rr, 1,
"Do we use Rack Rapid Recovery");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_tlp),
+ OID_AUTO, "post_rec_labc", CTLFLAG_RW,
+ &rack_max_abc_post_recovery, 2,
+ "Since we do early recovery, do we override the l_abc to a value, if so what?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_tlp),
OID_AUTO, "nonrxt_use_cr", CTLFLAG_RW,
@@ -924,8 +1243,8 @@
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_tlp),
OID_AUTO, "tlpminto", CTLFLAG_RW,
- &rack_tlp_min, 10,
- "TLP minimum timeout per the specification (10ms)");
+ &rack_tlp_min, 10000,
+ "TLP minimum timeout per the specification (in microseconds)");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_tlp),
OID_AUTO, "send_oldest", CTLFLAG_RW,
@@ -936,11 +1255,6 @@
OID_AUTO, "rack_tlimit", CTLFLAG_RW,
&rack_limited_retran, 0,
"How many times can a rack timeout drive out sends");
- SYSCTL_ADD_S32(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_tlp),
- OID_AUTO, "tlp_retry", CTLFLAG_RW,
- &rack_tlp_max_resend, 2,
- "How many times does TLP retry a single segment or multiple with no ACK");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_tlp),
OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW,
@@ -959,13 +1273,13 @@
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_tlp),
OID_AUTO, "reorder_fade", CTLFLAG_RW,
- &rack_reorder_fade, 0,
- "Does reorder detection fade, if so how many ms (0 means never)");
+ &rack_reorder_fade, 60000000,
+ "Does reorder detection fade, if so how many microseconds (0 means never)");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_tlp),
OID_AUTO, "pktdelay", CTLFLAG_RW,
- &rack_pkt_delay, 1,
- "Extra RACK time (in ms) besides reordering thresh");
+ &rack_pkt_delay, 1000,
+ "Extra RACK time (in microseconds) besides reordering thresh");
/* Timer related controls */
rack_timers = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
@@ -977,33 +1291,33 @@
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timers),
OID_AUTO, "persmin", CTLFLAG_RW,
- &rack_persist_min, 250,
- "What is the minimum time in milliseconds between persists");
+ &rack_persist_min, 250000,
+ "What is the minimum time in microseconds between persists");
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timers),
OID_AUTO, "persmax", CTLFLAG_RW,
- &rack_persist_max, 2000,
- "What is the largest delay in milliseconds between persists");
+ &rack_persist_max, 2000000,
+ "What is the largest delay in microseconds between persists");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timers),
OID_AUTO, "delayed_ack", CTLFLAG_RW,
- &rack_delayed_ack_time, 200,
- "Delayed ack time (200ms)");
+ &rack_delayed_ack_time, 40000,
+ "Delayed ack time (40ms in microseconds)");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timers),
OID_AUTO, "minrto", CTLFLAG_RW,
- &rack_rto_min, 0,
- "Minimum RTO in ms -- set with caution below 1000 due to TLP");
+ &rack_rto_min, 30000,
+ "Minimum RTO in microseconds -- set with caution below 1000 due to TLP");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timers),
OID_AUTO, "maxrto", CTLFLAG_RW,
- &rack_rto_max, 0,
- "Maxiumum RTO in ms -- should be at least as large as min_rto");
+ &rack_rto_max, 4000000,
+ "Maxiumum RTO in microseconds -- should be at least as large as min_rto");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timers),
OID_AUTO, "minto", CTLFLAG_RW,
- &rack_min_to, 1,
- "Minimum rack timeout in milliseconds");
+ &rack_min_to, 1000,
+ "Minimum rack timeout in microseconds");
/* Measure controls */
rack_measure = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
@@ -1053,10 +1367,57 @@
"misc",
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Misc related controls");
+#ifdef TCP_ACCOUNTING
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_misc),
+ OID_AUTO, "tcp_acct", CTLFLAG_RW,
+ &rack_tcp_accounting, 0,
+ "Should we turn on TCP accounting for all rack sessions?");
+#endif
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_misc),
+ OID_AUTO, "prr_addback_max", CTLFLAG_RW,
+ &rack_prr_addbackmax, 2,
+ "What is the maximum number of MSS we allow to be added back if prr can't send all its data?");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_misc),
+ OID_AUTO, "stats_gets_ms", CTLFLAG_RW,
+ &rack_stats_gets_ms_rtt, 1,
+ "What do we feed the stats framework (1 = ms_rtt, 0 = us_rtt, 2 = ms_rtt from hdwr, > 2 usec rtt from hdwr)?");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_misc),
+ OID_AUTO, "clientlowbuf", CTLFLAG_RW,
+ &rack_client_low_buf, 0,
+ "Client low buffer level (below this we are more aggressive in DGP exiting recovery (0 = off)?");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_misc),
+ OID_AUTO, "defprofile", CTLFLAG_RW,
+ &rack_def_profile, 0,
+ "Should RACK use a default profile (0=no, num == profile num)?");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_misc),
+ OID_AUTO, "cmpack", CTLFLAG_RW,
+ &rack_use_cmp_acks, 1,
+ "Should RACK have LRO send compressed acks");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_misc),
+ OID_AUTO, "fsb", CTLFLAG_RW,
+ &rack_use_fsb, 1,
+ "Should RACK use the fast send block?");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_misc),
+ OID_AUTO, "rfo", CTLFLAG_RW,
+ &rack_use_rfo, 1,
+ "Should RACK use rack_fast_output()?");
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_misc),
+ OID_AUTO, "rsmrfo", CTLFLAG_RW,
+ &rack_use_rsm_rfo, 1,
+ "Should RACK use rack_fast_rsm_output()?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_misc),
OID_AUTO, "shared_cwnd", CTLFLAG_RW,
- &rack_enable_shared_cwnd, 0,
+ &rack_enable_shared_cwnd, 1,
"Should RACK try to use the shared cwnd on connections where allowed");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_misc),
@@ -1091,18 +1452,8 @@
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_misc),
OID_AUTO, "no_sack_needed", CTLFLAG_RW,
- &rack_sack_not_required, 0,
+ &rack_sack_not_required, 1,
"Do we allow rack to run on connections not supporting SACK");
- SYSCTL_ADD_S32(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_misc),
- OID_AUTO, "recovery_loss_prop", CTLFLAG_RW,
- &rack_use_proportional_reduce, 0,
- "Should we proportionaly reduce cwnd based on the number of losses ");
- SYSCTL_ADD_S32(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_misc),
- OID_AUTO, "recovery_prop", CTLFLAG_RW,
- &rack_proportional_rate, 10,
- "What percent reduction per loss");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_misc),
OID_AUTO, "prr_sendalot", CTLFLAG_RW,
@@ -1110,9 +1461,9 @@
"Send a lot in prr");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_misc),
- OID_AUTO, "earlyrecovery", CTLFLAG_RW,
- &rack_early_recovery, 1,
- "Do we do early recovery with rack");
+ OID_AUTO, "autoscale", CTLFLAG_RW,
+ &rack_autosndbuf_inc, 20,
+ "What percentage should rack scale up its snd buffer by?");
/* Sack Attacker detection stuff */
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_attack),
@@ -1179,6 +1530,46 @@
&rack_sack_used_prev_merge,
"Total number of times we used the prev merge");
/* Counters */
+ rack_fto_send = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "fto_send", CTLFLAG_RD,
+ &rack_fto_send, "Total number of rack_fast_output sends");
+ rack_fto_rsm_send = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "fto_rsm_send", CTLFLAG_RD,
+ &rack_fto_rsm_send, "Total number of rack_fast_rsm_output sends");
+ rack_nfto_resend = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "nfto_resend", CTLFLAG_RD,
+ &rack_nfto_resend, "Total number of rack_output retransmissions");
+ rack_non_fto_send = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "nfto_send", CTLFLAG_RD,
+ &rack_non_fto_send, "Total number of rack_output first sends");
+ rack_extended_rfo = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "rfo_extended", CTLFLAG_RD,
+ &rack_extended_rfo, "Total number of times we extended rfo");
+
+ rack_hw_pace_init_fail = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "hwpace_init_fail", CTLFLAG_RD,
+ &rack_hw_pace_init_fail, "Total number of times we failed to initialize hw pacing");
+ rack_hw_pace_lost = counter_u64_alloc(M_WAITOK);
+
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "hwpace_lost", CTLFLAG_RD,
+ &rack_hw_pace_lost, "Total number of times we failed to initialize hw pacing");
+
+
+
rack_badfr = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_counters),
@@ -1296,13 +1687,25 @@
SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "saw_enobufs", CTLFLAG_RD,
&rack_saw_enobuf,
- "Total number of times a segment did not cause hptsi");
+ "Total number of times a sends returned enobuf for non-hdwr paced connections");
+ rack_saw_enobuf_hw = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "saw_enobufs_hw", CTLFLAG_RD,
+ &rack_saw_enobuf_hw,
+ "Total number of times a send returned enobuf for hdwr paced connections");
rack_saw_enetunreach = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "saw_enetunreach", CTLFLAG_RD,
&rack_saw_enetunreach,
- "Total number of times a segment did not cause hptsi");
+ "Total number of times a send received a enetunreachable");
+ rack_hot_alloc = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "alloc_hot", CTLFLAG_RD,
+ &rack_hot_alloc,
+ "Total allocations from the top of our list");
rack_to_alloc = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_counters),
@@ -1339,6 +1742,51 @@
OID_AUTO, "split_limited", CTLFLAG_RD,
&rack_split_limited,
"Split allocations dropped due to limit");
+
+ for (i = 0; i < MAX_NUM_OF_CNTS; i++) {
+ char name[32];
+ sprintf(name, "cmp_ack_cnt_%d", i);
+ rack_proc_comp_ack[i] = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, name, CTLFLAG_RD,
+ &rack_proc_comp_ack[i],
+ "Number of compressed acks we processed");
+ }
+ rack_large_ackcmp = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "cmp_large_mbufs", CTLFLAG_RD,
+ &rack_large_ackcmp,
+ "Number of TCP connections with large mbuf's for compressed acks");
+ rack_small_ackcmp = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "cmp_small_mbufs", CTLFLAG_RD,
+ &rack_small_ackcmp,
+ "Number of TCP connections with small mbuf's for compressed acks");
+#ifdef INVARIANTS
+ rack_adjust_map_bw = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "map_adjust_req", CTLFLAG_RD,
+ &rack_adjust_map_bw,
+ "Number of times we hit the case where the sb went up and down on a sendmap entry");
+#endif
+ rack_multi_single_eq = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "cmp_ack_equiv", CTLFLAG_RD,
+ &rack_multi_single_eq,
+ "Number of compressed acks total represented");
+ rack_proc_non_comp_ack = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "cmp_ack_not", CTLFLAG_RD,
+ &rack_proc_non_comp_ack,
+ "Number of non compresseds acks that we processed");
+
+
rack_sack_proc_all = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_counters),
@@ -1424,6 +1872,18 @@
OID_AUTO, "timer_hole", CTLFLAG_RD,
&rack_per_timer_hole,
"Total persists start in timer hole");
+
+ rack_sbsndptr_wrong = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "sndptr_wrong", CTLFLAG_RD,
+ &rack_sbsndptr_wrong, "Total number of times the saved sbsndptr was incorret");
+ rack_sbsndptr_right = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "sndptr_right", CTLFLAG_RD,
+ &rack_sbsndptr_right, "Total number of times the saved sbsndptr was corret");
+
COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK);
SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
OID_AUTO, "outsize", CTLFLAG_RD,
@@ -1491,16 +1951,16 @@
* Nothing set by the user, use the system stack
* default.
*/
- return(tcp_compute_initwnd(tcp_maxseg(rack->rc_tp)));
+ return (tcp_compute_initwnd(tcp_maxseg(rack->rc_tp)));
}
win = ctf_fixed_maxseg(rack->rc_tp) * rack->rc_init_win;
- return(win);
+ return (win);
}
static uint64_t
rack_get_fixed_pacing_bw(struct tcp_rack *rack)
{
- if (IN_RECOVERY(rack->rc_tp->t_flags))
+ if (IN_FASTRECOVERY(rack->rc_tp->t_flags))
return (rack->r_ctl.rc_fixed_pacing_rate_rec);
else if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh)
return (rack->r_ctl.rc_fixed_pacing_rate_ss);
@@ -1546,19 +2006,21 @@
}
/* Ok lets get the initial TCP win (not racks) */
bw = tcp_compute_initwnd(tcp_maxseg(rack->rc_tp));
- srtt = ((uint64_t)TICKS_2_USEC(rack->rc_tp->t_srtt) >> TCP_RTT_SHIFT);
+ srtt = (uint64_t)rack->rc_tp->t_srtt;
bw *= (uint64_t)USECS_IN_SECOND;
bw /= srtt;
+ if (rack->r_ctl.bw_rate_cap && (bw > rack->r_ctl.bw_rate_cap))
+ bw = rack->r_ctl.bw_rate_cap;
return (bw);
} else {
uint64_t bw;
- if(rack->r_ctl.num_avg >= RACK_REQ_AVG) {
+ if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) {
/* Averaging is done, we can return the value */
bw = rack->r_ctl.gp_bw;
} else {
/* Still doing initial average must calculate */
- bw = rack->r_ctl.gp_bw / rack->r_ctl.num_avg;
+ bw = rack->r_ctl.gp_bw / rack->r_ctl.num_measurements;
}
#ifdef NETFLIX_PEAKRATE
if ((rack->rc_tp->t_maxpeakrate) &&
@@ -1569,6 +2031,8 @@
return (rack->rc_tp->t_maxpeakrate);
}
#endif
+ if (rack->r_ctl.bw_rate_cap && (bw > rack->r_ctl.bw_rate_cap))
+ bw = rack->r_ctl.bw_rate_cap;
return (bw);
}
}
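The hunk above now derives the bootstrap bandwidth directly from a microsecond srtt and clamps it to bw_rate_cap. A stand-alone user-space sketch of that calculation follows; the names are hypothetical and this is not the kernel code.

#include <stdint.h>
#include <stdio.h>

#define USECS_IN_SECOND 1000000ULL

/*
 * Illustrative only: derive a bytes-per-second estimate from an
 * initial window (bytes) and an srtt already kept in microseconds,
 * then clamp it to an optional cap (0 means "no cap").
 */
static uint64_t
bootstrap_bw(uint64_t initwnd_bytes, uint64_t srtt_usec, uint64_t rate_cap)
{
    uint64_t bw;

    if (srtt_usec == 0)
        srtt_usec = 1;              /* avoid divide by zero */
    bw = initwnd_bytes * USECS_IN_SECOND / srtt_usec;
    if (rate_cap && bw > rate_cap)
        bw = rate_cap;
    return (bw);
}

int
main(void)
{
    /* 10 x 1448-byte segments over a 40 ms srtt, capped at 1 MB/s. */
    printf("%llu bytes/sec\n",
        (unsigned long long)bootstrap_bw(10 * 1448, 40000, 1000000));
    return (0);
}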
@@ -1579,26 +2043,26 @@
if (rack->use_fixed_rate) {
return (100);
} else if (rack->in_probe_rtt && (rsm == NULL))
- return(rack->r_ctl.rack_per_of_gp_probertt);
- else if ((IN_RECOVERY(rack->rc_tp->t_flags) &&
+ return (rack->r_ctl.rack_per_of_gp_probertt);
+ else if ((IN_FASTRECOVERY(rack->rc_tp->t_flags) &&
rack->r_ctl.rack_per_of_gp_rec)) {
if (rsm) {
/* a retransmission always use the recovery rate */
- return(rack->r_ctl.rack_per_of_gp_rec);
+ return (rack->r_ctl.rack_per_of_gp_rec);
} else if (rack->rack_rec_nonrxt_use_cr) {
/* Directed to use the configured rate */
goto configured_rate;
} else if (rack->rack_no_prr &&
(rack->r_ctl.rack_per_of_gp_rec > 100)) {
/* No PRR, lets just use the b/w estimate only */
- return(100);
+ return (100);
} else {
/*
* Here we may have a non-retransmit but we
* have no overrides, so just use the recovery
* rate (prr is in effect).
*/
- return(rack->r_ctl.rack_per_of_gp_rec);
+ return (rack->r_ctl.rack_per_of_gp_rec);
}
}
configured_rate:
@@ -1606,16 +2070,64 @@
if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh)
return (rack->r_ctl.rack_per_of_gp_ss);
else
- return(rack->r_ctl.rack_per_of_gp_ca);
+ return (rack->r_ctl.rack_per_of_gp_ca);
+}
+
+static void
+rack_log_hdwr_pacing(struct tcp_rack *rack,
+ uint64_t rate, uint64_t hw_rate, int line,
+ int error, uint16_t mod)
+{
+ if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+ const struct ifnet *ifp;
+
+ memset(&log, 0, sizeof(log));
+ log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff);
+ log.u_bbr.flex2 = (hw_rate & 0x00000000ffffffff);
+ if (rack->r_ctl.crte) {
+ ifp = rack->r_ctl.crte->ptbl->rs_ifp;
+ } else if (rack->rc_inp->inp_route.ro_nh &&
+ rack->rc_inp->inp_route.ro_nh->nh_ifp) {
+ ifp = rack->rc_inp->inp_route.ro_nh->nh_ifp;
+ } else
+ ifp = NULL;
+ if (ifp) {
+ log.u_bbr.flex3 = (((uint64_t)ifp >> 32) & 0x00000000ffffffff);
+ log.u_bbr.flex4 = ((uint64_t)ifp & 0x00000000ffffffff);
+ }
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.bw_inuse = rate;
+ log.u_bbr.flex5 = line;
+ log.u_bbr.flex6 = error;
+ log.u_bbr.flex7 = mod;
+ log.u_bbr.applimited = rack->r_ctl.rc_pace_max_segs;
+ log.u_bbr.flex8 = rack->use_fixed_rate;
+ log.u_bbr.flex8 <<= 1;
+ log.u_bbr.flex8 |= rack->rack_hdrw_pacing;
+ log.u_bbr.pkts_out = rack->rc_tp->t_maxseg;
+ log.u_bbr.delRate = rack->r_ctl.crte_prev_rate;
+ if (rack->r_ctl.crte)
+ log.u_bbr.cur_del_rate = rack->r_ctl.crte->rate;
+ else
+ log.u_bbr.cur_del_rate = 0;
+ log.u_bbr.rttProp = rack->r_ctl.last_hw_bw_req;
+ TCP_LOG_EVENTP(rack->rc_tp, NULL,
+ &rack->rc_inp->inp_socket->so_rcv,
+ &rack->rc_inp->inp_socket->so_snd,
+ BBR_LOG_HDWR_PACE, 0,
+ 0, &log, false, &tv);
+ }
}
static uint64_t
-rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap *rsm)
+rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap *rsm, int *capped)
{
/*
* We allow rack_per_of_gp_xx to dictate our bw rate we want.
*/
- uint64_t bw_est;
+ uint64_t bw_est, high_rate;
uint64_t gain;
gain = (uint64_t)rack_get_output_gain(rack, rsm);
@@ -1624,6 +2136,43 @@
/* Never fall below the minimum (def 64kbps) */
if (bw_est < RACK_MIN_BW)
bw_est = RACK_MIN_BW;
+ if (rack->r_rack_hw_rate_caps) {
+ /* Rate caps are in place */
+ if (rack->r_ctl.crte != NULL) {
+ /* We have a hdwr rate already */
+ high_rate = tcp_hw_highest_rate(rack->r_ctl.crte);
+ if (bw_est >= high_rate) {
+ /* We are capping bw at the highest rate table entry */
+ rack_log_hdwr_pacing(rack,
+ bw_est, high_rate, __LINE__,
+ 0, 3);
+ bw_est = high_rate;
+ if (capped)
+ *capped = 1;
+ }
+ } else if ((rack->rack_hdrw_pacing == 0) &&
+ (rack->rack_hdw_pace_ena) &&
+ (rack->rack_attempt_hdwr_pace == 0) &&
+ (rack->rc_inp->inp_route.ro_nh != NULL) &&
+ (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) {
+ /*
+ * Special case, we have not yet attempted hardware
+ * pacing, and when we do we may find that we are
+ * above the highest rate. We need to know the maxbw for the interface
+ * in question (if it supports ratelimiting). We get back
+ * a 0 if the interface is not found in the RL lists.
+ */
+ high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp);
+ if (high_rate) {
+ /* Yep, we have a rate; is it above this rate? */
+ if (bw_est > high_rate) {
+ bw_est = high_rate;
+ if (capped)
+ *capped = 1;
+ }
+ }
+ }
+ }
return (bw_est);
}
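The capping logic above asks the hardware rate tables for the highest supported rate and pins bw_est to it. A minimal user-space sketch of the same idea, with made-up names standing in for tcp_hw_highest_rate() and the crte state:

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative sketch only: scale the measured bandwidth by a gain
 * (percent), then, if hardware pacing imposes a highest supported
 * rate, clamp to it and report that we were capped.
 */
static uint64_t
apply_gain_and_cap(uint64_t bw, uint64_t gain_pct, uint64_t hw_high_rate,
    int *capped)
{
    uint64_t bw_est;

    bw_est = bw * gain_pct / 100;
    if (hw_high_rate != 0 && bw_est >= hw_high_rate) {
        bw_est = hw_high_rate;
        if (capped != NULL)
            *capped = 1;
    }
    return (bw_est);
}

int
main(void)
{
    int capped = 0;
    uint64_t out;

    out = apply_gain_and_cap(12500000, 120, 12500000, &capped);
    printf("rate=%llu capped=%d\n", (unsigned long long)out, capped);
    return (0);
}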
@@ -1658,6 +2207,9 @@
log.u_bbr.ininput = rack->rc_inp->inp_in_input;
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
+ log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
+ log.u_bbr.pacing_gain = rack->r_must_retran;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -1674,8 +2226,8 @@
struct timeval tv;
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
- log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT);
- log.u_bbr.flex2 = to * 1000;
+ log.u_bbr.flex1 = rack->rc_tp->t_srtt;
+ log.u_bbr.flex2 = to;
log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
log.u_bbr.flex4 = slot;
log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot;
@@ -1690,6 +2242,11 @@
log.u_bbr.ininput = rack->rc_inp->inp_in_input;
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
+ log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
+ log.u_bbr.pacing_gain = rack->r_must_retran;
+ log.u_bbr.lt_epoch = rack->rc_tp->t_rxtshift;
+ log.u_bbr.lost = rack_rto_min;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -1721,6 +2278,9 @@
log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
+ log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
+ log.u_bbr.pacing_gain = rack->r_must_retran;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -1729,6 +2289,56 @@
}
}
+static void
+rack_log_map_chg(struct tcpcb *tp, struct tcp_rack *rack,
+ struct rack_sendmap *prev,
+ struct rack_sendmap *rsm,
+ struct rack_sendmap *next,
+ int flag, uint32_t th_ack, int line)
+{
+ if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.flex8 = flag;
+ log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
+ log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+ log.u_bbr.cur_del_rate = (uint64_t)prev;
+ log.u_bbr.delRate = (uint64_t)rsm;
+ log.u_bbr.rttProp = (uint64_t)next;
+ log.u_bbr.flex7 = 0;
+ if (prev) {
+ log.u_bbr.flex1 = prev->r_start;
+ log.u_bbr.flex2 = prev->r_end;
+ log.u_bbr.flex7 |= 0x4;
+ }
+ if (rsm) {
+ log.u_bbr.flex3 = rsm->r_start;
+ log.u_bbr.flex4 = rsm->r_end;
+ log.u_bbr.flex7 |= 0x2;
+ }
+ if (next) {
+ log.u_bbr.flex5 = next->r_start;
+ log.u_bbr.flex6 = next->r_end;
+ log.u_bbr.flex7 |= 0x1;
+ }
+ log.u_bbr.applimited = line;
+ log.u_bbr.pkts_out = th_ack;
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ if (rack->rack_no_prr)
+ log.u_bbr.lost = 0;
+ else
+ log.u_bbr.lost = rack->r_ctl.rc_prr_sndcnt;
+ TCP_LOG_EVENTP(rack->rc_tp, NULL,
+ &rack->rc_inp->inp_socket->so_rcv,
+ &rack->rc_inp->inp_socket->so_snd,
+ TCP_LOG_MAPCHG, 0,
+ 0, &log, false, &tv);
+ }
+}
+
static void
rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t len,
struct rack_sendmap *rsm, int conf)
@@ -1741,30 +2351,28 @@
log.u_bbr.ininput = rack->rc_inp->inp_in_input;
log.u_bbr.flex1 = t;
log.u_bbr.flex2 = len;
- log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt * HPTS_USEC_IN_MSEC;
- log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest * HPTS_USEC_IN_MSEC;
- log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest * HPTS_USEC_IN_MSEC;
- log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt;
+ log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt;
+ log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest;
+ log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest;
+ log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_us_rtrcnt;
log.u_bbr.flex7 = conf;
- log.u_bbr.rttProp = (uint64_t)rack->r_ctl.rack_rs.rs_rtt_tot * (uint64_t)HPTS_USEC_IN_MSEC;
+ log.u_bbr.rttProp = (uint64_t)rack->r_ctl.rack_rs.rs_rtt_tot;
log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method;
- if (rack->rack_no_prr)
- log.u_bbr.pkts_out = 0;
- else
- log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
- log.u_bbr.delivered = rack->r_ctl.rack_rs.rs_us_rtt;
+ log.u_bbr.delivered = rack->r_ctl.rack_rs.rs_us_rtrcnt;
log.u_bbr.pkts_out = rack->r_ctl.rack_rs.rs_flags;
log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
if (rsm) {
log.u_bbr.pkt_epoch = rsm->r_start;
log.u_bbr.lost = rsm->r_end;
log.u_bbr.cwnd_gain = rsm->r_rtr_cnt;
+ log.u_bbr.pacing_gain = rsm->r_flags;
} else {
/* Its a SYN */
log.u_bbr.pkt_epoch = rack->rc_tp->iss;
log.u_bbr.lost = 0;
log.u_bbr.cwnd_gain = 0;
+ log.u_bbr.pacing_gain = 0;
}
/* Write out general bits of interest rrs here */
log.u_bbr.use_lt_bw = rack->rc_highly_buffered;
@@ -1787,11 +2395,17 @@
log.u_bbr.lt_epoch = rack->r_ctl.rc_time_probertt_entered;
log.u_bbr.cur_del_rate = rack->r_ctl.rc_lower_rtt_us_cts;
log.u_bbr.delRate = rack->r_ctl.rc_gp_srtt;
+ log.u_bbr.bw_inuse = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
+ log.u_bbr.bw_inuse <<= 32;
+ if (rsm)
+ log.u_bbr.bw_inuse |= ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]);
TCP_LOG_EVENTP(tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
BBR_LOG_BBRRTT, 0,
0, &log, false, &tv);
}
}
@@ -1809,14 +2423,38 @@
/* Convert our ms to a microsecond */
memset(&log, 0, sizeof(log));
- log.u_bbr.flex1 = rtt * 1000;
+ log.u_bbr.flex1 = rtt;
log.u_bbr.flex2 = rack->r_ctl.ack_count;
log.u_bbr.flex3 = rack->r_ctl.sack_count;
log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move;
log.u_bbr.flex5 = rack->r_ctl.sack_moved_extra;
+ log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
+ log.u_bbr.flex7 = 1;
log.u_bbr.flex8 = rack->sack_attack_disable;
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
+ log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
+ log.u_bbr.pacing_gain = rack->r_must_retran;
+ /*
+ * We capture in delRate the upper 32 bits as
+ * the confidence level we had declared, and the
+ * lower 32 bits as the actual RTT using the arrival
+ * timestamp.
+ */
+ log.u_bbr.delRate = rack->r_ctl.rack_rs.confidence;
+ log.u_bbr.delRate <<= 32;
+ log.u_bbr.delRate |= rack->r_ctl.rack_rs.rs_us_rtt;
+ /* Let's capture all the things that make up t_rxtcur */
+ log.u_bbr.applimited = rack_rto_min;
+ log.u_bbr.epoch = rack_rto_max;
+ log.u_bbr.lt_epoch = rtt;
+ log.u_bbr.lost = rack_rto_min;
+ log.u_bbr.pkt_epoch = TICKS_2_USEC(tcp_rexmit_slop);
+ log.u_bbr.rttProp = RACK_REXMTVAL(rack->rc_tp);
+ log.u_bbr.bw_inuse = rack->r_ctl.act_rcv_time.tv_sec;
+ log.u_bbr.bw_inuse *= HPTS_USEC_IN_SEC;
+ log.u_bbr.bw_inuse += rack->r_ctl.act_rcv_time.tv_usec;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -1825,24 +2463,52 @@
}
}
-static inline void
-rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line)
+static void
+rack_log_rtt_sample_calc(struct tcp_rack *rack, uint32_t rtt, uint32_t send_time, uint32_t ack_time, int where)
{
- if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
+ if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
union tcp_log_stackspecific log;
struct timeval tv;
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
- log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
- log.u_bbr.ininput = rack->rc_inp->inp_in_input;
- log.u_bbr.flex1 = line;
- log.u_bbr.flex2 = tick;
- log.u_bbr.flex3 = tp->t_maxunacktime;
- log.u_bbr.flex4 = tp->t_acktime;
- log.u_bbr.flex8 = event;
+ /* Convert our ms to a microsecond */
+ memset(&log, 0, sizeof(log));
+ log.u_bbr.flex1 = rtt;
+ log.u_bbr.flex2 = send_time;
+ log.u_bbr.flex3 = ack_time;
+ log.u_bbr.flex4 = where;
+ log.u_bbr.flex7 = 2;
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
- log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
- TCP_LOG_EVENTP(tp, NULL,
+ TCP_LOG_EVENTP(rack->rc_tp, NULL,
+ &rack->rc_inp->inp_socket->so_rcv,
+ &rack->rc_inp->inp_socket->so_snd,
+ TCP_LOG_RTT, 0,
+ 0, &log, false, &tv);
+ }
+}
+
+static inline void
+rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line)
+{
+ if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
+ log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+ log.u_bbr.flex1 = line;
+ log.u_bbr.flex2 = tick;
+ log.u_bbr.flex3 = tp->t_maxunacktime;
+ log.u_bbr.flex4 = tp->t_acktime;
+ log.u_bbr.flex8 = event;
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
+ log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
+ log.u_bbr.pacing_gain = rack->r_must_retran;
+ TCP_LOG_EVENTP(tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
BBR_LOG_PROGRESS, 0,
@@ -1868,6 +2534,9 @@
log.u_bbr.flex8 = rack->rc_in_persist;
log.u_bbr.timeStamp = cts;
log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
+ log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
+ log.u_bbr.pacing_gain = rack->r_must_retran;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -1877,7 +2546,7 @@
}
static void
-rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out)
+rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out, int nsegs)
{
if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
@@ -1892,12 +2561,23 @@
log.u_bbr.flex5 = 0;
else
log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
+ log.u_bbr.flex6 = nsegs;
log.u_bbr.applimited = rack->r_ctl.rc_pace_min_segs;
- log.u_bbr.flex7 = rack->r_wanted_output;
+ log.u_bbr.flex7 = rack->rc_ack_can_sendout_data; /* Do we have ack-can-send set */
+ log.u_bbr.flex7 <<= 1;
+ log.u_bbr.flex7 |= rack->r_fast_output; /* is fast output primed */
+ log.u_bbr.flex7 <<= 1;
+ log.u_bbr.flex7 |= rack->r_wanted_output; /* Do we want output */
log.u_bbr.flex8 = rack->rc_in_persist;
log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
+ log.u_bbr.use_lt_bw <<= 1;
+ log.u_bbr.use_lt_bw |= rack->r_might_revert;
+ log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
+ log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
+ log.u_bbr.pacing_gain = rack->r_must_retran;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -1907,7 +2587,7 @@
}
static void
-rack_log_type_hrdwtso(struct tcpcb *tp, struct tcp_rack *rack, int len, int mod, int32_t orig_len, int frm)
+rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint8_t frm)
{
if (tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
@@ -1918,13 +2598,16 @@
cts = tcp_get_usecs(&tv);
log.u_bbr.flex1 = rack->r_ctl.rc_pace_min_segs;
log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
- log.u_bbr.flex4 = len;
- log.u_bbr.flex5 = orig_len;
- log.u_bbr.flex6 = rack->r_ctl.rc_sacked;
- log.u_bbr.flex7 = mod;
+ log.u_bbr.flex4 = arg1;
+ log.u_bbr.flex5 = arg2;
+ log.u_bbr.flex6 = arg3;
log.u_bbr.flex8 = frm;
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
+ log.u_bbr.applimited = rack->r_ctl.rc_sacked;
+ log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
+ log.u_bbr.pacing_gain = rack->r_must_retran;
TCP_LOG_EVENTP(tp, NULL,
&tp->t_inpcb->inp_socket->so_rcv,
&tp->t_inpcb->inp_socket->so_snd,
@@ -1956,6 +2639,9 @@
log.u_bbr.lt_epoch = cwnd_to_use;
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
+ log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
+ log.u_bbr.pacing_gain = rack->r_must_retran;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -1988,6 +2674,9 @@
log.u_bbr.applimited = rack->r_ctl.rc_hpts_flags;
log.u_bbr.timeStamp = us_cts;
log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
+ log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
+ log.u_bbr.pacing_gain = rack->r_must_retran;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -2020,7 +2709,7 @@
log.u_bbr.flex5 = flex5;
log.u_bbr.flex6 = flex6;
log.u_bbr.flex7 = flex7;
- log.u_bbr.flex8 = mod;
+ log.u_bbr.flex8 = mod;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -2046,6 +2735,9 @@
log.u_bbr.flex6 = 0;
else
log.u_bbr.flex6 = rack->r_ctl.rc_prr_sndcnt;
+ log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
+ log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
+ log.u_bbr.pacing_gain = rack->r_must_retran;
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
TCP_LOG_EVENTP(rack->rc_tp, NULL,
@@ -2077,6 +2769,9 @@
log.u_bbr.pkts_out = orig_cwnd;
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
+ log.u_bbr.use_lt_bw <<= 1;
+ log.u_bbr.use_lt_bw |= rack->r_might_revert;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -2121,6 +2816,15 @@
static void
rack_counter_destroy(void)
{
+ int i;
+
+ counter_u64_free(rack_fto_send);
+ counter_u64_free(rack_fto_rsm_send);
+ counter_u64_free(rack_nfto_resend);
+ counter_u64_free(rack_hw_pace_init_fail);
+ counter_u64_free(rack_hw_pace_lost);
+ counter_u64_free(rack_non_fto_send);
+ counter_u64_free(rack_extended_rfo);
counter_u64_free(rack_ack_total);
counter_u64_free(rack_express_sack);
counter_u64_free(rack_sack_total);
@@ -2150,13 +2854,20 @@
counter_u64_free(rack_paced_segments);
counter_u64_free(rack_unpaced_segments);
counter_u64_free(rack_saw_enobuf);
+ counter_u64_free(rack_saw_enobuf_hw);
counter_u64_free(rack_saw_enetunreach);
+ counter_u64_free(rack_hot_alloc);
counter_u64_free(rack_to_alloc);
counter_u64_free(rack_to_alloc_hard);
counter_u64_free(rack_to_alloc_emerg);
counter_u64_free(rack_to_alloc_limited);
counter_u64_free(rack_alloc_limited_conns);
counter_u64_free(rack_split_limited);
+ for (i = 0; i < MAX_NUM_OF_CNTS; i++) {
+ counter_u64_free(rack_proc_comp_ack[i]);
+ }
+ counter_u64_free(rack_multi_single_eq);
+ counter_u64_free(rack_proc_non_comp_ack);
counter_u64_free(rack_sack_proc_all);
counter_u64_free(rack_sack_proc_restart);
counter_u64_free(rack_sack_proc_short);
@@ -2171,6 +2882,11 @@
counter_u64_free(rack_tlp_does_nada);
counter_u64_free(rack_try_scwnd);
counter_u64_free(rack_per_timer_hole);
+ counter_u64_free(rack_large_ackcmp);
+ counter_u64_free(rack_small_ackcmp);
+#ifdef INVARIANTS
+ counter_u64_free(rack_adjust_map_bw);
+#endif
COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE);
COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE);
}
@@ -2180,12 +2896,33 @@
{
struct rack_sendmap *rsm;
+ /*
+ * First get the top of the list; in
+ * theory it is the "hottest" rsm we have,
+ * possibly just freed by ack processing.
+ */
+ if (rack->rc_free_cnt > rack_free_cache) {
+ rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
+ TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
+ counter_u64_add(rack_hot_alloc, 1);
+ rack->rc_free_cnt--;
+ return (rsm);
+ }
+ /*
+ * Once we get under our free cache we probably
+ * no longer have a "hot" one available. Let's
+ * get one from UMA.
+ */
rsm = uma_zalloc(rack_zone, M_NOWAIT);
if (rsm) {
rack->r_ctl.rc_num_maps_alloced++;
counter_u64_add(rack_to_alloc, 1);
return (rsm);
}
+ /*
+ * Dig into our aux rsms (the last two) since
+ * UMA failed to get us one.
+ */
if (rack->rc_free_cnt) {
counter_u64_add(rack_to_alloc_emerg, 1);
rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
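rack_alloc() above prefers the most recently freed sendmap entry over a fresh UMA allocation, keeps a small emergency reserve, and (in rack_free_trim() below) trims the cold tail back to a limit. A self-contained sketch of that pattern using <sys/queue.h>, with malloc(3) standing in for the zone allocator; all names are illustrative.

#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
    TAILQ_ENTRY(entry) link;
};
TAILQ_HEAD(entry_head, entry);

struct cache {
    struct entry_head free_list;    /* most recently freed at the head */
    int free_cnt;
    int reserve;                    /* keep this many for emergencies */
    int limit;                      /* trim the list back to this size */
};

static struct entry *
cache_alloc(struct cache *c)
{
    struct entry *e;

    if (c->free_cnt > c->reserve) {
        /* Hot path: reuse the entry freed most recently. */
        e = TAILQ_FIRST(&c->free_list);
        TAILQ_REMOVE(&c->free_list, e, link);
        c->free_cnt--;
        return (e);
    }
    e = malloc(sizeof(*e));
    if (e == NULL && c->free_cnt > 0) {
        /* Emergency: dip into the reserve entries. */
        e = TAILQ_FIRST(&c->free_list);
        TAILQ_REMOVE(&c->free_list, e, link);
        c->free_cnt--;
    }
    return (e);
}

static void
cache_free(struct cache *c, struct entry *e)
{
    /* Freed entries go to the head so the next alloc gets the hottest one. */
    TAILQ_INSERT_HEAD(&c->free_list, e, link);
    c->free_cnt++;
}

static void
cache_trim(struct cache *c)
{
    struct entry *cold;

    while (c->free_cnt > c->limit) {
        cold = TAILQ_LAST(&c->free_list, entry_head);
        TAILQ_REMOVE(&c->free_list, cold, link);
        c->free_cnt--;
        free(cold);
    }
}

int
main(void)
{
    struct cache c = { .free_cnt = 0, .reserve = 2, .limit = 4 };
    struct entry *e;

    TAILQ_INIT(&c.free_list);
    e = cache_alloc(&c);    /* comes from malloc(), the list is empty */
    cache_free(&c, e);
    cache_trim(&c);
    printf("cached %d entries\n", c.free_cnt);
    return (0);
}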
@@ -2274,17 +3011,29 @@
rack->r_ctl.rc_tlpsend = NULL;
if (rack->r_ctl.rc_sacklast == rsm)
rack->r_ctl.rc_sacklast = NULL;
- if (rack->rc_free_cnt < rack_free_cache) {
- memset(rsm, 0, sizeof(struct rack_sendmap));
- TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext);
- rsm->r_limit_type = 0;
- rack->rc_free_cnt++;
- return;
+ memset(rsm, 0, sizeof(struct rack_sendmap));
+ TAILQ_INSERT_HEAD(&rack->r_ctl.rc_free, rsm, r_tnext);
+ rack->rc_free_cnt++;
+}
+
+static void
+rack_free_trim(struct tcp_rack *rack)
+{
+ struct rack_sendmap *rsm;
+
+ /*
+ * Free up all the tail entries until
+ * we get our list down to the limit.
+ */
+ while (rack->rc_free_cnt > rack_free_cache) {
+ rsm = TAILQ_LAST(&rack->r_ctl.rc_free, rack_head);
+ TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
+ rack->rc_free_cnt--;
+ uma_zfree(rack_zone, rsm);
}
- rack->r_ctl.rc_num_maps_alloced--;
- uma_zfree(rack_zone, rsm);
}
+
static uint32_t
rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack)
{
@@ -2330,11 +3079,11 @@
* goal.
*/
bw = rack_get_bw(rack);
- srtt = ((uint64_t)TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT);
+ srtt = (uint64_t)tp->t_srtt;
len = bw * srtt;
len /= (uint64_t)HPTS_USEC_IN_SEC;
len *= max(1, rack_goal_bdp);
- /* Now we need to round up to the nearest MSS */
+ /* Now we need to round up to the nearest MSS */
len = roundup(len, segsiz);
if (rack_min_measure_usec) {
/* Now calculate our min length for this b/w */
@@ -2655,7 +3404,7 @@
{
/*
* norm_grad = rtt_diff / minrtt;
- * new_per = curper * (1 - B * norm_grad)
+ * new_per = curper * (1 - B * norm_grad)
*
* B = rack_gp_decrease_per (default 10%)
* rtt_dif = input var current rtt-diff
@@ -2694,8 +3443,8 @@
highrttthresh = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul;
- perf = (((uint64_t)curper * ((uint64_t)1000000 -
- ((uint64_t)rack_gp_decrease_per * ((uint64_t)1000000 -
+ perf = (((uint64_t)curper * ((uint64_t)1000000 -
+ ((uint64_t)rack_gp_decrease_per * ((uint64_t)1000000 -
((uint64_t)highrttthresh * (uint64_t)1000000) /
(uint64_t)rtt)) / 100)) /(uint64_t)1000000);
return (perf);
@@ -2708,7 +3457,7 @@
uint32_t logged, new_per, ss_red, ca_red, rec_red, alt, val;
if (rack->rc_gp_incr) {
- /* Turn off increment counting */
+ /* Turn off increment counting */
rack->rc_gp_incr = 0;
rack->rc_gp_timely_inc_cnt = 0;
}
@@ -2774,7 +3523,7 @@
if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ss)
rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound;
logged |= 4;
- } else if (rack->rc_gp_saw_ca) {
+ } else if (rack->rc_gp_saw_ca) {
/* Sent in CA */
if (timely_says == 2) {
new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ca, rtt);
@@ -3051,7 +3800,7 @@
/* Set to entry gp rtt */
rack_set_prtt_target(rack, segsiz,
rack->r_ctl.rc_entry_gp_rtt);
- } else {
+ } else {
uint64_t sum;
uint32_t setval;
@@ -3197,12 +3946,12 @@
endtime += rack_min_probertt_hold;
endtime += rack->r_ctl.rc_time_probertt_starts;
if (TSTMP_GEQ(us_cts, endtime)) {
- /* yes, exit probertt */
+ /* yes, exit probertt */
rack_exit_probertt(rack, us_cts);
- }
+ }
- } else if((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt) {
- /* Go into probertt, its been too long since we went lower */
+ } else if ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt) {
+ /* Go into probertt, it's been too long since we went lower */
rack_enter_probertt(rack, us_cts);
}
}
@@ -3303,7 +4052,7 @@
*/
goto use_timely;
}
- } else if ((timely_says != 2) &&
+ } else if ((timely_says != 2) &&
!losses &&
(last_bw_est > up_bnd)) {
/*
@@ -3335,11 +4084,11 @@
}
rack->rc_gp_bwred = 0;
rack->rc_gp_timely_dec_cnt = 0;
- /* You get a set number of pushes if timely is trying to reduce */
+ /* You get a set number of pushes if timely is trying to reduce */
if ((rack->rc_gp_incr < rack_timely_max_push_rise) || (timely_says == 0)) {
rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
} else {
- /* Log it stays the same */
+ /* Log it stays the same */
rack_log_timely(rack, 0, last_bw_est, up_bnd, 0,
__LINE__, 12);
}
@@ -3366,7 +4115,7 @@
rack->rc_gp_timely_inc_cnt = 0;
} else
rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff);
- } else {
+ } else {
rack->rc_gp_bwred = 0;
rack->rc_gp_timely_dec_cnt = 0;
rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
@@ -3445,6 +4194,8 @@
uint64_t tim, bytes_ps, ltim, stim, utim;
uint32_t segsiz, bytes, reqbytes, us_cts;
int32_t gput, new_rtt_diff, timely_says;
+ uint64_t resid_bw, subpart = 0, addpart = 0, srtt;
+ int did_add = 0;
us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
@@ -3453,7 +4204,7 @@
else
tim = 0;
- if (TSTMP_GT(rack->r_ctl.rc_gp_cumack_ts, rack->r_ctl.rc_gp_output_ts))
+ if (rack->r_ctl.rc_gp_cumack_ts > rack->r_ctl.rc_gp_output_ts)
stim = rack->r_ctl.rc_gp_cumack_ts - rack->r_ctl.rc_gp_output_ts;
else
stim = 0;
@@ -3468,8 +4219,8 @@
utim = max(tim, 1);
else
utim = max(stim, 1);
- /* Lets validate utim */
- ltim = max(1, (utim/HPTS_USEC_IN_MSEC));
+ /* Lets get a msec time ltim too for the old stuff */
+ ltim = max(1, (utim / HPTS_USEC_IN_MSEC));
gput = (((uint64_t) (th_ack - tp->gput_seq)) << 3) / ltim;
reqbytes = min(rc_init_window(rack), (MIN_GP_WIN * segsiz));
if ((tim == 0) && (stim == 0)) {
@@ -3602,14 +4353,14 @@
11, __LINE__, NULL);
bytes_ps = rack->r_ctl.last_max_bw;
}
- /* We store gp for b/w in bytes per second */
+ /* We store gp for b/w in bytes per second */
if (rack->rc_gp_filled == 0) {
/* Initial measurment */
if (bytes_ps) {
rack->r_ctl.gp_bw = bytes_ps;
rack->rc_gp_filled = 1;
- rack->r_ctl.num_avg = 1;
- rack_set_pace_segments(rack->rc_tp, rack, __LINE__);
+ rack->r_ctl.num_measurements = 1;
+ rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
} else {
rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
rack->r_ctl.rc_app_limited_cnt,
@@ -3629,14 +4380,17 @@
rack->r_ctl.rc_hpts_flags = 0;
rack->r_ctl.rc_last_output_to = 0;
}
- } else if (rack->r_ctl.num_avg < RACK_REQ_AVG) {
+ did_add = 2;
+ } else if (rack->r_ctl.num_measurements < RACK_REQ_AVG) {
/* Still a small number run an average */
rack->r_ctl.gp_bw += bytes_ps;
- rack->r_ctl.num_avg++;
- if (rack->r_ctl.num_avg >= RACK_REQ_AVG) {
+ addpart = rack->r_ctl.num_measurements;
+ rack->r_ctl.num_measurements++;
+ if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) {
/* We have collected enought to move forward */
- rack->r_ctl.gp_bw /= (uint64_t)rack->r_ctl.num_avg;
+ rack->r_ctl.gp_bw /= (uint64_t)rack->r_ctl.num_measurements;
}
+ did_add = 3;
} else {
/*
* We want to take 1/wma of the goodput and add in to 7/8th
@@ -3650,15 +4404,16 @@
* other hand if we get a measurement over 1ms with a
* 10ms rtt we only want to take a much smaller portion.
*/
- uint64_t resid_bw, subpart, addpart, srtt;
-
- srtt = ((uint64_t)TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT);
+ if (rack->r_ctl.num_measurements < 0xff) {
+ rack->r_ctl.num_measurements++;
+ }
+ srtt = (uint64_t)tp->t_srtt;
if (srtt == 0) {
/*
* Strange why did t_srtt go back to zero?
*/
if (rack->r_ctl.rc_rack_min_rtt)
- srtt = (rack->r_ctl.rc_rack_min_rtt * HPTS_USEC_IN_MSEC);
+ srtt = rack->r_ctl.rc_rack_min_rtt;
else
srtt = HPTS_USEC_IN_MSEC;
}
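The comment above describes folding a new goodput sample into the running average with a weight proportional to how many srtt's the measurement spanned. This is a hedged sketch of that weighting idea only, not the exact kernel arithmetic; the function name is made up.

#include <stdint.h>
#include <stdio.h>

/*
 * Blend a new goodput sample into a running average, letting the
 * sample displace roughly (meas_usec / srtt_usec) eighths of the old
 * value, bounded so it never displaces more than the whole average.
 */
static uint64_t
blend_goodput(uint64_t old_bw, uint64_t sample_bw, uint64_t meas_usec,
    uint64_t srtt_usec)
{
    uint64_t weight, subpart, addpart;

    if (srtt_usec == 0)
        srtt_usec = 1;
    weight = meas_usec / srtt_usec;     /* how many srtt's we measured */
    if (weight == 0)
        weight = 1;
    if (weight > 8)
        weight = 8;                     /* never remove more than all of it */
    subpart = old_bw / 8 * weight;      /* portion of the old average to drop */
    addpart = sample_bw / 8 * weight;   /* matching portion of the new sample */
    return (old_bw - subpart + addpart);
}

int
main(void)
{
    /* A sample spanning ~2 srtt's replaces 2/8ths of the old average. */
    printf("%llu\n", (unsigned long long)blend_goodput(800000, 1000000,
        80000, 40000));
    return (0);
}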
@@ -3704,6 +4459,7 @@
}
resid_bw = rack->r_ctl.gp_bw - subpart;
rack->r_ctl.gp_bw = resid_bw + addpart;
+ did_add = 1;
} else {
if ((utim / srtt) <= 1) {
/*
@@ -3734,11 +4490,22 @@
* if its larger, all others we just
* add in.
*/
+ did_add = 1;
resid_bw = rack->r_ctl.gp_bw - subpart;
rack->r_ctl.gp_bw = resid_bw + addpart;
}
}
}
+ if ((rack->gp_ready == 0) &&
+ (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) {
+ /* We have enough measurements now */
+ rack->gp_ready = 1;
+ rack_set_cc_pacing(rack);
+ if (rack->defer_options)
+ rack_apply_deferred_options(rack);
+ }
+ rack_log_pacing_delay_calc(rack, subpart, addpart, bytes_ps, stim,
+ rack_get_bw(rack), 22, did_add, NULL);
/* We do not update any multipliers if we are in or have seen a probe-rtt */
if ((rack->measure_saw_probe_rtt == 0) && rack->rc_gp_rtt_set)
rack_update_multiplier(rack, timely_says, bytes_ps,
@@ -3831,17 +4598,20 @@
tp->gput_seq = rsm->r_start;
}
if (rsm->r_flags & RACK_ACKED)
- tp->gput_ts = rsm->r_ack_arrival;
+ tp->gput_ts = (uint32_t)rsm->r_ack_arrival;
else
rack->app_limited_needs_set = 1;
- rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send;
+ rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
} else {
/*
* If we don't find the rsm due to some
* send-limit set the current time, which
* basically disables the send-limit.
*/
- rack->r_ctl.rc_gp_output_ts = tcp_get_usecs(NULL);
+ struct timeval tv;
+
+ microuptime(&tv);
+ rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv);
}
rack_log_pacing_delay_calc(rack,
tp->gput_seq,
@@ -3858,12 +4628,16 @@
* CC wrapper hook functions
*/
static void
-rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs,
+rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, uint32_t th_ack, uint16_t nsegs,
uint16_t type, int32_t recovery)
{
+ uint32_t prior_cwnd, acked;
+ struct tcp_log_buffer *lgb = NULL;
+ uint8_t labc_to_use;
+
INP_WLOCK_ASSERT(tp->t_inpcb);
tp->ccv->nsegs = nsegs;
- tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th);
+ acked = tp->ccv->bytes_this_ack = (th_ack - tp->snd_una);
if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) {
uint32_t max;
@@ -3872,18 +4646,14 @@
tp->ccv->bytes_this_ack = max;
}
}
- if (rack->r_ctl.cwnd_to_use <= tp->snd_wnd)
- tp->ccv->flags |= CCF_CWND_LIMITED;
- else
- tp->ccv->flags &= ~CCF_CWND_LIMITED;
#ifdef STATS
stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF,
((int32_t)rack->r_ctl.cwnd_to_use) - tp->snd_wnd);
#endif
if ((tp->t_flags & TF_GPUTINPROG) &&
- rack_enough_for_measurement(tp, rack, th->th_ack)) {
+ rack_enough_for_measurement(tp, rack, th_ack)) {
/* Measure the Goodput */
- rack_do_goodput_measurement(tp, rack, th->th_ack, __LINE__);
+ rack_do_goodput_measurement(tp, rack, th_ack, __LINE__);
#ifdef NETFLIX_PEAKRATE
if ((type == CC_ACK) &&
(tp->t_maxpeakrate)) {
@@ -3893,12 +4663,19 @@
* it will only be used if pace_always is off i.e
* we don't do this for paced flows.
*/
- tcp_update_peakrate_thr(tp);
+ rack_update_peakrate_thr(tp);
}
#endif
}
- if (rack->r_ctl.cwnd_to_use > tp->snd_ssthresh) {
- tp->t_bytes_acked += tp->ccv->bytes_this_ack;
+ /* Which way are we limited? If not cwnd limited, no advance in CA */
+ if (tp->snd_cwnd <= tp->snd_wnd)
+ tp->ccv->flags |= CCF_CWND_LIMITED;
+ else
+ tp->ccv->flags &= ~CCF_CWND_LIMITED;
+ if (tp->snd_cwnd > tp->snd_ssthresh) {
+ tp->t_bytes_acked += min(tp->ccv->bytes_this_ack,
+ nsegs * V_tcp_abc_l_var * ctf_fixed_maxseg(tp));
+ /* For the setting of a window past use the actual scwnd we are using */
if (tp->t_bytes_acked >= rack->r_ctl.cwnd_to_use) {
tp->t_bytes_acked -= rack->r_ctl.cwnd_to_use;
tp->ccv->flags |= CCF_ABC_SENTAWND;
@@ -3907,11 +4684,61 @@
tp->ccv->flags &= ~CCF_ABC_SENTAWND;
tp->t_bytes_acked = 0;
}
+ prior_cwnd = tp->snd_cwnd;
+ if ((recovery == 0) || (rack_max_abc_post_recovery == 0) || rack->r_use_labc_for_rec ||
+ (rack_client_low_buf && (rack->client_bufferlvl < rack_client_low_buf)))
+ labc_to_use = rack->rc_labc;
+ else
+ labc_to_use = rack_max_abc_post_recovery;
+ if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.flex1 = th_ack;
+ log.u_bbr.flex2 = tp->ccv->flags;
+ log.u_bbr.flex3 = tp->ccv->bytes_this_ack;
+ log.u_bbr.flex4 = tp->ccv->nsegs;
+ log.u_bbr.flex5 = labc_to_use;
+ log.u_bbr.flex6 = prior_cwnd;
+ log.u_bbr.flex7 = V_tcp_do_newsack;
+ log.u_bbr.flex8 = 1;
+ lgb = tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
+ 0, &log, false, NULL, NULL, 0, &tv);
+ }
if (CC_ALGO(tp)->ack_received != NULL) {
/* XXXLAS: Find a way to live without this */
- tp->ccv->curack = th->th_ack;
+ tp->ccv->curack = th_ack;
+ tp->ccv->labc = labc_to_use;
+ tp->ccv->flags |= CCF_USE_LOCAL_ABC;
CC_ALGO(tp)->ack_received(tp->ccv, type);
}
+ if (lgb) {
+ lgb->tlb_stackinfo.u_bbr.flex6 = tp->snd_cwnd;
+ }
+ if (rack->r_must_retran) {
+ if (SEQ_GEQ(th_ack, rack->r_ctl.rc_snd_max_at_rto)) {
+ /*
+ * We now are beyond the rxt point so lets disable
+ * the flag.
+ */
+ rack->r_ctl.rc_out_at_rto = 0;
+ rack->r_must_retran = 0;
+ } else if ((prior_cwnd + ctf_fixed_maxseg(tp)) <= tp->snd_cwnd) {
+ /*
+ * Only decrement the rc_out_at_rto if the cwnd advances
+ * at least a whole segment. Otherwise next time the peer
+ * acks, we won't be able to send; this generally happens
+ * when we are in Congestion Avoidance.
+ */
+ if (acked <= rack->r_ctl.rc_out_at_rto) {
+ rack->r_ctl.rc_out_at_rto -= acked;
+ } else {
+ rack->r_ctl.rc_out_at_rto = 0;
+ }
+ }
+ }
#ifdef STATS
stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, rack->r_ctl.cwnd_to_use);
#endif
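The ack-processing change above bounds how much of each ack counts toward the next congestion-avoidance increase (nsegs * L * maxseg, RFC 3465 style) and separately passes a per-ack labc value down to the CC module. A simplified sketch of the byte-counting step only; the structure and names are hypothetical.

#include <stdint.h>
#include <stdio.h>

struct mini_cc {
    uint64_t cwnd, ssthresh, bytes_acked;
    uint32_t maxseg;
};

/* Count acked bytes toward the next CA increase, with an ABC-style bound. */
static void
account_ack(struct mini_cc *cc, uint32_t acked, uint32_t nsegs,
    uint32_t abc_limit)
{
    uint64_t credit;

    if (cc->cwnd <= cc->ssthresh)
        return;                 /* slow start handled elsewhere */
    /* In CA, a single ack may only count up to nsegs * L * SMSS. */
    credit = acked;
    if (credit > (uint64_t)nsegs * abc_limit * cc->maxseg)
        credit = (uint64_t)nsegs * abc_limit * cc->maxseg;
    cc->bytes_acked += credit;
    if (cc->bytes_acked >= cc->cwnd) {
        cc->bytes_acked -= cc->cwnd;
        cc->cwnd += cc->maxseg; /* one MSS per cwnd worth of data acked */
    }
}

int
main(void)
{
    struct mini_cc cc = { .cwnd = 20 * 1448, .ssthresh = 10 * 1448,
        .bytes_acked = 0, .maxseg = 1448 };

    account_ack(&cc, 4 * 1448, 4, 2);
    printf("cwnd=%llu bytes_acked=%llu\n",
        (unsigned long long)cc.cwnd, (unsigned long long)cc.bytes_acked);
    return (0);
}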
@@ -3929,7 +4756,7 @@
}
static void
-tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th)
+tcp_rack_partialack(struct tcpcb *tp)
{
struct tcp_rack *rack;
@@ -3948,7 +4775,7 @@
}
static void
-rack_post_recovery(struct tcpcb *tp, struct tcphdr *th)
+rack_post_recovery(struct tcpcb *tp, uint32_t th_ack)
{
struct tcp_rack *rack;
uint32_t orig_cwnd;
@@ -3956,21 +4783,57 @@
orig_cwnd = tp->snd_cwnd;
INP_WLOCK_ASSERT(tp->t_inpcb);
rack = (struct tcp_rack *)tp->t_fb_ptr;
- if (rack->rc_not_backing_off == 0) {
- /* only alert CC if we alerted when we entered */
- if (CC_ALGO(tp)->post_recovery != NULL) {
- tp->ccv->curack = th->th_ack;
- CC_ALGO(tp)->post_recovery(tp->ccv);
- }
- if (tp->snd_cwnd > tp->snd_ssthresh) {
- /* Drop us down to the ssthresh (1/2 cwnd at loss) */
+ /* only alert CC if we alerted when we entered */
+ if (CC_ALGO(tp)->post_recovery != NULL) {
+ tp->ccv->curack = th_ack;
+ CC_ALGO(tp)->post_recovery(tp->ccv);
+ if (tp->snd_cwnd < tp->snd_ssthresh) {
+ /*
+ * Rack has burst control and pacing
+ * so let's not set this any lower than
+ * snd_ssthresh per RFC-6582 (option 2).
+ */
tp->snd_cwnd = tp->snd_ssthresh;
}
}
+ if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.flex1 = th_ack;
+ log.u_bbr.flex2 = tp->ccv->flags;
+ log.u_bbr.flex3 = tp->ccv->bytes_this_ack;
+ log.u_bbr.flex4 = tp->ccv->nsegs;
+ log.u_bbr.flex5 = V_tcp_abc_l_var;
+ log.u_bbr.flex6 = orig_cwnd;
+ log.u_bbr.flex7 = V_tcp_do_newsack;
+ log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
+ log.u_bbr.flex8 = 2;
+ tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
+ 0, &log, false, NULL, NULL, 0, &tv);
+ }
if ((rack->rack_no_prr == 0) &&
+ (rack->no_prr_addback == 0) &&
(rack->r_ctl.rc_prr_sndcnt > 0)) {
- /* Suck the next prr cnt back into cwnd */
- tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt;
+ /*
+ * Suck the next prr cnt back into cwnd, but
+ * only do that if we are not application limited.
+ */
+ if (ctf_outstanding(tp) <= sbavail(&(tp->t_inpcb->inp_socket->so_snd))) {
+ /*
+ * We are allowed to add back to the cwnd the amount we did
+ * not get out if:
+ * a) no_prr_addback is off.
+ * b) we are not app limited
+ * c) we are doing prr
+ * <and>
+ * d) it is bounded by rack_prr_addbackmax (if addback is 0, then none).
+ */
+ tp->snd_cwnd += min((ctf_fixed_maxseg(tp) * rack_prr_addbackmax),
+ rack->r_ctl.rc_prr_sndcnt);
+ }
rack->r_ctl.rc_prr_sndcnt = 0;
rack_log_to_prr(rack, 1, 0);
}
@@ -3980,12 +4843,21 @@
}
static void
-rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
+rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack)
{
struct tcp_rack *rack;
+ uint32_t ssthresh_enter, cwnd_enter, in_rec_at_entry, orig_cwnd;
INP_WLOCK_ASSERT(tp->t_inpcb);
-
+#ifdef STATS
+ stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type);
+#endif
+ if (IN_RECOVERY(tp->t_flags) == 0) {
+ in_rec_at_entry = 0;
+ ssthresh_enter = tp->snd_ssthresh;
+ cwnd_enter = tp->snd_cwnd;
+ } else
+ in_rec_at_entry = 1;
rack = (struct tcp_rack *)tp->t_fb_ptr;
switch (type) {
case CC_NDUPACK:
@@ -3996,7 +4868,7 @@
rack->r_ctl.rc_prr_out = 0;
if (rack->rack_no_prr == 0) {
rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
- rack_log_to_prr(rack, 2, 0);
+ rack_log_to_prr(rack, 2, in_rec_at_entry);
}
rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una;
tp->snd_recover = tp->snd_max;
@@ -4010,7 +4882,7 @@
* Allow ECN reaction on ACK to CWR, if
* that data segment was also CE marked.
*/
- SEQ_GEQ(th->th_ack, tp->snd_recover)) {
+ SEQ_GEQ(ack, tp->snd_recover)) {
EXIT_CONGRECOVERY(tp->t_flags);
KMOD_TCPSTAT_INC(tcps_ecn_rcwnd);
tp->snd_recover = tp->snd_max + 1;
@@ -4024,7 +4896,9 @@
EXIT_RECOVERY(tp->t_flags);
tp->snd_ssthresh = max(2, min(tp->snd_wnd, rack->r_ctl.cwnd_to_use) / 2 /
ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp);
+ orig_cwnd = tp->snd_cwnd;
tp->snd_cwnd = ctf_fixed_maxseg(tp);
+ rack_log_to_prr(rack, 16, orig_cwnd);
if (tp->t_flags2 & TF2_ECN_PERMIT)
tp->t_flags2 |= TF2_ECN_SND_CWR;
break;
@@ -4046,46 +4920,19 @@
tp->t_badrxtwin = 0;
break;
}
- /*
- * If we are below our max rtt, don't
- * signal the CC control to change things.
- * instead set it up so that we are in
- * recovery but not going to back off.
- */
-
- if (rack->rc_highly_buffered) {
- /*
- * Do we use the higher rtt for
- * our threshold to not backoff (like CDG)?
- */
- uint32_t rtt_mul, rtt_div;
-
- if (rack_use_max_for_nobackoff) {
- rtt_mul = (rack_gp_rtt_maxmul - 1);
- rtt_div = 1;
- } else {
- rtt_mul = rack_gp_rtt_minmul;
- rtt_div = max(rack_gp_rtt_mindiv , 1);
- }
- if (rack->r_ctl.rc_gp_srtt <= (rack->r_ctl.rc_lowest_us_rtt +
- ((rack->r_ctl.rc_lowest_us_rtt * rtt_mul) /
- rtt_div))) {
- /* below our min threshold */
- rack->rc_not_backing_off = 1;
- ENTER_RECOVERY(rack->rc_tp->t_flags);
- rack_log_rtt_shrinks(rack, 0,
- rtt_mul,
- rtt_div,
- RACK_RTTS_NOBACKOFF);
- return;
- }
- }
- rack->rc_not_backing_off = 0;
- if (CC_ALGO(tp)->cong_signal != NULL) {
- if (th != NULL)
- tp->ccv->curack = th->th_ack;
+ if ((CC_ALGO(tp)->cong_signal != NULL) &&
+ (type != CC_RTO)){
+ tp->ccv->curack = ack;
CC_ALGO(tp)->cong_signal(tp->ccv, type);
}
+ if ((in_rec_at_entry == 0) && IN_RECOVERY(tp->t_flags)) {
+ rack_log_to_prr(rack, 15, cwnd_enter);
+ rack->r_ctl.dsack_byte_cnt = 0;
+ rack->r_ctl.retran_during_recovery = 0;
+ rack->r_ctl.rc_cwnd_at_erec = cwnd_enter;
+ rack->r_ctl.rc_ssthresh_at_erec = ssthresh_enter;
+ rack->r_ent_rec_ns = 1;
+ }
}
static inline void
@@ -4131,7 +4978,7 @@
*/
#define DELAY_ACK(tp, tlen) \
(((tp->t_flags & TF_RXWIN0SENT) == 0) && \
- ((tp->t_flags & TF_DELACK) == 0) && \
+ ((tp->t_flags & TF_DELACK) == 0) && \
(tlen <= tp->t_maxseg) && \
(tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN)))
@@ -4238,8 +5085,8 @@
thresh += 1;
}
/* We don't let the rack timeout be above a RTO */
- if (thresh > TICKS_2_MSEC(rack->rc_tp->t_rxtcur)) {
- thresh = TICKS_2_MSEC(rack->rc_tp->t_rxtcur);
+ if (thresh > rack->rc_tp->t_rxtcur) {
+ thresh = rack->rc_tp->t_rxtcur;
}
/* And we don't want it above the RTO max either */
if (thresh > rack_rto_max) {
@@ -4263,7 +5110,7 @@
else
thresh = (srtt * 2);
- /* Get the previous sent packet, if any */
+ /* Get the previous sent packet, if any */
segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
counter_u64_add(rack_enter_tlp_calc, 1);
len = rsm->r_end - rsm->r_start;
@@ -4293,12 +5140,12 @@
counter_u64_add(rack_used_tlpmethod, 1);
idx = rsm->r_rtr_cnt - 1;
nidx = prsm->r_rtr_cnt - 1;
- if (TSTMP_GEQ(rsm->r_tim_lastsent[nidx], prsm->r_tim_lastsent[idx])) {
+ if (rsm->r_tim_lastsent[nidx] >= prsm->r_tim_lastsent[idx]) {
/* Yes it was sent later (or at the same time) */
inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx];
}
thresh += inter_gap;
- } else if (len <= segsiz) {
+ } else if (len <= segsiz) {
/*
* Possibly compensate for delayed-ack.
*/
@@ -4322,9 +5169,9 @@
thresh = alt_thresh;
}
}
- /* Not above an RTO */
- if (thresh > TICKS_2_MSEC(tp->t_rxtcur)) {
- thresh = TICKS_2_MSEC(tp->t_rxtcur);
+ /* Not above an RTO */
+ if (thresh > tp->t_rxtcur) {
+ thresh = tp->t_rxtcur;
}
/* Not above a RTO max */
if (thresh > rack_rto_max) {
@@ -4349,10 +5196,10 @@
* yet set.
*/
if (rack->rc_rack_rtt)
- return(rack->rc_rack_rtt);
+ return (rack->rc_rack_rtt);
else if (tp->t_srtt == 0)
- return(RACK_INITIAL_RTO);
- return (TICKS_2_MSEC(tp->t_srtt >> TCP_RTT_SHIFT));
+ return (RACK_INITIAL_RTO);
+ return (tp->t_srtt);
}
static struct rack_sendmap *
@@ -4384,10 +5231,10 @@
idx = rsm->r_rtr_cnt - 1;
srtt = rack_grab_rtt(tp, rack);
thresh = rack_calc_thresh_rack(rack, srtt, tsused);
- if (TSTMP_LT(tsused, rsm->r_tim_lastsent[idx])) {
+ if (TSTMP_LT(tsused, ((uint32_t)rsm->r_tim_lastsent[idx]))) {
return (NULL);
}
- if ((tsused - rsm->r_tim_lastsent[idx]) < thresh) {
+ if ((tsused - ((uint32_t)rsm->r_tim_lastsent[idx])) < thresh) {
return (NULL);
}
/* Ok if we reach here we are over-due and this guy can be sent */
@@ -4400,7 +5247,7 @@
rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
}
- rack_cong_signal(tp, NULL, CC_NDUPACK);
+ rack_cong_signal(tp, CC_NDUPACK, tp->snd_una);
return (rsm);
}
@@ -4411,8 +5258,8 @@
int32_t tt;
uint32_t ret_val;
- t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT));
- TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
+ t = (tp->t_srtt + (tp->t_rttvar << 2));
+ RACK_TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
rack_persist_min, rack_persist_max);
if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
tp->t_rxtshift++;
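The persist timer above is srtt + 4 * rttvar (now kept in microseconds), scaled by the retransmit backoff table and clamped into [rack_persist_min, rack_persist_max]. A user-space sketch of that computation; the backoff table here only mirrors the classic shape and the names are made up.

#include <stdint.h>
#include <stdio.h>

static const int backoff[] = { 1, 2, 4, 8, 16, 32, 64, 64, 64 };

/* srtt and rttvar in microseconds; result clamped to [tmin, tmax]. */
static uint32_t
persist_timeout(uint32_t srtt, uint32_t rttvar, int shift,
    uint32_t tmin, uint32_t tmax)
{
    uint64_t t;

    if (shift < 0)
        shift = 0;
    if (shift >= (int)(sizeof(backoff) / sizeof(backoff[0])))
        shift = (int)(sizeof(backoff) / sizeof(backoff[0])) - 1;
    t = (uint64_t)(srtt + (rttvar << 2)) * backoff[shift];
    if (t < tmin)
        t = tmin;
    if (t > tmax)
        t = tmax;
    return ((uint32_t)t);
}

int
main(void)
{
    /* 40 ms srtt, 10 ms rttvar, third backoff step, 250 ms..60 s bounds. */
    printf("%u usec\n", persist_timeout(40000, 10000, 2, 250000, 60000000));
    return (0);
}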
@@ -4446,32 +5293,65 @@
}
rack->rc_on_min_to = 0;
if ((tp->t_state < TCPS_ESTABLISHED) ||
- ((tp->t_flags & TF_SACK_PERMIT) == 0))
+ ((tp->t_flags & TF_SACK_PERMIT) == 0)) {
goto activate_rxt;
+ }
rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
if ((rsm == NULL) || sup_rack) {
- /* Nothing on the send map */
+ /* Nothing on the send map or no rack */
activate_rxt:
time_since_sent = 0;
rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
if (rsm) {
+ /*
+ * Should we discount the RTX timer any?
+ *
+ * We want to discount it the smallest amount.
+ * If a timer (Rack/TLP or RXT) has gone off more
+ * recently, that's the discount we want to use (now - timer time).
+ * If the retransmit of the oldest packet was more recent, then
+ * we want to use that (now - oldest-packet-last_transmit_time).
+ *
+ */
idx = rsm->r_rtr_cnt - 1;
- if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], rack->r_ctl.rc_tlp_rxt_last_time))
- tstmp_touse = rsm->r_tim_lastsent[idx];
+ if (TSTMP_GEQ(rack->r_ctl.rc_tlp_rxt_last_time, ((uint32_t)rsm->r_tim_lastsent[idx])))
+ tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time;
else
- tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time;
+ tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx];
if (TSTMP_GT(cts, tstmp_touse))
time_since_sent = cts - tstmp_touse;
}
if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) {
rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT;
- to = TICKS_2_MSEC(tp->t_rxtcur);
+ to = tp->t_rxtcur;
if (to > time_since_sent)
to -= time_since_sent;
else
to = rack->r_ctl.rc_min_to;
if (to == 0)
to = 1;
+ /* Special case for KEEPINIT */
+ if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) &&
+ (TP_KEEPINIT(tp) != 0) &&
+ rsm) {
+ /*
+ * We have to put a ceiling on the rxt timer
+ * of the keep-init timeout.
+ */
+ uint32_t max_time, red;
+
+ max_time = TICKS_2_USEC(TP_KEEPINIT(tp));
+ if (TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) {
+ red = (cts - (uint32_t)rsm->r_tim_lastsent[0]);
+ if (red < max_time)
+ max_time -= red;
+ else
+ max_time = 1;
+ }
+ /* Reduce timeout to the keep value if needed */
+ if (max_time < to)
+ to = max_time;
+ }
return (to);
}
return (0);
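The additions above discount the pending RXT timeout by the time already elapsed since the more recent reference point (last timer event or oldest packet's last transmission, so the discount is the smallest), and cap it by what remains of the keep-init timer on a still-connecting socket. A small sketch of both adjustments, with hypothetical parameters:

#include <stdint.h>
#include <stdio.h>

static uint32_t
rxt_timeout(uint32_t rxtcur, uint32_t now, uint32_t last_timer,
    uint32_t oldest_sent, uint32_t min_to, uint32_t keepinit_left)
{
    uint32_t base, since, to;

    /* Use the more recent reference point so the discount is smallest. */
    base = (last_timer > oldest_sent) ? last_timer : oldest_sent;
    since = (now > base) ? now - base : 0;
    to = (rxtcur > since) ? rxtcur - since : min_to;
    if (to == 0)
        to = 1;
    /* Ceiling: don't outlive the keep-init timer on a connecting socket. */
    if (keepinit_left != 0 && to > keepinit_left)
        to = keepinit_left;
    return (to);
}

int
main(void)
{
    printf("%u usec\n",
        rxt_timeout(1000000, 5000000, 4600000, 4400000, 1000, 0));
    return (0);
}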
@@ -4505,7 +5385,7 @@
goto activate_rxt;
}
if ((rack->use_rack_rr == 0) &&
- (IN_RECOVERY(tp->t_flags)) &&
+ (IN_FASTRECOVERY(tp->t_flags)) &&
(rack->rack_no_prr == 0) &&
(rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) {
/*
@@ -4521,7 +5401,7 @@
srtt = rack_grab_rtt(tp, rack);
thresh = rack_calc_thresh_rack(rack, srtt, cts);
idx = rsm->r_rtr_cnt - 1;
- exp = rsm->r_tim_lastsent[idx] + thresh;
+ exp = ((uint32_t)rsm->r_tim_lastsent[idx]) + thresh;
if (SEQ_GEQ(exp, cts)) {
to = exp - cts;
if (to < rack->r_ctl.rc_min_to) {
@@ -4557,16 +5437,25 @@
}
idx = rsm->r_rtr_cnt - 1;
time_since_sent = 0;
- if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], rack->r_ctl.rc_tlp_rxt_last_time))
- tstmp_touse = rsm->r_tim_lastsent[idx];
+ if (TSTMP_GEQ(((uint32_t)rsm->r_tim_lastsent[idx]), rack->r_ctl.rc_tlp_rxt_last_time))
+ tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx];
else
- tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time;
+ tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time;
if (TSTMP_GT(cts, tstmp_touse))
time_since_sent = cts - tstmp_touse;
is_tlp_timer = 1;
if (tp->t_srtt) {
- srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT);
- srtt = TICKS_2_MSEC(srtt_cur);
+ if ((rack->rc_srtt_measure_made == 0) &&
+ (tp->t_srtt == 1)) {
+ /*
+ * If another stack has run and set srtt to 1,
+ * then the srtt was 0, so let's use the initial.
+ */
+ srtt = RACK_INITIAL_RTO;
+ } else {
+ srtt_cur = tp->t_srtt;
+ srtt = srtt_cur;
+ }
} else
srtt = RACK_INITIAL_RTO;
/*
@@ -4574,23 +5463,29 @@
* rack RTT has spiked we want to use
* the last RTT not the smoothed one.
*/
- if (rack_tlp_use_greater && (srtt < rack_grab_rtt(tp, rack)))
+ if (rack_tlp_use_greater &&
+ tp->t_srtt &&
+ (srtt < rack_grab_rtt(tp, rack))) {
srtt = rack_grab_rtt(tp, rack);
+ }
thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt);
- if (thresh > time_since_sent)
+ if (thresh > time_since_sent) {
to = thresh - time_since_sent;
- else {
+ } else {
to = rack->r_ctl.rc_min_to;
rack_log_alt_to_to_cancel(rack,
thresh, /* flex1 */
time_since_sent, /* flex2 */
tstmp_touse, /* flex3 */
rack->r_ctl.rc_tlp_rxt_last_time, /* flex4 */
- rsm->r_tim_lastsent[idx],
+ (uint32_t)rsm->r_tim_lastsent[idx],
srtt,
idx, 99);
}
- if (to > TCPTV_REXMTMAX) {
+ if (to < rack_tlp_min) {
+ to = rack_tlp_min;
+ }
+ if (to > TICKS_2_USEC(TCPTV_REXMTMAX)) {
/*
* If the TLP time works out to larger than the max
* RTO lets not do TLP.. just RTO.
@@ -4630,6 +5525,8 @@
rack->r_ctl.rc_went_idle_time = 1;
rack_timer_cancel(tp, rack, cts, __LINE__);
tp->t_rxtshift = 0;
+ RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
+ rack_rto_min, rack_rto_max);
rack->rc_in_persist = 1;
}
}
@@ -4637,9 +5534,9 @@
static void
rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
{
- if (rack->rc_inp->inp_in_hpts) {
+ if (rack->rc_inp->inp_in_hpts) {
tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
- rack->r_ctl.rc_hpts_flags = 0;
+ rack->r_ctl.rc_hpts_flags = 0;
}
#ifdef NETFLIX_SHARED_CWND
if (rack->r_ctl.rc_scw) {
@@ -4665,7 +5562,7 @@
extra /= (uint64_t)rack_probertt_gpsrtt_cnt_div;
idle_min += (uint32_t)extra;
}
- if (time_idle >= idle_min) {
+ if (time_idle >= idle_min) {
/* Yes, we count it as a probe-rtt. */
uint32_t us_cts;
@@ -4683,7 +5580,9 @@
rack->rc_in_persist = 0;
rack->r_ctl.rc_went_idle_time = 0;
tp->t_rxtshift = 0;
- rack->r_ctl.rc_agg_delayed = 0;
+ RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
+ rack_rto_min, rack_rto_max);
+ rack->r_ctl.rc_agg_delayed = 0;
rack->r_early = 0;
rack->r_late = 0;
rack->r_ctl.rc_agg_early = 0;
@@ -4705,7 +5604,7 @@
log.u_bbr.flex6 = diag->need_new_to;
log.u_bbr.flex7 = diag->p_hpts_active;
log.u_bbr.flex8 = diag->p_on_min_sleep;
- /* Hijack other fields as needed */
+ /* Hijack other fields as needed */
log.u_bbr.epoch = diag->have_slept;
log.u_bbr.lt_epoch = diag->yet_to_sleep;
log.u_bbr.pkts_out = diag->co_ret;
@@ -4728,6 +5627,27 @@
}
+static void
+rack_log_wakeup(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb, uint32_t len, int type)
+{
+ if (rack_verbose_logging && rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.flex1 = sb->sb_flags;
+ log.u_bbr.flex2 = len;
+ log.u_bbr.flex3 = sb->sb_state;
+ log.u_bbr.flex8 = type;
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ TCP_LOG_EVENTP(rack->rc_tp, NULL,
+ &rack->rc_inp->inp_socket->so_rcv,
+ &rack->rc_inp->inp_socket->so_snd,
+ TCP_LOG_SB_WAKE, 0,
+ len, &log, false, &tv);
+ }
+}
+
static void
rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
int32_t slot, uint32_t tot_len_this_send, int sup_rack)
@@ -4737,6 +5657,7 @@
struct timeval tv;
uint32_t delayed_ack = 0;
uint32_t hpts_timeout;
+ uint32_t entry_slot = slot;
uint8_t stopped;
uint32_t left = 0;
uint32_t us_cts;
@@ -4758,11 +5679,16 @@
rack->r_ctl.rc_hpts_flags = 0;
us_cts = tcp_get_usecs(&tv);
/* Now early/late accounting */
- if (rack->r_early) {
+ rack_log_pacing_delay_calc(rack, entry_slot, slot, 0, 0, 0, 26, __LINE__, NULL);
+ if (rack->r_early && (rack->rc_ack_can_sendout_data == 0)) {
/*
* We have a early carry over set,
* we can always add more time so we
* can always make this compensation.
+ *
+ * Note if acks are allowed to wake us, do not
+ * penalize the next timer for being woken
+ * by an ack, aka the rc_agg_early (non-paced mode).
*/
slot += rack->r_ctl.rc_agg_early;
rack->r_early = 0;
@@ -4825,7 +5751,7 @@
}
#endif
if (tp->t_flags & TF_DELACK) {
- delayed_ack = TICKS_2_MSEC(tcp_delacktime);
+ delayed_ack = TICKS_2_USEC(tcp_delacktime);
rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK;
}
if (delayed_ack && ((hpts_timeout == 0) ||
@@ -4848,10 +5774,16 @@
*/
if (TCPS_HAVEESTABLISHED(tp->t_state)) {
/* Get the established keep-alive time */
- hpts_timeout = TP_KEEPIDLE(tp);
+ hpts_timeout = TICKS_2_USEC(TP_KEEPIDLE(tp));
} else {
- /* Get the initial setup keep-alive time */
- hpts_timeout = TP_KEEPINIT(tp);
+ /*
+ * Get the initial setup keep-alive time;
+ * note that this is probably not going to
+ * happen, since rack will be running an rxt timer
+ * if a SYN of some sort is outstanding. It is
+ * actually handled in rack_timeout_rxt().
+ */
+ hpts_timeout = TICKS_2_USEC(TP_KEEPINIT(tp));
}
rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP;
if (rack->in_probe_rtt) {
@@ -4862,7 +5794,7 @@
* This will get us out of probe-rtt and update
* our min-rtt.
*/
- hpts_timeout = (rack_min_probertt_hold / HPTS_USEC_IN_MSEC);
+ hpts_timeout = rack_min_probertt_hold;
}
}
}
@@ -4889,7 +5821,9 @@
hpts_timeout = 0x7ffffffe;
rack->r_ctl.rc_timer_exp = cts + hpts_timeout;
}
- if ((rack->rc_gp_filled == 0) &&
+ rack_log_pacing_delay_calc(rack, entry_slot, slot, hpts_timeout, 0, 0, 27, __LINE__, NULL);
+ if ((rack->gp_ready == 0) &&
+ (rack->use_fixed_rate == 0) &&
(hpts_timeout < slot) &&
(rack->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) {
/*
@@ -4903,29 +5837,72 @@
slot = hpts_timeout;
}
rack->r_ctl.last_pacing_time = slot;
+ /**
+ * Turn off all the flags for queuing by default. The
+ * flags have important meanings to what happens when
+ * LRO interacts with the transport. Most likely (by default now)
+ * mbuf_queueing and ack compression are on. So the transport
+ * has a couple of flags that control what happens (if those
+ * are not on then these flags won't have any effect since it
+ * won't go through the queuing LRO path).
+ *
+ * INP_MBUF_QUEUE_READY - This flag says that I am busy
+ * pacing output, so don't disturb. But
+ * it also means LRO can wake me if there
+ * is a SACK arrival.
+ *
+ * INP_DONT_SACK_QUEUE - This flag is used in conjunction
+ * with the above flag (QUEUE_READY) and
+ * when present it says don't even wake me
+ * if a SACK arrives.
+ *
+ * The idea behind these flags is that if we are pacing we
+ * set the MBUF_QUEUE_READY and only get woken up if
+ * a SACK arrives (which could change things) or if
+ * our pacing timer expires. If, however, we have a rack
+ * timer running, then we don't even want a sack to wake
+ * us since the rack timer has to expire before we can send.
+ *
+ * Other cases should usually have none of the flags set
+ * so LRO can call into us.
+ */
+ inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY);
if (slot) {
rack->r_ctl.rc_last_output_to = us_cts + slot;
- if (rack->rc_always_pace || rack->r_mbuf_queue) {
- if ((rack->rc_gp_filled == 0) ||
- rack->pacing_longer_than_rtt) {
- inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY);
- } else {
- inp->inp_flags2 |= INP_MBUF_QUEUE_READY;
- if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) &&
- (rack->r_rr_config != 3))
- inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
- else
- inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
- }
+ /*
+ * A pacing timer (slot) is being set, in
+ * such a case we cannot send (we are blocked by
+ * the timer). So lets tell LRO that it should not
+ * wake us unless there is a SACK. Note this only
+ * will be effective if mbuf queueing is on or
+ * compressed acks are being processed.
+ */
+ inp->inp_flags2 |= INP_MBUF_QUEUE_READY;
+ /*
+ * But wait, if we have a Rack timer running,
+ * even a SACK should not disturb us (with
+ * the exception of r_rr_config 3).
+ */
+ if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) &&
+ (rack->r_rr_config != 3))
+ inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
+ if (rack->rc_ack_can_sendout_data) {
+ /*
+ * Ahh but wait, this is that special case
+ * where the pacing timer can be disturbed;
+ * back out the changes (used for non-paced
+ * burst limiting).
+ */
+ inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY);
}
if ((rack->use_rack_rr) &&
(rack->r_rr_config < 2) &&
- ((hpts_timeout) && ((hpts_timeout * HPTS_USEC_IN_MSEC) < slot))) {
+ ((hpts_timeout) && (hpts_timeout < slot))) {
/*
* Arrange for the hpts to kick back in after the
* t-o if the t-o does not cause a send.
*/
- (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout),
+ (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(hpts_timeout),
__LINE__, &diag);
rack_log_hpts_diag(rack, us_cts, &diag, &tv);
rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
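The long comment above lays out when LRO may wake the connection: queue-ready while a pacing timer runs, don't-sack-queue when a rack timer also runs (unless r_rr_config is 3), and nothing at all when acks are allowed to send data. A sketch of that decision with stand-in flag bits rather than the real INP_* values; names are illustrative.

#include <stdint.h>
#include <stdio.h>

#define QUEUE_READY     0x1     /* "don't wake me unless a SACK arrives" */
#define DONT_SACK_QUEUE 0x2     /* "don't wake me even for a SACK" */

static uint32_t
lro_wake_flags(int pacing_timer_set, int rack_timer_running, int rr_config,
    int ack_can_send_data)
{
    uint32_t flags = 0;

    if (!pacing_timer_set)
        return (0);             /* timers only: let any ack wake us */
    flags |= QUEUE_READY;
    if (rack_timer_running && rr_config != 3)
        flags |= DONT_SACK_QUEUE;
    if (ack_can_send_data)
        flags = 0;              /* non-paced burst limiting: back it all out */
    return (flags);
}

int
main(void)
{
    printf("0x%x\n", lro_wake_flags(1, 1, 0, 0));
    return (0);
}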
@@ -4936,21 +5913,15 @@
rack_log_to_start(rack, cts, hpts_timeout, slot, 1);
}
} else if (hpts_timeout) {
- if (rack->rc_always_pace || rack->r_mbuf_queue) {
- if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) {
- /* For a rack timer, don't wake us */
- inp->inp_flags2 |= INP_MBUF_QUEUE_READY;
- if (rack->r_rr_config != 3)
- inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
- else
- inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
- } else {
- /* All other timers wake us up */
- inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
- inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
- }
- }
- (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout),
+ /*
+ * With respect to inp_flags2 here, let's let any new acks wake
+ * us up. Since we are not pacing (no pacing timer), output
+ * can happen so we should let it. If it's a Rack timer, then any inbound
+ * packet probably won't change the sending (we will be blocked)
+ * but it may change the prr stats, so letting it in (the defaults
+ * set at the start of this block) is good enough.
+ */
+ (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(hpts_timeout),
__LINE__, &diag);
rack_log_hpts_diag(rack, us_cts, &diag, &tv);
rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
@@ -4986,12 +5957,10 @@
* settings.
*/
struct rack_sendmap *rsm;
- int32_t recovery;
if (tp->t_timers->tt_flags & TT_STOPPED) {
return (1);
}
- recovery = IN_RECOVERY(tp->t_flags);
counter_u64_add(rack_to_tot, 1);
if (rack->r_state && (rack->r_state != tp->t_state))
rack_set_state(tp, rack);
@@ -4999,9 +5968,8 @@
rsm = rack_check_recovery_mode(tp, cts);
rack_log_to_event(rack, RACK_TO_FRM_RACK, rsm);
if (rsm) {
- uint32_t rtt;
-
rack->r_ctl.rc_resend = rsm;
+ rack->r_timer_override = 1;
if (rack->use_rack_rr) {
/*
* Don't accumulate extra pacing delay
@@ -5011,36 +5979,8 @@
* time (in other words we get the min pacing
* time versus rrr pacing time).
*/
- rack->r_timer_override = 1;
rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
}
- rtt = rack->rc_rack_rtt;
- if (rtt == 0)
- rtt = 1;
- if (rack->rack_no_prr == 0) {
- if ((recovery == 0) &&
- (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) {
- /*
- * The rack-timeout that enter's us into recovery
- * will force out one MSS and set us up so that we
- * can do one more send in 2*rtt (transitioning the
- * rack timeout into a rack-tlp).
- */
- rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
- rack->r_timer_override = 1;
- rack_log_to_prr(rack, 3, 0);
- } else if ((rack->r_ctl.rc_prr_sndcnt < (rsm->r_end - rsm->r_start)) &&
- rack->use_rack_rr) {
- /*
- * When a rack timer goes, if the rack rr is
- * on, arrange it so we can send a full segment
- * overriding prr (though we pay a price for this
- * for future new sends).
- */
- rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
- rack_log_to_prr(rack, 4, 0);
- }
- }
}
rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK;
if (rsm == NULL) {
@@ -5052,6 +5992,50 @@
return (0);
}
+static void
+rack_adjust_orig_mlen(struct rack_sendmap *rsm)
+{
+ if (rsm->m->m_len > rsm->orig_m_len) {
+ /*
+ * Mbuf grew, caused by sbcompress, our offset does
+ * not change.
+ */
+ rsm->orig_m_len = rsm->m->m_len;
+ } else if (rsm->m->m_len < rsm->orig_m_len) {
+ /*
+ * Mbuf shrank, trimmed off the top by an ack, our
+ * offset changes.
+ */
+ rsm->soff -= (rsm->orig_m_len - rsm->m->m_len);
+ rsm->orig_m_len = rsm->m->m_len;
+ }
+}
+
+static void
+rack_setup_offset_for_rsm(struct rack_sendmap *src_rsm, struct rack_sendmap *rsm)
+{
+ struct mbuf *m;
+ uint32_t soff;
+
+ if (src_rsm->orig_m_len != src_rsm->m->m_len) {
+ /* Fix up the orig_m_len and possibly the mbuf offset */
+ rack_adjust_orig_mlen(src_rsm);
+ }
+ m = src_rsm->m;
+ soff = src_rsm->soff + (src_rsm->r_end - src_rsm->r_start);
+ while (soff >= m->m_len) {
+ /* Move out past this mbuf */
+ soff -= m->m_len;
+ m = m->m_next;
+ KASSERT((m != NULL),
+ ("rsm:%p nrsm:%p hit at soff:%u null m",
+ src_rsm, rsm, soff));
+ }
+ rsm->m = m;
+ rsm->soff = soff;
+ rsm->orig_m_len = m->m_len;
+}
+
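[Editor's note: rack_setup_offset_for_rsm() above walks forward from the left piece's (mbuf, offset) pair by the number of bytes kept in that piece. A standalone sketch of the same walk, using a simplified chain type instead of struct mbuf (all names hypothetical), is:]

#include <assert.h>
#include <stddef.h>

struct chunk {                  /* stand-in for struct mbuf */
        int len;
        struct chunk *next;
};

/*
 * Advance (m, soff) by 'adv' bytes, hopping to following chunks when
 * the offset runs past the current one -- the same walk the function
 * above performs with src_rsm's mbuf and soff.
 */
static void
advance_offset(struct chunk **m, int *soff, int adv)
{
        *soff += adv;
        while (*soff >= (*m)->len) {
                *soff -= (*m)->len;
                *m = (*m)->next;
                assert(*m != NULL);   /* caller must stay inside the chain */
        }
}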
static __inline void
rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm,
struct rack_sendmap *rsm, uint32_t start)
@@ -5063,13 +6047,23 @@
nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
nrsm->r_flags = rsm->r_flags;
nrsm->r_dupack = rsm->r_dupack;
- nrsm->usec_orig_send = rsm->usec_orig_send;
+ nrsm->r_no_rtt_allowed = rsm->r_no_rtt_allowed;
nrsm->r_rtr_bytes = 0;
rsm->r_end = nrsm->r_start;
nrsm->r_just_ret = rsm->r_just_ret;
for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
}
+ /*
+ * Now we need to find nrsm's new location in the mbuf chain.
+ * We basically calculate a new offset, which is soff +
+ * how much is left in the original rsm. Then we walk out the mbuf
+ * chain to find the right position; it may be the same mbuf
+ * or maybe not.
+ */
+ KASSERT((rsm->m != NULL),
+ ("rsm:%p nrsm:%p rack:%p -- rsm->m is NULL?", rsm, nrsm, rack));
+ rack_setup_offset_for_rsm(rsm, nrsm);
}
static struct rack_sendmap *
@@ -5089,6 +6083,8 @@
*/
struct rack_sendmap *rm;
+ rack_log_map_chg(rack->rc_tp, rack, NULL,
+ l_rsm, r_rsm, MAP_MERGE, r_rsm->r_end, __LINE__);
l_rsm->r_end = r_rsm->r_end;
if (l_rsm->r_dupack < r_rsm->r_dupack)
l_rsm->r_dupack = r_rsm->r_dupack;
@@ -5132,7 +6128,7 @@
l_rsm->r_limit_type = 0;
}
rack_free(rack, r_rsm);
- return(l_rsm);
+ return (l_rsm);
}
/*
@@ -5152,7 +6148,7 @@
struct rack_sendmap *rsm = NULL;
struct rack_sendmap *insret;
struct socket *so;
- uint32_t amm, old_prr_snd = 0;
+ uint32_t amm;
uint32_t out, avail;
int collapsed_win = 0;
@@ -5173,6 +6169,8 @@
* need to figure out how to force a full MSS segment out.
*/
rack_log_to_event(rack, RACK_TO_FRM_TLP, NULL);
+ rack->r_ctl.retran_during_recovery = 0;
+ rack->r_ctl.dsack_byte_cnt = 0;
counter_u64_add(rack_tlp_tot, 1);
if (rack->r_state && (rack->r_state != tp->t_state))
rack_set_state(tp, rack);
@@ -5204,10 +6202,9 @@
/* not enough to fill a MTU */
goto need_retran;
}
- if (IN_RECOVERY(tp->t_flags)) {
+ if (IN_FASTRECOVERY(tp->t_flags)) {
/* Unlikely */
if (rack->rack_no_prr == 0) {
- old_prr_snd = rack->r_ctl.rc_prr_sndcnt;
if (out + amm <= tp->snd_wnd) {
rack->r_ctl.rc_prr_sndcnt = amm;
rack_log_to_prr(rack, 4, 0);
@@ -5286,6 +6283,7 @@
}
rack_clone_rsm(rack, nrsm, rsm,
(rsm->r_end - ctf_fixed_maxseg(tp)));
+ rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__);
insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
#ifdef INVARIANTS
if (insret != NULL) {
@@ -5374,7 +6372,7 @@
*/
if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
(ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
- ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
+ TICKS_2_USEC(ticks - tp->t_rcvtime) >= RACK_REXMTVAL(tp) * tcp_totbackoff)) {
KMOD_TCPSTAT_INC(tcps_persistdrop);
retval = 1;
tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
@@ -5495,52 +6493,74 @@
*/
struct rack_sendmap *rsm, *trsm = NULL;
struct tcp_rack *rack;
- int32_t cnt = 0;
rack = (struct tcp_rack *)tp->t_fb_ptr;
- rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__);
+ rack_timer_cancel(tp, rack, tcp_get_usecs(NULL), __LINE__);
rack_log_to_event(rack, RACK_TO_FRM_TMR, NULL);
if (rack->r_state && (rack->r_state != tp->t_state))
rack_set_state(tp, rack);
/*
* Ideally we would like to be able to
* mark SACK-PASS on anything not acked here.
+ *
* However, if we do that we would burst out
* all that data 1ms apart. This would be unwise,
* so for now we will just let the normal rxt timer
* and tlp timer take care of it.
+ *
+ * Also we really need to stick them back in sequence
+ * order. This way we send in the proper order and any
+ * sacks that come floating in will "re-ack" the data.
+ * To do this we zap the tmap with an INIT and then
+ * walk through and place every rsm in the RB tree
+ * back in its seq ordered place.
*/
+ TAILQ_INIT(&rack->r_ctl.rc_tmap);
RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
- if (rsm->r_flags & RACK_ACKED) {
- cnt++;
- rsm->r_dupack = 0;
- rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
- if (rsm->r_in_tmap == 0) {
- /* We must re-add it back to the tlist */
- if (trsm == NULL) {
- TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
- } else {
- TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext);
- }
- rsm->r_in_tmap = 1;
- }
+ rsm->r_dupack = 0;
+ rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
+ /* We must re-add it back to the tlist */
+ if (trsm == NULL) {
+ TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
+ } else {
+ TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext);
}
+ rsm->r_in_tmap = 1;
trsm = rsm;
if (rsm->r_flags & RACK_ACKED)
rsm->r_flags |= RACK_WAS_ACKED;
rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS);
}
/* Clear the count (we just un-acked them) */
+ rack->r_ctl.rc_last_timeout_snduna = tp->snd_una;
rack->r_ctl.rc_sacked = 0;
+ rack->r_ctl.rc_sacklast = NULL;
rack->r_ctl.rc_agg_delayed = 0;
rack->r_early = 0;
rack->r_ctl.rc_agg_early = 0;
rack->r_late = 0;
/* Clear the tlp rtx mark */
rack->r_ctl.rc_resend = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
+ if (rack->r_ctl.rc_resend != NULL)
+ rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT;
rack->r_ctl.rc_prr_sndcnt = 0;
rack_log_to_prr(rack, 6, 0);
rack->r_timer_override = 1;
+ if ((((tp->t_flags & TF_SACK_PERMIT) == 0)
+#ifdef NETFLIX_EXP_DETECTION
+ || (rack->sack_attack_disable != 0)
+#endif
+ ) && ((tp->t_flags & TF_SENTFIN) == 0)) {
+ /*
+ * For non-sack customers new data
+ * needs to go out as retransmits until
+ * we retransmit up to snd_max.
+ */
+ rack->r_must_retran = 1;
+ rack->r_ctl.rc_out_at_rto = ctf_flight_size(rack->rc_tp,
+ rack->r_ctl.rc_sacked);
+ }
+ rack->r_ctl.rc_snd_max_at_rto = tp->snd_max;
}
static void
@@ -5590,11 +6610,44 @@
return (1);
}
rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT;
+ rack->r_ctl.retran_during_recovery = 0;
+ rack->r_ctl.dsack_byte_cnt = 0;
+ if (IN_FASTRECOVERY(tp->t_flags))
+ tp->t_flags |= TF_WASFRECOVERY;
+ else
+ tp->t_flags &= ~TF_WASFRECOVERY;
+ if (IN_CONGRECOVERY(tp->t_flags))
+ tp->t_flags |= TF_WASCRECOVERY;
+ else
+ tp->t_flags &= ~TF_WASCRECOVERY;
if (TCPS_HAVEESTABLISHED(tp->t_state) &&
(tp->snd_una == tp->snd_max)) {
/* Nothing outstanding .. nothing to do */
return (0);
}
+ /*
+ * Rack can only run one timer at a time, so we cannot
+ * run a KEEPINIT (gating SYN sending) and a retransmit
+ * timer for the SYN. So if we are in a front state and
+ * have a KEEPINIT timer we need to check the first transmit
+ * against now to see if we have exceeded the KEEPINIT time
+ * (if one is set).
+ */
+ if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) &&
+ (TP_KEEPINIT(tp) != 0)) {
+ struct rack_sendmap *rsm;
+
+ rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
+ if (rsm) {
+ /* Ok we have something outstanding to test keepinit with */
+ if ((TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) &&
+ ((cts - (uint32_t)rsm->r_tim_lastsent[0]) >= TICKS_2_USEC(TP_KEEPINIT(tp)))) {
+ /* We have exceeded the KEEPINIT time */
+ tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
+ goto drop_it;
+ }
+ }
+ }
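[Editor's note: the KEEPINIT test above is simply "has the time elapsed since the first transmission of the SYN crossed the keepinit budget". A minimal sketch with hypothetical microsecond inputs:]

#include <stdint.h>

/*
 * Return non-zero if the connection should be dropped because the
 * KEEPINIT budget has been used up.  All values are in microseconds;
 * the wrap-safe subtraction mirrors the TSTMP_GT()/delta test above.
 */
static int
keepinit_expired(uint32_t now_us, uint32_t first_sent_us, uint32_t keepinit_us)
{
        if (keepinit_us == 0)
                return (0);                     /* no KEEPINIT configured */
        if ((int32_t)(now_us - first_sent_us) <= 0)
                return (0);                     /* nothing has elapsed yet */
        return ((now_us - first_sent_us) >= keepinit_us);
}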
/*
* Retransmission timer went off. Message has not been acked within
* retransmit interval. Back off to a longer retransmit interval
@@ -5612,10 +6665,11 @@
tp->t_rxtshift++;
}
if (tp->t_rxtshift > TCP_MAXRXTSHIFT) {
+ tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
+drop_it:
tp->t_rxtshift = TCP_MAXRXTSHIFT;
KMOD_TCPSTAT_INC(tcps_timeoutdrop);
retval = 1;
- tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
tcp_set_inp_to_drop(rack->rc_inp,
(tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT));
goto out;
@@ -5639,27 +6693,19 @@
tp->snd_cwnd_prev = tp->snd_cwnd;
tp->snd_ssthresh_prev = tp->snd_ssthresh;
tp->snd_recover_prev = tp->snd_recover;
- if (IN_FASTRECOVERY(tp->t_flags))
- tp->t_flags |= TF_WASFRECOVERY;
- else
- tp->t_flags &= ~TF_WASFRECOVERY;
- if (IN_CONGRECOVERY(tp->t_flags))
- tp->t_flags |= TF_WASCRECOVERY;
- else
- tp->t_flags &= ~TF_WASCRECOVERY;
- tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
+ tp->t_badrxtwin = ticks + (USEC_2_TICKS(tp->t_srtt)/2);
tp->t_flags |= TF_PREVVALID;
- } else
+ } else if ((tp->t_flags & TF_RCVD_TSTMP) == 0)
tp->t_flags &= ~TF_PREVVALID;
KMOD_TCPSTAT_INC(tcps_rexmttimeo);
if ((tp->t_state == TCPS_SYN_SENT) ||
(tp->t_state == TCPS_SYN_RECEIVED))
- rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]);
+ rexmt = RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift];
else
- rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
- TCPT_RANGESET(tp->t_rxtcur, rexmt,
- max(MSEC_2_TICKS(rack_rto_min), rexmt),
- MSEC_2_TICKS(rack_rto_max));
+ rexmt = max(rack_rto_min, (tp->t_srtt + (tp->t_rttvar << 2))) * tcp_backoff[tp->t_rxtshift];
+
+ RACK_TCPT_RANGESET(tp->t_rxtcur, rexmt,
+ max(rack_rto_min, rexmt), rack_rto_max);
/*
* We enter the path for PLMTUD if connection is established or, if
* connection is FIN_WAIT_1 status, reason for the last is that if
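[Editor's note: the rexmt computation in the hunk above takes the classic srtt + 4*rttvar base (now in microseconds), multiplies by the exponential backoff table, and clamps the result. A hedged sketch of that arithmetic, with an illustrative backoff table and bounds rather than the stack's own values:]

#include <stdint.h>

/* Illustrative backoff table; shift is assumed to stay within it. */
static const uint32_t backoff[] = { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };

/* All values in microseconds. */
static uint32_t
rack_style_rto(uint32_t srtt, uint32_t rttvar, uint32_t shift,
    uint32_t rto_min, uint32_t rto_max)
{
        uint32_t base, rexmt;

        base = srtt + (rttvar << 2);        /* srtt + 4 * rttvar */
        if (base < rto_min)
                base = rto_min;
        rexmt = base * backoff[shift];
        /* RANGESET-style clamp */
        if (rexmt < rto_min)
                rexmt = rto_min;
        if (rexmt > rto_max)
                rexmt = rto_max;
        return (rexmt);
}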
@@ -5759,7 +6805,17 @@
}
}
/*
- * If we backed off this far, our srtt estimate is probably bogus.
+ * Disable RFC1323 and SACK if we haven't got any response to
+ * our third SYN to work-around some broken terminal servers
+ * (most of which have hopefully been retired) that have bad VJ
+ * header compression code which trashes TCP segments containing
+ * unknown-to-them TCP options.
+ */
+ if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
+ (tp->t_rxtshift == 3))
+ tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
+ /*
+ * If we backed off this far, our srtt estimate is probably bogus.
* Clobber it so we'll take the next rtt measurement as our srtt;
* move the current srtt into rttvar to keep the current retransmit
* times until then.
@@ -5771,14 +6827,14 @@
else
#endif
in_losing(tp->t_inpcb);
- tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
+ tp->t_rttvar += tp->t_srtt;
tp->t_srtt = 0;
}
sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
tp->snd_recover = tp->snd_max;
tp->t_flags |= TF_ACKNOW;
tp->t_rtttime = 0;
- rack_cong_signal(tp, NULL, CC_RTO);
+ rack_cong_signal(tp, CC_RTO, tp->snd_una);
out:
return (retval);
}
@@ -5848,12 +6904,14 @@
ret = rack_timeout_delack(tp, rack, cts);
} else if (timers & PACE_TMR_RACK) {
rack->r_ctl.rc_tlp_rxt_last_time = cts;
+ rack->r_fast_output = 0;
ret = rack_timeout_rack(tp, rack, cts);
} else if (timers & PACE_TMR_TLP) {
rack->r_ctl.rc_tlp_rxt_last_time = cts;
ret = rack_timeout_tlp(tp, rack, cts);
} else if (timers & PACE_TMR_RXT) {
rack->r_ctl.rc_tlp_rxt_last_time = cts;
+ rack->r_fast_output = 0;
ret = rack_timeout_rxt(tp, rack, cts);
} else if (timers & PACE_TMR_PERSIT) {
ret = rack_timeout_persist(tp, rack, cts);
@@ -5949,9 +7007,10 @@
static void
rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
- struct rack_sendmap *rsm, uint32_t ts)
+ struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag)
{
int32_t idx;
+ uint16_t stripped_flags;
rsm->r_rtr_cnt++;
rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
@@ -5966,6 +7025,7 @@
}
idx = rsm->r_rtr_cnt - 1;
rsm->r_tim_lastsent[idx] = ts;
+ stripped_flags = rsm->r_flags & ~(RACK_SENT_SP|RACK_SENT_FP);
if (rsm->r_flags & RACK_ACKED) {
/* Problably MTU discovery messing with us */
rsm->r_flags &= ~RACK_ACKED;
@@ -5986,7 +7046,7 @@
static uint32_t
rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
- struct rack_sendmap *rsm, uint32_t ts, int32_t *lenp)
+ struct rack_sendmap *rsm, uint64_t ts, int32_t *lenp, uint16_t add_flag)
{
/*
* We (re-)transmitted starting at rsm->r_start for some length
@@ -6003,7 +7063,7 @@
* We retransmitted the whole piece or more than the whole
* slopping into the next rsm.
*/
- rack_update_rsm(tp, rack, rsm, ts);
+ rack_update_rsm(tp, rack, rsm, ts, add_flag);
if (c_end == rsm->r_end) {
*lenp = 0;
return (0);
@@ -6051,15 +7111,17 @@
nrsm->r_in_tmap = 1;
}
rsm->r_flags &= (~RACK_HAS_FIN);
- rack_update_rsm(tp, rack, rsm, ts);
+ rack_update_rsm(tp, rack, rsm, ts, add_flag);
+ /* Log a split of rsm into rsm and nrsm */
+ rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__);
*lenp = 0;
return (0);
}
static void
rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
- uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
- uint8_t pass, struct rack_sendmap *hintrsm, uint32_t us_cts)
+ uint32_t seq_out, uint8_t th_flags, int32_t err, uint64_t cts,
+ struct rack_sendmap *hintrsm, uint16_t add_flag, struct mbuf *s_mb, uint32_t s_moff)
{
struct tcp_rack *rack;
struct rack_sendmap *rsm, *nrsm, *insret, fe;
@@ -6103,21 +7165,6 @@
}
rack = (struct tcp_rack *)tp->t_fb_ptr;
snd_una = tp->snd_una;
- if (SEQ_LEQ((seq_out + len), snd_una)) {
- /* Are sending an old segment to induce an ack (keep-alive)? */
- return;
- }
- if (SEQ_LT(seq_out, snd_una)) {
- /* huh? should we panic? */
- uint32_t end;
-
- end = seq_out + len;
- seq_out = snd_una;
- if (SEQ_GEQ(end, seq_out))
- len = end - seq_out;
- else
- len = 0;
- }
snd_max = tp->snd_max;
if (th_flags & (TH_SYN | TH_FIN)) {
/*
@@ -6138,12 +7185,27 @@
snd_max = tp->snd_nxt;
}
}
+ if (SEQ_LEQ((seq_out + len), snd_una)) {
+ /* Are sending an old segment to induce an ack (keep-alive)? */
+ return;
+ }
+ if (SEQ_LT(seq_out, snd_una)) {
+ /* huh? should we panic? */
+ uint32_t end;
+
+ end = seq_out + len;
+ seq_out = snd_una;
+ if (SEQ_GEQ(end, seq_out))
+ len = end - seq_out;
+ else
+ len = 0;
+ }
if (len == 0) {
/* We don't log zero window probes */
return;
}
- rack->r_ctl.rc_time_last_sent = ts;
- if (IN_RECOVERY(tp->t_flags)) {
+ rack->r_ctl.rc_time_last_sent = cts;
+ if (IN_FASTRECOVERY(tp->t_flags)) {
rack->r_ctl.rc_prr_out += len;
}
/* First question is it a retransmission or new? */
@@ -6159,26 +7221,58 @@
return;
}
if (th_flags & TH_FIN) {
- rsm->r_flags = RACK_HAS_FIN;
+ rsm->r_flags = RACK_HAS_FIN|add_flag;
} else {
- rsm->r_flags = 0;
+ rsm->r_flags = add_flag;
}
- rsm->r_tim_lastsent[0] = ts;
+ rsm->r_tim_lastsent[0] = cts;
rsm->r_rtr_cnt = 1;
rsm->r_rtr_bytes = 0;
- rsm->usec_orig_send = us_cts;
if (th_flags & TH_SYN) {
/* The data space is one beyond snd_una */
- rsm->r_flags |= RACK_HAS_SIN;
- rsm->r_start = seq_out + 1;
- rsm->r_end = rsm->r_start + (len - 1);
- } else {
- /* Normal case */
- rsm->r_start = seq_out;
- rsm->r_end = rsm->r_start + len;
+ rsm->r_flags |= RACK_HAS_SYN;
}
+ rsm->r_start = seq_out;
+ rsm->r_end = rsm->r_start + len;
rsm->r_dupack = 0;
+ /*
+ * Save off the mbuf location that
+ * sndmbuf_noadv returned (which is
+ * where we started copying from).
+ */
+ rsm->m = s_mb;
+ rsm->soff = s_moff;
+ /* rsm->m will be NULL if RACK_HAS_SYN or RACK_HAS_FIN is set */
+ if (rsm->m) {
+ if (rsm->m->m_len <= rsm->soff) {
+ /*
+ * XXXrrs Question, will this happen?
+ *
+ * If sbsndptr is set at the correct place
+ * then s_moff should always be somewhere
+ * within rsm->m. But if the sbsndptr was
+ * off then that won't be true. If that occurs
+ * we need to walk out to the correct location.
+ */
+ struct mbuf *lm;
+
+ lm = rsm->m;
+ while (lm->m_len <= rsm->soff) {
+ rsm->soff -= lm->m_len;
+ lm = lm->m_next;
+ KASSERT(lm != NULL, ("%s rack:%p lm goes null orig_off:%u origmb:%p rsm->soff:%u",
+ __func__, rack, s_moff, s_mb, rsm->soff));
+ }
+ rsm->m = lm;
+ counter_u64_add(rack_sbsndptr_wrong, 1);
+ } else
+ counter_u64_add(rack_sbsndptr_right, 1);
+ rsm->orig_m_len = rsm->m->m_len;
+ } else
+ rsm->orig_m_len = 0;
rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
+ /* Log a new rsm */
+ rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_NEW, 0, __LINE__);
insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
#ifdef INVARIANTS
if (insret != NULL) {
@@ -6194,7 +7288,7 @@
*
* If this is true mark it so.
*/
- if ((IN_RECOVERY(tp->t_flags) == 0) &&
+ if ((IN_FASTRECOVERY(tp->t_flags) == 0) &&
(ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) == ctf_fixed_maxseg(tp))) {
struct rack_sendmap *prsm;
@@ -6217,7 +7311,7 @@
rsm = NULL;
}
if ((rsm) && (rsm->r_start == seq_out)) {
- seq_out = rack_update_entry(tp, rack, rsm, ts, &len);
+ seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag);
if (len == 0) {
return;
} else {
@@ -6230,7 +7324,7 @@
rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
if (rsm) {
if (rsm->r_start == seq_out) {
- seq_out = rack_update_entry(tp, rack, rsm, ts, &len);
+ seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag);
if (len == 0) {
return;
} else {
@@ -6245,7 +7339,7 @@
*/
nrsm = rack_alloc_full_limit(rack);
if (nrsm == NULL) {
- rack_update_rsm(tp, rack, rsm, ts);
+ rack_update_rsm(tp, rack, rsm, cts, add_flag);
return;
}
/*
@@ -6254,6 +7348,7 @@
*/
rack_clone_rsm(rack, nrsm, rsm, seq_out);
insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
+ rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__);
#ifdef INVARIANTS
if (insret != NULL) {
panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
@@ -6265,7 +7360,7 @@
nrsm->r_in_tmap = 1;
}
rsm->r_flags &= (~RACK_HAS_FIN);
- seq_out = rack_update_entry(tp, rack, nrsm, ts, &len);
+ seq_out = rack_update_entry(tp, rack, nrsm, cts, &len, add_flag);
if (len == 0) {
return;
} else if (len > 0)
@@ -6281,15 +7376,15 @@
} else if (SEQ_LT(seq_out, tp->snd_max)) {
#ifdef INVARIANTS
printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n",
- seq_out, len, tp->snd_una, tp->snd_max);
+ seq_out, len, tp->snd_una, tp->snd_max);
printf("Starting Dump of all rack entries\n");
RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
printf("rsm:%p start:%u end:%u\n",
- rsm, rsm->r_start, rsm->r_end);
+ rsm, rsm->r_start, rsm->r_end);
}
printf("Dump complete\n");
panic("seq_out not found rack:%p tp:%p",
- rack, tp);
+ rack, tp);
#endif
} else {
#ifdef INVARIANTS
@@ -6298,7 +7393,7 @@
* flag)
*/
panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p",
- seq_out, len, tp->snd_max, tp);
+ seq_out, len, tp->snd_max, tp);
#endif
}
}
@@ -6456,39 +7551,32 @@
rack->r_ctl.rc_lowest_us_rtt = 1;
}
}
- rack_log_rtt_sample(rack, rtt);
o_srtt = tp->t_srtt;
o_var = tp->t_rttvar;
rack = (struct tcp_rack *)tp->t_fb_ptr;
if (tp->t_srtt != 0) {
/*
- * srtt is stored as fixed point with 5 bits after the
- * binary point (i.e., scaled by 8). The following magic is
- * equivalent to the smoothing algorithm in rfc793 with an
- * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point).
- * Adjust rtt to origin 0.
+ * We keep a simple srtt in microseconds, like our rtt
+ * measurement. We don't need to do any tricks with shifting
+ * etc. Instead we just add in 1/8th of the new measurement
+ * and subtract out 1/8 of the old srtt. We do the same with
+ * the variance after finding the absolute value of the
+ * difference between this sample and the current srtt.
*/
- delta = ((rtt - 1) << TCP_DELTA_SHIFT)
- - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
-
- tp->t_srtt += delta;
+ delta = tp->t_srtt - rtt;
+ /* Take off 1/8th of the current sRTT */
+ tp->t_srtt -= (tp->t_srtt >> 3);
+ /* Add in 1/8th of the new RTT just measured */
+ tp->t_srtt += (rtt >> 3);
if (tp->t_srtt <= 0)
tp->t_srtt = 1;
-
- /*
- * We accumulate a smoothed rtt variance (actually, a
- * smoothed mean difference), then set the retransmit timer
- * to smoothed rtt + 4 times the smoothed variance. rttvar
- * is stored as fixed point with 4 bits after the binary
- * point (scaled by 16). The following is equivalent to
- * rfc793 smoothing with an alpha of .75 (rttvar =
- * rttvar*3/4 + |delta| / 4). This replaces rfc793's
- * wired-in beta.
- */
+ /* Now take the absolute value of the delta for the variance */
if (delta < 0)
delta = -delta;
- delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
- tp->t_rttvar += delta;
+ /* Subtract out 1/8th */
+ tp->t_rttvar -= (tp->t_rttvar >> 3);
+ /* Add in 1/8th of the new variance we just saw */
+ tp->t_rttvar += (delta >> 3);
if (tp->t_rttvar <= 0)
tp->t_rttvar = 1;
if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
@@ -6499,17 +7587,37 @@
* variance to half the rtt (so our first retransmit happens
* at 3*rtt).
*/
- tp->t_srtt = rtt << TCP_RTT_SHIFT;
- tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
+ tp->t_srtt = rtt;
+ tp->t_rttvar = rtt >> 1;
tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
}
+ rack->rc_srtt_measure_made = 1;
KMOD_TCPSTAT_INC(tcps_rttupdated);
tp->t_rttupdated++;
#ifdef STATS
- stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt));
-#endif
- tp->t_rxtshift = 0;
+ if (rack_stats_gets_ms_rtt == 0) {
+ /* Send in the microsecond rtt used for rxt timeout purposes */
+ stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt));
+ } else if (rack_stats_gets_ms_rtt == 1) {
+ /* Send in the millisecond rtt used for rxt timeout purposes */
+ int32_t ms_rtt;
+
+ /* Round up */
+ ms_rtt = (rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC;
+ stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt));
+ } else if (rack_stats_gets_ms_rtt == 2) {
+ /* Send in the millisecond rtt as close to the path RTT as we can get */
+ int32_t ms_rtt;
+
+ /* Round up */
+ ms_rtt = (rack->r_ctl.rack_rs.rs_us_rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC;
+ stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt));
+ } else {
+ /* Send in the microsecond rtt as close to the path RTT as we can get */
+ stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt));
+ }
+#endif
/*
* the retransmit should happen at rtt + 4 * rttvar. Because of the
* way we do the smoothing, srtt and rttvar will each average +1/2
@@ -6520,63 +7628,13 @@
* statistical, we have to test that we don't drop below the minimum
* feasible timer (which is 2 ticks).
*/
- TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
- max(MSEC_2_TICKS(rack_rto_min), rtt + 2), MSEC_2_TICKS(rack_rto_max));
+ tp->t_rxtshift = 0;
+ RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
+ max(rack_rto_min, rtt + 2), rack_rto_max);
+ rack_log_rtt_sample(rack, rtt);
tp->t_softerror = 0;
}
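[Editor's note: the new smoothing above keeps srtt and rttvar directly in microseconds with 1/8 gains, instead of the shifted fixed-point of the legacy code. A self-contained sketch of the same update (the names here are illustrative, not the stack's own):]

#include <stdint.h>

struct rtt_est {
        int32_t srtt;     /* smoothed RTT, microseconds */
        int32_t rttvar;   /* smoothed mean deviation, microseconds */
};

/* srtt += (rtt - srtt)/8;  rttvar += (|srtt_old - rtt| - rttvar)/8 */
static void
rtt_sample(struct rtt_est *e, int32_t rtt)
{
        int32_t delta;

        if (e->srtt != 0) {
                delta = e->srtt - rtt;
                e->srtt -= e->srtt >> 3;     /* drop 1/8 of the old srtt  */
                e->srtt += rtt >> 3;         /* add 1/8 of the new sample */
                if (e->srtt <= 0)
                        e->srtt = 1;
                if (delta < 0)
                        delta = -delta;
                e->rttvar -= e->rttvar >> 3;
                e->rttvar += delta >> 3;
                if (e->rttvar <= 0)
                        e->rttvar = 1;
        } else {
                /* First measurement: seed var at rtt/2 so RTO ~ 3*rtt. */
                e->srtt = rtt;
                e->rttvar = rtt >> 1;
        }
}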
-static void
-rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
- uint32_t t, uint32_t cts)
-{
- /*
- * For this RSM, we acknowledged the data from a previous
- * transmission, not the last one we made. This means we did a false
- * retransmit.
- */
- struct tcp_rack *rack;
-
- if (rsm->r_flags & RACK_HAS_FIN) {
- /*
- * The sending of the FIN often is multiple sent when we
- * have everything outstanding ack'd. We ignore this case
- * since its over now.
- */
- return;
- }
- if (rsm->r_flags & RACK_TLP) {
- /*
- * We expect TLP's to have this occur.
- */
- return;
- }
- rack = (struct tcp_rack *)tp->t_fb_ptr;
- /* should we undo cc changes and exit recovery? */
- if (IN_RECOVERY(tp->t_flags)) {
- if (rack->r_ctl.rc_rsm_start == rsm->r_start) {
- /*
- * Undo what we ratched down and exit recovery if
- * possible
- */
- EXIT_RECOVERY(tp->t_flags);
- tp->snd_recover = tp->snd_una;
- if (rack->r_ctl.rc_cwnd_at > tp->snd_cwnd)
- tp->snd_cwnd = rack->r_ctl.rc_cwnd_at;
- if (rack->r_ctl.rc_ssthresh_at > tp->snd_ssthresh)
- tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at;
- }
- }
- if (rsm->r_flags & RACK_WAS_SACKPASS) {
- /*
- * We retransmitted based on a sack and the earlier
- * retransmission ack'd it - re-ordering is occuring.
- */
- counter_u64_add(rack_reorder_seen, 1);
- rack->r_ctl.rc_reorder_ts = cts;
- }
- counter_u64_add(rack_badfr, 1);
- counter_u64_add(rack_badfr_bytes, (rsm->r_end - rsm->r_start));
-}
static void
rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts)
@@ -6632,25 +7690,33 @@
rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack)
{
- int32_t i;
+ int32_t i, all;
uint32_t t, len_acked;
if ((rsm->r_flags & RACK_ACKED) ||
(rsm->r_flags & RACK_WAS_ACKED))
/* Already done */
return (0);
-
+ if (rsm->r_no_rtt_allowed) {
+ /* Not allowed */
+ return (0);
+ }
if (ack_type == CUM_ACKED) {
- if (SEQ_GT(th_ack, rsm->r_end))
+ if (SEQ_GT(th_ack, rsm->r_end)) {
len_acked = rsm->r_end - rsm->r_start;
- else
+ all = 1;
+ } else {
len_acked = th_ack - rsm->r_start;
- } else
+ all = 0;
+ }
+ } else {
len_acked = rsm->r_end - rsm->r_start;
+ all = 0;
+ }
if (rsm->r_rtr_cnt == 1) {
uint32_t us_rtt;
- t = cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
+ t = cts - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
if ((int)t <= 0)
t = 1;
if (!tp->t_rttlow || tp->t_rttlow > t)
@@ -6662,43 +7728,72 @@
rack->r_ctl.rc_rack_min_rtt = 1;
}
}
- us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - rsm->usec_orig_send;
+ if (TSTMP_GT(tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]))
+ us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
+ else
+ us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
if (us_rtt == 0)
us_rtt = 1;
rack_apply_updated_usrtt(rack, us_rtt, tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time));
- if (ack_type == SACKED)
+ if (ack_type == SACKED) {
+ rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 1);
tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 2 , rsm, rsm->r_rtr_cnt);
- else {
+ } else {
/*
- * For cum-ack we are only confident if what
- * is being acked is included in a measurement.
- * Otherwise it could be an idle period that
- * includes Delayed-ack time.
+ * We need to set up what our confidence
+ * is in this ack.
+ *
+ * If the rsm was app limited and it is
+ * less than an mss in length (the end
+ * of the send) then we have a gap. If we
+ * were app limited but say we were sending
+ * multiple MSS's then we are more confident
+ * in it.
+ *
+ * When we are not app-limited then we see if
+ * the rsm is being included in the current
+ * measurement; we tell this by the app_limited_needs_set
+ * flag.
+ *
+ * Note that being cwnd blocked is not app limited,
+ * and the pacing delay between packets when we
+ * are sending only 1 or 2 MSS's will also show up
+ * in the RTT. We probably need to examine this algorithm
+ * a bit more and enhance it to account for the delay
+ * between rsm's. We could do that by saving off the
+ * pacing delay of each rsm (in an rsm) and then
+ * factoring that in somehow, though for now I am
+ * not sure how :)
*/
+ int calc_conf = 0;
+
+ if (rsm->r_flags & RACK_APP_LIMITED) {
+ if (all && (len_acked <= ctf_fixed_maxseg(tp)))
+ calc_conf = 0;
+ else
+ calc_conf = 1;
+ } else if (rack->app_limited_needs_set == 0) {
+ calc_conf = 1;
+ } else {
+ calc_conf = 0;
+ }
+ rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 2);
tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt,
- (rack->app_limited_needs_set ? 0 : 1), rsm, rsm->r_rtr_cnt);
+ calc_conf, rsm, rsm->r_rtr_cnt);
}
if ((rsm->r_flags & RACK_TLP) &&
- (!IN_RECOVERY(tp->t_flags))) {
+ (!IN_FASTRECOVERY(tp->t_flags))) {
/* Segment was a TLP and our retrans matched */
if (rack->r_ctl.rc_tlp_cwnd_reduce) {
rack->r_ctl.rc_rsm_start = tp->snd_max;
rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
- rack_cong_signal(tp, NULL, CC_NDUPACK);
- /*
- * When we enter recovery we need to assure
- * we send one packet.
- */
- if (rack->rack_no_prr == 0) {
- rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
- rack_log_to_prr(rack, 7, 0);
- }
+ rack_cong_signal(tp, CC_NDUPACK, tp->snd_una);
}
}
- if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
+ if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
/* New more recent rack_tmit_time */
- rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
+ rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
rack->rc_rack_rtt = t;
}
return (1);
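[Editor's note: the confidence decision above boils down to: app-limited tail segments of at most one MSS are not trusted, everything else is, and for non-app-limited sends the app_limited_needs_set flag decides. A hypothetical helper capturing that decision:]

#include <stdint.h>

/*
 * Returns 1 when a cum-acked RTT sample should be taken at full
 * confidence, 0 when it should be treated as low confidence, mirroring
 * the calc_conf logic above (names are illustrative).
 */
static int
rtt_sample_confidence(int rsm_was_app_limited, int acked_whole_rsm,
    uint32_t len_acked, uint32_t maxseg, int app_limited_needs_set)
{
        if (rsm_was_app_limited) {
                /* A short, final app-limited send may include idle time. */
                if (acked_whole_rsm && len_acked <= maxseg)
                        return (0);
                return (1);
        }
        if (app_limited_needs_set == 0)
                return (1);
        return (0);
}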
@@ -6709,8 +7804,10 @@
* so we need to clear these to avoid incorrect handling.
*/
tp->t_rxtshift = 0;
+ RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
+ rack_rto_min, rack_rto_max);
tp->t_softerror = 0;
- if ((to->to_flags & TOF_TS) &&
+ if (to && (to->to_flags & TOF_TS) &&
(ack_type == CUM_ACKED) &&
(to->to_tsecr) &&
((rsm->r_flags & RACK_OVERMAX) == 0)) {
@@ -6719,13 +7816,18 @@
* must be coming from a previous transmission.
*/
for (i = 0; i < rsm->r_rtr_cnt; i++) {
- if (rsm->r_tim_lastsent[i] == to->to_tsecr) {
- t = cts - rsm->r_tim_lastsent[i];
+ if (rack_ts_to_msec(rsm->r_tim_lastsent[i]) == to->to_tsecr) {
+ t = cts - (uint32_t)rsm->r_tim_lastsent[i];
if ((int)t <= 0)
t = 1;
if ((i + 1) < rsm->r_rtr_cnt) {
- /* Likely */
- rack_earlier_retran(tp, rsm, t, cts);
+ /*
+ * The peer ack'd from our previous
+ * transmission. We have a spurious
+ * retransmission and thus we don't
+ * want to update our rack_rtt.
+ */
+ return (0);
}
if (!tp->t_rttlow || tp->t_rttlow > t)
tp->t_rttlow = t;
@@ -6736,12 +7838,13 @@
}
}
if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time,
- rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
+ (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
/* New more recent rack_tmit_time */
- rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
+ rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
rack->rc_rack_rtt = t;
}
- tcp_rack_xmit_timer(rack, t + 1, len_acked, (t * HPTS_USEC_IN_MSEC), 0, rsm,
+ rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[i], cts, 3);
+ tcp_rack_xmit_timer(rack, t + 1, len_acked, t, 0, rsm,
rsm->r_rtr_cnt);
return (1);
}
@@ -6756,19 +7859,20 @@
*/
ts_not_found:
i = rsm->r_rtr_cnt - 1;
- t = cts - rsm->r_tim_lastsent[i];
+ t = cts - (uint32_t)rsm->r_tim_lastsent[i];
if ((int)t <= 0)
t = 1;
if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
/*
* We retransmitted and the ack came back in less
* than the smallest rtt we have observed. We most
- * likey did an improper retransmit as outlined in
- * 4.2 Step 3 point 2 in the rack-draft.
+ * likely did an improper retransmit as outlined in
+ * 6.2 Step 2 point 2 in the rack-draft, so we
+ * don't want to update our rack_rtt. In
+ * theory (in the future) we might want to think about reverting our
+ * cwnd state, but we won't for now.
*/
- i = rsm->r_rtr_cnt - 2;
- t = cts - rsm->r_tim_lastsent[i];
- rack_earlier_retran(tp, rsm, t, cts);
+ return (0);
} else if (rack->r_ctl.rc_rack_min_rtt) {
/*
* We retransmitted it and the retransmit did the
@@ -6781,9 +7885,9 @@
rack->r_ctl.rc_rack_min_rtt = 1;
}
}
- if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[i])) {
+ if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, (uint32_t)rsm->r_tim_lastsent[i])) {
/* New more recent rack_tmit_time */
- rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[i];
+ rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[i];
rack->rc_rack_rtt = t;
}
return (1);
@@ -6877,7 +7981,7 @@
* measurement not starts.
*/
tp->gput_seq = rsm->r_start;
- rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send;
+ rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
}
if ((use_which == RACK_USE_END) &&
SEQ_GEQ(rsm->r_end, tp->gput_seq)) {
@@ -6893,7 +7997,7 @@
* in our measurement.
*/
tp->gput_seq = rsm->r_end;
- rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send;
+ rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
}
if (use_which == RACK_USE_END_OR_THACK) {
/*
@@ -6906,7 +8010,7 @@
tp->gput_seq = th_ack;
else
tp->gput_seq = rsm->r_end;
- rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send;
+ rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
}
if (SEQ_GT(tp->gput_seq, tp->gput_ack)) {
/*
@@ -6933,15 +8037,25 @@
((tp->gput_ack - tp->gput_seq) <
max(rc_init_window(rack), (MIN_GP_WIN *
ctf_fixed_maxseg(tp))))) {
- /*
- * There is no sense of continuing this measurement
- * because its too small to gain us anything we
- * trust. Skip it and that way we can start a new
- * measurement quicker.
- */
- rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq,
- 0, 0, 0, 6, __LINE__, NULL);
- tp->t_flags &= ~TF_GPUTINPROG;
+ uint32_t ideal_amount;
+
+ ideal_amount = rack_get_measure_window(tp, rack);
+ if (ideal_amount > sbavail(&tp->t_inpcb->inp_socket->so_snd)) {
+ /*
+ * There is no sense in continuing this measurement
+ * because it's too small to gain us anything we
+ * trust. Skip it and that way we can start a new
+ * measurement quicker.
+ */
+ tp->t_flags &= ~TF_GPUTINPROG;
+ rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq,
+ 0, 0, 0, 6, __LINE__, NULL);
+ } else {
+ /*
+ * Reset the window further out.
+ */
+ tp->gput_ack = tp->gput_seq + ideal_amount;
+ }
}
}
}
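[Editor's note: the block above chooses between abandoning a too-small goodput measurement and stretching its end point out to the ideal window. A hedged sketch of that choice; the wrapper and its parameters are hypothetical, only the field names come from the diff:]

#include <stdint.h>

/*
 * Given a measurement that would end too soon, either cancel it (when
 * the socket cannot supply enough data anyway) or push gput_ack out to
 * cover the ideal amount.  Returns the new end sequence, or 0 to cancel.
 */
static uint32_t
fixup_gput_window(uint32_t gput_seq, uint32_t ideal_amount, uint32_t sb_avail)
{
        if (ideal_amount > sb_avail)
                return (0);                      /* abandon the measurement */
        return (gput_seq + ideal_amount);        /* reset the window further out */
}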
@@ -7021,6 +8135,10 @@
/* Now adjust our tree blocks */
rsm->r_end = start;
next->r_start = start;
+ /* Now we must adjust back where next->m is */
+ rack_setup_offset_for_rsm(rsm, next);
+
+ /* We don't need to adjust rsm, it did not change */
/* Clear out the dup ack count of the remainder */
rsm->r_dupack = 0;
rsm->r_just_ret = 0;
@@ -7062,6 +8180,7 @@
/* Done with block */
goto out;
}
+ rack_log_map_chg(tp, rack, &stack_map, rsm, next, MAP_SACK_M1, end, __LINE__);
counter_u64_add(rack_sack_used_next_merge, 1);
/* Postion for the next block */
start = next->r_end;
@@ -7108,6 +8227,7 @@
TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
nrsm->r_in_tmap = 1;
}
+ rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M2, end, __LINE__);
rsm->r_flags &= (~RACK_HAS_FIN);
/* Position us to point to the new nrsm that starts the sack blk */
rsm = nrsm;
@@ -7161,19 +8281,20 @@
}
if (rack->app_limited_needs_set)
rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END);
- rsm->r_ack_arrival = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
+ rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
rsm->r_flags |= RACK_ACKED;
rsm->r_flags &= ~RACK_TLP;
if (rsm->r_in_tmap) {
TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
rsm->r_in_tmap = 0;
}
+ rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_SACK_M3, end, __LINE__);
} else {
counter_u64_add(rack_sack_skipped_acked, 1);
moved++;
}
if (end == rsm->r_end) {
- /* This block only - done, setup for next */
+ /* This block only - done, setup for next */
goto out;
}
/*
@@ -7221,6 +8342,11 @@
nrsm->r_end = end;
rsm->r_dupack = 0;
rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
+ /*
+ * Now that the rsm has had its start moved forward,
+ * let's go ahead and get its new place in the world.
+ */
+ rack_setup_offset_for_rsm(prev, rsm);
/*
* Now nrsm is our new little piece
* that is acked (which was merged
@@ -7236,6 +8362,7 @@
counter_u64_add(rack_reorder_seen, 1);
rack->r_ctl.rc_reorder_ts = cts;
}
+ rack_log_map_chg(tp, rack, prev, &stack_map, rsm, MAP_SACK_M4, end, __LINE__);
rsm = prev;
counter_u64_add(rack_sack_used_prev_merge, 1);
} else {
@@ -7297,9 +8424,10 @@
}
if (rack->app_limited_needs_set)
rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END);
- rsm->r_ack_arrival = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
+ rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
rsm->r_flags |= RACK_ACKED;
rsm->r_flags &= ~RACK_TLP;
+ rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M5, end, __LINE__);
if (rsm->r_in_tmap) {
TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
rsm->r_in_tmap = 0;
@@ -7448,218 +8576,497 @@
}
static void
-rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th)
+rack_process_to_cumack(struct tcpcb *tp, struct tcp_rack *rack, register uint32_t th_ack, uint32_t cts, struct tcpopt *to)
{
- uint32_t changed, entered_recovery = 0;
- struct tcp_rack *rack;
struct rack_sendmap *rsm, *rm;
- struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1];
- register uint32_t th_ack;
- int32_t i, j, k, num_sack_blks = 0;
- uint32_t cts, acked, ack_point, sack_changed = 0;
- int loop_start = 0, moved_two = 0;
- uint32_t tsused;
-
- INP_WLOCK_ASSERT(tp->t_inpcb);
- if (th->th_flags & TH_RST) {
- /* We don't log resets */
- return;
- }
- rack = (struct tcp_rack *)tp->t_fb_ptr;
- cts = tcp_ts_getticks();
- rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
- changed = 0;
- th_ack = th->th_ack;
- if (rack->sack_attack_disable == 0)
- rack_do_decay(rack);
- if (BYTES_THIS_ACK(tp, th) >= ctf_fixed_maxseg(rack->rc_tp)) {
- /*
- * You only get credit for
- * MSS and greater (and you get extra
- * credit for larger cum-ack moves).
- */
- int ac;
- ac = BYTES_THIS_ACK(tp, th) / ctf_fixed_maxseg(rack->rc_tp);
- rack->r_ctl.ack_count += ac;
- counter_u64_add(rack_ack_total, ac);
- }
- if (rack->r_ctl.ack_count > 0xfff00000) {
- /*
- * reduce the number to keep us under
- * a uint32_t.
- */
- rack->r_ctl.ack_count /= 2;
- rack->r_ctl.sack_count /= 2;
- }
- if (SEQ_GT(th_ack, tp->snd_una)) {
- rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__);
- tp->t_acktime = ticks;
- }
- if (rsm && SEQ_GT(th_ack, rsm->r_start))
- changed = th_ack - rsm->r_start;
- if (changed) {
- /*
- * The ACK point is advancing to th_ack, we must drop off
- * the packets in the rack log and calculate any eligble
- * RTT's.
- */
- rack->r_wanted_output = 1;
+ /*
+ * The ACK point is advancing to th_ack; we must drop off
+ * the packets in the rack log and calculate any eligible
+ * RTT's.
+ */
+ rack->r_wanted_output = 1;
more:
- rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
- if (rsm == NULL) {
- if ((th_ack - 1) == tp->iss) {
- /*
- * For the SYN incoming case we will not
- * have called tcp_output for the sending of
- * the SYN, so there will be no map. All
- * other cases should probably be a panic.
- */
- goto proc_sack;
- }
- if (tp->t_flags & TF_SENTFIN) {
- /* if we send a FIN we will not hav a map */
- goto proc_sack;
- }
+ rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
+ if (rsm == NULL) {
+ if ((th_ack - 1) == tp->iss) {
+ /*
+ * For the SYN incoming case we will not
+ * have called tcp_output for the sending of
+ * the SYN, so there will be no map. All
+ * other cases should probably be a panic.
+ */
+ return;
+ }
+ if (tp->t_flags & TF_SENTFIN) {
+ /* if we sent a FIN we often will not have a map */
+ return;
+ }
#ifdef INVARIANTS
- panic("No rack map tp:%p for th:%p state:%d rack:%p snd_una:%u snd_max:%u snd_nxt:%u chg:%d\n",
- tp,
- th, tp->t_state, rack,
- tp->snd_una, tp->snd_max, tp->snd_nxt, changed);
+ panic("No rack map tp:%p for state:%d ack:%u rack:%p snd_una:%u snd_max:%u snd_nxt:%u\n",
+ tp,
+ tp->t_state, th_ack, rack,
+ tp->snd_una, tp->snd_max, tp->snd_nxt);
#endif
- goto proc_sack;
- }
- if (SEQ_LT(th_ack, rsm->r_start)) {
- /* Huh map is missing this */
+ return;
+ }
+ if (SEQ_LT(th_ack, rsm->r_start)) {
+ /* Huh map is missing this */
#ifdef INVARIANTS
- printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n",
- rsm->r_start,
- th_ack, tp->t_state, rack->r_state);
-#endif
- goto proc_sack;
- }
- rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED, th_ack);
- /* Now do we consume the whole thing? */
- if (SEQ_GEQ(th_ack, rsm->r_end)) {
- /* Its all consumed. */
- uint32_t left;
- uint8_t newly_acked;
-
- rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
- rsm->r_rtr_bytes = 0;
- /* Record the time of highest cumack sent */
- rack->r_ctl.rc_gp_cumack_ts = rsm->usec_orig_send;
- rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
+ printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n",
+ rsm->r_start,
+ th_ack, tp->t_state, rack->r_state);
+#endif
+ return;
+ }
+ rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED, th_ack);
+ /* Now do we consume the whole thing? */
+ if (SEQ_GEQ(th_ack, rsm->r_end)) {
+ /* Its all consumed. */
+ uint32_t left;
+ uint8_t newly_acked;
+
+ rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_FREE, rsm->r_end, __LINE__);
+ rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
+ rsm->r_rtr_bytes = 0;
+ /* Record the time of highest cumack sent */
+ rack->r_ctl.rc_gp_cumack_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
+ rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
#ifdef INVARIANTS
- if (rm != rsm) {
- panic("removing head in rack:%p rsm:%p rm:%p",
- rack, rsm, rm);
- }
+ if (rm != rsm) {
+ panic("removing head in rack:%p rsm:%p rm:%p",
+ rack, rsm, rm);
+ }
#endif
- if (rsm->r_in_tmap) {
- TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
- rsm->r_in_tmap = 0;
- }
- newly_acked = 1;
- if (rsm->r_flags & RACK_ACKED) {
- /*
- * It was acked on the scoreboard -- remove
- * it from total
- */
- rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
- newly_acked = 0;
- } else if (rsm->r_flags & RACK_SACK_PASSED) {
+ if (rsm->r_in_tmap) {
+ TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
+ rsm->r_in_tmap = 0;
+ }
+ newly_acked = 1;
+ if (rsm->r_flags & RACK_ACKED) {
+ /*
+ * It was acked on the scoreboard -- remove
+ * it from total
+ */
+ rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
+ newly_acked = 0;
+ } else if (rsm->r_flags & RACK_SACK_PASSED) {
+ /*
+ * There are segments ACKED on the
+ * scoreboard further up. We are seeing
+ * reordering.
+ */
+ rsm->r_flags &= ~RACK_SACK_PASSED;
+ counter_u64_add(rack_reorder_seen, 1);
+ rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
+ rsm->r_flags |= RACK_ACKED;
+ rack->r_ctl.rc_reorder_ts = cts;
+ if (rack->r_ent_rec_ns) {
/*
- * There are segments ACKED on the
- * scoreboard further up. We are seeing
- * reordering.
+ * We have sent no more, and we saw a sack
+ * and then an ack arrive.
*/
- rsm->r_flags &= ~RACK_SACK_PASSED;
- counter_u64_add(rack_reorder_seen, 1);
- rsm->r_ack_arrival = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
- rsm->r_flags |= RACK_ACKED;
- rack->r_ctl.rc_reorder_ts = cts;
- }
- left = th_ack - rsm->r_end;
- if (rack->app_limited_needs_set && newly_acked)
- rack_need_set_test(tp, rack, rsm, th_ack, __LINE__, RACK_USE_END_OR_THACK);
- /* Free back to zone */
- rack_free(rack, rsm);
- if (left) {
- goto more;
+ rack->r_might_revert = 1;
}
- goto proc_sack;
}
- if (rsm->r_flags & RACK_ACKED) {
+ if ((rsm->r_flags & RACK_TO_REXT) &&
+ (tp->t_flags & TF_RCVD_TSTMP) &&
+ (to->to_flags & TOF_TS) &&
+ (tp->t_flags & TF_PREVVALID)) {
/*
- * It was acked on the scoreboard -- remove it from
- * total for the part being cum-acked.
+ * We can use the timestamp to see
+ * if this retransmission was from the
+ * first transmit. If so we made a mistake.
*/
- rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start);
+ tp->t_flags &= ~TF_PREVVALID;
+ if (to->to_tsecr == rack_ts_to_msec(rsm->r_tim_lastsent[0])) {
+ /* The first transmit is what this ack is for */
+ rack_cong_signal(tp, CC_RTO_ERR, th_ack);
+ }
}
- /*
- * Clear the dup ack count for
- * the piece that remains.
- */
- rsm->r_dupack = 0;
- rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
- if (rsm->r_rtr_bytes) {
+ left = th_ack - rsm->r_end;
+ if (rack->app_limited_needs_set && newly_acked)
+ rack_need_set_test(tp, rack, rsm, th_ack, __LINE__, RACK_USE_END_OR_THACK);
+ /* Free back to zone */
+ rack_free(rack, rsm);
+ if (left) {
+ goto more;
+ }
+ /* Check for reneging */
+ rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
+ if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) {
/*
- * It was retransmitted adjust the
- * sack holes for what was acked.
+ * The peer has moved snd_una up to
+ * the edge of this send, i.e. one
+ * that it had previously acked. The only
+ * way that can be true is if the peer threw
+ * away data (space issues) that it had
+ * previously sacked (else it would have
+ * given us snd_una up to rsm->r_end).
+ * We need to undo the acked markings here.
+ *
+ * Note we have to look to make sure th_ack is
+ * our rsm->r_start in case we get an old ack
+ * where th_ack is behind snd_una.
*/
- int ack_am;
-
- ack_am = (th_ack - rsm->r_start);
- if (ack_am >= rsm->r_rtr_bytes) {
- rack->r_ctl.rc_holes_rxt -= ack_am;
- rsm->r_rtr_bytes -= ack_am;
- }
+ rack_peer_reneges(rack, rsm, th_ack);
}
+ return;
+ }
+ if (rsm->r_flags & RACK_ACKED) {
/*
- * Update where the piece starts and record
- * the time of send of highest cumack sent.
+ * It was acked on the scoreboard -- remove it from
+ * total for the part being cum-acked.
*/
- rack->r_ctl.rc_gp_cumack_ts = rsm->usec_orig_send;
- rsm->r_start = th_ack;
- if (rack->app_limited_needs_set)
- rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_BEG);
+ rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start);
}
-proc_sack:
- /* Check for reneging */
- rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
- if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) {
- /*
- * The peer has moved snd_una up to
- * the edge of this send, i.e. one
- * that it had previously acked. The only
- * way that can be true if the peer threw
- * away data (space issues) that it had
- * previously sacked (else it would have
- * given us snd_una up to (rsm->r_end).
- * We need to undo the acked markings here.
- *
- * Note we have to look to make sure th_ack is
- * our rsm->r_start in case we get an old ack
- * where th_ack is behind snd_una.
+ /*
+ * Clear the dup ack count for
+ * the piece that remains.
+ */
+ rsm->r_dupack = 0;
+ rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
+ if (rsm->r_rtr_bytes) {
+ /*
+ * It was retransmitted; adjust the
+ * sack holes for what was acked.
*/
- rack_peer_reneges(rack, rsm, th->th_ack);
- }
- if ((to->to_flags & TOF_SACK) == 0) {
- /* We are done nothing left */
- goto out;
+ int ack_am;
+
+ ack_am = (th_ack - rsm->r_start);
+ if (ack_am >= rsm->r_rtr_bytes) {
+ rack->r_ctl.rc_holes_rxt -= ack_am;
+ rsm->r_rtr_bytes -= ack_am;
+ }
}
- /* Sack block processing */
- if (SEQ_GT(th_ack, tp->snd_una))
- ack_point = th_ack;
- else
- ack_point = tp->snd_una;
- for (i = 0; i < to->to_nsacks; i++) {
- bcopy((to->to_sacks + i * TCPOLEN_SACK),
- &sack, sizeof(sack));
- sack.start = ntohl(sack.start);
- sack.end = ntohl(sack.end);
- if (SEQ_GT(sack.end, sack.start) &&
+ /*
+ * Update where the piece starts and record
+ * the time of send of highest cumack sent.
+ */
+ rack->r_ctl.rc_gp_cumack_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
+ rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_TRIM_HEAD, th_ack, __LINE__);
+ /* Now we need to move our offset forward too */
+ if (rsm->orig_m_len != rsm->m->m_len) {
+ /* Fix up the orig_m_len and possibly the mbuf offset */
+ rack_adjust_orig_mlen(rsm);
+ }
+ rsm->soff += (th_ack - rsm->r_start);
+ rsm->r_start = th_ack;
+ /* Now do we need to move the mbuf fwd too? */
+ while (rsm->soff >= rsm->m->m_len) {
+ rsm->soff -= rsm->m->m_len;
+ rsm->m = rsm->m->m_next;
+ KASSERT((rsm->m != NULL),
+ (" nrsm:%p hit at soff:%u null m",
+ rsm, rsm->soff));
+ }
+ rsm->orig_m_len = rsm->m->m_len;
+ if (rack->app_limited_needs_set)
+ rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_BEG);
+}
+
+static void
+rack_handle_might_revert(struct tcpcb *tp, struct tcp_rack *rack)
+{
+ struct rack_sendmap *rsm;
+ int sack_pass_fnd = 0;
+
+ if (rack->r_might_revert) {
+ /*
+ * Ok we have reordering, have not sent anything, we
+ * might want to revert the congestion state if nothing
+ * further has SACK_PASSED on it. Let's check.
+ *
+ * We also get here when we have DSACKs come in for
+ * all the data that we FR'd. Note that an rxt or tlp
+ * timer prevents this from happening.
+ */
+
+ TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
+ if (rsm->r_flags & RACK_SACK_PASSED) {
+ sack_pass_fnd = 1;
+ break;
+ }
+ }
+ if (sack_pass_fnd == 0) {
+ /*
+ * We went into recovery
+ * incorrectly due to reordering!
+ */
+ int orig_cwnd;
+
+ rack->r_ent_rec_ns = 0;
+ orig_cwnd = tp->snd_cwnd;
+ tp->snd_cwnd = rack->r_ctl.rc_cwnd_at_erec;
+ tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at_erec;
+ tp->snd_recover = tp->snd_una;
+ rack_log_to_prr(rack, 14, orig_cwnd);
+ EXIT_RECOVERY(tp->t_flags);
+ }
+ rack->r_might_revert = 0;
+ }
+}
+
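[Editor's note: rack_handle_might_revert() above restores the pre-recovery cwnd/ssthresh when nothing outstanding was marked SACK_PASSED. A minimal sketch of that restore over simplified, hypothetical state:]

#include <stdint.h>

struct cc_state {
        uint32_t cwnd, ssthresh;
        uint32_t cwnd_at_entry, ssthresh_at_entry;  /* saved on recovery entry */
        int in_recovery;
};

/*
 * If no outstanding segment was passed over by a SACK, the recovery
 * entry was spurious (reordering); undo the cwnd/ssthresh reduction.
 * 'sack_passed_found' is the result of scanning the transmit map.
 */
static void
maybe_revert_recovery(struct cc_state *cc, int sack_passed_found)
{
        if (!cc->in_recovery || sack_passed_found)
                return;
        cc->cwnd = cc->cwnd_at_entry;
        cc->ssthresh = cc->ssthresh_at_entry;
        cc->in_recovery = 0;
}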
+#ifdef NETFLIX_EXP_DETECTION
+static void
+rack_do_detection(struct tcpcb *tp, struct tcp_rack *rack, uint32_t bytes_this_ack, uint32_t segsiz)
+{
+ if ((rack->do_detection || tcp_force_detection) &&
+ tcp_sack_to_ack_thresh &&
+ tcp_sack_to_move_thresh &&
+ ((rack->r_ctl.rc_num_maps_alloced > tcp_map_minimum) || rack->sack_attack_disable)) {
+ /*
+ * We have thresholds set to find
+ * possible attackers and disable sack.
+ * Check them.
+ */
+ uint64_t ackratio, moveratio, movetotal;
+
+ /* Log detecting */
+ rack_log_sad(rack, 1);
+ ackratio = (uint64_t)(rack->r_ctl.sack_count);
+ ackratio *= (uint64_t)(1000);
+ if (rack->r_ctl.ack_count)
+ ackratio /= (uint64_t)(rack->r_ctl.ack_count);
+ else {
+ /* We really should not hit here */
+ ackratio = 1000;
+ }
+ if ((rack->sack_attack_disable == 0) &&
+ (ackratio > rack_highest_sack_thresh_seen))
+ rack_highest_sack_thresh_seen = (uint32_t)ackratio;
+ movetotal = rack->r_ctl.sack_moved_extra;
+ movetotal += rack->r_ctl.sack_noextra_move;
+ moveratio = rack->r_ctl.sack_moved_extra;
+ moveratio *= (uint64_t)1000;
+ if (movetotal)
+ moveratio /= movetotal;
+ else {
+ /* No moves, that's pretty good */
+ moveratio = 0;
+ }
+ if ((rack->sack_attack_disable == 0) &&
+ (moveratio > rack_highest_move_thresh_seen))
+ rack_highest_move_thresh_seen = (uint32_t)moveratio;
+ if (rack->sack_attack_disable == 0) {
+ if ((ackratio > tcp_sack_to_ack_thresh) &&
+ (moveratio > tcp_sack_to_move_thresh)) {
+ /* Disable sack processing */
+ rack->sack_attack_disable = 1;
+ if (rack->r_rep_attack == 0) {
+ rack->r_rep_attack = 1;
+ counter_u64_add(rack_sack_attacks_detected, 1);
+ }
+ if (tcp_attack_on_turns_on_logging) {
+ /*
+ * Turn on logging, used for debugging
+ * false positives.
+ */
+ rack->rc_tp->t_logstate = tcp_attack_on_turns_on_logging;
+ }
+ /* Clamp the cwnd at flight size */
+ rack->r_ctl.rc_saved_cwnd = rack->rc_tp->snd_cwnd;
+ rack->rc_tp->snd_cwnd = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ rack_log_sad(rack, 2);
+ }
+ } else {
+ /* We are sack-disabled check for false positives */
+ if ((ackratio <= tcp_restoral_thresh) ||
+ (rack->r_ctl.rc_num_maps_alloced < tcp_map_minimum)) {
+ rack->sack_attack_disable = 0;
+ rack_log_sad(rack, 3);
+ /* Restart counting */
+ rack->r_ctl.sack_count = 0;
+ rack->r_ctl.sack_moved_extra = 0;
+ rack->r_ctl.sack_noextra_move = 1;
+ rack->r_ctl.ack_count = max(1,
+ (bytes_this_ack / segsiz));
+
+ if (rack->r_rep_reverse == 0) {
+ rack->r_rep_reverse = 1;
+ counter_u64_add(rack_sack_attacks_reversed, 1);
+ }
+ /* Restore the cwnd */
+ if (rack->r_ctl.rc_saved_cwnd > rack->rc_tp->snd_cwnd)
+ rack->rc_tp->snd_cwnd = rack->r_ctl.rc_saved_cwnd;
+ }
+ }
+ }
+}
+#endif
+
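[Editor's note: the detection above works on two per-mille ratios: sack_count/ack_count and moved_extra/(moved_extra + noextra_move). A small sketch of just the ratio test, with hypothetical thresholds passed in as parameters:]

#include <stdint.h>

/*
 * Return 1 if the SACK pattern looks hostile: both the SACK-to-ACK
 * ratio and the "moves that split/shift map entries" ratio exceed
 * their thresholds.  Ratios are scaled by 1000, as above.
 */
static int
sack_looks_hostile(uint64_t sack_count, uint64_t ack_count,
    uint64_t moved_extra, uint64_t noextra_move,
    uint64_t ack_thresh, uint64_t move_thresh)
{
        uint64_t ackratio, moveratio, movetotal;

        ackratio = ack_count ? (sack_count * 1000) / ack_count : 1000;
        movetotal = moved_extra + noextra_move;
        moveratio = movetotal ? (moved_extra * 1000) / movetotal : 0;
        return (ackratio > ack_thresh && moveratio > move_thresh);
}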
+static void
+rack_note_dsack(struct tcp_rack *rack, tcp_seq start, tcp_seq end)
+{
+
+ uint32_t am;
+
+ if (SEQ_GT(end, start))
+ am = end - start;
+ else
+ am = 0;
+ /*
+ * We keep track of how many DSACK blocks we get
+ * after a recovery incident.
+ */
+ rack->r_ctl.dsack_byte_cnt += am;
+ if (!IN_FASTRECOVERY(rack->rc_tp->t_flags) &&
+ rack->r_ctl.retran_during_recovery &&
+ (rack->r_ctl.dsack_byte_cnt >= rack->r_ctl.retran_during_recovery)) {
+ /*
+ * False recovery; the most likely culprit is reordering. If
+ * nothing else is missing we need to revert.
+ */
+ rack->r_might_revert = 1;
+ rack_handle_might_revert(rack->rc_tp, rack);
+ rack->r_might_revert = 0;
+ rack->r_ctl.retran_during_recovery = 0;
+ rack->r_ctl.dsack_byte_cnt = 0;
+ }
+}
+
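[Editor's note: rack_note_dsack() above accumulates DSACKed bytes and, once they cover everything retransmitted during the last recovery, treats the recovery as spurious. A tiny sketch of the accounting with simplified, hypothetical field names:]

#include <stdint.h>

struct dsack_acct {
        uint32_t dsack_bytes;            /* DSACK bytes seen since recovery */
        uint32_t retran_during_recovery; /* bytes retransmitted in recovery */
};

/* Returns 1 when the recovery should be reverted (all rxt data DSACKed). */
static int
note_dsack(struct dsack_acct *a, uint32_t start, uint32_t end, int in_recovery)
{
        if (end > start)                 /* ignore wrap for the sketch */
                a->dsack_bytes += end - start;
        if (!in_recovery && a->retran_during_recovery &&
            a->dsack_bytes >= a->retran_during_recovery) {
                a->dsack_bytes = 0;
                a->retran_during_recovery = 0;
                return (1);
        }
        return (0);
}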
+static void
+rack_update_prr(struct tcpcb *tp, struct tcp_rack *rack, uint32_t changed, tcp_seq th_ack)
+{
+ /* Deal with changed and PRR here (in recovery only) */
+ uint32_t pipe, snd_una;
+
+ rack->r_ctl.rc_prr_delivered += changed;
+
+ if (sbavail(&rack->rc_inp->inp_socket->so_snd) <= (tp->snd_max - tp->snd_una)) {
+ /*
+ * It is all outstanding, we are application limited
+ * and thus we don't need more room to send anything.
+ * Note we use tp->snd_una here and not th_ack because
+ * the data has not yet been cut from the sb.
+ */
+ rack->r_ctl.rc_prr_sndcnt = 0;
+ return;
+ }
+ /* Compute prr_sndcnt */
+ if (SEQ_GT(tp->snd_una, th_ack)) {
+ snd_una = tp->snd_una;
+ } else {
+ snd_una = th_ack;
+ }
+ pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt;
+ if (pipe > tp->snd_ssthresh) {
+ long sndcnt;
+
+ sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh;
+ if (rack->r_ctl.rc_prr_recovery_fs > 0)
+ sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs;
+ else {
+ rack->r_ctl.rc_prr_sndcnt = 0;
+ rack_log_to_prr(rack, 9, 0);
+ sndcnt = 0;
+ }
+ sndcnt++;
+ if (sndcnt > (long)rack->r_ctl.rc_prr_out)
+ sndcnt -= rack->r_ctl.rc_prr_out;
+ else
+ sndcnt = 0;
+ rack->r_ctl.rc_prr_sndcnt = sndcnt;
+ rack_log_to_prr(rack, 10, 0);
+ } else {
+ uint32_t limit;
+
+ if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out)
+ limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out);
+ else
+ limit = 0;
+ if (changed > limit)
+ limit = changed;
+ limit += ctf_fixed_maxseg(tp);
+ if (tp->snd_ssthresh > pipe) {
+ rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit);
+ rack_log_to_prr(rack, 11, 0);
+ } else {
+ rack->r_ctl.rc_prr_sndcnt = min(0, limit);
+ rack_log_to_prr(rack, 12, 0);
+ }
+ }
+}
+
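[Editor's note: rack_update_prr() above implements the RFC 6937 proportional rate reduction split: while pipe > ssthresh, send proportionally to delivered data; otherwise fall back to the slow-start-style limit. A sketch of the core arithmetic with hypothetical parameter names:]

#include <stdint.h>

/*
 * Compute how many bytes PRR allows to be sent now.
 *   delivered - bytes newly delivered (cumulative this recovery)
 *   out       - bytes already (re)sent during this recovery
 *   pipe      - estimated bytes in flight
 *   recoverfs - flight size at recovery entry
 *   changed   - bytes newly acked/sacked by this ACK
 */
static uint32_t
prr_sndcnt(uint32_t delivered, uint32_t out, uint32_t pipe,
    uint32_t ssthresh, uint32_t recoverfs, uint32_t changed, uint32_t maxseg)
{
        int64_t sndcnt;
        uint32_t limit, room;

        if (pipe > ssthresh) {
                /* Proportional part: delivered * ssthresh / recoverfs, rounded up. */
                sndcnt = recoverfs ? ((int64_t)delivered * ssthresh) / recoverfs : 0;
                sndcnt++;
                sndcnt -= out;
                return (sndcnt > 0 ? (uint32_t)sndcnt : 0);
        }
        /* Reduction bound / slow-start part. */
        limit = (delivered > out) ? (delivered - out) : 0;
        if (changed > limit)
                limit = changed;
        limit += maxseg;
        if (ssthresh > pipe) {
                room = ssthresh - pipe;
                return (room < limit ? room : limit);
        }
        return (0);
}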
+static void
+rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered_recovery, int dup_ack_struck)
+{
+ uint32_t changed;
+ struct tcp_rack *rack;
+ struct rack_sendmap *rsm;
+ struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1];
+ register uint32_t th_ack;
+ int32_t i, j, k, num_sack_blks = 0;
+ uint32_t cts, acked, ack_point, sack_changed = 0;
+ int loop_start = 0, moved_two = 0;
+ uint32_t tsused;
+
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ if (th->th_flags & TH_RST) {
+ /* We don't log resets */
+ return;
+ }
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+ cts = tcp_get_usecs(NULL);
+ rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
+ changed = 0;
+ th_ack = th->th_ack;
+ if (rack->sack_attack_disable == 0)
+ rack_do_decay(rack);
+ if (BYTES_THIS_ACK(tp, th) >= ctf_fixed_maxseg(rack->rc_tp)) {
+ /*
+ * You only get credit for
+ * MSS and greater (and you get extra
+ * credit for larger cum-ack moves).
+ */
+ int ac;
+
+ ac = BYTES_THIS_ACK(tp, th) / ctf_fixed_maxseg(rack->rc_tp);
+ rack->r_ctl.ack_count += ac;
+ counter_u64_add(rack_ack_total, ac);
+ }
+ if (rack->r_ctl.ack_count > 0xfff00000) {
+ /*
+ * reduce the number to keep us under
+ * a uint32_t.
+ */
+ rack->r_ctl.ack_count /= 2;
+ rack->r_ctl.sack_count /= 2;
+ }
+ if (SEQ_GT(th_ack, tp->snd_una)) {
+ rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__);
+ tp->t_acktime = ticks;
+ }
+ if (rsm && SEQ_GT(th_ack, rsm->r_start))
+ changed = th_ack - rsm->r_start;
+ if (changed) {
+ rack_process_to_cumack(tp, rack, th_ack, cts, to);
+ }
+ if ((to->to_flags & TOF_SACK) == 0) {
+ /* We are done nothing left and no sack. */
+ rack_handle_might_revert(tp, rack);
+ /*
+ * For cases where we struck a dup-ack
+ * with no SACK, add to the changes so
+ * PRR will work right.
+ */
+ if (dup_ack_struck && (changed == 0)) {
+ changed += ctf_fixed_maxseg(rack->rc_tp);
+ }
+ goto out;
+ }
+ /* Sack block processing */
+ if (SEQ_GT(th_ack, tp->snd_una))
+ ack_point = th_ack;
+ else
+ ack_point = tp->snd_una;
+ for (i = 0; i < to->to_nsacks; i++) {
+ bcopy((to->to_sacks + i * TCPOLEN_SACK),
+ &sack, sizeof(sack));
+ sack.start = ntohl(sack.start);
+ sack.end = ntohl(sack.end);
+ if (SEQ_GT(sack.end, sack.start) &&
SEQ_GT(sack.start, ack_point) &&
SEQ_LT(sack.start, tp->snd_max) &&
SEQ_GT(sack.end, ack_point) &&
@@ -7674,6 +9081,7 @@
*/
tcp_record_dsack(sack.start, sack.end);
#endif
+ rack_note_dsack(rack, sack.start, sack.end);
}
}
/*
@@ -7683,7 +9091,7 @@
num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks,
num_sack_blks, th->th_ack);
ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks);
- if (num_sack_blks == 0) {
+ if (num_sack_blks == 0) {
/* Nothing to sack (DSACKs?) */
goto out_with_totals;
}
@@ -7883,116 +9291,36 @@
}
out:
#ifdef NETFLIX_EXP_DETECTION
- if ((rack->do_detection || tcp_force_detection) &&
- tcp_sack_to_ack_thresh &&
- tcp_sack_to_move_thresh &&
- ((rack->r_ctl.rc_num_maps_alloced > tcp_map_minimum) || rack->sack_attack_disable)) {
+ rack_do_detection(tp, rack, BYTES_THIS_ACK(tp, th), ctf_fixed_maxseg(rack->rc_tp));
+#endif
+ if (changed) {
+ /* Something changed cancel the rack timer */
+ rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
+ }
+ tsused = tcp_get_usecs(NULL);
+ rsm = tcp_rack_output(tp, rack, tsused);
+ if ((!IN_FASTRECOVERY(tp->t_flags)) &&
+ rsm) {
+ /* Enter recovery */
+ rack->r_ctl.rc_rsm_start = rsm->r_start;
+ rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
+ rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
+ entered_recovery = 1;
+ rack_cong_signal(tp, CC_NDUPACK, tp->snd_una);
/*
- * We have thresholds set to find
- * possible attackers and disable sack.
- * Check them.
+ * When we enter recovery we need to assure we send
+ * one packet.
*/
- uint64_t ackratio, moveratio, movetotal;
-
- /* Log detecting */
- rack_log_sad(rack, 1);
- ackratio = (uint64_t)(rack->r_ctl.sack_count);
- ackratio *= (uint64_t)(1000);
- if (rack->r_ctl.ack_count)
- ackratio /= (uint64_t)(rack->r_ctl.ack_count);
- else {
- /* We really should not hit here */
- ackratio = 1000;
- }
- if ((rack->sack_attack_disable == 0) &&
- (ackratio > rack_highest_sack_thresh_seen))
- rack_highest_sack_thresh_seen = (uint32_t)ackratio;
- movetotal = rack->r_ctl.sack_moved_extra;
- movetotal += rack->r_ctl.sack_noextra_move;
- moveratio = rack->r_ctl.sack_moved_extra;
- moveratio *= (uint64_t)1000;
- if (movetotal)
- moveratio /= movetotal;
- else {
- /* No moves, thats pretty good */
- moveratio = 0;
- }
- if ((rack->sack_attack_disable == 0) &&
- (moveratio > rack_highest_move_thresh_seen))
- rack_highest_move_thresh_seen = (uint32_t)moveratio;
- if (rack->sack_attack_disable == 0) {
- if ((ackratio > tcp_sack_to_ack_thresh) &&
- (moveratio > tcp_sack_to_move_thresh)) {
- /* Disable sack processing */
- rack->sack_attack_disable = 1;
- if (rack->r_rep_attack == 0) {
- rack->r_rep_attack = 1;
- counter_u64_add(rack_sack_attacks_detected, 1);
- }
- if (tcp_attack_on_turns_on_logging) {
- /*
- * Turn on logging, used for debugging
- * false positives.
- */
- rack->rc_tp->t_logstate = tcp_attack_on_turns_on_logging;
- }
- /* Clamp the cwnd at flight size */
- rack->r_ctl.rc_saved_cwnd = rack->rc_tp->snd_cwnd;
- rack->rc_tp->snd_cwnd = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
- rack_log_sad(rack, 2);
- }
- } else {
- /* We are sack-disabled check for false positives */
- if ((ackratio <= tcp_restoral_thresh) ||
- (rack->r_ctl.rc_num_maps_alloced < tcp_map_minimum)) {
- rack->sack_attack_disable = 0;
- rack_log_sad(rack, 3);
- /* Restart counting */
- rack->r_ctl.sack_count = 0;
- rack->r_ctl.sack_moved_extra = 0;
- rack->r_ctl.sack_noextra_move = 1;
- rack->r_ctl.ack_count = max(1,
- (BYTES_THIS_ACK(tp, th)/ctf_fixed_maxseg(rack->rc_tp)));
-
- if (rack->r_rep_reverse == 0) {
- rack->r_rep_reverse = 1;
- counter_u64_add(rack_sack_attacks_reversed, 1);
- }
- /* Restore the cwnd */
- if (rack->r_ctl.rc_saved_cwnd > rack->rc_tp->snd_cwnd)
- rack->rc_tp->snd_cwnd = rack->r_ctl.rc_saved_cwnd;
- }
- }
- }
-#endif
- if (changed) {
- /* Something changed cancel the rack timer */
- rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
- }
- tsused = tcp_ts_getticks();
- rsm = tcp_rack_output(tp, rack, tsused);
- if ((!IN_RECOVERY(tp->t_flags)) &&
- rsm) {
- /* Enter recovery */
- rack->r_ctl.rc_rsm_start = rsm->r_start;
- rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
- rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
- entered_recovery = 1;
- rack_cong_signal(tp, NULL, CC_NDUPACK);
- /*
- * When we enter recovery we need to assure we send
- * one packet.
- */
- if (rack->rack_no_prr == 0) {
- rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
- rack_log_to_prr(rack, 8, 0);
+ if (rack->rack_no_prr == 0) {
+ rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
+ rack_log_to_prr(rack, 8, 0);
}
rack->r_timer_override = 1;
rack->r_early = 0;
rack->r_ctl.rc_agg_early = 0;
- } else if (IN_RECOVERY(tp->t_flags) &&
+ } else if (IN_FASTRECOVERY(tp->t_flags) &&
rsm &&
- (rack->r_rr_config == 3)) {
+ (rack->r_rr_config == 3)) {
/*
* Assure we can output and we get no
* remembered pace time except the retransmit.
@@ -8001,56 +9329,10 @@
rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
rack->r_ctl.rc_resend = rsm;
}
- if (IN_RECOVERY(tp->t_flags) &&
+ if (IN_FASTRECOVERY(tp->t_flags) &&
(rack->rack_no_prr == 0) &&
(entered_recovery == 0)) {
- /* Deal with PRR here (in recovery only) */
- uint32_t pipe, snd_una;
-
- rack->r_ctl.rc_prr_delivered += changed;
- /* Compute prr_sndcnt */
- if (SEQ_GT(tp->snd_una, th_ack)) {
- snd_una = tp->snd_una;
- } else {
- snd_una = th_ack;
- }
- pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt;
- if (pipe > tp->snd_ssthresh) {
- long sndcnt;
-
- sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh;
- if (rack->r_ctl.rc_prr_recovery_fs > 0)
- sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs;
- else {
- rack->r_ctl.rc_prr_sndcnt = 0;
- rack_log_to_prr(rack, 9, 0);
- sndcnt = 0;
- }
- sndcnt++;
- if (sndcnt > (long)rack->r_ctl.rc_prr_out)
- sndcnt -= rack->r_ctl.rc_prr_out;
- else
- sndcnt = 0;
- rack->r_ctl.rc_prr_sndcnt = sndcnt;
- rack_log_to_prr(rack, 10, 0);
- } else {
- uint32_t limit;
-
- if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out)
- limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out);
- else
- limit = 0;
- if (changed > limit)
- limit = changed;
- limit += ctf_fixed_maxseg(tp);
- if (tp->snd_ssthresh > pipe) {
- rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit);
- rack_log_to_prr(rack, 11, 0);
- } else {
- rack->r_ctl.rc_prr_sndcnt = min(0, limit);
- rack_log_to_prr(rack, 12, 0);
- }
- }
+ rack_update_prr(tp, rack, changed, th_ack);
if ((rsm && (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) &&
((rack->rc_inp->inp_in_hpts == 0) &&
((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)))) {
@@ -8071,12 +9353,32 @@
struct rack_sendmap *rsm;
rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
+ while (rsm && (rsm->r_dupack >= DUP_ACK_THRESHOLD)) {
+ rsm = TAILQ_NEXT(rsm, r_tnext);
+ }
if (rsm && (rsm->r_dupack < 0xff)) {
rsm->r_dupack++;
if (rsm->r_dupack >= DUP_ACK_THRESHOLD) {
- rack->r_wanted_output = 1;
- rack->r_timer_override = 1;
- rack_log_retran_reason(rack, rsm, __LINE__, 1, 3);
+ struct timeval tv;
+ uint32_t cts;
+ /*
+ * Here we see if we need to retransmit. For
+ * a SACK type connection if enough time has passed
+ * we will get a return of the rsm. For a non-sack
+ * connection we will get the rsm returned if the
+ * dupack value is 3 or more.
+ */
+ cts = tcp_get_usecs(&tv);
+ rack->r_ctl.rc_resend = tcp_rack_output(rack->rc_tp, rack, cts);
+ if (rack->r_ctl.rc_resend != NULL) {
+ if (!IN_FASTRECOVERY(rack->rc_tp->t_flags)) {
+ rack_cong_signal(rack->rc_tp, CC_NDUPACK,
+ rack->rc_tp->snd_una);
+ }
+ rack->r_wanted_output = 1;
+ rack->r_timer_override = 1;
+ rack_log_retran_reason(rack, rsm, __LINE__, 1, 3);
+ }
} else {
rack_log_retran_reason(rack, rsm, __LINE__, 0, 3);
}
@@ -8118,6 +9420,14 @@
uint64_t bw, calc_bw, rtt;
rtt = rack->r_ctl.rack_rs.rs_us_rtt;
+ if (rtt == 0) {
+ /* No usec sample; is there a ms one? */
+ if (rack->r_ctl.rack_rs.rs_rtt_lowest) {
+ rtt = rack->r_ctl.rack_rs.rs_rtt_lowest;
+ } else {
+ goto no_measurement;
+ }
+ }
bw = acked;
calc_bw = bw * 1000000;
calc_bw /= rtt;
@@ -8145,15 +9455,25 @@
rack->r_ctl.rc_rtt_diff = 0;
rack->r_ctl.gp_bw = calc_bw;
rack->rc_gp_filled = 1;
- rack->r_ctl.num_avg = RACK_REQ_AVG;
- rack_set_pace_segments(rack->rc_tp, rack, __LINE__);
+ if (rack->r_ctl.num_measurements < RACK_REQ_AVG)
+ rack->r_ctl.num_measurements = RACK_REQ_AVG;
+ rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
} else if (calc_bw > rack->r_ctl.gp_bw) {
rack->r_ctl.rc_rtt_diff = 0;
- rack->r_ctl.num_avg = RACK_REQ_AVG;
+ if (rack->r_ctl.num_measurements < RACK_REQ_AVG)
+ rack->r_ctl.num_measurements = RACK_REQ_AVG;
rack->r_ctl.gp_bw = calc_bw;
- rack_set_pace_segments(rack->rc_tp, rack, __LINE__);
+ rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
} else
rack_increase_bw_mul(rack, -1, 0, 0, 1);
+ if ((rack->gp_ready == 0) &&
+ (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) {
+ /* We have enough measurements now */
+ rack->gp_ready = 1;
+ rack_set_cc_pacing(rack);
+ if (rack->defer_options)
+ rack_apply_deferred_options(rack);
+ }
/*
* For acks over 1mss we do a extra boost to simulate
* where we would get 2 acks (we want 110 for the mul).
@@ -8162,12 +9482,12 @@
rack_increase_bw_mul(rack, -1, 0, 0, 1);
} else {
/*
- * Huh, this should not be, settle
- * for just an old increase.
+ * Zero RTT possibly? Settle for just an old increase.
*/
+no_measurement:
rack_increase_bw_mul(rack, -1, 0, 0, 1);
}
- } else if ((IN_RECOVERY(tp->t_flags) == 0) &&
+ } else if ((IN_FASTRECOVERY(tp->t_flags) == 0) &&
(sbavail(&so->so_snd) > max((segsiz * (4 + rack_req_segs)),
minseg)) &&
(rack->r_ctl.cwnd_to_use > max((segsiz * (rack_req_segs + 2)), minseg)) &&
@@ -8188,6 +9508,114 @@
}
}
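For reference, the single-ack goodput estimate computed in the hunk above (calc_bw = acked * 1000000 / rtt) is simply bytes over a microsecond RTT, scaled to bytes per second. A tiny, self-contained illustration with made-up numbers:

/* Illustrative only: how acked bytes and a microsecond RTT yield the
 * bytes-per-second estimate used above. */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t acked = 43800;		/* ~30 segments of 1460 bytes */
	uint64_t rtt_us = 25000;	/* 25 ms measured RTT         */
	uint64_t calc_bw = acked * 1000000 / rtt_us;

	printf("estimated goodput: %ju bytes/sec\n", (uintmax_t)calc_bw);
	return (0);
}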
+
+
+static void
+rack_gain_for_fastoutput(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t acked_amount)
+{
+ /*
+ * The fast output path is enabled and we
+ * have moved the cumack forward. Lets see if
+ * we can expand forward the fast path length by
+ * that amount. What we would ideally like to
+ * do is increase the number of bytes in the
+ * fast path block (left_to_send) by the
+ * acked amount. However we have to gate that
+ * by two factors:
+ * 1) The amount outstanding and the rwnd of the peer
+ * (i.e. we don't want to exceed the rwnd of the peer).
+ * <and>
+ * 2) The amount of data left in the socket buffer (i.e.
+ * we can't send beyond what is in the buffer).
+ *
+ * Note that this does not take into account any increase
+ * in the cwnd. We will only extend the fast path by
+ * what was acked.
+ */
+ uint32_t new_total, gating_val;
+
+ new_total = acked_amount + rack->r_ctl.fsb.left_to_send;
+ gating_val = min((sbavail(&so->so_snd) - (tp->snd_max - tp->snd_una)),
+ (tp->snd_wnd - (tp->snd_max - tp->snd_una)));
+ if (new_total <= gating_val) {
+ /* We can increase left_to_send by the acked amount */
+ counter_u64_add(rack_extended_rfo, 1);
+ rack->r_ctl.fsb.left_to_send = new_total;
+ KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(&rack->rc_inp->inp_socket->so_snd) - (tp->snd_max - tp->snd_una))),
+ ("rack:%p left_to_send:%u sbavail:%u out:%u",
+ rack, rack->r_ctl.fsb.left_to_send,
+ sbavail(&rack->rc_inp->inp_socket->so_snd),
+ (tp->snd_max - tp->snd_una)));
+
+ }
+}
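A compressed view of the gating logic in rack_gain_for_fastoutput(): both limits are measured net of what is already outstanding, so they collapse to a single min(). The sketch below is illustrative only and assumes, as the code above does, that outstanding never exceeds either bound:

/* Sketch of the gating check in rack_gain_for_fastoutput(); not kernel code. */
#include <stdint.h>

static uint32_t
extend_fast_output(uint32_t left_to_send, uint32_t acked,
    uint32_t sb_avail, uint32_t snd_wnd, uint32_t outstanding)
{
	uint32_t new_total = left_to_send + acked;
	uint32_t room_in_sb = sb_avail - outstanding;	/* data not yet sent  */
	uint32_t room_in_wnd = snd_wnd - outstanding;	/* peer rwnd headroom */
	uint32_t gate = (room_in_sb < room_in_wnd) ? room_in_sb : room_in_wnd;

	/* Only grow the fast path if both gates allow the full extension. */
	return ((new_total <= gate) ? new_total : left_to_send);
}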
+
+static void
+rack_adjust_sendmap(struct tcp_rack *rack, struct sockbuf *sb, tcp_seq snd_una)
+{
+ /*
+ * Here any sendmap entry that points to the
+ * beginning mbuf must be adjusted to the correct
+ * offset. This must be called with:
+ * 1) The socket buffer locked
+ * 2) snd_una adjusted to its new position.
+ *
+ * Note that (2) implies rack_ack_received has also
+ * been called.
+ *
+ * We grab the first mbuf in the socket buffer and
+ * then go through the front of the sendmap, recalculating
+ * the stored offset for any sendmap entry that has
+ * that mbuf. We must use the sb functions to do this
+ * since it's possible an append was done as well as
+ * the subtraction we may have just completed. This should
+ * not be a penalty though, since we just referenced the sb
+ * to go in and trim off the mbufs that we freed (of course
+ * there will be a penalty for the sendmap references though).
+ */
+ struct mbuf *m;
+ struct rack_sendmap *rsm;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+ m = sb->sb_mb;
+ rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
+ if ((rsm == NULL) || (m == NULL)) {
+ /* Nothing outstanding */
+ return;
+ }
+ while (rsm->m == m) {
+ /* one to adjust */
+#ifdef INVARIANTS
+ struct mbuf *tm;
+ uint32_t soff;
+
+ tm = sbsndmbuf(sb, (rsm->r_start - snd_una), &soff);
+ if (rsm->orig_m_len != m->m_len) {
+ rack_adjust_orig_mlen(rsm);
+ }
+ if (rsm->soff != soff) {
+ /*
+ * This is not a fatal error; we anticipate it
+ * might happen (the #else path), so we count it here
+ * so that under INVARIANTS we can see that it really
+ * does happen.
+ */
+ counter_u64_add(rack_adjust_map_bw, 1);
+ }
+ rsm->m = tm;
+ rsm->soff = soff;
+ rsm->orig_m_len = rsm->m->m_len;
+#else
+ rsm->m = sbsndmbuf(sb, (rsm->r_start - snd_una), &rsm->soff);
+ rsm->orig_m_len = rsm->m->m_len;
+#endif
+ rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree,
+ rsm);
+ if (rsm == NULL)
+ break;
+ }
+}
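The walk above only touches map entries whose mbuf pointer equals the (possibly partially trimmed) head of the socket buffer; later entries still reference untouched mbufs. A simplified userspace model of that front-of-map fix-up, with a plain linked list standing in for the RB tree and sbsndmbuf() (all types here are hypothetical):

/* Simplified model of rack_adjust_sendmap(): re-derive (mbuf, offset)
 * for map entries that reference the trimmed head mbuf. Illustrative only. */
#include <stddef.h>
#include <stdint.h>

struct toy_mbuf { struct toy_mbuf *next; uint32_t len; };
struct toy_map  { struct toy_map *next; struct toy_mbuf *m;
		  uint32_t soff; uint32_t r_start; };

/* Walk the buffer chain to the mbuf/offset holding byte 'off'. */
static struct toy_mbuf *
toy_sndmbuf(struct toy_mbuf *head, uint32_t off, uint32_t *soff)
{
	while (head != NULL && off >= head->len) {
		off -= head->len;
		head = head->next;
	}
	*soff = off;
	return (head);
}

static void
toy_adjust_sendmap(struct toy_map *map, struct toy_mbuf *sb_head,
    uint32_t snd_una)
{
	for (; map != NULL && map->m == sb_head; map = map->next) {
		/* Head mbuf was partially trimmed; recompute this entry's
		 * offset from the new front of the buffer. */
		map->m = toy_sndmbuf(sb_head, map->r_start - snd_una,
		    &map->soff);
	}
}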
+
/*
* Return value of 1, we do not need to call rack_process_data().
* return value of 0, rack_process_data can be called.
@@ -8198,7 +9626,7 @@
rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
struct tcpcb *tp, struct tcpopt *to,
uint32_t tiwin, int32_t tlen,
- int32_t * ofia, int32_t thflags, int32_t * ret_val)
+ int32_t * ofia, int32_t thflags, int32_t *ret_val)
{
int32_t ourfinisacked = 0;
int32_t nsegs, acked_amount;
@@ -8210,29 +9638,41 @@
rack = (struct tcp_rack *)tp->t_fb_ptr;
if (SEQ_GT(th->th_ack, tp->snd_max)) {
- ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
+ __ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val,
+ &rack->r_ctl.challenge_ack_ts,
+ &rack->r_ctl.challenge_ack_cnt);
rack->r_wanted_output = 1;
return (1);
}
- if (rack->rc_gp_filled &&
+ if (rack->gp_ready &&
(rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
under_pacing = 1;
}
if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) {
- if (rack->rc_in_persist)
+ int in_rec, dup_ack_struck = 0;
+
+ in_rec = IN_FASTRECOVERY(tp->t_flags);
+ if (rack->rc_in_persist) {
tp->t_rxtshift = 0;
- if ((th->th_ack == tp->snd_una) && (tiwin == tp->snd_wnd))
+ RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
+ rack_rto_min, rack_rto_max);
+ }
+ if ((th->th_ack == tp->snd_una) && (tiwin == tp->snd_wnd)) {
rack_strike_dupack(rack);
- rack_log_ack(tp, to, th);
+ dup_ack_struck = 1;
+ }
+ rack_log_ack(tp, to, th, ((in_rec == 0) && IN_FASTRECOVERY(tp->t_flags)), dup_ack_struck);
}
if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
/*
* Old ack, behind (or duplicate to) the last one rcv'd
- * Note: Should mark reordering is occuring! We should also
- * look for sack blocks arriving e.g. ack 1, 4-4 then ack 1,
- * 3-3, 4-4 would be reording. As well as ack 1, 3-3 <no
- * retran and> ack 3
+ * Note: We mark reordering as occurring if the ack is
+ * less than snd_una and we have not closed our window.
*/
+ if (SEQ_LT(th->th_ack, tp->snd_una) && (sbspace(&so->so_rcv) > ctf_fixed_maxseg(tp))) {
+ counter_u64_add(rack_reorder_seen, 1);
+ rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
+ }
return (0);
}
/*
@@ -8267,15 +9707,18 @@
* retransmit in the first place. Recover our original cwnd and
* ssthresh, and proceed to transmit where we left off.
*/
- if (tp->t_flags & TF_PREVVALID) {
+ if ((tp->t_flags & TF_PREVVALID) &&
+ ((tp->t_flags & TF_RCVD_TSTMP) == 0)) {
tp->t_flags &= ~TF_PREVVALID;
if (tp->t_rxtshift == 1 &&
(int)(ticks - tp->t_badrxtwin) < 0)
- rack_cong_signal(tp, th, CC_RTO_ERR);
+ rack_cong_signal(tp, CC_RTO_ERR, th->th_ack);
}
if (acked) {
/* assure we are not backed off */
tp->t_rxtshift = 0;
+ RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
+ rack_rto_min, rack_rto_max);
rack->rc_tlp_in_progress = 0;
rack->r_ctl.rc_tlp_cnt_out = 0;
/*
@@ -8311,15 +9754,13 @@
*ofia = ourfinisacked;
return (0);
}
- if (rack->r_ctl.rc_early_recovery) {
- if (IN_RECOVERY(tp->t_flags)) {
- if (SEQ_LT(th->th_ack, tp->snd_recover) &&
- (SEQ_LT(th->th_ack, tp->snd_max))) {
- tcp_rack_partialack(tp, th);
- } else {
- rack_post_recovery(tp, th);
- recovery = 1;
- }
+ if (IN_RECOVERY(tp->t_flags)) {
+ if (SEQ_LT(th->th_ack, tp->snd_recover) &&
+ (SEQ_LT(th->th_ack, tp->snd_max))) {
+ tcp_rack_partialack(tp);
+ } else {
+ rack_post_recovery(tp, th->th_ack);
+ recovery = 1;
}
}
/*
@@ -8327,7 +9768,7 @@
* related information. This typically means increasing the
* congestion window.
*/
- rack_ack_received(tp, rack, th, nsegs, CC_ACK, recovery);
+ rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, recovery);
SOCKBUF_LOCK(&so->so_snd);
acked_amount = min(acked, (int)sbavail(&so->so_snd));
tp->snd_wnd -= acked_amount;
@@ -8344,20 +9785,13 @@
*/
ourfinisacked = 1;
}
+ tp->snd_una = th->th_ack;
+ if (acked_amount && sbavail(&so->so_snd))
+ rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una);
+ rack_log_wakeup(tp, rack, &so->so_snd, acked, 2);
SOCKBUF_UNLOCK(&so->so_snd);
tp->t_flags |= TF_WAKESOW;
m_freem(mfree);
- if (rack->r_ctl.rc_early_recovery == 0) {
- if (IN_RECOVERY(tp->t_flags)) {
- if (SEQ_LT(th->th_ack, tp->snd_recover) &&
- (SEQ_LT(th->th_ack, tp->snd_max))) {
- tcp_rack_partialack(tp, th);
- } else {
- rack_post_recovery(tp, th);
- }
- }
- }
- tp->snd_una = th->th_ack;
if (SEQ_GT(tp->snd_una, tp->snd_recover))
tp->snd_recover = tp->snd_una;
@@ -8374,7 +9808,10 @@
}
if (tp->snd_una == tp->snd_max) {
/* Nothing left outstanding */
+ tp->t_flags &= ~TF_PREVVALID;
rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
+ rack->r_ctl.retran_during_recovery = 0;
+ rack->r_ctl.dsack_byte_cnt = 0;
if (rack->r_ctl.rc_went_idle_time == 0)
rack->r_ctl.rc_went_idle_time = 1;
rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
@@ -8389,7 +9826,7 @@
(tp->t_flags2 & TF2_DROP_AF_DATA)) {
/*
* The socket was gone and the
- * peer sent data, time to
+ * peer sent data (now or in the past), time to
* reset him.
*/
*ret_val = 1;
@@ -8457,6 +9894,7 @@
nrsm, insret, rack, rsm);
}
#endif
+ rack_log_map_chg(rack->rc_tp, rack, NULL, rsm, nrsm, MAP_SPLIT, max_seq, __LINE__);
if (rsm->r_in_tmap) {
TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
nrsm->r_in_tmap = 1;
@@ -8471,8 +9909,8 @@
counter_u64_add(rack_collapsed_win, 1);
RB_FOREACH_FROM(nrsm, rack_rb_tree_head, rsm) {
nrsm->r_flags |= RACK_RWND_COLLAPSED;
- rack->rc_has_collapsed = 1;
}
+ rack->rc_has_collapsed = 1;
}
static void
@@ -8514,6 +9952,40 @@
}
}
}
+
+static void
+rack_validate_fo_sendwin_up(struct tcpcb *tp, struct tcp_rack *rack)
+{
+ /*
+ * If fast output is in progress, let's validate that
+ * the new window did not shrink on us; if it did,
+ * fast output may need to be trimmed or stopped.
+ */
+ if (rack->r_fast_output) {
+ uint32_t out;
+
+ /*
+ * Calculate what we will send if left as is
+ * and compare that to our send window.
+ */
+ out = ctf_outstanding(tp);
+ if ((out + rack->r_ctl.fsb.left_to_send) > tp->snd_wnd) {
+ /* ok we have an issue */
+ if (out >= tp->snd_wnd) {
+ /* Turn off fast output the window is met or collapsed */
+ rack->r_fast_output = 0;
+ } else {
+ /* we have some room left */
+ rack->r_ctl.fsb.left_to_send = tp->snd_wnd - out;
+ if (rack->r_ctl.fsb.left_to_send < ctf_fixed_maxseg(tp)) {
+ /* If not at least 1 full segment never mind */
+ rack->r_fast_output = 0;
+ }
+ }
+ }
+ }
+}
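Put another way, a window update can invalidate a fast-output block either completely (the window is met or collapsed) or partially (only a sub-segment remainder fits, which is not worth keeping). A minimal sketch of that decision, with illustrative names:

/* Sketch of rack_validate_fo_sendwin_up()'s decision; not kernel code.
 * Returns the bytes the fast path may still send, or 0 to disable it. */
#include <stdint.h>

static uint32_t
clamp_fast_output(uint32_t left_to_send, uint32_t outstanding,
    uint32_t snd_wnd, uint32_t maxseg)
{
	if (outstanding + left_to_send <= snd_wnd)
		return (left_to_send);		/* still fits, unchanged   */
	if (outstanding >= snd_wnd)
		return (0);			/* window met or collapsed */
	left_to_send = snd_wnd - outstanding;	/* partial room remains    */
	return ((left_to_send >= maxseg) ? left_to_send : 0);
}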
+
/*
* Return value of 1, the TCB is unlocked and most
* likely gone, return value of 0, the TCP is still
@@ -8544,6 +10016,7 @@
tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
KMOD_TCPSTAT_INC(tcps_rcvwinupd);
tp->snd_wnd = tiwin;
+ rack_validate_fo_sendwin_up(tp, rack);
tp->snd_wl1 = th->th_seq;
tp->snd_wl2 = th->th_ack;
if (tp->snd_wnd > tp->max_sndwnd)
@@ -8552,6 +10025,7 @@
} else if (thflags & TH_ACK) {
if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) {
tp->snd_wnd = tiwin;
+ rack_validate_fo_sendwin_up(tp, rack);
tp->snd_wl1 = th->th_seq;
tp->snd_wl2 = th->th_ack;
}
@@ -8583,7 +10057,6 @@
* nothing is outstanding, and there is
* data to send. Enter persists.
*/
- tp->snd_nxt = tp->snd_una;
rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
}
if (tp->t_flags2 & TF2_DROP_AF_DATA) {
@@ -8665,6 +10138,8 @@
appended =
#endif
sbappendstream_locked(&so->so_rcv, m, 0);
+
+ rack_log_wakeup(tp, rack, &so->so_rcv, tlen, 1);
SOCKBUF_UNLOCK(&so->so_rcv);
tp->t_flags |= TF_WAKESOR;
#ifdef NETFLIX_SB_LIMITS
@@ -8680,47 +10155,48 @@
* trimming from the head.
*/
tcp_seq temp = save_start;
+
thflags = tcp_reass(tp, th, &temp, &tlen, m);
tp->t_flags |= TF_ACKNOW;
}
if ((tp->t_flags & TF_SACK_PERMIT) &&
(save_tlen > 0) &&
TCPS_HAVEESTABLISHED(tp->t_state)) {
- if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) {
- /*
- * DSACK actually handled in the fastpath
- * above.
- */
+ if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) {
+ /*
+ * DSACK actually handled in the fastpath
+ * above.
+ */
RACK_OPTS_INC(tcp_sack_path_1);
- tcp_update_sack_list(tp, save_start,
- save_start + save_tlen);
- } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) {
- if ((tp->rcv_numsacks >= 1) &&
- (tp->sackblks[0].end == save_start)) {
- /*
- * Partial overlap, recorded at todrop
- * above.
- */
+ tcp_update_sack_list(tp, save_start,
+ save_start + save_tlen);
+ } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) {
+ if ((tp->rcv_numsacks >= 1) &&
+ (tp->sackblks[0].end == save_start)) {
+ /*
+ * Partial overlap, recorded at todrop
+ * above.
+ */
RACK_OPTS_INC(tcp_sack_path_2a);
- tcp_update_sack_list(tp,
- tp->sackblks[0].start,
- tp->sackblks[0].end);
- } else {
+ tcp_update_sack_list(tp,
+ tp->sackblks[0].start,
+ tp->sackblks[0].end);
+ } else {
RACK_OPTS_INC(tcp_sack_path_2b);
- tcp_update_dsack_list(tp, save_start,
- save_start + save_tlen);
- }
- } else if (tlen >= save_tlen) {
- /* Update of sackblks. */
+ tcp_update_dsack_list(tp, save_start,
+ save_start + save_tlen);
+ }
+ } else if (tlen >= save_tlen) {
+ /* Update of sackblks. */
RACK_OPTS_INC(tcp_sack_path_3);
- tcp_update_dsack_list(tp, save_start,
- save_start + save_tlen);
- } else if (tlen > 0) {
+ tcp_update_dsack_list(tp, save_start,
+ save_start + save_tlen);
+ } else if (tlen > 0) {
RACK_OPTS_INC(tcp_sack_path_4);
- tcp_update_dsack_list(tp, save_start,
- save_start + tlen);
- }
- }
+ tcp_update_dsack_list(tp, save_start,
+ save_start + tlen);
+ }
+ }
} else {
m_freem(m);
thflags &= ~TH_FIN;
@@ -8926,6 +10402,7 @@
sbappendstream_locked(&so->so_rcv, m, 0);
ctf_calc_rwin(so, tp);
}
+ rack_log_wakeup(tp, rack, &so->so_rcv, tlen, 1);
SOCKBUF_UNLOCK(&so->so_rcv);
tp->t_flags |= TF_WAKESOR;
#ifdef NETFLIX_SB_LIMITS
@@ -9000,15 +10477,16 @@
return (0);
}
/* Ok if we reach here, we can process a fast-ack */
- if (rack->rc_gp_filled &&
+ if (rack->gp_ready &&
(rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
under_pacing = 1;
}
nsegs = max(1, m->m_pkthdr.lro_nsegs);
- rack_log_ack(tp, to, th);
+ rack_log_ack(tp, to, th, 0, 0);
/* Did the window get updated? */
if (tiwin != tp->snd_wnd) {
tp->snd_wnd = tiwin;
+ rack_validate_fo_sendwin_up(tp, rack);
tp->snd_wl1 = th->th_seq;
if (tp->snd_wnd > tp->max_sndwnd)
tp->max_sndwnd = tp->snd_wnd;
@@ -9032,7 +10510,6 @@
* nothing is outstanding, and there is
* data to send. Enter persists.
*/
- tp->snd_nxt = tp->snd_una;
rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
}
/*
@@ -9053,11 +10530,12 @@
/*
* "bad retransmit" recovery.
*/
- if (tp->t_flags & TF_PREVVALID) {
+ if ((tp->t_flags & TF_PREVVALID) &&
+ ((tp->t_flags & TF_RCVD_TSTMP) == 0)) {
tp->t_flags &= ~TF_PREVVALID;
if (tp->t_rxtshift == 1 &&
(int)(ticks - tp->t_badrxtwin) < 0)
- rack_cong_signal(tp, th, CC_RTO_ERR);
+ rack_cong_signal(tp, CC_RTO_ERR, th->th_ack);
}
/*
* Recalculate the transmit timer / rtt.
@@ -9072,13 +10550,25 @@
/* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
hhook_run_tcp_est_in(tp, th, to);
#endif
-
KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs);
KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
- sbdrop(&so->so_snd, acked);
if (acked) {
- /* assure we are not backed off */
+ struct mbuf *mfree;
+
+ rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, 0);
+ SOCKBUF_LOCK(&so->so_snd);
+ mfree = sbcut_locked(&so->so_snd, acked);
+ tp->snd_una = th->th_ack;
+ /* Note we want to hold the sb lock through the sendmap adjust */
+ rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una);
+ /* Wake up the socket if we have room to write more */
+ rack_log_wakeup(tp, rack, &so->so_snd, acked, 2);
+ SOCKBUF_UNLOCK(&so->so_snd);
+ tp->t_flags |= TF_WAKESOW;
+ m_freem(mfree);
tp->t_rxtshift = 0;
+ RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
+ rack_rto_min, rack_rto_max);
rack->rc_tlp_in_progress = 0;
rack->r_ctl.rc_tlp_cnt_out = 0;
/*
@@ -9096,9 +10586,6 @@
* related information. This typically means increasing the
* congestion window.
*/
- rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0);
-
- tp->snd_una = th->th_ack;
if (tp->snd_wnd < ctf_outstanding(tp)) {
/* The peer collapsed the window */
rack_collapsed_window(rack);
@@ -9135,6 +10622,9 @@
rack_check_bottom_drag(tp, rack, so, acked);
}
if (tp->snd_una == tp->snd_max) {
+ tp->t_flags &= ~TF_PREVVALID;
+ rack->r_ctl.retran_during_recovery = 0;
+ rack->r_ctl.dsack_byte_cnt = 0;
rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
if (rack->r_ctl.rc_went_idle_time == 0)
rack->r_ctl.rc_went_idle_time = 1;
@@ -9143,8 +10633,8 @@
tp->t_acktime = 0;
rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
}
- /* Wake up the socket if we have room to write more */
- tp->t_flags |= TF_WAKESOW;
+ if (acked && rack->r_fast_output)
+ rack_gain_for_fastoutput(rack, tp, so, (uint32_t)acked);
if (sbavail(&so->so_snd)) {
rack->r_wanted_output = 1;
}
@@ -9312,12 +10802,14 @@
if (thflags & TH_ACK) {
/* For syn-sent we need to possibly update the rtt */
if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
- uint32_t t;
+ uint32_t t, mcts;
- t = tcp_ts_getticks() - to->to_tsecr;
+ mcts = tcp_ts_getticks();
+ t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC;
if (!tp->t_rttlow || tp->t_rttlow > t)
tp->t_rttlow = t;
- tcp_rack_xmit_timer(rack, t + 1, 1, (t * HPTS_USEC_IN_MSEC), 0, NULL, 2);
+ rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 4);
+ tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2);
tcp_rack_xmit_timer_commit(rack, tp);
}
if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val))
@@ -9427,7 +10919,9 @@
ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
return (1);
}
- if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
+ if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
+ &rack->r_ctl.challenge_ack_ts,
+ &rack->r_ctl.challenge_ack_cnt)) {
return (ret_val);
}
/*
@@ -9452,6 +10946,7 @@
tp->ts_recent = to->to_tsval;
}
tp->snd_wnd = tiwin;
+ rack_validate_fo_sendwin_up(tp, rack);
/*
* If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
* is on (half-synchronized state), then queue data for later
@@ -9513,12 +11008,14 @@
tp->snd_wl1 = th->th_seq - 1;
/* For syn-recv we need to possibly update the rtt */
if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
- uint32_t t;
+ uint32_t t, mcts;
- t = tcp_ts_getticks() - to->to_tsecr;
+ mcts = tcp_ts_getticks();
+ t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC;
if (!tp->t_rttlow || tp->t_rttlow > t)
tp->t_rttlow = t;
- tcp_rack_xmit_timer(rack, t + 1, 1, (t * HPTS_USEC_IN_MSEC), 0, NULL, 2);
+ rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 5);
+ tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2);
tcp_rack_xmit_timer_commit(rack, tp);
}
if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
@@ -9622,7 +11119,9 @@
if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
return (ret_val);
}
- if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
+ if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
+ &rack->r_ctl.challenge_ack_ts,
+ &rack->r_ctl.challenge_ack_cnt)) {
return (ret_val);
}
/*
@@ -9658,7 +11157,7 @@
} else if (tp->t_flags & TF_ACKNOW) {
ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
- ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output= 1;
+ ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
return (ret_val);
} else {
ctf_do_drop(m, NULL);
@@ -9695,7 +11194,9 @@
uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
{
int32_t ret_val = 0;
+ struct tcp_rack *rack;
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
ctf_calc_rwin(so, tp);
if ((thflags & TH_RST) ||
(tp->t_fin_is_rst && (thflags & TH_FIN)))
@@ -9717,7 +11218,9 @@
if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
return (ret_val);
}
- if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
+ if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
+ &rack->r_ctl.challenge_ack_ts,
+ &rack->r_ctl.challenge_ack_cnt)) {
return (ret_val);
}
/*
@@ -9819,7 +11322,9 @@
{
int32_t ret_val = 0;
int32_t ourfinisacked = 0;
+ struct tcp_rack *rack;
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
ctf_calc_rwin(so, tp);
if ((thflags & TH_RST) ||
@@ -9842,7 +11347,9 @@
if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
return (ret_val);
}
- if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
+ if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
+ &rack->r_ctl.challenge_ack_ts,
+ &rack->r_ctl.challenge_ack_cnt)) {
return (ret_val);
}
/*
@@ -9942,7 +11449,9 @@
{
int32_t ret_val = 0;
int32_t ourfinisacked = 0;
+ struct tcp_rack *rack;
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
ctf_calc_rwin(so, tp);
if ((thflags & TH_RST) ||
@@ -9965,7 +11474,9 @@
if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
return (ret_val);
}
- if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
+ if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
+ &rack->r_ctl.challenge_ack_ts,
+ &rack->r_ctl.challenge_ack_cnt)) {
return (ret_val);
}
/*
@@ -10008,7 +11519,7 @@
tiwin, thflags, nxt_pkt));
} else if (tp->t_flags & TF_ACKNOW) {
ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
- ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output= 1;
+ ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
return (ret_val);
} else {
ctf_do_drop(m, NULL);
@@ -10051,7 +11562,9 @@
{
int32_t ret_val = 0;
int32_t ourfinisacked = 0;
+ struct tcp_rack *rack;
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
ctf_calc_rwin(so, tp);
if ((thflags & TH_RST) ||
@@ -10074,7 +11587,9 @@
if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
return (ret_val);
}
- if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
+ if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
+ &rack->r_ctl.challenge_ack_ts,
+ &rack->r_ctl.challenge_ack_cnt)) {
return (ret_val);
}
/*
@@ -10160,7 +11675,9 @@
{
int32_t ret_val = 0;
int32_t ourfinisacked = 0;
+ struct tcp_rack *rack;
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
ctf_calc_rwin(so, tp);
/* Reset receive buffer auto scaling when not in bulk receive mode. */
@@ -10184,7 +11701,9 @@
if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
return (ret_val);
}
- if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
+ if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
+ &rack->r_ctl.challenge_ack_ts,
+ &rack->r_ctl.challenge_ack_cnt)) {
return (ret_val);
}
/*
@@ -10263,12 +11782,14 @@
}
static void
-rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line)
+rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override)
{
uint64_t bw_est, rate_wanted;
int chged = 0;
- uint32_t user_max;
+ uint32_t user_max, orig_min, orig_max;
+ orig_min = rack->r_ctl.rc_pace_min_segs;
+ orig_max = rack->r_ctl.rc_pace_max_segs;
user_max = ctf_fixed_maxseg(tp) * rack->rc_user_set_max_segs;
if (ctf_fixed_maxseg(tp) != rack->r_ctl.rc_pace_min_segs)
chged = 1;
@@ -10282,7 +11803,7 @@
} else if (rack->use_fixed_rate) {
bw_est = rack_get_bw(rack);
if ((rack->r_ctl.crte == NULL) ||
- (bw_est != rack->r_ctl.crte->rate)) {
+ (bw_est != rack->r_ctl.crte->rate)) {
rack->r_ctl.rc_pace_max_segs = user_max;
} else {
/* We are pacing right at the hardware rate */
@@ -10305,7 +11826,10 @@
bw_est = rack_get_bw(rack);
orig = rack->r_ctl.rc_pace_max_segs;
- rate_wanted = rack_get_output_bw(rack, bw_est, NULL);
+ if (fill_override)
+ rate_wanted = *fill_override;
+ else
+ rate_wanted = rack_get_output_bw(rack, bw_est, NULL, NULL);
if (rate_wanted) {
/* We have something */
rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack,
@@ -10330,7 +11854,78 @@
rack->r_ctl.rc_pace_max_segs = PACE_MAX_IP_BYTES;
}
if (chged)
- rack_log_type_hrdwtso(tp, rack, 0, rack->rc_inp->inp_socket->so_snd.sb_flags, line, 2);
+ rack_log_type_pacing_sizes(tp, rack, orig_min, orig_max, line, 2);
+}
+
+
+static void
+rack_init_fsb_block(struct tcpcb *tp, struct tcp_rack *rack)
+{
+#ifdef INET6
+ struct ip6_hdr *ip6 = NULL;
+#endif
+#ifdef INET
+ struct ip *ip = NULL;
+#endif
+ struct udphdr *udp = NULL;
+
+ /* OK, let's fill in the fast send block; it can only be used with no IP options! */
+#ifdef INET6
+ if (rack->r_is_v6) {
+ rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
+ ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
+ if (tp->t_port) {
+ udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr));
+ udp->uh_sport = htons(V_tcp_udp_tunneling_port);
+ udp->uh_dport = tp->t_port;
+ rack->r_ctl.fsb.udp = udp;
+ rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1);
+ } else
+ {
+ rack->r_ctl.fsb.th = (struct tcphdr *)(ip6 + 1);
+ rack->r_ctl.fsb.udp = NULL;
+ }
+ tcpip_fillheaders(rack->rc_inp,
+ tp->t_port,
+ ip6, rack->r_ctl.fsb.th);
+ } else
+#endif /* INET6 */
+ {
+ rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr);
+ ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
+ if (tp->t_port) {
+ udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip));
+ udp->uh_sport = htons(V_tcp_udp_tunneling_port);
+ udp->uh_dport = tp->t_port;
+ rack->r_ctl.fsb.udp = udp;
+ rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1);
+ } else
+ {
+ rack->r_ctl.fsb.udp = NULL;
+ rack->r_ctl.fsb.th = (struct tcphdr *)(ip + 1);
+ }
+ tcpip_fillheaders(rack->rc_inp,
+ tp->t_port,
+ ip, rack->r_ctl.fsb.th);
+ }
+ rack->r_fsb_inited = 1;
+}
+
+static int
+rack_init_fsb(struct tcpcb *tp, struct tcp_rack *rack)
+{
+ /* Allocate the larger of spaces V6 if available else just V4 */
+ rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr);
+#ifdef INET6
+ rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
+#endif
+ rack->r_ctl.fsb.tcp_ip_hdr = malloc(rack->r_ctl.fsb.tcp_ip_hdr_len,
+ M_TCPFSB, M_NOWAIT|M_ZERO);
+ if (rack->r_ctl.fsb.tcp_ip_hdr == NULL) {
+ return (ENOMEM);
+ }
+ rack->r_fsb_inited = 0;
+ return (0);
}
static int
@@ -10339,6 +11934,7 @@
struct tcp_rack *rack = NULL;
struct rack_sendmap *insret;
uint32_t iwin, snt, us_cts;
+ int err;
tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT);
if (tp->t_fb_ptr == NULL) {
@@ -10357,11 +11953,27 @@
TAILQ_INIT(&rack->r_ctl.rc_free);
TAILQ_INIT(&rack->r_ctl.rc_tmap);
rack->rc_tp = tp;
- if (tp->t_inpcb) {
- rack->rc_inp = tp->t_inpcb;
- }
+ rack->rc_inp = tp->t_inpcb;
+ /* Set the flag */
+ rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
/* Probably not needed but lets be sure */
rack_clear_rate_sample(rack);
+ /*
+ * Save off the default values; socket options will poke
+ * at these if pacing is not on, or if we have not yet
+ * reached the point where pacing turns on (gp_ready/fixed enabled).
+ * When those values are pushed into the CC module (when gp_ready
+ * is set or fixed-rate pacing is enabled), we place the old CC
+ * values in here so we have something to restore, and set the
+ * flag rc_pacing_cc_set. That way, whenever we turn off pacing
+ * or switch away from this stack, we know to go restore the
+ * saved values.
+ */
+ rack->r_ctl.rc_saved_beta.beta = V_newreno_beta;
+ rack->r_ctl.rc_saved_beta.beta_ecn = V_newreno_beta_ecn;
+ /* We want abe like behavior as well */
+ rack->r_ctl.rc_saved_beta.newreno_flags = CC_NEWRENO_BETA_ECN;
rack->r_ctl.rc_reorder_fade = rack_reorder_fade;
rack->rc_allow_data_af_clo = rack_ignore_data_after_close;
rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh;
@@ -10371,38 +11983,50 @@
tp->t_delayed_ack = 1;
else
tp->t_delayed_ack = 0;
+#ifdef TCP_ACCOUNTING
+ if (rack_tcp_accounting) {
+ tp->t_flags2 |= TF2_TCP_ACCOUNTING;
+ }
+#endif
if (rack_enable_shared_cwnd)
rack->rack_enable_scwnd = 1;
rack->rc_user_set_max_segs = rack_hptsi_segments;
rack->rc_force_max_seg = 0;
if (rack_use_imac_dack)
rack->rc_dack_mode = 1;
+ TAILQ_INIT(&rack->r_ctl.opt_list);
rack->r_ctl.rc_reorder_shift = rack_reorder_thresh;
rack->r_ctl.rc_pkt_delay = rack_pkt_delay;
- rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce;
- rack->r_ctl.rc_prop_rate = rack_proportional_rate;
rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp;
- rack->r_ctl.rc_early_recovery = rack_early_recovery;
rack->r_ctl.rc_lowest_us_rtt = 0xffffffff;
rack->r_ctl.rc_highest_us_rtt = 0;
+ rack->r_ctl.bw_rate_cap = rack_bw_rate_cap;
+ if (rack_use_cmp_acks)
+ rack->r_use_cmp_ack = 1;
if (rack_disable_prr)
rack->rack_no_prr = 1;
if (rack_gp_no_rec_chg)
rack->rc_gp_no_rec_chg = 1;
- rack->rc_always_pace = rack_pace_every_seg;
- if (rack_enable_mqueue_for_nonpaced)
+ if (rack_pace_every_seg && tcp_can_enable_pacing()) {
+ rack->rc_always_pace = 1;
+ if (rack->use_fixed_rate || rack->gp_ready)
+ rack_set_cc_pacing(rack);
+ } else
+ rack->rc_always_pace = 0;
+ if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack)
rack->r_mbuf_queue = 1;
else
rack->r_mbuf_queue = 0;
- if (rack->r_mbuf_queue || rack->rc_always_pace)
+ if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack)
tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ;
else
tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
- rack_set_pace_segments(tp, rack, __LINE__);
+ rack_set_pace_segments(tp, rack, __LINE__, NULL);
if (rack_limits_scwnd)
- rack->r_limit_scw = 1;
+ rack->r_limit_scw = 1;
else
- rack->r_limit_scw = 0;
+ rack->r_limit_scw = 0;
+ rack->rc_labc = V_tcp_abc_l_var;
rack->r_ctl.rc_high_rwnd = tp->snd_wnd;
rack->r_ctl.cwnd_to_use = tp->snd_cwnd;
rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method;
@@ -10415,6 +12039,8 @@
rack->r_running_early = 0;
rack->rc_init_win = rack_default_init_window;
rack->r_ctl.rack_per_of_gp_ss = rack_per_of_gp_ss;
+ if (rack_hw_up_only)
+ rack->r_up_only = 1;
if (rack_do_dyn_mul) {
/* When dynamic adjustment is on CA needs to start at 100% */
rack->rc_gp_dyn_mul = 1;
@@ -10430,7 +12056,17 @@
us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
rack->r_ctl.rc_time_of_last_probertt = us_cts;
+ rack->r_ctl.challenge_ack_ts = tcp_ts_getticks();
rack->r_ctl.rc_time_probertt_starts = 0;
+ /* We require at least one measurement, even if the sysctl is 0 */
+ if (rack_req_measurements)
+ rack->r_ctl.req_measurements = rack_req_measurements;
+ else
+ rack->r_ctl.req_measurements = 1;
+ if (rack_enable_hw_pacing)
+ rack->rack_hdw_pace_ena = 1;
+ if (rack_hw_rate_caps)
+ rack->r_rack_hw_rate_caps = 1;
/* Do we force on detection? */
#ifdef NETFLIX_EXP_DETECTION
if (tcp_force_detection)
@@ -10440,6 +12076,12 @@
rack->do_detection = 0;
if (rack_non_rxt_use_cr)
rack->rack_rec_nonrxt_use_cr = 1;
+ err = rack_init_fsb(tp, rack);
+ if (err) {
+ uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
+ tp->t_fb_ptr = NULL;
+ return (err);
+ }
if (tp->snd_una != tp->snd_max) {
/* Create a send map for the current outstanding data */
struct rack_sendmap *rsm;
@@ -10450,19 +12092,36 @@
tp->t_fb_ptr = NULL;
return (ENOMEM);
}
- rsm->r_flags = RACK_OVERMAX;
- rsm->r_tim_lastsent[0] = rack->r_ctl.rc_tlp_rxt_last_time;
+ rsm->r_no_rtt_allowed = 1;
+ rsm->r_tim_lastsent[0] = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
rsm->r_rtr_cnt = 1;
rsm->r_rtr_bytes = 0;
- rsm->r_start = tp->snd_una;
if (tp->t_flags & TF_SENTFIN) {
rsm->r_end = tp->snd_max - 1;
rsm->r_flags |= RACK_HAS_FIN;
} else {
rsm->r_end = tp->snd_max;
}
- rsm->usec_orig_send = us_cts;
+ if (tp->snd_una == tp->iss) {
+ /* The data space is one beyond snd_una */
+ rsm->r_flags |= RACK_HAS_SYN;
+ rsm->r_start = tp->iss;
+ rsm->r_end = rsm->r_start + (tp->snd_max - tp->snd_una);
+ } else
+ rsm->r_start = tp->snd_una;
rsm->r_dupack = 0;
+ if (rack->rc_inp->inp_socket->so_snd.sb_mb != NULL) {
+ rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 0, &rsm->soff);
+ rsm->orig_m_len = rsm->m->m_len;
+ } else {
+ /*
+ * This can happen if we have a stand-alone FIN or
+ * SYN.
+ */
+ rsm->m = NULL;
+ rsm->orig_m_len = 0;
+ rsm->soff = 0;
+ }
insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
#ifdef INVARIANTS
if (insret != NULL) {
@@ -10473,8 +12132,61 @@
TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
rsm->r_in_tmap = 1;
}
- /* Cancel the GP measurement in progress */
- tp->t_flags &= ~TF_GPUTINPROG;
+ /*
+ * Timers in Rack are kept in microseconds, so let's
+ * convert any initial incoming variables
+ * from ticks into usecs. Note that we
+ * also change the values of t_srtt and t_rttvar, if
+ * they are non-zero. They are kept with a 5
+ * bit decimal so we have to carefully convert
+ * these to get the full precision.
+ */
+ if (tp->t_srtt > 1) {
+ uint32_t val, frac;
+
+ val = tp->t_srtt >> TCP_RTT_SHIFT;
+ frac = tp->t_srtt & 0x1f;
+ tp->t_srtt = TICKS_2_USEC(val);
+ /*
+ * frac is the fractional part of the srtt (if any)
+ * but it's in ticks and every bit represents
+ * 1/32nd of a tick.
+ */
+ if (frac) {
+ if (hz == 1000) {
+ frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE);
+ } else {
+ frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE));
+ }
+ tp->t_srtt += frac;
+ }
+ }
+ if (tp->t_rttvar) {
+ uint32_t val, frac;
+
+ val = tp->t_rttvar >> TCP_RTTVAR_SHIFT;
+ frac = tp->t_rttvar & 0x1f;
+ tp->t_rttvar = TICKS_2_USEC(val);
+ /*
+ * frac is the fractional part of the rttvar (if any)
+ * but it's in ticks and every bit represents
+ * 1/32nd of a tick.
+ */
+ if (frac) {
+ if (hz == 1000) {
+ frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE);
+ } else {
+ frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE));
+ }
+ tp->t_rttvar += frac;
+ }
+ }
+ tp->t_rxtcur = TICKS_2_USEC(tp->t_rxtcur);
+ tp->t_rttlow = TICKS_2_USEC(tp->t_rttlow);
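Since t_srtt carries a 5-bit binary fraction (TCP_RTT_SCALE == 32) on top of the tick count, the conversion above splits the value and scales the two parts separately. A worked userspace example for the common hz == 1000 case; the constants below model TCP_RTT_SHIFT/TCP_RTT_SCALE rather than including the kernel headers:

/* Worked example of the ticks -> microseconds srtt conversion above,
 * assuming hz == 1000. Illustrative only. */
#include <stdint.h>
#include <stdio.h>

#define RTT_SHIFT	5
#define RTT_SCALE	32
#define USEC_PER_TICK	1000		/* hz == 1000 assumed */

int
main(void)
{
	uint32_t t_srtt = 335;		/* 10 ticks + 15/32 of a tick */
	uint32_t val  = t_srtt >> RTT_SHIFT;		/* 10 */
	uint32_t frac = t_srtt & (RTT_SCALE - 1);	/* 15 */
	uint32_t usec = val * USEC_PER_TICK +
	    (uint32_t)(((uint64_t)frac * USEC_PER_TICK) / RTT_SCALE);

	printf("t_srtt %u -> %u usec\n", t_srtt, usec);	/* 10468 usec */
	return (0);
}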
+ if (rack_def_profile)
+ rack_set_profile(rack, rack_def_profile);
+ /* Cancel the GP measurement in progress */
+ tp->t_flags &= ~TF_GPUTINPROG;
if (SEQ_GT(tp->snd_max, tp->iss))
snt = tp->snd_max - tp->iss;
else
@@ -10510,7 +12222,8 @@
tp->snd_ssthresh = 0xffffffff;
}
rack_stop_all_timers(tp);
- rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0);
+ /* Let's set up the fsb block */
+ rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0);
rack_log_rtt_shrinks(rack, us_cts, 0,
__LINE__, RACK_RTTS_INIT);
return (0);
@@ -10555,14 +12268,55 @@
return (EINVAL);
}
+
static void
rack_fini(struct tcpcb *tp, int32_t tcb_is_purged)
{
+ int ack_cmp = 0;
+
if (tp->t_fb_ptr) {
struct tcp_rack *rack;
struct rack_sendmap *rsm, *nrsm, *rm;
rack = (struct tcp_rack *)tp->t_fb_ptr;
+ if (tp->t_in_pkt) {
+ /*
+ * Since we are switching we need to process any
+ * inbound packets in case a compressed ack is
+ * in queue or the new stack does not support
+ * mbuf queuing. These packets in theory should
+ * have been handled by the old stack anyway.
+ */
+ if ((rack->rc_inp->inp_flags & (INP_DROPPED|INP_TIMEWAIT)) ||
+ (rack->rc_inp->inp_flags2 & INP_FREED)) {
+ /* Kill all the packets */
+ struct mbuf *save, *m;
+
+ m = tp->t_in_pkt;
+ tp->t_in_pkt = NULL;
+ tp->t_tail_pkt = NULL;
+ while (m) {
+ save = m->m_nextpkt;
+ m->m_nextpkt = NULL;
+ m_freem(m);
+ m = save;
+ }
+ } else {
+ /* Process all the packets */
+ ctf_do_queued_segments(rack->rc_inp->inp_socket, rack->rc_tp, 0);
+ }
+ if ((tp->t_inpcb) &&
+ (tp->t_inpcb->inp_flags2 & INP_MBUF_ACKCMP))
+ ack_cmp = 1;
+ if (ack_cmp) {
+ /* Tally whether large or small mbufs were used (ack-cmp was in effect). */
+ if (rack->rc_inp->inp_flags2 & INP_MBUF_L_ACKS)
+ counter_u64_add(rack_large_ackcmp, 1);
+ else
+ counter_u64_add(rack_small_ackcmp, 1);
+ }
+ }
+ tp->t_flags &= ~TF_FORCEDATA;
#ifdef NETFLIX_SHARED_CWND
if (rack->r_ctl.rc_scw) {
uint32_t limit;
@@ -10577,12 +12331,74 @@
rack->r_ctl.rc_scw = NULL;
}
#endif
+ if (rack->r_ctl.fsb.tcp_ip_hdr) {
+ free(rack->r_ctl.fsb.tcp_ip_hdr, M_TCPFSB);
+ rack->r_ctl.fsb.tcp_ip_hdr = NULL;
+ rack->r_ctl.fsb.th = NULL;
+ }
+ /* Convert back to ticks, restoring the 5-bit fraction. */
+ if (tp->t_srtt > 1) {
+ uint32_t val, frac;
+
+ val = USEC_2_TICKS(tp->t_srtt);
+ frac = tp->t_srtt % (HPTS_USEC_IN_SEC / hz);
+ tp->t_srtt = val << TCP_RTT_SHIFT;
+ /*
+ * frac is the fractional part left over
+ * from converting to hz and shifting.
+ * We need to convert this to the 5 bit
+ * remainder.
+ */
+ if (frac) {
+ if (hz == 1000) {
+ frac = (((uint64_t)frac * (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC);
+ } else {
+ frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC);
+ }
+ tp->t_srtt += frac;
+ }
+ }
+ if (tp->t_rttvar) {
+ uint32_t val, frac;
+
+ val = USEC_2_TICKS(tp->t_rttvar);
+ frac = tp->t_rttvar % (HPTS_USEC_IN_SEC / hz);
+ tp->t_rttvar = val << TCP_RTTVAR_SHIFT;
+ /*
+ * frac is the fractional part left over
+ * from converting to hz and shifting.
+ * We need to convert this to the 5 bit
+ * remainder.
+ */
+ if (frac) {
+ if (hz == 1000) {
+ frac = (((uint64_t)frac * (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC);
+ } else {
+ frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC);
+ }
+ tp->t_rttvar += frac;
+ }
+ }
+ tp->t_rxtcur = USEC_2_TICKS(tp->t_rxtcur);
+ tp->t_rttlow = USEC_2_TICKS(tp->t_rttlow);
+ if (rack->rc_always_pace) {
+ tcp_decrement_paced_conn();
+ rack_undo_cc_pacing(rack);
+ rack->rc_always_pace = 0;
+ }
+ /* Clean up any options if they were not applied */
+ while (!TAILQ_EMPTY(&rack->r_ctl.opt_list)) {
+ struct deferred_opt_list *dol;
+
+ dol = TAILQ_FIRST(&rack->r_ctl.opt_list);
+ TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next);
+ free(dol, M_TCPDO);
+ }
/* rack does not use force data but other stacks may clear it */
- tp->t_flags &= ~TF_FORCEDATA;
- if (tp->t_inpcb) {
- tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
- tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
- tp->t_inpcb->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
+ if (rack->r_ctl.crte != NULL) {
+ tcp_rel_pacing_rate(rack->r_ctl.crte, tp);
+ rack->rack_hdrw_pacing = 0;
+ rack->r_ctl.crte = NULL;
}
#ifdef TCP_BLACKBOX
tcp_log_flowend(tp);
@@ -10607,8 +12423,15 @@
uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
tp->t_fb_ptr = NULL;
}
- /* Cancel the GP measurement in progress */
- tp->t_flags &= ~TF_GPUTINPROG;
+ if (tp->t_inpcb) {
+ tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
+ tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
+ tp->t_inpcb->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
+ tp->t_inpcb->inp_flags2 &= ~INP_MBUF_ACKCMP;
+ /* Cancel the GP measurement in progress */
+ tp->t_flags &= ~TF_GPUTINPROG;
+ tp->t_inpcb->inp_flags2 &= ~INP_MBUF_L_ACKS;
+ }
/* Make sure snd_nxt is correctly set */
tp->snd_nxt = tp->snd_max;
}
@@ -10626,27 +12449,32 @@
rack->r_substate = rack_do_syn_recv;
break;
case TCPS_ESTABLISHED:
- rack_set_pace_segments(tp, rack, __LINE__);
+ rack_set_pace_segments(tp, rack, __LINE__, NULL);
rack->r_state = TCPS_ESTABLISHED;
rack->r_substate = rack_do_established;
break;
case TCPS_CLOSE_WAIT:
+ rack_set_pace_segments(tp, rack, __LINE__, NULL);
rack->r_state = TCPS_CLOSE_WAIT;
rack->r_substate = rack_do_close_wait;
break;
case TCPS_FIN_WAIT_1:
+ rack_set_pace_segments(tp, rack, __LINE__, NULL);
rack->r_state = TCPS_FIN_WAIT_1;
rack->r_substate = rack_do_fin_wait_1;
break;
case TCPS_CLOSING:
+ rack_set_pace_segments(tp, rack, __LINE__, NULL);
rack->r_state = TCPS_CLOSING;
rack->r_substate = rack_do_closing;
break;
case TCPS_LAST_ACK:
+ rack_set_pace_segments(tp, rack, __LINE__, NULL);
rack->r_state = TCPS_LAST_ACK;
rack->r_substate = rack_do_lastack;
break;
case TCPS_FIN_WAIT_2:
+ rack_set_pace_segments(tp, rack, __LINE__, NULL);
rack->r_state = TCPS_FIN_WAIT_2;
rack->r_substate = rack_do_fin_wait_2;
break;
@@ -10656,6 +12484,9 @@
default:
break;
};
+ if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state))
+ rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
+
}
static void
@@ -10738,63 +12569,67 @@
tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT);
}
rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
- rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0);
+ rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0);
}
-static int
-rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
- struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos,
- int32_t nxt_pkt, struct timeval *tv)
-{
- int32_t thflags, retval, did_out = 0;
- int32_t way_out = 0;
- uint32_t cts;
- uint32_t tiwin;
- struct timespec ts;
- struct tcpopt to;
- struct tcp_rack *rack;
- struct rack_sendmap *rsm;
- int32_t prev_state = 0;
- uint32_t us_cts;
- /*
- * tv passed from common code is from either M_TSTMP_LRO or
- * tcp_get_usecs() if no LRO m_pkthdr timestamp is present. The
- * rack_pacing stack assumes tv always refers to 'now', so we overwrite
- * tv here to guarantee that.
- */
- if (m->m_flags & M_TSTMP_LRO)
- tcp_get_usecs(tv);
- cts = tcp_tv_to_mssectick(tv);
- rack = (struct tcp_rack *)tp->t_fb_ptr;
+static void
+rack_do_win_updates(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tiwin, uint32_t seq, uint32_t ack, uint32_t cts, uint32_t high_seq)
+{
+ tp->snd_wnd = tiwin;
+ rack_validate_fo_sendwin_up(tp, rack);
+ tp->snd_wl1 = seq;
+ tp->snd_wl2 = ack;
+ if (tp->snd_wnd > tp->max_sndwnd)
+ tp->max_sndwnd = tp->snd_wnd;
+ if (tp->snd_wnd < (tp->snd_max - high_seq)) {
+ /* The peer collapsed the window */
+ rack_collapsed_window(rack);
+ } else if (rack->rc_has_collapsed)
+ rack_un_collapse_window(rack);
+ /* Do we exit persists? */
+ if ((rack->rc_in_persist != 0) &&
+ (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
+ rack->r_ctl.rc_pace_min_segs))) {
+ rack_exit_persist(tp, rack, cts);
+ }
+ /* Do we enter persists? */
+ if ((rack->rc_in_persist == 0) &&
+ (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
+ TCPS_HAVEESTABLISHED(tp->t_state) &&
+ (tp->snd_max == tp->snd_una) &&
+ sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
+ (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) {
+ /*
+ * Here the rwnd is less than
+ * the pacing size, we are established,
+ * nothing is outstanding, and there is
+ * data to send. Enter persists.
+ */
+ rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
+ }
+}
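The persist transitions above hinge on a single threshold: the smaller of half the highest receive window seen and the pacing minimum. A compact sketch of the enter/exit tests (names illustrative; callers additionally track whether the connection is already in persist):

/* Sketch of the persist enter/exit tests in rack_do_win_updates(). */
#include <stdbool.h>
#include <stdint.h>

static inline uint32_t
persist_threshold(uint32_t high_rwnd, uint32_t pace_min)
{
	uint32_t half = high_rwnd / 2;

	return ((half < pace_min) ? half : pace_min);
}

static bool
should_enter_persist(uint32_t snd_wnd, uint32_t high_rwnd, uint32_t pace_min,
    bool established, bool nothing_outstanding, uint32_t sb_avail)
{
	/* Window too small for pacing, nothing in flight, data waiting. */
	return (snd_wnd < persist_threshold(high_rwnd, pace_min) &&
	    established && nothing_outstanding &&
	    sb_avail > 0 && sb_avail > snd_wnd);
}

static bool
should_exit_persist(uint32_t snd_wnd, uint32_t high_rwnd, uint32_t pace_min)
{
	return (snd_wnd >= persist_threshold(high_rwnd, pace_min));
}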
- if ((m->m_flags & M_TSTMP) ||
- (m->m_flags & M_TSTMP_LRO)) {
- mbuf_tstmp2timespec(m, &ts);
- rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec;
- rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000;
- } else
- rack->r_ctl.act_rcv_time = *tv;
- kern_prefetch(rack, &prev_state);
- prev_state = 0;
- thflags = th->th_flags;
+static void
+rack_log_input_packet(struct tcpcb *tp, struct tcp_rack *rack, struct tcp_ackent *ae, int ackval, uint32_t high_seq)
+{
- NET_EPOCH_ASSERT();
- INP_WLOCK_ASSERT(tp->t_inpcb);
- KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
- __func__));
- KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
- __func__));
if (tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
struct timeval ltv;
+ char tcp_hdr_buf[60];
+ struct tcphdr *th;
+ struct timespec ts;
+ uint32_t orig_snd_una;
+ uint8_t xx = 0;
+
#ifdef NETFLIX_HTTP_LOGGING
struct http_sendfile_track *http_req;
- if (SEQ_GT(th->th_ack, tp->snd_una)) {
- http_req = tcp_http_find_req_for_seq(tp, (th->th_ack-1));
+ if (SEQ_GT(ae->ack, tp->snd_una)) {
+ http_req = tcp_http_find_req_for_seq(tp, (ae->ack-1));
} else {
- http_req = tcp_http_find_req_for_seq(tp, th->th_ack);
+ http_req = tcp_http_find_req_for_seq(tp, ae->ack);
}
#endif
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
@@ -10804,27 +12639,37 @@
log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
else
log.u_bbr.flex1 = 0;
+ log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
+ log.u_bbr.use_lt_bw <<= 1;
+ log.u_bbr.use_lt_bw |= rack->r_might_revert;
log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced;
- log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
- log.u_bbr.pkts_out = rack->rc_tp->t_maxseg;
- log.u_bbr.flex3 = m->m_flags;
+ log.u_bbr.inflight = ctf_flight_size(tp, rack->r_ctl.rc_sacked);
+ log.u_bbr.pkts_out = tp->t_maxseg;
log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
- if (m->m_flags & M_TSTMP) {
+ log.u_bbr.flex7 = 1;
+ log.u_bbr.lost = ae->flags;
+ log.u_bbr.cwnd_gain = ackval;
+ log.u_bbr.pacing_gain = 0x2;
+ if (ae->flags & TSTMP_HDWR) {
/* Record the hardware timestamp if present */
- mbuf_tstmp2timespec(m, &ts);
+ log.u_bbr.flex3 = M_TSTMP;
+ ts.tv_sec = ae->timestamp / 1000000000;
+ ts.tv_nsec = ae->timestamp % 1000000000;
ltv.tv_sec = ts.tv_sec;
ltv.tv_usec = ts.tv_nsec / 1000;
log.u_bbr.lt_epoch = tcp_tv_to_usectick(&ltv);
- } else if (m->m_flags & M_TSTMP_LRO) {
+ } else if (ae->flags & TSTMP_LRO) {
/* Record the LRO the arrival timestamp */
- mbuf_tstmp2timespec(m, &ts);
+ log.u_bbr.flex3 = M_TSTMP_LRO;
+ ts.tv_sec = ae->timestamp / 1000000000;
+ ts.tv_nsec = ae->timestamp % 1000000000;
ltv.tv_sec = ts.tv_sec;
ltv.tv_usec = ts.tv_nsec / 1000;
log.u_bbr.flex5 = tcp_tv_to_usectick(&ltv);
}
log.u_bbr.timeStamp = tcp_get_usecs(&ltv);
/* Log the rcv time */
- log.u_bbr.delRate = m->m_pkthdr.rcv_tstmp;
+ log.u_bbr.delRate = ae->timestamp;
#ifdef NETFLIX_HTTP_LOGGING
log.u_bbr.applimited = tp->t_http_closed;
log.u_bbr.applimited <<= 8;
@@ -10852,900 +12697,1961 @@
}
}
#endif
- TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
- tlen, &log, true, &ltv);
- }
- if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
- way_out = 4;
- retval = 0;
- goto done_with_input;
- }
- /*
- * If a segment with the ACK-bit set arrives in the SYN-SENT state
- * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9.
- */
- if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
- (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
- tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
- ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
- return(1);
+ memset(tcp_hdr_buf, 0, sizeof(tcp_hdr_buf));
+ th = (struct tcphdr *)tcp_hdr_buf;
+ th->th_seq = ae->seq;
+ th->th_ack = ae->ack;
+ th->th_win = ae->win;
+ /* Now fill in the ports */
+ th->th_sport = tp->t_inpcb->inp_fport;
+ th->th_dport = tp->t_inpcb->inp_lport;
+ th->th_flags = ae->flags & 0xff;
+ /* Now do we have a timestamp option? */
+ if (ae->flags & HAS_TSTMP) {
+ u_char *cp;
+ uint32_t val;
+
+ th->th_off = ((sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA) >> 2);
+ cp = (u_char *)(th + 1);
+ *cp = TCPOPT_NOP;
+ cp++;
+ *cp = TCPOPT_NOP;
+ cp++;
+ *cp = TCPOPT_TIMESTAMP;
+ cp++;
+ *cp = TCPOLEN_TIMESTAMP;
+ cp++;
+ val = htonl(ae->ts_value);
+ bcopy((char *)&val,
+ (char *)cp, sizeof(uint32_t));
+ val = htonl(ae->ts_echo);
+ bcopy((char *)&val,
+ (char *)(cp + 4), sizeof(uint32_t));
+ } else
+ th->th_off = (sizeof(struct tcphdr) >> 2);
+
+ /*
+ * For sane logging we need to play a little trick.
+ * If the ack were fully processed we would have moved
+ * snd_una to high_seq, but since compressed acks are
+ * processed in two phases, at this point (logging) snd_una
+ * won't be advanced. So we would see multiple acks showing
+ * the advancement. We can prevent that by "pretending" that
+ * snd_una was advanced and then un-advancing it so that the
+ * logging code has the right value for tlb_snd_una.
+ */
+ if (tp->snd_una != high_seq) {
+ orig_snd_una = tp->snd_una;
+ tp->snd_una = high_seq;
+ xx = 1;
+ } else
+ xx = 0;
+ TCP_LOG_EVENTP(tp, th,
+ &tp->t_inpcb->inp_socket->so_rcv,
+ &tp->t_inpcb->inp_socket->so_snd, TCP_LOG_IN, 0,
+ 0, &log, true, &ltv);
+ if (xx) {
+ tp->snd_una = orig_snd_una;
+ }
}
- /*
- * Parse options on any incoming segment.
- */
- tcp_dooptions(&to, (u_char *)(th + 1),
- (th->th_off << 2) - sizeof(struct tcphdr),
- (thflags & TH_SYN) ? TO_SYN : 0);
+}
+
+static int
+rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mbuf *m, int nxt_pkt, struct timeval *tv)
+{
/*
- * If timestamps were negotiated during SYN/ACK and a
- * segment without a timestamp is received, silently drop
- * the segment, unless it is a RST segment or missing timestamps are
- * tolerated.
- * See section 3.2 of RFC 7323.
+ * Handle a "special" compressed ack mbuf. Each incoming
+ * ack has only four possible dispositions:
+ *
+ * A) It moves the cum-ack forward
+ * B) It is behind the cum-ack.
+ * C) It is a window-update ack.
+ * D) It is a dup-ack.
+ *
+ * Note that we can have between 1 -> TCP_COMP_ACK_ENTRIES
+ * in the incoming mbuf. We also need to still pay attention
+ * to nxt_pkt since there may be another packet after this
+ * one.
*/
- if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS) &&
- ((thflags & TH_RST) == 0) && (V_tcp_tolerate_missing_ts == 0)) {
- way_out = 5;
- retval = 0;
- goto done_with_input;
- }
+#ifdef TCP_ACCOUNTING
+ uint64_t ts_val;
+ uint64_t rdstc;
+#endif
+ int segsiz;
+ struct timespec ts;
+ struct tcp_rack *rack;
+ struct tcp_ackent *ae;
+ uint32_t tiwin, us_cts, cts, acked, acked_amount, high_seq, win_seq, the_win, win_upd_ack;
+ int cnt, i, did_out, ourfinisacked = 0;
+ int win_up_req = 0;
+ struct tcpopt to_holder, *to = NULL;
+ int nsegs = 0;
+ int under_pacing = 1;
+ int recovery = 0;
+ int idx;
+#ifdef TCP_ACCOUNTING
+ sched_pin();
+#endif
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+ if (rack->gp_ready &&
+ (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT))
+ under_pacing = 0;
+ else
+ under_pacing = 1;
- /*
- * Segment received on connection. Reset idle time and keep-alive
- * timer. XXX: This should be done after segment validation to
- * ignore broken/spoofed segs.
- */
- if (tp->t_idle_reduce &&
- (tp->snd_max == tp->snd_una) &&
- ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) {
- counter_u64_add(rack_input_idle_reduces, 1);
- rack_cc_after_idle(rack, tp);
+ if (rack->r_state != tp->t_state)
+ rack_set_state(tp, rack);
+ to = &to_holder;
+ to->to_flags = 0;
+ KASSERT((m->m_len >= sizeof(struct tcp_ackent)),
+ ("tp:%p m_cmpack:%p with invalid len:%u", tp, m, m->m_len));
+ cnt = m->m_len / sizeof(struct tcp_ackent);
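+	/* Bucket the entry count (in groups of five, capped) for the proc_comp_ack histogram. */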
+ idx = cnt / 5;
+ if (idx >= MAX_NUM_OF_CNTS)
+ idx = MAX_NUM_OF_CNTS - 1;
+ counter_u64_add(rack_proc_comp_ack[idx], 1);
+ counter_u64_add(rack_multi_single_eq, cnt);
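+	/*
+	 * Snapshot the cum-ack point and window state; these are advanced
+	 * per-entry below and folded back into the tcpcb after the loop.
+	 */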
+ high_seq = tp->snd_una;
+ the_win = tp->snd_wnd;
+ win_seq = tp->snd_wl1;
+ win_upd_ack = tp->snd_wl2;
+ cts = us_cts = tcp_tv_to_usectick(tv);
+ segsiz = ctf_fixed_maxseg(tp);
+ if ((rack->rc_gp_dyn_mul) &&
+ (rack->use_fixed_rate == 0) &&
+ (rack->rc_always_pace)) {
+ /* Check in on probertt */
+ rack_check_probe_rtt(rack, us_cts);
}
- tp->t_rcvtime = ticks;
- /*
- * Unscale the window into a 32-bit value. For the SYN_SENT state
- * the scale is zero.
- */
- tiwin = th->th_win << tp->snd_scale;
-#ifdef STATS
- stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin);
+ for (i = 0; i < cnt; i++) {
+#ifdef TCP_ACCOUNTING
+ ts_val = get_cyclecount();
#endif
- if (tiwin > rack->r_ctl.rc_high_rwnd)
- rack->r_ctl.rc_high_rwnd = tiwin;
- /*
- * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move
- * this to occur after we've validated the segment.
- */
- if (tp->t_flags2 & TF2_ECN_PERMIT) {
- if (thflags & TH_CWR) {
- tp->t_flags2 &= ~TF2_ECN_SND_ECE;
- tp->t_flags |= TF_ACKNOW;
- }
- switch (iptos & IPTOS_ECN_MASK) {
- case IPTOS_ECN_CE:
- tp->t_flags2 |= TF2_ECN_SND_ECE;
- KMOD_TCPSTAT_INC(tcps_ecn_ce);
- break;
- case IPTOS_ECN_ECT0:
- KMOD_TCPSTAT_INC(tcps_ecn_ect0);
- break;
- case IPTOS_ECN_ECT1:
- KMOD_TCPSTAT_INC(tcps_ecn_ect1);
- break;
+ rack_clear_rate_sample(rack);
+ ae = ((mtod(m, struct tcp_ackent *)) + i);
+ /* Setup the window */
+ tiwin = ae->win << tp->snd_scale;
+ /* figure out the type of ack */
+ if (SEQ_LT(ae->ack, high_seq)) {
+			/* Case B */
+ ae->ack_val_set = ACK_BEHIND;
+ } else if (SEQ_GT(ae->ack, high_seq)) {
+ /* Case A */
+ ae->ack_val_set = ACK_CUMACK;
+ } else if (tiwin == the_win) {
+ /* Case D */
+ ae->ack_val_set = ACK_DUPACK;
+ } else {
+ /* Case C */
+ ae->ack_val_set = ACK_RWND;
+ }
+ rack_log_input_packet(tp, rack, ae, ae->ack_val_set, high_seq);
+ /* Validate timestamp */
+ if (ae->flags & HAS_TSTMP) {
+ /* Setup for a timestamp */
+ to->to_flags = TOF_TS;
+ ae->ts_echo -= tp->ts_offset;
+ to->to_tsecr = ae->ts_echo;
+ to->to_tsval = ae->ts_value;
+ /*
+ * If echoed timestamp is later than the current time, fall back to
+ * non RFC1323 RTT calculation. Normalize timestamp if syncookies
+ * were used when this connection was established.
+ */
+ if (TSTMP_GT(ae->ts_echo, cts))
+ ae->ts_echo = 0;
+ if (tp->ts_recent &&
+ TSTMP_LT(ae->ts_value, tp->ts_recent)) {
+ if (ctf_ts_check_ac(tp, (ae->flags & 0xff))) {
+#ifdef TCP_ACCOUNTING
+ rdstc = get_cyclecount();
+ if (rdstc > ts_val) {
+ counter_u64_add(tcp_proc_time[ae->ack_val_set] ,
+ (rdstc - ts_val));
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val);
+ }
+ }
+#endif
+ continue;
+ }
+ }
+ if (SEQ_LEQ(ae->seq, tp->last_ack_sent) &&
+ SEQ_LEQ(tp->last_ack_sent, ae->seq)) {
+ tp->ts_recent_age = tcp_ts_getticks();
+ tp->ts_recent = ae->ts_value;
+ }
+ } else {
+			/* Setup for no options */
+ to->to_flags = 0;
+ }
+ /* Update the rcv time and perform idle reduction possibly */
+ if (tp->t_idle_reduce &&
+ (tp->snd_max == tp->snd_una) &&
+ ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) {
+ counter_u64_add(rack_input_idle_reduces, 1);
+ rack_cc_after_idle(rack, tp);
}
+ tp->t_rcvtime = ticks;
+ /* Now what about ECN? */
+ if (tp->t_flags2 & TF2_ECN_PERMIT) {
+ if (ae->flags & TH_CWR) {
+ tp->t_flags2 &= ~TF2_ECN_SND_ECE;
+ tp->t_flags |= TF_ACKNOW;
+ }
+ switch (ae->codepoint & IPTOS_ECN_MASK) {
+ case IPTOS_ECN_CE:
+ tp->t_flags2 |= TF2_ECN_SND_ECE;
+ KMOD_TCPSTAT_INC(tcps_ecn_ce);
+ break;
+ case IPTOS_ECN_ECT0:
+ KMOD_TCPSTAT_INC(tcps_ecn_ect0);
+ break;
+ case IPTOS_ECN_ECT1:
+ KMOD_TCPSTAT_INC(tcps_ecn_ect1);
+ break;
+ }
- /* Process a packet differently from RFC3168. */
- cc_ecnpkt_handler(tp, th, iptos);
-
- /* Congestion experienced. */
- if (thflags & TH_ECE) {
- rack_cong_signal(tp, th, CC_ECN);
+ /* Process a packet differently from RFC3168. */
+ cc_ecnpkt_handler_flags(tp, ae->flags, ae->codepoint);
+ /* Congestion experienced. */
+ if (ae->flags & TH_ECE) {
+ rack_cong_signal(tp, CC_ECN, ae->ack);
+ }
}
- }
-
- /*
- * If echoed timestamp is later than the current time, fall back to
- * non RFC1323 RTT calculation. Normalize timestamp if syncookies
- * were used when this connection was established.
- */
- if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
- to.to_tsecr -= tp->ts_offset;
- if (TSTMP_GT(to.to_tsecr, cts))
- to.to_tsecr = 0;
- }
-
- /*
- * If its the first time in we need to take care of options and
- * verify we can do SACK for rack!
- */
- if (rack->r_state == 0) {
- /* Should be init'd by rack_init() */
- KASSERT(rack->rc_inp != NULL,
- ("%s: rack->rc_inp unexpectedly NULL", __func__));
- if (rack->rc_inp == NULL) {
- rack->rc_inp = tp->t_inpcb;
+#ifdef TCP_ACCOUNTING
+ /* Count for the specific type of ack in */
+ counter_u64_add(tcp_cnt_counters[ae->ack_val_set], 1);
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_cnt_counters[ae->ack_val_set]++;
}
-
+#endif
/*
- * Process options only when we get SYN/ACK back. The SYN
- * case for incoming connections is handled in tcp_syncache.
- * According to RFC1323 the window field in a SYN (i.e., a
- * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX
- * this is traditional behavior, may need to be cleaned up.
+		 * Note how we could move these up in the determination
+		 * above, but we don't, so that the timestamp checks (and ECN)
+		 * are done first, before we do any processing on the ACK.
+ * The non-compressed path through the code has this
+ * weakness (noted by @jtl) that it actually does some
+ * processing before verifying the timestamp information.
+ * We don't take that path here which is why we set
+ * the ack_val_set first, do the timestamp and ecn
+ * processing, and then look at what we have setup.
*/
- if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
- /* Handle parallel SYN for ECN */
- if (!(thflags & TH_ACK) &&
- ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) &&
- ((V_tcp_do_ecn == 1) || (V_tcp_do_ecn == 2))) {
- tp->t_flags2 |= TF2_ECN_PERMIT;
- tp->t_flags2 |= TF2_ECN_SND_ECE;
- TCPSTAT_INC(tcps_ecn_shs);
- }
- if ((to.to_flags & TOF_SCALE) &&
- (tp->t_flags & TF_REQ_SCALE)) {
- tp->t_flags |= TF_RCVD_SCALE;
- tp->snd_scale = to.to_wscale;
- } else
- tp->t_flags &= ~TF_REQ_SCALE;
+ if (ae->ack_val_set == ACK_BEHIND) {
/*
- * Initial send window. It will be updated with the
- * next incoming segment to the scaled value.
+			 * Case B: flag reordering, if the window is not closed;
+			 * otherwise it could be a keep-alive or persist probe.
*/
- tp->snd_wnd = th->th_win;
- if ((to.to_flags & TOF_TS) &&
- (tp->t_flags & TF_REQ_TSTMP)) {
- tp->t_flags |= TF_RCVD_TSTMP;
- tp->ts_recent = to.to_tsval;
- tp->ts_recent_age = cts;
- } else
- tp->t_flags &= ~TF_REQ_TSTMP;
- if (to.to_flags & TOF_MSS)
- tcp_mss(tp, to.to_mss);
- if ((tp->t_flags & TF_SACK_PERMIT) &&
- (to.to_flags & TOF_SACKPERM) == 0)
- tp->t_flags &= ~TF_SACK_PERMIT;
- if (IS_FASTOPEN(tp->t_flags)) {
- if (to.to_flags & TOF_FASTOPEN) {
- uint16_t mss;
+ if (SEQ_LT(ae->ack, tp->snd_una) && (sbspace(&so->so_rcv) > segsiz)) {
+ counter_u64_add(rack_reorder_seen, 1);
+ rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
+ }
+ } else if (ae->ack_val_set == ACK_DUPACK) {
+ /* Case D */
- if (to.to_flags & TOF_MSS)
- mss = to.to_mss;
- else
- if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
- mss = TCP6_MSS;
- else
- mss = TCP_MSS;
- tcp_fastopen_update_cache(tp, mss,
- to.to_tfo_len, to.to_tfo_cookie);
- } else
- tcp_fastopen_disable_path(tp);
+ rack_strike_dupack(rack);
+ } else if (ae->ack_val_set == ACK_RWND) {
+ /* Case C */
+
+ win_up_req = 1;
+ win_upd_ack = ae->ack;
+ win_seq = ae->seq;
+ the_win = tiwin;
+ } else {
+ /* Case A */
+
+ if (SEQ_GT(ae->ack, tp->snd_max)) {
+ /*
+ * We just send an ack since the incoming
+ * ack is beyond the largest seq we sent.
+ */
+ if ((tp->t_flags & TF_ACKNOW) == 0) {
+ ctf_ack_war_checks(tp, &rack->r_ctl.challenge_ack_ts, &rack->r_ctl.challenge_ack_cnt);
+					if (tp->t_flags & TF_ACKNOW)
+ rack->r_wanted_output = 1;
+ }
+ } else {
+ nsegs++;
+ /* If the window changed setup to update */
+ if (tiwin != tp->snd_wnd) {
+ win_up_req = 1;
+ win_upd_ack = ae->ack;
+ win_seq = ae->seq;
+ the_win = tiwin;
+ }
+#ifdef TCP_ACCOUNTING
+ /* Account for the acks */
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_cnt_counters[CNT_OF_ACKS_IN] += (((ae->ack - high_seq) + segsiz - 1) / segsiz);
+ }
+ counter_u64_add(tcp_cnt_counters[CNT_OF_ACKS_IN],
+ (((ae->ack - high_seq) + segsiz - 1) / segsiz));
+#endif
+ high_seq = ae->ack;
+ /* Setup our act_rcv_time */
+ if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) {
+ ts.tv_sec = ae->timestamp / 1000000000;
+ ts.tv_nsec = ae->timestamp % 1000000000;
+ rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec;
+ rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000;
+ } else {
+ rack->r_ctl.act_rcv_time = *tv;
+ }
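+				/* Advance the rack sendmap/cum-ack bookkeeping for this entry; the socket buffer itself is trimmed after the loop. */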
+ rack_process_to_cumack(tp, rack, ae->ack, cts, to);
}
}
- /*
- * At this point we are at the initial call. Here we decide
- * if we are doing RACK or not. We do this by seeing if
- * TF_SACK_PERMIT is set and the sack-not-required is clear.
- * The code now does do dup-ack counting so if you don't
- * switch back you won't get rack & TLP, but you will still
- * get this stack.
- */
-
- if ((rack_sack_not_required == 0) &&
- ((tp->t_flags & TF_SACK_PERMIT) == 0)) {
- tcp_switch_back_to_default(tp);
- (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen,
- tlen, iptos);
- return (1);
+		/* And let's be sure to commit the rtt measurements for this ack */
+ tcp_rack_xmit_timer_commit(rack, tp);
+#ifdef TCP_ACCOUNTING
+ rdstc = get_cyclecount();
+ if (rdstc > ts_val) {
+ counter_u64_add(tcp_proc_time[ae->ack_val_set] , (rdstc - ts_val));
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val);
+ if (ae->ack_val_set == ACK_CUMACK)
+ tp->tcp_proc_time[CYC_HANDLE_MAP] += (rdstc - ts_val);
+ }
}
- /* Set the flag */
- rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
- tcp_set_hpts(tp->t_inpcb);
- sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack);
+#endif
}
- if (thflags & TH_FIN)
- tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN);
- us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
- if ((rack->rc_gp_dyn_mul) &&
- (rack->use_fixed_rate == 0) &&
- (rack->rc_always_pace)) {
- /* Check in on probertt */
- rack_check_probe_rtt(rack, us_cts);
+#ifdef TCP_ACCOUNTING
+ ts_val = get_cyclecount();
+#endif
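+	/*
+	 * Second phase: the per-entry work is done, now fold the aggregated
+	 * cum-ack (high_seq) and any pending window update into the tcpcb.
+	 */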
+ acked_amount = acked = (high_seq - tp->snd_una);
+ if (win_up_req) {
+ rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts, high_seq);
}
- if (rack->forced_ack) {
- uint32_t us_rtt;
+ if (acked) {
+ if (rack->sack_attack_disable == 0)
+ rack_do_decay(rack);
+ if (acked >= segsiz) {
+ /*
+ * You only get credit for
+ * MSS and greater (and you get extra
+ * credit for larger cum-ack moves).
+ */
+ int ac;
+ ac = acked / segsiz;
+ rack->r_ctl.ack_count += ac;
+ counter_u64_add(rack_ack_total, ac);
+ }
+ if (rack->r_ctl.ack_count > 0xfff00000) {
+ /*
+ * reduce the number to keep us under
+ * a uint32_t.
+ */
+ rack->r_ctl.ack_count /= 2;
+ rack->r_ctl.sack_count /= 2;
+ }
+ if (tp->t_flags & TF_NEEDSYN) {
+ /*
+ * T/TCP: Connection was half-synchronized, and our SYN has
+ * been ACK'd (so connection is now fully synchronized). Go
+ * to non-starred state, increment snd_una for ACK of SYN,
+ * and check if we can do window scaling.
+ */
+ tp->t_flags &= ~TF_NEEDSYN;
+ tp->snd_una++;
+ acked_amount = acked = (high_seq - tp->snd_una);
+ }
+ if (acked > sbavail(&so->so_snd))
+ acked_amount = sbavail(&so->so_snd);
+#ifdef NETFLIX_EXP_DETECTION
/*
- * A persist or keep-alive was forced out, update our
- * min rtt time. Note we do not worry about lost
- * retransmissions since KEEP-ALIVES and persists
- * are usually way long on times of sending (though
- * if we were really paranoid or worried we could
- * at least use timestamps if available to validate).
+		 * We only care about a cum-ack move if we are in a sack-disabled
+ * state. We have already added in to the ack_count, and we never
+ * would disable on a cum-ack move, so we only care to do the
+ * detection if it may "undo" it, i.e. we were in disabled already.
*/
- rack->forced_ack = 0;
- us_rtt = us_cts - rack->r_ctl.forced_ack_ts;
- if (us_rtt == 0)
- us_rtt = 1;
- rack_log_rtt_upd(tp, rack, us_rtt, 0, NULL, 3);
- rack_apply_updated_usrtt(rack, us_rtt, us_cts);
- }
- /*
- * This is the one exception case where we set the rack state
- * always. All other times (timers etc) we must have a rack-state
- * set (so we assure we have done the checks above for SACK).
- */
- rack->r_ctl.rc_rcvtime = cts;
- if (rack->r_state != tp->t_state)
- rack_set_state(tp, rack);
- if (SEQ_GT(th->th_ack, tp->snd_una) &&
- (rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree)) != NULL)
- kern_prefetch(rsm, &prev_state);
- prev_state = rack->r_state;
- rack_clear_rate_sample(rack);
- retval = (*rack->r_substate) (m, th, so,
- tp, &to, drop_hdrlen,
- tlen, tiwin, thflags, nxt_pkt, iptos);
-#ifdef INVARIANTS
- if ((retval == 0) &&
- (tp->t_inpcb == NULL)) {
- panic("retval:%d tp:%p t_inpcb:NULL state:%d",
- retval, tp, prev_state);
- }
+ if (rack->sack_attack_disable)
+ rack_do_detection(tp, rack, acked_amount, segsiz);
#endif
- if (retval == 0) {
- /*
- * If retval is 1 the tcb is unlocked and most likely the tp
- * is gone.
- */
- INP_WLOCK_ASSERT(tp->t_inpcb);
- if ((rack->rc_gp_dyn_mul) &&
- (rack->rc_always_pace) &&
- (rack->use_fixed_rate == 0) &&
- rack->in_probe_rtt &&
- (rack->r_ctl.rc_time_probertt_starts == 0)) {
+ if (IN_FASTRECOVERY(tp->t_flags) &&
+ (rack->rack_no_prr == 0))
+ rack_update_prr(tp, rack, acked_amount, high_seq);
+ if (IN_RECOVERY(tp->t_flags)) {
+ if (SEQ_LT(high_seq, tp->snd_recover) &&
+ (SEQ_LT(high_seq, tp->snd_max))) {
+ tcp_rack_partialack(tp);
+ } else {
+ rack_post_recovery(tp, high_seq);
+ recovery = 1;
+ }
+ }
+ /* Handle the rack-log-ack part (sendmap) */
+ if ((sbused(&so->so_snd) == 0) &&
+ (acked > acked_amount) &&
+ (tp->t_state >= TCPS_FIN_WAIT_1) &&
+ (tp->t_flags & TF_SENTFIN)) {
/*
- * If we are going for target, lets recheck before
- * we output.
+ * We must be sure our fin
+ * was sent and acked (we can be
+ * in FIN_WAIT_1 without having
+ * sent the fin).
*/
- rack_check_probe_rtt(rack, us_cts);
+ ourfinisacked = 1;
+ /*
+			 * Let's make sure snd_una is updated
+ * since most likely acked_amount = 0 (it
+ * should be).
+ */
+ tp->snd_una = high_seq;
+ }
+ /* Did we make a RTO error? */
+ if ((tp->t_flags & TF_PREVVALID) &&
+ ((tp->t_flags & TF_RCVD_TSTMP) == 0)) {
+ tp->t_flags &= ~TF_PREVVALID;
+ if (tp->t_rxtshift == 1 &&
+ (int)(ticks - tp->t_badrxtwin) < 0)
+ rack_cong_signal(tp, CC_RTO_ERR, high_seq);
+ }
+ /* Handle the data in the socket buffer */
+ KMOD_TCPSTAT_ADD(tcps_rcvackpack, 1);
+ KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
+ if (acked_amount > 0) {
+ struct mbuf *mfree;
+
+ rack_ack_received(tp, rack, high_seq, nsegs, CC_ACK, recovery);
+ SOCKBUF_LOCK(&so->so_snd);
+ mfree = sbcut_locked(&so->so_snd, acked);
+ tp->snd_una = high_seq;
+ /* Note we want to hold the sb lock through the sendmap adjust */
+ rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una);
+ /* Wake up the socket if we have room to write more */
+			rack_log_wakeup(tp, rack, &so->so_snd, acked, 2);
+ SOCKBUF_UNLOCK(&so->so_snd);
+ tp->t_flags |= TF_WAKESOW;
+ m_freem(mfree);
}
- if (rack->set_pacing_done_a_iw == 0) {
- /* How much has been acked? */
- if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) {
- /* We have enough to set in the pacing segment size */
- rack->set_pacing_done_a_iw = 1;
- rack_set_pace_segments(tp, rack, __LINE__);
+ /* update progress */
+ tp->t_acktime = ticks;
+ rack_log_progress_event(rack, tp, tp->t_acktime,
+ PROGRESS_UPDATE, __LINE__);
+ /* Clear out shifts and such */
+ tp->t_rxtshift = 0;
+ RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
+ rack_rto_min, rack_rto_max);
+ rack->rc_tlp_in_progress = 0;
+ rack->r_ctl.rc_tlp_cnt_out = 0;
+ /* Send recover and snd_nxt must be dragged along */
+ if (SEQ_GT(tp->snd_una, tp->snd_recover))
+ tp->snd_recover = tp->snd_una;
+ if (SEQ_LT(tp->snd_nxt, tp->snd_una))
+ tp->snd_nxt = tp->snd_una;
+ /*
+ * If the RXT timer is running we want to
+ * stop it, so we can restart a TLP (or new RXT).
+ */
+ if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT)
+ rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
+#ifdef NETFLIX_HTTP_LOGGING
+ tcp_http_check_for_comp(rack->rc_tp, high_seq);
+#endif
+ tp->snd_wl2 = high_seq;
+ tp->t_dupacks = 0;
+ if (under_pacing &&
+ (rack->use_fixed_rate == 0) &&
+ (rack->in_probe_rtt == 0) &&
+ rack->rc_gp_dyn_mul &&
+ rack->rc_always_pace) {
+ /* Check if we are dragging bottom */
+ rack_check_bottom_drag(tp, rack, so, acked);
+ }
+ if (tp->snd_una == tp->snd_max) {
+ tp->t_flags &= ~TF_PREVVALID;
+ rack->r_ctl.retran_during_recovery = 0;
+ rack->r_ctl.dsack_byte_cnt = 0;
+ rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
+ if (rack->r_ctl.rc_went_idle_time == 0)
+ rack->r_ctl.rc_went_idle_time = 1;
+ rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
+ if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0)
+ tp->t_acktime = 0;
+ /* Set so we might enter persists... */
+ rack->r_wanted_output = 1;
+ rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
+ sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
+ if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
+ (sbavail(&so->so_snd) == 0) &&
+ (tp->t_flags2 & TF2_DROP_AF_DATA)) {
+ /*
+ * The socket was gone and the
+				 * peer sent data (not now, but in the past), time to
+ * reset him.
+ */
+ rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
+ /* tcp_close will kill the inp pre-log the Reset */
+ tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
+#ifdef TCP_ACCOUNTING
+ rdstc = get_cyclecount();
+ if (rdstc > ts_val) {
+ counter_u64_add(tcp_proc_time[ACK_CUMACK] , (rdstc - ts_val));
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
+ tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
+ }
+ }
+#endif
+ m_freem(m);
+ tp = tcp_close(tp);
+ if (tp == NULL) {
+#ifdef TCP_ACCOUNTING
+ sched_unpin();
+#endif
+ return (1);
+ }
+ /*
+ * We would normally do drop-with-reset which would
+ * send back a reset. We can't since we don't have
+				 * all the needed bits. Instead let's arrange for
+ * a call to tcp_output(). That way since we
+ * are in the closed state we will generate a reset.
+ *
+ * Note if tcp_accounting is on we don't unpin since
+ * we do that after the goto label.
+ */
+ goto send_out_a_rst;
+ }
+ if ((sbused(&so->so_snd) == 0) &&
+ (tp->t_state >= TCPS_FIN_WAIT_1) &&
+ (tp->t_flags & TF_SENTFIN)) {
+ /*
+ * If we can't receive any more data, then closing user can
+ * proceed. Starting the timer is contrary to the
+ * specification, but if we don't get a FIN we'll hang
+ * forever.
+ *
+ */
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
+ soisdisconnected(so);
+ tcp_timer_activate(tp, TT_2MSL,
+ (tcp_fast_finwait2_recycle ?
+ tcp_finwait2_timeout :
+ TP_MAXIDLE(tp)));
+ }
+ if (ourfinisacked == 0) {
+ /*
+					 * We only move to FIN_WAIT_2 here if our FIN is not
+					 * yet acked; if it is acked we are probably in
+					 * TCPS_CLOSING and the state switch below handles that.
+ */
+ tcp_state_change(tp, TCPS_FIN_WAIT_2);
+ }
}
}
- tcp_rack_xmit_timer_commit(rack, tp);
- if (nxt_pkt == 0) {
- if (rack->r_wanted_output != 0) {
-do_output_now:
- did_out = 1;
- (void)tp->t_fb->tfb_tcp_output(tp);
+ /* Wake up the socket if we have room to write more */
+ if (sbavail(&so->so_snd)) {
+ rack->r_wanted_output = 1;
+ if (ctf_progress_timeout_check(tp, true)) {
+ rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
+ tp, tick, PROGRESS_DROP, __LINE__);
+ tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
+ /*
+			 * We cheat here and don't send a RST; we should send one
+ * when the pacer drops the connection.
+ */
+#ifdef TCP_ACCOUNTING
+ rdstc = get_cyclecount();
+ if (rdstc > ts_val) {
+ counter_u64_add(tcp_proc_time[ACK_CUMACK] , (rdstc - ts_val));
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
+ tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
+ }
+ }
+ sched_unpin();
+#endif
+ INP_WUNLOCK(rack->rc_inp);
+ m_freem(m);
+ return (1);
}
- rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
}
- if ((nxt_pkt == 0) &&
- ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) &&
- (SEQ_GT(tp->snd_max, tp->snd_una) ||
- (tp->t_flags & TF_DELACK) ||
- ((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
- (tp->t_state <= TCPS_CLOSING)))) {
- /* We could not send (probably in the hpts but stopped the timer earlier)? */
- if ((tp->snd_max == tp->snd_una) &&
- ((tp->t_flags & TF_DELACK) == 0) &&
- (rack->rc_inp->inp_in_hpts) &&
- (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
- /* keep alive not needed if we are hptsi output yet */
- ;
- } else {
- int late = 0;
- if (rack->rc_inp->inp_in_hpts) {
- if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
- us_cts = tcp_get_usecs(NULL);
- if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) {
- rack->r_early = 1;
- rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts);
- } else
- late = 1;
- rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
+ if (ourfinisacked) {
+ switch(tp->t_state) {
+ case TCPS_CLOSING:
+#ifdef TCP_ACCOUNTING
+ rdstc = get_cyclecount();
+ if (rdstc > ts_val) {
+ counter_u64_add(tcp_proc_time[ACK_CUMACK] ,
+ (rdstc - ts_val));
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
+ tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
}
- tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT);
}
- if (late && (did_out == 0)) {
- /*
- * We are late in the sending
- * and we did not call the output
- * (this probably should not happen).
- */
- goto do_output_now;
+ sched_unpin();
+#endif
+ tcp_twstart(tp);
+ m_freem(m);
+ return (1);
+ break;
+ case TCPS_LAST_ACK:
+#ifdef TCP_ACCOUNTING
+ rdstc = get_cyclecount();
+ if (rdstc > ts_val) {
+ counter_u64_add(tcp_proc_time[ACK_CUMACK] ,
+ (rdstc - ts_val));
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
+ tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
+ }
+ }
+ sched_unpin();
+#endif
+ tp = tcp_close(tp);
+ ctf_do_drop(m, tp);
+ return (1);
+ break;
+ case TCPS_FIN_WAIT_1:
+#ifdef TCP_ACCOUNTING
+ rdstc = get_cyclecount();
+ if (rdstc > ts_val) {
+ counter_u64_add(tcp_proc_time[ACK_CUMACK] ,
+ (rdstc - ts_val));
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
+ tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
+ }
}
- rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0);
+#endif
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
+ soisdisconnected(so);
+ tcp_timer_activate(tp, TT_2MSL,
+ (tcp_fast_finwait2_recycle ?
+ tcp_finwait2_timeout :
+ TP_MAXIDLE(tp)));
+ }
+ tcp_state_change(tp, TCPS_FIN_WAIT_2);
+ break;
+ default:
+ break;
}
- way_out = 1;
- } else if (nxt_pkt == 0) {
- /* Do we have the correct timer running? */
- rack_timer_audit(tp, rack, &so->so_snd);
- way_out = 2;
}
- done_with_input:
- rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out);
- if (did_out)
- rack->r_wanted_output = 0;
-#ifdef INVARIANTS
- if (tp->t_inpcb == NULL) {
- panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d",
- did_out,
- retval, tp, prev_state);
+ if (rack->r_fast_output) {
+ /*
+		 * We're doing fast output... can we expand that?
+ */
+ rack_gain_for_fastoutput(rack, tp, so, acked_amount);
+ }
+#ifdef TCP_ACCOUNTING
+ rdstc = get_cyclecount();
+ if (rdstc > ts_val) {
+ counter_u64_add(tcp_proc_time[ACK_CUMACK] , (rdstc - ts_val));
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
+ tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
+ }
}
-#endif
- }
- return (retval);
-}
-
-void
-rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
- struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos)
-{
- struct timeval tv;
- /* First lets see if we have old packets */
- if (tp->t_in_pkt) {
- if (ctf_do_queued_segments(so, tp, 1)) {
- m_freem(m);
- return;
+ } else if (win_up_req) {
+ rdstc = get_cyclecount();
+ if (rdstc > ts_val) {
+ counter_u64_add(tcp_proc_time[ACK_RWND] , (rdstc - ts_val));
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_proc_time[ACK_RWND] += (rdstc - ts_val);
+ }
}
+#endif
}
- if (m->m_flags & M_TSTMP_LRO) {
- tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000;
- tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000;
- } else {
- /* Should not be should we kassert instead? */
- tcp_get_usecs(&tv);
- }
- if(rack_do_segment_nounlock(m, th, so, tp,
- drop_hdrlen, tlen, iptos, 0, &tv) == 0) {
- tcp_handle_wakeup(tp, so);
- INP_WUNLOCK(tp->t_inpcb);
+ /* Now is there a next packet, if so we are done */
+ m_freem(m);
+ did_out = 0;
+ if (nxt_pkt) {
+#ifdef TCP_ACCOUNTING
+ sched_unpin();
+#endif
+ rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 5, nsegs);
+ return (0);
}
+ rack_handle_might_revert(tp, rack);
+ ctf_calc_rwin(so, tp);
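+	/*
+	 * Call the output path if anything above asked for it; note the
+	 * send_out_a_rst label lands here so the close-path reset goes
+	 * out through the same call.
+	 */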
+ if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) {
+ send_out_a_rst:
+ (void)tp->t_fb->tfb_tcp_output(tp);
+ did_out = 1;
+ }
+ rack_free_trim(rack);
+#ifdef TCP_ACCOUNTING
+ sched_unpin();
+#endif
+ rack_timer_audit(tp, rack, &so->so_snd);
+ rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 6, nsegs);
+ return (0);
}
-struct rack_sendmap *
-tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused)
-{
- struct rack_sendmap *rsm = NULL;
- int32_t idx;
- uint32_t srtt = 0, thresh = 0, ts_low = 0;
- /* Return the next guy to be re-transmitted */
- if (RB_EMPTY(&rack->r_ctl.rc_mtree)) {
- return (NULL);
- }
- if (tp->t_flags & TF_SENTFIN) {
- /* retran the end FIN? */
- return (NULL);
- }
- /* ok lets look at this one */
- rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
- if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) {
- goto check_it;
- }
- rsm = rack_find_lowest_rsm(rack);
- if (rsm == NULL) {
- return (NULL);
- }
-check_it:
- if (rsm->r_flags & RACK_ACKED) {
- return (NULL);
- }
- if (((rsm->r_flags & RACK_SACK_PASSED) == 0) &&
- (rsm->r_dupack < DUP_ACK_THRESHOLD)) {
- /* Its not yet ready */
- return (NULL);
- }
- srtt = rack_grab_rtt(tp, rack);
- idx = rsm->r_rtr_cnt - 1;
- ts_low = rsm->r_tim_lastsent[idx];
- thresh = rack_calc_thresh_rack(rack, srtt, tsused);
- if ((tsused == ts_low) ||
- (TSTMP_LT(tsused, ts_low))) {
- /* No time since sending */
- return (NULL);
+static int
+rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos,
+ int32_t nxt_pkt, struct timeval *tv)
+{
+#ifdef TCP_ACCOUNTING
+ uint64_t ts_val;
+#endif
+ int32_t thflags, retval, did_out = 0;
+ int32_t way_out = 0;
+ uint32_t cts;
+ uint32_t tiwin;
+ struct timespec ts;
+ struct tcpopt to;
+ struct tcp_rack *rack;
+ struct rack_sendmap *rsm;
+ int32_t prev_state = 0;
+#ifdef TCP_ACCOUNTING
+ int ack_val_set = 0xf;
+#endif
+ uint32_t us_cts;
+ /*
+ * tv passed from common code is from either M_TSTMP_LRO or
+ * tcp_get_usecs() if no LRO m_pkthdr timestamp is present.
+ */
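+	/* Compressed-ack mbufs (M_ACKCMP) take the dedicated processing path. */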
+ if (m->m_flags & M_ACKCMP) {
+ return (rack_do_compressed_ack_processing(tp, so, m, nxt_pkt, tv));
}
- if ((tsused - ts_low) < thresh) {
- /* It has not been long enough yet */
- return (NULL);
+ if (m->m_flags & M_ACKCMP) {
+ panic("Impossible reach m has ackcmp? m:%p tp:%p", m, tp);
}
- if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) ||
- ((rsm->r_flags & RACK_SACK_PASSED) &&
- (rack->sack_attack_disable == 0))) {
+ counter_u64_add(rack_proc_non_comp_ack, 1);
+ thflags = th->th_flags;
+#ifdef TCP_ACCOUNTING
+ sched_pin();
+ if (thflags & TH_ACK)
+ ts_val = get_cyclecount();
+#endif
+ cts = tcp_tv_to_usectick(tv);
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+
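+	/* Prefer a hardware (M_TSTMP) or LRO (M_TSTMP_LRO) mbuf timestamp for the arrival time when one is present. */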
+ if ((m->m_flags & M_TSTMP) ||
+ (m->m_flags & M_TSTMP_LRO)) {
+ mbuf_tstmp2timespec(m, &ts);
+ rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec;
+ rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000;
+ } else
+ rack->r_ctl.act_rcv_time = *tv;
+ kern_prefetch(rack, &prev_state);
+ prev_state = 0;
+ /*
+ * Unscale the window into a 32-bit value. For the SYN_SENT state
+ * the scale is zero.
+ */
+ tiwin = th->th_win << tp->snd_scale;
+ /*
+ * Parse options on any incoming segment.
+ */
+ memset(&to, 0, sizeof(to));
+ tcp_dooptions(&to, (u_char *)(th + 1),
+ (th->th_off << 2) - sizeof(struct tcphdr),
+ (thflags & TH_SYN) ? TO_SYN : 0);
+#ifdef TCP_ACCOUNTING
+ if (thflags & TH_ACK) {
/*
- * We have passed the dup-ack threshold <or>
- * a SACK has indicated this is missing.
- * Note that if you are a declared attacker
- * it is only the dup-ack threshold that
- * will cause retransmits.
+ * We have a tradeoff here. We can either do what we are
+ * doing i.e. pinning to this CPU and then doing the accounting
+ * <or> we could do a critical enter, setup the rdtsc and cpu
+ * as in below, and then validate we are on the same CPU on
+		 * exit. I have chosen not to do the critical enter since
+ * that often will gain you a context switch, and instead lock
+ * us (line above this if) to the same CPU with sched_pin(). This
+ * means we may be context switched out for a higher priority
+		 * interrupt but we won't be moved to another CPU.
+ *
+ * If this occurs (which it won't very often since we most likely
+		 * are running this code in interrupt context and only a higher
+		 * priority will bump us ... clock?) we will falsely add in
+		 * to the time the interrupt processing time plus the ack processing
+		 * time. This is ok since it's a rare event.
*/
- /* log retransmit reason */
- rack_log_retran_reason(rack, rsm, (tsused - ts_low), thresh, 1);
- return (rsm);
+ ack_val_set = tcp_do_ack_accounting(tp, th, &to, tiwin,
+ ctf_fixed_maxseg(tp));
}
- return (NULL);
-}
-
-static void
-rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot,
- uint64_t bw_est, uint64_t bw, uint64_t len_time, int method,
- int line, struct rack_sendmap *rsm)
-{
- if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+#endif
+ NET_EPOCH_ASSERT();
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
+ __func__));
+ KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
+ __func__));
+ if (tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
- struct timeval tv;
+ struct timeval ltv;
+#ifdef NETFLIX_HTTP_LOGGING
+ struct http_sendfile_track *http_req;
- memset(&log, 0, sizeof(log));
- log.u_bbr.flex1 = slot;
- log.u_bbr.flex2 = len;
- log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs;
- log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs;
- log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ss;
- log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_ca;
- log.u_bbr.use_lt_bw = rack->app_limited_needs_set;
- log.u_bbr.use_lt_bw <<= 1;
- log.u_bbr.use_lt_bw = rack->rc_gp_filled;
- log.u_bbr.use_lt_bw <<= 1;
- log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt;
- log.u_bbr.use_lt_bw <<= 1;
- log.u_bbr.use_lt_bw |= rack->in_probe_rtt;
- log.u_bbr.pkt_epoch = line;
- log.u_bbr.applimited = rack->r_ctl.rack_per_of_gp_rec;
- log.u_bbr.bw_inuse = bw_est;
- log.u_bbr.delRate = bw;
- if (rack->r_ctl.gp_bw == 0)
- log.u_bbr.cur_del_rate = 0;
- else
- log.u_bbr.cur_del_rate = rack_get_bw(rack);
- log.u_bbr.rttProp = len_time;
- log.u_bbr.pkts_out = rack->r_ctl.rc_rack_min_rtt;
- log.u_bbr.lost = rack->r_ctl.rc_probertt_sndmax_atexit;
- log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm);
- if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) {
- /* We are in slow start */
- log.u_bbr.flex7 = 1;
+ if (SEQ_GT(th->th_ack, tp->snd_una)) {
+ http_req = tcp_http_find_req_for_seq(tp, (th->th_ack-1));
} else {
- /* we are on congestion avoidance */
- log.u_bbr.flex7 = 0;
+ http_req = tcp_http_find_req_for_seq(tp, th->th_ack);
}
- log.u_bbr.flex8 = method;
- log.u_bbr.timeStamp = tcp_get_usecs(&tv);
- log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
- log.u_bbr.cwnd_gain = rack->rc_gp_saw_rec;
- log.u_bbr.cwnd_gain <<= 1;
- log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss;
- log.u_bbr.cwnd_gain <<= 1;
- log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca;
- TCP_LOG_EVENTP(rack->rc_tp, NULL,
- &rack->rc_inp->inp_socket->so_rcv,
- &rack->rc_inp->inp_socket->so_snd,
- BBR_LOG_HPTSI_CALC, 0,
- 0, &log, false, &tv);
+#endif
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
+ log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+ if (rack->rack_no_prr == 0)
+ log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
+ else
+ log.u_bbr.flex1 = 0;
+ log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
+ log.u_bbr.use_lt_bw <<= 1;
+ log.u_bbr.use_lt_bw |= rack->r_might_revert;
+ log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced;
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ log.u_bbr.pkts_out = rack->rc_tp->t_maxseg;
+ log.u_bbr.flex3 = m->m_flags;
+ log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
+ log.u_bbr.lost = thflags;
+ log.u_bbr.pacing_gain = 0x1;
+#ifdef TCP_ACCOUNTING
+ log.u_bbr.cwnd_gain = ack_val_set;
+#endif
+ log.u_bbr.flex7 = 2;
+ if (m->m_flags & M_TSTMP) {
+ /* Record the hardware timestamp if present */
+ mbuf_tstmp2timespec(m, &ts);
+ ltv.tv_sec = ts.tv_sec;
+ ltv.tv_usec = ts.tv_nsec / 1000;
+ log.u_bbr.lt_epoch = tcp_tv_to_usectick(&ltv);
+ } else if (m->m_flags & M_TSTMP_LRO) {
+ /* Record the LRO the arrival timestamp */
+ mbuf_tstmp2timespec(m, &ts);
+ ltv.tv_sec = ts.tv_sec;
+ ltv.tv_usec = ts.tv_nsec / 1000;
+ log.u_bbr.flex5 = tcp_tv_to_usectick(&ltv);
+ }
+ log.u_bbr.timeStamp = tcp_get_usecs(&ltv);
+ /* Log the rcv time */
+ log.u_bbr.delRate = m->m_pkthdr.rcv_tstmp;
+#ifdef NETFLIX_HTTP_LOGGING
+ log.u_bbr.applimited = tp->t_http_closed;
+ log.u_bbr.applimited <<= 8;
+ log.u_bbr.applimited |= tp->t_http_open;
+ log.u_bbr.applimited <<= 8;
+ log.u_bbr.applimited |= tp->t_http_req;
+ if (http_req) {
+ /* Copy out any client req info */
+ /* seconds */
+ log.u_bbr.pkt_epoch = (http_req->localtime / HPTS_USEC_IN_SEC);
+ /* useconds */
+ log.u_bbr.delivered = (http_req->localtime % HPTS_USEC_IN_SEC);
+ log.u_bbr.rttProp = http_req->timestamp;
+ log.u_bbr.cur_del_rate = http_req->start;
+ if (http_req->flags & TCP_HTTP_TRACK_FLG_OPEN) {
+ log.u_bbr.flex8 |= 1;
+ } else {
+ log.u_bbr.flex8 |= 2;
+ log.u_bbr.bw_inuse = http_req->end;
+ }
+ log.u_bbr.flex6 = http_req->start_seq;
+ if (http_req->flags & TCP_HTTP_TRACK_FLG_COMP) {
+ log.u_bbr.flex8 |= 4;
+ log.u_bbr.epoch = http_req->end_seq;
+ }
+ }
+#endif
+ TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
+ tlen, &log, true, &ltv);
}
-}
-
-static uint32_t
-rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss)
-{
- uint32_t new_tso, user_max;
-
- user_max = rack->rc_user_set_max_segs * mss;
- if (rack->rc_force_max_seg) {
- return (user_max);
+ if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
+ way_out = 4;
+ retval = 0;
+ goto done_with_input;
}
- if (rack->use_fixed_rate &&
- ((rack->r_ctl.crte == NULL) ||
- (bw != rack->r_ctl.crte->rate))) {
- /* Use the user mss since we are not exactly matched */
- return (user_max);
+ /*
+ * If a segment with the ACK-bit set arrives in the SYN-SENT state
+ * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9.
+ */
+ if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
+ (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
+ tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+#ifdef TCP_ACCOUNTING
+ sched_unpin();
+#endif
+ return (1);
}
- new_tso = tcp_get_pacing_burst_size(rack->rc_tp, bw, mss, rack_pace_one_seg, rack->r_ctl.crte, NULL);
- if (new_tso > user_max)
- new_tso = user_max;
- return(new_tso);
-}
-static void
-rack_log_hdwr_pacing(struct tcp_rack *rack, const struct ifnet *ifp,
- uint64_t rate, uint64_t hw_rate, int line,
- int error)
-{
- if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
- union tcp_log_stackspecific log;
- struct timeval tv;
+ /*
+ * Parse options on any incoming segment.
+ */
+ tcp_dooptions(&to, (u_char *)(th + 1),
+ (th->th_off << 2) - sizeof(struct tcphdr),
+ (thflags & TH_SYN) ? TO_SYN : 0);
- memset(&log, 0, sizeof(log));
- log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff);
- log.u_bbr.flex2 = (hw_rate & 0x00000000ffffffff);
- log.u_bbr.flex3 = (((uint64_t)ifp >> 32) & 0x00000000ffffffff);
- log.u_bbr.flex4 = ((uint64_t)ifp & 0x00000000ffffffff);
- log.u_bbr.timeStamp = tcp_get_usecs(&tv);
- log.u_bbr.bw_inuse = rate;
- log.u_bbr.flex5 = line;
- log.u_bbr.flex6 = error;
- log.u_bbr.applimited = rack->r_ctl.rc_pace_max_segs;
- log.u_bbr.flex8 = rack->use_fixed_rate;
- log.u_bbr.flex8 <<= 1;
- log.u_bbr.flex8 |= rack->rack_hdrw_pacing;
- log.u_bbr.pkts_out = rack->rc_tp->t_maxseg;
- TCP_LOG_EVENTP(rack->rc_tp, NULL,
- &rack->rc_inp->inp_socket->so_rcv,
- &rack->rc_inp->inp_socket->so_snd,
- BBR_LOG_HDWR_PACE, 0,
- 0, &log, false, &tv);
+ /*
+ * If timestamps were negotiated during SYN/ACK and a
+ * segment without a timestamp is received, silently drop
+ * the segment, unless it is a RST segment or missing timestamps are
+ * tolerated.
+ * See section 3.2 of RFC 7323.
+ */
+ if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS) &&
+ ((thflags & TH_RST) == 0) && (V_tcp_tolerate_missing_ts == 0)) {
+ way_out = 5;
+ retval = 0;
+ goto done_with_input;
}
-}
-
-static int32_t
-pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t segsiz)
-{
- uint64_t lentim, fill_bw;
- /* Lets first see if we are full, if so continue with normal rate */
- if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use)
- return (slot);
- if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd)
- return (slot);
- if (rack->r_ctl.rc_last_us_rtt == 0)
- return (slot);
- if (rack->rc_pace_fill_if_rttin_range &&
- (rack->r_ctl.rc_last_us_rtt >=
- (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) {
- /* The rtt is huge, N * smallest, lets not fill */
- return (slot);
+ /*
+ * Segment received on connection. Reset idle time and keep-alive
+ * timer. XXX: This should be done after segment validation to
+ * ignore broken/spoofed segs.
+ */
+ if (tp->t_idle_reduce &&
+ (tp->snd_max == tp->snd_una) &&
+ ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) {
+ counter_u64_add(rack_input_idle_reduces, 1);
+ rack_cc_after_idle(rack, tp);
}
+ tp->t_rcvtime = ticks;
+#ifdef STATS
+ stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin);
+#endif
+ if (tiwin > rack->r_ctl.rc_high_rwnd)
+ rack->r_ctl.rc_high_rwnd = tiwin;
/*
- * first lets calculate the b/w based on the last us-rtt
- * and the sndwnd.
+ * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move
+ * this to occur after we've validated the segment.
*/
- fill_bw = rack->r_ctl.cwnd_to_use;
- /* Take the rwnd if its smaller */
- if (fill_bw > rack->rc_tp->snd_wnd)
- fill_bw = rack->rc_tp->snd_wnd;
- fill_bw *= (uint64_t)HPTS_USEC_IN_SEC;
- fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt;
- /* We are below the min b/w */
- if (fill_bw < RACK_MIN_BW)
- return (slot);
+ if (tp->t_flags2 & TF2_ECN_PERMIT) {
+ if (thflags & TH_CWR) {
+ tp->t_flags2 &= ~TF2_ECN_SND_ECE;
+ tp->t_flags |= TF_ACKNOW;
+ }
+ switch (iptos & IPTOS_ECN_MASK) {
+ case IPTOS_ECN_CE:
+ tp->t_flags2 |= TF2_ECN_SND_ECE;
+ KMOD_TCPSTAT_INC(tcps_ecn_ce);
+ break;
+ case IPTOS_ECN_ECT0:
+ KMOD_TCPSTAT_INC(tcps_ecn_ect0);
+ break;
+ case IPTOS_ECN_ECT1:
+ KMOD_TCPSTAT_INC(tcps_ecn_ect1);
+ break;
+ }
+
+ /* Process a packet differently from RFC3168. */
+ cc_ecnpkt_handler(tp, th, iptos);
+
+ /* Congestion experienced. */
+ if (thflags & TH_ECE) {
+ rack_cong_signal(tp, CC_ECN, th->th_ack);
+ }
+ }
+
/*
- * Ok fill_bw holds our mythical b/w to fill the cwnd
- * in a rtt, what does that time wise equate too?
+ * If echoed timestamp is later than the current time, fall back to
+ * non RFC1323 RTT calculation. Normalize timestamp if syncookies
+ * were used when this connection was established.
*/
- lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC;
- lentim /= fill_bw;
- if (lentim < slot) {
- rack_log_pacing_delay_calc(rack, len, slot, fill_bw,
- 0, lentim, 12, __LINE__, NULL);
- return ((int32_t)lentim);
- } else
- return (slot);
-}
+ if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
+ to.to_tsecr -= tp->ts_offset;
+ if (TSTMP_GT(to.to_tsecr, cts))
+ to.to_tsecr = 0;
+ }
-static int32_t
-rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz)
-{
- struct rack_sendmap *lrsm;
- int32_t slot = 0;
- int err;
+ /*
+ * If its the first time in we need to take care of options and
+ * verify we can do SACK for rack!
+ */
+ if (rack->r_state == 0) {
+ /* Should be init'd by rack_init() */
+ KASSERT(rack->rc_inp != NULL,
+ ("%s: rack->rc_inp unexpectedly NULL", __func__));
+ if (rack->rc_inp == NULL) {
+ rack->rc_inp = tp->t_inpcb;
+ }
- if (rack->rc_always_pace == 0) {
/*
- * We use the most optimistic possible cwnd/srtt for
- * sending calculations. This will make our
- * calculation anticipate getting more through
- * quicker then possible. But thats ok we don't want
- * the peer to have a gap in data sending.
+ * Process options only when we get SYN/ACK back. The SYN
+ * case for incoming connections is handled in tcp_syncache.
+ * According to RFC1323 the window field in a SYN (i.e., a
+ * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX
+ * this is traditional behavior, may need to be cleaned up.
*/
- uint32_t srtt, cwnd, tr_perms = 0;
- int32_t reduce = 0;
-
- old_method:
- /*
- * We keep no precise pacing with the old method
- * instead we use the pacer to mitigate bursts.
- */
- rack->r_ctl.rc_agg_delayed = 0;
- rack->r_early = 0;
- rack->r_late = 0;
- rack->r_ctl.rc_agg_early = 0;
- if (rack->r_ctl.rc_rack_min_rtt)
- srtt = rack->r_ctl.rc_rack_min_rtt;
- else
- srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT));
- if (rack->r_ctl.rc_rack_largest_cwnd)
- cwnd = rack->r_ctl.rc_rack_largest_cwnd;
- else
- cwnd = rack->r_ctl.cwnd_to_use;
- tr_perms = cwnd / srtt;
- if (tr_perms == 0) {
- tr_perms = ctf_fixed_maxseg(tp);
- }
- /*
- * Calculate how long this will take to drain, if
- * the calculation comes out to zero, thats ok we
- * will use send_a_lot to possibly spin around for
- * more increasing tot_len_this_send to the point
- * that its going to require a pace, or we hit the
- * cwnd. Which in that case we are just waiting for
- * a ACK.
- */
- slot = len / tr_perms;
- /* Now do we reduce the time so we don't run dry? */
- if (slot && rack_slot_reduction) {
- reduce = (slot / rack_slot_reduction);
- if (reduce < slot) {
- slot -= reduce;
+ if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
+ /* Handle parallel SYN for ECN */
+ if (!(thflags & TH_ACK) &&
+ ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) &&
+ ((V_tcp_do_ecn == 1) || (V_tcp_do_ecn == 2))) {
+ tp->t_flags2 |= TF2_ECN_PERMIT;
+ tp->t_flags2 |= TF2_ECN_SND_ECE;
+ TCPSTAT_INC(tcps_ecn_shs);
+ }
+ if ((to.to_flags & TOF_SCALE) &&
+ (tp->t_flags & TF_REQ_SCALE)) {
+ tp->t_flags |= TF_RCVD_SCALE;
+ tp->snd_scale = to.to_wscale;
} else
- slot = 0;
- }
- slot *= HPTS_USEC_IN_MSEC;
- if (rsm == NULL) {
+ tp->t_flags &= ~TF_REQ_SCALE;
/*
- * We always consider ourselves app limited with old style
- * that are not retransmits. This could be the initial
- * measurement, but thats ok its all setup and specially
- * handled. If another send leaks out, then that too will
- * be mark app-limited.
+ * Initial send window. It will be updated with the
+ * next incoming segment to the scaled value.
*/
- lrsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
- if (lrsm && ((lrsm->r_flags & RACK_APP_LIMITED) == 0)) {
- rack->r_ctl.rc_first_appl = lrsm;
- lrsm->r_flags |= RACK_APP_LIMITED;
- rack->r_ctl.rc_app_limited_cnt++;
+ tp->snd_wnd = th->th_win;
+ rack_validate_fo_sendwin_up(tp, rack);
+ if ((to.to_flags & TOF_TS) &&
+ (tp->t_flags & TF_REQ_TSTMP)) {
+ tp->t_flags |= TF_RCVD_TSTMP;
+ tp->ts_recent = to.to_tsval;
+ tp->ts_recent_age = cts;
+ } else
+ tp->t_flags &= ~TF_REQ_TSTMP;
+ if (to.to_flags & TOF_MSS) {
+ tcp_mss(tp, to.to_mss);
}
- }
- rack_log_pacing_delay_calc(rack, len, slot, tr_perms, reduce, 0, 7, __LINE__, NULL);
- } else {
- uint64_t bw_est, res, lentim, rate_wanted;
- uint32_t orig_val, srtt, segs, oh;
+ if ((tp->t_flags & TF_SACK_PERMIT) &&
+ (to.to_flags & TOF_SACKPERM) == 0)
+ tp->t_flags &= ~TF_SACK_PERMIT;
+ if (IS_FASTOPEN(tp->t_flags)) {
+ if (to.to_flags & TOF_FASTOPEN) {
+ uint16_t mss;
- if ((rack->r_rr_config == 1) && rsm) {
- return (rack->r_ctl.rc_min_to * HPTS_USEC_IN_MSEC);
+ if (to.to_flags & TOF_MSS)
+ mss = to.to_mss;
+ else
+ if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
+ mss = TCP6_MSS;
+ else
+ mss = TCP_MSS;
+ tcp_fastopen_update_cache(tp, mss,
+ to.to_tfo_len, to.to_tfo_cookie);
+ } else
+ tcp_fastopen_disable_path(tp);
+ }
}
- if (rack->use_fixed_rate) {
- rate_wanted = bw_est = rack_get_fixed_pacing_bw(rack);
- } else if ((rack->r_ctl.init_rate == 0) &&
-#ifdef NETFLIX_PEAKRATE
- (rack->rc_tp->t_maxpeakrate == 0) &&
+ /*
+ * At this point we are at the initial call. Here we decide
+ * if we are doing RACK or not. We do this by seeing if
+ * TF_SACK_PERMIT is set and the sack-not-required is clear.
+ * The code now does do dup-ack counting so if you don't
+ * switch back you won't get rack & TLP, but you will still
+ * get this stack.
+ */
+
+ if ((rack_sack_not_required == 0) &&
+ ((tp->t_flags & TF_SACK_PERMIT) == 0)) {
+ tcp_switch_back_to_default(tp);
+ (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen,
+ tlen, iptos);
+#ifdef TCP_ACCOUNTING
+ sched_unpin();
#endif
- (rack->r_ctl.gp_bw == 0)) {
- /* no way to yet do an estimate */
- bw_est = rate_wanted = 0;
- } else {
- bw_est = rack_get_bw(rack);
- rate_wanted = rack_get_output_bw(rack, bw_est, rsm);
- }
- if ((bw_est == 0) || (rate_wanted == 0)) {
- /*
- * No way yet to make a b/w estimate or
- * our raise is set incorrectly.
- */
- goto old_method;
+ return (1);
}
- /* We need to account for all the overheads */
- segs = (len + segsiz - 1) / segsiz;
+ tcp_set_hpts(tp->t_inpcb);
+ sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack);
+ }
+ if (thflags & TH_FIN)
+ tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN);
+ us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
+ if ((rack->rc_gp_dyn_mul) &&
+ (rack->use_fixed_rate == 0) &&
+ (rack->rc_always_pace)) {
+ /* Check in on probertt */
+ rack_check_probe_rtt(rack, us_cts);
+ }
+ if (rack->forced_ack) {
+ uint32_t us_rtt;
+
/*
- * We need the diff between 1514 bytes (e-mtu with e-hdr)
- * and how much data we put in each packet. Yes this
- * means we may be off if we are larger than 1500 bytes
- * or smaller. But this just makes us more conservative.
+ * A persist or keep-alive was forced out, update our
+ * min rtt time. Note we do not worry about lost
+ * retransmissions since KEEP-ALIVES and persists
+		 * usually have very long times between sends (though
+ * if we were really paranoid or worried we could
+ * at least use timestamps if available to validate).
*/
- if (ETHERNET_SEGMENT_SIZE > segsiz)
- oh = ETHERNET_SEGMENT_SIZE - segsiz;
- else
- oh = 0;
- segs *= oh;
- lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC;
- res = lentim / rate_wanted;
- slot = (uint32_t)res;
- orig_val = rack->r_ctl.rc_pace_max_segs;
- rack_set_pace_segments(rack->rc_tp, rack, __LINE__);
- /* Did we change the TSO size, if so log it */
- if (rack->r_ctl.rc_pace_max_segs != orig_val)
- rack_log_pacing_delay_calc(rack, len, slot, orig_val, 0, 0, 15, __LINE__, NULL);
- if ((rack->rc_pace_to_cwnd) &&
- (rack->in_probe_rtt == 0) &&
- (IN_RECOVERY(rack->rc_tp->t_flags) == 0)) {
+ rack->forced_ack = 0;
+ us_rtt = us_cts - rack->r_ctl.forced_ack_ts;
+ if (us_rtt == 0)
+ us_rtt = 1;
+ rack_log_rtt_upd(tp, rack, us_rtt, 0, NULL, 3);
+ rack_apply_updated_usrtt(rack, us_rtt, us_cts);
+ }
+ /*
+ * This is the one exception case where we set the rack state
+ * always. All other times (timers etc) we must have a rack-state
+ * set (so we assure we have done the checks above for SACK).
+ */
+ rack->r_ctl.rc_rcvtime = cts;
+ if (rack->r_state != tp->t_state)
+ rack_set_state(tp, rack);
+ if (SEQ_GT(th->th_ack, tp->snd_una) &&
+ (rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree)) != NULL)
+ kern_prefetch(rsm, &prev_state);
+ prev_state = rack->r_state;
+ rack_clear_rate_sample(rack);
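+	/* Hand the segment to the per-state input handler for this connection. */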
+ retval = (*rack->r_substate) (m, th, so,
+ tp, &to, drop_hdrlen,
+ tlen, tiwin, thflags, nxt_pkt, iptos);
+#ifdef INVARIANTS
+ if ((retval == 0) &&
+ (tp->t_inpcb == NULL)) {
+ panic("retval:%d tp:%p t_inpcb:NULL state:%d",
+ retval, tp, prev_state);
+ }
+#endif
+ if (retval == 0) {
+ /*
+ * If retval is 1 the tcb is unlocked and most likely the tp
+ * is gone.
+ */
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ if ((rack->rc_gp_dyn_mul) &&
+ (rack->rc_always_pace) &&
+ (rack->use_fixed_rate == 0) &&
+ rack->in_probe_rtt &&
+ (rack->r_ctl.rc_time_probertt_starts == 0)) {
/*
- * We want to pace at our rate *or* faster to
- * fill the cwnd to the max if its not full.
+ * If we are going for target, lets recheck before
+ * we output.
*/
- slot = pace_to_fill_cwnd(rack, slot, (len+segs), segsiz);
+ rack_check_probe_rtt(rack, us_cts);
}
- if ((rack->rc_inp->inp_route.ro_nh != NULL) &&
- (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) {
- if ((rack->rack_hdw_pace_ena) &&
- (rack->rack_hdrw_pacing == 0) &&
- (rack->rack_attempt_hdwr_pace == 0)) {
- /*
- * Lets attempt to turn on hardware pacing
- * if we can.
- */
- rack->rack_attempt_hdwr_pace = 1;
- rack->r_ctl.crte = tcp_set_pacing_rate(rack->rc_tp,
- rack->rc_inp->inp_route.ro_nh->nh_ifp,
- rate_wanted,
- RS_PACING_GEQ,
- &err, NULL);
- if (rack->r_ctl.crte) {
- rack->rack_hdrw_pacing = 1;
- rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(rack->rc_tp, rate_wanted, segsiz,
- 0, rack->r_ctl.crte,
- NULL);
- rack_log_hdwr_pacing(rack, rack->rc_inp->inp_route.ro_nh->nh_ifp,
- rate_wanted, rack->r_ctl.crte->rate, __LINE__,
- err);
- }
- } else if (rack->rack_hdrw_pacing &&
- (rack->r_ctl.crte->rate != rate_wanted)) {
- /* Do we need to adjust our rate? */
- const struct tcp_hwrate_limit_table *nrte;
+ if (rack->set_pacing_done_a_iw == 0) {
+ /* How much has been acked? */
+ if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) {
+ /* We have enough to set in the pacing segment size */
+ rack->set_pacing_done_a_iw = 1;
+ rack_set_pace_segments(tp, rack, __LINE__, NULL);
+ }
+ }
+ tcp_rack_xmit_timer_commit(rack, tp);
+#ifdef TCP_ACCOUNTING
+ /*
+		 * If we set ack_val_set to the kind of ack processing we are doing
+ * we also want to track how many cycles we burned. Note
+ * the bits after tcp_output we let be "free". This is because
+ * we are also tracking the tcp_output times as well. Note the
+		 * use of 0xf here since we only have 11 counters (0 - 0xa) and
+		 * 0xf cannot be returned and is what we initialize it to, to
+ * indicate we are not doing the tabulations.
+ */
+ if (ack_val_set != 0xf) {
+ uint64_t crtsc;
- nrte = tcp_chg_pacing_rate(rack->r_ctl.crte,
- rack->rc_tp,
- rack->rc_inp->inp_route.ro_nh->nh_ifp,
- rate_wanted,
- RS_PACING_GEQ,
- &err, NULL);
- if (nrte == NULL) {
- /* Lost the rate */
- rack->rack_hdrw_pacing = 0;
- rack_set_pace_segments(rack->rc_tp, rack, __LINE__);
- } else if (nrte != rack->r_ctl.crte) {
- rack->r_ctl.crte = nrte;
- rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(rack->rc_tp, rate_wanted,
- segsiz, 0,
- rack->r_ctl.crte,
- NULL);
- rack_log_hdwr_pacing(rack, rack->rc_inp->inp_route.ro_nh->nh_ifp,
- rate_wanted, rack->r_ctl.crte->rate, __LINE__,
- err);
- }
+ crtsc = get_cyclecount();
+ counter_u64_add(tcp_proc_time[ack_val_set] , (crtsc - ts_val));
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_proc_time[ack_val_set] += (crtsc - ts_val);
}
}
- if (rack_limit_time_with_srtt &&
- (rack->use_fixed_rate == 0) &&
-#ifdef NETFLIX_PEAKRATE
- (rack->rc_tp->t_maxpeakrate == 0) &&
#endif
- (rack->rack_hdrw_pacing == 0)) {
+ if (nxt_pkt == 0) {
+ if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) {
+do_output_now:
+ did_out = 1;
+ (void)tp->t_fb->tfb_tcp_output(tp);
+ }
+ rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
+ rack_free_trim(rack);
+ }
+ if ((nxt_pkt == 0) &&
+ ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) &&
+ (SEQ_GT(tp->snd_max, tp->snd_una) ||
+ (tp->t_flags & TF_DELACK) ||
+ ((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
+ (tp->t_state <= TCPS_CLOSING)))) {
+ /* We could not send (probably in the hpts but stopped the timer earlier)? */
+ if ((tp->snd_max == tp->snd_una) &&
+ ((tp->t_flags & TF_DELACK) == 0) &&
+ (rack->rc_inp->inp_in_hpts) &&
+ (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
+ /* keep alive not needed if we are hptsi output yet */
+ ;
+ } else {
+ int late = 0;
+ if (rack->rc_inp->inp_in_hpts) {
+ if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
+ us_cts = tcp_get_usecs(NULL);
+ if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) {
+ rack->r_early = 1;
+ rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts);
+ } else
+ late = 1;
+ rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
+ }
+ tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT);
+ }
+ if (late && (did_out == 0)) {
+ /*
+ * We are late in the sending
+ * and we did not call the output
+ * (this probably should not happen).
+ */
+ goto do_output_now;
+ }
+ rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0);
+ }
+ way_out = 1;
+ } else if (nxt_pkt == 0) {
+ /* Do we have the correct timer running? */
+ rack_timer_audit(tp, rack, &so->so_snd);
+ way_out = 2;
+ }
+ done_with_input:
+ rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out, max(1, m->m_pkthdr.lro_nsegs));
+ if (did_out)
+ rack->r_wanted_output = 0;
+#ifdef INVARIANTS
+ if (tp->t_inpcb == NULL) {
+ panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d",
+ did_out,
+ retval, tp, prev_state);
+ }
+#endif
+#ifdef TCP_ACCOUNTING
+ } else {
+ /*
+ * Track the time (see above).
+ */
+ if (ack_val_set != 0xf) {
+ uint64_t crtsc;
+
+ crtsc = get_cyclecount();
+ counter_u64_add(tcp_proc_time[ack_val_set] , (crtsc - ts_val));
/*
- * Sanity check, we do not allow the pacing delay
- * to be longer than the SRTT of the path. If it is
- * a slow path, then adding a packet should increase
- * the RTT and compensate for this i.e. the srtt will
- * be greater so the allowed pacing time will be greater.
- *
- * Note this restriction is not for where a peak rate
- * is set, we are doing fixed pacing or hardware pacing.
+ * Note we *DO NOT* increment the per-tcb counters since
+ * in the else the TP may be gone!!
*/
- if (rack->rc_tp->t_srtt)
- srtt = (TICKS_2_USEC(rack->rc_tp->t_srtt) >> TCP_RTT_SHIFT);
- else
- srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC; /* its in ms convert */
- if (srtt < slot) {
- rack_log_pacing_delay_calc(rack, srtt, slot, rate_wanted, bw_est, lentim, 99, __LINE__, NULL);
- slot = srtt;
- }
}
- rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, bw_est, lentim, 2, __LINE__, rsm);
+#endif
}
- if (slot)
- counter_u64_add(rack_calc_nonzero, 1);
- else
- counter_u64_add(rack_calc_zero, 1);
- return (slot);
+#ifdef TCP_ACCOUNTING
+ sched_unpin();
+#endif
+ return (retval);
}
-static void
-rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack,
- tcp_seq startseq, uint32_t sb_offset)
+void
+rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
+ struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos)
{
- struct rack_sendmap *my_rsm = NULL;
- struct rack_sendmap fe;
+ struct timeval tv;
- if (tp->t_state < TCPS_ESTABLISHED) {
+ /* First lets see if we have old packets */
+ if (tp->t_in_pkt) {
+ if (ctf_do_queued_segments(so, tp, 1)) {
+ m_freem(m);
+ return;
+ }
+ }
+ if (m->m_flags & M_TSTMP_LRO) {
+ tv.tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000;
+ tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000) / 1000;
+ } else {
+ /* Should not happen; should we KASSERT instead? */
+ tcp_get_usecs(&tv);
+ }
+ if (rack_do_segment_nounlock(m, th, so, tp,
+ drop_hdrlen, tlen, iptos, 0, &tv) == 0) {
+ tcp_handle_wakeup(tp, so);
+ INP_WUNLOCK(tp->t_inpcb);
+ }
+}
+
+struct rack_sendmap *
+tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused)
+{
+ struct rack_sendmap *rsm = NULL;
+ int32_t idx;
+ uint32_t srtt = 0, thresh = 0, ts_low = 0;
+
+ /* Return the next guy to be re-transmitted */
+ if (RB_EMPTY(&rack->r_ctl.rc_mtree)) {
+ return (NULL);
+ }
+ if (tp->t_flags & TF_SENTFIN) {
+ /* retran the end FIN? */
+ return (NULL);
+ }
+ /* ok lets look at this one */
+ rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
+ if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) {
+ goto check_it;
+ }
+ rsm = rack_find_lowest_rsm(rack);
+ if (rsm == NULL) {
+ return (NULL);
+ }
+check_it:
+ if (((rack->rc_tp->t_flags & TF_SACK_PERMIT) == 0) &&
+ (rsm->r_dupack >= DUP_ACK_THRESHOLD)) {
/*
- * We don't start any measurements if we are
- * not at least established.
+ * No sack so we automatically do the 3 strikes and
+ * retransmit (no rack timer would be started).
*/
- return;
+
+ return (rsm);
}
- tp->t_flags |= TF_GPUTINPROG;
- rack->r_ctl.rc_gp_lowrtt = 0xffffffff;
- rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd;
- tp->gput_seq = startseq;
- rack->app_limited_needs_set = 0;
- if (rack->in_probe_rtt)
- rack->measure_saw_probe_rtt = 1;
- else if ((rack->measure_saw_probe_rtt) &&
- (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit)))
- rack->measure_saw_probe_rtt = 0;
- if (rack->rc_gp_filled)
- tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
- else {
- /* Special case initial measurement */
- rack->r_ctl.rc_gp_output_ts = tp->gput_ts = tcp_get_usecs(NULL);
+ if (rsm->r_flags & RACK_ACKED) {
+ return (NULL);
}
- /*
- * We take a guess out into the future,
- * if we have no measurement and no
- * initial rate, we measure the first
- * initial-windows worth of data to
- * speed up getting some GP measurement and
- * thus start pacing.
- */
- if ((rack->rc_gp_filled == 0) && (rack->r_ctl.init_rate == 0)) {
- rack->app_limited_needs_set = 1;
- tp->gput_ack = startseq + max(rc_init_window(rack),
- (MIN_GP_WIN * ctf_fixed_maxseg(tp)));
- rack_log_pacing_delay_calc(rack,
- tp->gput_seq,
- tp->gput_ack,
- 0,
- tp->gput_ts,
- rack->r_ctl.rc_app_limited_cnt,
- 9,
- __LINE__, NULL);
- return;
+ if (((rsm->r_flags & RACK_SACK_PASSED) == 0) &&
+ (rsm->r_dupack < DUP_ACK_THRESHOLD)) {
+ /* Its not yet ready */
+ return (NULL);
}
- if (sb_offset) {
+ srtt = rack_grab_rtt(tp, rack);
+ idx = rsm->r_rtr_cnt - 1;
+ ts_low = (uint32_t)rsm->r_tim_lastsent[idx];
+ thresh = rack_calc_thresh_rack(rack, srtt, tsused);
+ if ((tsused == ts_low) ||
+ (TSTMP_LT(tsused, ts_low))) {
+ /* No time since sending */
+ return (NULL);
+ }
+ if ((tsused - ts_low) < thresh) {
+ /* It has not been long enough yet */
+ return (NULL);
+ }
+ if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) ||
+ ((rsm->r_flags & RACK_SACK_PASSED) &&
+ (rack->sack_attack_disable == 0))) {
/*
- * We are out somewhere in the sb
- * can we use the already outstanding data?
+ * We have passed the dup-ack threshold <or>
+ * a SACK has indicated this is missing.
+ * Note that if you are a declared attacker
+ * it is only the dup-ack threshold that
+ * will cause retransmits.
*/
+ /* log retransmit reason */
+ rack_log_retran_reason(rack, rsm, (tsused - ts_low), thresh, 1);
+ rack->r_fast_output = 0;
+ return (rsm);
+ }
+ return (NULL);
+}
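+/*
+ * Illustrative trace of the checks above, with hypothetical numbers:
+ * if the segment was last sent at ts_low = 1000000 and
+ * rack_calc_thresh_rack() returns thresh = 50000 usecs, then a call
+ * with tsused = 1040000 returns NULL (not long enough yet), while a
+ * call with tsused = 1060000 can return the rsm, provided the dup-ack
+ * threshold has been reached or a SACK has passed the segment (and
+ * sack-attack detection is not disabling SACK-driven retransmits).
+ */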
- if (rack->r_ctl.rc_app_limited_cnt == 0) {
- /*
- * Yes first one is good and in this case
- * the tp->gput_ts is correctly set based on
- * the last ack that arrived (no need to
- * set things up when an ack comes in).
- */
- my_rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
- if ((my_rsm == NULL) ||
- (my_rsm->r_rtr_cnt != 1)) {
- /* retransmission? */
- goto use_latest;
- }
- } else {
- if (rack->r_ctl.rc_first_appl == NULL) {
- /*
- * If rc_first_appl is NULL
- * then the cnt should be 0.
- * This is probably an error, maybe
- * a KASSERT would be approprate.
- */
- goto use_latest;
- }
- /*
- * If we have a marker pointer to the last one that is
- * app limited we can use that, but we need to set
- * things up so that when it gets ack'ed we record
- * the ack time (if its not already acked).
- */
- rack->app_limited_needs_set = 1;
- /*
- * We want to get to the rsm that is either
- * next with space i.e. over 1 MSS or the one
- * after that (after the app-limited).
- */
- my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree,
- rack->r_ctl.rc_first_appl);
- if (my_rsm) {
- if ((my_rsm->r_end - my_rsm->r_start) <= ctf_fixed_maxseg(tp))
- /* Have to use the next one */
+static void
+rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot,
+ uint64_t bw_est, uint64_t bw, uint64_t len_time, int method,
+ int line, struct rack_sendmap *rsm)
+{
+ if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log, 0, sizeof(log));
+ log.u_bbr.flex1 = slot;
+ log.u_bbr.flex2 = len;
+ log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs;
+ log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs;
+ log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ss;
+ log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_ca;
+ log.u_bbr.use_lt_bw = rack->rc_ack_can_sendout_data;
+ log.u_bbr.use_lt_bw <<= 1;
+ log.u_bbr.use_lt_bw |= rack->r_late;
+ log.u_bbr.use_lt_bw <<= 1;
+ log.u_bbr.use_lt_bw |= rack->r_early;
+ log.u_bbr.use_lt_bw <<= 1;
+ log.u_bbr.use_lt_bw |= rack->app_limited_needs_set;
+ log.u_bbr.use_lt_bw <<= 1;
+ log.u_bbr.use_lt_bw |= rack->rc_gp_filled;
+ log.u_bbr.use_lt_bw <<= 1;
+ log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt;
+ log.u_bbr.use_lt_bw <<= 1;
+ log.u_bbr.use_lt_bw |= rack->in_probe_rtt;
+ log.u_bbr.use_lt_bw <<= 1;
+ log.u_bbr.use_lt_bw |= rack->gp_ready;
+ log.u_bbr.pkt_epoch = line;
+ log.u_bbr.epoch = rack->r_ctl.rc_agg_delayed;
+ log.u_bbr.lt_epoch = rack->r_ctl.rc_agg_early;
+ log.u_bbr.applimited = rack->r_ctl.rack_per_of_gp_rec;
+ log.u_bbr.bw_inuse = bw_est;
+ log.u_bbr.delRate = bw;
+ if (rack->r_ctl.gp_bw == 0)
+ log.u_bbr.cur_del_rate = 0;
+ else
+ log.u_bbr.cur_del_rate = rack_get_bw(rack);
+ log.u_bbr.rttProp = len_time;
+ log.u_bbr.pkts_out = rack->r_ctl.rc_rack_min_rtt;
+ log.u_bbr.lost = rack->r_ctl.rc_probertt_sndmax_atexit;
+ log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm);
+ if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) {
+ /* We are in slow start */
+ log.u_bbr.flex7 = 1;
+ } else {
+ /* we are on congestion avoidance */
+ log.u_bbr.flex7 = 0;
+ }
+ log.u_bbr.flex8 = method;
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ log.u_bbr.cwnd_gain = rack->rc_gp_saw_rec;
+ log.u_bbr.cwnd_gain <<= 1;
+ log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss;
+ log.u_bbr.cwnd_gain <<= 1;
+ log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca;
+ TCP_LOG_EVENTP(rack->rc_tp, NULL,
+ &rack->rc_inp->inp_socket->so_rcv,
+ &rack->rc_inp->inp_socket->so_snd,
+ BBR_LOG_HPTSI_CALC, 0,
+ 0, &log, false, &tv);
+ }
+}
+
+static uint32_t
+rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss)
+{
+ uint32_t new_tso, user_max;
+
+ user_max = rack->rc_user_set_max_segs * mss;
+ if (rack->rc_force_max_seg) {
+ return (user_max);
+ }
+ if (rack->use_fixed_rate &&
+ ((rack->r_ctl.crte == NULL) ||
+ (bw != rack->r_ctl.crte->rate))) {
+ /* Use the user mss since we are not exactly matched */
+ return (user_max);
+ }
+ new_tso = tcp_get_pacing_burst_size(rack->rc_tp, bw, mss, rack_pace_one_seg, rack->r_ctl.crte, NULL);
+ if (new_tso > user_max)
+ new_tso = user_max;
+ return (new_tso);
+}
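+/*
+ * Example with hypothetical numbers: with rc_user_set_max_segs = 40
+ * and an mss of 1448, user_max = 57920 bytes. When forcing a max seg
+ * or running a fixed rate that does not match the hardware rate entry,
+ * that value is returned directly; otherwise the burst size suggested
+ * by tcp_get_pacing_burst_size() is used, clamped to 57920.
+ */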
+
+static int32_t
+pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t segsiz, int *capped, uint64_t *rate_wanted, uint8_t non_paced)
+{
+ uint64_t lentim, fill_bw;
+
+ /* Lets first see if we are full, if so continue with normal rate */
+ rack->r_via_fill_cw = 0;
+ if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use)
+ return (slot);
+ if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd)
+ return (slot);
+ if (rack->r_ctl.rc_last_us_rtt == 0)
+ return (slot);
+ if (rack->rc_pace_fill_if_rttin_range &&
+ (rack->r_ctl.rc_last_us_rtt >=
+ (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) {
+ /* The rtt is huge, N * smallest, lets not fill */
+ return (slot);
+ }
+ /*
+ * first lets calculate the b/w based on the last us-rtt
+ * and the sndwnd.
+ */
+ fill_bw = rack->r_ctl.cwnd_to_use;
+ /* Take the rwnd if its smaller */
+ if (fill_bw > rack->rc_tp->snd_wnd)
+ fill_bw = rack->rc_tp->snd_wnd;
+ if (rack->r_fill_less_agg) {
+ /*
+ * Now take away the inflight (this will reduce our
+ * aggressiveness and yeah, if we get that much out in 1RTT
+ * we will have had acks come back and still be behind).
+ */
+ fill_bw -= ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ }
+ /* Now lets make it into a b/w */
+ fill_bw *= (uint64_t)HPTS_USEC_IN_SEC;
+ fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt;
+ /* We are below the min b/w */
+ if (non_paced)
+ *rate_wanted = fill_bw;
+ if ((fill_bw < RACK_MIN_BW) || (fill_bw < *rate_wanted))
+ return (slot);
+ if (rack->r_ctl.bw_rate_cap && (fill_bw > rack->r_ctl.bw_rate_cap))
+ fill_bw = rack->r_ctl.bw_rate_cap;
+ rack->r_via_fill_cw = 1;
+ if (rack->r_rack_hw_rate_caps &&
+ (rack->r_ctl.crte != NULL)) {
+ uint64_t high_rate;
+
+ high_rate = tcp_hw_highest_rate(rack->r_ctl.crte);
+ if (fill_bw > high_rate) {
+ /* We are capping bw at the highest rate table entry */
+ if (*rate_wanted > high_rate) {
+ /* The original rate was also capped */
+ rack->r_via_fill_cw = 0;
+ }
+ rack_log_hdwr_pacing(rack,
+ fill_bw, high_rate, __LINE__,
+ 0, 3);
+ fill_bw = high_rate;
+ if (capped)
+ *capped = 1;
+ }
+ } else if ((rack->r_ctl.crte == NULL) &&
+ (rack->rack_hdrw_pacing == 0) &&
+ (rack->rack_hdw_pace_ena) &&
+ rack->r_rack_hw_rate_caps &&
+ (rack->rack_attempt_hdwr_pace == 0) &&
+ (rack->rc_inp->inp_route.ro_nh != NULL) &&
+ (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) {
+ /*
+ * Ok we may have a first attempt that is greater than our top rate
+ * lets check.
+ */
+ uint64_t high_rate;
+
+ high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp);
+ if (high_rate) {
+ if (fill_bw > high_rate) {
+ fill_bw = high_rate;
+ if (capped)
+ *capped = 1;
+ }
+ }
+ }
+ /*
+ * Ok fill_bw holds our mythical b/w to fill the cwnd
+ * in a rtt, what does that time wise equate too?
+ */
+ lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC;
+ lentim /= fill_bw;
+ *rate_wanted = fill_bw;
+ if (non_paced || (lentim < slot)) {
+ rack_log_pacing_delay_calc(rack, len, slot, fill_bw,
+ 0, lentim, 12, __LINE__, NULL);
+ return ((int32_t)lentim);
+ } else
+ return (slot);
+}
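+/*
+ * Illustrative sketch of the fill-cw math above, using hypothetical
+ * numbers: cwnd_to_use = 100000 bytes and snd_wnd = 80000 give
+ * fill_bw = 80000; with r_fill_less_agg set and 30000 bytes in flight
+ * fill_bw drops to 50000. With last_us_rtt = 20000 usecs,
+ *     fill_bw = 50000 * HPTS_USEC_IN_SEC / 20000 = 2500000 bytes/sec
+ * and for len = 14480 (ten 1448-byte segments)
+ *     lentim = 14480 * HPTS_USEC_IN_SEC / 2500000 = 5792 usecs.
+ * If 5792 is smaller than the normally calculated slot, the smaller
+ * (faster) value is used so the cwnd can be filled in roughly one RTT.
+ */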
+
+static int32_t
+rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz)
+{
+ struct rack_sendmap *lrsm;
+ int32_t slot = 0;
+ int can_start_hw_pacing = 1;
+ int err;
+
+ if (rack->rc_always_pace == 0) {
+ /*
+ * We use the most optimistic possible cwnd/srtt for
+ * sending calculations. This will make our
+ * calculation anticipate getting more through
+ * quicker than possible. But that's ok; we don't want
+ * the peer to have a gap in data sending.
+ */
+ uint32_t srtt, cwnd, tr_perms = 0;
+ int32_t reduce = 0;
+
+ old_method:
+ /*
+ * We keep no precise pacing with the old method;
+ * instead we use the pacer to mitigate bursts.
+ */
+ if (rack->r_ctl.rc_rack_min_rtt)
+ srtt = rack->r_ctl.rc_rack_min_rtt;
+ else
+ srtt = max(tp->t_srtt, 1);
+ if (rack->r_ctl.rc_rack_largest_cwnd)
+ cwnd = rack->r_ctl.rc_rack_largest_cwnd;
+ else
+ cwnd = rack->r_ctl.cwnd_to_use;
+ /* Inflate cwnd by 1000 so srtt of usecs is in ms */
+ tr_perms = (cwnd * 1000) / srtt;
+ if (tr_perms == 0) {
+ tr_perms = ctf_fixed_maxseg(tp);
+ }
+ /*
+ * Calculate how long this will take to drain. If
+ * the calculation comes out to zero, that's ok; we
+ * will use send_a_lot to possibly spin around for
+ * more, increasing tot_len_this_send to the point
+ * that it's going to require a pace, or we hit the
+ * cwnd, in which case we are just waiting for
+ * an ACK.
+ */
+ slot = len / tr_perms;
+ /* Now do we reduce the time so we don't run dry? */
+ if (slot && rack_slot_reduction) {
+ reduce = (slot / rack_slot_reduction);
+ if (reduce < slot) {
+ slot -= reduce;
+ } else
+ slot = 0;
+ }
+ slot *= HPTS_USEC_IN_MSEC;
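+ /*
+ * Worked example of the old method with hypothetical numbers:
+ * cwnd = 50000 bytes and srtt = 25000 usecs give
+ * tr_perms = (50000 * 1000) / 25000 = 2000 bytes per msec.
+ * For len = 14480, slot = 14480 / 2000 = 7 msecs; with a
+ * hypothetical rack_slot_reduction of 4 the reduce is 7 / 4 = 1,
+ * leaving slot = 6, or 6000 usecs after the HPTS_USEC_IN_MSEC
+ * conversion.
+ */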
+ if (rsm == NULL) {
+ /*
+ * With the old style we always consider sends that are
+ * not retransmits to be app limited. This could be the initial
+ * measurement, but that's ok; it's all set up and specially
+ * handled. If another send leaks out, then that too will
+ * be marked app-limited.
+ */
+ lrsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
+ if (lrsm && ((lrsm->r_flags & RACK_APP_LIMITED) == 0)) {
+ rack->r_ctl.rc_first_appl = lrsm;
+ lrsm->r_flags |= RACK_APP_LIMITED;
+ rack->r_ctl.rc_app_limited_cnt++;
+ }
+ }
+ if (rack->rc_pace_to_cwnd) {
+ uint64_t rate_wanted = 0;
+
+ slot = pace_to_fill_cwnd(rack, slot, len, segsiz, NULL, &rate_wanted, 1);
+ rack->rc_ack_can_sendout_data = 1;
+ rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, 0, 0, 14, __LINE__, NULL);
+ } else
+ rack_log_pacing_delay_calc(rack, len, slot, tr_perms, reduce, 0, 7, __LINE__, NULL);
+ } else {
+ uint64_t bw_est, res, lentim, rate_wanted;
+ uint32_t orig_val, srtt, segs, oh;
+ int capped = 0;
+ int prev_fill;
+
+ if ((rack->r_rr_config == 1) && rsm) {
+ return (rack->r_ctl.rc_min_to);
+ }
+ if (rack->use_fixed_rate) {
+ rate_wanted = bw_est = rack_get_fixed_pacing_bw(rack);
+ } else if ((rack->r_ctl.init_rate == 0) &&
+#ifdef NETFLIX_PEAKRATE
+ (rack->rc_tp->t_maxpeakrate == 0) &&
+#endif
+ (rack->r_ctl.gp_bw == 0)) {
+ /* no way yet to do an estimate */
+ bw_est = rate_wanted = 0;
+ } else {
+ bw_est = rack_get_bw(rack);
+ rate_wanted = rack_get_output_bw(rack, bw_est, rsm, &capped);
+ }
+ if ((bw_est == 0) || (rate_wanted == 0) ||
+ ((rack->gp_ready == 0) && (rack->use_fixed_rate == 0))) {
+ /*
+ * No way yet to make a b/w estimate or
+ * our rate is set incorrectly.
+ */
+ goto old_method;
+ }
+ /* We need to account for all the overheads */
+ segs = (len + segsiz - 1) / segsiz;
+ /*
+ * We need the diff between 1514 bytes (e-mtu with e-hdr)
+ * and how much data we put in each packet. Yes this
+ * means we may be off if we are larger than 1500 bytes
+ * or smaller. But this just makes us more conservative.
+ */
+ if (rack_hw_rate_min &&
+ (bw_est < rack_hw_rate_min))
+ can_start_hw_pacing = 0;
+ if (ETHERNET_SEGMENT_SIZE > segsiz)
+ oh = ETHERNET_SEGMENT_SIZE - segsiz;
+ else
+ oh = 0;
+ segs *= oh;
+ lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC;
+ res = lentim / rate_wanted;
+ slot = (uint32_t)res;
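+ /*
+ * Example of the overhead adjustment, with hypothetical numbers:
+ * len = 14480 and segsiz = 1448 give segs = 10. Assuming
+ * ETHERNET_SEGMENT_SIZE is 1514, oh = 1514 - 1448 = 66 and segs
+ * becomes 660 bytes of header overhead. With a rate_wanted of
+ * 2500000 bytes/sec:
+ *     lentim = (14480 + 660) * HPTS_USEC_IN_SEC = 15140000000
+ *     slot = lentim / 2500000 = 6056 usecs
+ * i.e. the pacing delay accounts for the wire size of each
+ * segment, not just the TCP payload.
+ */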
+ orig_val = rack->r_ctl.rc_pace_max_segs;
+ if (rack->r_ctl.crte == NULL) {
+ /*
+ * Only do this if we are not hardware pacing
+ * since if we are doing hw-pacing below we will
+ * make this call after setting up or changing
+ * the rate.
+ */
+ rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
+ } else if (rack->rc_inp->inp_snd_tag == NULL) {
+ /*
+ * We lost our rate somehow, this can happen
+ * if the interface changed underneath us.
+ */
+ tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp);
+ rack->r_ctl.crte = NULL;
+ /* Lets re-allow attempting to setup pacing */
+ rack->rack_hdrw_pacing = 0;
+ rack->rack_attempt_hdwr_pace = 0;
+ rack_log_hdwr_pacing(rack,
+ rate_wanted, bw_est, __LINE__,
+ 0, 6);
+ }
+ /* Did we change the TSO size, if so log it */
+ if (rack->r_ctl.rc_pace_max_segs != orig_val)
+ rack_log_pacing_delay_calc(rack, len, slot, orig_val, 0, 0, 15, __LINE__, NULL);
+ prev_fill = rack->r_via_fill_cw;
+ if ((rack->rc_pace_to_cwnd) &&
+ (capped == 0) &&
+ (rack->use_fixed_rate == 0) &&
+ (rack->in_probe_rtt == 0) &&
+ (IN_FASTRECOVERY(rack->rc_tp->t_flags) == 0)) {
+ /*
+ * We want to pace at our rate *or* faster to
+ * fill the cwnd to the max if its not full.
+ */
+ slot = pace_to_fill_cwnd(rack, slot, (len+segs), segsiz, &capped, &rate_wanted, 0);
+ }
+ if ((rack->rc_inp->inp_route.ro_nh != NULL) &&
+ (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) {
+ if ((rack->rack_hdw_pace_ena) &&
+ (can_start_hw_pacing > 0) &&
+ (rack->rack_hdrw_pacing == 0) &&
+ (rack->rack_attempt_hdwr_pace == 0)) {
+ /*
+ * Lets attempt to turn on hardware pacing
+ * if we can.
+ */
+ rack->rack_attempt_hdwr_pace = 1;
+ rack->r_ctl.crte = tcp_set_pacing_rate(rack->rc_tp,
+ rack->rc_inp->inp_route.ro_nh->nh_ifp,
+ rate_wanted,
+ RS_PACING_GEQ,
+ &err, &rack->r_ctl.crte_prev_rate);
+ if (rack->r_ctl.crte) {
+ rack->rack_hdrw_pacing = 1;
+ rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(tp, rate_wanted, segsiz,
+ 0, rack->r_ctl.crte,
+ NULL);
+ rack_log_hdwr_pacing(rack,
+ rate_wanted, rack->r_ctl.crte->rate, __LINE__,
+ err, 0);
+ rack->r_ctl.last_hw_bw_req = rate_wanted;
+ } else {
+ counter_u64_add(rack_hw_pace_init_fail, 1);
+ }
+ } else if (rack->rack_hdrw_pacing &&
+ (rack->r_ctl.last_hw_bw_req != rate_wanted)) {
+ /* Do we need to adjust our rate? */
+ const struct tcp_hwrate_limit_table *nrte;
+
+ if (rack->r_up_only &&
+ (rate_wanted < rack->r_ctl.crte->rate)) {
+ /**
+ * We have four possible states here
+ * having to do with the previous time
+ * and this time.
+ * previous | this-time
+ * A) 0 | 0 -- fill_cw not in the picture
+ * B) 1 | 0 -- we were doing a fill-cw but now are not
+ * C) 1 | 1 -- all rates from fill_cw
+ * D) 0 | 1 -- we were doing non-fill and now we are filling
+ *
+ * For cases A, C and D we don't allow a drop. But for
+ * case B, where we are now back on our steady rate, we do
+ * allow a drop.
+ *
+ */
+ if (!((prev_fill == 1) && (rack->r_via_fill_cw == 0)))
+ goto done_w_hdwr;
+ }
+ if ((rate_wanted > rack->r_ctl.crte->rate) ||
+ (rate_wanted <= rack->r_ctl.crte_prev_rate)) {
+ if (rack_hw_rate_to_low &&
+ (bw_est < rack_hw_rate_to_low)) {
+ /*
+ * The pacing rate is too low for hardware, but
+ * do allow hardware pacing to be restarted.
+ */
+ rack_log_hdwr_pacing(rack,
+ bw_est, rack->r_ctl.crte->rate, __LINE__,
+ 0, 5);
+ tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp);
+ rack->r_ctl.crte = NULL;
+ rack->rack_attempt_hdwr_pace = 0;
+ rack->rack_hdrw_pacing = 0;
+ rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted);
+ goto done_w_hdwr;
+ }
+ nrte = tcp_chg_pacing_rate(rack->r_ctl.crte,
+ rack->rc_tp,
+ rack->rc_inp->inp_route.ro_nh->nh_ifp,
+ rate_wanted,
+ RS_PACING_GEQ,
+ &err, &rack->r_ctl.crte_prev_rate);
+ if (nrte == NULL) {
+ /* Lost the rate */
+ rack->rack_hdrw_pacing = 0;
+ rack->r_ctl.crte = NULL;
+ rack_log_hdwr_pacing(rack,
+ rate_wanted, 0, __LINE__,
+ err, 1);
+ rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted);
+ counter_u64_add(rack_hw_pace_lost, 1);
+ } else if (nrte != rack->r_ctl.crte) {
+ rack->r_ctl.crte = nrte;
+ rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(tp, rate_wanted,
+ segsiz, 0,
+ rack->r_ctl.crte,
+ NULL);
+ rack_log_hdwr_pacing(rack,
+ rate_wanted, rack->r_ctl.crte->rate, __LINE__,
+ err, 2);
+ rack->r_ctl.last_hw_bw_req = rate_wanted;
+ }
+ } else {
+ /* We just need to adjust the segment size */
+ rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted);
+ rack_log_hdwr_pacing(rack,
+ rate_wanted, rack->r_ctl.crte->rate, __LINE__,
+ 0, 4);
+ rack->r_ctl.last_hw_bw_req = rate_wanted;
+ }
+ }
+ }
+ if ((rack->r_ctl.crte != NULL) &&
+ (rack->r_ctl.crte->rate == rate_wanted)) {
+ /*
+ * We need to add an extra slot if the rates
+ * are exactly matched. The idea is that
+ * we want the software to make sure the
+ * queue is empty before adding more; this
+ * gives us N MSS of extra pace time, where
+ * N is our sysctl.
+ */
+ slot += (rack->r_ctl.crte->time_between * rack_hw_pace_extra_slots);
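+ /*
+ * Example with hypothetical numbers: if the hardware queue
+ * drains one MSS every time_between = 100 usecs and the
+ * extra-slots sysctl is 2, then 200 usecs are added to the
+ * slot so the software pacer stays slightly behind the
+ * hardware queue.
+ */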
+ }
+done_w_hdwr:
+ if (rack_limit_time_with_srtt &&
+ (rack->use_fixed_rate == 0) &&
+#ifdef NETFLIX_PEAKRATE
+ (rack->rc_tp->t_maxpeakrate == 0) &&
+#endif
+ (rack->rack_hdrw_pacing == 0)) {
+ /*
+ * Sanity check, we do not allow the pacing delay
+ * to be longer than the SRTT of the path. If it is
+ * a slow path, then adding a packet should increase
+ * the RTT and compensate for this i.e. the srtt will
+ * be greater so the allowed pacing time will be greater.
+ *
+ * Note this restriction is not for where a peak rate
+ * is set, we are doing fixed pacing or hardware pacing.
+ */
+ if (rack->rc_tp->t_srtt)
+ srtt = rack->rc_tp->t_srtt;
+ else
+ srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC; /* it's in ms, convert to usecs */
+ if (srtt < slot) {
+ rack_log_pacing_delay_calc(rack, srtt, slot, rate_wanted, bw_est, lentim, 99, __LINE__, NULL);
+ slot = srtt;
+ }
+ }
+ rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, bw_est, lentim, 2, __LINE__, rsm);
+ }
+ if (rack->r_ctl.crte && (rack->r_ctl.crte->rs_num_enobufs > 0)) {
+ /*
+ * If this rate is seeing enobufs when it
+ * goes to send then either the nic is out
+ * of gas or we are mis-estimating the time
+ * somehow and not letting the queue empty
+ * completely. Lets add to the pacing time.
+ */
+ int hw_boost_delay;
+
+ hw_boost_delay = rack->r_ctl.crte->time_between * rack_enobuf_hw_boost_mult;
+ if (hw_boost_delay > rack_enobuf_hw_max)
+ hw_boost_delay = rack_enobuf_hw_max;
+ else if (hw_boost_delay < rack_enobuf_hw_min)
+ hw_boost_delay = rack_enobuf_hw_min;
+ slot += hw_boost_delay;
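+ /*
+ * Example with hypothetical values: time_between = 100 usecs
+ * and rack_enobuf_hw_boost_mult = 2 give a 200 usec boost,
+ * which is then clamped into [rack_enobuf_hw_min,
+ * rack_enobuf_hw_max] before being added to the slot.
+ */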
+ }
+ if (slot)
+ counter_u64_add(rack_calc_nonzero, 1);
+ else
+ counter_u64_add(rack_calc_zero, 1);
+ return (slot);
+}
+
+static void
+rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack,
+ tcp_seq startseq, uint32_t sb_offset)
+{
+ struct rack_sendmap *my_rsm = NULL;
+ struct rack_sendmap fe;
+
+ if (tp->t_state < TCPS_ESTABLISHED) {
+ /*
+ * We don't start any measurements if we are
+ * not at least established.
+ */
+ return;
+ }
+ tp->t_flags |= TF_GPUTINPROG;
+ rack->r_ctl.rc_gp_lowrtt = 0xffffffff;
+ rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd;
+ tp->gput_seq = startseq;
+ rack->app_limited_needs_set = 0;
+ if (rack->in_probe_rtt)
+ rack->measure_saw_probe_rtt = 1;
+ else if ((rack->measure_saw_probe_rtt) &&
+ (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit)))
+ rack->measure_saw_probe_rtt = 0;
+ if (rack->rc_gp_filled)
+ tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
+ else {
+ /* Special case initial measurement */
+ struct timeval tv;
+
+ tp->gput_ts = tcp_get_usecs(&tv);
+ rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv);
+ }
+ /*
+ * We take a guess out into the future:
+ * if we have no measurement and no
+ * initial rate, we measure the first
+ * initial-window's worth of data to
+ * speed up getting some GP measurement and
+ * thus start pacing.
+ */
+ if ((rack->rc_gp_filled == 0) && (rack->r_ctl.init_rate == 0)) {
+ rack->app_limited_needs_set = 1;
+ tp->gput_ack = startseq + max(rc_init_window(rack),
+ (MIN_GP_WIN * ctf_fixed_maxseg(tp)));
+ rack_log_pacing_delay_calc(rack,
+ tp->gput_seq,
+ tp->gput_ack,
+ 0,
+ tp->gput_ts,
+ rack->r_ctl.rc_app_limited_cnt,
+ 9,
+ __LINE__, NULL);
+ return;
+ }
+ if (sb_offset) {
+ /*
+ * We are out somewhere in the sb
+ * can we use the already outstanding data?
+ */
+
+ if (rack->r_ctl.rc_app_limited_cnt == 0) {
+ /*
+ * Yes first one is good and in this case
+ * the tp->gput_ts is correctly set based on
+ * the last ack that arrived (no need to
+ * set things up when an ack comes in).
+ */
+ my_rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
+ if ((my_rsm == NULL) ||
+ (my_rsm->r_rtr_cnt != 1)) {
+ /* retransmission? */
+ goto use_latest;
+ }
+ } else {
+ if (rack->r_ctl.rc_first_appl == NULL) {
+ /*
+ * If rc_first_appl is NULL
+ * then the cnt should be 0.
+ * This is probably an error, maybe
+ * a KASSERT would be appropriate.
+ */
+ goto use_latest;
+ }
+ /*
+ * If we have a marker pointer to the last one that is
+ * app limited we can use that, but we need to set
+ * things up so that when it gets ack'ed we record
+ * the ack time (if its not already acked).
+ */
+ rack->app_limited_needs_set = 1;
+ /*
+ * We want to get to the rsm that is either
+ * next with space i.e. over 1 MSS or the one
+ * after that (after the app-limited).
+ */
+ my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree,
+ rack->r_ctl.rc_first_appl);
+ if (my_rsm) {
+ if ((my_rsm->r_end - my_rsm->r_start) <= ctf_fixed_maxseg(tp))
+ /* Have to use the next one */
my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree,
my_rsm);
else {
@@ -11754,121 +14660,1231 @@
goto start_set;
}
}
- if ((my_rsm == NULL) ||
- (my_rsm->r_rtr_cnt != 1)) {
- /*
- * Either its a retransmit or
- * the last is the app-limited one.
- */
- goto use_latest;
+ if ((my_rsm == NULL) ||
+ (my_rsm->r_rtr_cnt != 1)) {
+ /*
+ * Either its a retransmit or
+ * the last is the app-limited one.
+ */
+ goto use_latest;
+ }
+ }
+ tp->gput_seq = my_rsm->r_start;
+start_set:
+ if (my_rsm->r_flags & RACK_ACKED) {
+ /*
+ * This one has been acked; use the arrival ack time
+ */
+ tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival;
+ rack->app_limited_needs_set = 0;
+ }
+ rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[(my_rsm->r_rtr_cnt-1)];
+ tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack);
+ rack_log_pacing_delay_calc(rack,
+ tp->gput_seq,
+ tp->gput_ack,
+ (uint64_t)my_rsm,
+ tp->gput_ts,
+ rack->r_ctl.rc_app_limited_cnt,
+ 9,
+ __LINE__, NULL);
+ return;
+ }
+
+use_latest:
+ /*
+ * We don't know how long we may have been
+ * idle or if this is the first-send. Lets
+ * setup the flag so we will trim off
+ * the first ack'd data so we get a true
+ * measurement.
+ */
+ rack->app_limited_needs_set = 1;
+ tp->gput_ack = startseq + rack_get_measure_window(tp, rack);
+ /* Find this guy so we can pull the send time */
+ fe.r_start = startseq;
+ my_rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
+ if (my_rsm) {
+ rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[(my_rsm->r_rtr_cnt-1)];
+ if (my_rsm->r_flags & RACK_ACKED) {
+ /*
+ * Unlikely since its probably what was
+ * just transmitted (but I am paranoid).
+ */
+ tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival;
+ rack->app_limited_needs_set = 0;
+ }
+ if (SEQ_LT(my_rsm->r_start, tp->gput_seq)) {
+ /* This also is unlikely */
+ tp->gput_seq = my_rsm->r_start;
+ }
+ } else {
+ /*
+ * TSNH unless we have some send-map limit,
+ * and even at that it should not be hitting
+ * that limit (we should have stopped sending).
+ */
+ struct timeval tv;
+
+ microuptime(&tv);
+ rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv);
+ }
+ rack_log_pacing_delay_calc(rack,
+ tp->gput_seq,
+ tp->gput_ack,
+ (uint64_t)my_rsm,
+ tp->gput_ts,
+ rack->r_ctl.rc_app_limited_cnt,
+ 9, __LINE__, NULL);
+}
+
+static inline uint32_t
+rack_what_can_we_send(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cwnd_to_use,
+ uint32_t avail, int32_t sb_offset)
+{
+ uint32_t len;
+ uint32_t sendwin;
+
+ if (tp->snd_wnd > cwnd_to_use)
+ sendwin = cwnd_to_use;
+ else
+ sendwin = tp->snd_wnd;
+ if (ctf_outstanding(tp) >= tp->snd_wnd) {
+ /* We never want to go over our peer's rcv-window */
+ len = 0;
+ } else {
+ uint32_t flight;
+
+ flight = ctf_flight_size(tp, rack->r_ctl.rc_sacked);
+ if (flight >= sendwin) {
+ /*
+ * We have in flight what we are allowed by cwnd (if
+ * it was rwnd blocking, it would have hit our
+ * >= tp->snd_wnd check above).
+ */
+ return (0);
+ }
+ len = sendwin - flight;
+ if ((len + ctf_outstanding(tp)) > tp->snd_wnd) {
+ /* We would send too much (beyond the rwnd) */
+ len = tp->snd_wnd - ctf_outstanding(tp);
+ }
+ if ((len + sb_offset) > avail) {
+ /*
+ * We don't have that much in the SB, how much is
+ * there?
+ */
+ len = avail - sb_offset;
+ }
+ }
+ return (len);
+}
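+/*
+ * Illustrative walk through rack_what_can_we_send() with hypothetical
+ * numbers: cwnd_to_use = 60000 and snd_wnd = 50000 give sendwin = 50000.
+ * With 20000 bytes outstanding and 18000 bytes in flight (2000 sacked),
+ * len = 50000 - 18000 = 32000; since 32000 + 20000 > snd_wnd it is
+ * trimmed to 50000 - 20000 = 30000, and if only 45000 bytes are in the
+ * socket buffer with sb_offset = 20000 it is further trimmed to
+ * 45000 - 20000 = 25000 bytes.
+ */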
+
+static void
+rack_log_fsb(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t flags,
+ unsigned ipoptlen, int32_t orig_len, int32_t len, int error,
+ int rsm_is_null, int optlen, int line, uint16_t mode)
+{
+ if (tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
+ log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+ log.u_bbr.flex1 = error;
+ log.u_bbr.flex2 = flags;
+ log.u_bbr.flex3 = rsm_is_null;
+ log.u_bbr.flex4 = ipoptlen;
+ log.u_bbr.flex5 = tp->rcv_numsacks;
+ log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
+ log.u_bbr.flex7 = optlen;
+ log.u_bbr.flex8 = rack->r_fsb_inited;
+ log.u_bbr.applimited = rack->r_fast_output;
+ log.u_bbr.bw_inuse = rack_get_bw(rack);
+ log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL);
+ log.u_bbr.cwnd_gain = mode;
+ log.u_bbr.pkts_out = orig_len;
+ log.u_bbr.lt_epoch = len;
+ log.u_bbr.delivered = line;
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ tcp_log_event_(tp, NULL, &so->so_rcv, &so->so_snd, TCP_LOG_FSB, 0,
+ len, &log, false, NULL, NULL, 0, &tv);
+ }
+}
+
+
+static struct mbuf *
+rack_fo_base_copym(struct mbuf *the_m, uint32_t the_off, int32_t *plen,
+ struct rack_fast_send_blk *fsb,
+ int32_t seglimit, int32_t segsize)
+{
+#ifdef KERN_TLS
+ struct ktls_session *tls, *ntls;
+ struct mbuf *start;
+#endif
+ struct mbuf *m, *n, **np, *smb;
+ struct mbuf *top;
+ int32_t off, soff;
+ int32_t len = *plen;
+ int32_t fragsize;
+ int32_t len_cp = 0;
+ uint32_t mlen, frags;
+
+ soff = off = the_off;
+ smb = m = the_m;
+ np = &top;
+ top = NULL;
+#ifdef KERN_TLS
+ if (hw_tls && (m->m_flags & M_EXTPG))
+ tls = m->m_epg_tls;
+ else
+ tls = NULL;
+ start = m;
+#endif
+ while (len > 0) {
+ if (m == NULL) {
+ *plen = len_cp;
+ break;
+ }
+#ifdef KERN_TLS
+ if (hw_tls) {
+ if (m->m_flags & M_EXTPG)
+ ntls = m->m_epg_tls;
+ else
+ ntls = NULL;
+
+ /*
+ * Avoid mixing TLS records with handshake
+ * data or TLS records from different
+ * sessions.
+ */
+ if (tls != ntls) {
+ MPASS(m != start);
+ *plen = len_cp;
+ break;
+ }
+ }
+#endif
+ mlen = min(len, m->m_len - off);
+ if (seglimit) {
+ /*
+ * For M_EXTPG mbufs, add 3 segments
+ * + 1 in case we are crossing page boundaries
+ * + 2 in case the TLS hdr/trailer are used
+ * It is cheaper to just add the segments
+ * than it is to take the cache miss to look
+ * at the mbuf ext_pgs state in detail.
+ */
+ if (m->m_flags & M_EXTPG) {
+ fragsize = min(segsize, PAGE_SIZE);
+ frags = 3;
+ } else {
+ fragsize = segsize;
+ frags = 0;
+ }
+
+ /* Break if we really can't fit anymore. */
+ if ((frags + 1) >= seglimit) {
+ *plen = len_cp;
+ break;
+ }
+
+ /*
+ * Reduce size if you can't copy the whole
+ * mbuf. If we can't copy the whole mbuf, also
+ * adjust len so the loop will end after this
+ * mbuf.
+ */
+ if ((frags + howmany(mlen, fragsize)) >= seglimit) {
+ mlen = (seglimit - frags - 1) * fragsize;
+ len = mlen;
+ *plen = len_cp + len;
}
+ frags += howmany(mlen, fragsize);
+ if (frags == 0)
+ frags++;
+ seglimit -= frags;
+ KASSERT(seglimit > 0,
+ ("%s: seglimit went too low", __func__));
+ }
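+ /*
+ * Example of the segment accounting above, with hypothetical
+ * numbers: for an M_EXTPG mbuf with segsize = 1448 and
+ * PAGE_SIZE = 4096, fragsize = 1448 and frags starts at 3.
+ * Copying mlen = 4096 bytes adds howmany(4096, 1448) = 3 more,
+ * so seglimit is reduced by 6. A plain mbuf of the same size
+ * would only cost howmany(4096, 1448) = 3 segments.
+ */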
+ n = m_get(M_NOWAIT, m->m_type);
+ *np = n;
+ if (n == NULL)
+ goto nospace;
+ n->m_len = mlen;
+ soff += mlen;
+ len_cp += n->m_len;
+ if (m->m_flags & (M_EXT|M_EXTPG)) {
+ n->m_data = m->m_data + off;
+ mb_dupcl(n, m);
+ } else {
+ bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
+ (u_int)n->m_len);
+ }
+ len -= n->m_len;
+ off = 0;
+ m = m->m_next;
+ np = &n->m_next;
+ if (len || (soff == smb->m_len)) {
+ /*
+ * We have more so we move forward, or
+ * we have consumed the entire mbuf and
+ * len has fallen to 0.
+ */
+ soff = 0;
+ smb = m;
}
- tp->gput_seq = my_rsm->r_start;
-start_set:
- if (my_rsm->r_flags & RACK_ACKED) {
+
+ }
+ if (fsb != NULL) {
+ fsb->m = smb;
+ fsb->off = soff;
+ if (smb) {
+ /*
+ * Save off the size of the mbuf. We do
+ * this so that we can recognize when it
+ * has been trimmed by sbcut() as acks
+ * come in.
+ */
+ fsb->o_m_len = smb->m_len;
+ } else {
+ /*
+ * This is the case where the next mbuf went to NULL. This
+ * means with this copy we have sent everything in the sb.
+ * In theory we could clear the fast_output flag, but let's
+ * not, since it's possible that we could get more data added
+ * and acks that call the extend function which would let
+ * us send more.
+ */
+ fsb->o_m_len = 0;
+ }
+ }
+ return (top);
+nospace:
+ if (top)
+ m_freem(top);
+ return (NULL);
+
+}
+
+/*
+ * This is a copy of m_copym(), taking the TSO segment size/limit
+ * constraints into account, and advancing the sndptr as it goes.
+ */
+static struct mbuf *
+rack_fo_m_copym(struct tcp_rack *rack, int32_t *plen,
+ int32_t seglimit, int32_t segsize, struct mbuf **s_mb, int *s_soff)
+{
+ struct mbuf *m, *n;
+ int32_t soff;
+
+ soff = rack->r_ctl.fsb.off;
+ m = rack->r_ctl.fsb.m;
+ if (rack->r_ctl.fsb.o_m_len != m->m_len) {
+ /*
+ * The mbuf had the front of it chopped off by an ack;
+ * we need to adjust the soff/off by that difference.
+ */
+ uint32_t delta;
+
+ delta = rack->r_ctl.fsb.o_m_len - m->m_len;
+ soff -= delta;
+ }
+ KASSERT(soff >= 0, ("%s, negative off %d", __FUNCTION__, soff));
+ KASSERT(*plen >= 0, ("%s, negative len %d", __FUNCTION__, *plen));
+ KASSERT(soff < m->m_len, ("%s rack:%p len:%u m:%p m->m_len:%u < off?",
+ __FUNCTION__,
+ rack, *plen, m, m->m_len));
+ /* Save off the right location before we copy and advance */
+ *s_soff = soff;
+ *s_mb = rack->r_ctl.fsb.m;
+ n = rack_fo_base_copym(m, soff, plen,
+ &rack->r_ctl.fsb,
+ seglimit, segsize);
+ return (n);
+}
+
+static int
+rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm,
+ uint64_t ts_val, uint32_t cts, uint32_t ms_cts, struct timeval *tv, int len)
+{
+ /*
+ * Enter the fast retransmit path. We are given that a sched_pin is
+ * in place (if accounting is compiled in) and the cycle count taken
+ * at entry is in ts_val. The concept here is that the rsm
+ * now holds the mbuf offsets and such, so we can directly transmit
+ * without a lot of overhead; the len field is already set to
+ * prohibit us from sending too much (usually it's 1 MSS).
+ */
+ struct ip *ip = NULL;
+ struct udphdr *udp = NULL;
+ struct tcphdr *th = NULL;
+ struct mbuf *m = NULL;
+ struct inpcb *inp;
+ uint8_t *cpto;
+ struct tcp_log_buffer *lgb;
+#ifdef TCP_ACCOUNTING
+ uint64_t crtsc;
+ int cnt_thru = 1;
+#endif
+ int doing_tlp = 0;
+ struct tcpopt to;
+ u_char opt[TCP_MAXOLEN];
+ uint32_t hdrlen, optlen;
+ int32_t slot, segsiz, max_val, tso = 0, error, flags, ulen = 0;
+ uint32_t us_cts;
+ uint32_t if_hw_tsomaxsegcount = 0, startseq;
+ uint32_t if_hw_tsomaxsegsize;
+#ifdef INET6
+ struct ip6_hdr *ip6 = NULL;
+
+ if (rack->r_is_v6) {
+ ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
+ hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
+ } else
+#endif /* INET6 */
+ {
+ ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
+ hdrlen = sizeof(struct tcpiphdr);
+ }
+ if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) {
+ goto failed;
+ }
+ if (rsm->r_flags & RACK_TLP)
+ doing_tlp = 1;
+ startseq = rsm->r_start;
+ segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
+ inp = rack->rc_inp;
+ to.to_flags = 0;
+ flags = tcp_outflags[tp->t_state];
+ if (flags & (TH_SYN|TH_RST)) {
+ goto failed;
+ }
+ if (rsm->r_flags & RACK_HAS_FIN) {
+ /* We can't send a FIN here */
+ goto failed;
+ }
+ if (flags & TH_FIN) {
+ /* We never send a FIN */
+ flags &= ~TH_FIN;
+ }
+ if (tp->t_flags & TF_RCVD_TSTMP) {
+ to.to_tsval = ms_cts + tp->ts_offset;
+ to.to_tsecr = tp->ts_recent;
+ to.to_flags = TOF_TS;
+ }
+ optlen = tcp_addoptions(&to, opt);
+ hdrlen += optlen;
+ udp = rack->r_ctl.fsb.udp;
+ if (rack->r_ctl.rc_pace_max_segs)
+ max_val = rack->r_ctl.rc_pace_max_segs;
+ else if (rack->rc_user_set_max_segs)
+ max_val = rack->rc_user_set_max_segs * segsiz;
+ else
+ max_val = len;
+ if ((tp->t_flags & TF_TSO) &&
+ V_tcp_do_tso &&
+ (len > segsiz) &&
+ (tp->t_port == 0))
+ tso = 1;
+#ifdef INET6
+ if (MHLEN < hdrlen + max_linkhdr)
+ m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
+ else
+#endif
+ m = m_gethdr(M_NOWAIT, MT_DATA);
+ if (m == NULL)
+ goto failed;
+ m->m_data += max_linkhdr;
+ m->m_len = hdrlen;
+ th = rack->r_ctl.fsb.th;
+ /* Establish the len to send */
+ if (len > max_val)
+ len = max_val;
+ if ((tso) && (len + optlen > tp->t_maxseg)) {
+ uint32_t if_hw_tsomax;
+ int32_t max_len;
+
+ /* extract TSO information */
+ if_hw_tsomax = tp->t_tsomax;
+ if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
+ if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
+ /*
+ * Check if we should limit by maximum payload
+ * length:
+ */
+ if (if_hw_tsomax != 0) {
+ /* compute maximum TSO length */
+ max_len = (if_hw_tsomax - hdrlen -
+ max_linkhdr);
+ if (max_len <= 0) {
+ goto failed;
+ } else if (len > max_len) {
+ len = max_len;
+ }
+ }
+ if (len <= segsiz) {
+ /*
+ * In case there are too many small fragments don't
+ * use TSO:
+ */
+ tso = 0;
+ }
+ } else {
+ tso = 0;
+ }
+ if ((tso == 0) && (len > segsiz))
+ len = segsiz;
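+ /*
+ * Example of the TSO clamp above, with hypothetical numbers: for
+ * if_hw_tsomax = 65535, an IPv6 header plus TCP header with
+ * timestamps (hdrlen = 72) and a max_linkhdr of 16, max_len =
+ * 65535 - 72 - 16 = 65447, so a 90000 byte request is trimmed to
+ * 65447 bytes. If len ends up at or below one segment, TSO is
+ * simply turned off.
+ */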
+ us_cts = tcp_get_usecs(tv);
+ if ((len == 0) ||
+ (len <= MHLEN - hdrlen - max_linkhdr)) {
+ goto failed;
+ }
+ th->th_seq = htonl(rsm->r_start);
+ th->th_ack = htonl(tp->rcv_nxt);
+ if (rsm->r_flags & RACK_HAD_PUSH)
+ flags |= TH_PUSH;
+ th->th_flags = flags;
+ th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale));
+ if (th->th_win == 0) {
+ tp->t_sndzerowin++;
+ tp->t_flags |= TF_RXWIN0SENT;
+ } else
+ tp->t_flags &= ~TF_RXWIN0SENT;
+ if (rsm->r_flags & RACK_TLP) {
+ /*
+ * TLP should not count in retran count, but
+ * in its own bin
+ */
+ counter_u64_add(rack_tlp_retran, 1);
+ counter_u64_add(rack_tlp_retran_bytes, len);
+ } else {
+ tp->t_sndrexmitpack++;
+ KMOD_TCPSTAT_INC(tcps_sndrexmitpack);
+ KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len);
+ }
+#ifdef STATS
+ stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
+ len);
+#endif
+ if (rsm->m == NULL)
+ goto failed;
+ if (rsm->orig_m_len != rsm->m->m_len) {
+ /* Fix up the orig_m_len and possibly the mbuf offset */
+ rack_adjust_orig_mlen(rsm);
+ }
+ m->m_next = rack_fo_base_copym(rsm->m, rsm->soff, &len, NULL, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize);
+ if (len <= segsiz) {
+ /*
+ * Must have run out of mbufs for the copy;
+ * shorten it so we no longer need TSO. Lets
+ * not set sendalot since we are low on
+ * mbufs.
+ */
+ tso = 0;
+ }
+ if ((m->m_next == NULL) || (len <= 0)) {
+ goto failed;
+ }
+ if (udp) {
+ if (rack->r_is_v6)
+ ulen = hdrlen + len - sizeof(struct ip6_hdr);
+ else
+ ulen = hdrlen + len - sizeof(struct ip);
+ udp->uh_ulen = htons(ulen);
+ }
+ m->m_pkthdr.rcvif = (struct ifnet *)0;
+ m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
+#ifdef INET6
+ if (rack->r_is_v6) {
+ if (tp->t_port) {
+ m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
+ m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
+ udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
+ th->th_sum = htons(0);
+ UDPSTAT_INC(udps_opackets);
+ } else {
+ m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
+ m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
+ th->th_sum = in6_cksum_pseudo(ip6,
+ sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
+ 0);
+ }
+ }
+#endif
+#if defined(INET6) && defined(INET)
+ else
+#endif
+#ifdef INET
+ {
+ if (tp->t_port) {
+ m->m_pkthdr.csum_flags = CSUM_UDP;
+ m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
+ udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
+ ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
+ th->th_sum = htons(0);
+ UDPSTAT_INC(udps_opackets);
+ } else {
+ m->m_pkthdr.csum_flags = CSUM_TCP;
+ m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
+ th->th_sum = in_pseudo(ip->ip_src.s_addr,
+ ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
+ IPPROTO_TCP + len + optlen));
+ }
+ /* IP version must be set here for ipv4/ipv6 checking later */
+ KASSERT(ip->ip_v == IPVERSION,
+ ("%s: IP version incorrect: %d", __func__, ip->ip_v));
+ }
+#endif
+ if (tso) {
+ KASSERT(len > tp->t_maxseg - optlen,
+ ("%s: len <= tso_segsz tp:%p", __func__, tp));
+ m->m_pkthdr.csum_flags |= CSUM_TSO;
+ m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
+ }
+#ifdef INET6
+ if (rack->r_is_v6) {
+ ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit;
+ ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
+ if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
+ tp->t_flags2 |= TF2_PLPMTU_PMTUD;
+ else
+ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
+ }
+#endif
+#if defined(INET) && defined(INET6)
+ else
+#endif
+#ifdef INET
+ {
+ ip->ip_len = htons(m->m_pkthdr.len);
+ ip->ip_ttl = rack->r_ctl.fsb.hoplimit;
+ if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
+ tp->t_flags2 |= TF2_PLPMTU_PMTUD;
+ if (tp->t_port == 0 || len < V_tcp_minmss) {
+ ip->ip_off |= htons(IP_DF);
+ }
+ } else {
+ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
+ }
+ }
+#endif
+ /* Time to copy in our header */
+ cpto = mtod(m, uint8_t *);
+ memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len);
+ th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr));
+ if (optlen) {
+ bcopy(opt, th + 1, optlen);
+ th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
+ } else {
+ th->th_off = sizeof(struct tcphdr) >> 2;
+ }
+ if (tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
+ log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+ if (rack->rack_no_prr)
+ log.u_bbr.flex1 = 0;
+ else
+ log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
+ log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
+ log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
+ log.u_bbr.flex4 = max_val;
+ log.u_bbr.flex5 = 0;
+ /* Save off the early/late values */
+ log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
+ log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed;
+ log.u_bbr.bw_inuse = rack_get_bw(rack);
+ log.u_bbr.flex8 = 1;
+ log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL);
+ log.u_bbr.flex7 = 55;
+ log.u_bbr.pkts_out = tp->t_maxseg;
+ log.u_bbr.timeStamp = cts;
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use;
+ log.u_bbr.delivered = 0;
+ lgb = tcp_log_event_(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK,
+ len, &log, false, NULL, NULL, 0, tv);
+ } else
+ lgb = NULL;
+#ifdef INET6
+ if (rack->r_is_v6) {
+ error = ip6_output(m, NULL,
+ &inp->inp_route6,
+ 0, NULL, NULL, inp);
+ }
+#endif
+#if defined(INET) && defined(INET6)
+ else
+#endif
+#ifdef INET
+ {
+ error = ip_output(m, NULL,
+ &inp->inp_route,
+ 0, 0, inp);
+ }
+#endif
+ m = NULL;
+ if (lgb) {
+ lgb->tlb_errno = error;
+ lgb = NULL;
+ }
+ if (error) {
+ goto failed;
+ }
+ rack_log_output(tp, &to, len, rsm->r_start, flags, error, rack_to_usec_ts(tv),
+ rsm, RACK_SENT_FP, rsm->m, rsm->soff);
+ if (doing_tlp && (rack->fast_rsm_hack == 0)) {
+ rack->rc_tlp_in_progress = 1;
+ rack->r_ctl.rc_tlp_cnt_out++;
+ }
+ tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
+ rack->forced_ack = 0; /* If we send something zap the FA flag */
+ if (IN_FASTRECOVERY(tp->t_flags) && rsm)
+ rack->r_ctl.retran_during_recovery += len;
+ {
+ int idx;
+
+ idx = (len / segsiz) + 3;
+ if (idx >= TCP_MSS_ACCT_ATIMER)
+ counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
+ else
+ counter_u64_add(rack_out_size[idx], 1);
+ }
+ if (tp->t_rtttime == 0) {
+ tp->t_rtttime = ticks;
+ tp->t_rtseq = startseq;
+ KMOD_TCPSTAT_INC(tcps_segstimed);
+ }
+ counter_u64_add(rack_fto_rsm_send, 1);
+ if (error && (error == ENOBUFS)) {
+ slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);
+ if (rack->rc_enobuf < 0x7f)
+ rack->rc_enobuf++;
+ if (slot < (10 * HPTS_USEC_IN_MSEC))
+ slot = 10 * HPTS_USEC_IN_MSEC;
+ } else
+ slot = rack_get_pacing_delay(rack, tp, len, NULL, segsiz);
+ if ((slot == 0) ||
+ (rack->rc_always_pace == 0) ||
+ (rack->r_rr_config == 1)) {
+ /*
+ * We have no pacing set or we
+ * are using old-style rack or
+ * we are overridden to use the old 1ms pacing.
+ */
+ slot = rack->r_ctl.rc_min_to;
+ }
+ rack_start_hpts_timer(rack, tp, cts, slot, len, 0);
+ if (rack->r_must_retran) {
+ rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start);
+ if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) {
/*
- * This one has been acked use the arrival ack time
+ * We have retransmitted all we need.
*/
- tp->gput_ts = my_rsm->r_ack_arrival;
- rack->app_limited_needs_set = 0;
+ rack->r_must_retran = 0;
+ rack->r_ctl.rc_out_at_rto = 0;
}
- rack->r_ctl.rc_gp_output_ts = my_rsm->usec_orig_send;
- tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack);
- rack_log_pacing_delay_calc(rack,
- tp->gput_seq,
- tp->gput_ack,
- (uint64_t)my_rsm,
- tp->gput_ts,
- rack->r_ctl.rc_app_limited_cnt,
- 9,
- __LINE__, NULL);
- return;
}
+#ifdef TCP_ACCOUNTING
+ crtsc = get_cyclecount();
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru;
+ }
+ counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], cnt_thru);
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val);
+ }
+ counter_u64_add(tcp_proc_time[SND_OUT_DATA], (crtsc - ts_val));
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((len + segsiz - 1) / segsiz);
+ }
+ counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((len + segsiz - 1) / segsiz));
+ sched_unpin();
+#endif
+ return (0);
+failed:
+ if (m)
+ m_free(m);
+ return (-1);
+}
-use_latest:
+static void
+rack_sndbuf_autoscale(struct tcp_rack *rack)
+{
/*
- * We don't know how long we may have been
- * idle or if this is the first-send. Lets
- * setup the flag so we will trim off
- * the first ack'd data so we get a true
- * measurement.
+ * Automatic sizing of send socket buffer. Often the send buffer
+ * size is not optimally adjusted to the actual network conditions
+ * at hand (delay bandwidth product). Setting the buffer size too
+ * small limits throughput on links with high bandwidth and high
+ * delay (e.g. trans-continental/oceanic links). Setting the
+ * buffer size too big consumes too much real kernel memory,
+ * especially with many connections on busy servers.
+ *
+ * The criteria to step up the send buffer one notch are:
+ * 1. receive window of remote host is larger than send buffer
+ * (with a fudge factor of 5/4th);
+ * 2. send buffer is filled to 7/8th with data (so we actually
+ * have data to make use of it);
+ * 3. send buffer fill has not hit maximal automatic size;
+ * 4. our send window (slow start and congestion controlled) is
+ * larger than sent but unacknowledged data in send buffer.
+ *
+ * Note that the rack version moves things much faster since
+ * we want to avoid hitting cache lines in the rack_fast_output()
+ * path so this is called much less often and thus moves
+ * the SB forward by a percentage.
*/
- rack->app_limited_needs_set = 1;
- tp->gput_ack = startseq + rack_get_measure_window(tp, rack);
- /* Find this guy so we can pull the send time */
- fe.r_start = startseq;
- my_rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
- if (my_rsm) {
- rack->r_ctl.rc_gp_output_ts = my_rsm->usec_orig_send;
- if (my_rsm->r_flags & RACK_ACKED) {
+ struct socket *so;
+ struct tcpcb *tp;
+ uint32_t sendwin, scaleup;
+
+ tp = rack->rc_tp;
+ so = rack->rc_inp->inp_socket;
+ sendwin = min(rack->r_ctl.cwnd_to_use, tp->snd_wnd);
+ if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
+ if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
+ sbused(&so->so_snd) >=
+ (so->so_snd.sb_hiwat / 8 * 7) &&
+ sbused(&so->so_snd) < V_tcp_autosndbuf_max &&
+ sendwin >= (sbused(&so->so_snd) -
+ (tp->snd_nxt - tp->snd_una))) {
+ if (rack_autosndbuf_inc)
+ scaleup = (rack_autosndbuf_inc * so->so_snd.sb_hiwat) / 100;
+ else
+ scaleup = V_tcp_autosndbuf_inc;
+ if (scaleup < V_tcp_autosndbuf_inc)
+ scaleup = V_tcp_autosndbuf_inc;
+ scaleup += so->so_snd.sb_hiwat;
+ if (scaleup > V_tcp_autosndbuf_max)
+ scaleup = V_tcp_autosndbuf_max;
+ if (!sbreserve_locked(&so->so_snd, scaleup, so, curthread))
+ so->so_snd.sb_flags &= ~SB_AUTOSIZE;
+ }
+ }
+}
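+/*
+ * Worked example of the autoscaling criteria above, with hypothetical
+ * numbers: sb_hiwat = 65536, peer rwnd = 131072 (131072 / 4 * 5 =
+ * 163840 >= 65536), sbused = 60000 (>= 65536 / 8 * 7 = 57344) and a
+ * send window larger than the unsent backlog. With a hypothetical
+ * rack_autosndbuf_inc of 20 (percent), scaleup = 20 * 65536 / 100 =
+ * 13107, which is then added to sb_hiwat for a new target of 78643
+ * bytes, subject to the V_tcp_autosndbuf_max cap.
+ */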
+
+static int
+rack_fast_output(struct tcpcb *tp, struct tcp_rack *rack, uint64_t ts_val,
+ uint32_t cts, uint32_t ms_cts, struct timeval *tv, long tot_len, int *send_err)
+{
+ /*
+ * Enter to do fast output. We are given that the sched_pin is
+ * in place (if accounting is compiled in) and the cycle count taken
+ * at entry is in place in ts_val. The idea here is that
+ * we know how many more bytes need to be sent (presumably either
+ * during pacing or to fill the cwnd, and that was greater than
+ * the max-burst). We have how much to send and all the info we
+ * need to just send.
+ */
+ struct ip *ip = NULL;
+ struct udphdr *udp = NULL;
+ struct tcphdr *th = NULL;
+ struct mbuf *m, *s_mb;
+ struct inpcb *inp;
+ uint8_t *cpto;
+ struct tcp_log_buffer *lgb;
+#ifdef TCP_ACCOUNTING
+ uint64_t crtsc;
+#endif
+ struct tcpopt to;
+ u_char opt[TCP_MAXOLEN];
+ uint32_t hdrlen, optlen;
+ int cnt_thru = 1;
+ int32_t slot, segsiz, len, max_val, tso = 0, sb_offset, error, flags, ulen = 0;
+ uint32_t us_cts, s_soff;
+ uint32_t if_hw_tsomaxsegcount = 0, startseq;
+ uint32_t if_hw_tsomaxsegsize;
+ uint16_t add_flag = RACK_SENT_FP;
+#ifdef INET6
+ struct ip6_hdr *ip6 = NULL;
+
+ if (rack->r_is_v6) {
+ ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
+ hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
+ } else
+#endif /* INET6 */
+ {
+ ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
+ hdrlen = sizeof(struct tcpiphdr);
+ }
+ if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) {
+ m = NULL;
+ goto failed;
+ }
+ startseq = tp->snd_max;
+ segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
+ inp = rack->rc_inp;
+ len = rack->r_ctl.fsb.left_to_send;
+ to.to_flags = 0;
+ flags = rack->r_ctl.fsb.tcp_flags;
+ if (tp->t_flags & TF_RCVD_TSTMP) {
+ to.to_tsval = ms_cts + tp->ts_offset;
+ to.to_tsecr = tp->ts_recent;
+ to.to_flags = TOF_TS;
+ }
+ optlen = tcp_addoptions(&to, opt);
+ hdrlen += optlen;
+ udp = rack->r_ctl.fsb.udp;
+ if (rack->r_ctl.rc_pace_max_segs)
+ max_val = rack->r_ctl.rc_pace_max_segs;
+ else if (rack->rc_user_set_max_segs)
+ max_val = rack->rc_user_set_max_segs * segsiz;
+ else
+ max_val = len;
+ if ((tp->t_flags & TF_TSO) &&
+ V_tcp_do_tso &&
+ (len > segsiz) &&
+ (tp->t_port == 0))
+ tso = 1;
+again:
+#ifdef INET6
+ if (MHLEN < hdrlen + max_linkhdr)
+ m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
+ else
+#endif
+ m = m_gethdr(M_NOWAIT, MT_DATA);
+ if (m == NULL)
+ goto failed;
+ m->m_data += max_linkhdr;
+ m->m_len = hdrlen;
+ th = rack->r_ctl.fsb.th;
+ /* Establish the len to send */
+ if (len > max_val)
+ len = max_val;
+ if ((tso) && (len + optlen > tp->t_maxseg)) {
+ uint32_t if_hw_tsomax;
+ int32_t max_len;
+
+ /* extract TSO information */
+ if_hw_tsomax = tp->t_tsomax;
+ if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
+ if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
+ /*
+ * Check if we should limit by maximum payload
+ * length:
+ */
+ if (if_hw_tsomax != 0) {
+ /* compute maximum TSO length */
+ max_len = (if_hw_tsomax - hdrlen -
+ max_linkhdr);
+ if (max_len <= 0) {
+ goto failed;
+ } else if (len > max_len) {
+ len = max_len;
+ }
+ }
+ if (len <= segsiz) {
/*
- * Unlikely since its probably what was
- * just transmitted (but I am paranoid).
+ * In case there are too many small fragments don't
+ * use TSO:
*/
- tp->gput_ts = my_rsm->r_ack_arrival;
- rack->app_limited_needs_set = 0;
- }
- if (SEQ_LT(my_rsm->r_start, tp->gput_seq)) {
- /* This also is unlikely */
- tp->gput_seq = my_rsm->r_start;
+ tso = 0;
}
} else {
+ tso = 0;
+ }
+ if ((tso == 0) && (len > segsiz))
+ len = segsiz;
+ us_cts = tcp_get_usecs(tv);
+ if ((len == 0) ||
+ (len <= MHLEN - hdrlen - max_linkhdr)) {
+ goto failed;
+ }
+ sb_offset = tp->snd_max - tp->snd_una;
+ th->th_seq = htonl(tp->snd_max);
+ th->th_ack = htonl(tp->rcv_nxt);
+ th->th_flags = flags;
+ th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale));
+ if (th->th_win == 0) {
+ tp->t_sndzerowin++;
+ tp->t_flags |= TF_RXWIN0SENT;
+ } else
+ tp->t_flags &= ~TF_RXWIN0SENT;
+ tp->snd_up = tp->snd_una; /* drag it along, its deprecated */
+ KMOD_TCPSTAT_INC(tcps_sndpack);
+ KMOD_TCPSTAT_ADD(tcps_sndbyte, len);
+#ifdef STATS
+ stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
+ len);
+#endif
+ if (rack->r_ctl.fsb.m == NULL)
+ goto failed;
+
+ /* s_mb and s_soff are saved for rack_log_output */
+ m->m_next = rack_fo_m_copym(rack, &len, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, &s_mb, &s_soff);
+ if (len <= segsiz) {
/*
- * TSNH unless we have some send-map limit,
- * and even at that it should not be hitting
- * that limit (we should have stopped sending).
+ * Must have run out of mbufs for the copy;
+ * shorten it so we no longer need TSO. Lets
+ * not set sendalot since we are low on
+ * mbufs.
+ */
+ tso = 0;
+ }
+ if (rack->r_ctl.fsb.rfo_apply_push &&
+ (len == rack->r_ctl.fsb.left_to_send)) {
+ th->th_flags |= TH_PUSH;
+ add_flag |= RACK_HAD_PUSH;
+ }
+ if ((m->m_next == NULL) || (len <= 0)) {
+ goto failed;
+ }
+ if (udp) {
+ if (rack->r_is_v6)
+ ulen = hdrlen + len - sizeof(struct ip6_hdr);
+ else
+ ulen = hdrlen + len - sizeof(struct ip);
+ udp->uh_ulen = htons(ulen);
+ }
+ m->m_pkthdr.rcvif = (struct ifnet *)0;
+ if (tp->t_state == TCPS_ESTABLISHED &&
+ (tp->t_flags2 & TF2_ECN_PERMIT)) {
+ /*
+ * If the peer has ECN, mark data packets with ECN capable
+ * transmission (ECT). Ignore pure ack packets,
+ * retransmissions.
*/
- rack->r_ctl.rc_gp_output_ts = tcp_get_usecs(NULL);
+ if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max)) {
+#ifdef INET6
+ if (rack->r_is_v6)
+ ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
+ else
+#endif
+ ip->ip_tos |= IPTOS_ECN_ECT0;
+ KMOD_TCPSTAT_INC(tcps_ecn_ect0);
+ /*
+ * Reply with proper ECN notifications.
+ * Only set CWR on new data segments.
+ */
+ if (tp->t_flags2 & TF2_ECN_SND_CWR) {
+ flags |= TH_CWR;
+ tp->t_flags2 &= ~TF2_ECN_SND_CWR;
+ }
+ }
+ if (tp->t_flags2 & TF2_ECN_SND_ECE)
+ flags |= TH_ECE;
+ }
+ m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
+#ifdef INET6
+ if (rack->r_is_v6) {
+ if (tp->t_port) {
+ m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
+ m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
+ udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
+ th->th_sum = htons(0);
+ UDPSTAT_INC(udps_opackets);
+ } else {
+ m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
+ m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
+ th->th_sum = in6_cksum_pseudo(ip6,
+ sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
+ 0);
+ }
+ }
+#endif
+#if defined(INET6) && defined(INET)
+ else
+#endif
+#ifdef INET
+ {
+ if (tp->t_port) {
+ m->m_pkthdr.csum_flags = CSUM_UDP;
+ m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
+ udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
+ ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
+ th->th_sum = htons(0);
+ UDPSTAT_INC(udps_opackets);
+ } else {
+ m->m_pkthdr.csum_flags = CSUM_TCP;
+ m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
+ th->th_sum = in_pseudo(ip->ip_src.s_addr,
+ ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
+ IPPROTO_TCP + len + optlen));
+ }
+ /* IP version must be set here for ipv4/ipv6 checking later */
+ KASSERT(ip->ip_v == IPVERSION,
+ ("%s: IP version incorrect: %d", __func__, ip->ip_v));
+ }
+#endif
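+	/* If still doing TSO, mark the mbuf and record the segment size (MSS less options). */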
+ if (tso) {
+ KASSERT(len > tp->t_maxseg - optlen,
+ ("%s: len <= tso_segsz tp:%p", __func__, tp));
+ m->m_pkthdr.csum_flags |= CSUM_TSO;
+ m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
+ }
+#ifdef INET6
+ if (rack->r_is_v6) {
+ ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit;
+ ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
+ if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
+ tp->t_flags2 |= TF2_PLPMTU_PMTUD;
+ else
+ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
+ }
+#endif
+#if defined(INET) && defined(INET6)
+ else
+#endif
+#ifdef INET
+ {
+ ip->ip_len = htons(m->m_pkthdr.len);
+ ip->ip_ttl = rack->r_ctl.fsb.hoplimit;
+ if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
+ tp->t_flags2 |= TF2_PLPMTU_PMTUD;
+ if (tp->t_port == 0 || len < V_tcp_minmss) {
+ ip->ip_off |= htons(IP_DF);
+ }
+ } else {
+ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
+ }
+ }
+#endif
+ /* Time to copy in our header */
+ cpto = mtod(m, uint8_t *);
+ memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len);
+ th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr));
+ if (optlen) {
+ bcopy(opt, th + 1, optlen);
+ th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
+ } else {
+ th->th_off = sizeof(struct tcphdr) >> 2;
+ }
+ if (tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
+ log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+ if (rack->rack_no_prr)
+ log.u_bbr.flex1 = 0;
+ else
+ log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
+ log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
+ log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
+ log.u_bbr.flex4 = max_val;
+ log.u_bbr.flex5 = 0;
+ /* Save off the early/late values */
+ log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
+ log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed;
+ log.u_bbr.bw_inuse = rack_get_bw(rack);
+ log.u_bbr.flex8 = 0;
+ log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL);
+ log.u_bbr.flex7 = 44;
+ log.u_bbr.pkts_out = tp->t_maxseg;
+ log.u_bbr.timeStamp = cts;
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use;
+ log.u_bbr.delivered = 0;
+ lgb = tcp_log_event_(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK,
+ len, &log, false, NULL, NULL, 0, tv);
+ } else
+ lgb = NULL;
+#ifdef INET6
+ if (rack->r_is_v6) {
+ error = ip6_output(m, NULL,
+ &inp->inp_route6,
+ 0, NULL, NULL, inp);
+ }
+#endif
+#if defined(INET) && defined(INET6)
+ else
+#endif
+#ifdef INET
+ {
+ error = ip_output(m, NULL,
+ &inp->inp_route,
+ 0, 0, inp);
+ }
+#endif
+ if (lgb) {
+ lgb->tlb_errno = error;
+ lgb = NULL;
+ }
+ if (error) {
+ *send_err = error;
+ m = NULL;
+ goto failed;
+ }
+ rack_log_output(tp, &to, len, tp->snd_max, flags, error, rack_to_usec_ts(tv),
+ NULL, add_flag, s_mb, s_soff);
+ m = NULL;
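+	/* Nothing was outstanding before this send; note the time and restart progress tracking. */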
+ if (tp->snd_una == tp->snd_max) {
+ rack->r_ctl.rc_tlp_rxt_last_time = cts;
+ rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
+ tp->t_acktime = ticks;
}
- rack_log_pacing_delay_calc(rack,
- tp->gput_seq,
- tp->gput_ack,
- (uint64_t)my_rsm,
- tp->gput_ts,
- rack->r_ctl.rc_app_limited_cnt,
- 9, __LINE__, NULL);
-}
-
-static inline uint32_t
-rack_what_can_we_send(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cwnd_to_use,
- uint32_t avail, int32_t sb_offset)
-{
- uint32_t len;
- uint32_t sendwin;
+	rack->forced_ack = 0;	/* If we send something, zap the FA flag */
+ tot_len += len;
+ if ((tp->t_flags & TF_GPUTINPROG) == 0)
+ rack_start_gp_measurement(tp, rack, tp->snd_max, sb_offset);
+ tp->snd_max += len;
+ tp->snd_nxt = tp->snd_max;
+ {
+ int idx;
- if (tp->snd_wnd > cwnd_to_use)
- sendwin = cwnd_to_use;
+ idx = (len / segsiz) + 3;
+ if (idx >= TCP_MSS_ACCT_ATIMER)
+ counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
+ else
+ counter_u64_add(rack_out_size[idx], 1);
+ }
+ if (len <= rack->r_ctl.fsb.left_to_send)
+ rack->r_ctl.fsb.left_to_send -= len;
else
- sendwin = tp->snd_wnd;
- if (ctf_outstanding(tp) >= tp->snd_wnd) {
- /* We never want to go over our peers rcv-window */
- len = 0;
- } else {
- uint32_t flight;
-
- flight = ctf_flight_size(tp, rack->r_ctl.rc_sacked);
- if (flight >= sendwin) {
- /*
- * We have in flight what we are allowed by cwnd (if
- * it was rwnd blocking it would have hit above out
- * >= tp->snd_wnd).
- */
- return (0);
- }
- len = sendwin - flight;
- if ((len + ctf_outstanding(tp)) > tp->snd_wnd) {
- /* We would send too much (beyond the rwnd) */
- len = tp->snd_wnd - ctf_outstanding(tp);
- }
- if ((len + sb_offset) > avail) {
- /*
- * We don't have that much in the SB, how much is
- * there?
- */
- len = avail - sb_offset;
- }
+ rack->r_ctl.fsb.left_to_send = 0;
+ if (rack->r_ctl.fsb.left_to_send < segsiz) {
+ rack->r_fast_output = 0;
+ rack->r_ctl.fsb.left_to_send = 0;
+ /* At the end of fast_output scale up the sb */
+ SOCKBUF_LOCK(&rack->rc_inp->inp_socket->so_snd);
+ rack_sndbuf_autoscale(rack);
+ SOCKBUF_UNLOCK(&rack->rc_inp->inp_socket->so_snd);
+ }
+ if (tp->t_rtttime == 0) {
+ tp->t_rtttime = ticks;
+ tp->t_rtseq = startseq;
+ KMOD_TCPSTAT_INC(tcps_segstimed);
+ }
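+	/*
+	 * If a full segment remains to be sent, the window still allows it
+	 * and we are not doing TSO, loop back and build another packet
+	 * from the cached header template.
+	 */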
+ if ((rack->r_ctl.fsb.left_to_send >= segsiz) &&
+ (max_val > len) &&
+ (tso == 0)) {
+ max_val -= len;
+ len = segsiz;
+ th = rack->r_ctl.fsb.th;
+ cnt_thru++;
+ goto again;
}
- return (len);
+ tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
+ counter_u64_add(rack_fto_send, 1);
+ slot = rack_get_pacing_delay(rack, tp, tot_len, NULL, segsiz);
+ rack_start_hpts_timer(rack, tp, cts, slot, tot_len, 0);
+#ifdef TCP_ACCOUNTING
+ crtsc = get_cyclecount();
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru;
+ }
+ counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], cnt_thru);
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val);
+ }
+ counter_u64_add(tcp_proc_time[SND_OUT_DATA], (crtsc - ts_val));
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len + segsiz - 1) / segsiz);
+ }
+ counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((tot_len + segsiz - 1) / segsiz));
+ sched_unpin();
+#endif
+ return (0);
+failed:
+ if (m)
+ m_free(m);
+ rack->r_fast_output = 0;
+ return (-1);
}
static int
@@ -11876,15 +15892,17 @@
{
struct socket *so;
uint32_t recwin;
- uint32_t sb_offset;
+ uint32_t sb_offset, s_moff = 0;
int32_t len, flags, error = 0;
- struct mbuf *m;
+ struct mbuf *m, *s_mb = NULL;
struct mbuf *mb;
uint32_t if_hw_tsomaxsegcount = 0;
uint32_t if_hw_tsomaxsegsize;
int32_t segsiz, minseg;
long tot_len_this_send = 0;
+#ifdef INET
struct ip *ip = NULL;
+#endif
#ifdef TCPDEBUG
struct ipovly *ipov = NULL;
#endif
@@ -11910,17 +15928,22 @@
struct tcpopt to;
int32_t slot = 0;
int32_t sup_rack = 0;
- uint32_t cts, us_cts, delayed, early;
- uint8_t hpts_calling, new_data_tlp = 0, doing_tlp = 0;
- uint32_t cwnd_to_use;
- int32_t do_a_prefetch;
+ uint32_t cts, ms_cts, delayed, early;
+ uint16_t add_flag = RACK_SENT_SP;
+ uint8_t hpts_calling, doing_tlp = 0;
+ uint32_t cwnd_to_use, pace_max_seg;
+ int32_t do_a_prefetch = 0;
int32_t prefetch_rsm = 0;
- int32_t orig_len;
+ int32_t orig_len = 0;
struct timeval tv;
int32_t prefetch_so_done = 0;
- struct tcp_log_buffer *lgb = NULL;
+ struct tcp_log_buffer *lgb;
struct inpcb *inp;
struct sockbuf *sb;
+ uint64_t ts_val = 0;
+#ifdef TCP_ACCOUNTING
+ uint64_t crtsc;
+#endif
#ifdef INET6
struct ip6_hdr *ip6 = NULL;
int32_t isipv6;
@@ -11930,19 +15953,20 @@
/* setup and take the cache hits here */
rack = (struct tcp_rack *)tp->t_fb_ptr;
- inp = rack->rc_inp;
- so = inp->inp_socket;
- sb = &so->so_snd;
- kern_prefetch(sb, &do_a_prefetch);
- do_a_prefetch = 1;
- hpts_calling = inp->inp_hpts_calls;
- hw_tls = (so->so_snd.sb_flags & SB_TLS_IFNET) != 0;
-
+#ifdef TCP_ACCOUNTING
+ sched_pin();
+ ts_val = get_cyclecount();
+#endif
+ hpts_calling = rack->rc_inp->inp_hpts_calls;
NET_EPOCH_ASSERT();
- INP_WLOCK_ASSERT(inp);
+ INP_WLOCK_ASSERT(rack->rc_inp);
#ifdef TCP_OFFLOAD
- if (tp->t_flags & TF_TOE)
+ if (tp->t_flags & TF_TOE) {
+#ifdef TCP_ACCOUNTING
+ sched_unpin();
+#endif
return (tcp_offload_output(tp));
+ }
#endif
/*
* For TFO connections in SYN_RECEIVED, only allow the initial
@@ -11951,21 +15975,25 @@
if (IS_FASTOPEN(tp->t_flags) &&
(tp->t_state == TCPS_SYN_RECEIVED) &&
SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */
- (rack->r_ctl.rc_resend == NULL)) /* not a retransmit */
+ (rack->r_ctl.rc_resend == NULL)) { /* not a retransmit */
+#ifdef TCP_ACCOUNTING
+ sched_unpin();
+#endif
return (0);
+ }
#ifdef INET6
if (rack->r_state) {
/* Use the cache line loaded if possible */
isipv6 = rack->r_is_v6;
} else {
- isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
+ isipv6 = (rack->rc_inp->inp_vflag & INP_IPV6) != 0;
}
#endif
early = 0;
- us_cts = tcp_get_usecs(&tv);
- cts = tcp_tv_to_mssectick(&tv);
+ cts = tcp_get_usecs(&tv);
+ ms_cts = tcp_tv_to_mssectick(&tv);
if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) &&
- inp->inp_in_hpts) {
+ rack->rc_inp->inp_in_hpts) {
/*
* We are on the hpts for some timer but not hptsi output.
* Remove from the hpts unconditionally.
@@ -11974,37 +16002,64 @@
}
/* Are we pacing and late? */
if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
- TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) {
+ TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) {
/* We are delayed */
- delayed = us_cts - rack->r_ctl.rc_last_output_to;
+ delayed = cts - rack->r_ctl.rc_last_output_to;
} else {
delayed = 0;
}
- /* Do the timers, which may override the pacer */
+ /* Do the timers, which may override the pacer */
if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
if (rack_process_timers(tp, rack, cts, hpts_calling)) {
counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1);
+#ifdef TCP_ACCOUNTING
+ sched_unpin();
+#endif
return (0);
}
}
+ if (rack->rc_in_persist) {
+ if (rack->rc_inp->inp_in_hpts == 0) {
+ /* Timer is not running */
+ rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
+ }
+#ifdef TCP_ACCOUNTING
+ sched_unpin();
+#endif
+ return (0);
+ }
if ((rack->r_timer_override) ||
+ (rack->rc_ack_can_sendout_data) ||
(delayed) ||
(tp->t_state < TCPS_ESTABLISHED)) {
- if (tp->t_inpcb->inp_in_hpts)
- tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT);
- } else if (tp->t_inpcb->inp_in_hpts) {
+ rack->rc_ack_can_sendout_data = 0;
+ if (rack->rc_inp->inp_in_hpts)
+ tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
+ } else if (rack->rc_inp->inp_in_hpts) {
/*
* On the hpts you can't pass even if ACKNOW is on, we will
* when the hpts fires.
*/
+#ifdef TCP_ACCOUNTING
+ crtsc = get_cyclecount();
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_proc_time[SND_BLOCKED] += (crtsc - ts_val);
+ }
+ counter_u64_add(tcp_proc_time[SND_BLOCKED], (crtsc - ts_val));
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_cnt_counters[SND_BLOCKED]++;
+ }
+ counter_u64_add(tcp_cnt_counters[SND_BLOCKED], 1);
+ sched_unpin();
+#endif
counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1);
return (0);
}
- inp->inp_hpts_calls = 0;
+ rack->rc_inp->inp_hpts_calls = 0;
/* Finish out both pacing early and late accounting */
if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
- TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) {
- early = rack->r_ctl.rc_last_output_to - us_cts;
+ TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) {
+ early = rack->r_ctl.rc_last_output_to - cts;
} else
early = 0;
if (delayed) {
@@ -12018,6 +16073,26 @@
rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
rack->r_wanted_output = 0;
rack->r_timer_override = 0;
+ if ((tp->t_state != rack->r_state) &&
+ TCPS_HAVEESTABLISHED(tp->t_state)) {
+ rack_set_state(tp, rack);
+ }
+ if ((rack->r_fast_output) &&
+ (tp->rcv_numsacks == 0)) {
+ int ret;
+
+ error = 0;
+ ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error);
+ if (ret >= 0)
+ return(ret);
+ else if (error) {
+ inp = rack->rc_inp;
+ so = inp->inp_socket;
+ sb = &so->so_snd;
+ goto nomore;
+ }
+ }
+ inp = rack->rc_inp;
/*
* For TFO connections in SYN_SENT or SYN_RECEIVED,
* only allow the initial SYN or SYN|ACK and those sent
@@ -12029,6 +16104,8 @@
SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */
(tp->t_rxtshift == 0)) { /* not a retransmit */
cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd;
+ so = inp->inp_socket;
+ sb = &so->so_snd;
goto just_return_nolock;
}
/*
@@ -12051,21 +16128,23 @@
}
if ((tp->snd_una == tp->snd_max) &&
rack->r_ctl.rc_went_idle_time &&
- TSTMP_GT(us_cts, rack->r_ctl.rc_went_idle_time)) {
- idle = us_cts - rack->r_ctl.rc_went_idle_time;
+ TSTMP_GT(cts, rack->r_ctl.rc_went_idle_time)) {
+ idle = cts - rack->r_ctl.rc_went_idle_time;
if (idle > rack_min_probertt_hold) {
/* Count as a probe rtt */
if (rack->in_probe_rtt == 0) {
- rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
+ rack->r_ctl.rc_lower_rtt_us_cts = cts;
rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts;
rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts;
rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts;
} else {
- rack_exit_probertt(rack, us_cts);
+ rack_exit_probertt(rack, cts);
}
}
idle = 0;
}
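+	/* Lazily set up the fast-send block (cached packet header template) on first use. */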
+ if (rack_use_fsb && (rack->r_fsb_inited == 0))
+ rack_init_fsb_block(tp, rack);
again:
/*
* If we've recently taken a timeout, snd_max will be greater than
@@ -12073,45 +16152,18 @@
* resending already delivered data. Adjust snd_nxt accordingly.
*/
sendalot = 0;
- us_cts = tcp_get_usecs(&tv);
- cts = tcp_tv_to_mssectick(&tv);
+ cts = tcp_get_usecs(&tv);
+ ms_cts = tcp_tv_to_mssectick(&tv);
tso = 0;
mtu = 0;
segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
minseg = segsiz;
+ if (rack->r_ctl.rc_pace_max_segs == 0)
+ pace_max_seg = rack->rc_user_set_max_segs * segsiz;
+ else
+ pace_max_seg = rack->r_ctl.rc_pace_max_segs;
sb_offset = tp->snd_max - tp->snd_una;
cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd;
-#ifdef NETFLIX_SHARED_CWND
- if ((tp->t_flags2 & TF2_TCP_SCWND_ALLOWED) &&
- rack->rack_enable_scwnd) {
- /* We are doing cwnd sharing */
- if (rack->rc_gp_filled &&
- (rack->rack_attempted_scwnd == 0) &&
- (rack->r_ctl.rc_scw == NULL) &&
- tp->t_lib) {
- /* The pcbid is in, lets make an attempt */
- counter_u64_add(rack_try_scwnd, 1);
- rack->rack_attempted_scwnd = 1;
- rack->r_ctl.rc_scw = tcp_shared_cwnd_alloc(tp,
- &rack->r_ctl.rc_scw_index,
- segsiz);
- }
- if (rack->r_ctl.rc_scw &&
- (rack->rack_scwnd_is_idle == 1) &&
- (rack->rc_in_persist == 0) &&
- sbavail(sb)) {
- /* we are no longer out of data */
- tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
- rack->rack_scwnd_is_idle = 0;
- }
- if (rack->r_ctl.rc_scw) {
- /* First lets update and get the cwnd */
- rack->r_ctl.cwnd_to_use = cwnd_to_use = tcp_shared_cwnd_update(rack->r_ctl.rc_scw,
- rack->r_ctl.rc_scw_index,
- tp->snd_cwnd, tp->snd_wnd, segsiz);
- }
- }
-#endif
flags = tcp_outflags[tp->t_state];
while (rack->rc_free_cnt < rack_free_cache) {
rsm = rack_alloc(rack);
@@ -12119,6 +16171,8 @@
if (inp->inp_hpts_calls)
/* Retry in a ms */
slot = (1 * HPTS_USEC_IN_MSEC);
+ so = inp->inp_socket;
+ sb = &so->so_snd;
goto just_return_nolock;
}
TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext);
@@ -12131,7 +16185,9 @@
len = 0;
rsm = NULL;
if (flags & TH_RST) {
- SOCKBUF_LOCK(sb);
+ SOCKBUF_LOCK(&inp->inp_socket->so_snd);
+ so = inp->inp_socket;
+ sb = &so->so_snd;
goto send;
}
if (rack->r_ctl.rc_resend) {
@@ -12149,25 +16205,16 @@
sb_offset = rsm->r_start - tp->snd_una;
if (len >= segsiz)
len = segsiz;
- } else if ((rack->rc_in_persist == 0) &&
- ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) {
+ } else if ((rsm = tcp_rack_output(tp, rack, cts)) != NULL) {
/* We have a retransmit that takes precedence */
rsm->r_flags &= ~RACK_TLP;
- if ((!IN_RECOVERY(tp->t_flags)) &&
- ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) {
+ if ((!IN_FASTRECOVERY(tp->t_flags)) &&
+ ((tp->t_flags & TF_WASFRECOVERY) == 0)) {
/* Enter recovery if not induced by a time-out */
rack->r_ctl.rc_rsm_start = rsm->r_start;
rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
- rack_cong_signal(tp, NULL, CC_NDUPACK);
- /*
- * When we enter recovery we need to assure we send
- * one packet.
- */
- if (rack->rack_no_prr == 0) {
- rack->r_ctl.rc_prr_sndcnt = segsiz;
- rack_log_to_prr(rack, 13, 0);
- }
+ rack_cong_signal(tp, CC_NDUPACK, tp->snd_una);
}
#ifdef INVARIANTS
if (SEQ_LT(rsm->r_start, tp->snd_una)) {
@@ -12181,37 +16228,17 @@
__func__, __LINE__,
rsm->r_start, tp->snd_una, tp, rack, rsm));
sb_offset = rsm->r_start - tp->snd_una;
- /* Can we send it within the PRR boundary? */
- if (rack->rack_no_prr == 0) {
- if ((rack->use_rack_rr == 0) && (len > rack->r_ctl.rc_prr_sndcnt)) {
- /* It does not fit */
- if ((ctf_flight_size(tp, rack->r_ctl.rc_sacked) > len) &&
- (rack->r_ctl.rc_prr_sndcnt < segsiz)) {
- /*
- * prr is less than a segment, we
- * have more acks due in besides
- * what we need to resend. Lets not send
- * to avoid sending small pieces of
- * what we need to retransmit.
- */
- len = 0;
- goto just_return_nolock;
- }
- len = rack->r_ctl.rc_prr_sndcnt;
- }
- }
sendalot = 0;
if (len >= segsiz)
len = segsiz;
if (len > 0) {
- sub_from_prr = 1;
sack_rxmit = 1;
KMOD_TCPSTAT_INC(tcps_sack_rexmits);
KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes,
min(len, segsiz));
counter_u64_add(rack_rtm_prr_retran, 1);
}
- } else if (rack->r_ctl.rc_tlpsend) {
+ } else if (rack->r_ctl.rc_tlpsend) {
/* Tail loss probe */
long cwin;
long tlen;
@@ -12225,11 +16252,14 @@
*/
rsm = rack->r_ctl.rc_tlpsend;
rsm->r_flags |= RACK_TLP;
+
rack->r_ctl.rc_tlpsend = NULL;
sack_rxmit = 1;
tlen = rsm->r_end - rsm->r_start;
if (tlen > segsiz)
tlen = segsiz;
+ tp->t_sndtlppack++;
+ tp->t_sndtlpbyte += tlen;
KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
__func__, __LINE__,
@@ -12238,6 +16268,46 @@
cwin = min(tp->snd_wnd, tlen);
len = cwin;
}
+ if (rack->r_must_retran &&
+ (rsm == NULL)) {
+ /*
+		 * Non-SACK and we had an RTO or MTU change; we
+		 * need to retransmit until we reach
+		 * the former snd_max (rack->r_ctl.rc_snd_max_at_rto).
+ */
+ if (SEQ_GT(tp->snd_max, tp->snd_una)) {
+ int sendwin, flight;
+
+ sendwin = min(tp->snd_wnd, tp->snd_cwnd);
+ flight = ctf_flight_size(tp, rack->r_ctl.rc_out_at_rto);
+ if (flight >= sendwin) {
+ so = inp->inp_socket;
+ sb = &so->so_snd;
+ goto just_return_nolock;
+ }
+ rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
+ KASSERT(rsm != NULL, ("rsm is NULL rack:%p r_must_retran set", rack));
+ if (rsm == NULL) {
+				/* TSNH (this should not happen) */
+				rack->r_must_retran = 0;
+				rack->r_ctl.rc_out_at_rto = 0;
+ so = inp->inp_socket;
+ sb = &so->so_snd;
+ goto just_return_nolock;
+ }
+ sack_rxmit = 1;
+ len = rsm->r_end - rsm->r_start;
+ sendalot = 0;
+ sb_offset = rsm->r_start - tp->snd_una;
+ if (len >= segsiz)
+ len = segsiz;
+ } else {
+ /* We must be done if there is nothing outstanding */
+ rack->r_must_retran = 0;
+ rack->r_ctl.rc_out_at_rto = 0;
+ }
+ }
/*
* Enforce a connection sendmap count limit if set
 * as long as we are not retransmitting.
@@ -12251,6 +16321,8 @@
rack->alloc_limit_reported = 1;
counter_u64_add(rack_alloc_limited_conns, 1);
}
+ so = inp->inp_socket;
+ sb = &so->so_snd;
goto just_return_nolock;
}
if (rsm && (rsm->r_flags & RACK_HAS_FIN)) {
@@ -12267,6 +16339,50 @@
#ifdef INVARIANTS
/* For debugging */
rack->r_ctl.rc_rsm_at_retran = rsm;
+#endif
+ if (rsm && rack->r_fsb_inited && rack_use_rsm_rfo &&
+ ((rsm->r_flags & RACK_HAS_FIN) == 0)) {
+ int ret;
+
+ ret = rack_fast_rsm_output(tp, rack, rsm, ts_val, cts, ms_cts, &tv, len);
+ if (ret == 0)
+ return (0);
+ }
+ so = inp->inp_socket;
+ sb = &so->so_snd;
+ if (do_a_prefetch == 0) {
+ kern_prefetch(sb, &do_a_prefetch);
+ do_a_prefetch = 1;
+ }
+#ifdef NETFLIX_SHARED_CWND
+ if ((tp->t_flags2 & TF2_TCP_SCWND_ALLOWED) &&
+ rack->rack_enable_scwnd) {
+ /* We are doing cwnd sharing */
+ if (rack->gp_ready &&
+ (rack->rack_attempted_scwnd == 0) &&
+ (rack->r_ctl.rc_scw == NULL) &&
+ tp->t_lib) {
+			/* The pcbid is in, let's make an attempt */
+ counter_u64_add(rack_try_scwnd, 1);
+ rack->rack_attempted_scwnd = 1;
+ rack->r_ctl.rc_scw = tcp_shared_cwnd_alloc(tp,
+ &rack->r_ctl.rc_scw_index,
+ segsiz);
+ }
+ if (rack->r_ctl.rc_scw &&
+ (rack->rack_scwnd_is_idle == 1) &&
+ sbavail(&so->so_snd)) {
+ /* we are no longer out of data */
+ tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
+ rack->rack_scwnd_is_idle = 0;
+ }
+ if (rack->r_ctl.rc_scw) {
+ /* First lets update and get the cwnd */
+ rack->r_ctl.cwnd_to_use = cwnd_to_use = tcp_shared_cwnd_update(rack->r_ctl.rc_scw,
+ rack->r_ctl.rc_scw_index,
+ tp->snd_cwnd, tp->snd_wnd, segsiz);
+ }
+ }
#endif
/*
* Get standard flags, and add SYN or FIN if requested by 'hidden'
@@ -12314,15 +16430,20 @@
if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) {
rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset);
}
- if (rack->r_ctl.rc_tlp_new_data > tp->snd_wnd)
- len = tp->snd_wnd;
- else
+ if ((rack->r_ctl.rc_tlp_new_data + sb_offset) > tp->snd_wnd) {
+ if (tp->snd_wnd > sb_offset)
+ len = tp->snd_wnd - sb_offset;
+ else
+ len = 0;
+ } else {
len = rack->r_ctl.rc_tlp_new_data;
+ }
rack->r_ctl.rc_tlp_new_data = 0;
- new_data_tlp = doing_tlp = 1;
- } else
+ doing_tlp = 1;
+ } else {
len = rack_what_can_we_send(tp, rack, cwnd_to_use, avail, sb_offset);
- if (IN_FASTRECOVERY(tp->t_flags) && (len > segsiz)) {
+ }
+ if ((rack->r_ctl.crte == NULL) && IN_FASTRECOVERY(tp->t_flags) && (len > segsiz)) {
/*
* For prr=off, we need to send only 1 MSS
* at a time. We do this because another sack could
@@ -12334,12 +16455,14 @@
}
} else {
uint32_t outstanding;
-
/*
- * We are inside of a SACK recovery episode and are
- * sending new data, having retransmitted all the
- * data possible so far in the scoreboard.
+			 * We are inside a fast recovery episode; this
+			 * is caused by a SACK or 3 duplicate ACKs. At this point
+			 * we have sent all the retransmissions and we rely
+			 * on PRR to dictate what we will send in the form of
+			 * new data.
*/
+
outstanding = tp->snd_max - tp->snd_una;
if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) {
if (tp->snd_wnd > outstanding) {
@@ -12352,15 +16475,18 @@
else
len = 0;
}
- } else
+ } else {
len = 0;
- } else if (avail > sb_offset)
+ }
+ } else if (avail > sb_offset) {
len = avail - sb_offset;
- else
+ } else {
len = 0;
+ }
if (len > 0) {
- if (len > rack->r_ctl.rc_prr_sndcnt)
+ if (len > rack->r_ctl.rc_prr_sndcnt) {
len = rack->r_ctl.rc_prr_sndcnt;
+ }
if (len > 0) {
sub_from_prr = 1;
counter_u64_add(rack_rtm_prr_newdata, 1);
@@ -12374,8 +16500,9 @@
* likely the PRR algorithm is not going to
* let us send a lot as well :-)
*/
- if (rack->r_ctl.rc_prr_sendalot == 0)
+ if (rack->r_ctl.rc_prr_sendalot == 0) {
len = segsiz;
+ }
} else if (len < segsiz) {
/*
* Do we send any? The idea here is if the
@@ -12457,6 +16584,11 @@
tp->snd_nxt = tp->iss;
len = 0;
}
+ if ((len > segsiz) && (tcp_dsack_block_exists(tp))) {
+ /* We only send 1 MSS if we have a DSACK block */
+ add_flag |= RACK_SENT_W_DSACK;
+ len = segsiz;
+ }
orig_len = len;
if (len <= 0) {
/*
@@ -12479,18 +16611,17 @@
(TCPS_HAVEESTABLISHED(tp->t_state)) &&
(tp->snd_una == tp->snd_max) &&
(sb_offset < (int)sbavail(sb))) {
- tp->snd_nxt = tp->snd_una;
rack_enter_persist(tp, rack, cts);
}
} else if ((rsm == NULL) &&
- ((doing_tlp == 0) || (new_data_tlp == 1)) &&
- (len < rack->r_ctl.rc_pace_max_segs)) {
+ (doing_tlp == 0) &&
+ (len < pace_max_seg)) {
/*
* We are not sending a maximum sized segment for
* some reason. Should we not send anything (think
* sws or persists)?
*/
- if ((tp->snd_wnd < min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)), minseg)) &&
+ if ((tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg)) &&
(TCPS_HAVEESTABLISHED(tp->t_state)) &&
(len < minseg) &&
(len < (int)(sbavail(sb) - sb_offset))) {
@@ -12509,9 +16640,8 @@
* go into persists.
*/
rack_enter_persist(tp, rack, cts);
- tp->snd_nxt = tp->snd_una;
}
- } else if ((cwnd_to_use >= max(minseg, (segsiz * 4))) &&
+ } else if ((cwnd_to_use >= max(minseg, (segsiz * 4))) &&
(ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) &&
(len < (int)(sbavail(sb) - sb_offset)) &&
(len < minseg)) {
@@ -12540,11 +16670,41 @@
 * it's not a full pacing segment.
*/
len = 0;
+ } else if ((rack->r_ctl.crte != NULL) &&
+ (tp->snd_wnd >= (pace_max_seg * max(1, rack_hw_rwnd_factor))) &&
+ (cwnd_to_use >= (pace_max_seg + (4 * segsiz))) &&
+ (ctf_flight_size(tp, rack->r_ctl.rc_sacked) >= (2 * segsiz)) &&
+ (len < (int)(sbavail(sb) - sb_offset))) {
+ /*
+			 * Here we are doing hardware pacing, this is not a TLP,
+			 * we are not sending a pace max segment size, there is rwnd
+			 * room to send at least N pace_max_seg, the cwnd is greater
+			 * than or equal to a full pacing segment plus 4 MSS, we have 2 or
+			 * more segments in flight and it's not the tail of the socket buffer.
+			 *
+			 * We don't want to send; instead we need to get more ACKs in to
+			 * allow us to send a full pacing segment. Normally, if we are pacing
+			 * at about the right speed, we should have finished our pacing
+			 * send already, as most of the ACKs will have come back by the time
+			 * we are due to send again. This is a bit fuzzy since return path
+			 * delay can hold up the ACKs, which is why we want to make sure we
+			 * have cwnd space for a bit more than a max pacing segment in flight.
+			 *
+			 * If we have not gotten our ACKs back, we are pacing at too high a
+			 * rate; delaying will not hurt and will bring our GP estimate down by
+			 * injecting the delay. If we don't do this we will send
+			 * 2 MSS out in response to the ACKs being clocked in, which
+			 * defeats the point of hw-pacing (i.e. to help us get
+			 * larger TSOs out).
+ */
+ len = 0;
+
}
+
}
/* len will be >= 0 after this point. */
KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
- tcp_sndbuf_autoscale(tp, so, min(tp->snd_wnd, cwnd_to_use));
+ rack_sndbuf_autoscale(rack);
/*
* Decide if we can use TCP Segmentation Offloading (if supported by
* hardware).
@@ -12561,17 +16721,7 @@
* the right thing below to provide length of just ip options and thus
* checking for ipoptlen is enough to decide if ip options are present.
*/
-
-#ifdef INET6
- if (isipv6)
- ipoptlen = ip6_optlen(tp->t_inpcb);
- else
-#endif
- if (tp->t_inpcb->inp_options)
- ipoptlen = tp->t_inpcb->inp_options->m_len -
- offsetof(struct ipoption, ipopt_list);
- else
- ipoptlen = 0;
+ ipoptlen = 0;
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
/*
* Pre-calculate here as we save another lookup into the darknesses
@@ -12783,9 +16933,51 @@
if (tot_len_this_send > 0) {
/* Make sure snd_nxt is up to max */
+ rack->r_ctl.fsb.recwin = recwin;
+ slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz);
+ if ((error == 0) &&
+ rack_use_rfo &&
+ ((flags & (TH_SYN|TH_FIN)) == 0) &&
+ (ipoptlen == 0) &&
+ (tp->snd_nxt == tp->snd_max) &&
+ (tp->rcv_numsacks == 0) &&
+ rack->r_fsb_inited &&
+ TCPS_HAVEESTABLISHED(tp->t_state) &&
+ (rack->r_must_retran == 0) &&
+ ((tp->t_flags & TF_NEEDFIN) == 0) &&
+ (len > 0) && (orig_len > 0) &&
+ (orig_len > len) &&
+ ((orig_len - len) >= segsiz) &&
+ ((optlen == 0) ||
+ ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) {
+ /* We can send at least one more MSS using our fsb */
+
+ rack->r_fast_output = 1;
+ rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off);
+ rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len;
+ rack->r_ctl.fsb.tcp_flags = flags;
+ rack->r_ctl.fsb.left_to_send = orig_len - len;
+ KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))),
+ ("rack:%p left_to_send:%u sbavail:%u out:%u",
+ rack, rack->r_ctl.fsb.left_to_send, sbavail(sb),
+ (tp->snd_max - tp->snd_una)));
+ if (rack->r_ctl.fsb.left_to_send < segsiz)
+ rack->r_fast_output = 0;
+ else {
+ if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una)))
+ rack->r_ctl.fsb.rfo_apply_push = 1;
+ else
+ rack->r_ctl.fsb.rfo_apply_push = 0;
+ }
+ } else
+ rack->r_fast_output = 0;
+
+ rack_log_fsb(rack, tp, so, flags,
+ ipoptlen, orig_len, len, 0,
+ 1, optlen, __LINE__, 1);
if (SEQ_GT(tp->snd_max, tp->snd_nxt))
tp->snd_nxt = tp->snd_max;
- slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz);
} else {
int end_window = 0;
uint32_t seq = tp->gput_ack;
@@ -12808,9 +17000,13 @@
minseg)) >= tp->snd_wnd) {
/* We are limited by the rwnd */
app_limited = CTF_JR_RWND_LIMITED;
+ if (IN_FASTRECOVERY(tp->t_flags))
+ rack->r_ctl.rc_prr_sndcnt = 0;
} else if (ctf_outstanding(tp) >= sbavail(sb)) {
 /* We are limited by what's available -- app limited */
app_limited = CTF_JR_APP_LIMITED;
+ if (IN_FASTRECOVERY(tp->t_flags))
+ rack->r_ctl.rc_prr_sndcnt = 0;
} else if ((idle == 0) &&
((tp->t_flags & TF_NODELAY) == 0) &&
((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) &&
@@ -12833,10 +17029,7 @@
} else if ((ctf_outstanding(tp) + minseg) > cwnd_to_use) {
/* Its the cwnd */
app_limited = CTF_JR_CWND_LIMITED;
- } else if (rack->rc_in_persist == 1) {
- /* We are in persists */
- app_limited = CTF_JR_PERSISTS;
- } else if (IN_RECOVERY(tp->t_flags) &&
+ } else if (IN_FASTRECOVERY(tp->t_flags) &&
(rack->rack_no_prr == 0) &&
(rack->r_ctl.rc_prr_sndcnt < segsiz)) {
app_limited = CTF_JR_PRR;
@@ -12864,15 +17057,6 @@
* this case.
*/
end_window = 1;
- } else if (app_limited == CTF_JR_PERSISTS) {
- /*
- * We never end the measurement window
- * in persists, though in theory we
- * should be only entering after everything
- * is acknowledged (so we will probably
- * never come here).
- */
- end_window = 0;
} else if (rack_rwnd_block_ends_measure &&
(app_limited == CTF_JR_RWND_LIMITED)) {
/*
@@ -12937,8 +17121,7 @@
counter_u64_add(rack_unpaced_segments, 1);
}
/* Check if we need to go into persists or not */
- if ((rack->rc_in_persist == 0) &&
- (tp->snd_max == tp->snd_una) &&
+ if ((tp->snd_max == tp->snd_una) &&
TCPS_HAVEESTABLISHED(tp->t_state) &&
sbavail(sb) &&
(sbavail(sb) > tp->snd_wnd) &&
@@ -12955,10 +17138,42 @@
tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
rack->rack_scwnd_is_idle = 1;
}
+#endif
+#ifdef TCP_ACCOUNTING
+ if (tot_len_this_send > 0) {
+ crtsc = get_cyclecount();
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_cnt_counters[SND_OUT_DATA]++;
+ }
+ counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], 1);
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val);
+ }
+ counter_u64_add(tcp_proc_time[SND_OUT_DATA], (crtsc - ts_val));
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) / segsiz);
+ }
+ counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((tot_len_this_send + segsiz - 1) / segsiz));
+ } else {
+ crtsc = get_cyclecount();
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_cnt_counters[SND_LIMITED]++;
+ }
+ counter_u64_add(tcp_cnt_counters[SND_LIMITED], 1);
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_proc_time[SND_LIMITED] += (crtsc - ts_val);
+ }
+ counter_u64_add(tcp_proc_time[SND_LIMITED], (crtsc - ts_val));
+ }
+ sched_unpin();
#endif
return (0);
send:
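+	/* Count whether this send is a retransmission or a new (non fast-output) transmit. */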
+ if (rsm || sack_rxmit)
+ counter_u64_add(rack_nfto_resend, 1);
+ else
+ counter_u64_add(rack_non_fto_send, 1);
if ((flags & TH_FIN) &&
sbavail(sb)) {
/*
@@ -13053,7 +17268,7 @@
/* Timestamps. */
if ((tp->t_flags & TF_RCVD_TSTMP) ||
((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
- to.to_tsval = cts + tp->ts_offset;
+ to.to_tsval = ms_cts + tp->ts_offset;
to.to_tsecr = tp->ts_recent;
to.to_flags |= TOF_TS;
}
@@ -13062,13 +17277,15 @@
(so->so_rcv.sb_flags & SB_AUTOSIZE))
tp->rfbuf_ts = tcp_ts_getticks();
/* Selective ACK's. */
- if (flags & TH_SYN)
- to.to_flags |= TOF_SACKPERM;
- else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
- tp->rcv_numsacks > 0) {
- to.to_flags |= TOF_SACK;
- to.to_nsacks = tp->rcv_numsacks;
- to.to_sacks = (u_char *)tp->sackblks;
+ if (tp->t_flags & TF_SACK_PERMIT) {
+ if (flags & TH_SYN)
+ to.to_flags |= TOF_SACKPERM;
+ else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
+ tp->rcv_numsacks > 0) {
+ to.to_flags |= TOF_SACK;
+ to.to_nsacks = tp->rcv_numsacks;
+ to.to_sacks = (u_char *)tp->sackblks;
+ }
}
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
/* TCP-MD5 (RFC2385). */
@@ -13090,6 +17307,18 @@
if (V_tcp_udp_tunneling_port == 0) {
/* The port was removed?? */
SOCKBUF_UNLOCK(&so->so_snd);
+#ifdef TCP_ACCOUNTING
+ crtsc = get_cyclecount();
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_cnt_counters[SND_OUT_FAIL]++;
+ }
+ counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1);
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val);
+ }
+ counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val));
+ sched_unpin();
+#endif
return (EHOSTUNREACH);
}
hdrlen += sizeof(struct udphdr);
@@ -13154,7 +17383,7 @@
len -= moff;
}
}
- /*
+ /*
* In case there are too many small fragments don't
* use TSO:
*/
@@ -13224,6 +17453,7 @@
* and initialize the header from the template for sends on this
* connection.
*/
+ hw_tls = (sb->sb_flags & SB_TLS_IFNET) != 0;
if (len) {
uint32_t max_val;
uint32_t moff;
@@ -13262,6 +17492,8 @@
* sb_offset in the socket buffer chain.
*/
mb = sbsndptr_noadv(sb, sb_offset, &moff);
+ s_mb = mb;
+ s_moff = moff;
if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) {
m_copydata(mb, moff, (int)len,
mtod(m, caddr_t)+hdrlen);
@@ -13332,8 +17564,10 @@
*/
if (sb_offset + len == sbused(sb) &&
sbused(sb) &&
- !(flags & TH_SYN))
+ !(flags & TH_SYN)) {
flags |= TH_PUSH;
+ add_flag |= RACK_HAD_PUSH;
+ }
SOCKBUF_UNLOCK(sb);
} else {
@@ -13365,38 +17599,54 @@
#ifdef MAC
mac_inpcb_create_mbuf(inp, m);
#endif
+ if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) && rack->r_fsb_inited) {
#ifdef INET6
- if (isipv6) {
- ip6 = mtod(m, struct ip6_hdr *);
- if (tp->t_port) {
- udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr));
- udp->uh_sport = htons(V_tcp_udp_tunneling_port);
- udp->uh_dport = tp->t_port;
- ulen = hdrlen + len - sizeof(struct ip6_hdr);
+ if (isipv6)
+ ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
+ else
+#endif /* INET6 */
+ ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
+ th = rack->r_ctl.fsb.th;
+ udp = rack->r_ctl.fsb.udp;
+ if (udp) {
+ if (isipv6)
+ ulen = hdrlen + len - sizeof(struct ip6_hdr);
+ else
+ ulen = hdrlen + len - sizeof(struct ip);
udp->uh_ulen = htons(ulen);
- th = (struct tcphdr *)(udp + 1);
- } else {
- th = (struct tcphdr *)(ip6 + 1);
}
- tcpip_fillheaders(inp, tp->t_port, ip6, th);
- } else
+ } else {
+#ifdef INET6
+ if (isipv6) {
+ ip6 = mtod(m, struct ip6_hdr *);
+ if (tp->t_port) {
+ udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr));
+ udp->uh_sport = htons(V_tcp_udp_tunneling_port);
+ udp->uh_dport = tp->t_port;
+ ulen = hdrlen + len - sizeof(struct ip6_hdr);
+ udp->uh_ulen = htons(ulen);
+ th = (struct tcphdr *)(udp + 1);
+ } else
+ th = (struct tcphdr *)(ip6 + 1);
+ tcpip_fillheaders(inp, tp->t_port, ip6, th);
+ } else
#endif /* INET6 */
- {
- ip = mtod(m, struct ip *);
+ {
+ ip = mtod(m, struct ip *);
#ifdef TCPDEBUG
- ipov = (struct ipovly *)ip;
+ ipov = (struct ipovly *)ip;
#endif
- if (tp->t_port) {
- udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip));
- udp->uh_sport = htons(V_tcp_udp_tunneling_port);
- udp->uh_dport = tp->t_port;
- ulen = hdrlen + len - sizeof(struct ip);
- udp->uh_ulen = htons(ulen);
- th = (struct tcphdr *)(udp + 1);
- } else {
- th = (struct tcphdr *)(ip + 1);
+ if (tp->t_port) {
+ udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip));
+ udp->uh_sport = htons(V_tcp_udp_tunneling_port);
+ udp->uh_dport = tp->t_port;
+ ulen = hdrlen + len - sizeof(struct ip);
+ udp->uh_ulen = htons(ulen);
+ th = (struct tcphdr *)(udp + 1);
+ } else
+ th = (struct tcphdr *)(ip + 1);
+ tcpip_fillheaders(inp, tp->t_port, ip, th);
}
- tcpip_fillheaders(inp, tp->t_port, ip, th);
}
/*
* Fill in fields, remembering maximum advertised window for use in
@@ -13464,19 +17714,9 @@
* (retransmit and persist are mutually exclusive...)
*/
if (sack_rxmit == 0) {
- if (len || (flags & (TH_SYN | TH_FIN)) ||
- rack->rc_in_persist) {
+ if (len || (flags & (TH_SYN | TH_FIN))) {
th->th_seq = htonl(tp->snd_nxt);
rack_seq = tp->snd_nxt;
- } else if (flags & TH_RST) {
- /*
- * For a Reset send the last cum ack in sequence
- * (this like any other choice may still generate a
- * challenge ack, if a ack-update packet is in
- * flight).
- */
- th->th_seq = htonl(tp->snd_una);
- rack_seq = tp->snd_una;
} else {
th->th_seq = htonl(tp->snd_max);
rack_seq = tp->snd_max;
@@ -13486,10 +17726,6 @@
rack_seq = rsm->r_start;
}
th->th_ack = htonl(tp->rcv_nxt);
- if (optlen) {
- bcopy(opt, th + 1, optlen);
- th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
- }
th->th_flags = flags;
/*
* Calculate receive window. Don't shrink window, but avoid silly
@@ -13500,8 +17736,9 @@
recwin = 0;
} else {
if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
- recwin < (long)segsiz)
+ recwin < (long)segsiz) {
recwin = 0;
+ }
if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
@@ -13533,8 +17770,35 @@
tp->t_flags |= TF_RXWIN0SENT;
} else
tp->t_flags &= ~TF_RXWIN0SENT;
- tp->snd_up = tp->snd_una; /* drag it along, its deprecated */
+	tp->snd_up = tp->snd_una; /* drag it along, it's deprecated */
+ /* Now are we using fsb?, if so copy the template data to the mbuf */
+ if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) && rack->r_fsb_inited) {
+ uint8_t *cpto;
+ cpto = mtod(m, uint8_t *);
+ memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len);
+ /*
+ * We have just copied in:
+ * IP/IP6
+ * <optional udphdr>
+ * tcphdr (no options)
+ *
+ * We need to grab the correct pointers into the mbuf
+ * for both the tcp header, and possibly the udp header (if tunneling).
+ * We do this by using the offset in the copy buffer and adding it
+ * to the mbuf base pointer (cpto).
+ */
+#ifdef INET6
+ if (isipv6)
+ ip6 = mtod(m, struct ip6_hdr *);
+ else
+#endif /* INET6 */
+ ip = mtod(m, struct ip *);
+ th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr));
+		/* If we have a UDP header, let's point it into the mbuf as well */
+ if (udp)
+ udp = (struct udphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.udp - rack->r_ctl.fsb.tcp_ip_hdr));
+ }
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
if (to.to_flags & TOF_SIGNATURE) {
/*
@@ -13553,7 +17817,10 @@
}
}
#endif
-
+ if (optlen) {
+ bcopy(opt, th + 1, optlen);
+ th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
+ }
/*
* Put TCP length in extended header, and then checksum extended
* header and data.
@@ -13623,33 +17890,9 @@
/* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
hhook_run_tcp_est_out(tp, th, &to, len, tso);
#endif
-#ifdef TCPDEBUG
- /*
- * Trace.
- */
- if (so->so_options & SO_DEBUG) {
- u_short save = 0;
-
-#ifdef INET6
- if (!isipv6)
-#endif
- {
- save = ipov->ih_len;
- ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen +
- * (th->th_off << 2) */ );
- }
- tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
-#ifdef INET6
- if (!isipv6)
-#endif
- ipov->ih_len = save;
- }
-#endif /* TCPDEBUG */
-
/* We're getting ready to send; log now. */
if (tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
- struct timeval tv;
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
@@ -13679,8 +17922,10 @@
}
log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm);
log.u_bbr.flex7 = mark;
+ log.u_bbr.flex7 <<= 8;
+ log.u_bbr.flex7 |= pass;
log.u_bbr.pkts_out = tp->t_maxseg;
- log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.timeStamp = cts;
log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
log.u_bbr.lt_epoch = cwnd_to_use;
log.u_bbr.delivered = sendalot;
@@ -13706,7 +17951,7 @@
* desired default hop limit might be changed via Neighbor
* Discovery.
*/
- ip6->ip6_hlim = in6_selecthlim(inp, NULL);
+ rack->r_ctl.fsb.hoplimit = ip6->ip6_hlim = in6_selecthlim(inp, NULL);
/*
* Set the packet size here for the benefit of DTrace
@@ -13725,7 +17970,12 @@
TCP_PROBE5(send, NULL, tp, ip6, tp, th);
/* TODO: IPv6 IP6TOS_ECT bit on */
- error = ip6_output(m, inp->in6p_outputopts,
+ error = ip6_output(m,
+#if defined(IPSEC) || defined(IPSEC_SUPPORT)
+ inp->in6p_outputopts,
+#else
+ NULL,
+#endif
&inp->inp_route6,
((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0),
NULL, NULL, inp);
@@ -13744,6 +17994,7 @@
if (inp->inp_vflag & INP_IPV6PROTO)
ip->ip_ttl = in6_selecthlim(inp, NULL);
#endif /* INET6 */
+ rack->r_ctl.fsb.hoplimit = ip->ip_ttl;
/*
* If we do path MTU discovery, then we set DF on every
* packet. This might not be the best thing to do according
@@ -13768,7 +18019,13 @@
TCP_PROBE5(send, NULL, tp, ip, tp, th);
- error = ip_output(m, inp->inp_options, &inp->inp_route,
+ error = ip_output(m,
+#if defined(IPSEC) || defined(IPSEC_SUPPORT)
+ inp->inp_options,
+#else
+ NULL,
+#endif
+ &inp->inp_route,
((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0,
inp);
if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL)
@@ -13818,23 +18075,31 @@
counter_u64_add(rack_out_size[idx], 1);
}
}
- if (rack->rack_no_prr == 0) {
- if (sub_from_prr && (error == 0)) {
- if (rack->r_ctl.rc_prr_sndcnt >= len)
- rack->r_ctl.rc_prr_sndcnt -= len;
- else
- rack->r_ctl.rc_prr_sndcnt = 0;
- }
- }
+ if ((rack->rack_no_prr == 0) &&
+ sub_from_prr &&
+ (error == 0)) {
+ if (rack->r_ctl.rc_prr_sndcnt >= len)
+ rack->r_ctl.rc_prr_sndcnt -= len;
+ else
+ rack->r_ctl.rc_prr_sndcnt = 0;
+ }
sub_from_prr = 0;
- rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts,
- pass, rsm, us_cts);
+ if (doing_tlp && (rsm == NULL)) {
+ /* New send doing a TLP */
+ add_flag |= RACK_TLP;
+ tp->t_sndtlppack++;
+ tp->t_sndtlpbyte += len;
+ }
+ rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error,
+ rack_to_usec_ts(&tv),
+ rsm, add_flag, s_mb, s_moff);
+
if ((error == 0) &&
(len > 0) &&
(tp->snd_una == tp->snd_max))
rack->r_ctl.rc_tlp_rxt_last_time = cts;
- /* Now are we in persists? */
- if (rack->rc_in_persist == 0) {
+ {
tcp_seq startseq = tp->snd_nxt;
/* Track our lost count */
@@ -13903,33 +18168,26 @@
((tp->t_flags & TF_GPUTINPROG) == 0))
rack_start_gp_measurement(tp, rack, startseq, sb_offset);
}
- } else {
/*
- * Persist case, update snd_max but since we are in persist
- * mode (no window) we do not update snd_nxt.
+	 * If we are doing FO we need to update the mbuf position and
+	 * subtract what we just sent from what is left to send.
+	 *
+	 * XXXRRS: A DSACK is sent when the peer gives us duplicate
+	 * information; this brings to mind a question: when we send a
+	 * DSACK block, is TSO turned off? If not, then we are going to echo
+	 * multiple DSACK blocks out (with the TSO), which we should not be doing.
*/
- int32_t xlen = len;
-
- if (error)
- goto nomore;
-
- if (flags & TH_SYN)
- ++xlen;
- if (flags & TH_FIN) {
- ++xlen;
- tp->t_flags |= TF_SENTFIN;
- }
- /* In the ENOBUFS case we do *not* update snd_max */
- if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) {
- if (tp->snd_una == tp->snd_max) {
- /*
- * Update the time we just added data since
- * none was outstanding.
- */
- rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
- tp->t_acktime = ticks;
+ if (rack->r_fast_output && len) {
+ if (rack->r_ctl.fsb.left_to_send > len)
+ rack->r_ctl.fsb.left_to_send -= len;
+ else
+ rack->r_ctl.fsb.left_to_send = 0;
+ if (rack->r_ctl.fsb.left_to_send < segsiz)
+ rack->r_fast_output = 0;
+ if (rack->r_fast_output) {
+ rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off);
+ rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len;
}
- tp->snd_max = tp->snd_nxt + len;
}
}
nomore:
@@ -13952,24 +18210,34 @@
switch (error) {
case EPERM:
tp->t_softerror = error;
+#ifdef TCP_ACCOUNTING
+ crtsc = get_cyclecount();
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_cnt_counters[SND_OUT_FAIL]++;
+ }
+ counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1);
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val);
+ }
+ counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val));
+ sched_unpin();
+#endif
return (error);
case ENOBUFS:
- if (slot == 0) {
- /*
- * Pace us right away to retry in a some
- * time
- */
- slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);
- if (rack->rc_enobuf < 126)
- rack->rc_enobuf++;
- if (slot > ((rack->rc_rack_rtt / 2) * HPTS_USEC_IN_MSEC)) {
- slot = (rack->rc_rack_rtt / 2) * HPTS_USEC_IN_MSEC;
- }
- if (slot < (10 * HPTS_USEC_IN_MSEC))
- slot = 10 * HPTS_USEC_IN_MSEC;
+ /*
+			 * Pace us right away to retry in some
+			 * time.
+ */
+ slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);
+ if (rack->rc_enobuf < 0x7f)
+ rack->rc_enobuf++;
+ if (slot < (10 * HPTS_USEC_IN_MSEC))
+ slot = 10 * HPTS_USEC_IN_MSEC;
+ if (rack->r_ctl.crte != NULL) {
+ counter_u64_add(rack_saw_enobuf_hw, 1);
+ tcp_rl_log_enobuf(rack->r_ctl.crte);
}
counter_u64_add(rack_saw_enobuf, 1);
- error = 0;
goto enobufs;
case EMSGSIZE:
/*
@@ -13988,6 +18256,18 @@
}
slot = 10 * HPTS_USEC_IN_MSEC;
rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
+#ifdef TCP_ACCOUNTING
+ crtsc = get_cyclecount();
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_cnt_counters[SND_OUT_FAIL]++;
+ }
+ counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1);
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val);
+ }
+ counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val));
+ sched_unpin();
+#endif
return (error);
case ENETUNREACH:
counter_u64_add(rack_saw_enetunreach, 1);
@@ -14001,10 +18281,24 @@
default:
slot = 10 * HPTS_USEC_IN_MSEC;
rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
+#ifdef TCP_ACCOUNTING
+ crtsc = get_cyclecount();
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_cnt_counters[SND_OUT_FAIL]++;
+ }
+ counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1);
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val);
+ }
+ counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val));
+ sched_unpin();
+#endif
return (error);
}
} else {
rack->rc_enobuf = 0;
+ if (IN_FASTRECOVERY(tp->t_flags) && rsm)
+ rack->r_ctl.retran_during_recovery += len;
}
KMOD_TCPSTAT_INC(tcps_sndtotal);
@@ -14015,12 +18309,10 @@
*/
if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
tp->rcv_adv = tp->rcv_nxt + recwin;
+
tp->last_ack_sent = tp->rcv_nxt;
tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
enobufs:
- /* Assure when we leave that snd_nxt will point to top */
- if (SEQ_GT(tp->snd_max, tp->snd_nxt))
- tp->snd_nxt = tp->snd_max;
if (sendalot) {
/* Do we need to turn off sendalot? */
if (rack->r_ctl.rc_pace_max_segs &&
@@ -14053,6 +18345,7 @@
slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz);
}
if (rsm &&
+ (rsm->r_flags & RACK_HAS_SYN) == 0 &&
rack->use_rack_rr) {
/* Its a retransmit and we use the rack cheat? */
if ((slot == 0) ||
@@ -14063,21 +18356,172 @@
* are using old-style rack or
 * we are overridden to use the old 1ms pacing.
*/
- slot = rack->r_ctl.rc_min_to * HPTS_USEC_IN_MSEC;
+ slot = rack->r_ctl.rc_min_to;
}
}
+	/* We have sent, clear the flag */
+ rack->r_ent_rec_ns = 0;
+ if (rack->r_must_retran) {
+ if (rsm) {
+ rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start);
+ if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) {
+ /*
+ * We have retransmitted all.
+ */
+ rack->r_must_retran = 0;
+ rack->r_ctl.rc_out_at_rto = 0;
+ }
+ } else if (SEQ_GEQ(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) {
+ /*
+ * Sending new data will also kill
+ * the loop.
+ */
+ rack->r_must_retran = 0;
+ rack->r_ctl.rc_out_at_rto = 0;
+ }
+ }
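+	/* Stash the advertised receive window for use by the fast-send path. */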
+ rack->r_ctl.fsb.recwin = recwin;
+ if ((tp->t_flags & (TF_WASCRECOVERY|TF_WASFRECOVERY)) &&
+ SEQ_GT(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) {
+ /*
+		 * We hit an RTO and have now passed snd_max at the RTO;
+		 * clear all the WAS flags.
+ */
+ tp->t_flags &= ~(TF_WASCRECOVERY|TF_WASFRECOVERY);
+ }
if (slot) {
/* set the rack tcb into the slot N */
counter_u64_add(rack_paced_segments, 1);
+ if ((error == 0) &&
+ rack_use_rfo &&
+ ((flags & (TH_SYN|TH_FIN)) == 0) &&
+ (rsm == NULL) &&
+ (tp->snd_nxt == tp->snd_max) &&
+ (ipoptlen == 0) &&
+ (tp->rcv_numsacks == 0) &&
+ rack->r_fsb_inited &&
+ TCPS_HAVEESTABLISHED(tp->t_state) &&
+ (rack->r_must_retran == 0) &&
+ ((tp->t_flags & TF_NEEDFIN) == 0) &&
+ (len > 0) && (orig_len > 0) &&
+ (orig_len > len) &&
+ ((orig_len - len) >= segsiz) &&
+ ((optlen == 0) ||
+ ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) {
+ /* We can send at least one more MSS using our fsb */
+
+ rack->r_fast_output = 1;
+ rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off);
+ rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len;
+ rack->r_ctl.fsb.tcp_flags = flags;
+ rack->r_ctl.fsb.left_to_send = orig_len - len;
+ KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))),
+ ("rack:%p left_to_send:%u sbavail:%u out:%u",
+ rack, rack->r_ctl.fsb.left_to_send, sbavail(sb),
+ (tp->snd_max - tp->snd_una)));
+ if (rack->r_ctl.fsb.left_to_send < segsiz)
+ rack->r_fast_output = 0;
+ else {
+ if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una)))
+ rack->r_ctl.fsb.rfo_apply_push = 1;
+ else
+ rack->r_ctl.fsb.rfo_apply_push = 0;
+ }
+ } else
+ rack->r_fast_output = 0;
+ rack_log_fsb(rack, tp, so, flags,
+ ipoptlen, orig_len, len, error,
+ (rsm == NULL), optlen, __LINE__, 2);
} else if (sendalot) {
+ int ret;
+
if (len)
counter_u64_add(rack_unpaced_segments, 1);
sack_rxmit = 0;
+ if ((error == 0) &&
+ rack_use_rfo &&
+ ((flags & (TH_SYN|TH_FIN)) == 0) &&
+ (rsm == NULL) &&
+ (ipoptlen == 0) &&
+ (tp->rcv_numsacks == 0) &&
+ (tp->snd_nxt == tp->snd_max) &&
+ (rack->r_must_retran == 0) &&
+ rack->r_fsb_inited &&
+ TCPS_HAVEESTABLISHED(tp->t_state) &&
+ ((tp->t_flags & TF_NEEDFIN) == 0) &&
+ (len > 0) && (orig_len > 0) &&
+ (orig_len > len) &&
+ ((orig_len - len) >= segsiz) &&
+ ((optlen == 0) ||
+ ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) {
+ /* we can use fast_output for more */
+
+ rack->r_fast_output = 1;
+ rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off);
+ rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len;
+ rack->r_ctl.fsb.tcp_flags = flags;
+ rack->r_ctl.fsb.left_to_send = orig_len - len;
+ KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))),
+ ("rack:%p left_to_send:%u sbavail:%u out:%u",
+ rack, rack->r_ctl.fsb.left_to_send, sbavail(sb),
+ (tp->snd_max - tp->snd_una)));
+ if (rack->r_ctl.fsb.left_to_send < segsiz) {
+ rack->r_fast_output = 0;
+ }
+ if (rack->r_fast_output) {
+ if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una)))
+ rack->r_ctl.fsb.rfo_apply_push = 1;
+ else
+ rack->r_ctl.fsb.rfo_apply_push = 0;
+ rack_log_fsb(rack, tp, so, flags,
+ ipoptlen, orig_len, len, error,
+ (rsm == NULL), optlen, __LINE__, 3);
+ error = 0;
+ ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error);
+ if (ret >= 0)
+ return (ret);
+ else if (error)
+ goto nomore;
+
+ }
+ }
goto again;
} else if (len) {
counter_u64_add(rack_unpaced_segments, 1);
}
+ /* Assure when we leave that snd_nxt will point to top */
+ if (SEQ_GT(tp->snd_max, tp->snd_nxt))
+ tp->snd_nxt = tp->snd_max;
rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0);
+#ifdef TCP_ACCOUNTING
+ crtsc = get_cyclecount() - ts_val;
+ if (tot_len_this_send) {
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_cnt_counters[SND_OUT_DATA]++;
+ }
+ counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], 1);
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_proc_time[SND_OUT_DATA] += crtsc;
+ }
+ counter_u64_add(tcp_proc_time[SND_OUT_DATA], crtsc);
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) /segsiz);
+ }
+ counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((tot_len_this_send + segsiz - 1) /segsiz));
+ } else {
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_cnt_counters[SND_OUT_ACK]++;
+ }
+ counter_u64_add(tcp_cnt_counters[SND_OUT_ACK], 1);
+ if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+ tp->tcp_proc_time[SND_OUT_ACK] += crtsc;
+ }
+ counter_u64_add(tcp_proc_time[SND_OUT_ACK], crtsc);
+ }
+ sched_unpin();
+#endif
+ if (error == ENOBUFS)
+ error = 0;
return (error);
}
@@ -14087,133 +18531,380 @@
uint32_t orig_val;
orig_val = rack->r_ctl.rc_pace_max_segs;
- rack_set_pace_segments(rack->rc_tp, rack, __LINE__);
+ rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
if (orig_val != rack->r_ctl.rc_pace_max_segs)
rack_log_pacing_delay_calc(rack, 0, 0, orig_val, 0, 0, 15, __LINE__, NULL);
}
-/*
- * rack_ctloutput() must drop the inpcb lock before performing copyin on
- * socket option arguments. When it re-acquires the lock after the copy, it
- * has to revalidate that the connection is still valid for the socket
- * option.
- */
+static void
+rack_mtu_change(struct tcpcb *tp)
+{
+ /*
+ * The MSS may have changed
+ */
+ struct tcp_rack *rack;
+
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+ if (rack->r_ctl.rc_pace_min_segs != ctf_fixed_maxseg(tp)) {
+ /*
+		 * The MTU has changed; we need to resend everything
+		 * since all we have sent is lost. We first fix
+		 * up the MTU though.
+ */
+ rack_set_pace_segments(tp, rack, __LINE__, NULL);
+ /* We treat this like a full retransmit timeout without the cwnd adjustment */
+ rack_remxt_tmr(tp);
+ rack->r_fast_output = 0;
+ rack->r_ctl.rc_out_at_rto = ctf_flight_size(tp,
+ rack->r_ctl.rc_sacked);
+ rack->r_ctl.rc_snd_max_at_rto = tp->snd_max;
+ rack->r_must_retran = 1;
+
+ }
+ sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
+ /* We don't use snd_nxt to retransmit */
+ tp->snd_nxt = tp->snd_max;
+}
+
static int
-rack_set_sockopt(struct socket *so, struct sockopt *sopt,
- struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
+rack_set_profile(struct tcp_rack *rack, int prof)
+{
+ int err = EINVAL;
+ if (prof == 1) {
+ /* pace_always=1 */
+ if (rack->rc_always_pace == 0) {
+ if (tcp_can_enable_pacing() == 0)
+ return (EBUSY);
+ }
+ rack->rc_always_pace = 1;
+ if (rack->use_fixed_rate || rack->gp_ready)
+ rack_set_cc_pacing(rack);
+ rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
+ rack->rack_attempt_hdwr_pace = 0;
+ /* cmpack=1 */
+ if (rack_use_cmp_acks)
+ rack->r_use_cmp_ack = 1;
+ if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state) &&
+ rack->r_use_cmp_ack)
+ rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
+ /* scwnd=1 */
+ rack->rack_enable_scwnd = 1;
+ /* dynamic=100 */
+ rack->rc_gp_dyn_mul = 1;
+ /* gp_inc_ca */
+ rack->r_ctl.rack_per_of_gp_ca = 100;
+ /* rrr_conf=3 */
+ rack->r_rr_config = 3;
+ /* npush=2 */
+ rack->r_ctl.rc_no_push_at_mrtt = 2;
+ /* fillcw=1 */
+ rack->rc_pace_to_cwnd = 1;
+ rack->rc_pace_fill_if_rttin_range = 0;
+ rack->rtt_limit_mul = 0;
+ /* noprr=1 */
+ rack->rack_no_prr = 1;
+ /* lscwnd=1 */
+ rack->r_limit_scw = 1;
+ /* gp_inc_rec */
+ rack->r_ctl.rack_per_of_gp_rec = 90;
+ err = 0;
+
+ } else if (prof == 3) {
+	} else if (prof == 3) {
+		/* Same as profile one except fill_cw becomes 2 (less aggressive set) */
+ /* pace_always=1 */
+ if (rack->rc_always_pace == 0) {
+ if (tcp_can_enable_pacing() == 0)
+ return (EBUSY);
+ }
+ rack->rc_always_pace = 1;
+ if (rack->use_fixed_rate || rack->gp_ready)
+ rack_set_cc_pacing(rack);
+ rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
+ rack->rack_attempt_hdwr_pace = 0;
+ /* cmpack=1 */
+ if (rack_use_cmp_acks)
+ rack->r_use_cmp_ack = 1;
+ if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state) &&
+ rack->r_use_cmp_ack)
+ rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
+ /* scwnd=1 */
+ rack->rack_enable_scwnd = 1;
+ /* dynamic=100 */
+ rack->rc_gp_dyn_mul = 1;
+ /* gp_inc_ca */
+ rack->r_ctl.rack_per_of_gp_ca = 100;
+ /* rrr_conf=3 */
+ rack->r_rr_config = 3;
+ /* npush=2 */
+ rack->r_ctl.rc_no_push_at_mrtt = 2;
+ /* fillcw=2 */
+ rack->rc_pace_to_cwnd = 1;
+ rack->r_fill_less_agg = 1;
+ rack->rc_pace_fill_if_rttin_range = 0;
+ rack->rtt_limit_mul = 0;
+ /* noprr=1 */
+ rack->rack_no_prr = 1;
+ /* lscwnd=1 */
+ rack->r_limit_scw = 1;
+ /* gp_inc_rec */
+ rack->r_ctl.rack_per_of_gp_rec = 90;
+ err = 0;
+
+ } else if (prof == 2) {
+ /* cmpack=1 */
+ if (rack->rc_always_pace == 0) {
+ if (tcp_can_enable_pacing() == 0)
+ return (EBUSY);
+ }
+ rack->rc_always_pace = 1;
+ if (rack->use_fixed_rate || rack->gp_ready)
+ rack_set_cc_pacing(rack);
+ rack->r_use_cmp_ack = 1;
+ if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state))
+ rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
+ /* pace_always=1 */
+ rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
+ /* scwnd=1 */
+ rack->rack_enable_scwnd = 1;
+ /* dynamic=100 */
+ rack->rc_gp_dyn_mul = 1;
+ rack->r_ctl.rack_per_of_gp_ca = 100;
+ /* rrr_conf=3 */
+ rack->r_rr_config = 3;
+ /* npush=2 */
+ rack->r_ctl.rc_no_push_at_mrtt = 2;
+ /* fillcw=1 */
+ rack->rc_pace_to_cwnd = 1;
+ rack->rc_pace_fill_if_rttin_range = 0;
+ rack->rtt_limit_mul = 0;
+ /* noprr=1 */
+ rack->rack_no_prr = 1;
+ /* lscwnd=0 */
+ rack->r_limit_scw = 0;
+ err = 0;
+ } else if (prof == 0) {
+ /* This changes things back to the default settings */
+ err = 0;
+ if (rack->rc_always_pace) {
+ tcp_decrement_paced_conn();
+ rack_undo_cc_pacing(rack);
+ rack->rc_always_pace = 0;
+ }
+ if (rack_pace_every_seg && tcp_can_enable_pacing()) {
+ rack->rc_always_pace = 1;
+ if (rack->use_fixed_rate || rack->gp_ready)
+ rack_set_cc_pacing(rack);
+ } else
+ rack->rc_always_pace = 0;
+ if (rack_use_cmp_acks)
+ rack->r_use_cmp_ack = 1;
+ else
+ rack->r_use_cmp_ack = 0;
+ if (rack_disable_prr)
+ rack->rack_no_prr = 1;
+ else
+ rack->rack_no_prr = 0;
+ if (rack_gp_no_rec_chg)
+ rack->rc_gp_no_rec_chg = 1;
+ else
+ rack->rc_gp_no_rec_chg = 0;
+ if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack) {
+ rack->r_mbuf_queue = 1;
+ if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state))
+ rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
+ rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
+ } else {
+ rack->r_mbuf_queue = 0;
+ rack->rc_inp->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
+ }
+ if (rack_enable_shared_cwnd)
+ rack->rack_enable_scwnd = 1;
+ else
+ rack->rack_enable_scwnd = 0;
+ if (rack_do_dyn_mul) {
+ /* When dynamic adjustment is on CA needs to start at 100% */
+ rack->rc_gp_dyn_mul = 1;
+ if (rack_do_dyn_mul >= 100)
+ rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul;
+ } else {
+ rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca;
+ rack->rc_gp_dyn_mul = 0;
+ }
+ rack->r_rr_config = 0;
+ rack->r_ctl.rc_no_push_at_mrtt = 0;
+ rack->rc_pace_to_cwnd = 0;
+ rack->rc_pace_fill_if_rttin_range = 0;
+ rack->rtt_limit_mul = 0;
+
+ if (rack_enable_hw_pacing)
+ rack->rack_hdw_pace_ena = 1;
+ else
+ rack->rack_hdw_pace_ena = 0;
+ if (rack_disable_prr)
+ rack->rack_no_prr = 1;
+ else
+ rack->rack_no_prr = 0;
+ if (rack_limits_scwnd)
+ rack->r_limit_scw = 1;
+ else
+ rack->r_limit_scw = 0;
+ err = 0;
+ }
+ return (err);
+}
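The three canned profiles above are selected from user space with the TCP_RACK_PROFILE socket option. A minimal sketch, assuming the rack stack has already been selected for the socket (e.g. via TCP_FUNCTION_BLK) and that TCP_RACK_PROFILE is exported by <netinet/tcp.h> on this kernel:

    #include <sys/types.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <stdio.h>

    /* Select a rack profile: 0 restores defaults, 1-3 pick the canned setups above. */
    int
    set_rack_profile(int fd, int profile)
    {
            if (setsockopt(fd, IPPROTO_TCP, TCP_RACK_PROFILE,
                &profile, sizeof(profile)) == -1) {
                    /* EBUSY means no pacing slot was available (tcp_can_enable_pacing() failed). */
                    perror("TCP_RACK_PROFILE");
                    return (-1);
            }
            return (0);
    }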
+
+static int
+rack_add_deferred_option(struct tcp_rack *rack, int sopt_name, uint64_t loptval)
+{
+ struct deferred_opt_list *dol;
+
+ dol = malloc(sizeof(struct deferred_opt_list),
+ M_TCPFSB, M_NOWAIT|M_ZERO);
+ if (dol == NULL) {
+ /*
+ * No space -- fail out.
+ */
+ return (0);
+ }
+ dol->optname = sopt_name;
+ dol->optval = loptval;
+ TAILQ_INSERT_TAIL(&rack->r_ctl.opt_list, dol, next);
+ return (1);
+}
+
+static int
+rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name,
+ uint32_t optval, uint64_t loptval)
{
struct epoch_tracker et;
+ struct sockopt sopt;
+ struct cc_newreno_opts opt;
uint64_t val;
- int32_t error = 0, optval;
+ int error = 0;
uint16_t ca, ss;
-
- switch (sopt->sopt_name) {
- case TCP_RACK_PROP_RATE: /* URL:prop_rate */
- case TCP_RACK_PROP : /* URL:prop */
- case TCP_RACK_TLP_REDUCE: /* URL:tlp_reduce */
- case TCP_RACK_EARLY_RECOV: /* URL:early_recov */
- case TCP_RACK_PACE_REDUCE: /* Not used */
- /* Pacing related ones */
- case TCP_RACK_PACE_ALWAYS: /* URL:pace_always */
- case TCP_BBR_RACK_INIT_RATE: /* URL:irate */
- case TCP_BBR_IWINTSO: /* URL:tso_iwin */
- case TCP_RACK_PACE_MAX_SEG: /* URL:pace_max_seg */
- case TCP_RACK_FORCE_MSEG: /* URL:force_max_seg */
- case TCP_RACK_PACE_RATE_CA: /* URL:pr_ca */
- case TCP_RACK_PACE_RATE_SS: /* URL:pr_ss*/
- case TCP_RACK_PACE_RATE_REC: /* URL:pr_rec */
- case TCP_RACK_GP_INCREASE_CA: /* URL:gp_inc_ca */
- case TCP_RACK_GP_INCREASE_SS: /* URL:gp_inc_ss */
- case TCP_RACK_GP_INCREASE_REC: /* URL:gp_inc_rec */
- case TCP_RACK_RR_CONF: /* URL:rrr_conf */
- case TCP_BBR_HDWR_PACE: /* URL:hdwrpace */
- /* End pacing related */
- case TCP_DELACK:
- case TCP_RACK_PRR_SENDALOT: /* URL:prr_sendalot */
- case TCP_RACK_MIN_TO: /* URL:min_to */
- case TCP_RACK_EARLY_SEG: /* URL:early_seg */
- case TCP_RACK_REORD_THRESH: /* URL:reord_thresh */
- case TCP_RACK_REORD_FADE: /* URL:reord_fade */
- case TCP_RACK_TLP_THRESH: /* URL:tlp_thresh */
- case TCP_RACK_PKT_DELAY: /* URL:pkt_delay */
- case TCP_RACK_TLP_USE: /* URL:tlp_use */
- case TCP_RACK_TLP_INC_VAR: /* URL:tlp_inc_var */
- case TCP_RACK_IDLE_REDUCE_HIGH: /* URL:idle_reduce_high */
- case TCP_BBR_RACK_RTT_USE: /* URL:rttuse */
- case TCP_BBR_USE_RACK_RR: /* URL:rackrr */
- case TCP_RACK_DO_DETECTION: /* URL:detect */
- case TCP_NO_PRR: /* URL:noprr */
- case TCP_TIMELY_DYN_ADJ: /* URL:dynamic */
- case TCP_DATA_AFTER_CLOSE:
- case TCP_RACK_NONRXT_CFG_RATE: /* URL:nonrxtcr */
- case TCP_SHARED_CWND_ENABLE: /* URL:scwnd */
- case TCP_RACK_MBUF_QUEUE: /* URL:mqueue */
- case TCP_RACK_NO_PUSH_AT_MAX: /* URL:npush */
- case TCP_RACK_PACE_TO_FILL: /* URL:fillcw */
- case TCP_SHARED_CWND_TIME_LIMIT: /* URL:lscwnd */
- case TCP_RACK_PROFILE: /* URL:profile */
+
+ switch (sopt_name) {
+
+ case TCP_RACK_PACING_BETA:
+ RACK_OPTS_INC(tcp_rack_beta);
+ if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) {
+ /* This only works for newreno. */
+ error = EINVAL;
+ break;
+ }
+ if (rack->rc_pacing_cc_set) {
+ /*
+ * Set them into the real CC module;
+ * what's in the rack pcb is the old values
+ * to be used on restoral.
+ */
+ sopt.sopt_dir = SOPT_SET;
+ opt.name = CC_NEWRENO_BETA;
+ opt.val = optval;
+ if (CC_ALGO(tp)->ctl_output != NULL)
+ error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
+ else {
+ error = ENOENT;
+ break;
+ }
+ } else {
+ /*
+ * Not pacing yet so set it into our local
+ * rack pcb storage.
+ */
+ rack->r_ctl.rc_saved_beta.beta = optval;
+ }
break;
- default:
- return (tcp_default_ctloutput(so, sopt, inp, tp));
+ case TCP_RACK_PACING_BETA_ECN:
+ RACK_OPTS_INC(tcp_rack_beta_ecn);
+ if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) {
+ /* This only works for newreno. */
+ error = EINVAL;
+ break;
+ }
+ if (rack->rc_pacing_cc_set) {
+ /*
+ * Set them into the real CC module;
+ * what's in the rack pcb is the old values
+ * to be used on restoral.
+ */
+ sopt.sopt_dir = SOPT_SET;
+ opt.name = CC_NEWRENO_BETA_ECN;
+ opt.val = optval;
+ if (CC_ALGO(tp)->ctl_output != NULL)
+ error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
+ else
+ error = ENOENT;
+ } else {
+ /*
+ * Not pacing yet so set it into our local
+ * rack pcb storage.
+ */
+ rack->r_ctl.rc_saved_beta.beta_ecn = optval;
+ rack->r_ctl.rc_saved_beta.newreno_flags = CC_NEWRENO_BETA_ECN;
+ }
+ break;
+ case TCP_DEFER_OPTIONS:
+ RACK_OPTS_INC(tcp_defer_opt);
+ if (optval) {
+ if (rack->gp_ready) {
+ /* Too late */
+ error = EINVAL;
+ break;
+ }
+ rack->defer_options = 1;
+ } else
+ rack->defer_options = 0;
+ break;
+ case TCP_RACK_MEASURE_CNT:
+ RACK_OPTS_INC(tcp_rack_measure_cnt);
+ if (optval && (optval <= 0xff)) {
+ rack->r_ctl.req_measurements = optval;
+ } else
+ error = EINVAL;
+ break;
+ case TCP_REC_ABC_VAL:
+ RACK_OPTS_INC(tcp_rec_abc_val);
+ if (optval > 0)
+ rack->r_use_labc_for_rec = 1;
+ else
+ rack->r_use_labc_for_rec = 0;
+ break;
+ case TCP_RACK_ABC_VAL:
+ RACK_OPTS_INC(tcp_rack_abc_val);
+ if ((optval > 0) && (optval < 255))
+ rack->rc_labc = optval;
+ else
+ error = EINVAL;
+ break;
+ case TCP_HDWR_UP_ONLY:
+ RACK_OPTS_INC(tcp_pacing_up_only);
+ if (optval)
+ rack->r_up_only = 1;
+ else
+ rack->r_up_only = 0;
+ break;
+ case TCP_PACING_RATE_CAP:
+ RACK_OPTS_INC(tcp_pacing_rate_cap);
+ rack->r_ctl.bw_rate_cap = loptval;
break;
- }
- INP_WUNLOCK(inp);
- error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
- if (error)
- return (error);
- INP_WLOCK(inp);
- if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
- INP_WUNLOCK(inp);
- return (ECONNRESET);
- }
- tp = intotcpcb(inp);
- rack = (struct tcp_rack *)tp->t_fb_ptr;
- switch (sopt->sopt_name) {
case TCP_RACK_PROFILE:
RACK_OPTS_INC(tcp_profile);
- if (optval == 1) {
- /* pace_always=1 */
- rack->rc_always_pace = 1;
- tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ;
- /* scwnd=1 */
- rack->rack_enable_scwnd = 1;
- /* dynamic=100 */
- rack->rc_gp_dyn_mul = 1;
- rack->r_ctl.rack_per_of_gp_ca = 100;
- /* rrr_conf=3 */
- rack->r_rr_config = 3;
- /* npush=2 */
- rack->r_ctl.rc_no_push_at_mrtt = 2;
- /* fillcw=1 */
- rack->rc_pace_to_cwnd = 1;
- rack->rc_pace_fill_if_rttin_range = 0;
- rack->rtt_limit_mul = 0;
- /* noprr=1 */
- rack->rack_no_prr = 1;
- /* lscwnd=1 */
- rack->r_limit_scw = 1;
- } else if (optval == 2) {
- /* pace_always=1 */
- rack->rc_always_pace = 1;
+ error = rack_set_profile(rack, optval);
+ break;
+ case TCP_USE_CMP_ACKS:
+ RACK_OPTS_INC(tcp_use_cmp_acks);
+ if ((optval == 0) && (rack->rc_inp->inp_flags2 & INP_MBUF_ACKCMP)) {
+ /* You can't turn it off once it's on! */
+ error = EINVAL;
+ } else if ((optval == 1) && (rack->r_use_cmp_ack == 0)) {
+ rack->r_use_cmp_ack = 1;
+ rack->r_mbuf_queue = 1;
tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ;
- /* scwnd=1 */
- rack->rack_enable_scwnd = 1;
- /* dynamic=100 */
- rack->rc_gp_dyn_mul = 1;
- rack->r_ctl.rack_per_of_gp_ca = 100;
- /* rrr_conf=3 */
- rack->r_rr_config = 3;
- /* npush=2 */
- rack->r_ctl.rc_no_push_at_mrtt = 2;
- /* fillcw=1 */
- rack->rc_pace_to_cwnd = 1;
- rack->rc_pace_fill_if_rttin_range = 0;
- rack->rtt_limit_mul = 0;
- /* noprr=1 */
- rack->rack_no_prr = 1;
- /* lscwnd=0 */
- rack->r_limit_scw = 0;
}
+ if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state))
+ rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
break;
case TCP_SHARED_CWND_TIME_LIMIT:
RACK_OPTS_INC(tcp_lscwnd);
@@ -14226,8 +18917,11 @@
RACK_OPTS_INC(tcp_fillcw);
if (optval == 0)
rack->rc_pace_to_cwnd = 0;
- else
+ else {
rack->rc_pace_to_cwnd = 1;
+ if (optval > 1)
+ rack->r_fill_less_agg = 1;
+ }
if ((optval >= rack_gp_rtt_maxmul) &&
rack_gp_rtt_maxmul &&
(optval < 0xf)) {
@@ -14257,11 +18951,11 @@
case TCP_RACK_MBUF_QUEUE:
/* Now do we use the LRO mbuf-queue feature */
RACK_OPTS_INC(tcp_rack_mbufq);
- if (optval)
+ if (optval || rack->r_use_cmp_ack)
rack->r_mbuf_queue = 1;
else
rack->r_mbuf_queue = 0;
- if (rack->r_mbuf_queue || rack->rc_always_pace)
+ if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack)
tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ;
else
tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
@@ -14277,8 +18971,12 @@
RACK_OPTS_INC(tcp_rack_noprr);
if (optval == 0)
rack->rack_no_prr = 0;
- else
+ else if (optval == 1)
rack->rack_no_prr = 1;
+ else if (optval == 2)
+ rack->no_prr_addback = 1;
+ else
+ error = EINVAL;
break;
case TCP_TIMELY_DYN_ADJ:
RACK_OPTS_INC(tcp_timely_dyn);
@@ -14302,14 +19000,6 @@
else
rack->do_detection = 1;
break;
- case TCP_RACK_PROP_RATE:
- if ((optval <= 0) || (optval >= 100)) {
- error = EINVAL;
- break;
- }
- RACK_OPTS_INC(tcp_rack_prop_rate);
- rack->r_ctl.rc_prop_rate = optval;
- break;
case TCP_RACK_TLP_USE:
if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) {
error = EINVAL;
@@ -14318,34 +19008,39 @@
RACK_OPTS_INC(tcp_tlp_use);
rack->rack_tlp_threshold_use = optval;
break;
- case TCP_RACK_PROP:
- /* RACK proportional rate reduction (bool) */
- RACK_OPTS_INC(tcp_rack_prop);
- rack->r_ctl.rc_prop_reduce = optval;
- break;
case TCP_RACK_TLP_REDUCE:
/* RACK TLP cwnd reduction (bool) */
RACK_OPTS_INC(tcp_rack_tlp_reduce);
rack->r_ctl.rc_tlp_cwnd_reduce = optval;
break;
- case TCP_RACK_EARLY_RECOV:
- /* Should recovery happen early (bool) */
- RACK_OPTS_INC(tcp_rack_early_recov);
- rack->r_ctl.rc_early_recovery = optval;
- break;
-
- /* Pacing related ones */
+ /* Pacing related ones */
case TCP_RACK_PACE_ALWAYS:
/*
* zero is old rack method, 1 is new
* method using a pacing rate.
*/
RACK_OPTS_INC(tcp_rack_pace_always);
- if (optval > 0)
- rack->rc_always_pace = 1;
- else
- rack->rc_always_pace = 0;
- if (rack->r_mbuf_queue || rack->rc_always_pace)
+ if (optval > 0) {
+ if (rack->rc_always_pace) {
+ error = EALREADY;
+ break;
+ } else if (tcp_can_enable_pacing()) {
+ rack->rc_always_pace = 1;
+ if (rack->use_fixed_rate || rack->gp_ready)
+ rack_set_cc_pacing(rack);
+ }
+ else {
+ error = ENOSPC;
+ break;
+ }
+ } else {
+ if (rack->rc_always_pace) {
+ tcp_decrement_paced_conn();
+ rack->rc_always_pace = 0;
+ rack_undo_cc_pacing(rack);
+ }
+ }
+ if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack)
tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ;
else
tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
@@ -14426,7 +19121,7 @@
/* Max segments size in a pace in bytes */
RACK_OPTS_INC(tcp_rack_max_seg);
rack->rc_user_set_max_segs = optval;
- rack_set_pace_segments(tp, rack, __LINE__);
+ rack_set_pace_segments(tp, rack, __LINE__, NULL);
break;
case TCP_RACK_PACE_RATE_REC:
/* Set the fixed pacing rate in Bytes per second ca */
@@ -14437,6 +19132,8 @@
if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0)
rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
rack->use_fixed_rate = 1;
+ if (rack->rc_always_pace)
+ rack_set_cc_pacing(rack);
rack_log_pacing_delay_calc(rack,
rack->r_ctl.rc_fixed_pacing_rate_ss,
rack->r_ctl.rc_fixed_pacing_rate_ca,
@@ -14453,6 +19150,8 @@
if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0)
rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
rack->use_fixed_rate = 1;
+ if (rack->rc_always_pace)
+ rack_set_cc_pacing(rack);
rack_log_pacing_delay_calc(rack,
rack->r_ctl.rc_fixed_pacing_rate_ss,
rack->r_ctl.rc_fixed_pacing_rate_ca,
@@ -14469,6 +19168,8 @@
if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0)
rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
rack->use_fixed_rate = 1;
+ if (rack->rc_always_pace)
+ rack_set_cc_pacing(rack);
rack_log_pacing_delay_calc(rack,
rack->r_ctl.rc_fixed_pacing_rate_ss,
rack->r_ctl.rc_fixed_pacing_rate_ca,
@@ -14527,6 +19228,17 @@
else
rack->r_rr_config = 0;
break;
+ case TCP_HDWR_RATE_CAP:
+ RACK_OPTS_INC(tcp_hdwr_rate_cap);
+ if (optval) {
+ if (rack->r_rack_hw_rate_caps == 0)
+ rack->r_rack_hw_rate_caps = 1;
+ else
+ error = EALREADY;
+ } else {
+ rack->r_rack_hw_rate_caps = 0;
+ }
+ break;
case TCP_BBR_HDWR_PACE:
RACK_OPTS_INC(tcp_hdwr_pacing);
if (optval){
@@ -14538,14 +19250,16 @@
} else {
rack->rack_hdw_pace_ena = 0;
#ifdef RATELIMIT
- if (rack->rack_hdrw_pacing) {
+ if (rack->r_ctl.crte != NULL) {
rack->rack_hdrw_pacing = 0;
- in_pcbdetach_txrtlmt(rack->rc_inp);
+ rack->rack_attempt_hdwr_pace = 0;
+ tcp_rel_pacing_rate(rack->r_ctl.crte, tp);
+ rack->r_ctl.crte = NULL;
}
#endif
}
break;
- /* End Pacing related ones */
+ /* End Pacing related ones */
case TCP_RACK_PRR_SENDALOT:
/* Allow PRR to send more than one seg */
RACK_OPTS_INC(tcp_rack_prr_sendalot);
@@ -14589,19 +19303,20 @@
else
rack->use_rack_rr = 0;
break;
+ case TCP_FAST_RSM_HACK:
+ RACK_OPTS_INC(tcp_rack_fastrsm_hack);
+ if (optval)
+ rack->fast_rsm_hack = 1;
+ else
+ rack->fast_rsm_hack = 0;
+ break;
case TCP_RACK_PKT_DELAY:
/* RACK added ms i.e. rack-rtt + reord + N */
RACK_OPTS_INC(tcp_rack_pkt_delay);
rack->r_ctl.rc_pkt_delay = optval;
break;
- case TCP_RACK_TLP_INC_VAR:
- /* Does TLP include rtt variance in t-o */
- error = EINVAL;
- break;
- case TCP_RACK_IDLE_REDUCE_HIGH:
- error = EINVAL;
- break;
case TCP_DELACK:
+ RACK_OPTS_INC(tcp_rack_delayed_ack);
if (optval == 0)
tp->t_delayed_ack = 0;
else
@@ -14616,6 +19331,7 @@
break;
case TCP_BBR_RACK_RTT_USE:
+ RACK_OPTS_INC(tcp_rack_rtt_use);
if ((optval != USE_RTT_HIGH) &&
(optval != USE_RTT_LOW) &&
(optval != USE_RTT_AVG))
@@ -14624,32 +19340,208 @@
rack->r_ctl.rc_rate_sample_method = optval;
break;
case TCP_DATA_AFTER_CLOSE:
+ RACK_OPTS_INC(tcp_data_after_close);
if (optval)
rack->rc_allow_data_af_clo = 1;
else
rack->rc_allow_data_af_clo = 0;
break;
- case TCP_RACK_PACE_REDUCE:
- /* sysctl only now */
- error = EINVAL;
- break;
default:
- return (tcp_default_ctloutput(so, sopt, inp, tp));
break;
}
#ifdef NETFLIX_STATS
- tcp_log_socket_option(tp, sopt->sopt_name, optval, error);
+ tcp_log_socket_option(tp, sopt_name, optval, error);
#endif
+ return (error);
+}
+
+
+static void
+rack_apply_deferred_options(struct tcp_rack *rack)
+{
+ struct deferred_opt_list *dol, *sdol;
+ uint32_t s_optval;
+
+ TAILQ_FOREACH_SAFE(dol, &rack->r_ctl.opt_list, next, sdol) {
+ TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next);
+ /* Disadvantage of deferral is you lose the error return */
+ s_optval = (uint32_t)dol->optval;
+ (void)rack_process_option(rack->rc_tp, rack, dol->optname, s_optval, dol->optval);
+ free(dol, M_TCPDO);
+ }
+}
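The deferred-option machinery above is just a TAILQ of (name, value) pairs replayed later; as the comment notes, the per-option error is lost once an option is deferred. A stand-alone user-space sketch of the same pattern, illustrative only and using the hypothetical names def_opt/defer_opt:

    #include <sys/queue.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct def_opt {
            TAILQ_ENTRY(def_opt) next;
            int optname;
            uint64_t optval;
    };
    TAILQ_HEAD(def_opt_head, def_opt);

    /* Queue an option for later; returns 0 on allocation failure like rack_add_deferred_option(). */
    static int
    defer_opt(struct def_opt_head *h, int name, uint64_t val)
    {
            struct def_opt *d = calloc(1, sizeof(*d));

            if (d == NULL)
                    return (0);
            d->optname = name;
            d->optval = val;
            TAILQ_INSERT_TAIL(h, d, next);
            return (1);
    }

    /* Drain and apply; any per-option error is discarded, as in rack_apply_deferred_options(). */
    static void
    apply_deferred(struct def_opt_head *h)
    {
            struct def_opt *d, *tmp;

            TAILQ_FOREACH_SAFE(d, h, next, tmp) {
                    TAILQ_REMOVE(h, d, next);
                    printf("applying opt %d val %ju\n", d->optname, (uintmax_t)d->optval);
                    free(d);
            }
    }

    int
    main(void)
    {
            struct def_opt_head head = TAILQ_HEAD_INITIALIZER(head);

            defer_opt(&head, 1, 100);
            defer_opt(&head, 2, 200);
            apply_deferred(&head);
            return (0);
    }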
+
+/*
+ * rack_ctloutput() must drop the inpcb lock before performing copyin on
+ * socket option arguments. When it re-acquires the lock after the copy, it
+ * has to revalidate that the connection is still valid for the socket
+ * option.
+ */
+static int
+rack_set_sockopt(struct socket *so, struct sockopt *sopt,
+ struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
+{
+ uint64_t loptval;
+ int32_t error = 0, optval;
+
+ switch (sopt->sopt_name) {
+ case TCP_RACK_TLP_REDUCE: /* URL:tlp_reduce */
+ /* Pacing related ones */
+ case TCP_RACK_PACE_ALWAYS: /* URL:pace_always */
+ case TCP_BBR_RACK_INIT_RATE: /* URL:irate */
+ case TCP_BBR_IWINTSO: /* URL:tso_iwin */
+ case TCP_RACK_PACE_MAX_SEG: /* URL:pace_max_seg */
+ case TCP_RACK_FORCE_MSEG: /* URL:force_max_seg */
+ case TCP_RACK_PACE_RATE_CA: /* URL:pr_ca */
+ case TCP_RACK_PACE_RATE_SS: /* URL:pr_ss*/
+ case TCP_RACK_PACE_RATE_REC: /* URL:pr_rec */
+ case TCP_RACK_GP_INCREASE_CA: /* URL:gp_inc_ca */
+ case TCP_RACK_GP_INCREASE_SS: /* URL:gp_inc_ss */
+ case TCP_RACK_GP_INCREASE_REC: /* URL:gp_inc_rec */
+ case TCP_RACK_RR_CONF: /* URL:rrr_conf */
+ case TCP_BBR_HDWR_PACE: /* URL:hdwrpace */
+ case TCP_HDWR_RATE_CAP: /* URL: hdwrcap boolean */
+ case TCP_PACING_RATE_CAP: /* URL:cap-- used by side-channel */
+ case TCP_HDWR_UP_ONLY: /* URL:uponly -- hardware pacing boolean */
+ /* End pacing related */
+ case TCP_FAST_RSM_HACK: /* URL:frsm_hack */
+ case TCP_DELACK: /* URL:delack (in base TCP i.e. tcp_hints along with cc etc ) */
+ case TCP_RACK_PRR_SENDALOT: /* URL:prr_sendalot */
+ case TCP_RACK_MIN_TO: /* URL:min_to */
+ case TCP_RACK_EARLY_SEG: /* URL:early_seg */
+ case TCP_RACK_REORD_THRESH: /* URL:reord_thresh */
+ case TCP_RACK_REORD_FADE: /* URL:reord_fade */
+ case TCP_RACK_TLP_THRESH: /* URL:tlp_thresh */
+ case TCP_RACK_PKT_DELAY: /* URL:pkt_delay */
+ case TCP_RACK_TLP_USE: /* URL:tlp_use */
+ case TCP_BBR_RACK_RTT_USE: /* URL:rttuse */
+ case TCP_BBR_USE_RACK_RR: /* URL:rackrr */
+ case TCP_RACK_DO_DETECTION: /* URL:detect */
+ case TCP_NO_PRR: /* URL:noprr */
+ case TCP_TIMELY_DYN_ADJ: /* URL:dynamic */
+ case TCP_DATA_AFTER_CLOSE: /* no URL */
+ case TCP_RACK_NONRXT_CFG_RATE: /* URL:nonrxtcr */
+ case TCP_SHARED_CWND_ENABLE: /* URL:scwnd */
+ case TCP_RACK_MBUF_QUEUE: /* URL:mqueue */
+ case TCP_RACK_NO_PUSH_AT_MAX: /* URL:npush */
+ case TCP_RACK_PACE_TO_FILL: /* URL:fillcw */
+ case TCP_SHARED_CWND_TIME_LIMIT: /* URL:lscwnd */
+ case TCP_RACK_PROFILE: /* URL:profile */
+ case TCP_USE_CMP_ACKS: /* URL:cmpack */
+ case TCP_RACK_ABC_VAL: /* URL:labc */
+ case TCP_REC_ABC_VAL: /* URL:reclabc */
+ case TCP_RACK_MEASURE_CNT: /* URL:measurecnt */
+ case TCP_DEFER_OPTIONS: /* URL:defer */
+ case TCP_RACK_PACING_BETA: /* URL:pacing_beta */
+ case TCP_RACK_PACING_BETA_ECN: /* URL:pacing_beta_ecn */
+ break;
+ default:
+ /* Filter off all unknown options to the base stack */
+ return (tcp_default_ctloutput(so, sopt, inp, tp));
+ break;
+ }
+ INP_WUNLOCK(inp);
+ if (sopt->sopt_name == TCP_PACING_RATE_CAP) {
+ error = sooptcopyin(sopt, &loptval, sizeof(loptval), sizeof(loptval));
+ /*
+ * We truncate it down to 32 bits for the socket-option trace; this
+ * means rates > 34Gbps won't show right, but that's probably ok.
+ */
+ optval = (uint32_t)loptval;
+ } else {
+ error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
+ /* Save it in 64 bit form too */
+ loptval = optval;
+ }
+ if (error)
+ return (error);
+ INP_WLOCK(inp);
+ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+ INP_WUNLOCK(inp);
+ return (ECONNRESET);
+ }
+ if (rack->defer_options && (rack->gp_ready == 0) &&
+ (sopt->sopt_name != TCP_DEFER_OPTIONS) &&
+ (sopt->sopt_name != TCP_RACK_PACING_BETA) &&
+ (sopt->sopt_name != TCP_RACK_PACING_BETA_ECN) &&
+ (sopt->sopt_name != TCP_RACK_MEASURE_CNT)) {
+ /* Options are being deferred */
+ if (rack_add_deferred_option(rack, sopt->sopt_name, loptval)) {
+ INP_WUNLOCK(inp);
+ return (0);
+ } else {
+ /* No memory to defer, fail */
+ INP_WUNLOCK(inp);
+ return (ENOMEM);
+ }
+ }
+ error = rack_process_option(tp, rack, sopt->sopt_name, optval, loptval);
INP_WUNLOCK(inp);
return (error);
}
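The comment above describes the unlock/copyin/relock/revalidate dance rack_set_sockopt() performs. A minimal user-space analogue of that pattern, purely illustrative (a pthread mutex standing in for the inpcb lock, a dropped flag standing in for INP_TIMEWAIT|INP_DROPPED):

    #include <pthread.h>
    #include <stdbool.h>
    #include <string.h>

    struct conn {
            pthread_mutex_t lock;
            bool dropped;   /* connection torn down while we were unlocked */
            int optval;
    };

    /* Called with c->lock held; returns with it held. */
    static int
    conn_set_opt(struct conn *c, const int *uval)
    {
            int val;

            pthread_mutex_unlock(&c->lock);         /* can't hold the lock across the copy */
            memcpy(&val, uval, sizeof(val));        /* stand-in for sooptcopyin() */
            pthread_mutex_lock(&c->lock);
            if (c->dropped)
                    return (-1);                    /* ECONNRESET in the real code */
            c->optval = val;
            return (0);
    }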
+static void
+rack_fill_info(struct tcpcb *tp, struct tcp_info *ti)
+{
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ bzero(ti, sizeof(*ti));
+
+ ti->tcpi_state = tp->t_state;
+ if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
+ ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
+ if (tp->t_flags & TF_SACK_PERMIT)
+ ti->tcpi_options |= TCPI_OPT_SACK;
+ if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
+ ti->tcpi_options |= TCPI_OPT_WSCALE;
+ ti->tcpi_snd_wscale = tp->snd_scale;
+ ti->tcpi_rcv_wscale = tp->rcv_scale;
+ }
+ if (tp->t_flags2 & TF2_ECN_PERMIT)
+ ti->tcpi_options |= TCPI_OPT_ECN;
+ if (tp->t_flags & TF_FASTOPEN)
+ ti->tcpi_options |= TCPI_OPT_TFO;
+ /* still kept in ticks is t_rcvtime */
+ ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick;
+ /* Since we hold everything in precise useconds this is easy */
+ ti->tcpi_rtt = tp->t_srtt;
+ ti->tcpi_rttvar = tp->t_rttvar;
+ ti->tcpi_rto = tp->t_rxtcur;
+ ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
+ ti->tcpi_snd_cwnd = tp->snd_cwnd;
+ /*
+ * FreeBSD-specific extension fields for tcp_info.
+ */
+ ti->tcpi_rcv_space = tp->rcv_wnd;
+ ti->tcpi_rcv_nxt = tp->rcv_nxt;
+ ti->tcpi_snd_wnd = tp->snd_wnd;
+ ti->tcpi_snd_bwnd = 0; /* Unused, kept for compat. */
+ ti->tcpi_snd_nxt = tp->snd_nxt;
+ ti->tcpi_snd_mss = tp->t_maxseg;
+ ti->tcpi_rcv_mss = tp->t_maxseg;
+ ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
+ ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
+ ti->tcpi_snd_zerowin = tp->t_sndzerowin;
+#ifdef NETFLIX_STATS
+ ti->tcpi_total_tlp = tp->t_sndtlppack;
+ ti->tcpi_total_tlp_bytes = tp->t_sndtlpbyte;
+ memcpy(&ti->tcpi_rxsyninfo, &tp->t_rxsyninfo, sizeof(struct tcpsyninfo));
+#endif
+#ifdef TCP_OFFLOAD
+ if (tp->t_flags & TF_TOE) {
+ ti->tcpi_options |= TCPI_OPT_TOE;
+ tcp_offload_tcp_info(tp, ti);
+ }
+#endif
+}
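rack_fill_info() is what backs the TCP_INFO getsockopt handled below; reading it from an application is straightforward. A minimal sketch (units are whatever tcp_info reports; rack keeps srtt/rttvar in microseconds internally):

    #include <sys/types.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <stdio.h>

    /* Dump a few of the tcp_info fields filled in by rack_fill_info(). */
    int
    print_tcp_info(int fd)
    {
            struct tcp_info ti;
            socklen_t len = sizeof(ti);

            if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) == -1) {
                    perror("TCP_INFO");
                    return (-1);
            }
            printf("state %u srtt %u rttvar %u cwnd %u ssthresh %u\n",
                ti.tcpi_state, ti.tcpi_rtt, ti.tcpi_rttvar,
                ti.tcpi_snd_cwnd, ti.tcpi_snd_ssthresh);
            return (0);
    }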
+
static int
rack_get_sockopt(struct socket *so, struct sockopt *sopt,
struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
{
int32_t error, optval;
- uint64_t val;
+ uint64_t val, loptval;
+ struct tcp_info ti;
/*
* Because all our options are either boolean or an int, we can just
* pull everything into optval and then unlock and copy. If we ever
@@ -14658,12 +19550,93 @@
*/
error = 0;
switch (sopt->sopt_name) {
+ case TCP_INFO:
+ /* First get the info filled */
+ rack_fill_info(tp, &ti);
+ /* Fix up the rtt related fields if needed */
+ INP_WUNLOCK(inp);
+ error = sooptcopyout(sopt, &ti, sizeof ti);
+ return (error);
+ /*
+ * Beta is the congestion control value for NewReno that influences how
+ * much of a backoff happens when loss is detected. It is normally set
+ * to 50 for 50% i.e. the cwnd is reduced to 50% of its previous value
+ * when you exit recovery.
+ */
+ case TCP_RACK_PACING_BETA:
+ if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0)
+ error = EINVAL;
+ else if (rack->rc_pacing_cc_set == 0)
+ optval = rack->r_ctl.rc_saved_beta.beta;
+ else {
+ /*
+ * Reach out into the CC data and report back what
+ * I have previously set. Yeah it looks hackish but
+ * we don't want to report the saved values.
+ */
+ if (tp->ccv->cc_data)
+ optval = ((struct newreno *)tp->ccv->cc_data)->beta;
+ else
+ error = EINVAL;
+ }
+ break;
+ /*
+ * Beta_ecn is the congestion control value for NewReno that influences how
+ * much of a backoff happens when a ECN mark is detected. It is normally set
+ * to 80 for 80% i.e. the cwnd is reduced by 20% of its previous value when
+ * you exit recovery. Note that classic ECN has a beta of 50, it is only
+ * ABE ECN that uses this "less" value, but we do too with pacing :)
+ */
+
+ case TCP_RACK_PACING_BETA_ECN:
+ if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0)
+ error = EINVAL;
+ else if (rack->rc_pacing_cc_set == 0)
+ optval = rack->r_ctl.rc_saved_beta.beta_ecn;
+ else {
+ /*
+ * Reach out into the CC data and report back what
+ * I have previously set. Yeah it looks hackish but
+ * we don't want to report the saved values.
+ */
+ if (tp->ccv->cc_data)
+ optval = ((struct newreno *)tp->ccv->cc_data)->beta_ecn;
+ else
+ error = EINVAL;
+ }
+ break;
+ case TCP_FAST_RSM_HACK:
+ optval = rack->fast_rsm_hack;
+ break;
+ case TCP_DEFER_OPTIONS:
+ optval = rack->defer_options;
+ break;
+ case TCP_RACK_MEASURE_CNT:
+ optval = rack->r_ctl.req_measurements;
+ break;
+ case TCP_REC_ABC_VAL:
+ optval = rack->r_use_labc_for_rec;
+ break;
+ case TCP_RACK_ABC_VAL:
+ optval = rack->rc_labc;
+ break;
+ case TCP_HDWR_UP_ONLY:
+ optval = rack->r_up_only;
+ break;
+ case TCP_PACING_RATE_CAP:
+ loptval = rack->r_ctl.bw_rate_cap;
+ break;
case TCP_RACK_PROFILE:
/* You cannot retrieve a profile, its write only */
error = EINVAL;
break;
+ case TCP_USE_CMP_ACKS:
+ optval = rack->r_use_cmp_ack;
+ break;
case TCP_RACK_PACE_TO_FILL:
optval = rack->rc_pace_to_cwnd;
+ if (optval && rack->r_fill_less_agg)
+ optval++;
break;
case TCP_RACK_NO_PUSH_AT_MAX:
optval = rack->r_ctl.rc_no_push_at_mrtt;
@@ -14675,7 +19648,12 @@
optval = rack->rack_rec_nonrxt_use_cr;
break;
case TCP_NO_PRR:
- optval = rack->rack_no_prr;
+ if (rack->rack_no_prr == 1)
+ optval = 1;
+ else if (rack->no_prr_addback == 1)
+ optval = 2;
+ else
+ optval = 0;
break;
case TCP_RACK_DO_DETECTION:
optval = rack->do_detection;
@@ -14690,25 +19668,10 @@
case TCP_BBR_IWINTSO:
optval = rack->rc_init_win;
break;
- case TCP_RACK_PROP_RATE:
- optval = rack->r_ctl.rc_prop_rate;
- break;
- case TCP_RACK_PROP:
- /* RACK proportional rate reduction (bool) */
- optval = rack->r_ctl.rc_prop_reduce;
- break;
case TCP_RACK_TLP_REDUCE:
/* RACK TLP cwnd reduction (bool) */
optval = rack->r_ctl.rc_tlp_cwnd_reduce;
break;
- case TCP_RACK_EARLY_RECOV:
- /* Should recovery happen early (bool) */
- optval = rack->r_ctl.rc_early_recovery;
- break;
- case TCP_RACK_PACE_REDUCE:
- /* RACK Hptsi reduction factor (divisor) */
- error = EINVAL;
- break;
case TCP_BBR_RACK_INIT_RATE:
val = rack->r_ctl.init_rate;
/* convert to kbits per sec */
@@ -14754,6 +19717,9 @@
case TCP_RACK_RR_CONF:
optval = rack->r_rr_config;
break;
+ case TCP_HDWR_RATE_CAP:
+ optval = rack->r_rack_hw_rate_caps;
+ break;
case TCP_BBR_HDWR_PACE:
optval = rack->rack_hdw_pace_ena;
break;
@@ -14768,13 +19734,6 @@
case TCP_RACK_TLP_USE:
optval = rack->rack_tlp_threshold_use;
break;
- case TCP_RACK_TLP_INC_VAR:
- /* Does TLP include rtt variance in t-o */
- error = EINVAL;
- break;
- case TCP_RACK_IDLE_REDUCE_HIGH:
- error = EINVAL;
- break;
case TCP_RACK_PACE_RATE_CA:
optval = rack->r_ctl.rc_fixed_pacing_rate_ca;
break;
@@ -14808,7 +19767,10 @@
}
INP_WUNLOCK(inp);
if (error == 0) {
- error = sooptcopyout(sopt, &optval, sizeof optval);
+ if (sopt->sopt_name == TCP_PACING_RATE_CAP)
+ error = sooptcopyout(sopt, &loptval, sizeof loptval);
+ else
+ error = sooptcopyout(sopt, &optval, sizeof optval);
}
return (error);
}
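The beta/beta_ecn comments earlier in rack_get_sockopt() describe a simple percent-of-cwnd backoff: beta=50 halves the cwnd on loss, beta_ecn=80 cuts it by 20% on an ECN mark (ABE). A tiny worked sketch, assuming the backoff is cwnd * beta / 100:

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative only: the percent backoff described in the comments above. */
    static uint32_t
    backoff_cwnd(uint32_t cwnd, uint32_t beta)
    {
            return ((uint64_t)cwnd * beta / 100);
    }

    int
    main(void)
    {
            uint32_t cwnd = 100000;

            printf("loss: %u -> %u (beta 50)\n", cwnd, backoff_cwnd(cwnd, 50));
            printf("ecn:  %u -> %u (beta_ecn 80)\n", cwnd, backoff_cwnd(cwnd, 80));
            return (0);
    }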
@@ -14857,7 +19819,9 @@
.tfb_tcp_timer_stop = rack_timer_stop,
.tfb_tcp_rexmit_tmr = rack_remxt_tmr,
.tfb_tcp_handoff_ok = rack_handoff_ok,
+ .tfb_tcp_mtu_chg = rack_mtu_change,
.tfb_pru_options = rack_pru_options,
+
};
static const char *rack_stack_names[] = {
Index: sys/netinet/tcp_stacks/rack_bbr_common.h
===================================================================
--- sys/netinet/tcp_stacks/rack_bbr_common.h
+++ sys/netinet/tcp_stacks/rack_bbr_common.h
@@ -98,12 +98,20 @@
uint32_t ctf_outstanding(struct tcpcb *tp);
uint32_t ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked);
int
-ctf_drop_checks(struct tcpopt *to, struct mbuf *m,
- struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf,
- int32_t * drop_hdrlen, int32_t * ret_val);
+_ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th,
+ struct tcpcb *tp, int32_t *tlenp,
+ int32_t *thf, int32_t *drop_hdrlen, int32_t *ret_val,
+ uint32_t *ts, uint32_t *cnt);
+void ctf_ack_war_checks(struct tcpcb *tp, uint32_t *ts, uint32_t *cnt);
+#define ctf_drop_checks(a, b, c, d, e, f, g, h) _ctf_drop_checks(a, b, c, d, e, f, g, h, NULL, NULL)
+
void
-ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp,
- struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val);
+__ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp,
+ struct tcphdr *th, int32_t thflags, int32_t tlen,
+ int32_t *ret_val, uint32_t *ts, uint32_t *cnt);
+
+#define ctf_do_dropafterack(a, b, c, d, e, f) __ctf_do_dropafterack(a, b, c, d, e, f, NULL, NULL)
+
void
ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp,
struct tcphdr *th, int32_t rstreason, int32_t tlen);
@@ -122,6 +130,9 @@
ctf_ts_check(struct mbuf *m, struct tcphdr *th,
struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val);
+int
+ctf_ts_check_ac(struct tcpcb *tp, int32_t thflags);
+
void
ctf_calc_rwin(struct socket *so, struct tcpcb *tp);
Index: sys/netinet/tcp_stacks/rack_bbr_common.c
===================================================================
--- sys/netinet/tcp_stacks/rack_bbr_common.c
+++ sys/netinet/tcp_stacks/rack_bbr_common.c
@@ -96,6 +96,7 @@
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_hpts.h>
+#include <netinet/tcp_lro.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_log_buf.h>
#ifdef TCPDEBUG
@@ -161,6 +162,130 @@
}
#endif
+static int
+ctf_get_enet_type(struct ifnet *ifp, struct mbuf *m)
+{
+ struct ether_header *eh;
+ struct tcphdr *th;
+#ifdef INET6
+ struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */
+#endif
+#ifdef INET
+ struct ip *ip = NULL; /* Keep compiler happy. */
+#endif
+ int32_t tlen;
+ uint16_t drop_hdrlen;
+ uint16_t etype;
+ uint8_t iptos;
+
+ /* Is it the easy way? */
+ if (m->m_flags & M_LRO_EHDRSTRP)
+ return (m->m_pkthdr.lro_etype);
+ /*
+ * Ok this is the old style call, the ethernet header is here.
+ * This also means no checksum or BPF were done. This
+ * can happen if the race to setup the inp fails and
+ * LRO sees no INP at packet input, but by the time
+ * we queue the packets an INP gets there. It's rare,
+ * but it can occur, so we handle it. Note that
+ * this means duplicated work, but given the rarity
+ * it's not worth worrying about.
+ */
+ /* Let the BPF see the packet */
+ if (bpf_peers_present(ifp->if_bpf))
+ ETHER_BPF_MTAP(ifp, m);
+ /* Now the csum */
+ eh = mtod(m, struct ether_header *);
+ etype = ntohs(eh->ether_type);
+ m_adj(m, sizeof(*eh));
+ switch (etype) {
+#ifdef INET6
+ case ETHERTYPE_IPV6:
+ {
+ if (m->m_len < (sizeof(*ip6) + sizeof(*th))) {
+ m = m_pullup(m, sizeof(*ip6) + sizeof(*th));
+ if (m == NULL) {
+ KMOD_TCPSTAT_INC(tcps_rcvshort);
+ m_freem(m);
+ return (-1);
+ }
+ }
+ ip6 = (struct ip6_hdr *)(eh + 1);
+ th = (struct tcphdr *)(ip6 + 1);
+ drop_hdrlen = sizeof(*ip6);
+ tlen = ntohs(ip6->ip6_plen);
+ if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
+ if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
+ th->th_sum = m->m_pkthdr.csum_data;
+ else
+ th->th_sum = in6_cksum_pseudo(ip6, tlen,
+ IPPROTO_TCP,
+ m->m_pkthdr.csum_data);
+ th->th_sum ^= 0xffff;
+ } else
+ th->th_sum = in6_cksum(m, IPPROTO_TCP, drop_hdrlen, tlen);
+ if (th->th_sum) {
+ KMOD_TCPSTAT_INC(tcps_rcvbadsum);
+ m_freem(m);
+ return (-1);
+ }
+ return (etype);
+ }
+#endif
+#ifdef INET
+ case ETHERTYPE_IP:
+ {
+ if (m->m_len < sizeof (struct tcpiphdr)) {
+ m = m_pullup(m, sizeof (struct tcpiphdr));
+ if (m == NULL) {
+ KMOD_TCPSTAT_INC(tcps_rcvshort);
+ m_freem(m);
+ return (-1);
+ }
+ }
+ ip = (struct ip *)(eh + 1);
+ th = (struct tcphdr *)(ip + 1);
+ drop_hdrlen = sizeof(*ip);
+ iptos = ip->ip_tos;
+ tlen = ntohs(ip->ip_len) - sizeof(struct ip);
+ if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
+ if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
+ th->th_sum = m->m_pkthdr.csum_data;
+ else
+ th->th_sum = in_pseudo(ip->ip_src.s_addr,
+ ip->ip_dst.s_addr,
+ htonl(m->m_pkthdr.csum_data + tlen + IPPROTO_TCP));
+ th->th_sum ^= 0xffff;
+ } else {
+ int len;
+ struct ipovly *ipov = (struct ipovly *)ip;
+ /*
+ * Checksum extended TCP header and data.
+ */
+ len = drop_hdrlen + tlen;
+ bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
+ ipov->ih_len = htons(tlen);
+ th->th_sum = in_cksum(m, len);
+ /* Reset length for SDT probes. */
+ ip->ip_len = htons(len);
+ /* Reset TOS bits */
+ ip->ip_tos = iptos;
+ /* Re-initialization for later version check */
+ ip->ip_v = IPVERSION;
+ ip->ip_hl = sizeof(*ip) >> 2;
+ }
+ if (th->th_sum) {
+ KMOD_TCPSTAT_INC(tcps_rcvbadsum);
+ m_freem(m);
+ return (-1);
+ }
+ break;
+ }
+#endif
+ };
+ return (etype);
+}
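When the ethernet header is still present, ctf_get_enet_type() has to verify the TCP checksum itself (in6_cksum_pseudo()/in_cksum() above). For reference, the underlying arithmetic is the RFC 1071 ones'-complement sum; a stand-alone sketch over a flat buffer, not a drop-in for in_cksum(), which walks mbuf chains:

    #include <stdint.h>
    #include <stddef.h>

    /* Ones'-complement sum; over a valid segment (checksum field included) this returns 0. */
    static uint16_t
    cksum_rfc1071(const void *buf, size_t len)
    {
            const uint16_t *p = buf;
            uint32_t sum = 0;

            while (len > 1) {
                    sum += *p++;
                    len -= 2;
            }
            if (len)                        /* odd trailing byte */
                    sum += *(const uint8_t *)p;
            while (sum >> 16)               /* fold carries back in */
                    sum = (sum & 0xffff) + (sum >> 16);
            return (~sum & 0xffff);
    }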
+
/*
* The function ctf_process_inbound_raw() is used by
* transport developers to do the steps needed to
@@ -170,6 +295,7 @@
* - INP_SUPPORTS_MBUFQ
* - INP_MBUF_QUEUE_READY
* - INP_DONT_SACK_QUEUE
+ * - INP_MBUF_ACKCMP
*
* These flags help control how LRO will deliver
* packets to the transport. You first set in inp_flags2
@@ -186,6 +312,18 @@
* In some transport designs this is important since knowing
* the actual time we got the packet is useful information.
*
+ * A new special type of mbuf may also be supported by the transport
+ * if it has set the INP_MBUF_ACKCMP flag. If its set, LRO will
+ * possibly create a M_ACKCMP type mbuf. This is a mbuf with
+ * an array of "acks". One thing also to note is that when this
+ * occurs a subsequent LRO may find at the back of the untouched
+ * mbuf queue chain a M_ACKCMP and append on to it. This means
+ * that until the transport pulls in the mbuf chain queued
+ * for it more ack's may get on the mbufs that were already
+ * delivered. There currently is a limit of 6 acks condensed
+ * into 1 mbuf which means often when this is occuring, we
+ * don't get that effect but it does happen.
+ *
* Now there are some interesting Caveats that the transport
* designer needs to take into account when using this feature.
*
@@ -247,7 +385,6 @@
* shipped in, the tcb has been destroyed (or about to be destroyed).
*/
struct mbuf *m_save;
- struct ether_header *eh;
struct tcphdr *th;
#ifdef INET6
struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */
@@ -257,20 +394,18 @@
#endif
struct ifnet *ifp;
struct timeval tv;
+ struct inpcb *inp;
int32_t retval, nxt_pkt, tlen, off;
- uint16_t etype;
+ int etype = 0;
uint16_t drop_hdrlen;
- uint8_t iptos, no_vn=0, bpf_req=0;
+ uint8_t iptos, no_vn=0;
NET_EPOCH_ASSERT();
-
- if (m && m->m_pkthdr.rcvif)
- ifp = m->m_pkthdr.rcvif;
+ if (m)
+ ifp = m_rcvif(m);
else
ifp = NULL;
- if (ifp) {
- bpf_req = bpf_peers_present(ifp->if_bpf);
- } else {
+ if (ifp == NULL) {
/*
* We probably should not work around
* but kassert, since lro alwasy sets rcvif.
@@ -280,147 +415,86 @@
}
CURVNET_SET(ifp->if_vnet);
skip_vnet:
+ tcp_get_usecs(&tv);
while (m) {
m_save = m->m_nextpkt;
m->m_nextpkt = NULL;
- /* Now lets get the ether header */
- eh = mtod(m, struct ether_header *);
- etype = ntohs(eh->ether_type);
- /* Let the BPF see the packet */
- if (bpf_req && ifp)
- ETHER_BPF_MTAP(ifp, m);
- m_adj(m, sizeof(*eh));
- /* Trim off the ethernet header */
- switch (etype) {
-#ifdef INET6
- case ETHERTYPE_IPV6:
- {
- if (m->m_len < (sizeof(*ip6) + sizeof(*th))) {
- m = m_pullup(m, sizeof(*ip6) + sizeof(*th));
- if (m == NULL) {
- KMOD_TCPSTAT_INC(tcps_rcvshort);
- m_freem(m);
- goto skipped_pkt;
- }
- }
- ip6 = (struct ip6_hdr *)(eh + 1);
- th = (struct tcphdr *)(ip6 + 1);
- tlen = ntohs(ip6->ip6_plen);
- drop_hdrlen = sizeof(*ip6);
- if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
- if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
- th->th_sum = m->m_pkthdr.csum_data;
- else
- th->th_sum = in6_cksum_pseudo(ip6, tlen,
- IPPROTO_TCP, m->m_pkthdr.csum_data);
- th->th_sum ^= 0xffff;
- } else
- th->th_sum = in6_cksum(m, IPPROTO_TCP, drop_hdrlen, tlen);
- if (th->th_sum) {
- KMOD_TCPSTAT_INC(tcps_rcvbadsum);
- m_freem(m);
- goto skipped_pkt;
- }
- /*
- * Be proactive about unspecified IPv6 address in source.
- * As we use all-zero to indicate unbounded/unconnected pcb,
- * unspecified IPv6 address can be used to confuse us.
- *
- * Note that packets with unspecified IPv6 destination is
- * already dropped in ip6_input.
- */
- if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
- /* XXX stat */
- m_freem(m);
+ if ((m->m_flags & M_ACKCMP) == 0) {
+ /* Now lets get the ether header */
+ etype = ctf_get_enet_type(ifp, m);
+ if (etype == -1) {
+ /* Skip this packet it was freed by checksum */
goto skipped_pkt;
}
- iptos = IPV6_TRAFFIC_CLASS(ip6);
- break;
- }
+ KASSERT(((etype == ETHERTYPE_IPV6) || (etype == ETHERTYPE_IP)),
+ ("tp:%p m:%p etype:0x%x -- not IP or IPv6", tp, m, etype));
+ /* Trim off the ethernet header */
+ switch (etype) {
+#ifdef INET6
+ case ETHERTYPE_IPV6:
+ ip6 = mtod(m, struct ip6_hdr *);
+ th = (struct tcphdr *)(ip6 + 1);
+ tlen = ntohs(ip6->ip6_plen);
+ drop_hdrlen = sizeof(*ip6);
+ iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
+ break;
#endif
#ifdef INET
- case ETHERTYPE_IP:
- {
- if (m->m_len < sizeof (struct tcpiphdr)) {
- if ((m = m_pullup(m, sizeof (struct tcpiphdr)))
- == NULL) {
- KMOD_TCPSTAT_INC(tcps_rcvshort);
- m_freem(m);
- goto skipped_pkt;
- }
- }
- ip = (struct ip *)(eh + 1);
- th = (struct tcphdr *)(ip + 1);
- drop_hdrlen = sizeof(*ip);
- iptos = ip->ip_tos;
- tlen = ntohs(ip->ip_len) - sizeof(struct ip);
- if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
- if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
- th->th_sum = m->m_pkthdr.csum_data;
- else
- th->th_sum = in_pseudo(ip->ip_src.s_addr,
- ip->ip_dst.s_addr,
- htonl(m->m_pkthdr.csum_data + tlen +
- IPPROTO_TCP));
- th->th_sum ^= 0xffff;
- } else {
- int len;
- struct ipovly *ipov = (struct ipovly *)ip;
- /*
- * Checksum extended TCP header and data.
- */
- len = drop_hdrlen + tlen;
- bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
- ipov->ih_len = htons(tlen);
- th->th_sum = in_cksum(m, len);
- /* Reset length for SDT probes. */
- ip->ip_len = htons(len);
- /* Reset TOS bits */
- ip->ip_tos = iptos;
- /* Re-initialization for later version check */
- ip->ip_v = IPVERSION;
- ip->ip_hl = sizeof(*ip) >> 2;
- }
- if (th->th_sum) {
- KMOD_TCPSTAT_INC(tcps_rcvbadsum);
- m_freem(m);
- goto skipped_pkt;
- }
- break;
- }
+ case ETHERTYPE_IP:
+ ip = mtod(m, struct ip *);
+ th = (struct tcphdr *)(ip + 1);
+ drop_hdrlen = sizeof(*ip);
+ iptos = ip->ip_tos;
+ tlen = ntohs(ip->ip_len) - sizeof(struct ip);
+ break;
#endif
- }
- /*
- * Convert TCP protocol specific fields to host format.
- */
- tcp_fields_to_host(th);
-
- off = th->th_off << 2;
- if (off < sizeof (struct tcphdr) || off > tlen) {
- KMOD_TCPSTAT_INC(tcps_rcvbadoff);
+ } /* end switch */
+ /*
+ * Convert TCP protocol specific fields to host format.
+ */
+ tcp_fields_to_host(th);
+ off = th->th_off << 2;
+ if (off < sizeof (struct tcphdr) || off > tlen) {
+ printf("off:%d < hdrlen:%zu || > tlen:%u -- dump\n",
+ off,
+ sizeof(struct tcphdr),
+ tlen);
+ KMOD_TCPSTAT_INC(tcps_rcvbadoff);
m_freem(m);
goto skipped_pkt;
- }
- tlen -= off;
- drop_hdrlen += off;
- /*
- * Now lets setup the timeval to be when we should
- * have been called (if we can).
- */
- m->m_pkthdr.lro_nsegs = 1;
- if (m->m_flags & M_TSTMP_LRO) {
- tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000;
- tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000;
+ }
+ tlen -= off;
+ drop_hdrlen += off;
+ /*
+ * Now lets setup the timeval to be when we should
+ * have been called (if we can).
+ */
+ m->m_pkthdr.lro_nsegs = 1;
+ /* Now what about next packet? */
} else {
- /* Should not be should we kassert instead? */
- tcp_get_usecs(&tv);
+ /*
+ * This mbuf is an array of acks that have
+ * been compressed. We assert the inp has
+ * the flag set to enable this!
+ */
+ KASSERT((tp->t_inpcb->inp_flags2 & INP_MBUF_ACKCMP),
+ ("tp:%p inp:%p no INP_MBUF_ACKCMP flags?", tp, tp->t_inpcb));
+ tlen = 0;
+ drop_hdrlen = 0;
+ th = NULL;
+ iptos = 0;
}
- /* Now what about next packet? */
+ tcp_get_usecs(&tv);
if (m_save || has_pkt)
nxt_pkt = 1;
else
nxt_pkt = 0;
- KMOD_TCPSTAT_INC(tcps_rcvtotal);
+ if ((m->m_flags & M_ACKCMP) == 0)
+ KMOD_TCPSTAT_INC(tcps_rcvtotal);
+ else
+ KMOD_TCPSTAT_ADD(tcps_rcvtotal, (m->m_len / sizeof(struct tcp_ackent)));
+ inp = tp->t_inpcb;
+ INP_WLOCK_ASSERT(inp);
retval = (*tp->t_fb->tfb_do_segment_nounlock)(m, th, so, tp, drop_hdrlen, tlen,
iptos, nxt_pkt, &tv);
if (retval) {
@@ -434,6 +508,7 @@
}
if (no_vn == 0)
CURVNET_RESTORE();
+ INP_UNLOCK_ASSERT(inp);
return(retval);
}
skipped_pkt:
@@ -482,11 +557,6 @@
if (rc_sacked <= ctf_outstanding(tp))
return(ctf_outstanding(tp) - rc_sacked);
else {
- /* TSNH */
-#ifdef INVARIANTS
- panic("tp:%p rc_sacked:%d > out:%d",
- tp, rc_sacked, ctf_outstanding(tp));
-#endif
return (0);
}
}
@@ -502,6 +572,36 @@
tcp_dropwithreset(m, th, NULL, tlen, rstreason);
}
+void
+ctf_ack_war_checks(struct tcpcb *tp, uint32_t *ts, uint32_t *cnt)
+{
+ if ((ts != NULL) && (cnt != NULL) &&
+ (tcp_ack_war_time_window > 0) &&
+ (tcp_ack_war_cnt > 0)) {
+ /* We are possibly doing ack war prevention */
+ uint32_t cts;
+
+ /*
+ * We use a msec tick here which gives us
+ * roughly 49 days. We don't need the
+ * precision of a microsecond timestamp which
+ * would only give us hours.
+ */
+ cts = tcp_ts_getticks();
+ if (TSTMP_LT((*ts), cts)) {
+ /* Timestamp is in the past */
+ *cnt = 0;
+ *ts = (cts + tcp_ack_war_time_window);
+ }
+ if (*cnt < tcp_ack_war_cnt) {
+ *cnt = (*cnt + 1);
+ tp->t_flags |= TF_ACKNOW;
+ } else
+ tp->t_flags &= ~TF_ACKNOW;
+ } else
+ tp->t_flags |= TF_ACKNOW;
+}
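ctf_ack_war_checks() is a simple token budget over a millisecond window: once tcp_ack_war_cnt ACKs have been forced inside tcp_ack_war_time_window ms, further TF_ACKNOW requests are suppressed until the window rolls over. A stand-alone sketch of the same idea, assuming a 32-bit millisecond tick like tcp_ts_getticks():

    #include <stdint.h>
    #include <stdio.h>

    #define WAR_WINDOW_MS   1000    /* default tcp_ack_war_time_window */
    #define WAR_MAX_ACKS    5       /* default tcp_ack_war_cnt */

    /* Returns 1 if an ACK may be forced now, 0 if the window budget is spent. */
    static int
    ack_allowed(uint32_t now_ms, uint32_t *win_end, uint32_t *cnt)
    {
            if ((int32_t)(*win_end - now_ms) < 0) { /* window expired (TSTMP_LT-style compare) */
                    *cnt = 0;
                    *win_end = now_ms + WAR_WINDOW_MS;
            }
            if (*cnt < WAR_MAX_ACKS) {
                    (*cnt)++;
                    return (1);
            }
            return (0);
    }

    int
    main(void)
    {
            uint32_t win_end = 0, cnt = 0, t;

            for (t = 0; t < 8; t++)
                    printf("t=%u send=%d\n", t, ack_allowed(t, &win_end, &cnt));
            return (0);
    }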
+
/*
* ctf_drop_checks returns 1 for you should not proceed. It places
* in ret_val what should be returned 1/0 by the caller. The 1 indicates
@@ -509,7 +609,10 @@
* TCB is still valid and locked.
*/
int
-ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val)
+_ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th,
+ struct tcpcb *tp, int32_t *tlenp,
+ int32_t *thf, int32_t *drop_hdrlen, int32_t *ret_val,
+ uint32_t *ts, uint32_t *cnt)
{
int32_t todrop;
int32_t thflags;
@@ -543,7 +646,7 @@
* Send an ACK to resynchronize and drop any data.
* But keep on processing for RST or ACK.
*/
- tp->t_flags |= TF_ACKNOW;
+ ctf_ack_war_checks(tp, ts, cnt);
todrop = tlen;
KMOD_TCPSTAT_INC(tcps_rcvduppack);
KMOD_TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
@@ -555,13 +658,14 @@
* DSACK - add SACK block for dropped range
*/
if ((todrop > 0) && (tp->t_flags & TF_SACK_PERMIT)) {
- tcp_update_sack_list(tp, th->th_seq,
- th->th_seq + todrop);
/*
* ACK now, as the next in-sequence segment
* will clear the DSACK block again
*/
- tp->t_flags |= TF_ACKNOW;
+ ctf_ack_war_checks(tp, ts, cnt);
+ if (tp->t_flags & TF_ACKNOW)
+ tcp_update_sack_list(tp, th->th_seq,
+ th->th_seq + todrop);
}
*drop_hdrlen += todrop; /* drop from the top afterwards */
th->th_seq += todrop;
@@ -590,10 +694,10 @@
* ack.
*/
if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
- tp->t_flags |= TF_ACKNOW;
+ ctf_ack_war_checks(tp, ts, cnt);
KMOD_TCPSTAT_INC(tcps_rcvwinprobe);
} else {
- ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
+ __ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val, ts, cnt);
return (1);
}
} else
@@ -614,7 +718,7 @@
* and valid.
*/
void
-ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val)
+__ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t *ret_val, uint32_t *ts, uint32_t *cnt)
{
/*
* Generate an ACK dropping incoming segment if it occupies sequence
@@ -638,7 +742,7 @@
return;
} else
*ret_val = 0;
- tp->t_flags |= TF_ACKNOW;
+ ctf_ack_war_checks(tp, ts, cnt);
if (m)
m_freem(m);
}
@@ -671,7 +775,7 @@
*/
int dropped = 0;
- if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) &&
+ if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
(tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
KASSERT(tp->t_state != TCPS_SYN_SENT,
@@ -680,8 +784,7 @@
if (V_tcp_insecure_rst ||
(tp->last_ack_sent == th->th_seq) ||
- (tp->rcv_nxt == th->th_seq) ||
- ((tp->last_ack_sent - 1) == th->th_seq)) {
+ (tp->rcv_nxt == th->th_seq)) {
KMOD_TCPSTAT_INC(tcps_drops);
/* Drop the connection. */
switch (tp->t_state) {
@@ -748,7 +851,7 @@
}
/*
- * bbr_ts_check returns 1 for you should not proceed, the state
+ * ctf_ts_check returns 1 for you should not proceed, the state
* machine should return. It places in ret_val what should
* be returned 1/0 by the caller (hpts_do_segment). The 1 indicates
* that the TCB is unlocked and probably dropped. The 0 indicates the
@@ -786,6 +889,32 @@
return (0);
}
+int
+ctf_ts_check_ac(struct tcpcb *tp, int32_t thflags)
+{
+
+ if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
+ /*
+ * Invalidate ts_recent. If this segment updates ts_recent,
+ * the age will be reset later and ts_recent will get a
+ * valid value. If it does not, setting ts_recent to zero
+ * will at least satisfy the requirement that zero be placed
+ * in the timestamp echo reply when ts_recent isn't valid.
+ * The age isn't reset until we get a valid ts_recent
+ * because we don't want out-of-order segments to be dropped
+ * when ts_recent is old.
+ */
+ tp->ts_recent = 0;
+ } else {
+ KMOD_TCPSTAT_INC(tcps_rcvduppack);
+ KMOD_TCPSTAT_INC(tcps_pawsdrop);
+ return (1);
+ }
+ return (0);
+}
+
+
+
void
ctf_calc_rwin(struct socket *so, struct tcpcb *tp)
{
@@ -817,45 +946,7 @@
uint32_t
ctf_fixed_maxseg(struct tcpcb *tp)
{
- int optlen;
-
- if (tp->t_flags & TF_NOOPT)
- return (tp->t_maxseg);
-
- /*
- * Here we have a simplified code from tcp_addoptions(),
- * without a proper loop, and having most of paddings hardcoded.
- * We only consider fixed options that we would send every
- * time I.e. SACK is not considered.
- *
- */
-#define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4)
- if (TCPS_HAVEESTABLISHED(tp->t_state)) {
- if (tp->t_flags & TF_RCVD_TSTMP)
- optlen = TCPOLEN_TSTAMP_APPA;
- else
- optlen = 0;
-#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
- if (tp->t_flags & TF_SIGNATURE)
- optlen += PAD(TCPOLEN_SIGNATURE);
-#endif
- } else {
- if (tp->t_flags & TF_REQ_TSTMP)
- optlen = TCPOLEN_TSTAMP_APPA;
- else
- optlen = PAD(TCPOLEN_MAXSEG);
- if (tp->t_flags & TF_REQ_SCALE)
- optlen += PAD(TCPOLEN_WINDOW);
-#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
- if (tp->t_flags & TF_SIGNATURE)
- optlen += PAD(TCPOLEN_SIGNATURE);
-#endif
- if (tp->t_flags & TF_SACK_PERMIT)
- optlen += PAD(TCPOLEN_SACK_PERMITTED);
- }
-#undef PAD
- optlen = min(optlen, TCP_MAXOLEN);
- return (tp->t_maxseg - optlen);
+ return (tcp_fixed_maxseg(tp));
}
void
Index: sys/netinet/tcp_stacks/tcp_bbr.h
===================================================================
--- sys/netinet/tcp_stacks/tcp_bbr.h
+++ sys/netinet/tcp_stacks/tcp_bbr.h
@@ -71,7 +71,7 @@
uint32_t r_del_time; /* The time of the last delivery update */
uint8_t r_rtr_cnt:4, /* Retran count, index this -1 to get time
* sent */
- unused_bit:1,
+ r_rtt_not_allowed:1, /* No rtt measurement allowed */
r_is_drain:1, /* In a draining cycle */
r_app_limited:1,/* We went app limited */
r_ts_valid:1; /* Timestamp field is valid (r_del_ack_ts) */
@@ -588,9 +588,9 @@
uint32_t rc_reorder_ts; /* Last time we saw reordering Lock(a) */
uint32_t rc_init_rwnd; /* Initial rwnd when we transitioned */
- /*- ---
+ /*- ---
* used only initial and close
- */
+ */
uint32_t rc_high_rwnd; /* Highest rwnd seen */
uint32_t rc_lowest_rtt; /* Smallest RTT we have seen */
Index: sys/netinet/tcp_stacks/tcp_rack.h
===================================================================
--- sys/netinet/tcp_stacks/tcp_rack.h
+++ sys/netinet/tcp_stacks/tcp_rack.h
@@ -29,7 +29,7 @@
#define _NETINET_TCP_RACK_H_
#define RACK_ACKED 0x0001/* The remote endpoint acked this */
-#define RACK_TO_MIXED 0x0002/* A timeout occurred that mixed the send order - not used */
+#define RACK_TO_REXT 0x0002/* A timeout occurred on this sendmap entry */
#define RACK_DEFERRED 0x0004/* We can't use this for RTT calc - not used */
#define RACK_OVERMAX 0x0008/* We have more retran's then we can fit */
#define RACK_SACK_PASSED 0x0010/* A sack was done above this block */
@@ -39,37 +39,94 @@
#define RACK_RWND_COLLAPSED 0x0100/* The peer collapsed the rwnd on the segment */
#define RACK_APP_LIMITED 0x0200/* We went app limited after this send */
#define RACK_WAS_ACKED 0x0400/* a RTO undid the ack, but it already had a rtt calc done */
-#define RACK_HAS_SIN 0x0800/* SIN is on this guy */
+#define RACK_HAS_SYN 0x0800/* SYN is on this guy */
+#define RACK_SENT_W_DSACK 0x1000/* Sent with a dsack */
+#define RACK_SENT_SP 0x2000/* sent in slow path */
+#define RACK_SENT_FP 0x4000/* sent in fast path */
+#define RACK_HAD_PUSH 0x8000/* Push was sent on original send */
#define RACK_NUM_OF_RETRANS 3
-#define RACK_INITIAL_RTO 1000 /* 1 second in milli seconds */
+#define RACK_INITIAL_RTO 1000000 /* 1 second in microseconds */
-#define RACK_REQ_AVG 4 /* Must be less than 256 */
+#define RACK_REQ_AVG 3 /* Must be less than 256 */
struct rack_sendmap {
+ TAILQ_ENTRY(rack_sendmap) r_tnext; /* Time of transmit based next */
uint32_t r_start; /* Sequence number of the segment */
uint32_t r_end; /* End seq, this is 1 beyond actually */
- TAILQ_ENTRY(rack_sendmap) r_tnext; /* Time of transmit based next */
- RB_ENTRY(rack_sendmap) r_next; /* RB Tree next */
uint32_t r_rtr_bytes; /* How many bytes have been retransmitted */
uint16_t r_rtr_cnt; /* Retran count, index this -1 to get time
* sent */
uint16_t r_flags; /* Flags as defined above */
- uint32_t r_tim_lastsent[RACK_NUM_OF_RETRANS];
- uint32_t usec_orig_send; /* time of orginal send in useconds */
+ struct mbuf *m;
+ uint32_t soff;
+ uint32_t orig_m_len;
uint32_t r_nseq_appl; /* If this one is app limited, this is the nxt seq limited */
- uint32_t r_ack_arrival; /* This is the time of ack-arrival (if SACK'd) */
uint8_t r_dupack; /* Dup ack count */
uint8_t r_in_tmap; /* Flag to see if its in the r_tnext array */
uint8_t r_limit_type; /* is this entry counted against a limit? */
uint8_t r_just_ret : 1, /* After sending, the next pkt was just returned, i.e. limited */
r_one_out_nr : 1, /* Special case 1 outstanding and not in recovery */
- r_avail : 6;
- uint8_t r_resv[36];
+ r_no_rtt_allowed : 1, /* No rtt measurement allowed */
+ r_avail : 5;
+ uint64_t r_tim_lastsent[RACK_NUM_OF_RETRANS];
+ uint64_t r_ack_arrival; /* This is the time of ack-arrival (if SACK'd) */
+ RB_ENTRY(rack_sendmap) r_next; /* RB Tree next */
};
+struct deferred_opt_list {
+ TAILQ_ENTRY(deferred_opt_list) next;
+ int optname;
+ uint64_t optval;
+};
+
+/*
+ * Timestamps in the rack sendmap are now moving to be
+ * uint64_t's. This means that if you want a uint32_t
+ * usec timestamp (the old usecond timestamp) you simply have
+ * to cast it to uint32_t. The reason we do this is not for
+ * wrap, but we need to get back, at times, to the millisecond
+ * timestamp that is used in the TSTMP option. To do this we
+ * can use the rack_ts_to_msec() inline below which can take
+ * the 64bit ts and make into the correct timestamp millisecond
+ * wise. That's not possible with the 32bit usecond timestamp since
+ * the seconds wrap too quickly to cover all bases.
+ *
+ * There are quite a few places in rack where I simply cast
+ * back to uint32_t and then end up using the TSTMP_XX()
+ * macros. This is ok, but we could do simple compares if
+ * we ever decided to move all of those variables to 64 bits
+ * as well.
+ */
+
+inline uint64_t
+rack_to_usec_ts(struct timeval *tv)
+{
+ return ((tv->tv_sec * HPTS_USEC_IN_SEC) + tv->tv_usec);
+}
+
+inline uint32_t
+rack_ts_to_msec(uint64_t ts)
+{
+ return((uint32_t)(ts / HPTS_MSEC_IN_SEC));
+}
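A quick worked example of the conversions described above, assuming HPTS_USEC_IN_SEC is 1000000 and the divisor used by rack_ts_to_msec() is 1000 (microseconds per millisecond):

    #include <stdint.h>
    #include <stdio.h>

    #define USEC_IN_SEC     1000000ULL      /* HPTS_USEC_IN_SEC */
    #define USEC_PER_MSEC   1000ULL         /* divisor used by rack_ts_to_msec() */

    int
    main(void)
    {
            /* e.g. 1234 seconds and 567890 usec worth of 64-bit timestamp */
            uint64_t ts64 = 1234ULL * USEC_IN_SEC + 567890;

            printf("usec32 = %u\n", (uint32_t)ts64);                   /* old 32-bit usec stamp */
            printf("msec   = %u\n", (uint32_t)(ts64 / USEC_PER_MSEC)); /* rack_ts_to_msec() */
            return (0);
    }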
+
+
RB_HEAD(rack_rb_tree_head, rack_sendmap);
TAILQ_HEAD(rack_head, rack_sendmap);
+TAILQ_HEAD(def_opt_head, deferred_opt_list);
+
+/* Map change logging */
+#define MAP_MERGE 0x01
+#define MAP_SPLIT 0x02
+#define MAP_NEW 0x03
+#define MAP_SACK_M1 0x04
+#define MAP_SACK_M2 0x05
+#define MAP_SACK_M3 0x06
+#define MAP_SACK_M4 0x07
+#define MAP_SACK_M5 0x08
+#define MAP_FREE 0x09
+#define MAP_TRIM_HEAD 0x0a
#define RACK_LIMIT_TYPE_SPLIT 1
@@ -128,10 +185,7 @@
#define RACK_TO_FRM_DELACK 6
struct rack_opts_stats {
- uint64_t tcp_rack_prop_rate;
- uint64_t tcp_rack_prop;
uint64_t tcp_rack_tlp_reduce;
- uint64_t tcp_rack_early_recov;
uint64_t tcp_rack_pace_always;
uint64_t tcp_rack_pace_reduce;
uint64_t tcp_rack_max_seg;
@@ -177,6 +231,20 @@
uint64_t tcp_npush;
uint64_t tcp_lscwnd;
uint64_t tcp_profile;
+ uint64_t tcp_hdwr_rate_cap;
+ uint64_t tcp_pacing_rate_cap;
+ uint64_t tcp_pacing_up_only;
+ uint64_t tcp_use_cmp_acks;
+ uint64_t tcp_rack_abc_val;
+ uint64_t tcp_rec_abc_val;
+ uint64_t tcp_rack_measure_cnt;
+ uint64_t tcp_rack_delayed_ack;
+ uint64_t tcp_rack_rtt_use;
+ uint64_t tcp_data_after_close;
+ uint64_t tcp_defer_opt;
+ uint64_t tcp_rack_fastrsm_hack;
+ uint64_t tcp_rack_beta;
+ uint64_t tcp_rack_beta_ecn;
};
/* RTT shrink reasons */
@@ -247,6 +315,23 @@
*/
#define RACK_GP_HIST 4 /* How much goodput history do we maintain? */
+#define RACK_NUM_FSB_DEBUG 16
+struct rack_fast_send_blk {
+ uint32_t left_to_send;
+ uint16_t tcp_ip_hdr_len;
+ uint8_t tcp_flags;
+ uint8_t hoplimit;
+ uint8_t *tcp_ip_hdr;
+ uint32_t recwin;
+ uint32_t off;
+ struct tcphdr *th;
+ struct udphdr *udp;
+ struct mbuf *m;
+ uint32_t o_m_len;
+ uint32_t rfo_apply_push : 1,
+ unused : 31;
+};
+
struct rack_control {
/* Second cache line 0x40 from tcp_rack */
struct rack_rb_tree_head rc_mtree; /* Tree of all segments Lock(a) */
@@ -255,6 +340,7 @@
* tlp_sending Lock(a) */
struct rack_sendmap *rc_resend; /* something we have been asked to
* resend */
+ struct rack_fast_send_blk fsb; /* The fast-send block */
uint32_t input_pkt;
uint32_t saved_input_pkt;
uint32_t rc_hpts_flags;
@@ -268,6 +354,9 @@
/* Third Cache line 0x80 */
struct rack_head rc_free; /* Allocation array */
+ uint64_t last_hw_bw_req;
+ uint64_t crte_prev_rate;
+ uint64_t bw_rate_cap;
uint32_t rc_time_last_sent; /* Time we last sent some data and
* logged it Lock(a). */
uint32_t rc_reorder_ts; /* Last time we saw reordering Lock(a) */
@@ -342,8 +431,8 @@
uint32_t rc_agg_delayed;
uint32_t rc_tlp_rxt_last_time;
uint32_t rc_saved_cwnd;
- uint32_t rc_gp_output_ts;
- uint32_t rc_gp_cumack_ts;
+ uint64_t rc_gp_output_ts; /* chg*/
+ uint64_t rc_gp_cumack_ts; /* chg*/
struct timeval act_rcv_time;
struct timeval rc_last_time_decay; /* SAD time decay happened here */
uint64_t gp_bw;
@@ -354,6 +443,7 @@
uint64_t last_gp_comp_bw;
uint64_t last_max_bw; /* Our calculated max b/w last */
struct time_filter_small rc_gp_min_rtt;
+ struct def_opt_head opt_list;
int32_t rc_rtt_diff; /* Timely style rtt diff of our gp_srtt */
uint32_t rc_gp_srtt; /* Current GP srtt */
uint32_t rc_prev_gp_srtt; /* Previous RTT */
@@ -370,21 +460,40 @@
uint32_t rc_time_of_last_probertt;
uint32_t rc_target_probertt_flight;
uint32_t rc_probertt_sndmax_atexit; /* Highest sent to in probe-rtt */
+ uint32_t rc_cwnd_at_erec;
+ uint32_t rc_ssthresh_at_erec;
+ uint32_t dsack_byte_cnt;
+ uint32_t retran_during_recovery;
uint32_t rc_gp_lowrtt; /* Lowest rtt seen during GPUT measurement */
uint32_t rc_gp_high_rwnd; /* Highest rwnd seen during GPUT measurement */
+ uint32_t rc_snd_max_at_rto; /* For non-sack when the RTO occurred what was snd-max */
+ uint32_t rc_out_at_rto;
int32_t rc_scw_index;
uint32_t rc_tlp_threshold; /* Socket option value Lock(a) */
+ uint32_t rc_last_timeout_snduna;
+ uint32_t challenge_ack_ts;
+ uint32_t challenge_ack_cnt;
+ uint32_t rc_min_to; /* Socket option value Lock(a) */
+ uint32_t rc_pkt_delay; /* Socket option value Lock(a) */
+ struct newreno rc_saved_beta; /*
+ * For newreno cc:
+ * rc_saved_beta holds the values the user has
+ * set, if pacing is not happening
+ * (i.e. it's early and we have not turned on yet
+ * or it was turned off). The minute pacing
+ * is turned on we pull out the values currently
+ * being used by newreno and replace them with
+ * these values, then save off the old values here;
+ * we also set the flag (if ecn_beta is set) to make
+ * newreno do less of a backoff for ECN (think ABE).
+ */
uint16_t rc_early_recovery_segs; /* Socket option value Lock(a) */
uint16_t rc_reorder_shift; /* Socket option value Lock(a) */
- uint16_t rc_pkt_delay; /* Socket option value Lock(a) */
uint8_t rc_no_push_at_mrtt; /* No push when we exceed max rtt */
- uint8_t num_avg; /* average count before we go to normal decay */
- uint8_t rc_prop_rate; /* Socket option value Lock(a) */
- uint8_t rc_prop_reduce; /* Socket option value Lock(a) */
+ uint8_t num_measurements; /* Number of measurements (up to 0xff, we freeze at 0xff) */
+ uint8_t req_measurements; /* How many measurements are required? */
uint8_t rc_tlp_cwnd_reduce; /* Socket option value Lock(a) */
- uint8_t rc_early_recovery; /* Socket option value Lock(a) */
uint8_t rc_prr_sendalot;/* Socket option value Lock(a) */
- uint8_t rc_min_to; /* Socket option value Lock(a) */
uint8_t rc_rate_sample_method;
uint8_t rc_gp_hist_idx;
};
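The rc_saved_beta comment above describes a swap-and-save flow: when pacing engages, the user's beta values are pushed into the live newreno state and the previous live values are parked for restoral. A minimal sketch of that flow with an illustrative beta_pair type (the real code uses struct newreno and the CC module's ctl_output hook):

    #include <stdint.h>

    struct beta_pair {
            uint32_t beta;          /* loss backoff, percent */
            uint32_t beta_ecn;      /* ECN backoff, percent */
    };

    /* Pacing turns on: install the user's values, keep the old live ones for later. */
    static void
    pacing_on(struct beta_pair *live, struct beta_pair *saved, const struct beta_pair *user)
    {
            struct beta_pair old = *live;

            *live = *user;
            *saved = old;
    }

    /* Pacing turns off: put the saved values back. */
    static void
    pacing_off(struct beta_pair *live, const struct beta_pair *saved)
    {
            *live = *saved;
    }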
@@ -402,21 +511,57 @@
int32_t, int32_t, uint32_t, int, int, uint8_t); /* Lock(a) */
struct tcpcb *rc_tp; /* The tcpcb Lock(a) */
struct inpcb *rc_inp; /* The inpcb Lock(a) */
- uint32_t rc_free_cnt; /* Number of free entries on the rc_free list
+ uint8_t rc_free_cnt; /* Number of free entries on the rc_free list
* Lock(a) */
+ uint8_t client_bufferlvl; /* 0 - 5 normally; 2 or less means it is really low */
+ uint8_t no_prr_addback : 1,
+ gp_ready : 1,
+ defer_options: 1,
+ fast_rsm_hack: 1,
+ rc_ack_can_sendout_data: 1, /*
+ * If set it will override pacing restrictions on not sending
+ * data when the pacing timer is running. I.e. you set this
+ * and an ACK will send data. Default is off and it is only used
+ * without pacing when we are doing the 5G speed-up for their
+ * ACK filtering.
+ */
+ rc_pacing_cc_set: 1, /*
+ * If we are pacing (pace_always=1) and we have reached the
+ * point where we start pacing (fixed, or gp has reached its
+ * magic gp_ready state) this flag indicates we have set in
+ * values to affect CC's backoffs. If pacing is turned off
+ * then we must restore the values saved in rc_saved_beta;
+ * when it is going to gp_ready we need to copy the values into
+ * the CC module and set our flags.
+ *
+ * Note this only happens if the cc name is newreno (CCALGONAME_NEWRENO).
+ */
+
+ avail :2;
+ uint8_t avail_bytes;
uint32_t rc_rack_rtt; /* RACK-RTT Lock(a) */
uint16_t r_mbuf_queue : 1, /* Do we do mbuf queue for non-paced */
rtt_limit_mul : 4, /* multiply this by low rtt */
r_limit_scw : 1,
- r_avail_bits : 10; /* Available */
-
- uint16_t rc_user_set_max_segs; /* Socket option value Lock(a) */
+ r_must_retran : 1, /* For non-sack connections: we hit an RTO and the outstanding data should be resent */
+ r_use_cmp_ack: 1, /* Do we use compressed acks */
+ r_ent_rec_ns: 1, /* We entered recovery and have not sent */
+ r_might_revert: 1, /* Flag to find out if we might need to revert */
+ r_fast_output: 1, /* Fast output is in progress we can skip the bulk of rack_output */
+ r_fsb_inited: 1,
+ r_rack_hw_rate_caps: 1,
+ r_up_only: 1,
+ r_via_fill_cw : 1,
+ r_fill_less_agg : 1;
+
+ uint8_t rc_user_set_max_segs; /* Socket option value Lock(a) */
+ uint8_t rc_labc; /* Appropriate Byte Counting Value */
uint16_t forced_ack : 1,
rc_gp_incr : 1,
rc_gp_bwred : 1,
rc_gp_timely_inc_cnt : 3,
rc_gp_timely_dec_cnt : 3,
- rc_not_backing_off: 1,
+ r_use_labc_for_rec: 1,
rc_highly_buffered: 1, /* The path is highly buffered */
rc_dragged_bottom: 1,
rc_dack_mode : 1, /* Mac OS emulation of d-ack */
@@ -435,7 +580,7 @@
rc_always_pace : 1, /* Socket option value Lock(a) */
rc_pace_to_cwnd : 1,
rc_pace_fill_if_rttin_range : 1,
- xxx_avail_bits : 1;
+ rc_srtt_measure_made : 1;
uint8_t app_limited_needs_set : 1,
use_fixed_rate : 1,
rc_has_collapsed : 1,
Index: sys/netinet/tcp_subr.c
===================================================================
--- sys/netinet/tcp_subr.c
+++ sys/netinet/tcp_subr.c
@@ -193,6 +193,16 @@
&tcp_sad_low_pps, 100,
"What is the input pps that below which we do not decay?");
#endif
+uint32_t tcp_ack_war_time_window = 1000;
+SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_timewindow,
+ CTLFLAG_RW,
+ &tcp_ack_war_time_window, 1000,
+ "If the tcp_stack does ack-war prevention how many milliseconds are in its time window?");
+uint32_t tcp_ack_war_cnt = 5;
+SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_cnt,
+ CTLFLAG_RW,
+ &tcp_ack_war_cnt, 5,
+ "If the tcp_stack does ack-war prevention how many acks can be sent in its time window?");
struct rwlock tcp_function_lock;
@@ -268,6 +278,18 @@
&VNET_NAME(tcp_ts_offset_per_conn), 0,
"Initialize TCP timestamps per connection instead of per host pair");
+/* How many connections are pacing */
+static volatile uint32_t number_of_tcp_connections_pacing = 0;
+static uint32_t shadow_num_connections = 0;
+
+static int tcp_pacing_limit = 10000;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, pacing_limit, CTLFLAG_RW,
+ &tcp_pacing_limit, 1000,
+ "If the TCP stack does pacing, is there a limit (-1 = no, 0 = no pacing N = number of connections)");
+
+SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pacing_count, CTLFLAG_RD,
+ &shadow_num_connections, 0, "Number of TCP connections being paced");
+
static int tcp_log_debug = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW,
&tcp_log_debug, 0, "Log errors caused by incoming TCP segments");
@@ -3511,6 +3533,54 @@
return (tp->t_maxseg - optlen);
}
+
+u_int
+tcp_fixed_maxseg(const struct tcpcb *tp)
+{
+ int optlen;
+
+ if (tp->t_flags & TF_NOOPT)
+ return (tp->t_maxseg);
+
+ /*
+ * Here we have simplified code from tcp_addoptions(),
+ * without a proper loop, and with most of the paddings hardcoded.
+ * We only consider fixed options that we would send with every
+ * segment, i.e. SACK is not considered. This is important
+ * for cc modules to figure out what the modulo of the
+ * cwnd should be.
+ */
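+/*
+ * PAD() rounds an option length up to the next multiple of 4 bytes,
+ * since TCP options are transmitted padded to 32-bit boundaries.
+ */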
+#define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4)
+ if (TCPS_HAVEESTABLISHED(tp->t_state)) {
+ if (tp->t_flags & TF_RCVD_TSTMP)
+ optlen = TCPOLEN_TSTAMP_APPA;
+ else
+ optlen = 0;
+#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
+ if (tp->t_flags & TF_SIGNATURE)
+ optlen += PAD(TCPOLEN_SIGNATURE);
+#endif
+ } else {
+ if (tp->t_flags & TF_REQ_TSTMP)
+ optlen = TCPOLEN_TSTAMP_APPA;
+ else
+ optlen = PAD(TCPOLEN_MAXSEG);
+ if (tp->t_flags & TF_REQ_SCALE)
+ optlen += PAD(TCPOLEN_WINDOW);
+#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
+ if (tp->t_flags & TF_SIGNATURE)
+ optlen += PAD(TCPOLEN_SIGNATURE);
+#endif
+ if (tp->t_flags & TF_SACK_PERMIT)
+ optlen += PAD(TCPOLEN_SACK_PERMITTED);
+ }
+#undef PAD
+ optlen = min(optlen, TCP_MAXOLEN);
+ return (tp->t_maxseg - optlen);
+}
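+/*
+ * Editor's note (illustrative, not part of this change): on an established
+ * connection that negotiated timestamps, optlen above is TCPOLEN_TSTAMP_APPA
+ * (12 bytes), so a t_maxseg of 1460 yields 1448 and a cc module would use
+ * 1448 as the per-segment modulo for its cwnd math.
+ */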
+
+
+
static int
sysctl_drop(SYSCTL_HANDLER_ARGS)
{
@@ -3972,3 +4042,38 @@
}
}
}
+
+int
+tcp_can_enable_pacing(void)
+{
+
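+ /*
+ * Admit the connection when pacing is uncapped (-1) or the global
+ * count is still below tcp_pacing_limit; the counter is bumped with
+ * atomic_fetchadd_int() and mirrored into shadow_num_connections for
+ * the read-only pacing_count sysctl.
+ */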
+ if ((tcp_pacing_limit == -1) ||
+ (tcp_pacing_limit > number_of_tcp_connections_pacing)) {
+ atomic_fetchadd_int(&number_of_tcp_connections_pacing, 1);
+ shadow_num_connections = number_of_tcp_connections_pacing;
+ return (1);
+ } else {
+ return (0);
+ }
+}
+
+static uint8_t tcp_pacing_warning = 0;
+
+void
+tcp_decrement_paced_conn(void)
+{
+ uint32_t ret;
+
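+ /*
+ * atomic_fetchadd_int() returns the value held *before* the add, so
+ * ret == 0 means the counter was already zero and this decrement has
+ * wrapped it.
+ */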
+ ret = atomic_fetchadd_int(&number_of_tcp_connections_pacing, -1);
+ shadow_num_connections = number_of_tcp_connections_pacing;
+ KASSERT(ret != 0, ("tcp_decrement_paced_conn: -1 would cause wrap?"));
+ if (ret == 0) {
+ if (tcp_pacing_limit != -1) {
+ printf("Warning all pacing is now disabled, count decrements invalidly!\n");
+ tcp_pacing_limit = 0;
+ } else if (tcp_pacing_warning == 0) {
+ printf("Warning pacing count is invalid, invalid decrement\n");
+ tcp_pacing_warning = 1;
+ }
+ }
+}
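+
+/*
+ * Editor's sketch (illustrative only, not part of this change): a stack
+ * would typically pair these calls, e.g.
+ *
+ *	if (tcp_can_enable_pacing())
+ *		rack->rc_always_pace = 1;	on setup / socket option
+ *	...
+ *	if (rack->rc_always_pace)
+ *		tcp_decrement_paced_conn();	on connection teardown
+ *
+ * where "rack" is the stack's per-connection control block.
+ */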
Index: sys/netinet/tcp_var.h
===================================================================
--- sys/netinet/tcp_var.h
+++ sys/netinet/tcp_var.h
@@ -258,6 +258,10 @@
tcp_seq gput_seq; /* Outbound measurement seq */
tcp_seq gput_ack; /* Inbound measurement ack */
int32_t t_stats_gput_prev; /* XXXLAS: Prev gput measurement */
+ uint32_t t_maxpeakrate; /* max peak rate set by user, in bytes/s */
+ uint32_t t_sndtlppack; /* tail loss probe packets sent */
+ uint64_t t_sndtlpbyte; /* total tail loss probe bytes sent */
+
uint8_t t_tfo_client_cookie_len; /* TCP Fast Open client cookie length */
uint32_t t_end_info_status; /* Status flag of end info */
unsigned int *t_tfo_pending; /* TCP Fast Open server pending counter */
@@ -974,6 +978,7 @@
void cc_conn_init(struct tcpcb *tp);
void cc_post_recovery(struct tcpcb *tp, struct tcphdr *th);
void cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos);
+void cc_ecnpkt_handler_flags(struct tcpcb *tp, uint16_t flags, uint8_t iptos);
void cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type);
#ifdef TCP_HHOOK
void hhook_run_tcp_est_in(struct tcpcb *tp,
@@ -1022,10 +1027,13 @@
extern int32_t tcp_map_minimum;
extern int32_t tcp_attack_on_turns_on_logging;
#endif
+extern uint32_t tcp_ack_war_time_window;
+extern uint32_t tcp_ack_war_cnt;
uint32_t tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *);
uint32_t tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *);
u_int tcp_maxseg(const struct tcpcb *);
+u_int tcp_fixed_maxseg(const struct tcpcb *);
void tcp_mss_update(struct tcpcb *, int, int, struct hc_metrics_lite *,
struct tcp_ifcap *);
void tcp_mss(struct tcpcb *, int);
@@ -1075,6 +1083,7 @@
tcp_seq tcp_new_isn(struct in_conninfo *);
int tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq);
+int tcp_dsack_block_exists(struct tcpcb *);
void tcp_update_dsack_list(struct tcpcb *, tcp_seq, tcp_seq);
void tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, tcp_seq rcv_lastend);
void tcp_clean_dsack_blocks(struct tcpcb *tp);
@@ -1090,6 +1099,9 @@
void tcp_sndbuf_autoscale(struct tcpcb *, struct socket *, uint32_t);
int tcp_stats_sample_rollthedice(struct tcpcb *tp, void *seed_bytes,
size_t seed_len);
+int tcp_can_enable_pacing(void);
+void tcp_decrement_paced_conn(void);
+
struct mbuf *
tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen,
int32_t seglimit, int32_t segsize, struct sockbuf *sb, bool hw_tls);
