D43986.diff

diff --git a/sys/modules/tcp/rack/Makefile b/sys/modules/tcp/rack/Makefile
--- a/sys/modules/tcp/rack/Makefile
+++ b/sys/modules/tcp/rack/Makefile
@@ -5,7 +5,7 @@
STACKNAME= rack
KMOD= tcp_${STACKNAME}
-SRCS= rack.c sack_filter.c rack_bbr_common.c tailq_hash.c
+SRCS= rack.c sack_filter.c rack_bbr_common.c tailq_hash.c rack_pcm.c
SRCS+= opt_inet.h opt_inet6.h opt_ipsec.h
SRCS+= opt_kern_tls.h
diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h
--- a/sys/netinet/tcp.h
+++ b/sys/netinet/tcp.h
@@ -334,9 +334,22 @@
#define TCP_RACK_PACING_DIVISOR 1146 /* Pacing divisor given to rate-limit code for burst sizing */
#define TCP_RACK_PACE_MIN_SEG 1147 /* Pacing min seg size rack will use */
#define TCP_RACK_DGP_IN_REC 1148 /* Do we use full DGP in recovery? */
-#define TCP_RXT_CLAMP 1149 /* Do we apply a threshold to rack so if excess rxt clamp cwnd? */
+#define TCP_POLICER_DETECT 1149 /* Do we apply thresholds to rack to detect and compensate for policers? */
+#define TCP_RXT_CLAMP TCP_POLICER_DETECT
#define TCP_HYBRID_PACING 1150 /* Hybrid pacing enablement */
#define TCP_PACING_DND 1151 /* When pacing with rr_config=3 can sacks disturb us */
+#define TCP_SS_EEXIT 1152 /* Do we do early exit from slowstart if no b/w growth */
+#define TCP_DGP_UPPER_BOUNDS 1153 /* SS and CA upper bound in percentage */
+#define TCP_NO_TIMELY 1154 /* Disable/enable Timely */
+#define TCP_HONOR_HPTS_MIN 1155 /* Do we honor the hpts min timeout */
+#define TCP_REC_IS_DYN 1156 /* Do we allow timely to change recovery multiplier? */
+#define TCP_SIDECHAN_DIS 1157 /* Disable/enable the side-channel */
+#define TCP_FILLCW_RATE_CAP 1158 /* Set a cap for DGP's fillcw */
+#define TCP_POLICER_MSS 1159 /* Policer MSS requirement */
+#define TCP_STACK_SPEC_INFO 1160 /* Get stack specific information (if present) */
+#define RACK_CSPR_IS_FCC 1161
+#define TCP_GP_USE_LTBW 1162 /* how we use lt_bw 0=not, 1=min, 2=max */
+
/* Start of reserved space for third-party user-settable options. */
#define TCP_VENDOR SO_VENDOR
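
The options added above are ordinary IPPROTO_TCP socket options consumed by the rack stack. As a rough, hedged sketch only (the exact values each option accepts are defined by the stack; the 0/1/2 meaning of TCP_GP_USE_LTBW is taken from its comment above), enabling policer detection and selecting how lt_bw is blended could look like this:

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

/*
 * Illustrative sketch, not part of the patch: assumes `fd` is a connected
 * TCP socket already attached to the rack stack (e.g. via TCP_FUNCTION_BLK),
 * and that a non-zero optval enables TCP_POLICER_DETECT.
 */
static int
enable_policer_detect(int fd)
{
        int on = 1;
        int use_ltbw = 2;       /* 0 = not used, 1 = min, 2 = max, per the comment above */

        if (setsockopt(fd, IPPROTO_TCP, TCP_POLICER_DETECT, &on, sizeof(on)) == -1)
                return (-1);
        return (setsockopt(fd, IPPROTO_TCP, TCP_GP_USE_LTBW, &use_ltbw, sizeof(use_ltbw)));
}
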
@@ -447,6 +460,7 @@
u_int32_t tcpi_rcv_adv; /* Peer advertised window */
u_int32_t tcpi_dupacks; /* Consecutive dup ACKs recvd */
+ u_int32_t tcpi_rttmin; /* Min observed RTT */
/* Padding to grow without breaking ABI. */
u_int32_t __tcpi_pad[14]; /* Padding. */
};
@@ -463,6 +477,20 @@
#define TCP_FUNCTION_NAME_LEN_MAX 32
+struct stack_specific_info {
+ char stack_name[TCP_FUNCTION_NAME_LEN_MAX];
+ uint64_t policer_last_bw; /* Only valid if detection enabled and policer detected */
+ uint64_t bytes_transmitted;
+ uint64_t bytes_retransmitted;
+ uint32_t policer_detection_enabled: 1,
+ policer_detected : 1, /* transport thinks a policer is on path */
+ highly_buffered : 1, /* transport considers the path highly buffered */
+ spare : 29;
+ uint32_t policer_bucket_size; /* Only valid if detection enabled and policer detected */
+ uint32_t current_round;
+ uint32_t _rack_i_pad[18];
+};
+
struct tcp_function_set {
char function_set_name[TCP_FUNCTION_NAME_LEN_MAX];
uint32_t pcbcnt;
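
A hedged sketch of consuming the new struct from user space, assuming (as the TCP_STACK_SPEC_INFO comment earlier implies) that the option is read with getsockopt() and that stacks without support simply fail the call:

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Illustrative only; `fd` is assumed to be a TCP socket running a stack
 * that fills in struct stack_specific_info. */
static void
report_policer_state(int fd)
{
        struct stack_specific_info info;
        socklen_t len = sizeof(info);

        memset(&info, 0, sizeof(info));
        if (getsockopt(fd, IPPROTO_TCP, TCP_STACK_SPEC_INFO, &info, &len) != 0)
                return;         /* stack does not provide the info */
        if (info.policer_detection_enabled && info.policer_detected)
                printf("%s: policer ~%ju B/s, bucket %u bytes, round %u\n",
                    info.stack_name, (uintmax_t)info.policer_last_bw,
                    info.policer_bucket_size, info.current_round);
}
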
@@ -488,6 +516,7 @@
uint64_t start;
uint64_t end;
uint32_t flags;
+ uint32_t playout_ms;
};
union tcp_log_userdata {
@@ -518,9 +547,12 @@
#define TCP_HYBRID_PACING_H_MS 0x0008 /* A client hint for maxseg is present */
#define TCP_HYBRID_PACING_ENABLE 0x0010 /* We are enabling hybrid pacing else disable */
#define TCP_HYBRID_PACING_S_MSS 0x0020 /* Clent wants us to set the mss overriding gp est in CU */
-#define TCP_HYBRID_PACING_SETMSS 0x1000 /* Internal flag that tellsus we set the mss on this entry */
+#define TCP_HAS_PLAYOUT_MS 0x0040 /* The client included the chunk playout milliseconds: deprecate */
+/* the below are internal only flags */
+#define TCP_HYBRID_PACING_USER_MASK 0x0FFF /* Non-internal flags mask */
+#define TCP_HYBRID_PACING_SETMSS 0x1000 /* Internal flag that tells us we set the mss on this entry */
#define TCP_HYBRID_PACING_WASSET 0x2000 /* We init to this to know if a hybrid command was issued */
-
+#define TCP_HYBRID_PACING_SENDTIME 0x4000 /* Duplicate tm to last, use sendtime for catch up mode */
struct tcp_hybrid_req {
struct tcp_snd_req req;
diff --git a/sys/netinet/tcp_log_buf.h b/sys/netinet/tcp_log_buf.h
--- a/sys/netinet/tcp_log_buf.h
+++ b/sys/netinet/tcp_log_buf.h
@@ -267,7 +267,9 @@
TCP_RACK_TP_TRIGGERED, /* A rack tracepoint is triggered 68 */
TCP_HYBRID_PACING_LOG, /* Hybrid pacing log 69 */
TCP_LOG_PRU, /* TCP protocol user request 70 */
- TCP_LOG_END /* End (keep at end) 71 */
+ TCP_POLICER_DET, /* TCP Policer detection 71 */
+ TCP_PCM_MEASURE, /* TCP Path Capacity Measurement 72 */
+ TCP_LOG_END /* End (keep at end) 73 */
};
enum tcp_log_states {
@@ -371,10 +373,11 @@
#define TCP_TP_COLLAPSED_RXT 0x00000004 /* When we actually retransmit a collapsed window rsm */
#define TCP_TP_REQ_LOG_FAIL 0x00000005 /* We tried to allocate a Request log but had no space */
#define TCP_TP_RESET_RCV 0x00000006 /* Triggers when we receive a RST */
-#define TCP_TP_EXCESS_RXT 0x00000007 /* When we get excess RXT's clamping the cwnd */
+#define TCP_TP_POLICER_DET 0x00000007 /* When we detect a policer */
+#define TCP_TP_EXCESS_RXT TCP_TP_POLICER_DET /* alias */
#define TCP_TP_SAD_TRIGGERED 0x00000008 /* Sack Attack Detection triggers */
-
#define TCP_TP_SAD_SUSPECT 0x0000000a /* A sack has supicious information in it */
+#define TCP_TP_PACED_BOTTOM 0x0000000b /* We have paced at the bottom */
#ifdef _KERNEL
diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c
--- a/sys/netinet/tcp_stacks/bbr.c
+++ b/sys/netinet/tcp_stacks/bbr.c
@@ -11529,7 +11529,9 @@
bbr_set_pktepoch(bbr, cts, __LINE__);
bbr_check_bbr_for_state(bbr, cts, __LINE__, (bbr->r_ctl.rc_lost - lost));
if (nxt_pkt == 0) {
- if (bbr->r_wanted_output != 0) {
+ if ((bbr->r_wanted_output != 0) ||
+ (tp->t_flags & TF_ACKNOW)) {
+
bbr->rc_output_starts_timer = 0;
did_out = 1;
if (tcp_output(tp) < 0)
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -142,9 +142,12 @@
#define V_newreno_beta VNET(newreno_beta)
#define V_newreno_beta_ecn VNET(newreno_beta_ecn)
+#define M_TCPFSB __CONCAT(M_TCPFSB, STACKNAME)
+#define M_TCPDO __CONCAT(M_TCPDO, STACKNAME)
-MALLOC_DEFINE(M_TCPFSB, "tcp_fsb", "TCP fast send block");
-MALLOC_DEFINE(M_TCPDO, "tcp_do", "TCP deferred options");
+MALLOC_DEFINE(M_TCPFSB, "tcp_fsb_" __XSTRING(STACKNAME), "TCP fast send block");
+MALLOC_DEFINE(M_TCPDO, "tcp_do_" __XSTRING(STACKNAME), "TCP deferred options");
+MALLOC_DEFINE(M_TCPPCM, "tcp_pcm_" __XSTRING(STACKNAME), "TCP PCM measurement information");
struct sysctl_ctx_list rack_sysctl_ctx;
struct sysctl_oid *rack_sysctl_root;
@@ -190,12 +193,24 @@
static int32_t rack_reorder_thresh = 2;
static int32_t rack_reorder_fade = 60000000; /* 0 - never fade, def 60,000,000
* - 60 seconds */
-static uint32_t rack_clamp_ss_upper = 110;
-static uint32_t rack_clamp_ca_upper = 105;
-static uint32_t rack_rxt_min_rnds = 10; /* Min rounds if drastic rxt clamp is in place */
-static uint32_t rack_unclamp_round_thresh = 100; /* number of perfect rounds before we unclamp */
-static uint32_t rack_unclamp_rxt_thresh = 5; /* .5% and under */
-static uint64_t rack_rxt_clamp_thresh = 0; /* Do we do the rxt clamp thing */
+static uint16_t rack_policer_rxt_thresh = 0; /* 499 = 49.9%, 0 is off */
+static uint8_t rack_policer_avg_thresh = 0; /* 3.2 */
+static uint8_t rack_policer_med_thresh = 0; /* 1 - 16 */
+static uint16_t rack_policer_bucket_reserve = 20; /* How much % is reserved in the bucket */
+static uint64_t rack_pol_min_bw = 125000; /* 1mbps in Bytes per sec */
+static uint32_t rack_policer_data_thresh = 64000; /* 64,000 bytes must be sent before we engage */
+static uint32_t rack_policing_do_bw_comp = 1;
+static uint32_t rack_pcm_every_n_rounds = 100;
+static uint32_t rack_pcm_blast = 0;
+static uint32_t rack_pcm_is_enabled = 1;
+static uint8_t rack_req_del_mss = 18; /* How many segments need to be sent in a recovery episode to do policer_detection */
+static uint8_t rack_ssthresh_rest_rto_rec = 0; /* Do we restore ssthresh when we have rec -> rto -> rec */
+
+static uint32_t rack_gp_gain_req = 1200; /* Amount, percent wise, required to gain to record a round as "gaining" */
+static uint32_t rack_rnd_cnt_req = 0x10005; /* Default number of rounds if we are below rack_gp_gain_req where we exit ss */
+
+
+static int32_t rack_rxt_scoreboard_clear_thresh = 2;
static int32_t rack_dnd_default = 0; /* For rr_conf = 3, what is the default for dnd */
static int32_t rack_rxt_controls = 0;
static int32_t rack_fill_cw_state = 0;
@@ -217,9 +232,8 @@
static int32_t rack_apply_rtt_with_reduced_conf = 0;
static int32_t rack_hibeta_setting = 0;
static int32_t rack_default_pacing_divisor = 250;
-static int32_t rack_uses_full_dgp_in_rec = 1;
static uint16_t rack_pacing_min_seg = 0;
-
+static int32_t rack_timely_off = 0;
static uint32_t sad_seg_size_per = 800; /* 80.0 % */
static int32_t rack_pkt_delay = 1000;
@@ -235,7 +249,7 @@
static int32_t rack_max_abc_post_recovery = 2;
static int32_t rack_client_low_buf = 0;
static int32_t rack_dsack_std_based = 0x3; /* bit field bit 1 sets rc_rack_tmr_std_based and bit 2 sets rc_rack_use_dsack */
-static int32_t rack_bw_multipler = 2; /* Limit on fill cw's jump up to be this x gp_est */
+static int32_t rack_bw_multipler = 0; /* Limit on fill cw's jump up to be this x gp_est */
#ifdef TCP_ACCOUNTING
static int32_t rack_tcp_accounting = 0;
#endif
@@ -247,8 +261,9 @@
static int32_t rack_non_rxt_use_cr = 0; /* does a non-rxt in recovery use the configured rate (ss/ca)? */
static int32_t rack_persist_min = 250000; /* 250usec */
static int32_t rack_persist_max = 2000000; /* 2 Second in usec's */
+static int32_t rack_honors_hpts_min_to = 1; /* Do we honor the hpts minimum time out for pacing timers */
+static uint32_t rack_max_reduce = 10; /* Percent we can reduce slot by */
static int32_t rack_sack_not_required = 1; /* set to one to allow non-sack to use rack */
-static int32_t rack_default_init_window = 0; /* Use system default */
static int32_t rack_limit_time_with_srtt = 0;
static int32_t rack_autosndbuf_inc = 20; /* In percentage form */
static int32_t rack_enobuf_hw_boost_mult = 0; /* How many times the hw rate we boost slot using time_between */
@@ -282,7 +297,6 @@
static int32_t rack_def_profile = 0;
static int32_t rack_lower_cwnd_at_tlp = 0;
-static int32_t rack_limited_retran = 0;
static int32_t rack_always_send_oldest = 0;
static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;
@@ -356,6 +370,7 @@
static int32_t rack_down_raise_thresh = 100;
static int32_t rack_req_segs = 1;
static uint64_t rack_bw_rate_cap = 0;
+static uint64_t rack_fillcw_bw_cap = 3750000; /* Cap fillcw at 30Mbps */
/* Rack specific counters */
@@ -377,6 +392,7 @@
counter_u64_t rack_tlp_retran_bytes;
counter_u64_t rack_to_tot;
counter_u64_t rack_hot_alloc;
+counter_u64_t tcp_policer_detected;
counter_u64_t rack_to_alloc;
counter_u64_t rack_to_alloc_hard;
counter_u64_t rack_to_alloc_emerg;
@@ -440,7 +456,7 @@
static int
rack_process_ack(struct mbuf *m, struct tcphdr *th,
struct socket *so, struct tcpcb *tp, struct tcpopt *to,
- uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val);
+ uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val, int32_t orig_tlen);
static int
rack_process_data(struct mbuf *m, struct tcphdr *th,
struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
@@ -454,6 +470,8 @@
static struct rack_sendmap *
rack_check_recovery_mode(struct tcpcb *tp,
uint32_t tsused);
+static uint32_t
+rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack);
static void
rack_cong_signal(struct tcpcb *tp,
uint32_t type, uint32_t ack, int );
@@ -504,13 +522,14 @@
static void
rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
uint32_t seq_out, uint16_t th_flags, int32_t err, uint64_t ts,
- struct rack_sendmap *hintrsm, uint16_t add_flags, struct mbuf *s_mb, uint32_t s_moff, int hw_tls, int segsiz);
+ struct rack_sendmap *hintrsm, uint32_t add_flags, struct mbuf *s_mb, uint32_t s_moff, int hw_tls, int segsiz);
static uint64_t rack_get_gp_est(struct tcp_rack *rack);
+
static void
rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
- struct rack_sendmap *rsm);
+ struct rack_sendmap *rsm, uint32_t cts);
static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm);
static int32_t rack_output(struct tcpcb *tp);
@@ -526,10 +545,10 @@
static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
static uint32_t
rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
- struct rack_sendmap *rsm, uint64_t ts, int32_t * lenp, uint16_t add_flag, int segsiz);
+ struct rack_sendmap *rsm, uint64_t ts, int32_t * lenp, uint32_t add_flag, int segsiz);
static void
rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
- struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag, int segsiz);
+ struct rack_sendmap *rsm, uint64_t ts, uint32_t add_flag, int segsiz);
static int
rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack);
@@ -538,6 +557,10 @@
rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
+
+static void
+rack_peg_rxt(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t segsiz);
+
static int
rack_do_closing(struct mbuf *m, struct tcphdr *th,
struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
@@ -720,6 +743,22 @@
rack_swap_beta_values(rack, 4);
}
+static void
+rack_remove_pacing(struct tcp_rack *rack)
+{
+ if (rack->rc_pacing_cc_set)
+ rack_undo_cc_pacing(rack);
+ if (rack->r_ctl.pacing_method & RACK_REG_PACING)
+ tcp_decrement_paced_conn();
+ if (rack->r_ctl.pacing_method & RACK_DGP_PACING)
+ tcp_dec_dgp_pacing_cnt();
+ rack->rc_always_pace = 0;
+ rack->r_ctl.pacing_method = RACK_PACING_NONE;
+ rack->dgp_on = 0;
+ rack->rc_hybrid_mode = 0;
+ rack->use_fixed_rate = 0;
+}
+
static void
rack_log_gpset(struct tcp_rack *rack, uint32_t seq_end, uint32_t ack_end_t,
uint32_t send_end_t, int line, uint8_t mode, struct rack_sendmap *rsm)
@@ -742,6 +781,8 @@
log.u_bbr.pkts_out = line;
log.u_bbr.cwnd_gain = rack->app_limited_needs_set;
log.u_bbr.pkt_epoch = rack->r_ctl.rc_app_limited_cnt;
+ log.u_bbr.epoch = rack->r_ctl.current_round;
+ log.u_bbr.lt_epoch = rack->r_ctl.rc_considered_lost;
if (rsm != NULL) {
log.u_bbr.applimited = rsm->r_start;
log.u_bbr.delivered = rsm->r_end;
@@ -857,6 +898,7 @@
struct sysctl_oid *rack_measure;
struct sysctl_oid *rack_probertt;
struct sysctl_oid *rack_hw_pacing;
+ struct sysctl_oid *rack_policing;
rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
@@ -994,11 +1036,36 @@
"pacing",
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Pacing related Controls");
+ SYSCTL_ADD_U32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_pacing),
+ OID_AUTO, "pcm_enabled", CTLFLAG_RW,
+ &rack_pcm_is_enabled, 1,
+ "Do we by default do PCM measurements?");
+ SYSCTL_ADD_U32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_pacing),
+ OID_AUTO, "pcm_rnds", CTLFLAG_RW,
+ &rack_pcm_every_n_rounds, 100,
+ "How many rounds before we need to do a PCM measurement");
+ SYSCTL_ADD_U32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_pacing),
+ OID_AUTO, "pcm_blast", CTLFLAG_RW,
+ &rack_pcm_blast, 0,
+ "Blast out the full cwnd/rwnd when doing a PCM measurement");
+ SYSCTL_ADD_U32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_pacing),
+ OID_AUTO, "rnd_gp_gain", CTLFLAG_RW,
+ &rack_gp_gain_req, 1200,
+ "How much do we have to increase the GP to record the round 1200 = 120.0");
+ SYSCTL_ADD_U32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_pacing),
+ OID_AUTO, "dgp_out_of_ss_at", CTLFLAG_RW,
+ &rack_rnd_cnt_req, 0x10005,
+ "How many rounds less than rnd_gp_gain will drop us out of SS");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_pacing),
- OID_AUTO, "fulldgpinrec", CTLFLAG_RW,
- &rack_uses_full_dgp_in_rec, 1,
- "Do we use all DGP features in recovery (fillcw, timely et.al.)?");
+ OID_AUTO, "no_timely", CTLFLAG_RW,
+ &rack_timely_off, 0,
+ "Do we not use timely in DGP?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_pacing),
OID_AUTO, "fullbufdisc", CTLFLAG_RW,
@@ -1017,13 +1084,13 @@
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_pacing),
OID_AUTO, "divisor", CTLFLAG_RW,
- &rack_default_pacing_divisor, 4,
+ &rack_default_pacing_divisor, 250,
"What is the default divisor given to the rl code?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_pacing),
OID_AUTO, "fillcw_max_mult", CTLFLAG_RW,
- &rack_bw_multipler, 2,
- "What is the multiplier of the current gp_est that fillcw can increase the b/w too?");
+ &rack_bw_multipler, 0,
+ "What is the limit multiplier of the current gp_est that fillcw can increase the b/w too, 200 == 200% (0 = off)?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_pacing),
OID_AUTO, "max_pace_over", CTLFLAG_RW,
@@ -1039,11 +1106,6 @@
OID_AUTO, "limit_wsrtt", CTLFLAG_RW,
&rack_limit_time_with_srtt, 0,
"Do we limit pacing time based on srtt");
- SYSCTL_ADD_S32(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_pacing),
- OID_AUTO, "init_win", CTLFLAG_RW,
- &rack_default_init_window, 0,
- "Do we have a rack initial window 0 = system default");
SYSCTL_ADD_U16(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_pacing),
OID_AUTO, "gp_per_ss", CTLFLAG_RW,
@@ -1079,6 +1141,11 @@
OID_AUTO, "rate_cap", CTLFLAG_RW,
&rack_bw_rate_cap, 0,
"If set we apply this value to the absolute rate cap used by pacing");
+ SYSCTL_ADD_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_pacing),
+ OID_AUTO, "fillcw_cap", CTLFLAG_RW,
+ &rack_fillcw_bw_cap, 3750000,
+ "Do we have an absolute cap on the amount of b/w fillcw can specify (0 = no)?");
SYSCTL_ADD_U8(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
OID_AUTO, "req_measure_cnt", CTLFLAG_RW,
@@ -1317,11 +1384,6 @@
OID_AUTO, "send_oldest", CTLFLAG_RW,
&rack_always_send_oldest, 0,
"Should we always send the oldest TLP and RACK-TLP");
- SYSCTL_ADD_S32(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_tlp),
- OID_AUTO, "rack_tlimit", CTLFLAG_RW,
- &rack_limited_retran, 0,
- "How many times can a rack timeout drive out sends");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_tlp),
OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW,
@@ -1355,6 +1417,26 @@
"timers",
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Timer related controls");
+ SYSCTL_ADD_U8(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_timers),
+ OID_AUTO, "reset_ssth_rec_rto", CTLFLAG_RW,
+ &rack_ssthresh_rest_rto_rec, 0,
+ "When doing recovery -> rto -> recovery do we reset SSthresh?");
+ SYSCTL_ADD_U32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_timers),
+ OID_AUTO, "scoreboard_thresh", CTLFLAG_RW,
+ &rack_rxt_scoreboard_clear_thresh, 2,
+ "How many RTO's are allowed before we clear the scoreboard");
+ SYSCTL_ADD_U32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_timers),
+ OID_AUTO, "honor_hpts_min", CTLFLAG_RW,
+ &rack_honors_hpts_min_to, 1,
+ "Do rack pacing timers honor hpts min timeout");
+ SYSCTL_ADD_U32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_timers),
+ OID_AUTO, "hpts_max_reduce", CTLFLAG_RW,
+ &rack_max_reduce, 10,
+ "Max percentage we will reduce slot by for pacing when we are behind");
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timers),
OID_AUTO, "persmin", CTLFLAG_RW,
@@ -1434,11 +1516,6 @@
"features",
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Feature controls");
- SYSCTL_ADD_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_features),
- OID_AUTO, "rxt_clamp_thresh", CTLFLAG_RW,
- &rack_rxt_clamp_thresh, 0,
- "Bit encoded clamping setup bits CCCC CCCCC UUUU UULF PPPP PPPP PPPP PPPP");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_features),
OID_AUTO, "hybrid_set_maxseg", CTLFLAG_RW,
@@ -1474,6 +1551,53 @@
OID_AUTO, "hystartplusplus", CTLFLAG_RW,
&rack_do_hystart, 0,
"Should RACK enable HyStart++ on connections?");
+ /* Policer detection */
+ rack_policing = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO,
+ "policing",
+ CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+ "policer detection");
+ SYSCTL_ADD_U16(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_policing),
+ OID_AUTO, "rxt_thresh", CTLFLAG_RW,
+ &rack_policer_rxt_thresh, 0,
+ "Percentage of retransmits we need to be a possible policer (499 = 49.9 percent)");
+ SYSCTL_ADD_U8(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_policing),
+ OID_AUTO, "avg_thresh", CTLFLAG_RW,
+ &rack_policer_avg_thresh, 0,
+ "What threshold of average retransmits needed to recover a lost packet (1 - 169 aka 21 = 2.1)?");
+ SYSCTL_ADD_U8(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_policing),
+ OID_AUTO, "med_thresh", CTLFLAG_RW,
+ &rack_policer_med_thresh, 0,
+ "What threshold of Median retransmits needed to recover a lost packet (1 - 16)?");
+ SYSCTL_ADD_U32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_policing),
+ OID_AUTO, "data_thresh", CTLFLAG_RW,
+ &rack_policer_data_thresh, 64000,
+ "How many bytes must have gotten through before we can start doing policer detection?");
+ SYSCTL_ADD_U32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_policing),
+ OID_AUTO, "bwcomp", CTLFLAG_RW,
+ &rack_policing_do_bw_comp, 1,
+ "Do we raise up low b/w so that at least pace_max_seg can be sent in the srtt?");
+ SYSCTL_ADD_U8(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_policing),
+ OID_AUTO, "recmss", CTLFLAG_RW,
+ &rack_req_del_mss, 18,
+ "How many MSS must be delivered during recovery to engage policer detection?");
+ SYSCTL_ADD_U16(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_policing),
+ OID_AUTO, "res_div", CTLFLAG_RW,
+ &rack_policer_bucket_reserve, 20,
+ "What percentage is reserved in the policer bucket?");
+ SYSCTL_ADD_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_policing),
+ OID_AUTO, "min_comp_bw", CTLFLAG_RW,
+ &rack_pol_min_bw, 125000,
+ "Do we have a min b/w for b/w compensation (0 = no)?");
/* Misc rack controls */
rack_misc = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
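
The policer-detection knobs registered above surface as ordinary sysctl OIDs under rack's sysctl root. A hedged user-space sketch of raising the thresholds; the net.inet.tcp.rack prefix is an assumption based on the in-tree module's stack name, so substitute whatever name the stack actually registers:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdint.h>

/* Illustrative only; OID names mirror the SYSCTL_ADD_* calls above. */
static int
tune_policer_detection(void)
{
        uint16_t rxt_thresh = 499;      /* 499 = 49.9% retransmits */
        uint8_t avg_thresh = 32;        /* 32 = 3.2 average rxts per recovered loss */
        uint8_t med_thresh = 2;         /* median rxts per recovered loss (1 - 16) */

        if (sysctlbyname("net.inet.tcp.rack.policing.rxt_thresh", NULL, NULL,
            &rxt_thresh, sizeof(rxt_thresh)) == -1)
                return (-1);
        if (sysctlbyname("net.inet.tcp.rack.policing.avg_thresh", NULL, NULL,
            &avg_thresh, sizeof(avg_thresh)) == -1)
                return (-1);
        return (sysctlbyname("net.inet.tcp.rack.policing.med_thresh", NULL, NULL,
            &med_thresh, sizeof(med_thresh)));
}
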
@@ -1578,31 +1702,8 @@
OID_AUTO, "autoscale", CTLFLAG_RW,
&rack_autosndbuf_inc, 20,
"What percentage should rack scale up its snd buffer by?");
- SYSCTL_ADD_U32(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_misc),
- OID_AUTO, "rnds_for_rxt_clamp", CTLFLAG_RW,
- &rack_rxt_min_rnds, 10,
- "Number of rounds needed between RTT clamps due to high loss rates");
- SYSCTL_ADD_U32(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_misc),
- OID_AUTO, "rnds_for_unclamp", CTLFLAG_RW,
- &rack_unclamp_round_thresh, 100,
- "Number of rounds needed with no loss to unclamp");
- SYSCTL_ADD_U32(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_misc),
- OID_AUTO, "rxt_threshs_for_unclamp", CTLFLAG_RW,
- &rack_unclamp_rxt_thresh, 5,
- "Percentage of retransmits we need to be under to unclamp (5 = .5 percent)\n");
- SYSCTL_ADD_U32(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_misc),
- OID_AUTO, "clamp_ss_upper", CTLFLAG_RW,
- &rack_clamp_ss_upper, 110,
- "Clamp percentage ceiling in SS?");
- SYSCTL_ADD_U32(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_misc),
- OID_AUTO, "clamp_ca_upper", CTLFLAG_RW,
- &rack_clamp_ca_upper, 110,
- "Clamp percentage ceiling in CA?");
+
+
/* Sack Attacker detection stuff */
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_attack),
@@ -1779,6 +1880,13 @@
OID_AUTO, "alloc_hot", CTLFLAG_RD,
&rack_hot_alloc,
"Total allocations from the top of our list");
+ tcp_policer_detected = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "policer_detected", CTLFLAG_RD,
+ &tcp_policer_detected,
+ "Total policer_detections");
+
rack_to_alloc = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_counters),
@@ -1957,17 +2065,8 @@
static uint32_t
rc_init_window(struct tcp_rack *rack)
{
- uint32_t win;
+ return (tcp_compute_initwnd(tcp_maxseg(rack->rc_tp)));
- if (rack->rc_init_win == 0) {
- /*
- * Nothing set by the user, use the system stack
- * default.
- */
- return (tcp_compute_initwnd(tcp_maxseg(rack->rc_tp)));
- }
- win = ctf_fixed_maxseg(rack->rc_tp) * rack->rc_init_win;
- return (win);
}
static uint64_t
@@ -2071,6 +2170,7 @@
off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_tcpreq_info[0]);
log.u_bbr.bbr_substate = (uint8_t)(off / sizeof(struct tcp_sendfile_track));
#endif
+ log.u_bbr.inhpts = 1;
log.u_bbr.flex4 = (uint32_t)(rack->rc_tp->t_sndbytes - cur->sent_at_fs);
log.u_bbr.flex5 = (uint32_t)(rack->rc_tp->t_snd_rxt_bytes - cur->rxt_at_fs);
log.u_bbr.flex7 = (uint16_t)cur->hybrid_flags;
@@ -2116,9 +2216,24 @@
memset(&log, 0, sizeof(log));
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
- log.u_bbr.cur_del_rate = rack->rc_tp->t_sndbytes;
log.u_bbr.delRate = cur->sent_at_fs;
- log.u_bbr.rttProp = rack->rc_tp->t_snd_rxt_bytes;
+
+ if ((cur->flags & TCP_TRK_TRACK_FLG_LSND) == 0) {
+ /*
+ * We did not get a new Rules Applied to set so
+ * no overlapping send occured, this means the
+ * current byte counts are correct.
+ */
+ log.u_bbr.cur_del_rate = rack->rc_tp->t_sndbytes;
+ log.u_bbr.rttProp = rack->rc_tp->t_snd_rxt_bytes;
+ } else {
+ /*
+ * Overlapping send case, we switched to a new
+ * send and did a rules applied.
+ */
+ log.u_bbr.cur_del_rate = cur->sent_at_ls;
+ log.u_bbr.rttProp = cur->rxt_at_ls;
+ }
log.u_bbr.bw_inuse = cur->rxt_at_fs;
log.u_bbr.cwnd_gain = line;
off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_tcpreq_info[0]);
@@ -2138,6 +2253,7 @@
log.u_bbr.lt_epoch = (uint32_t)((cur->timestamp >> 32) & 0x00000000ffffffff);
/* now set all the flags in */
log.u_bbr.pkts_out = cur->hybrid_flags;
+ log.u_bbr.lost = cur->playout_ms;
log.u_bbr.flex6 = cur->flags;
/*
* Last send time = <flex5 | pkt_epoch> note we do not distinguish cases
@@ -2146,6 +2262,20 @@
*/
log.u_bbr.pkt_epoch = (uint32_t)(rack->r_ctl.last_tmit_time_acked & 0x00000000ffffffff);
log.u_bbr.flex5 = (uint32_t)((rack->r_ctl.last_tmit_time_acked >> 32) & 0x00000000ffffffff);
+ /*
+ * Compose bbr_state to be a bit wise 0000ADHF
+ * where A is the always_pace flag
+ * where D is the dgp_on flag
+ * where H is the hybrid_mode on flag
+ * where F is the use_fixed_rate flag.
+ */
+ log.u_bbr.bbr_state = rack->rc_always_pace;
+ log.u_bbr.bbr_state <<= 1;
+ log.u_bbr.bbr_state |= rack->dgp_on;
+ log.u_bbr.bbr_state <<= 1;
+ log.u_bbr.bbr_state |= rack->rc_hybrid_mode;
+ log.u_bbr.bbr_state <<= 1;
+ log.u_bbr.bbr_state |= rack->use_fixed_rate;
log.u_bbr.flex8 = HYBRID_LOG_SENT_LOST;
tcp_log_event(rack->rc_tp, NULL,
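
The 0000ADHF packing described in the comment above is just four flag bits shifted into bbr_state. A small decoding sketch for readers of the resulting BB logs (hypothetical helper, not part of the patch); the shift order matches the encoding above, so always_pace lands in bit 3 and use_fixed_rate in bit 0:

#include <stdint.h>

struct pacing_flags {
        uint8_t always_pace;
        uint8_t dgp_on;
        uint8_t hybrid_mode;
        uint8_t use_fixed_rate;
};

/* Unpack the 0000ADHF layout documented above. */
static struct pacing_flags
decode_bbr_state(uint8_t bbr_state)
{
        struct pacing_flags f;

        f.always_pace = (bbr_state >> 3) & 1;   /* A */
        f.dgp_on = (bbr_state >> 2) & 1;        /* D */
        f.hybrid_mode = (bbr_state >> 1) & 1;   /* H */
        f.use_fixed_rate = bbr_state & 1;       /* F */
        return (f);
}
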
@@ -2299,6 +2429,7 @@
#ifdef TCP_REQUEST_TRK
if (rack->rc_hybrid_mode &&
rack->rc_catch_up &&
+ (rack->r_ctl.rc_last_sft != NULL) &&
(rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_S_MSS) &&
(rack_hybrid_allow_set_maxseg == 1) &&
((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_SETMSS) == 0)) {
@@ -2338,7 +2469,10 @@
*/
uint64_t srtt;
- lt_bw = rack_get_lt_bw(rack);
+ if (rack->dis_lt_bw == 1)
+ lt_bw = 0;
+ else
+ lt_bw = rack_get_lt_bw(rack);
if (lt_bw) {
/*
* No goodput bw but a long-term b/w does exist
@@ -2374,19 +2508,22 @@
/* Still doing initial average must calculate */
bw = rack->r_ctl.gp_bw / max(rack->r_ctl.num_measurements, 1);
}
+ if (rack->dis_lt_bw) {
+ /* We are not using lt-bw */
+ ret_bw = bw;
+ goto compensate;
+ }
lt_bw = rack_get_lt_bw(rack);
if (lt_bw == 0) {
/* If we don't have one then equate it to the gp_bw */
lt_bw = rack->r_ctl.gp_bw;
}
- if ((rack->r_cwnd_was_clamped == 1) && (rack->r_clamped_gets_lower > 0)){
- /* if clamped take the lowest */
+ if (rack->use_lesser_lt_bw) {
if (lt_bw < bw)
ret_bw = lt_bw;
else
ret_bw = bw;
} else {
- /* If not set for clamped to get lowest, take the highest */
if (lt_bw > bw)
ret_bw = lt_bw;
else
@@ -2487,6 +2624,8 @@
log.u_bbr.flex7 = rack->r_ctl.dsack_persist;
log.u_bbr.flex8 = mod;
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.epoch = rack->r_ctl.current_round;
+ log.u_bbr.lt_epoch = rack->r_ctl.rc_considered_lost;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -2535,6 +2674,8 @@
else
log.u_bbr.cur_del_rate = 0;
log.u_bbr.rttProp = rack->r_ctl.last_hw_bw_req;
+ log.u_bbr.epoch = rack->r_ctl.current_round;
+ log.u_bbr.lt_epoch = rack->r_ctl.rc_considered_lost;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -2552,28 +2693,9 @@
uint64_t bw_est, high_rate;
uint64_t gain;
- if ((rack->r_pacing_discount == 0) ||
- (rack_full_buffer_discount == 0)) {
- /*
- * No buffer level based discount from client buffer
- * level is enabled or the feature is disabled.
- */
- gain = (uint64_t)rack_get_output_gain(rack, rsm);
- bw_est = bw * gain;
- bw_est /= (uint64_t)100;
- } else {
- /*
- * We have a discount in place apply it with
- * just a 100% gain (we get no boost if the buffer
- * is full).
- */
- uint64_t discount;
-
- discount = bw * (uint64_t)(rack_full_buffer_discount * rack->r_ctl.pacing_discount_amm);
- discount /= 100;
- /* What %% of the b/w do we discount */
- bw_est = bw - discount;
- }
+ gain = (uint64_t)rack_get_output_gain(rack, rsm);
+ bw_est = bw * gain;
+ bw_est /= (uint64_t)100;
/* Never fall below the minimum (def 64kbps) */
if (bw_est < RACK_MIN_BW)
bw_est = RACK_MIN_BW;
@@ -2659,6 +2781,8 @@
log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
log.u_bbr.pacing_gain = rack->r_must_retran;
+ log.u_bbr.epoch = rack->r_ctl.current_round;
+ log.u_bbr.lt_epoch = rack->r_ctl.rc_considered_lost;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -2698,6 +2822,10 @@
log.u_bbr.lt_epoch = rack->rc_tp->t_rxtshift;
log.u_bbr.lost = rack_rto_min;
log.u_bbr.epoch = rack->r_ctl.roundends;
+ log.u_bbr.bw_inuse = rack->r_ctl.current_round;
+ log.u_bbr.bw_inuse <<= 32;
+ log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
+ log.u_bbr.applimited = rack->rc_tp->t_flags2;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -2731,6 +2859,9 @@
log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
log.u_bbr.pacing_gain = rack->r_must_retran;
+ log.u_bbr.bw_inuse = rack->r_ctl.current_round;
+ log.u_bbr.bw_inuse <<= 32;
+ log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -2780,6 +2911,9 @@
log.u_bbr.lost = 0;
else
log.u_bbr.lost = rack->r_ctl.rc_prr_sndcnt;
+ log.u_bbr.bw_inuse = rack->r_ctl.current_round;
+ log.u_bbr.bw_inuse <<= 32;
+ log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -2927,6 +3061,9 @@
log.u_bbr.flex4 = where;
log.u_bbr.flex7 = 2;
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.bw_inuse = rack->r_ctl.current_round;
+ log.u_bbr.bw_inuse <<= 32;
+ log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -2939,7 +3076,7 @@
static void
rack_log_rtt_sendmap(struct tcp_rack *rack, uint32_t idx, uint64_t tsv, uint32_t tsecho)
{
- if (tcp_bblogging_on(rack->rc_tp)) {
+ if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
union tcp_log_stackspecific log;
struct timeval tv;
@@ -2951,6 +3088,9 @@
log.u_bbr.flex7 = 3;
log.u_bbr.rttProp = tsv;
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.bw_inuse = rack->r_ctl.current_round;
+ log.u_bbr.bw_inuse <<= 32;
+ log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -2979,6 +3119,9 @@
log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
log.u_bbr.pacing_gain = rack->r_must_retran;
+ log.u_bbr.bw_inuse = rack->r_ctl.current_round;
+ log.u_bbr.bw_inuse <<= 32;
+ log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
TCP_LOG_EVENTP(tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -3051,6 +3194,13 @@
log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
log.u_bbr.pacing_gain = rack->r_must_retran;
+ log.u_bbr.bw_inuse = rack->r_ctl.current_round;
+ log.u_bbr.bw_inuse <<= 32;
+ log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
+ log.u_bbr.epoch = rack->rc_inp->inp_socket->so_snd.sb_hiwat;
+ log.u_bbr.lt_epoch = rack->rc_inp->inp_socket->so_rcv.sb_hiwat;
+ log.u_bbr.lost = rack->rc_tp->t_srtt;
+ log.u_bbr.pkt_epoch = rack->rc_tp->rfbuf_cnt;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -3112,6 +3262,9 @@
log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
log.u_bbr.pacing_gain = rack->r_must_retran;
log.u_bbr.cwnd_gain = rack->rc_has_collapsed;
+ log.u_bbr.bw_inuse = rack->r_ctl.current_round;
+ log.u_bbr.bw_inuse <<= 32;
+ log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -3146,6 +3299,9 @@
log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
log.u_bbr.pacing_gain = rack->r_must_retran;
+ log.u_bbr.bw_inuse = rack->r_ctl.current_round;
+ log.u_bbr.bw_inuse <<= 32;
+ log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -3314,6 +3470,7 @@
counter_u64_free(rack_saw_enobuf_hw);
counter_u64_free(rack_saw_enetunreach);
counter_u64_free(rack_hot_alloc);
+ counter_u64_free(tcp_policer_detected);
counter_u64_free(rack_to_alloc);
counter_u64_free(rack_to_alloc_hard);
counter_u64_free(rack_to_alloc_emerg);
@@ -3475,6 +3632,8 @@
rack->r_ctl.rc_num_split_allocs--;
}
if (rsm == rack->r_ctl.rc_first_appl) {
+ rack->r_ctl.cleared_app_ack_seq = rsm->r_start + (rsm->r_end - rsm->r_start);
+ rack->r_ctl.cleared_app_ack = 1;
if (rack->r_ctl.rc_app_limited_cnt == 0)
rack->r_ctl.rc_first_appl = NULL;
else
@@ -3490,7 +3649,7 @@
rack->r_ctl.rc_sacklast = NULL;
memset(rsm, 0, sizeof(struct rack_sendmap));
/* Make sure we are not going to overrun our count limit of 0xff */
- if ((rack->rc_free_cnt + 1) > 0xff) {
+ if ((rack->rc_free_cnt + 1) > RACK_FREE_CNT_MAX) {
rack_free_trim(rack);
}
TAILQ_INSERT_HEAD(&rack->r_ctl.rc_free, rsm, r_tnext);
@@ -3806,6 +3965,8 @@
logged = 0;
+ if (rack->rc_skip_timely)
+ return;
if (override) {
/*
* override is passed when we are
@@ -3976,6 +4137,8 @@
uint64_t logvar, logvar2, logvar3;
uint32_t logged, new_per, ss_red, ca_red, rec_red, alt, val;
+ if (rack->rc_skip_timely)
+ return;
if (rack->rc_gp_incr) {
/* Turn off increment counting */
rack->rc_gp_incr = 0;
@@ -4177,6 +4340,7 @@
*/
uint32_t segsiz;
+ rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
if (rack->rc_gp_dyn_mul == 0)
return;
@@ -4203,7 +4367,6 @@
rack->r_ctl.rc_pace_min_segs);
rack->in_probe_rtt = 1;
rack->measure_saw_probe_rtt = 1;
- rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
rack->r_ctl.rc_time_probertt_starts = 0;
rack->r_ctl.rc_entry_gp_rtt = rack->r_ctl.rc_gp_srtt;
if (rack_probertt_use_min_rtt_entry)
@@ -4387,6 +4550,7 @@
rack_check_probe_rtt(struct tcp_rack *rack, uint32_t us_cts)
{
/* Check in on probe-rtt */
+
if (rack->rc_gp_filled == 0) {
/* We do not do p-rtt unless we have gp measurements */
return;
@@ -4431,7 +4595,10 @@
if (calc) {
/* Maybe */
calc *= rack_per_of_gp_probertt_reduce;
- rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt - calc;
+ if (calc > rack_per_of_gp_probertt)
+ rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh;
+ else
+ rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt - calc;
/* Limit it too */
if (rack->r_ctl.rack_per_of_gp_probertt < rack_per_of_gp_lowthresh)
rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh;
@@ -4472,7 +4639,9 @@
rack_exit_probertt(rack, us_cts);
}
- } else if ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt) {
+ } else if ((rack->rc_skip_timely == 0) &&
+ (TSTMP_GT(us_cts, rack->r_ctl.rc_lower_rtt_us_cts)) &&
+ ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt)) {
/* Go into probertt, its been too long since we went lower */
rack_enter_probertt(rack, us_cts);
}
@@ -4831,6 +5000,32 @@
}
}
+static void
+rack_log_gp_calc(struct tcp_rack *rack, uint32_t add_part, uint32_t sub_part, uint32_t srtt, uint64_t meas_bw, uint64_t utim, uint8_t meth, uint32_t line)
+{
+ if (tcp_bblogging_on(rack->rc_tp)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.flex1 = add_part;
+ log.u_bbr.flex2 = sub_part;
+ log.u_bbr.flex3 = rack_wma_divisor;
+ log.u_bbr.flex4 = srtt;
+ log.u_bbr.flex7 = (uint16_t)line;
+ log.u_bbr.flex8 = meth;
+ log.u_bbr.delRate = rack->r_ctl.gp_bw;
+ log.u_bbr.cur_del_rate = meas_bw;
+ log.u_bbr.rttProp = utim;
+ TCP_LOG_EVENTP(rack->rc_tp, NULL,
+ &rack->rc_inp->inp_socket->so_rcv,
+ &rack->rc_inp->inp_socket->so_snd,
+ BBR_LOG_THRESH_CALC, 0,
+ 0, &log, false, &rack->r_ctl.act_rcv_time);
+ }
+}
+
static void
rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
tcp_seq th_ack, int line, uint8_t quality)
@@ -5046,6 +5241,8 @@
* other hand if we get a measurement over 1ms with a
* 10ms rtt we only want to take a much smaller portion.
*/
+ uint8_t meth;
+
if (rack->r_ctl.num_measurements < 0xff) {
rack->r_ctl.num_measurements++;
}
@@ -5086,6 +5283,7 @@
*/
addpart = bytes_ps * utim;
addpart /= (srtt * 8);
+ meth = 1;
} else {
/*
* Don't allow a single measurement
@@ -5098,7 +5296,9 @@
*/
subpart = rack->r_ctl.gp_bw / 2;
addpart = bytes_ps / 2;
+ meth = 2;
}
+ rack_log_gp_calc(rack, addpart, subpart, srtt, bytes_ps, utim, meth, __LINE__);
resid_bw = rack->r_ctl.gp_bw - subpart;
rack->r_ctl.gp_bw = resid_bw + addpart;
did_add = 1;
@@ -5116,6 +5316,7 @@
subpart /= (srtt * rack_wma_divisor);
addpart = bytes_ps * utim;
addpart /= (srtt * rack_wma_divisor);
+ meth = 3;
} else {
/*
* The scaled measurement was long
@@ -5124,6 +5325,7 @@
*/
subpart = rack->r_ctl.gp_bw / rack_wma_divisor;
addpart = bytes_ps / rack_wma_divisor;
+ meth = 4;
}
if ((rack->measure_saw_probe_rtt == 0) ||
(bytes_ps > rack->r_ctl.gp_bw)) {
@@ -5133,12 +5335,83 @@
* add in.
*/
did_add = 1;
+ rack_log_gp_calc(rack, addpart, subpart, srtt, bytes_ps, utim, meth, __LINE__);
resid_bw = rack->r_ctl.gp_bw - subpart;
rack->r_ctl.gp_bw = resid_bw + addpart;
}
}
rack_set_pace_segments(tp, rack, __LINE__, NULL);
}
+ /*
+ * We only watch the growth of the GP during the initial startup
+ * or first-slowstart that ensues. If we ever needed to watch
+ * growth of gp outside of that period all we need to do is
+ * remove the first clause of this if (rc_initial_ss_comp).
+ */
+ if ((rack->rc_initial_ss_comp == 0) &&
+ (rack->r_ctl.num_measurements >= RACK_REQ_AVG)) {
+ uint64_t gp_est;
+
+ gp_est = bytes_ps;
+ if (tcp_bblogging_on(rack->rc_tp)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.flex1 = rack->r_ctl.current_round;
+ log.u_bbr.flex2 = rack->r_ctl.last_rnd_of_gp_rise;
+ log.u_bbr.delRate = gp_est;
+ log.u_bbr.cur_del_rate = rack->r_ctl.last_gpest;
+ log.u_bbr.flex8 = 41;
+ (void)tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
+ 0, &log, false, NULL, __func__, __LINE__,&tv);
+ }
+ if ((rack->r_ctl.num_measurements == RACK_REQ_AVG) ||
+ (rack->r_ctl.last_gpest == 0)) {
+ /*
+ * The round we get our measurement averaging going
+ * is the base round so it always is the source point
+ * for when we had our first increment. From there on
+ * we only record the round that had a rise.
+ */
+ rack->r_ctl.last_rnd_of_gp_rise = rack->r_ctl.current_round;
+ rack->r_ctl.last_gpest = rack->r_ctl.gp_bw;
+ } else if (gp_est >= rack->r_ctl.last_gpest) {
+ /*
+ * Test to see if it's gone up enough
+ * to set the round count up to now. Note
+ * that on the seeding of the 4th measurement we
+ */
+ gp_est *= 1000;
+ gp_est /= rack->r_ctl.last_gpest;
+ if ((uint32_t)gp_est > rack->r_ctl.gp_gain_req) {
+ /*
+ * We went up enough to record the round.
+ */
+ if (tcp_bblogging_on(rack->rc_tp)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.flex1 = rack->r_ctl.current_round;
+ log.u_bbr.flex2 = (uint32_t)gp_est;
+ log.u_bbr.flex3 = rack->r_ctl.gp_gain_req;
+ log.u_bbr.delRate = gp_est;
+ log.u_bbr.cur_del_rate = rack->r_ctl.last_gpest;
+ log.u_bbr.flex8 = 42;
+ (void)tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
+ 0, &log, false, NULL, __func__, __LINE__,&tv);
+ }
+ rack->r_ctl.last_rnd_of_gp_rise = rack->r_ctl.current_round;
+ if (rack->r_ctl.use_gp_not_last == 1)
+ rack->r_ctl.last_gpest = rack->r_ctl.gp_bw;
+ else
+ rack->r_ctl.last_gpest = bytes_ps;
+ }
+ }
+ }
if ((rack->gp_ready == 0) &&
(rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) {
/* We have enough measurements now */
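
The gain test above scales the new estimate by 1000 before dividing by the last recorded one, so rack_gp_gain_req = 1200 means a round only counts as "gaining" when goodput rose by at least 20%. A hypothetical standalone helper mirroring that arithmetic:

#include <stdint.h>

/*
 * Illustrative helper, not the stack's code: returns non-zero when the new
 * goodput estimate is at least gain_req/1000 times the last recorded one.
 * Example: 100 Mbps -> 125 Mbps gives a ratio of 1250, which clears the
 * default gain_req of 1200.
 */
static int
gp_round_gained(uint64_t gp_est, uint64_t last_gpest, uint32_t gain_req)
{
        uint64_t ratio;

        if (last_gpest == 0)
                return (0);     /* baseline not seeded yet; the code above seeds it */
        ratio = (gp_est * 1000) / last_gpest;
        return ((uint32_t)ratio > gain_req);
}
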
@@ -5152,10 +5425,15 @@
rack_log_pacing_delay_calc(rack, subpart, addpart, bytes_ps, stim,
rack_get_bw(rack), 22, did_add, NULL, quality);
/* We do not update any multipliers if we are in or have seen a probe-rtt */
- if ((rack->measure_saw_probe_rtt == 0) && rack->rc_gp_rtt_set)
- rack_update_multiplier(rack, timely_says, bytes_ps,
- rack->r_ctl.rc_gp_srtt,
- rack->r_ctl.rc_rtt_diff);
+
+ if ((rack->measure_saw_probe_rtt == 0) &&
+ rack->rc_gp_rtt_set) {
+ if (rack->rc_skip_timely == 0) {
+ rack_update_multiplier(rack, timely_says, bytes_ps,
+ rack->r_ctl.rc_gp_srtt,
+ rack->r_ctl.rc_rtt_diff);
+ }
+ }
rack_log_pacing_delay_calc(rack, bytes, tim, bytes_ps, stim,
rack_get_bw(rack), 3, line, NULL, quality);
rack_log_pacing_delay_calc(rack,
@@ -5179,7 +5457,6 @@
rack->rc_gp_saw_ca = 0;
rack->rc_gp_saw_ss = 0;
rack->rc_dragged_bottom = 0;
-
if (quality == RACK_QUALITY_HIGH) {
/*
* Gput in the stats world is in kbps where bytes_ps is
@@ -5326,7 +5603,7 @@
*/
static void
rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, uint32_t th_ack, uint16_t nsegs,
- uint16_t type, int32_t recovery)
+ uint16_t type, int32_t post_recovery)
{
uint32_t prior_cwnd, acked;
struct tcp_log_buffer *lgb = NULL;
@@ -5335,7 +5612,7 @@
INP_WLOCK_ASSERT(tptoinpcb(tp));
tp->t_ccv.nsegs = nsegs;
acked = tp->t_ccv.bytes_this_ack = (th_ack - tp->snd_una);
- if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) {
+ if ((post_recovery) && (rack->r_ctl.rc_early_recovery_segs)) {
uint32_t max;
max = rack->r_ctl.rc_early_recovery_segs * ctf_fixed_maxseg(tp);
@@ -5348,17 +5625,21 @@
((int32_t)rack->r_ctl.cwnd_to_use) - tp->snd_wnd);
#endif
if ((th_ack == tp->snd_max) && rack->lt_bw_up) {
- /* We will ack all, time
- * to end any lt_bw_up we
- * have running until something
- * new is sent.
+ /*
+ * We will ack all the data, time to end any
+ * lt_bw_up we have running until something
+ * new is sent. Note we need to use the actual
+ * ack_rcv_time which with pacing may be different.
*/
- struct timeval tv;
+ uint64_t tmark;
rack->r_ctl.lt_bw_bytes += (tp->snd_max - rack->r_ctl.lt_seq);
rack->r_ctl.lt_seq = tp->snd_max;
- (void)tcp_get_usecs(&tv);
- rack->r_ctl.lt_bw_time += (tcp_tv_to_lusectick(&tv) - rack->r_ctl.lt_timemark);
+ tmark = tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time);
+ if (tmark >= rack->r_ctl.lt_timemark) {
+ rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark);
+ }
+ rack->r_ctl.lt_timemark = tmark;
rack->lt_bw_up = 0;
}
quality = RACK_QUALITY_NONE;
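
The lt_bw accounting above only accumulates a byte count and a microsecond interval; turning that pair into bytes per second is the same integer arithmetic the policer-detection code later in this patch uses for del_bw (bytes * 1,000,000 / usecs). A minimal sketch with illustrative names:

#include <stdint.h>

/*
 * Minimal sketch of the bytes/usec -> bytes/sec conversion used by the
 * lt_bw accounting above (and by the recovery delivery-rate computation
 * later in this patch); names are illustrative, not the stack's.
 * Example: 1,500,000 bytes over 250,000 usec -> 6,000,000 B/s (48 Mbps).
 */
static uint64_t
bytes_per_sec(uint64_t bytes, uint64_t usecs)
{
        if (usecs == 0)
                return (0);
        return ((bytes * (uint64_t)1000000) / usecs);
}
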
@@ -5385,7 +5666,7 @@
tp->t_bytes_acked = 0;
}
prior_cwnd = tp->snd_cwnd;
- if ((recovery == 0) || (rack_max_abc_post_recovery == 0) || rack->r_use_labc_for_rec ||
+ if ((post_recovery == 0) || (rack_max_abc_post_recovery == 0) || rack->r_use_labc_for_rec ||
(rack_client_low_buf && rack->client_bufferlvl &&
(rack->client_bufferlvl < rack_client_low_buf)))
labc_to_use = rack->rc_labc;
@@ -5446,6 +5727,14 @@
if (rack->r_ctl.rc_rack_largest_cwnd < rack->r_ctl.cwnd_to_use) {
rack->r_ctl.rc_rack_largest_cwnd = rack->r_ctl.cwnd_to_use;
}
+ if ((rack->rc_initial_ss_comp == 0) &&
+ (tp->snd_cwnd >= tp->snd_ssthresh)) {
+ /*
+ * The cwnd has grown beyond ssthresh we have
+ * entered ca and completed our first Slowstart.
+ */
+ rack->rc_initial_ss_comp = 1;
+ }
}
static void
@@ -5467,180 +5756,64 @@
rack->r_wanted_output = 1;
}
-static inline void
-rack_set_most_aggr(struct tcp_rack *rack)
-{
- rack->r_fill_less_agg = 0;
- /* Once the cwnd as been clamped we don't do fill_cw */
- if (rack->r_cwnd_was_clamped == 0)
- rack->rc_pace_to_cwnd = 1;
- rack->r_pacing_discount = 0;
-}
-
-static inline void
-rack_limit_fillcw(struct tcp_rack *rack)
-{
- rack->r_fill_less_agg = 1;
- /* Once the cwnd as been clamped we don't do fill_cw */
- if (rack->r_cwnd_was_clamped == 0)
- rack->rc_pace_to_cwnd = 1;
- rack->r_pacing_discount = 0;
-}
-
-static inline void
-rack_disable_fillcw(struct tcp_rack *rack)
+static inline uint64_t
+rack_get_rxt_per(uint64_t snds, uint64_t rxts)
{
- rack->r_fill_less_agg = 1;
- rack->rc_pace_to_cwnd = 0;
- rack->r_pacing_discount = 0;
-}
+ uint64_t rxt_per;
-static void
-rack_client_buffer_level_set(struct tcp_rack *rack)
-{
- /*
- * Only if DGP is on do we do anything that
- * changes stack behavior. If DGP is off all
- * we will do is issue a BB log (if BB logging is
- * on) and return.
- */
- if (rack->dgp_on == 0) {
- rack_log_pacing_delay_calc(rack, 0, rack->client_bufferlvl,
- 0, 0, 0, 30, __LINE__, NULL, 0);
- return;
- }
- if (IN_RECOVERY(rack->rc_tp->t_flags) && rack->r_ctl.full_dgp_in_rec) {
- goto set_most_agg;
- }
- /*
- * We are in DGP so what setting should we
- * apply based on where the client is?
- */
- switch(rack->r_ctl.rc_dgp_bl_agg) {
- default:
- case DGP_LEVEL0:
-set_most_agg:
- rack_set_most_aggr(rack);
- break;
- case DGP_LEVEL1:
- if (rack->client_bufferlvl == 4)
- rack_limit_fillcw(rack);
- else if (rack->client_bufferlvl == 5)
- rack_disable_fillcw(rack);
- else
- rack_set_most_aggr(rack);
- break;
- case DGP_LEVEL2:
- if (rack->client_bufferlvl == 3)
- rack_limit_fillcw(rack);
- else if (rack->client_bufferlvl == 4)
- rack_disable_fillcw(rack);
- else if (rack->client_bufferlvl == 5) {
- rack_disable_fillcw(rack);
- rack->r_pacing_discount = 1;
- rack->r_ctl.pacing_discount_amm = 1;
- } else
- rack_set_most_aggr(rack);
- break;
- case DGP_LEVEL3:
- if (rack->client_bufferlvl == 2)
- rack_limit_fillcw(rack);
- else if (rack->client_bufferlvl == 3)
- rack_disable_fillcw(rack);
- else if (rack->client_bufferlvl == 4) {
- rack_disable_fillcw(rack);
- rack->r_pacing_discount = 1;
- rack->r_ctl.pacing_discount_amm = 1;
- } else if (rack->client_bufferlvl == 5) {
- rack_disable_fillcw(rack);
- rack->r_pacing_discount = 1;
- rack->r_ctl.pacing_discount_amm = 2;
- } else
- rack_set_most_aggr(rack);
- break;
+ if (snds > 0) {
+ rxt_per = rxts * 1000;
+ rxt_per /= snds;
+ } else {
+ /* This is an unlikely path */
+ if (rxts) {
+ /* Its the max it was all re-transmits */
+ rxt_per = 0xffffffffffffffff;
+ } else {
+ rxt_per = 0;
+ }
}
- rack_log_pacing_delay_calc(rack, rack->r_ctl.rc_dgp_bl_agg, rack->client_bufferlvl, 0,
- 0, 0, 30, __LINE__, NULL, 0);
+ return (rxt_per);
}
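
rack_get_rxt_per() reports retransmitted bytes per mille of bytes sent, the same scale used by rack_policer_rxt_thresh (499 = 49.9%). A worked illustration, written as if called from within rack.c and not part of the patch:

/*
 * Illustrative fragment only: 1,000,000 bytes sent with 520,000 bytes
 * retransmitted yields 520 per-mille, i.e. 52.0%, which would cross a
 * rack_policer_rxt_thresh of 499 (49.9%).
 */
uint64_t rxt_per = rack_get_rxt_per(1000000, 520000);   /* == 520 */
int suspect = (rack_policer_rxt_thresh != 0 &&
    rxt_per >= rack_policer_rxt_thresh);                /* 520 >= 499 */
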
static void
-do_rack_check_for_unclamp(struct tcpcb *tp, struct tcp_rack *rack)
+policer_detection_log(struct tcp_rack *rack, uint32_t flex1, uint32_t flex2, uint32_t flex3, uint32_t flex4, uint8_t flex8)
{
- /*
- * Can we unclamp. We unclamp if more than
- * N rounds have transpired with no loss.
- */
- uint64_t snds, rxts, rxt_per;
- uint32_t rnds;
-
- rnds = rack->r_ctl.current_round - rack->r_ctl.last_rnd_rxt_clamped;
- if ((rack_unclamp_round_thresh > 0) &&
- (rnds >= rack_unclamp_round_thresh)) {
- snds = tp->t_sndbytes - rack->r_ctl.last_sndbytes;
- KASSERT ((snds > 0), ("rack:%p tp:%p snds:%ju is 0", rack, tp,
- (uintmax_t)snds));
- rxts = tp->t_snd_rxt_bytes - rack->r_ctl.last_snd_rxt_bytes;
- rxt_per = rxts * 1000;
- rxt_per /= snds;
- if ((uint32_t)rxt_per <= rack_unclamp_rxt_thresh) {
- /* Unclamp */
- if (tcp_bblogging_on(rack->rc_tp)) {
- union tcp_log_stackspecific log;
- struct timeval tv;
-
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
- log.u_bbr.timeStamp = tcp_get_usecs(&tv);
- log.u_bbr.flex3 = rnds;
- log.u_bbr.flex4 = rack_unclamp_round_thresh;
- log.u_bbr.flex5 = (uint32_t)rxt_per;
- log.u_bbr.flex8 = 6;
- log.u_bbr.pkt_epoch = rack->r_ctl.rc_pace_max_segs;
- log.u_bbr.bbr_state = rack->rc_pace_to_cwnd;
- log.u_bbr.delivered = rack->r_ctl.num_of_clamps_applied;
- log.u_bbr.applimited = rack->r_ctl.max_clamps;
- log.u_bbr.epoch = rack->r_ctl.clamp_options;
- log.u_bbr.cur_del_rate = rxts;
- log.u_bbr.bw_inuse = rack_get_lt_bw(rack);
- log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
- log.u_bbr.lt_epoch = (uint32_t)((rack->r_ctl.gp_bw >> 32) & 0x00000000ffffffff);
- log.u_bbr.pkts_out = (uint32_t)(rack->r_ctl.gp_bw & 0x00000000ffffffff);
- tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
- 0, &log, false, NULL, NULL, 0, &tv);
- }
- rack->r_ctl.num_of_clamps_applied = 0;
- rack->r_cwnd_was_clamped = 0;
- rack->excess_rxt_on = 1;
- if (rack->r_ctl.clamp_options) {
- /*
- * We only allow fillcw to be toggled
- * if you are setting a max seg too.
- */
- if (rack->r_ctl.clamp_options & 0x1) {
- if ((rack->rc_pace_to_cwnd == 0) && (rack->dgp_on == 0)) {
- /* turn on fill cw for non-dgp*/
- rack->rc_pace_to_cwnd = 0;
- } else if ((rack->dgp_on == 1) && (rack->rc_pace_to_cwnd == 1)) {
- /* For DGP we want it off */
- rack->rc_pace_to_cwnd = 1;
- }
- }
- }
- if (rack->dgp_on) {
- /* Reset all multipliers to 100.0 so just the measured bw */
- /* Crash any per boosts down to 100% */
- rack->r_ctl.rack_per_of_gp_rec = 100;
- rack->r_ctl.rack_per_of_gp_ss = 100;
- rack->r_ctl.rack_per_of_gp_ca = 100;
- /* Set in an upper bound for ss/ca % increase */
- rack->r_ctl.rack_per_upper_bound_ss = (uint8_t)rack_per_upper_bound_ss;
- rack->r_ctl.rack_per_upper_bound_ca = (uint8_t)rack_per_upper_bound_ca;
- }
- }
+ if (tcp_bblogging_on(rack->rc_tp)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.flex1 = flex1;
+ log.u_bbr.flex2 = flex2;
+ log.u_bbr.flex3 = flex3;
+ log.u_bbr.flex4 = flex4;
+ log.u_bbr.flex5 = rack->r_ctl.current_policer_bucket;
+ log.u_bbr.flex6 = rack->r_ctl.policer_bucket_size;
+ log.u_bbr.flex7 = 0;
+ log.u_bbr.flex8 = flex8;
+ log.u_bbr.bw_inuse = rack->r_ctl.policer_bw;
+ log.u_bbr.applimited = rack->r_ctl.current_round;
+ log.u_bbr.epoch = rack->r_ctl.policer_max_seg;
+ log.u_bbr.delivered = (uint32_t)rack->r_ctl.bytes_acked_in_recovery;
+ log.u_bbr.cur_del_rate = rack->rc_tp->t_sndbytes;
+ log.u_bbr.delRate = rack->rc_tp->t_snd_rxt_bytes;
+ log.u_bbr.rttProp = rack->r_ctl.gp_bw;
+ log.u_bbr.bbr_state = rack->rc_policer_detected;
+ log.u_bbr.bbr_substate = 0;
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ log.u_bbr.use_lt_bw = rack->policer_detect_on;
+ log.u_bbr.lt_epoch = 0;
+ log.u_bbr.pkts_out = 0;
+ tcp_log_event(rack->rc_tp, NULL, NULL, NULL, TCP_POLICER_DET, 0,
+ 0, &log, false, NULL, NULL, 0, &tv);
}
+
}
static void
-do_rack_excess_rxt(struct tcpcb *tp, struct tcp_rack *rack)
+policer_detection(struct tcpcb *tp, struct tcp_rack *rack, int post_recovery)
{
/*
* Rack excess rxt accounting is turned on. If we
@@ -5648,166 +5821,395 @@
* rounds, then back off the cwnd and ssthresh
* to fit into the long-term b/w.
*/
- uint64_t snds, rxts, rxt_per, lt_bw, bdp;
- uint32_t rnds, new_cwnd, new_ssthresh, rtt, shared_cwnd_was_enabled = 0;
- /* Is it shut off by 0 rounds? */
- if (rack_rxt_min_rnds == 0)
- return;
- if ((rack->r_ctl.max_clamps > 0) &&
- (rack->r_ctl.num_of_clamps_applied >= rack->r_ctl.max_clamps)) {
- /*
- * The idea, if max_clamps is set, is that if clamping it
- * N times did not work again, then there is no sense
- * clamping it again. The link is just a lossy link and
- * our clamps are doing no good. Turn it off so we don't come
- * back here again.
- */
- rack->excess_rxt_on = 0;
- rack->r_cwnd_was_clamped = 0;
- rack->r_ctl.num_of_clamps_applied = 0;
- return;
- }
- snds = tp->t_sndbytes - rack->r_ctl.last_sndbytes;
- rxts = tp->t_snd_rxt_bytes - rack->r_ctl.last_snd_rxt_bytes;
- rnds = rack->r_ctl.current_round - rack->r_ctl.last_rnd_rxt_clamped;
- /* Has enough rounds progressed for us to re-measure? */
- if ((rnds >= rack_rxt_min_rnds) &&
- (rack->r_ctl.rxt_threshold > 0)){
- rxt_per = rxts * 1000;
- rxt_per /= snds;
- if (rxt_per >= rack->r_ctl.rxt_threshold) {
- /*
- * Action required:
- * We are above our excess retransmit level, lets
- * cut down the cwnd and ssthresh to match the long-term
- * b/w we are getting.
- */
- /* First disable scwnd if enabled */
-#ifdef NETFLIX_SHARED_CWND
- rack->rack_enable_scwnd = 0;
- if (rack->r_ctl.rc_scw) {
- uint32_t limit;
+ uint32_t pkts, mid, med, alt_med, avg, segsiz, tot_retran_pkt_count = 0;
+ uint32_t cnt_of_mape_rxt = 0;
+ uint64_t snds, rxts, rxt_per, tim, del, del_bw;
+ int i;
+ struct timeval tv;
- shared_cwnd_was_enabled = 1;
- if (rack->r_limit_scw)
- limit = max(1, rack->r_ctl.rc_lowest_us_rtt);
- else
- limit = 0;
- tcp_shared_cwnd_free_full(tp, rack->r_ctl.rc_scw,
- rack->r_ctl.rc_scw_index,
- limit);
- rack->r_ctl.rc_scw = NULL;
- }
-#endif
- /* Calculate what the cwnd and ssthresh should be */
- tcp_trace_point(rack->rc_tp, TCP_TP_EXCESS_RXT);
- lt_bw = rack_get_lt_bw(rack);
- if (lt_bw == 0) {
- /*
- * No lt_bw, lets chop things to one MSS
- * and the ssthresh to the iwnd.
- */
-reset_to_iw:
- new_cwnd = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
- new_ssthresh = tcp_compute_initwnd(tcp_maxseg(tp));
- } else {
- rtt = rack->rc_rack_rtt;
- if (rtt == 0) {
- /* If we have no rack_rtt drop to the IW situation */
- goto reset_to_iw;
- }
- bdp = lt_bw * (uint64_t)rtt;
- bdp /= HPTS_USEC_IN_SEC;
- new_cwnd = (uint32_t)bdp;
- new_ssthresh = new_cwnd - 1;
- if (new_cwnd < ctf_fixed_maxseg(tp)) {
- /* Rock bottom, goto IW settings */
- goto reset_to_iw;
- }
- }
- rack->r_cwnd_was_clamped = 1;
- rack->r_ctl.num_of_clamps_applied++;
- /* Reset the counter fromn now */
- tp->t_bytes_acked = 0;
+ /*
+ * First, are there enough packets delivered during recovery to make
+ * a determination of b/w?
+ */
+ segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
+ if ((rack->rc_policer_detected == 0) &&
+ (rack->r_ctl.policer_del_mss > 0) &&
+ ((uint32_t)rack->r_ctl.policer_del_mss > ((rack->r_ctl.bytes_acked_in_recovery + segsiz - 1)/segsiz))) {
+ /*
+ * Not enough data sent in recovery for initial detection. Once
+ * we have detected a policer we allow less than the threshold (policer_del_mss)
+ * amount of data in a recovery to let us fall through and double check
+ * our policer settings and possibly expand or collapse the bucket size and
+ * the policer b/w.
+ *
+ * Once you are declared to be policed, this block of code cannot be
+ * reached; instead blocks further down will re-check the policer detection
+ * triggers and possibly reset the measurements if somehow we have let the
+ * policer bucket size grow too large.
+ */
+ if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
+ policer_detection_log(rack, rack->r_ctl.policer_del_mss,
+ ((rack->r_ctl.bytes_acked_in_recovery + segsiz - 1)/segsiz),
+ rack->r_ctl.bytes_acked_in_recovery, segsiz, 18);
+ }
+ return;
+ }
+ tcp_get_usecs(&tv);
+ tim = tcp_tv_to_lusectick(&tv) - rack->r_ctl.time_entered_recovery;
+ del = rack->r_ctl.bytes_acked_in_recovery;
+ if (tim > 0)
+ del_bw = (del * (uint64_t)1000000) / tim;
+ else
+ del_bw = 0;
+ /* B/W compensation? */
+
+ if (rack->r_ctl.pol_bw_comp && ((rack->r_ctl.policer_bw > 0) ||
+ (del_bw > 0))) {
+ /*
+ * Sanity check now that the data is in. How long does it
+ * take for us to pace out two of our policer_max_seg's?
+ *
+ * If it is longer than the RTT then we are set
+ * too slow, maybe because of not enough data
+ * sent during recovery.
+ */
+ uint64_t lentime, res, srtt, max_delbw, alt_bw;
+
+ srtt = (uint64_t)rack_grab_rtt(tp, rack);
+ if ((tp->t_srtt > 0) && (srtt > tp->t_srtt))
+ srtt = tp->t_srtt;
+ lentime = rack->r_ctl.policer_max_seg * (uint64_t)HPTS_USEC_IN_SEC * 2;
+ if (del_bw > rack->r_ctl.policer_bw) {
+ max_delbw = del_bw;
+ } else {
+ max_delbw = rack->r_ctl.policer_bw;
+ }
+ res = lentime / max_delbw;
+ if ((srtt > 0) && (res > srtt)) {
/*
- * Now what about options?
- * We look at the bottom 8 bits:
- * F = fill cw bit (toggle it if set)
- * S = Segment bits
- * M = set max segment bit
+ * At this rate we cannot get two policer_max_seg's
+ * out before the ack arrives back.
*
- * SSSS SSMF
+ * Let's at least get it raised up so that
+ * we can be a bit faster than that if possible.
*/
- if (rack->r_ctl.clamp_options) {
- if (rack->r_ctl.clamp_options & 0x1) {
- if ((rack->rc_pace_to_cwnd == 0) && (rack->dgp_on == 0)) {
- /* turn on fill cw for non-dgp*/
- rack->rc_pace_to_cwnd = 1;
- } else if ((rack->dgp_on == 1) && (rack->rc_pace_to_cwnd == 1)) {
- /* For DGP we want it off */
- rack->rc_pace_to_cwnd = 0;
- }
+ lentime = (rack->r_ctl.policer_max_seg * 2);
+ tim = srtt;
+ alt_bw = (lentime * (uint64_t)HPTS_USEC_IN_SEC) / tim;
+ if (alt_bw > max_delbw) {
+ uint64_t cap_alt_bw;
+
+ cap_alt_bw = (max_delbw + (max_delbw * rack->r_ctl.pol_bw_comp));
+ if ((rack_pol_min_bw > 0) && (cap_alt_bw < rack_pol_min_bw)) {
+ /* We place a min on the cap which defaults to 1Mbps */
+ cap_alt_bw = rack_pol_min_bw;
+ }
+ if (alt_bw <= cap_alt_bw) {
+ /* It should be */
+ del_bw = alt_bw;
+ policer_detection_log(rack,
+ (uint32_t)tim,
+ rack->r_ctl.policer_max_seg,
+ 0,
+ 0,
+ 16);
+ } else {
+ /*
+ * This is an odd case where the RTT is likely very
+ * low and yet the flow is still being policed. We don't want
+ * to get more than (rack_policing_do_bw_comp+1) x del-rate,
+ * where del-rate is what we got in recovery for either the
+ * first Policer Detection (PD) or this PD we are on now.
+ */
+ del_bw = cap_alt_bw;
+ policer_detection_log(rack,
+ (uint32_t)tim,
+ rack->r_ctl.policer_max_seg,
+ (uint32_t)max_delbw,
+ (rack->r_ctl.pol_bw_comp + 1),
+ 16);
}
}
- if (rack->dgp_on) {
- /* Reset all multipliers to 100.0 so just the measured bw */
- /* Crash any per boosts down to 100% */
- rack->r_ctl.rack_per_of_gp_rec = 100;
- rack->r_ctl.rack_per_of_gp_ss = 100;
- rack->r_ctl.rack_per_of_gp_ca = 100;
- /* Set in an upper bound for ss/ca % increase */
- rack->r_ctl.rack_per_upper_bound_ss = (uint8_t)rack_clamp_ss_upper;
- rack->r_ctl.rack_per_upper_bound_ca = (uint8_t)rack_clamp_ca_upper;
- /* Now move to the lt_bw */
- rack->r_ctl.gp_bw = lt_bw;
- rack->rc_gp_filled = 1;
- rack->r_ctl.num_measurements = RACK_REQ_AVG;
- }
- if (tcp_bblogging_on(rack->rc_tp)) {
- union tcp_log_stackspecific log;
- struct timeval tv;
-
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
- log.u_bbr.timeStamp = tcp_get_usecs(&tv);
- log.u_bbr.flex1 = new_cwnd;
- log.u_bbr.flex2 = new_ssthresh;
- log.u_bbr.flex3 = rnds;
- log.u_bbr.flex4 = rack_rxt_min_rnds;
- log.u_bbr.flex5 = rtt;
- log.u_bbr.flex6 = shared_cwnd_was_enabled;
- log.u_bbr.flex8 = 5;
- log.u_bbr.pkt_epoch = rack->r_ctl.rc_pace_max_segs;
- log.u_bbr.bbr_state = rack->rc_pace_to_cwnd;
- log.u_bbr.delivered = rack->r_ctl.num_of_clamps_applied;
- log.u_bbr.applimited = rack->r_ctl.max_clamps;
- log.u_bbr.epoch = rack->r_ctl.clamp_options;
- log.u_bbr.cur_del_rate = rxts;
- log.u_bbr.delRate = snds;
- log.u_bbr.rttProp = rack->r_ctl.rxt_threshold;
- log.u_bbr.bw_inuse = lt_bw;
- log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
- log.u_bbr.lt_epoch = (uint32_t)((rack->r_ctl.gp_bw >> 32) & 0x00000000ffffffff);
- log.u_bbr.pkts_out = (uint32_t)(rack->r_ctl.gp_bw & 0x00000000ffffffff);
- tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
- 0, &log, false, NULL, NULL, 0, &tv);
- }
- /* Update our point where we did it */
- if (rack->r_ctl.already_had_a_excess == 0) {
- rack->r_ctl.already_had_a_excess = 1;
- counter_u64_add(rack_rxt_clamps_cwnd_uniq, 1);
+ }
+ }
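
The compensation block above boils down to: if emitting two policer_max_seg bursts at the measured rate would take longer than an SRTT, raise the working rate, but never above (1 + pol_bw_comp) times the measured delivery rate, subject to a minimum floor. A self-contained sketch under those assumptions (illustrative names, not the stack's API):

```c
#include <stdint.h>

#define USECS_PER_SEC	1000000ULL

/*
 * Sketch of the b/w compensation check: rates are bytes/sec,
 * srtt is in microseconds, comp_multiplier mirrors pol_bw_comp.
 */
static uint64_t
compensate_policer_bw(uint64_t del_bw, uint64_t policer_bw, uint64_t srtt_usecs,
    uint64_t policer_max_seg, uint64_t comp_multiplier, uint64_t min_bw_floor)
{
	uint64_t max_delbw, lentime, alt_bw, cap;

	max_delbw = (del_bw > policer_bw) ? del_bw : policer_bw;
	if (max_delbw == 0 || srtt_usecs == 0)
		return (del_bw);
	lentime = policer_max_seg * 2;
	/* Time (usecs) needed to emit two max-seg bursts at max_delbw */
	if ((lentime * USECS_PER_SEC) / max_delbw <= srtt_usecs)
		return (max_delbw);		/* already fast enough */
	/* Rate required to get both bursts out within one SRTT */
	alt_bw = (lentime * USECS_PER_SEC) / srtt_usecs;
	/* Cap at (1 + comp_multiplier) x the measured rate, with a floor */
	cap = max_delbw + (max_delbw * comp_multiplier);
	if (cap < min_bw_floor)
		cap = min_bw_floor;
	return ((alt_bw <= cap) ? alt_bw : cap);
}
```
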
+ snds = tp->t_sndbytes - rack->r_ctl.last_policer_sndbytes;
+ rxts = tp->t_snd_rxt_bytes - rack->r_ctl.last_policer_snd_rxt_bytes;
+ rxt_per = rack_get_rxt_per(snds, rxts);
+ /* Figure up the average and median */
+ for(i = 0; i < RETRAN_CNT_SIZE; i++) {
+ if (rack->r_ctl.rc_cnt_of_retran[i] > 0) {
+ tot_retran_pkt_count += (i + 1) * rack->r_ctl.rc_cnt_of_retran[i];
+ cnt_of_mape_rxt += rack->r_ctl.rc_cnt_of_retran[i];
+ }
+ }
+ if (cnt_of_mape_rxt)
+ avg = (tot_retran_pkt_count * 10)/cnt_of_mape_rxt;
+ else
+ avg = 0;
+ alt_med = med = 0;
+ mid = tot_retran_pkt_count/2;
+ for(i = 0; i < RETRAN_CNT_SIZE; i++) {
+ pkts = (i + 1) * rack->r_ctl.rc_cnt_of_retran[i];
+ if (mid > pkts) {
+ mid -= pkts;
+ continue;
+ }
+ med = (i + 1);
+ break;
+ }
+ mid = cnt_of_mape_rxt / 2;
+ for(i = 0; i < RETRAN_CNT_SIZE; i++) {
+ if (mid > rack->r_ctl.rc_cnt_of_retran[i]) {
+ mid -= rack->r_ctl.rc_cnt_of_retran[i];
+ continue;
+ }
+ alt_med = (i + 1);
+ break;
+ }
+ if (rack->r_ctl.policer_alt_median) {
+ /* Swap the medians */
+ uint32_t swap;
+
+ swap = med;
+ med = alt_med;
+ alt_med = swap;
+ }
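
The two loops above compute a weighted average (scaled by 10 to keep one decimal place), a median weighted by retransmit depth, and a plain median over the per-packet retransmit counts held in rc_cnt_of_retran. A self-contained sketch of the same math, with illustrative names:

```c
#include <stdint.h>

#define RETRAN_CNT_SIZE 16

/*
 * cnt[i] holds how many packets were retransmitted (i + 1) times.
 * avg_x10 is the average retransmit depth scaled by 10, med the
 * depth-weighted median, alt_med the plain median.
 */
static void
retran_avg_and_medians(const uint32_t cnt[RETRAN_CNT_SIZE],
    uint32_t *avg_x10, uint32_t *med, uint32_t *alt_med)
{
	uint64_t tot_pkts = 0, cnt_of_rxt = 0, mid, pkts;
	int i;

	for (i = 0; i < RETRAN_CNT_SIZE; i++) {
		tot_pkts += (uint64_t)(i + 1) * cnt[i];
		cnt_of_rxt += cnt[i];
	}
	*avg_x10 = cnt_of_rxt ? (uint32_t)((tot_pkts * 10) / cnt_of_rxt) : 0;
	*med = *alt_med = 0;
	/* Weighted median: walk buckets weighted by depth * packets */
	mid = tot_pkts / 2;
	for (i = 0; i < RETRAN_CNT_SIZE; i++) {
		pkts = (uint64_t)(i + 1) * cnt[i];
		if (mid > pkts) {
			mid -= pkts;
			continue;
		}
		*med = i + 1;
		break;
	}
	/* Plain median: walk buckets weighted by packet counts only */
	mid = cnt_of_rxt / 2;
	for (i = 0; i < RETRAN_CNT_SIZE; i++) {
		if (mid > cnt[i]) {
			mid -= cnt[i];
			continue;
		}
		*alt_med = i + 1;
		break;
	}
}
```
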
+ if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.flex1 = avg;
+ log.u_bbr.flex2 = med;
+ log.u_bbr.flex3 = (uint32_t)rxt_per;
+ log.u_bbr.flex4 = rack->r_ctl.policer_avg_threshold;
+ log.u_bbr.flex5 = rack->r_ctl.policer_med_threshold;
+ log.u_bbr.flex6 = rack->r_ctl.policer_rxt_threshold;
+ log.u_bbr.flex7 = rack->r_ctl.policer_alt_median;
+ log.u_bbr.flex8 = 1;
+ log.u_bbr.delivered = rack->r_ctl.policer_bucket_size;
+ log.u_bbr.applimited = rack->r_ctl.current_round;
+ log.u_bbr.epoch = rack->r_ctl.policer_max_seg;
+ log.u_bbr.bw_inuse = del_bw;
+ log.u_bbr.cur_del_rate = rxts;
+ log.u_bbr.delRate = snds;
+ log.u_bbr.rttProp = rack->r_ctl.gp_bw;
+ log.u_bbr.bbr_state = rack->rc_policer_detected;
+ log.u_bbr.bbr_substate = 0;
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ log.u_bbr.use_lt_bw = rack->policer_detect_on;
+ log.u_bbr.lt_epoch = (uint32_t)tim;
+ log.u_bbr.pkts_out = rack->r_ctl.bytes_acked_in_recovery;
+ tcp_log_event(tp, NULL, NULL, NULL, TCP_POLICER_DET, 0,
+ 0, &log, false, NULL, NULL, 0, &tv);
+ }
+ if (med == RETRAN_CNT_SIZE) {
+ /*
+ * If the median is the maximum, then what we
+ * likely have here is a network breakage. Either that
+ * or we are so unlucky that all of our traffic is being
+ * dropped and having to be retransmitted the maximum number
+ * of times, which is just not how a policer works.
+ *
+ * If it truly is a policer, eventually we will come
+ * through and it won't be the maximum.
+ */
+ return;
+ }
+ /* Have enough rounds progressed for us to re-measure? */
+ if ((rxt_per >= (uint64_t)rack->r_ctl.policer_rxt_threshold) &&
+ (avg >= rack->r_ctl.policer_avg_threshold) &&
+ (med >= rack->r_ctl.policer_med_threshold)) {
+ /*
+ * We hit all thresholds that indicate we are
+ * being policed. We may be doing this from a rack timeout,
+ * which then means the rest of recovery will hopefully go
+ * smoother as we pace. At the end of recovery we will
+ * fall back in here and reset the values using the
+ * results of the entire recovery episode (we could also
+ * hit this as we exit recovery, which means only
+ * one time in here).
+ *
+ * This is done explicitly so that if we hit the thresholds
+ * again in a second recovery we overwrite the values. We do
+ * that because over time, as we pace, the policer_bucket_size may
+ * continue to grow. This then provides more and more times when
+ * we are not pacing to the policer rate. This lets us compensate
+ * for when we hit a false positive and those flows continue to
+ * increase. However, if it is a real policer we will then get over its
+ * limit, over time, again and thus end up back here hitting the
+ * thresholds again.
+ *
+ * The alternative would be that, whenever we pace due to
+ * policing in rack_policed_sending, we add the amount paced to the
+ * idle_snd_una value (which decreases the amount in last_amount_before_rec
+ * since that is always [th_ack - idle_snd_una]). This would then prevent
+ * the policer_bucket_size from growing in additional recovery episodes,
+ * which would then mean false positives would be pretty much stuck
+ * after things got back to normal (assuming that what caused the
+ * false positive was a small network outage).
+ */
+ tcp_trace_point(rack->rc_tp, TCP_TP_POLICER_DET);
+ if (rack->rc_policer_detected == 0) {
+ /*
+ * Increment the stat that tells us we identified
+ * a policer only once. Note that if we ever allow
+ * the flag to be cleared (reverted) then we need
+ * to adjust this to not do multi-counting.
+ */
+ counter_u64_add(tcp_policer_detected, 1);
+ }
+ rack->r_ctl.last_policer_sndbytes = tp->t_sndbytes;
+ rack->r_ctl.last_policer_snd_rxt_bytes = tp->t_snd_rxt_bytes;
+ rack->r_ctl.policer_bw = del_bw;
+ rack->r_ctl.policer_max_seg = tcp_get_pacing_burst_size_w_divisor(rack->rc_tp,
+ rack->r_ctl.policer_bw,
+ min(ctf_fixed_maxseg(rack->rc_tp),
+ rack->r_ctl.rc_pace_min_segs),
+ 0, NULL,
+ NULL, rack->r_ctl.pace_len_divisor);
+ /* Now what about the policer bucket size */
+ rack->r_ctl.policer_bucket_size = rack->r_ctl.last_amount_before_rec;
+ if (rack->r_ctl.policer_bucket_size < rack->r_ctl.policer_max_seg) {
+ /* We must be able to send our max-seg or else chaos ensues */
+ rack->r_ctl.policer_bucket_size = rack->r_ctl.policer_max_seg * 2;
+ }
+ if (rack->rc_policer_detected == 0)
+ rack->r_ctl.current_policer_bucket = 0;
+ if (tcp_bblogging_on(rack->rc_tp)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.flex1 = avg;
+ log.u_bbr.flex2 = med;
+ log.u_bbr.flex3 = rxt_per;
+ log.u_bbr.flex4 = rack->r_ctl.policer_avg_threshold;
+ log.u_bbr.flex5 = rack->r_ctl.policer_med_threshold;
+ log.u_bbr.flex6 = rack->r_ctl.policer_rxt_threshold;
+ log.u_bbr.flex7 = rack->r_ctl.policer_alt_median;
+ log.u_bbr.flex8 = 2;
+ log.u_bbr.applimited = rack->r_ctl.current_round;
+ log.u_bbr.bw_inuse = del_bw;
+ log.u_bbr.delivered = rack->r_ctl.policer_bucket_size;
+ log.u_bbr.cur_del_rate = rxts;
+ log.u_bbr.delRate = snds;
+ log.u_bbr.rttProp = rack->r_ctl.gp_bw;
+ log.u_bbr.bbr_state = rack->rc_policer_detected;
+ log.u_bbr.bbr_substate = 0;
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ log.u_bbr.use_lt_bw = rack->policer_detect_on;
+ log.u_bbr.epoch = rack->r_ctl.policer_max_seg;
+ log.u_bbr.lt_epoch = (uint32_t)tim;
+ log.u_bbr.pkts_out = rack->r_ctl.bytes_acked_in_recovery;
+ tcp_log_event(tp, NULL, NULL, NULL, TCP_POLICER_DET, 0,
+ 0, &log, false, NULL, NULL, 0, &tv);
+ /*
+ * Put out an added log, 19, for the sole purpose
+ * of getting the txt/rxt so that we can benchmark
+ * in read-bbrlog the ongoing rxt rate after our
+ * policer invocation in the HYSTART announcements.
+ */
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.timeStamp = tcp_tv_to_usectick(&tv);
+ log.u_bbr.flex1 = alt_med;
+ log.u_bbr.flex8 = 19;
+ log.u_bbr.cur_del_rate = tp->t_sndbytes;
+ log.u_bbr.delRate = tp->t_snd_rxt_bytes;
+ tcp_log_event(tp, NULL, NULL, NULL, TCP_POLICER_DET, 0,
+ 0, &log, false, NULL, NULL, 0, &tv);
+ }
+ /* Turn off any fast output, that's ended */
+ rack->r_fast_output = 0;
+ /* Mark the time for credits */
+ rack->r_ctl.last_sendtime = tcp_get_u64_usecs(NULL);
+ if (rack->r_rr_config < 2) {
+ /*
+ * We need to be stricter on the RR config so
+ * the pacing has priority.
+ */
+ rack->r_rr_config = 2;
+ }
+ policer_detection_log(rack,
+ rack->r_ctl.idle_snd_una,
+ rack->r_ctl.ack_for_idle,
+ 0,
+ (uint32_t)tim,
+ 14);
+ rack->rc_policer_detected = 1;
+ } else if ((rack->rc_policer_detected == 1) &&
+ (post_recovery == 1)) {
+ /*
+ * If we are exiting recovery and have already detected
+ * a policer, we may need to update the values.
+ *
+ * First: Update the idle -> recovery sent value.
+ */
+ uint32_t srtt;
+
+ if (rack->r_ctl.last_amount_before_rec > rack->r_ctl.policer_bucket_size) {
+ rack->r_ctl.policer_bucket_size = rack->r_ctl.last_amount_before_rec;
+ }
+ srtt = (uint64_t)rack_grab_rtt(tp, rack);
+ if ((tp->t_srtt > 0) && (srtt > tp->t_srtt))
+ srtt = tp->t_srtt;
+ if ((srtt != 0) &&
+ (tim < (uint64_t)srtt)) {
+ /*
+ * Not long enough.
+ */
+ if (rack_verbose_logging)
+ policer_detection_log(rack,
+ (uint32_t)tim,
+ 0,
+ 0,
+ 0,
+ 15);
+ return;
+ }
+ /*
+ * Finally, update the b/w if it has grown.
+ */
+ if (del_bw > rack->r_ctl.policer_bw) {
+ rack->r_ctl.policer_bw = del_bw;
+ rack->r_ctl.policer_max_seg = tcp_get_pacing_burst_size_w_divisor(rack->rc_tp,
+ rack->r_ctl.policer_bw,
+ min(ctf_fixed_maxseg(rack->rc_tp),
+ rack->r_ctl.rc_pace_min_segs),
+ 0, NULL,
+ NULL, rack->r_ctl.pace_len_divisor);
+ if (rack->r_ctl.policer_bucket_size < rack->r_ctl.policer_max_seg) {
+ /* We must be able to send our max-seg or else chaos ensues */
+ rack->r_ctl.policer_bucket_size = rack->r_ctl.policer_max_seg * 2;
}
- counter_u64_add(rack_rxt_clamps_cwnd, 1);
- rack->r_ctl.last_sndbytes = tp->t_sndbytes;
- rack->r_ctl.last_snd_rxt_bytes = tp->t_snd_rxt_bytes;
- rack->r_ctl.last_rnd_rxt_clamped = rack->r_ctl.current_round;
- if (new_cwnd < tp->snd_cwnd)
- tp->snd_cwnd = new_cwnd;
- if (new_ssthresh < tp->snd_ssthresh)
- tp->snd_ssthresh = new_ssthresh;
}
+ policer_detection_log(rack,
+ rack->r_ctl.idle_snd_una,
+ rack->r_ctl.ack_for_idle,
+ 0,
+ (uint32_t)tim,
+ 3);
+ }
+}
+
+static void
+rack_exit_recovery(struct tcpcb *tp, struct tcp_rack *rack, int how)
+{
+ /* now check with the policer if on */
+ if (rack->policer_detect_on == 1) {
+ policer_detection(tp, rack, 1);
}
+ /*
+ * Now exit recovery; note we must set the idle point after the policer_detection
+ * call to get the amount acked prior to recovery correct.
+ */
+ rack->r_ctl.idle_snd_una = tp->snd_una;
+ EXIT_RECOVERY(tp->t_flags);
}
static void
@@ -5882,9 +6284,12 @@
}
rack_log_dsack_event(rack, 1, __LINE__, 0, 0);
}
- EXIT_RECOVERY(tp->t_flags);
- if (rack->r_ctl.full_dgp_in_rec)
- rack_client_buffer_level_set(rack);
+ if (rack->rto_from_rec == 1) {
+ rack->rto_from_rec = 0;
+ if (rack->r_ctl.rto_ssthresh > tp->snd_ssthresh)
+ tp->snd_ssthresh = rack->r_ctl.rto_ssthresh;
+ }
+ rack_exit_recovery(tp, rack, 1);
}
static void
@@ -5909,12 +6314,69 @@
tp->t_flags &= ~TF_WASFRECOVERY;
tp->t_flags &= ~TF_WASCRECOVERY;
if (!IN_FASTRECOVERY(tp->t_flags)) {
- if (rack->dgp_on && rack->r_cwnd_was_clamped) {
- /* Reset the gains so that on exit we will be softer longer */
- rack->r_ctl.rack_per_of_gp_rec = 100;
- rack->r_ctl.rack_per_of_gp_ss = 98;
- rack->r_ctl.rack_per_of_gp_ca = 98;
+ struct rack_sendmap *rsm;
+ struct timeval tv;
+ uint32_t segsiz;
+
+ /* Check if this is the end of the initial Start-up i.e. initial slow-start */
+ if (rack->rc_initial_ss_comp == 0) {
+ /* Yep it is the end of the initial slowstart */
+ rack->rc_initial_ss_comp = 1;
+ }
+ microuptime(&tv);
+ rack->r_ctl.time_entered_recovery = tcp_tv_to_lusectick(&tv);
+ if (SEQ_GEQ(ack, tp->snd_una)) {
+ /*
+ * The ack is above snd_una. Let's see
+ * if we can establish a positive distance from
+ * our idle mark.
+ */
+ rack->r_ctl.ack_for_idle = ack;
+ if (SEQ_GT(ack, rack->r_ctl.idle_snd_una)) {
+ rack->r_ctl.last_amount_before_rec = ack - rack->r_ctl.idle_snd_una;
+ } else {
+ /* No data thru yet */
+ rack->r_ctl.last_amount_before_rec = 0;
+ }
+ } else if (SEQ_GT(tp->snd_una, rack->r_ctl.idle_snd_una)) {
+ /*
+ * The ack is out of order and behind snd_una. It may
+ * have contained SACK information which we processed, else
+ * we would have rejected it.
+ */
+ rack->r_ctl.ack_for_idle = tp->snd_una;
+ rack->r_ctl.last_amount_before_rec = tp->snd_una - rack->r_ctl.idle_snd_una;
+ } else {
+ rack->r_ctl.ack_for_idle = ack;
+ rack->r_ctl.last_amount_before_rec = 0;
+ }
+ if (rack->rc_policer_detected) {
+ /*
+ * If we are being policed and we have a loss, it
+ * means our bucket is now empty. This can happen
+ * when some other flow on the same host sends traffic
+ * that this connection is not aware of.
+ */
+ rack->r_ctl.current_policer_bucket = 0;
+ if (rack_verbose_logging)
+ policer_detection_log(rack, rack->r_ctl.last_amount_before_rec, 0, 0, 0, 4);
+ if (rack->r_ctl.last_amount_before_rec > rack->r_ctl.policer_bucket_size) {
+ rack->r_ctl.policer_bucket_size = rack->r_ctl.last_amount_before_rec;
+ }
+ }
+ memset(rack->r_ctl.rc_cnt_of_retran, 0, sizeof(rack->r_ctl.rc_cnt_of_retran));
+ segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
+ TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
+ /*
+ * Go through the outstanding segments and re-peg
+ * any that should have been left in the
+ * retransmit list (on a double recovery).
+ */
+ if (rsm->r_act_rxt_cnt > 0) {
+ rack_peg_rxt(rack, rsm, segsiz);
+ }
}
+ rack->r_ctl.bytes_acked_in_recovery = 0;
rack->r_ctl.rc_prr_delivered = 0;
rack->r_ctl.rc_prr_out = 0;
rack->r_fast_output = 0;
@@ -5947,15 +6409,19 @@
tp->t_dupacks = 0;
tp->t_bytes_acked = 0;
rack->r_fast_output = 0;
- EXIT_RECOVERY(tp->t_flags);
- if (tp->t_rxtshift == 1) {
+ if (IN_RECOVERY(tp->t_flags))
+ rack_exit_recovery(tp, rack, 2);
+ rack->r_ctl.bytes_acked_in_recovery = 0;
+ rack->r_ctl.time_entered_recovery = 0;
+ orig_cwnd = tp->snd_cwnd;
+ rack_log_to_prr(rack, 16, orig_cwnd, line);
+ if (CC_ALGO(tp)->cong_signal == NULL) {
+ /* TSNH */
tp->snd_ssthresh = max(2,
min(tp->snd_wnd, rack->r_ctl.cwnd_to_use) / 2 /
ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp);
+ tp->snd_cwnd = ctf_fixed_maxseg(tp);
}
- orig_cwnd = tp->snd_cwnd;
- tp->snd_cwnd = ctf_fixed_maxseg(tp);
- rack_log_to_prr(rack, 16, orig_cwnd, line);
if (tp->t_flags2 & TF2_ECN_PERMIT)
tp->t_flags2 |= TF2_ECN_SND_CWR;
break;
@@ -5984,8 +6450,6 @@
}
if ((in_rec_at_entry == 0) && IN_RECOVERY(tp->t_flags)) {
rack_log_to_prr(rack, 15, cwnd_enter, line);
- if (rack->r_ctl.full_dgp_in_rec)
- rack_client_buffer_level_set(rack);
rack->r_ctl.dsack_byte_cnt = 0;
rack->r_ctl.retran_during_recovery = 0;
rack->r_ctl.rc_cwnd_at_erec = cwnd_enter;
@@ -6078,7 +6542,7 @@
}
static uint32_t
-rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts)
+rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts, int line, int log_allowed)
{
int32_t lro;
uint32_t thresh;
@@ -6149,7 +6613,8 @@
* have seen reordering <and> we have a DSACK count.
*/
thresh += rack->r_ctl.num_dsack * (srtt >> 2);
- rack_log_dsack_event(rack, 4, __LINE__, srtt, thresh);
+ if (log_allowed)
+ rack_log_dsack_event(rack, 4, line, srtt, thresh);
}
/* SRTT * 2 is the ceiling */
if (thresh > (srtt * 2)) {
@@ -6159,7 +6624,8 @@
if (thresh > rack_rto_max) {
thresh = rack_rto_max;
}
- rack_log_dsack_event(rack, 6, __LINE__, srtt, thresh);
+ if (log_allowed)
+ rack_log_dsack_event(rack, 6, line, srtt, thresh);
return (thresh);
}
@@ -6294,7 +6760,7 @@
}
idx = rsm->r_rtr_cnt - 1;
srtt = rack_grab_rtt(tp, rack);
- thresh = rack_calc_thresh_rack(rack, srtt, tsused);
+ thresh = rack_calc_thresh_rack(rack, srtt, tsused, __LINE__, 1);
if (TSTMP_LT(tsused, ((uint32_t)rsm->r_tim_lastsent[idx]))) {
return (NULL);
}
@@ -6456,7 +6922,7 @@
goto activate_tlp;
}
srtt = rack_grab_rtt(tp, rack);
- thresh = rack_calc_thresh_rack(rack, srtt, cts);
+ thresh = rack_calc_thresh_rack(rack, srtt, cts, __LINE__, 1);
idx = rsm->r_rtr_cnt - 1;
exp = ((uint32_t)rsm->r_tim_lastsent[idx]) + thresh;
if (SEQ_GEQ(exp, cts)) {
@@ -6563,8 +7029,6 @@
static void
rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, tcp_seq snd_una)
{
- struct timeval tv;
-
if (rack->rc_in_persist == 0) {
if (tp->t_flags & TF_GPUTINPROG) {
/*
@@ -6580,21 +7044,23 @@
rack->rack_scwnd_is_idle = 1;
}
#endif
- rack->r_ctl.rc_went_idle_time = tcp_get_usecs(&tv);
+ rack->r_ctl.rc_went_idle_time = cts;
+ if (rack->r_ctl.rc_went_idle_time == 0)
+ rack->r_ctl.rc_went_idle_time = 1;
if (rack->lt_bw_up) {
/* Suspend our LT BW measurement */
uint64_t tmark;
rack->r_ctl.lt_bw_bytes += (snd_una - rack->r_ctl.lt_seq);
rack->r_ctl.lt_seq = snd_una;
- tmark = tcp_tv_to_lusectick(&tv);
- rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark);
+ tmark = tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time);
+ if (tmark >= rack->r_ctl.lt_timemark) {
+ rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark);
+ }
rack->r_ctl.lt_timemark = tmark;
rack->lt_bw_up = 0;
rack->r_persist_lt_bw_off = 1;
}
- if (rack->r_ctl.rc_went_idle_time == 0)
- rack->r_ctl.rc_went_idle_time = 1;
rack_timer_cancel(tp, rack, cts, __LINE__);
rack->r_ctl.persist_lost_ends = 0;
rack->probe_not_answered = 0;
@@ -6609,9 +7075,6 @@
static void
rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
{
- struct timeval tv;
- uint32_t t_time;
-
if (tcp_in_hpts(rack->rc_tp)) {
tcp_hpts_remove(rack->rc_tp);
rack->r_ctl.rc_hpts_flags = 0;
@@ -6622,7 +7085,6 @@
rack->rack_scwnd_is_idle = 0;
}
#endif
- t_time = tcp_get_usecs(&tv);
if (rack->rc_gp_dyn_mul &&
(rack->use_fixed_rate == 0) &&
(rack->rc_always_pace)) {
@@ -6632,7 +7094,7 @@
*/
uint32_t time_idle, idle_min;
- time_idle = t_time - rack->r_ctl.rc_went_idle_time;
+ time_idle = cts - rack->r_ctl.rc_went_idle_time;
idle_min = rack_min_probertt_hold;
if (rack_probertt_gpsrtt_cnt_div) {
uint64_t extra;
@@ -6658,10 +7120,11 @@
}
if (rack->r_persist_lt_bw_off) {
/* Continue where we left off */
- rack->r_ctl.lt_timemark = tcp_tv_to_lusectick(&tv);
+ rack->r_ctl.lt_timemark = tcp_get_u64_usecs(NULL);
rack->lt_bw_up = 1;
rack->r_persist_lt_bw_off = 0;
}
+ rack->r_ctl.idle_snd_una = tp->snd_una;
rack->rc_in_persist = 0;
rack->r_ctl.rc_went_idle_time = 0;
tp->t_rxtshift = 0;
@@ -6734,7 +7197,7 @@
}
static void
-rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
+rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
int32_t slot, uint32_t tot_len_this_send, int sup_rack)
{
struct hpts_diag diag;
@@ -6778,7 +7241,8 @@
rack->r_early = 0;
rack->r_ctl.rc_agg_early = 0;
}
- if (rack->r_late) {
+ if ((rack->r_late) &&
+ ((rack->r_use_hpts_min == 0) || (rack->dgp_on == 0))) {
/*
* This is harder, we can
* compensate some but it
@@ -6812,6 +7276,32 @@
if (rack->r_ctl.rc_agg_delayed == 0)
rack->r_late = 0;
}
+ } else if (rack->r_late) {
+ /* r_use_hpts_min is on and so is DGP */
+ uint32_t max_red;
+
+ max_red = (slot * rack->r_ctl.max_reduction) / 100;
+ if (max_red >= rack->r_ctl.rc_agg_delayed) {
+ slot -= rack->r_ctl.rc_agg_delayed;
+ rack->r_ctl.rc_agg_delayed = 0;
+ } else {
+ slot -= max_red;
+ rack->r_ctl.rc_agg_delayed -= max_red;
+ }
+ }
+ if ((rack->r_use_hpts_min == 1) &&
+ (slot > 0) &&
+ (rack->dgp_on == 1)) {
+ /*
+ * We are enforcing a min pacing timer
+ * based on our hpts min timeout.
+ */
+ uint32_t min;
+
+ min = get_hpts_min_sleep_time();
+ if (min > slot) {
+ slot = min;
+ }
}
hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack);
#ifdef TCP_SAD_DETECTION
@@ -7041,6 +7531,34 @@
rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv, __LINE__);
}
+static void
+rack_mark_lost(struct tcpcb *tp,
+ struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t cts)
+{
+ struct rack_sendmap *nrsm;
+ uint32_t thresh, exp;
+
+ thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(tp, rack), cts, __LINE__, 0);
+ nrsm = rsm;
+ TAILQ_FOREACH_FROM(nrsm, &rack->r_ctl.rc_tmap, r_tnext) {
+ if ((nrsm->r_flags & RACK_SACK_PASSED) == 0) {
+ /* Got up to all that were marked sack-passed */
+ break;
+ }
+ if ((nrsm->r_flags & RACK_WAS_LOST) == 0) {
+ exp = ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]) + thresh;
+ if (TSTMP_LT(exp, cts) || (exp == cts)) {
+ /* We now consider it lost */
+ nrsm->r_flags |= RACK_WAS_LOST;
+ rack->r_ctl.rc_considered_lost += nrsm->r_end - nrsm->r_start;
+ } else {
+ /* Past here it won't be lost so stop */
+ break;
+ }
+ }
+ }
+}
+
/*
* RACK Timer, here we simply do logging and house keeping.
* the normal rack_output() function will call the
@@ -7067,6 +7585,8 @@
rsm = rack_check_recovery_mode(tp, cts);
rack_log_to_event(rack, RACK_TO_FRM_RACK, rsm);
if (rsm) {
+ /* We need to mark any segments that are now declared lost */
+ rack_mark_lost(tp, rack, rsm, cts);
rack->r_ctl.rc_resend = rsm;
rack->r_timer_override = 1;
if (rack->use_rack_rr) {
@@ -7088,6 +7608,16 @@
0, 0, 0);
return (1);
}
+ if ((rack->policer_detect_on == 1) &&
+ (rack->rc_policer_detected == 0)) {
+ /*
+ * We do this early, if we have not yet
+ * detected a policer, to attempt to detect
+ * it more quickly. Normally we want to do this
+ * as recovery exits (and we will again).
+ */
+ policer_detection(tp, rack, 0);
+ }
return (0);
}
@@ -7189,13 +7719,14 @@
nrsm->r_start = start;
nrsm->r_end = rsm->r_end;
nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
+ nrsm->r_act_rxt_cnt = rsm->r_act_rxt_cnt;
nrsm->r_flags = rsm->r_flags;
nrsm->r_dupack = rsm->r_dupack;
nrsm->r_no_rtt_allowed = rsm->r_no_rtt_allowed;
nrsm->r_rtr_bytes = 0;
nrsm->r_fas = rsm->r_fas;
nrsm->r_bas = rsm->r_bas;
- rsm->r_end = nrsm->r_start;
+ tqhash_update_end(rack->r_ctl.tqh, rsm, nrsm->r_start);
nrsm->r_just_ret = rsm->r_just_ret;
for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
@@ -7242,7 +7773,7 @@
*/
rack_log_map_chg(rack->rc_tp, rack, NULL,
l_rsm, r_rsm, MAP_MERGE, r_rsm->r_end, __LINE__);
- l_rsm->r_end = r_rsm->r_end;
+ tqhash_update_end(rack->r_ctl.tqh, l_rsm, r_rsm->r_end);
if (l_rsm->r_dupack < r_rsm->r_dupack)
l_rsm->r_dupack = r_rsm->r_dupack;
if (r_rsm->r_rtr_bytes)
@@ -7344,6 +7875,7 @@
*/
rack_log_to_event(rack, RACK_TO_FRM_TLP, NULL);
rack->r_ctl.retran_during_recovery = 0;
+ rack->r_might_revert = 0;
rack->r_ctl.dsack_byte_cnt = 0;
counter_u64_add(rack_tlp_tot, 1);
if (rack->r_state && (rack->r_state != tp->t_state))
@@ -7517,6 +8049,32 @@
return (0);
}
+static inline int
+rack_send_ack_challange(struct tcp_rack *rack)
+{
+ struct tcptemp *t_template;
+
+ t_template = tcpip_maketemplate(rack->rc_inp);
+ if (t_template) {
+ if (rack->forced_ack == 0) {
+ rack->forced_ack = 1;
+ rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL);
+ } else {
+ rack->probe_not_answered = 1;
+ }
+ tcp_respond(rack->rc_tp, t_template->tt_ipgen,
+ &t_template->tt_t, (struct mbuf *)NULL,
+ rack->rc_tp->rcv_nxt, rack->rc_tp->snd_una - 1, 0);
+ free(t_template, M_TEMP);
+ /* This does send an ack so kill any D-ack timer */
+ if (rack->rc_tp->t_flags & TF_DELACK)
+ rack->rc_tp->t_flags &= ~TF_DELACK;
+ return(1);
+ } else
+ return (0);
+
+}
+
/*
* Persists timer, here we simply send the
* same thing as a keepalive will.
@@ -7528,7 +8086,6 @@
static int
rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
{
- struct tcptemp *t_template;
int32_t retval = 1;
if (rack->rc_in_persist == 0)
@@ -7575,26 +8132,14 @@
retval = -ETIMEDOUT; /* tcp_drop() */
goto out;
}
- t_template = tcpip_maketemplate(rack->rc_inp);
- if (t_template) {
+ if (rack_send_ack_challange(rack)) {
/* only set it if we were answered */
- if (rack->forced_ack == 0) {
- rack->forced_ack = 1;
- rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL);
- } else {
- rack->probe_not_answered = 1;
+ if (rack->probe_not_answered) {
counter_u64_add(rack_persists_loss, 1);
rack->r_ctl.persist_lost_ends++;
}
counter_u64_add(rack_persists_sends, 1);
counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1);
- tcp_respond(tp, t_template->tt_ipgen,
- &t_template->tt_t, (struct mbuf *)NULL,
- tp->rcv_nxt, tp->snd_una - 1, 0);
- /* This sends an ack */
- if (tp->t_flags & TF_DELACK)
- tp->t_flags &= ~TF_DELACK;
- free(t_template, M_TEMP);
}
if (tp->t_rxtshift < V_tcp_retries)
tp->t_rxtshift++;
@@ -7614,7 +8159,6 @@
static int
rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
{
- struct tcptemp *t_template;
struct inpcb *inp = tptoinpcb(tp);
rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP;
@@ -7641,19 +8185,7 @@
* respond.
*/
KMOD_TCPSTAT_INC(tcps_keepprobe);
- t_template = tcpip_maketemplate(inp);
- if (t_template) {
- if (rack->forced_ack == 0) {
- rack->forced_ack = 1;
- rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL);
- } else {
- rack->probe_not_answered = 1;
- }
- tcp_respond(tp, t_template->tt_ipgen,
- &t_template->tt_t, (struct mbuf *)NULL,
- tp->rcv_nxt, tp->snd_una - 1, 0);
- free(t_template, M_TEMP);
- }
+ rack_send_ack_challange(rack);
}
rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
return (1);
@@ -7680,8 +8212,26 @@
rack = (struct tcp_rack *)tp->t_fb_ptr;
rack_timer_cancel(tp, rack, tcp_get_usecs(NULL), __LINE__);
rack_log_to_event(rack, RACK_TO_FRM_TMR, NULL);
+ rack->r_timer_override = 1;
+ rack->r_ctl.rc_snd_max_at_rto = tp->snd_max;
+ rack->r_ctl.rc_last_timeout_snduna = tp->snd_una;
+ rack->r_late = 0;
+ rack->r_early = 0;
+ rack->r_ctl.rc_agg_delayed = 0;
+ rack->r_ctl.rc_agg_early = 0;
if (rack->r_state && (rack->r_state != tp->t_state))
rack_set_state(tp, rack);
+ if (tp->t_rxtshift <= rack_rxt_scoreboard_clear_thresh) {
+ /*
+ * We do not clear the scoreboard until we have had
+ * more than rack_rxt_scoreboard_clear_thresh time-outs.
+ */
+ rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
+ if (rack->r_ctl.rc_resend != NULL)
+ rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT;
+
+ return;
+ }
/*
* Ideally we would like to be able to
* mark SACK-PASS on anything not acked here.
@@ -7714,27 +8264,26 @@
trsm = rsm;
if (rsm->r_flags & RACK_ACKED)
rsm->r_flags |= RACK_WAS_ACKED;
- rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS | RACK_RWND_COLLAPSED);
+ rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS | RACK_RWND_COLLAPSED | RACK_WAS_LOST);
rsm->r_flags |= RACK_MUST_RXT;
}
+ /* zero the lost since it's all gone */
+ rack->r_ctl.rc_considered_lost = 0;
/* Clear the count (we just un-acked them) */
- rack->r_ctl.rc_last_timeout_snduna = tp->snd_una;
rack->r_ctl.rc_sacked = 0;
rack->r_ctl.rc_sacklast = NULL;
- rack->r_ctl.rc_agg_delayed = 0;
- rack->r_early = 0;
- rack->r_ctl.rc_agg_early = 0;
- rack->r_late = 0;
/* Clear the tlp rtx mark */
rack->r_ctl.rc_resend = tqhash_min(rack->r_ctl.tqh);
if (rack->r_ctl.rc_resend != NULL)
rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT;
rack->r_ctl.rc_prr_sndcnt = 0;
rack_log_to_prr(rack, 6, 0, __LINE__);
- rack->r_timer_override = 1;
+ rack->r_ctl.rc_resend = tqhash_min(rack->r_ctl.tqh);
+ if (rack->r_ctl.rc_resend != NULL)
+ rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT;
if ((((tp->t_flags & TF_SACK_PERMIT) == 0)
#ifdef TCP_SAD_DETECTION
- || (rack->sack_attack_disable != 0)
+ || (rack->sack_attack_disable != 0)
#endif
) && ((tp->t_flags & TF_SENTFIN) == 0)) {
/*
@@ -7744,9 +8293,8 @@
*/
rack->r_must_retran = 1;
rack->r_ctl.rc_out_at_rto = ctf_flight_size(rack->rc_tp,
- rack->r_ctl.rc_sacked);
+ rack->r_ctl.rc_sacked);
}
- rack->r_ctl.rc_snd_max_at_rto = tp->snd_max;
}
static void
@@ -7829,6 +8377,17 @@
rack->r_ctl.retran_during_recovery = 0;
rack->rc_ack_required = 1;
rack->r_ctl.dsack_byte_cnt = 0;
+ if (IN_RECOVERY(tp->t_flags) &&
+ (rack->rto_from_rec == 0)) {
+ /*
+ * Mark that we had an RTO while in recovery
+ * and save the ssthresh so that if we go back
+ * into recovery we will have a chance
+ * to slow-start back to that level.
+ */
+ rack->rto_from_rec = 1;
+ rack->r_ctl.rto_ssthresh = tp->snd_ssthresh;
+ }
if (IN_FASTRECOVERY(tp->t_flags))
tp->t_flags |= TF_WASFRECOVERY;
else
@@ -7877,7 +8436,6 @@
* retransmit interval. Back off to a longer retransmit interval
* and retransmit one segment.
*/
- rack_remxt_tmr(tp);
if ((rack->r_ctl.rc_resend == NULL) ||
((rack->r_ctl.rc_resend->r_flags & RACK_RWND_COLLAPSED) == 0)) {
/*
@@ -7888,6 +8446,7 @@
*/
tp->t_rxtshift++;
}
+ rack_remxt_tmr(tp);
if (tp->t_rxtshift > V_tcp_retries) {
tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
drop_it:
@@ -8240,23 +8799,124 @@
}
}
+/*
+ * We maintain an array of 16 (RETRAN_CNT_SIZE) entries. This
+ * array is zeroed at the start of recovery. Each time a segment
+ * is retransmitted, we translate that into a number of packets
+ * (based on segsiz) and, based on how many times it has been retransmitted,
+ * increment by that number of packets the counter that represents
+ * retransmitted N times. Index 0 is retransmitted 1 time, index 1
+ * is retransmitted 2 times, etc.
+ *
+ * So for example, when we send a 4344 byte transmission with a 1448
+ * byte segsize, and it is the third time we have retransmitted this
+ * segment, we would add to rc_cnt_of_retran[2] the value of
+ * 3. That represents 3 MSS retransmitted 3 times (the index is
+ * the number of times retransmitted minus 1).
+ */
+static void
+rack_peg_rxt(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t segsiz)
+{
+ int idx;
+ uint32_t peg;
+
+ peg = ((rsm->r_end - rsm->r_start) + segsiz) - 1;
+ peg /= segsiz;
+ idx = rsm->r_act_rxt_cnt - 1;
+ if (idx >= RETRAN_CNT_SIZE)
+ idx = RETRAN_CNT_SIZE - 1;
+ /* Max of a uint16_t retransmits in a bucket */
+ if ((rack->r_ctl.rc_cnt_of_retran[idx] + peg) < 0xffff)
+ rack->r_ctl.rc_cnt_of_retran[idx] += peg;
+ else
+ rack->r_ctl.rc_cnt_of_retran[idx] = 0xffff;
+}
+
+/*
+ * We maintain an array of 16 (RETRAN_CNT_SIZE) entries. This
+ * array is zeroed at the start of recovery. Each time a segment
+ * is retransmitted, we translate that into a number of packets
+ * (based on segsiz) and, based on how many times it has been retransmitted,
+ * increment by that number of packets the counter that represents
+ * retransmitted N times. Index 0 is retransmitted 1 time, index 1
+ * is retransmitted 2 times, etc.
+ *
+ * rack_unpeg_rxt() is used when we go to retransmit a segment
+ * again. Basically, if the segment had previously been retransmitted
+ * say 3 times (as our previous example illustrated in the comment
+ * above rack_peg_rxt()), then prior to calling that and incrementing
+ * r_act_rxt_cnt we would have called rack_unpeg_rxt(), which would
+ * subtract back the previous add from its last rxt (in this
+ * example r_act_rxt_cnt would have been 2 for 2 retransmissions). So
+ * we would have subtracted 3 from rc_cnt_of_retran[1] to remove
+ * those 3 segments. You will see this in rack_update_rsm()
+ * below where we do:
+ * if (rsm->r_act_rxt_cnt > 0) {
+ * rack_unpeg_rxt(rack, rsm, segsiz);
+ * }
+ * rsm->r_act_rxt_cnt++;
+ * rack_peg_rxt(rack, rsm, segsiz);
+ *
+ * This effectively moves the count from rc_cnt_of_retran[1] to
+ * rc_cnt_of_retran[2].
+ */
+static void
+rack_unpeg_rxt(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t segsiz)
+{
+ int idx;
+ uint32_t peg;
+
+ idx = rsm->r_act_rxt_cnt - 1;
+ if (idx >= RETRAN_CNT_SIZE)
+ idx = RETRAN_CNT_SIZE - 1;
+ peg = ((rsm->r_end - rsm->r_start) + segsiz) - 1;
+ peg /= segsiz;
+ if (peg < rack->r_ctl.rc_cnt_of_retran[idx])
+ rack->r_ctl.rc_cnt_of_retran[idx] -= peg;
+ else {
+ /* TSNH */
+ rack->r_ctl.rc_cnt_of_retran[idx] = 0;
+ }
+}
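
To make the bucket move described above concrete, here is a self-contained toy (illustrative names only, not the stack's code) showing how a group of packets at a given retransmit depth leaves one bucket and lands in the next when it is retransmitted once more:

```c
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define RETRAN_CNT_SIZE 16

/*
 * Move `pkts` packets that were at retransmit depth `old_depth`
 * (stored in cnt[old_depth - 1]) into the bucket for the next depth,
 * clamping at the last bucket as the real peg/unpeg pair does.
 */
static void
move_one_bucket_deeper(uint32_t cnt[RETRAN_CNT_SIZE], int old_depth, uint32_t pkts)
{
	int from = old_depth - 1;
	int to = old_depth;

	if (from >= RETRAN_CNT_SIZE)
		from = RETRAN_CNT_SIZE - 1;
	if (to >= RETRAN_CNT_SIZE)
		to = RETRAN_CNT_SIZE - 1;
	/* "unpeg": remove from the bucket of the previous depth */
	if (cnt[from] >= pkts)
		cnt[from] -= pkts;
	else
		cnt[from] = 0;
	/* "peg": add at the new depth */
	cnt[to] += pkts;
}

int
main(void)
{
	uint32_t cnt[RETRAN_CNT_SIZE] = {0};

	cnt[1] = 3;				/* 3 packets retransmitted twice */
	move_one_bucket_deeper(cnt, 2, 3);	/* retransmit them a third time */
	/* Expect bucket[1] = 0 and bucket[2] = 3 */
	printf("bucket[1]=%" PRIu32 " bucket[2]=%" PRIu32 "\n", cnt[1], cnt[2]);
	return (0);
}
```
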
+
static void
rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
- struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag, int segsiz)
+ struct rack_sendmap *rsm, uint64_t ts, uint32_t add_flag, int segsiz)
{
int32_t idx;
rsm->r_rtr_cnt++;
- rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
- rsm->r_dupack = 0;
if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) {
rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS;
rsm->r_flags |= RACK_OVERMAX;
}
+ if (rsm->r_act_rxt_cnt > 0) {
+ /* Drop the count back for this, it's retransmitting again */
+ rack_unpeg_rxt(rack, rsm, segsiz);
+ }
+ rsm->r_act_rxt_cnt++;
+ /* Peg the count/index */
+ rack_peg_rxt(rack, rsm, segsiz);
+ rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
+ rsm->r_dupack = 0;
if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & RACK_TLP) == 0)) {
rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start);
rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start);
}
+ if (rsm->r_flags & RACK_WAS_LOST) {
+ /*
+ * We retransmitted it, putting it back in flight;
+ * remove the lost designation and reduce the
+ * bytes considered lost.
+ */
+ rsm->r_flags &= ~RACK_WAS_LOST;
+ KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)),
+ ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack));
+ if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start))
+ rack->r_ctl.rc_considered_lost -= rsm->r_end - rsm->r_start;
+ else
+ rack->r_ctl.rc_considered_lost = 0;
+ }
idx = rsm->r_rtr_cnt - 1;
rsm->r_tim_lastsent[idx] = ts;
/*
@@ -8304,7 +8964,7 @@
static uint32_t
rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
- struct rack_sendmap *rsm, uint64_t ts, int32_t *lenp, uint16_t add_flag, int segsiz)
+ struct rack_sendmap *rsm, uint64_t ts, int32_t *lenp, uint32_t add_flag, int segsiz)
{
/*
* We (re-)transmitted starting at rsm->r_start for some length
@@ -8381,7 +9041,7 @@
static void
rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
uint32_t seq_out, uint16_t th_flags, int32_t err, uint64_t cts,
- struct rack_sendmap *hintrsm, uint16_t add_flag, struct mbuf *s_mb,
+ struct rack_sendmap *hintrsm, uint32_t add_flag, struct mbuf *s_mb,
uint32_t s_moff, int hw_tls, int segsiz)
{
struct tcp_rack *rack;
@@ -8440,13 +9100,6 @@
len++;
if (th_flags & TH_FIN)
len++;
- if (SEQ_LT(snd_max, tp->snd_nxt)) {
- /*
- * The add/update as not been done for the FIN/SYN
- * yet.
- */
- snd_max = tp->snd_nxt;
- }
}
if (SEQ_LEQ((seq_out + len), snd_una)) {
/* Are sending an old segment to induce an ack (keep-alive)? */
@@ -8492,6 +9145,7 @@
rsm->r_hw_tls = 1;
rsm->r_tim_lastsent[0] = cts;
rsm->r_rtr_cnt = 1;
+ rsm->r_act_rxt_cnt = 0;
rsm->r_rtr_bytes = 0;
if (th_flags & TH_SYN) {
/* The data space is one beyond snd_una */
@@ -8515,6 +9169,10 @@
rsm->r_fas = (ctf_flight_size(rack->rc_tp,
rack->r_ctl.rc_sacked) +
(rsm->r_end - rsm->r_start));
+ if ((rack->rc_initial_ss_comp == 0) &&
+ (rack->r_ctl.ss_hi_fs < rsm->r_fas)) {
+ rack->r_ctl.ss_hi_fs = rsm->r_fas;
+ }
/* rsm->m will be NULL if RACK_HAS_SYN or RACK_HAS_FIN is set */
if (rsm->m) {
if (rsm->m->m_len <= rsm->soff) {
@@ -8558,6 +9216,13 @@
#endif
TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
rsm->r_in_tmap = 1;
+ if (rsm->r_flags & RACK_IS_PCM) {
+ rack->r_ctl.pcm_i.send_time = cts;
+ rack->r_ctl.pcm_i.eseq = rsm->r_end;
+ /* First time through we set the start too */
+ if (rack->pcm_in_progress == 0)
+ rack->r_ctl.pcm_i.sseq = rsm->r_start;
+ }
/*
* Special case detection, is there just a single
* packet outstanding when we are not in recovery?
@@ -8886,6 +9551,7 @@
}
stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_PATHRTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt));
#endif
+ rack->r_ctl.last_rcv_tstmp_for_rtt = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time);
/*
* the retransmit should happen at rtt + 4 * rttvar. Because of the
* way we do the smoothing, srtt and rttvar will each average +1/2
@@ -8939,6 +9605,7 @@
val = rack_probertt_lower_within * rack_time_between_probertt;
val /= 100;
if ((rack->in_probe_rtt == 0) &&
+ (rack->rc_skip_timely == 0) &&
((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val))) {
rack_enter_probertt(rack, us_cts);
}
@@ -9051,7 +9718,7 @@
(!IN_FASTRECOVERY(tp->t_flags))) {
/* Segment was a TLP and our retrans matched */
if (rack->r_ctl.rc_tlp_cwnd_reduce) {
- rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__);
+ rack_cong_signal(tp, CC_NDUPACK, th_ack, __LINE__);
}
}
if ((rack->r_ctl.rc_rack_tmit_time == 0) ||
@@ -9198,10 +9865,14 @@
*/
static void
rack_log_sack_passed(struct tcpcb *tp,
- struct tcp_rack *rack, struct rack_sendmap *rsm)
+ struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t cts)
{
struct rack_sendmap *nrsm;
+ uint32_t thresh;
+ /* Get our rxt threshold for lost consideration */
+ thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(tp, rack), cts, __LINE__, 0);
+ /* Now start looking at rsm's */
nrsm = rsm;
TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap,
rack_head, r_tnext) {
@@ -9224,6 +9895,17 @@
*/
continue;
}
+ /* Check lost state */
+ if ((nrsm->r_flags & RACK_WAS_LOST) == 0) {
+ uint32_t exp;
+
+ exp = ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]) + thresh;
+ if (TSTMP_LT(exp, cts) || (exp == cts)) {
+ /* We consider it lost */
+ nrsm->r_flags |= RACK_WAS_LOST;
+ rack->r_ctl.rc_considered_lost += nrsm->r_end - nrsm->r_start;
+ }
+ }
if (nrsm->r_flags & RACK_SACK_PASSED) {
/*
* We found one that is already marked
@@ -9407,8 +10089,6 @@
return (1);
}
-
-
static uint32_t
rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack,
struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts,
@@ -9625,16 +10305,11 @@
(rsm->bindex == next->bindex) &&
((rsm->r_flags & RACK_STRADDLE) == 0) &&
((next->r_flags & RACK_STRADDLE) == 0) &&
+ ((rsm->r_flags & RACK_IS_PCM) == 0) &&
+ ((next->r_flags & RACK_IS_PCM) == 0) &&
(rsm->r_flags & RACK_IN_GP_WIN) &&
(next->r_flags & RACK_IN_GP_WIN))
can_use_hookery = 1;
- else if (next &&
- (rsm->bindex == next->bindex) &&
- ((rsm->r_flags & RACK_STRADDLE) == 0) &&
- ((next->r_flags & RACK_STRADDLE) == 0) &&
- ((rsm->r_flags & RACK_IN_GP_WIN) == 0) &&
- ((next->r_flags & RACK_IN_GP_WIN) == 0))
- can_use_hookery = 1;
else
can_use_hookery = 0;
if (next && can_use_hookery &&
@@ -9661,7 +10336,7 @@
nrsm = &stack_map;
memcpy(nrsm, rsm, sizeof(struct rack_sendmap));
/* Now adjust our tree blocks */
- rsm->r_end = start;
+ tqhash_update_end(rack->r_ctl.tqh, rsm, start);
next->r_start = start;
rsm->r_flags |= RACK_SHUFFLED;
next->r_flags |= RACK_SHUFFLED;
@@ -9712,6 +10387,17 @@
if ((nrsm->r_end - nrsm->r_start) >= segsiz)
rack->r_ctl.ack_count += ((nrsm->r_end - nrsm->r_start) / segsiz);
rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start);
+ if (rsm->r_flags & RACK_WAS_LOST) {
+ int my_chg;
+
+ my_chg = (nrsm->r_end - nrsm->r_start);
+ KASSERT((rack->r_ctl.rc_considered_lost >= my_chg),
+ ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack));
+ if (my_chg <= rack->r_ctl.rc_considered_lost)
+ rack->r_ctl.rc_considered_lost -= my_chg;
+ else
+ rack->r_ctl.rc_considered_lost = 0;
+ }
if (nrsm->r_flags & RACK_SACK_PASSED) {
rack->r_ctl.rc_reorder_ts = cts;
if (rack->r_ctl.rc_reorder_ts == 0)
@@ -9734,7 +10420,7 @@
* one walk backwards from there.
*/
if (nrsm && nrsm->r_in_tmap)
- rack_log_sack_passed(tp, rack, nrsm);
+ rack_log_sack_passed(tp, rack, nrsm, cts);
}
/* Now are we done? */
if (SEQ_LT(end, next->r_end) ||
@@ -9875,9 +10561,21 @@
/* You get a count for acking a whole segment or more */
if ((rsm->r_end - rsm->r_start) >= segsiz)
rack->r_ctl.ack_count += ((rsm->r_end - rsm->r_start) / segsiz);
+ if (rsm->r_flags & RACK_WAS_LOST) {
+ int my_chg;
+
+ my_chg = (rsm->r_end - rsm->r_start);
+ rsm->r_flags &= ~RACK_WAS_LOST;
+ KASSERT((rack->r_ctl.rc_considered_lost >= my_chg),
+ ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack));
+ if (my_chg <= rack->r_ctl.rc_considered_lost)
+ rack->r_ctl.rc_considered_lost -= my_chg;
+ else
+ rack->r_ctl.rc_considered_lost = 0;
+ }
rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
if (rsm->r_in_tmap) /* should be true */
- rack_log_sack_passed(tp, rack, rsm);
+ rack_log_sack_passed(tp, rack, rsm, cts);
/* Is Reordering occuring? */
if (rsm->r_flags & RACK_SACK_PASSED) {
rsm->r_flags &= ~RACK_SACK_PASSED;
@@ -9889,6 +10587,7 @@
rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END);
rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
rsm->r_flags |= RACK_ACKED;
+ rack_update_pcm_ack(rack, 0, rsm->r_start, rsm->r_end);
if (rsm->r_in_tmap) {
TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
rsm->r_in_tmap = 0;
@@ -9968,19 +10667,13 @@
(rsm->bindex == prev->bindex) &&
((rsm->r_flags & RACK_STRADDLE) == 0) &&
((prev->r_flags & RACK_STRADDLE) == 0) &&
+ ((rsm->r_flags & RACK_IS_PCM) == 0) &&
+ ((prev->r_flags & RACK_IS_PCM) == 0) &&
(rsm->r_flags & RACK_IN_GP_WIN) &&
(prev->r_flags & RACK_IN_GP_WIN))
can_use_hookery = 1;
- else if (prev &&
- (rsm->bindex == prev->bindex) &&
- ((rsm->r_flags & RACK_STRADDLE) == 0) &&
- ((prev->r_flags & RACK_STRADDLE) == 0) &&
- ((rsm->r_flags & RACK_IN_GP_WIN) == 0) &&
- ((prev->r_flags & RACK_IN_GP_WIN) == 0))
- can_use_hookery = 1;
else
can_use_hookery = 0;
-
if (prev && can_use_hookery &&
(prev->r_flags & RACK_ACKED)) {
/**
@@ -10003,7 +10696,7 @@
noextra++;
nrsm = &stack_map;
memcpy(nrsm, rsm, sizeof(struct rack_sendmap));
- prev->r_end = end;
+ tqhash_update_end(rack->r_ctl.tqh, prev, end);
rsm->r_start = end;
rsm->r_flags |= RACK_SHUFFLED;
prev->r_flags |= RACK_SHUFFLED;
@@ -10064,6 +10757,17 @@
rack->r_ctl.ack_count += ((nrsm->r_end - nrsm->r_start) / segsiz);
rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start);
+ if (rsm->r_flags & RACK_WAS_LOST) {
+ int my_chg;
+
+ my_chg = (nrsm->r_end - nrsm->r_start);
+ KASSERT((rack->r_ctl.rc_considered_lost >= my_chg),
+ ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack));
+ if (my_chg <= rack->r_ctl.rc_considered_lost)
+ rack->r_ctl.rc_considered_lost -= my_chg;
+ else
+ rack->r_ctl.rc_considered_lost = 0;
+ }
if (nrsm->r_flags & RACK_SACK_PASSED) {
rack->r_ctl.rc_reorder_ts = cts;
if (rack->r_ctl.rc_reorder_ts == 0)
@@ -10160,10 +10864,22 @@
/* You get a count for acking a whole segment or more */
if ((rsm->r_end - rsm->r_start) >= segsiz)
rack->r_ctl.ack_count += ((rsm->r_end - rsm->r_start) / segsiz);
-
+ if (rsm->r_flags & RACK_WAS_LOST) {
+ int my_chg;
+
+ my_chg = (rsm->r_end - rsm->r_start);
+ rsm->r_flags &= ~RACK_WAS_LOST;
+ KASSERT((rack->r_ctl.rc_considered_lost >= my_chg),
+ ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack));
+ if (my_chg <= rack->r_ctl.rc_considered_lost)
+ rack->r_ctl.rc_considered_lost -= my_chg;
+ else
+ rack->r_ctl.rc_considered_lost = 0;
+ }
rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
+
if (rsm->r_in_tmap) /* should be true */
- rack_log_sack_passed(tp, rack, rsm);
+ rack_log_sack_passed(tp, rack, rsm, cts);
/* Is Reordering occuring? */
if (rsm->r_flags & RACK_SACK_PASSED) {
rsm->r_flags &= ~RACK_SACK_PASSED;
@@ -10175,6 +10891,7 @@
rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END);
rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
rsm->r_flags |= RACK_ACKED;
+ rack_update_pcm_ack(rack, 0, rsm->r_start, rsm->r_end);
rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M5, end, __LINE__);
if (rsm->r_in_tmap) {
TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
@@ -10214,8 +10931,12 @@
break;
if (rsm->r_flags & RACK_STRADDLE)
break;
+ if (rsm->r_flags & RACK_IS_PCM)
+ break;
if (next->r_flags & RACK_STRADDLE)
break;
+ if (next->r_flags & RACK_IS_PCM)
+ break;
if (next->r_flags & RACK_ACKED) {
/* yep this and next can be merged */
rsm = rack_merge_rsm(rack, rsm, next);
@@ -10242,8 +10963,12 @@
break;
if (rsm->r_flags & RACK_STRADDLE)
break;
+ if (rsm->r_flags & RACK_IS_PCM)
+ break;
if (prev->r_flags & RACK_STRADDLE)
break;
+ if (prev->r_flags & RACK_IS_PCM)
+ break;
if (prev->r_flags & RACK_ACKED) {
/* yep the previous and this can be merged */
rsm = rack_merge_rsm(rack, prev, rsm);
@@ -10264,6 +10989,9 @@
/* Pass back the moved. */
*moved_two = moved;
*no_extra = noextra;
+ if (IN_RECOVERY(tp->t_flags)) {
+ rack->r_ctl.bytes_acked_in_recovery += changed;
+ }
return (changed);
}
@@ -10464,6 +11192,17 @@
* RTT's.
*/
+ if (sack_filter_blks_used(&rack->r_ctl.rack_sf)) {
+ /*
+ * If we have some sack blocks in the filter
+ * let's prune them out by calling sfb with no blocks.
+ */
+ sack_filter_blks(&rack->r_ctl.rack_sf, NULL, 0, th_ack);
+ }
+ if (SEQ_GT(th_ack, tp->snd_una)) {
+ /* Clear any app ack remembered settings */
+ rack->r_ctl.cleared_app_ack = 0;
+ }
rack->r_wanted_output = 1;
if (SEQ_GT(th_ack, tp->snd_una))
rack->r_ctl.last_cumack_advance = acktime;
@@ -10533,10 +11272,10 @@
return;
}
#ifdef INVARIANTS
- panic("No rack map tp:%p for state:%d ack:%u rack:%p snd_una:%u snd_max:%u snd_nxt:%u\n",
+ panic("No rack map tp:%p for state:%d ack:%u rack:%p snd_una:%u snd_max:%u\n",
tp,
tp->t_state, th_ack, rack,
- tp->snd_una, tp->snd_max, tp->snd_nxt);
+ tp->snd_una, tp->snd_max);
#endif
return;
}
@@ -10599,6 +11338,20 @@
uint32_t left;
uint8_t newly_acked;
+ if (rsm->r_flags & RACK_WAS_LOST) {
+ /*
+ * This can happen when we marked it as lost
+ * and yet before retransmitting we get an ack
+ * which can happen due to reordering.
+ */
+ rsm->r_flags &= ~RACK_WAS_LOST;
+ KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)),
+ ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack));
+ if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start))
+ rack->r_ctl.rc_considered_lost -= rsm->r_end - rsm->r_start;
+ else
+ rack->r_ctl.rc_considered_lost = 0;
+ }
rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_FREE, rsm->r_end, __LINE__);
rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
rsm->r_rtr_bytes = 0;
@@ -10613,6 +11366,10 @@
rsm->r_in_tmap = 0;
}
newly_acked = 1;
+ if (((rsm->r_flags & RACK_ACKED) == 0) &&
+ (IN_RECOVERY(tp->t_flags))) {
+ rack->r_ctl.bytes_acked_in_recovery += (rsm->r_end - rsm->r_start);
+ }
if (rsm->r_flags & RACK_ACKED) {
/*
* It was acked on the scoreboard -- remove
@@ -10639,6 +11396,9 @@
*/
rack->r_might_revert = 1;
}
+ rack_update_pcm_ack(rack, 1, rsm->r_start, rsm->r_end);
+ } else {
+ rack_update_pcm_ack(rack, 1, rsm->r_start, rsm->r_end);
}
if ((rsm->r_flags & RACK_TO_REXT) &&
(tp->t_flags & TF_RCVD_TSTMP) &&
@@ -10691,6 +11451,27 @@
* total for the part being cum-acked.
*/
rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start);
+ } else {
+ if (((rsm->r_flags & RACK_ACKED) == 0) &&
+ (IN_RECOVERY(tp->t_flags))) {
+ rack->r_ctl.bytes_acked_in_recovery += (th_ack - rsm->r_start);
+ }
+ rack_update_pcm_ack(rack, 1, rsm->r_start, th_ack);
+ }
+ /* And what about the lost flag? */
+ if (rsm->r_flags & RACK_WAS_LOST) {
+ /*
+ * This can happen when we marked it as lost
+ * and yet before retransmitting we get an ack
+ * which can happen due to reordering. In this
+ * case it's only a partial ack of the send.
+ */
+ KASSERT((rack->r_ctl.rc_considered_lost >= (th_ack - rsm->r_start)),
+ ("rsm:%p rack:%p rc_considered_lost goes negative th_ack:%u", rsm, rack, th_ack));
+ if (rack->r_ctl.rc_considered_lost >= (th_ack - rsm->r_start))
+ rack->r_ctl.rc_considered_lost -= th_ack - rsm->r_start;
+ else
+ rack->r_ctl.rc_considered_lost = 0;
}
/*
* Clear the dup ack count for
@@ -10807,7 +11588,26 @@
tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at_erec;
tp->snd_recover = tp->snd_una;
rack_log_to_prr(rack, 14, orig_cwnd, __LINE__);
- EXIT_RECOVERY(tp->t_flags);
+ if (IN_RECOVERY(tp->t_flags)) {
+ rack_exit_recovery(tp, rack, 3);
+ if ((rack->rto_from_rec == 1) && (rack_ssthresh_rest_rto_rec != 0) ){
+ /*
+ * We were in recovery, had an RTO
+ * and then re-entered recovery (more SACKs arrived)
+ * and we have properly recorded the old ssthresh from
+ * the first recovery. We want to be able to slow-start
+ * back to this level. The ssthresh from the timeout
+ * and then back into recovery will most likely end up
+ * being min(cwnd=1mss, 2mss), which basically means
+ * we get no slow-start after our RTO.
+ */
+ rack->rto_from_rec = 0;
+ if (rack->r_ctl.rto_ssthresh > tp->snd_ssthresh)
+ tp->snd_ssthresh = rack->r_ctl.rto_ssthresh;
+ }
+ }
+ rack->r_ctl.bytes_acked_in_recovery = 0;
+ rack->r_ctl.time_entered_recovery = 0;
}
rack->r_might_revert = 0;
}
@@ -11062,7 +11862,8 @@
static uint32_t
do_rack_compute_pipe(struct tcpcb *tp, struct tcp_rack *rack, uint32_t snd_una)
{
- return (((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt);
+ return (((tp->snd_max - snd_una) -
+ (rack->r_ctl.rc_sacked + rack->r_ctl.rc_considered_lost)) + rack->r_ctl.rc_holes_rxt);
}
static int32_t
@@ -11505,7 +12306,7 @@
((rsm->r_flags & RACK_MUST_RXT) == 0)) {
/* Enter recovery */
entered_recovery = 1;
- rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__);
+ rack_cong_signal(tp, CC_NDUPACK, th_ack, __LINE__);
/*
* When we enter recovery we need to assure we send
* one packet.
@@ -11547,7 +12348,7 @@
}
static void
-rack_strike_dupack(struct tcp_rack *rack)
+rack_strike_dupack(struct tcp_rack *rack, tcp_seq th_ack)
{
struct rack_sendmap *rsm;
@@ -11581,7 +12382,7 @@
if (rack->r_ctl.rc_resend != NULL) {
if (!IN_FASTRECOVERY(rack->rc_tp->t_flags)) {
rack_cong_signal(rack->rc_tp, CC_NDUPACK,
- rack->rc_tp->snd_una, __LINE__);
+ th_ack, __LINE__);
}
rack->r_wanted_output = 1;
rack->r_timer_override = 1;
@@ -11598,6 +12399,25 @@
struct tcp_rack *rack,
struct socket *so)
{
+ /*
+ * So what is dragging bottom?
+ *
+ * Dragging bottom means you were under pacing and had a
+ * delay in processing inbound acks waiting on our pacing
+ * timer to expire. While you were waiting, all of the acknowledgments
+ * for the packets you sent arrived. This means we are pacing
+ * way underneath the bottleneck to the point where our Goodput
+ * measurements stop working, since they require more than one
+ * ack (usually at least 8 packets worth with multiple acks so we can
+ * gauge the inter-ack times). If that occurs we have a real problem
+ * since we are stuck in a hole that we can't get out of without
+ * something speeding us up.
+ *
+ * We also check to see if we are whittling down to just one segment
+ * outstanding. If this occurs and we have room to send in our cwnd/rwnd
+ * then we are adding the delayed ack interval into our measurements and
+ * we need to speed up slightly.
+ */
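
The idea in that comment can be sketched as a small predicate; this is illustrative only (the real checks live in the function body below and are not reproduced by this hunk), assuming flight size, segment size, cwnd and rwnd as inputs:

```c
#include <stdbool.h>
#include <stdint.h>

/*
 * Sketch: roughly one segment (or less) is outstanding while cwnd and
 * rwnd would allow more, so the delayed-ack interval is polluting the
 * goodput measurement and pacing should speed up slightly.
 */
static bool
likely_dragging_bottom(uint32_t flight_bytes, uint32_t segsiz,
    uint32_t cwnd, uint32_t rwnd)
{
	uint32_t room = (cwnd < rwnd) ? cwnd : rwnd;

	return (flight_bytes <= segsiz && room > flight_bytes + segsiz);
}
```
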
uint32_t segsiz, minseg;
segsiz = ctf_fixed_maxseg(tp);
@@ -11614,10 +12434,13 @@
*/
uint64_t lt_bw;
+ tcp_trace_point(rack->rc_tp, TCP_TP_PACED_BOTTOM);
lt_bw = rack_get_lt_bw(rack);
rack->rc_dragged_bottom = 1;
rack_validate_multipliers_at_or_above100(rack);
if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_VALID) &&
+ (rack->dis_lt_bw == 0) &&
+ (rack->use_lesser_lt_bw == 0) &&
(lt_bw > 0)) {
/*
* Lets use the long-term b/w we have
@@ -11729,7 +12552,7 @@
log.u_bbr.delivered = (uint32_t)((cur->end >> 32) & 0x00000000ffffffff) ;
log.u_bbr.epoch = (uint32_t)(cur->deadline & 0x00000000ffffffff);
log.u_bbr.lt_epoch = (uint32_t)((cur->deadline >> 32) & 0x00000000ffffffff) ;
- log.u_bbr.bbr_state = 1;
+ log.u_bbr.inhpts = 1;
#ifdef TCP_REQUEST_TRK
off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_tcpreq_info[0]);
log.u_bbr.use_lt_bw = (uint8_t)(off / sizeof(struct tcp_sendfile_track));
@@ -11745,6 +12568,20 @@
log.u_bbr.flex7 |= rack->rc_hybrid_mode;
log.u_bbr.flex7 <<= 1;
log.u_bbr.flex7 |= rack->dgp_on;
+ /*
+ * Compose bbr_state to be bitwise 0000ADHF
+ * where A is the always_pace flag
+ * where D is the dgp_on flag
+ * where H is the hybrid_mode on flag
+ * where F is the use_fixed_rate flag.
+ */
+ log.u_bbr.bbr_state = rack->rc_always_pace;
+ log.u_bbr.bbr_state <<= 1;
+ log.u_bbr.bbr_state |= rack->dgp_on;
+ log.u_bbr.bbr_state <<= 1;
+ log.u_bbr.bbr_state |= rack->rc_hybrid_mode;
+ log.u_bbr.bbr_state <<= 1;
+ log.u_bbr.bbr_state |= rack->use_fixed_rate;
log.u_bbr.flex8 = mod;
log.u_bbr.delRate = rack->r_ctl.bw_rate_cap;
log.u_bbr.bbr_substate = rack->r_ctl.client_suggested_maxseg;
@@ -11763,12 +12600,13 @@
#ifdef TCP_REQUEST_TRK
static void
-rack_set_dgp_hybrid_mode(struct tcp_rack *rack, tcp_seq seq, uint32_t len)
+rack_set_dgp_hybrid_mode(struct tcp_rack *rack, tcp_seq seq, uint32_t len, uint64_t cts)
{
- struct tcp_sendfile_track *rc_cur;
+ struct tcp_sendfile_track *rc_cur, *orig_ent;
struct tcpcb *tp;
int err = 0;
+ orig_ent = rack->r_ctl.rc_last_sft;
rc_cur = tcp_req_find_req_for_seq(rack->rc_tp, seq);
if (rc_cur == NULL) {
/* If not in the beginning what about the end piece */
@@ -11781,11 +12619,17 @@
/* If we find no parameters we are in straight DGP mode */
if(rc_cur == NULL) {
/* None found for this seq, just DGP for now */
- rack->r_ctl.client_suggested_maxseg = 0;
- rack->rc_catch_up = 0;
- rack->r_ctl.bw_rate_cap = 0;
- if (rack->rc_hybrid_mode)
+ if (rack->rc_hybrid_mode) {
+ rack->r_ctl.client_suggested_maxseg = 0;
+ rack->rc_catch_up = 0;
+ if (rack->cspr_is_fcc == 0)
+ rack->r_ctl.bw_rate_cap = 0;
+ else
+ rack->r_ctl.fillcw_cap = rack_fillcw_bw_cap;
+ }
+ if (rack->rc_hybrid_mode) {
rack_log_hybrid(rack, (seq + len - 1), NULL, HYBRID_LOG_NO_RANGE, __LINE__, err);
+ }
if (rack->r_ctl.rc_last_sft) {
rack->r_ctl.rc_last_sft = NULL;
}
@@ -11793,6 +12637,20 @@
}
if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_WASSET) == 0) {
/* This entry was never setup for hybrid pacing on/off etc */
+ if (rack->rc_hybrid_mode) {
+ rack->r_ctl.client_suggested_maxseg = 0;
+ rack->rc_catch_up = 0;
+ rack->r_ctl.bw_rate_cap = 0;
+ }
+ if (rack->r_ctl.rc_last_sft) {
+ rack->r_ctl.rc_last_sft = NULL;
+ }
+ if ((rc_cur->flags & TCP_TRK_TRACK_FLG_FSND) == 0) {
+ rc_cur->flags |= TCP_TRK_TRACK_FLG_FSND;
+ rc_cur->first_send = cts;
+ rc_cur->sent_at_fs = rack->rc_tp->t_sndbytes;
+ rc_cur->rxt_at_fs = rack->rc_tp->t_snd_rxt_bytes;
+ }
return;
}
/*
@@ -11812,18 +12670,40 @@
}
if (rack->rc_hybrid_mode == 0) {
rack->r_ctl.rc_last_sft = rc_cur;
+ if (orig_ent) {
+ orig_ent->sent_at_ls = rack->rc_tp->t_sndbytes;
+ orig_ent->rxt_at_ls = rack->rc_tp->t_snd_rxt_bytes;
+ orig_ent->flags |= TCP_TRK_TRACK_FLG_LSND;
+ }
rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_RULES_APP, __LINE__, 0);
return;
}
if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_CSPR) && rc_cur->cspr){
/* Compensate for all the header overhead's */
- rack->r_ctl.bw_rate_cap = rack_compensate_for_linerate(rack, rc_cur->cspr);
- } else
- rack->r_ctl.bw_rate_cap = 0;
+ if (rack->cspr_is_fcc == 0)
+ rack->r_ctl.bw_rate_cap = rack_compensate_for_linerate(rack, rc_cur->cspr);
+ else
+ rack->r_ctl.fillcw_cap = rack_compensate_for_linerate(rack, rc_cur->cspr);
+ } else {
+ if (rack->rc_hybrid_mode) {
+ if (rack->cspr_is_fcc == 0)
+ rack->r_ctl.bw_rate_cap = 0;
+ else
+ rack->r_ctl.fillcw_cap = rack_fillcw_bw_cap;
+ }
+ }
if (rc_cur->hybrid_flags & TCP_HYBRID_PACING_H_MS)
rack->r_ctl.client_suggested_maxseg = rc_cur->hint_maxseg;
else
rack->r_ctl.client_suggested_maxseg = 0;
+ if (rc_cur->timestamp == rack->r_ctl.last_tm_mark) {
+ /*
+		 * It is the same timestamp as the previous one;
+		 * add the hybrid flag that indicates we use
+		 * send time, not arrival time, for catch-up mode.
+ */
+ rc_cur->hybrid_flags |= TCP_HYBRID_PACING_SENDTIME;
+ }
if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_CU) &&
(rc_cur->cspr > 0)) {
uint64_t len;
@@ -11833,7 +12713,20 @@
* Calculate the deadline time, first set the
* time to when the request arrived.
*/
- rc_cur->deadline = rc_cur->localtime;
+ if (rc_cur->hybrid_flags & TCP_HYBRID_PACING_SENDTIME) {
+ /*
+			 * For cases where it's a duplicate tm (we received more
+			 * than one request for the same tm) we want to use now, the
+			 * point where we are just sending the first bit of the request.
+ */
+ rc_cur->deadline = cts;
+ } else {
+ /*
+ * Here we have a different tm from the last request
+ * so we want to use arrival time as our base.
+ */
+ rc_cur->deadline = rc_cur->localtime;
+ }
/*
* Next calculate the length and compensate for
* TLS if need be.
@@ -11867,9 +12760,15 @@
*/
rack_set_pace_segments(tp, rack, __LINE__, NULL);
}
+ if (orig_ent) {
+ orig_ent->sent_at_ls = rack->rc_tp->t_sndbytes;
+ orig_ent->rxt_at_ls = rack->rc_tp->t_snd_rxt_bytes;
+ orig_ent->flags |= TCP_TRK_TRACK_FLG_LSND;
+ }
rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_RULES_APP, __LINE__, 0);
/* Remember it for next time and for CU mode */
rack->r_ctl.rc_last_sft = rc_cur;
+ rack->r_ctl.last_tm_mark = rc_cur->timestamp;
}
#endif
@@ -11884,7 +12783,7 @@
(ent->flags == TCP_TRK_TRACK_FLG_EMPTY) ||
(SEQ_GEQ(seq, ent->end_seq))) {
/* Time to update the track. */
- rack_set_dgp_hybrid_mode(rack, seq, len);
+ rack_set_dgp_hybrid_mode(rack, seq, len, cts);
ent = rack->r_ctl.rc_last_sft;
}
/* Out of all */
@@ -12116,8 +13015,17 @@
* if so be sure to NULL the pointer so we know we are no longer
* set to anything.
*/
- if (ent == rack->r_ctl.rc_last_sft)
+ if (ent == rack->r_ctl.rc_last_sft) {
rack->r_ctl.rc_last_sft = NULL;
+ if (rack->rc_hybrid_mode) {
+ rack->rc_catch_up = 0;
+ if (rack->cspr_is_fcc == 0)
+ rack->r_ctl.bw_rate_cap = 0;
+ else
+ rack->r_ctl.fillcw_cap = rack_fillcw_bw_cap;
+ rack->r_ctl.client_suggested_maxseg = 0;
+ }
+ }
/* Generate the log that the tcp_netflix call would have */
tcp_req_log_req_info(rack->rc_tp, ent,
i, TCP_TRK_REQ_LOG_FREED, 0, 0);
@@ -12139,7 +13047,7 @@
rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
struct tcpcb *tp, struct tcpopt *to,
uint32_t tiwin, int32_t tlen,
- int32_t * ofia, int32_t thflags, int32_t *ret_val)
+ int32_t * ofia, int32_t thflags, int32_t *ret_val, int32_t orig_tlen)
{
int32_t ourfinisacked = 0;
int32_t nsegs, acked_amount;
@@ -12147,7 +13055,8 @@
struct mbuf *mfree;
struct tcp_rack *rack;
int32_t under_pacing = 0;
- int32_t recovery = 0;
+ int32_t post_recovery = 0;
+ uint32_t p_cwnd;
INP_WLOCK_ASSERT(tptoinpcb(tp));
@@ -12176,8 +13085,9 @@
if ((th->th_ack == tp->snd_una) &&
(tiwin == tp->snd_wnd) &&
+ (orig_tlen == 0) &&
((to->to_flags & TOF_SACK) == 0)) {
- rack_strike_dupack(rack);
+ rack_strike_dupack(rack, th->th_ack);
dup_ack_struck = 1;
}
rack_log_ack(tp, to, th, ((in_rec == 0) && IN_FASTRECOVERY(tp->t_flags)),
@@ -12185,6 +13095,7 @@
if ((rack->sack_attack_disable > 0) &&
(th->th_ack == tp->snd_una) &&
(tiwin == tp->snd_wnd) &&
+ (orig_tlen == 0) &&
(dsack_seen == 0) &&
(sacks_seen > 0)) {
/*
@@ -12197,7 +13108,7 @@
* were we are ignoring sacks from this guy due to
* it being a suspected attacker.
*/
- rack_strike_dupack(rack);
+ rack_strike_dupack(rack, th->th_ack);
}
}
@@ -12306,15 +13217,37 @@
tcp_rack_partialack(tp);
} else {
rack_post_recovery(tp, th->th_ack);
- recovery = 1;
+ post_recovery = 1;
+ /*
+			 * Grab the segsiz, multiply by 2 and add the snd_cwnd;
+			 * that is the max cwnd the CC should leave us with if
+			 * we are exiting recovery and doing a late add.
+ */
+ p_cwnd = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
+ p_cwnd <<= 1;
+ p_cwnd += tp->snd_cwnd;
}
+ } else if ((rack->rto_from_rec == 1) &&
+ SEQ_GEQ(th->th_ack, tp->snd_recover)) {
+ /*
+ * We were in recovery, hit a rxt timeout
+ * and never re-entered recovery. The timeout(s)
+ * made up all the lost data. In such a case
+ * we need to clear the rto_from_rec flag.
+ */
+ rack->rto_from_rec = 0;
}
/*
* Let the congestion control algorithm update congestion control
* related information. This typically means increasing the
* congestion window.
*/
- rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, recovery);
+ rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, post_recovery);
+ if (post_recovery &&
+ (tp->snd_cwnd > p_cwnd)) {
+ /* Must be non-newreno (cubic) getting too ahead of itself */
+ tp->snd_cwnd = p_cwnd;
+ }
SOCKBUF_LOCK(&so->so_snd);
acked_amount = min(acked, (int)sbavail(&so->so_snd));
tp->snd_wnd -= acked_amount;
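/*
 * A small sketch of the post-recovery clamp computed above: snd_cwnd is
 * sampled before the CC sees the exit-recovery ACK and is then allowed
 * to grow by at most two (minimum) segments.  The helper name and types
 * are illustrative; only the arithmetic follows the patch.
 */
#include <stdint.h>

static uint32_t
post_recovery_cwnd_cap(uint32_t cwnd_before_ack, uint32_t maxseg, uint32_t pace_min_segs)
{
	uint32_t seg = (maxseg < pace_min_segs) ? maxseg : pace_min_segs;

	/* e.g. cwnd 100000 and a 1448 byte segment give a cap of 102896 */
	return (cwnd_before_ack + (seg << 1));
}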
@@ -12338,13 +13271,6 @@
rack_log_wakeup(tp,rack, &so->so_snd, acked, 2);
/* NB: sowwakeup_locked() does an implicit unlock. */
sowwakeup_locked(so);
- /* now check the rxt clamps */
- if ((recovery == 1) &&
- (rack->excess_rxt_on) &&
- (rack->r_cwnd_was_clamped == 0)) {
- do_rack_excess_rxt(tp, rack);
- } else if (rack->r_cwnd_was_clamped)
- do_rack_check_for_unclamp(tp, rack);
m_freem(mfree);
if (SEQ_GT(tp->snd_una, tp->snd_recover))
tp->snd_recover = tp->snd_una;
@@ -12363,11 +13289,12 @@
if (tp->snd_una == tp->snd_max) {
/* Nothing left outstanding */
tp->t_flags &= ~TF_PREVVALID;
+ rack->r_ctl.idle_snd_una = tp->snd_una;
rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
- rack->r_ctl.retran_during_recovery = 0;
- rack->r_ctl.dsack_byte_cnt = 0;
if (rack->r_ctl.rc_went_idle_time == 0)
rack->r_ctl.rc_went_idle_time = 1;
+ rack->r_ctl.retran_during_recovery = 0;
+ rack->r_ctl.dsack_byte_cnt = 0;
rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
if (sbavail(&tptosocket(tp)->so_snd) == 0)
tp->t_acktime = 0;
@@ -12562,7 +13489,6 @@
}
}
-
/*
* Return value of 1, the TCB is unlocked and most
* likely gone, return value of 0, the TCP is still
@@ -12713,12 +13639,20 @@
SOCKBUF_LOCK(&so->so_rcv);
if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
m_freem(m);
- } else
+ } else {
+ int32_t newsize;
+
+ if (tlen > 0) {
+ newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
+ if (newsize)
+ if (!sbreserve_locked(so, SO_RCV, newsize, NULL))
+ so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
+ }
#ifdef NETFLIX_SB_LIMITS
appended =
#endif
sbappendstream_locked(&so->so_rcv, m, 0);
-
+ }
rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1);
/* NB: sorwakeup_locked() does an implicit unlock. */
sorwakeup_locked(so);
@@ -12877,9 +13811,6 @@
if (__predict_false(th->th_seq != tp->rcv_nxt)) {
return (0);
}
- if (__predict_false(tp->snd_nxt != tp->snd_max)) {
- return (0);
- }
if (tiwin && tiwin != tp->snd_wnd) {
return (0);
}
@@ -13005,10 +13936,6 @@
/* Above what we have sent? */
return (0);
}
- if (__predict_false(tp->snd_nxt != tp->snd_max)) {
- /* We are retransmitting */
- return (0);
- }
if (__predict_false(tiwin == 0)) {
/* zero window */
return (0);
@@ -13176,6 +14103,7 @@
rack->r_ctl.retran_during_recovery = 0;
rack->rc_suspicious = 0;
rack->r_ctl.dsack_byte_cnt = 0;
+ rack->r_ctl.idle_snd_una = tp->snd_una;
rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
if (rack->r_ctl.rc_went_idle_time == 0)
rack->r_ctl.rc_went_idle_time = 1;
@@ -13203,6 +14131,7 @@
uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
{
int32_t ret_val = 0;
+ int32_t orig_tlen = tlen;
int32_t todrop;
int32_t ourfinisacked = 0;
struct tcp_rack *rack;
@@ -13267,8 +14196,9 @@
*/
if (IS_FASTOPEN(tp->t_flags) &&
(tp->snd_una != tp->snd_max)) {
- tp->snd_nxt = th->th_ack;
- tfo_partial = 1;
+ /* Was it a partial ack? */
+ if (SEQ_LT(th->th_ack, tp->snd_max))
+ tfo_partial = 1;
}
/*
* If there's data, delay ACK; if there's also a FIN ACKNOW
@@ -13299,6 +14229,24 @@
* and there is no send_map.
*/
tp->snd_una++;
+ if (tfo_partial && (SEQ_GT(tp->snd_max, tp->snd_una))) {
+ /*
+ * We sent a SYN with data, and thus have a
+ * sendmap entry with a SYN set. Lets find it
+ * and take off the send bit and the byte and
+ * set it up to be what we send (send it next).
+ */
+ struct rack_sendmap *rsm;
+
+ rsm = tqhash_min(rack->r_ctl.tqh);
+ if (rsm) {
+ if (rsm->r_flags & RACK_HAS_SYN) {
+ rsm->r_flags &= ~RACK_HAS_SYN;
+ rsm->r_start++;
+ }
+ rack->r_ctl.rc_resend = rsm;
+ }
+ }
}
/*
* Received <SYN,ACK> in SYN_SENT[*] state. Transitions:
@@ -13361,7 +14309,7 @@
tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2);
tcp_rack_xmit_timer_commit(rack, tp);
}
- if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val))
+ if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen))
return (ret_val);
/* We may have changed to FIN_WAIT_1 above */
if (tp->t_state == TCPS_FIN_WAIT_1) {
@@ -13407,6 +14355,7 @@
uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
{
struct tcp_rack *rack;
+ int32_t orig_tlen = tlen;
int32_t ret_val = 0;
int32_t ourfinisacked = 0;
@@ -13579,7 +14528,7 @@
tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2);
tcp_rack_xmit_timer_commit(rack, tp);
}
- if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
+ if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) {
return (ret_val);
}
if (tp->t_state == TCPS_FIN_WAIT_1) {
@@ -13624,6 +14573,7 @@
uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
{
int32_t ret_val = 0;
+ int32_t orig_tlen = tlen;
struct tcp_rack *rack;
/*
@@ -13730,7 +14680,7 @@
/*
* Ack processing.
*/
- if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
+ if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val, orig_tlen)) {
return (ret_val);
}
if (sbavail(&so->so_snd)) {
@@ -13756,6 +14706,7 @@
uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
{
int32_t ret_val = 0;
+ int32_t orig_tlen = tlen;
struct tcp_rack *rack;
rack = (struct tcp_rack *)tp->t_fb_ptr;
@@ -13830,7 +14781,7 @@
/*
* Ack processing.
*/
- if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
+ if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val, orig_tlen)) {
return (ret_val);
}
if (sbavail(&so->so_snd)) {
@@ -13884,6 +14835,7 @@
uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
{
int32_t ret_val = 0;
+ int32_t orig_tlen = tlen;
int32_t ourfinisacked = 0;
struct tcp_rack *rack;
@@ -13966,7 +14918,7 @@
/*
* Ack processing.
*/
- if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
+ if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) {
return (ret_val);
}
if (ourfinisacked) {
@@ -14011,6 +14963,7 @@
uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
{
int32_t ret_val = 0;
+ int32_t orig_tlen = tlen;
int32_t ourfinisacked = 0;
struct tcp_rack *rack;
@@ -14093,7 +15046,7 @@
/*
* Ack processing.
*/
- if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
+ if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) {
return (ret_val);
}
if (ourfinisacked) {
@@ -14124,6 +15077,7 @@
uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
{
int32_t ret_val = 0;
+ int32_t orig_tlen;
int32_t ourfinisacked = 0;
struct tcp_rack *rack;
@@ -14152,6 +15106,7 @@
if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
return (ret_val);
}
+ orig_tlen = tlen;
if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
&rack->r_ctl.challenge_ack_ts,
&rack->r_ctl.challenge_ack_cnt)) {
@@ -14206,7 +15161,7 @@
/*
* case TCPS_LAST_ACK: Ack processing.
*/
- if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
+ if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) {
return (ret_val);
}
if (ourfinisacked) {
@@ -14237,6 +15192,7 @@
uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
{
int32_t ret_val = 0;
+ int32_t orig_tlen = tlen;
int32_t ourfinisacked = 0;
struct tcp_rack *rack;
@@ -14320,7 +15276,7 @@
/*
* Ack processing.
*/
- if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
+ if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) {
return (ret_val);
}
if (sbavail(&so->so_snd)) {
@@ -14919,65 +15875,43 @@
}
static void
-rack_translate_clamp_value(struct tcp_rack *rack, uint32_t optval)
+rack_translate_policer_detect(struct tcp_rack *rack, uint32_t optval)
{
/*
- * P = percent bits
- * F = fill cw bit -- Toggle fillcw if this bit is set.
- * S = Segment bits
- * M = set max segment bit
- * U = Unclamined
- * C = If set to non-zero override the max number of clamps.
- * L = Bit to indicate if clamped gets lower.
+	 * P = Percent of retransmits, in tenths of a percent (499 = 49.9%)
+	 * A = Average number, in tenths of a percent (1 = .1% -> 169 = 16.9%)
+	 * M = Median number of retransmits, 1 - 16
+	 * MMMM MMMM AAAA AAAA PPPP PPPP PPPP PPPP
*
- * CCCC CCCCC UUUU UULF PPPP PPPP PPPP PPPP
- *
- * The lowest 3 nibbles is the perentage .1 - 6553.5%
- * where 10.1 = 101, max 6553.5
- * The upper 16 bits holds some options.
- * The F bit will turn on fill-cw on if you are
- * not pacing, it will turn it off if dgp is on.
- * The L bit will change it so when clamped we get
- * the min(gp, lt-bw) for dgp.
*/
- uint16_t per;
+ uint16_t per, upp;
- rack->r_ctl.saved_rxt_clamp_val = optval;
per = optval & 0x0000ffff;
- rack->r_ctl.rxt_threshold = (uint64_t)(per & 0xffff);
- if (optval > 0) {
- uint16_t clamp_opt;
-
- rack->excess_rxt_on = 1;
- clamp_opt = ((optval & 0xffff0000) >> 16);
- rack->r_ctl.clamp_options = clamp_opt & 0x00ff;
- if (clamp_opt & 0xff00) {
- /* A max clamps is also present */
- rack->r_ctl.max_clamps = (clamp_opt >> 8);
- } else {
- /* No specified clamps means no limit */
- rack->r_ctl.max_clamps = 0;
- }
- if (rack->r_ctl.clamp_options & 0x0002) {
- rack->r_clamped_gets_lower = 1;
- } else {
- rack->r_clamped_gets_lower = 0;
- }
+ rack->r_ctl.policer_rxt_threshold = (uint32_t)(per & 0xffff);
+ upp = ((optval & 0xffff0000) >> 16);
+ rack->r_ctl.policer_avg_threshold = (0x00ff & upp);
+ rack->r_ctl.policer_med_threshold = ((upp >> 8) & 0x00ff);
+ if ((rack->r_ctl.policer_rxt_threshold > 0) &&
+ (rack->r_ctl.policer_avg_threshold > 0) &&
+ (rack->r_ctl.policer_med_threshold > 0)) {
+ rack->policer_detect_on = 1;
} else {
- /* Turn it off back to default */
- rack->excess_rxt_on = 0;
- rack->r_clamped_gets_lower = 0;
+ rack->policer_detect_on = 0;
}
-
+ rack->r_ctl.saved_policer_val = optval;
+ policer_detection_log(rack, optval,
+ rack->r_ctl.policer_avg_threshold,
+ rack->r_ctl.policer_med_threshold,
+ rack->r_ctl.policer_rxt_threshold, 11);
}
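/*
 * A sketch of how a TCP_POLICER_DETECT optval might be packed from the
 * three thresholds described above (MMMM MMMM AAAA AAAA PPPP PPPP PPPP
 * PPPP); only the bit layout is taken from the patch, the helper is
 * illustrative.
 */
#include <stdint.h>

static uint32_t
pack_policer_detect(uint8_t med, uint8_t avg_tenths, uint16_t rxt_tenths)
{
	return (((uint32_t)med << 24) |
	    ((uint32_t)avg_tenths << 16) |
	    (uint32_t)rxt_tenths);
}

/*
 * Example: a median of 2 retransmits, a 4.5% average (45) and a 49.9%
 * retransmit threshold (499) pack to 0x022d01f3.
 */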
-
static int32_t
rack_init(struct tcpcb *tp, void **ptr)
{
struct inpcb *inp = tptoinpcb(tp);
struct tcp_rack *rack = NULL;
uint32_t iwin, snt, us_cts;
+ size_t sz;
int err, no_query;
tcp_hpts_init(tp);
@@ -15036,16 +15970,22 @@
rack->rc_new_rnd_needed = 1;
rack->r_ctl.rc_split_limit = V_tcp_map_split_limit;
/* We want abe like behavior as well */
+
rack->r_ctl.rc_saved_beta.newreno_flags |= CC_NEWRENO_BETA_ECN_ENABLED;
rack->r_ctl.rc_reorder_fade = rack_reorder_fade;
rack->rc_allow_data_af_clo = rack_ignore_data_after_close;
rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh;
- if (rack_rxt_clamp_thresh) {
- rack_translate_clamp_value(rack, rack_rxt_clamp_thresh);
- rack->excess_rxt_on = 1;
+ rack->r_ctl.policer_del_mss = rack_req_del_mss;
+ if ((rack_policer_rxt_thresh > 0) &&
+ (rack_policer_avg_thresh > 0) &&
+ (rack_policer_med_thresh > 0)) {
+ rack->r_ctl.policer_rxt_threshold = rack_policer_rxt_thresh;
+ rack->r_ctl.policer_avg_threshold = rack_policer_avg_thresh;
+ rack->r_ctl.policer_med_threshold = rack_policer_med_thresh;
+ rack->policer_detect_on = 1;
+ } else {
+ rack->policer_detect_on = 0;
}
- if (rack_uses_full_dgp_in_rec)
- rack->r_ctl.full_dgp_in_rec = 1;
if (rack_fill_cw_state)
rack->rc_pace_to_cwnd = 1;
if (rack_pacing_min_seg)
@@ -15063,6 +16003,15 @@
if (rack_tcp_accounting) {
tp->t_flags2 |= TF2_TCP_ACCOUNTING;
}
+#endif
+ rack->r_ctl.pcm_i.cnt_alloc = RACK_DEFAULT_PCM_ARRAY;
+ sz = (sizeof(struct rack_pcm_stats) * rack->r_ctl.pcm_i.cnt_alloc);
+ rack->r_ctl.pcm_s = malloc(sz,M_TCPPCM, M_NOWAIT);
+ if (rack->r_ctl.pcm_s == NULL) {
+ rack->r_ctl.pcm_i.cnt_alloc = 0;
+ }
+#ifdef NETFLIX_STATS
+ rack->r_ctl.side_chan_dis_mask = tcp_sidechannel_disable_mask;
#endif
rack->r_ctl.rack_per_upper_bound_ss = (uint8_t)rack_per_upper_bound_ss;
rack->r_ctl.rack_per_upper_bound_ca = (uint8_t)rack_per_upper_bound_ca;
@@ -15070,6 +16019,7 @@
rack->rack_enable_scwnd = 1;
rack->r_ctl.pace_len_divisor = rack_default_pacing_divisor;
rack->rc_user_set_max_segs = rack_hptsi_segments;
+ rack->r_ctl.max_reduction = rack_max_reduce;
rack->rc_force_max_seg = 0;
TAILQ_INIT(&rack->r_ctl.opt_list);
rack->r_ctl.rc_saved_beta.beta = V_newreno_beta_ecn;
@@ -15084,12 +16034,22 @@
} else {
rack->r_ctl.saved_hibeta = 50;
}
+ /*
+	 * We initialize to all ones so we never match 0,
+	 * just in case the client sends in 0; a real timestamp
+	 * in ms will hopefully never be all 1's :-)
+ */
+ rack->r_ctl.last_tm_mark = 0xffffffffffffffff;
rack->r_ctl.rc_reorder_shift = rack_reorder_thresh;
rack->r_ctl.rc_pkt_delay = rack_pkt_delay;
+ rack->r_ctl.pol_bw_comp = rack_policing_do_bw_comp;
rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp;
rack->r_ctl.rc_lowest_us_rtt = 0xffffffff;
rack->r_ctl.rc_highest_us_rtt = 0;
rack->r_ctl.bw_rate_cap = rack_bw_rate_cap;
+ rack->pcm_enabled = rack_pcm_is_enabled;
+ if (rack_fillcw_bw_cap)
+ rack->r_ctl.fillcw_cap = rack_fillcw_bw_cap;
rack->r_ctl.timer_slop = TICKS_2_USEC(tcp_rexmit_slop);
if (rack_use_cmp_acks)
rack->r_use_cmp_ack = 1;
@@ -15098,6 +16058,7 @@
if (rack_gp_no_rec_chg)
rack->rc_gp_no_rec_chg = 1;
if (rack_pace_every_seg && tcp_can_enable_pacing()) {
+ rack->r_ctl.pacing_method |= RACK_REG_PACING;
rack->rc_always_pace = 1;
if (rack->rack_hibeta)
rack_set_cc_pacing(rack);
@@ -15114,13 +16075,31 @@
rack->r_limit_scw = 0;
rack_init_retransmit_value(rack, rack_rxt_controls);
rack->rc_labc = V_tcp_abc_l_var;
+ if (rack_honors_hpts_min_to)
+ rack->r_use_hpts_min = 1;
+ if (tp->snd_una != 0) {
+ rack->r_ctl.idle_snd_una = tp->snd_una;
+ rack->rc_sendvars_notset = 0;
+ /*
+ * Make sure any TCP timers are not running.
+ */
+ tcp_timer_stop(tp);
+ } else {
+ /*
+ * Server side, we are called from the
+ * syn-cache. This means none of the
+ * snd_una/max are set yet so we have
+ * to defer this until the first send.
+ */
+ rack->rc_sendvars_notset = 1;
+ }
+
rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method;
rack->rack_tlp_threshold_use = rack_tlp_threshold_use;
rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr;
rack->r_ctl.rc_min_to = rack_min_to;
microuptime(&rack->r_ctl.act_rcv_time);
rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time;
- rack->rc_init_win = rack_default_init_window;
rack->r_ctl.rack_per_of_gp_ss = rack_per_of_gp_ss;
if (rack_hw_up_only)
rack->r_up_only = 1;
@@ -15132,15 +16111,34 @@
} else
rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca;
rack->r_ctl.rack_per_of_gp_rec = rack_per_of_gp_rec;
+ if (rack_timely_off) {
+ rack->rc_skip_timely = 1;
+ }
+ if (rack->rc_skip_timely) {
+ rack->r_ctl.rack_per_of_gp_rec = 90;
+ rack->r_ctl.rack_per_of_gp_ca = 100;
+ rack->r_ctl.rack_per_of_gp_ss = 250;
+ }
rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt;
rack->r_ctl.rc_tlp_rxt_last_time = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time);
+ rack->r_ctl.last_rcv_tstmp_for_rtt = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time);
+
setup_time_filter_small(&rack->r_ctl.rc_gp_min_rtt, FILTER_TYPE_MIN,
rack_probertt_filter_life);
us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
rack->r_ctl.rc_time_of_last_probertt = us_cts;
- rack->r_ctl.challenge_ack_ts = tcp_ts_getticks();
+ rack->r_ctl.rc_went_idle_time = us_cts;
+ rack->r_ctl.challenge_ack_ts = tcp_ts_getticks() - (tcp_ack_war_time_window + 1);
rack->r_ctl.rc_time_probertt_starts = 0;
+
+ rack->r_ctl.gp_rnd_thresh = rack_rnd_cnt_req & 0xff;
+ if (rack_rnd_cnt_req & 0x10000)
+ rack->r_ctl.gate_to_fs = 1;
+ rack->r_ctl.gp_gain_req = rack_gp_gain_req;
+ if ((rack_rnd_cnt_req & 0x100) > 0) {
+
+ }
if (rack_dsack_std_based & 0x1) {
/* Basically this means all rack timers are at least (srtt + 1/4 srtt) */
rack->rc_rack_tmr_std_based = 1;
@@ -15449,10 +16447,8 @@
rack->r_ctl.fsb.tcp_ip_hdr = NULL;
rack->r_ctl.fsb.th = NULL;
}
- if (rack->rc_always_pace) {
- tcp_decrement_paced_conn();
- rack_undo_cc_pacing(rack);
- rack->rc_always_pace = 0;
+ if (rack->rc_always_pace == 1) {
+ rack_remove_pacing(rack);
}
/* Clean up any options if they were not applied */
while (!TAILQ_EMPTY(&rack->r_ctl.opt_list)) {
@@ -15492,6 +16488,12 @@
uma_zfree(rack_zone, rsm);
rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
}
+ if (rack->r_ctl.pcm_s != NULL) {
+ free(rack->r_ctl.pcm_s, M_TCPPCM);
+ rack->r_ctl.pcm_s = NULL;
+ rack->r_ctl.pcm_i.cnt_alloc = 0;
+ rack->r_ctl.pcm_i.cnt = 0;
+ }
if ((rack->r_ctl.rc_num_maps_alloced > 0) &&
(tcp_bblogging_on(tp))) {
union tcp_log_stackspecific log;
@@ -15593,6 +16595,16 @@
int tmr_up;
tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
+ if (tcp_in_hpts(rack->rc_tp) == 0) {
+ /*
+ * Ok we probably need some timer up, but no
+ * matter what the mask we are not in hpts. We
+ * may have received an old ack and thus did nothing.
+ */
+ rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
+ rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0);
+ return;
+ }
if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT))
return;
rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
@@ -15916,6 +16928,134 @@
}
}
+static void
+rack_new_round_starts(struct tcpcb *tp, struct tcp_rack *rack, uint32_t high_seq)
+{
+ /*
+	 * The next send has occurred; mark the end of the round
+ * as when that data gets acknowledged. We can
+ * also do common things we might need to do when
+ * a round begins.
+ */
+ rack->r_ctl.roundends = tp->snd_max;
+ rack->rc_new_rnd_needed = 0;
+ rack_log_hystart_event(rack, tp->snd_max, 4);
+}
+
+
+static void
+rack_log_pcm(struct tcp_rack *rack, uint8_t mod, uint32_t flex1, uint32_t flex2,
+ uint32_t flex3)
+{
+ if (tcp_bblogging_on(rack->rc_tp)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ (void)tcp_get_usecs(&tv);
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.timeStamp = tcp_tv_to_usectick(&tv);
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ log.u_bbr.flex8 = mod;
+ log.u_bbr.flex1 = flex1;
+ log.u_bbr.flex2 = flex2;
+ log.u_bbr.flex3 = flex3;
+ log.u_bbr.flex4 = rack_pcm_every_n_rounds;
+ log.u_bbr.flex5 = rack->r_ctl.pcm_idle_rounds;
+ log.u_bbr.bbr_substate = rack->pcm_needed;
+ log.u_bbr.bbr_substate <<= 1;
+ log.u_bbr.bbr_substate |= rack->pcm_in_progress;
+ log.u_bbr.bbr_substate <<= 1;
+ log.u_bbr.bbr_substate |= rack->pcm_enabled; /* bits are NIE for Needed, Inprogress, Enabled */
+ (void)tcp_log_event(rack->rc_tp, NULL, NULL, NULL, TCP_PCM_MEASURE, ERRNO_UNK,
+ 0, &log, false, NULL, NULL, 0, &tv);
+ }
+}
+
+static void
+rack_new_round_setup(struct tcpcb *tp, struct tcp_rack *rack, uint32_t high_seq)
+{
+ /*
+ * The round (current_round) has ended. We now
+	 * set up for the next round by incrementing the
+	 * round number and doing any round-specific
+	 * things.
+ */
+ rack_log_hystart_event(rack, high_seq, 21);
+ rack->r_ctl.current_round++;
+ /* New round (current_round) begins at next send */
+ rack->rc_new_rnd_needed = 1;
+ if ((rack->pcm_enabled == 1) &&
+ (rack->pcm_needed == 0) &&
+ (rack->pcm_in_progress == 0)) {
+ /*
+ * If we have enabled PCM, then we need to
+		 * check if the round has advanced to the state
+ * where one is required.
+ */
+ int rnds;
+
+ rnds = rack->r_ctl.current_round - rack->r_ctl.last_pcm_round;
+ if ((rnds + rack->r_ctl.pcm_idle_rounds) >= rack_pcm_every_n_rounds) {
+ rack->pcm_needed = 1;
+ rack_log_pcm(rack, 3, rack->r_ctl.last_pcm_round, rack_pcm_every_n_rounds, rack->r_ctl.current_round );
+ } else if (rack_verbose_logging) {
+ rack_log_pcm(rack, 3, rack->r_ctl.last_pcm_round, rack_pcm_every_n_rounds, rack->r_ctl.current_round );
+ }
+ }
+ if (tp->t_ccv.flags & CCF_HYSTART_ALLOWED) {
+ /* We have hystart enabled send the round info in */
+ if (CC_ALGO(tp)->newround != NULL) {
+ CC_ALGO(tp)->newround(&tp->t_ccv, rack->r_ctl.current_round);
+ }
+ }
+ /*
+	 * For DGP, an initial startup check. We want to validate
+	 * that we are not just pushing on in slow-start and
+	 * not gaining, i.e. filling buffers without getting any
+	 * boost in b/w during the initial slow-start.
+ */
+ if (rack->dgp_on &&
+ (rack->rc_initial_ss_comp == 0) &&
+ (tp->snd_cwnd < tp->snd_ssthresh) &&
+ (rack->r_ctl.num_measurements >= RACK_REQ_AVG) &&
+ (rack->r_ctl.gp_rnd_thresh > 0) &&
+ ((rack->r_ctl.current_round - rack->r_ctl.last_rnd_of_gp_rise) >= rack->r_ctl.gp_rnd_thresh)) {
+
+ /*
+		 * We are in the initial SS and we have had rack_rnd_cnt_req rounds (def: 5) where
+		 * we have not gained the required amount in the gp_est (120.0% aka 1200). Let's
+ * exit SS.
+ *
+ * Pick up the flight size now as we enter slowstart (not the
+ * cwnd which may be inflated).
+ */
+ rack->rc_initial_ss_comp = 1;
+
+ if (tcp_bblogging_on(rack->rc_tp)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.flex1 = rack->r_ctl.current_round;
+ log.u_bbr.flex2 = rack->r_ctl.last_rnd_of_gp_rise;
+ log.u_bbr.flex3 = rack->r_ctl.gp_rnd_thresh;
+ log.u_bbr.flex5 = rack->r_ctl.gate_to_fs;
+ log.u_bbr.flex5 = rack->r_ctl.ss_hi_fs;
+ log.u_bbr.flex8 = 40;
+ (void)tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
+ 0, &log, false, NULL, __func__, __LINE__,&tv);
+ }
+ if ((rack->r_ctl.gate_to_fs == 1) &&
+ (tp->snd_cwnd > rack->r_ctl.ss_hi_fs)) {
+ tp->snd_cwnd = rack->r_ctl.ss_hi_fs;
+ }
+ tp->snd_ssthresh = tp->snd_cwnd - 1;
+ /* Turn off any fast output running */
+ rack->r_fast_output = 0;
+ }
+}
+
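/*
 * A minimal sketch of the round check above that arms a path capacity
 * measurement (PCM): once the rounds since the last measurement, plus
 * any idle-round credit, reach the configured period, a PCM is flagged
 * as needed.  The helper is illustrative; only the comparison mirrors
 * the patch.
 */
#include <stdint.h>

static int
pcm_is_due(uint32_t current_round, uint32_t last_pcm_round,
    uint32_t idle_rounds, uint32_t every_n_rounds)
{
	uint32_t rnds = current_round - last_pcm_round;

	return ((rnds + idle_rounds) >= every_n_rounds);
}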
static int
rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mbuf *m, int nxt_pkt, struct timeval *tv)
{
@@ -15949,7 +17089,7 @@
#endif
int nsegs = 0;
int under_pacing = 0;
- int recovery = 0;
+ int post_recovery = 0;
#ifdef TCP_ACCOUNTING
sched_pin();
#endif
@@ -16122,7 +17262,7 @@
}
} else if (ae->ack_val_set == ACK_DUPACK) {
/* Case D */
- rack_strike_dupack(rack);
+ rack_strike_dupack(rack, ae->ack);
} else if (ae->ack_val_set == ACK_RWND) {
/* Case C */
if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) {
@@ -16172,8 +17312,6 @@
}
#endif
high_seq = ae->ack;
- if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp))
- rack_log_hystart_event(rack, high_seq, 8);
/* Setup our act_rcv_time */
if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) {
ts.tv_sec = ae->timestamp / 1000000000;
@@ -16239,13 +17377,11 @@
if (SEQ_GEQ(high_seq, rack->r_ctl.roundends) &&
(rack->rc_new_rnd_needed == 0) &&
(nxt_pkt == 0)) {
- rack_log_hystart_event(rack, high_seq, 21);
- rack->r_ctl.current_round++;
- /* Force the next send to setup the next round */
- rack->rc_new_rnd_needed = 1;
- if (CC_ALGO(tp)->newround != NULL) {
- CC_ALGO(tp)->newround(&tp->t_ccv, rack->r_ctl.current_round);
- }
+ /*
+ * We have crossed into a new round with
+ * this th_ack value.
+ */
+ rack_new_round_setup(tp, rack, high_seq);
}
/*
* Clear the probe not answered flag
@@ -16306,8 +17442,17 @@
tcp_rack_partialack(tp);
} else {
rack_post_recovery(tp, high_seq);
- recovery = 1;
+ post_recovery = 1;
}
+ } else if ((rack->rto_from_rec == 1) &&
+ SEQ_GEQ(high_seq, tp->snd_recover)) {
+ /*
+ * We were in recovery, hit a rxt timeout
+ * and never re-entered recovery. The timeout(s)
+ * made up all the lost data. In such a case
+ * we need to clear the rto_from_rec flag.
+ */
+ rack->rto_from_rec = 0;
}
/* Handle the rack-log-ack part (sendmap) */
if ((sbused(&so->so_snd) == 0) &&
@@ -16340,9 +17485,24 @@
KMOD_TCPSTAT_ADD(tcps_rcvackpack, 1);
KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
if (acked_amount > 0) {
+ uint32_t p_cwnd;
struct mbuf *mfree;
- rack_ack_received(tp, rack, high_seq, nsegs, CC_ACK, recovery);
+ if (post_recovery) {
+ /*
+			 * Grab the segsiz, multiply by 2 and add the snd_cwnd;
+			 * that is the max cwnd the CC should leave us with if
+			 * we are exiting recovery and doing a late add.
+ */
+ p_cwnd = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
+ p_cwnd <<= 1;
+ p_cwnd += tp->snd_cwnd;
+ }
+ rack_ack_received(tp, rack, high_seq, nsegs, CC_ACK, post_recovery);
+ if (post_recovery && (tp->snd_cwnd > p_cwnd)) {
+ /* Must be non-newreno (cubic) getting too ahead of itself */
+ tp->snd_cwnd = p_cwnd;
+ }
SOCKBUF_LOCK(&so->so_snd);
mfree = sbcut_locked(&so->so_snd, acked_amount);
tp->snd_una = high_seq;
@@ -16351,12 +17511,6 @@
/* Wake up the socket if we have room to write more */
rack_log_wakeup(tp,rack, &so->so_snd, acked, 2);
sowwakeup_locked(so);
- if ((recovery == 1) &&
- (rack->excess_rxt_on) &&
- (rack->r_cwnd_was_clamped == 0)) {
- do_rack_excess_rxt(tp, rack);
- } else if (rack->r_cwnd_was_clamped)
- do_rack_check_for_unclamp(tp, rack);
m_freem(mfree);
}
/* update progress */
@@ -16587,7 +17741,9 @@
}
rack_handle_might_revert(tp, rack);
ctf_calc_rwin(so, tp);
- if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) {
+ if ((rack->r_wanted_output != 0) ||
+ (rack->r_fast_output != 0) ||
+ (tp->t_flags & TF_ACKNOW )) {
send_out_a_rst:
if (tcp_output(tp) < 0) {
#ifdef TCP_ACCOUNTING
@@ -16630,7 +17786,7 @@
* us_cts - is the time that LRO or hardware actually got the packet in microseconds.
*/
uint32_t cts, us_cts, ms_cts;
- uint32_t tiwin, high_seq;
+ uint32_t tiwin;
struct timespec ts;
struct tcpopt to;
struct tcp_rack *rack;
@@ -16818,7 +17974,6 @@
tp->t_flags &= ~TF_GPUTINPROG;
}
}
- high_seq = th->th_ack;
if (tcp_bblogging_on(rack->rc_tp)) {
union tcp_log_stackspecific log;
struct timeval ltv;
@@ -16938,7 +18093,6 @@
m_freem(m);
goto done_with_input;
}
-
/*
* Segment received on connection. Reset idle time and keep-alive
* timer. XXX: This should be done after segment validation to
@@ -16975,7 +18129,28 @@
if (TSTMP_GT(to.to_tsecr, ms_cts))
to.to_tsecr = 0;
}
-
+ if ((rack->r_rcvpath_rtt_up == 1) &&
+ (to.to_flags & TOF_TS) &&
+ (TSTMP_GEQ(to.to_tsecr, rack->r_ctl.last_rcv_tstmp_for_rtt))) {
+ uint32_t rtt = 0;
+
+ /*
+ * We are receiving only and thus not sending
+ * data to do an RTT. We set a flag when we first
+ * sent this TS to the peer. We now have it back
+ * and have an RTT to share. We log it as a conf
+		 * and have an RTT to share. We log it with a
+		 * confidence of 4 since we are not so sure about
+		 * it; we may have lost an ack.
+ if (TSTMP_GT(cts, rack->r_ctl.last_time_of_arm_rcv))
+ rtt = (cts - rack->r_ctl.last_time_of_arm_rcv);
+ rack->r_rcvpath_rtt_up = 0;
+ /* Submit and commit the timer */
+ if (rtt > 0) {
+ tcp_rack_xmit_timer(rack, rtt, 0, rtt, 4, NULL, 1);
+ tcp_rack_xmit_timer_commit(rack, tp);
+ }
+ }
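/*
 * A sketch of the receive-side RTT sample taken above, assuming
 * wrap-safe timestamp comparisons: once a previously armed timestamp is
 * echoed back, the RTT is simply now minus the time we armed it.  The
 * helper is illustrative.
 */
#include <stdint.h>

static uint32_t
rcvpath_rtt_sample(uint32_t now, uint32_t armed_at,
    uint32_t tsecr, uint32_t armed_tsval)
{
	if ((int32_t)(tsecr - armed_tsval) < 0)
		return (0);	/* echo is older than the armed timestamp */
	if ((int32_t)(now - armed_at) <= 0)
		return (0);	/* clock did not advance, no usable sample */
	return (now - armed_at);
}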
/*
* If its the first time in we need to take care of options and
* verify we can do SACK for rack!
@@ -17069,7 +18244,7 @@
(rack->use_fixed_rate == 0) &&
(rack->rc_always_pace)) {
/* Check in on probertt */
- rack_check_probe_rtt(rack, us_cts);
+ rack_check_probe_rtt(rack, cts);
}
rack_clear_rate_sample(rack);
if ((rack->forced_ack) &&
@@ -17113,7 +18288,7 @@
* If we are going for target, lets recheck before
* we output.
*/
- rack_check_probe_rtt(rack, us_cts);
+ rack_check_probe_rtt(rack, cts);
}
if (rack->set_pacing_done_a_iw == 0) {
/* How much has been acked? */
@@ -17144,7 +18319,10 @@
}
#endif
if ((nxt_pkt == 0) && (no_output == 0)) {
- if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) {
+ if ((rack->r_wanted_output != 0) ||
+ (tp->t_flags & TF_ACKNOW) ||
+ (rack->r_fast_output != 0)) {
+
do_output_now:
if (tcp_output(tp) < 0) {
#ifdef TCP_ACCOUNTING
@@ -17156,6 +18334,8 @@
}
rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
rack_free_trim(rack);
+ } else if ((nxt_pkt == 0) && (tp->t_flags & TF_ACKNOW)) {
+ goto do_output_now;
} else if ((no_output == 1) &&
(nxt_pkt == 0) &&
(tcp_in_hpts(rack->rc_tp) == 0)) {
@@ -17170,9 +18350,6 @@
/* Clear the flag, it may have been cleared by output but we may not have */
if ((nxt_pkt == 0) && (tp->t_flags2 & TF2_HPTS_CALLS))
tp->t_flags2 &= ~TF2_HPTS_CALLS;
- /* Update any rounds needed */
- if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp))
- rack_log_hystart_event(rack, high_seq, 8);
/*
* The draft (v3) calls for us to use SEQ_GEQ, but that
* causes issues when we are just going app limited. Lets
@@ -17186,13 +18363,11 @@
if (SEQ_GEQ(tp->snd_una, rack->r_ctl.roundends) &&
(rack->rc_new_rnd_needed == 0) &&
(nxt_pkt == 0)) {
- rack_log_hystart_event(rack, tp->snd_una, 21);
- rack->r_ctl.current_round++;
- /* Force the next send to setup the next round */
- rack->rc_new_rnd_needed = 1;
- if (CC_ALGO(tp)->newround != NULL) {
- CC_ALGO(tp)->newround(&tp->t_ccv, rack->r_ctl.current_round);
- }
+ /*
+ * We have crossed into a new round with
+			 * the new snd_una.
+ */
+ rack_new_round_setup(tp, rack, tp->snd_una);
}
if ((nxt_pkt == 0) &&
((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) &&
@@ -17242,6 +18417,7 @@
if (did_out)
rack->r_wanted_output = 0;
}
+
#ifdef TCP_ACCOUNTING
sched_unpin();
#endif
@@ -17325,7 +18501,7 @@
srtt = rack_grab_rtt(tp, rack);
idx = rsm->r_rtr_cnt - 1;
ts_low = (uint32_t)rsm->r_tim_lastsent[idx];
- thresh = rack_calc_thresh_rack(rack, srtt, tsused);
+ thresh = rack_calc_thresh_rack(rack, srtt, tsused, __LINE__, 1);
if ((tsused == ts_low) ||
(TSTMP_LT(tsused, ts_low))) {
/* No time since sending */
@@ -17354,7 +18530,7 @@
}
static void
-rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot,
+rack_log_pacing_delay_calc (struct tcp_rack *rack, uint32_t len, uint32_t slot,
uint64_t bw_est, uint64_t bw, uint64_t len_time, int method,
int line, struct rack_sendmap *rsm, uint8_t quality)
{
@@ -17370,6 +18546,7 @@
if ((method != 2) &&
(method != 3) &&
(method != 7) &&
+ (method != 89) &&
(method != 14) &&
(method != 20)) {
return;
@@ -17429,12 +18606,8 @@
log.u_bbr.bbr_substate = quality;
log.u_bbr.bbr_state = rack->dgp_on;
log.u_bbr.bbr_state <<= 1;
- log.u_bbr.bbr_state |= rack->r_fill_less_agg;
- log.u_bbr.bbr_state <<= 1;
log.u_bbr.bbr_state |= rack->rc_pace_to_cwnd;
log.u_bbr.bbr_state <<= 2;
- log.u_bbr.bbr_state |= rack->r_pacing_discount;
- log.u_bbr.flex7 = ((rack->r_ctl.pacing_discount_amm << 1) | log.u_bbr.flex7);
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -17537,7 +18710,6 @@
{
uint64_t lentim, fill_bw;
- /* Lets first see if we are full, if so continue with normal rate */
rack->r_via_fill_cw = 0;
if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use)
return (slot);
@@ -17551,6 +18723,8 @@
/* The rtt is huge, N * smallest, lets not fill */
return (slot);
}
+ if (rack->r_ctl.fillcw_cap && *rate_wanted >= rack->r_ctl.fillcw_cap)
+ return (slot);
/*
* first lets calculate the b/w based on the last us-rtt
* and the the smallest send window.
@@ -17570,26 +18744,47 @@
/* Now lets make it into a b/w */
fill_bw *= (uint64_t)HPTS_USEC_IN_SEC;
fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt;
+ /* Adjust to any cap */
+ if (rack->r_ctl.fillcw_cap && fill_bw >= rack->r_ctl.fillcw_cap)
+ fill_bw = rack->r_ctl.fillcw_cap;
+
at_lt_bw:
- if (rack->r_fill_less_agg) {
+ if (rack_bw_multipler > 0) {
/*
- * We want the average of the rate_wanted
- * and our fill-cw calculated bw. We also want
- * to cap any increase to be no more than
- * X times the lt_bw (where X is the rack_bw_multipler).
+		 * We want to limit fill-cw to some multiplier
+		 * of max(lt_bw, gp_est). The default is 0 (off),
+		 * so a sysctl has enabled it.
*/
- uint64_t lt_bw, rate;
+ uint64_t lt_bw, gp, rate;
+ gp = rack_get_gp_est(rack);
lt_bw = rack_get_lt_bw(rack);
- if (lt_bw > *rate_wanted)
+ if (lt_bw > gp)
rate = lt_bw;
else
- rate = *rate_wanted;
- fill_bw += rate;
- fill_bw /= 2;
- if (rack_bw_multipler && (fill_bw > (rate * rack_bw_multipler))) {
- fill_bw = rate * rack_bw_multipler;
- }
+ rate = gp;
+ rate *= rack_bw_multipler;
+ rate /= 100;
+ if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.flex1 = rack_bw_multipler;
+ log.u_bbr.flex2 = len;
+ log.u_bbr.cur_del_rate = gp;
+ log.u_bbr.delRate = lt_bw;
+ log.u_bbr.bw_inuse = rate;
+ log.u_bbr.rttProp = fill_bw;
+ log.u_bbr.flex8 = 44;
+ tcp_log_event(rack->rc_tp, NULL, NULL, NULL,
+ BBR_LOG_CWND, 0,
+ 0, &log, false, NULL,
+ __func__, __LINE__, &tv);
+ }
+ if (fill_bw > rate)
+ fill_bw = rate;
}
/* We are below the min b/w */
if (non_paced)
@@ -17638,9 +18833,8 @@
}
}
if (rack->r_ctl.bw_rate_cap && (fill_bw > rack->r_ctl.bw_rate_cap)) {
- if (rack->rc_hybrid_mode)
- rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
- fill_bw, 0, 0, HYBRID_LOG_RATE_CAP, 2, NULL, __LINE__);
+ rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
+ fill_bw, 0, 0, HYBRID_LOG_RATE_CAP, 2, NULL, __LINE__);
fill_bw = rack->r_ctl.bw_rate_cap;
}
/*
@@ -17659,11 +18853,121 @@
return (slot);
}
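/*
 * A sketch of how the fill-cw rate above ends up bounded, assuming a
 * window in bytes, an RTT in microseconds and rates in bytes/second:
 * the raw rate is window over RTT, optionally clipped to a percentage
 * of max(lt_bw, gp_est) and to any explicit fill-cw cap.  Names are
 * illustrative; only the formulas follow the patch.
 */
#include <stdint.h>

static uint64_t
bounded_fill_cw_rate(uint64_t window_bytes, uint64_t rtt_usec,
    uint64_t lt_bw, uint64_t gp_est, uint64_t multiplier_pct,
    uint64_t fillcw_cap)
{
	uint64_t fill_bw;

	if (rtt_usec == 0)
		rtt_usec = 1;
	fill_bw = (window_bytes * 1000000ULL) / rtt_usec;
	if (multiplier_pct > 0) {
		uint64_t base = (lt_bw > gp_est) ? lt_bw : gp_est;
		uint64_t limit = (base * multiplier_pct) / 100;

		if (fill_bw > limit)
			fill_bw = limit;
	}
	if (fillcw_cap && fill_bw > fillcw_cap)
		fill_bw = fillcw_cap;
	return (fill_bw);
}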
-static int32_t
-rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz)
+static uint32_t
+rack_policer_check_send(struct tcp_rack *rack, uint32_t len, uint32_t segsiz, uint32_t *needs)
{
- uint64_t srtt;
- int32_t slot = 0;
+ uint64_t calc;
+
+ rack->rc_policer_should_pace = 0;
+ calc = rack_policer_bucket_reserve * rack->r_ctl.policer_bucket_size;
+ calc /= 100;
+ /*
+	 * Now let's look at whether the reserve already exceeds what is in the
+	 * bucket <or> we want more than is available above the reserve.
+ */
+ if (rack_verbose_logging > 0)
+ policer_detection_log(rack, len, segsiz, calc, rack->r_ctl.current_policer_bucket, 8);
+ if ((calc > rack->r_ctl.current_policer_bucket) ||
+ (len >= (rack->r_ctl.current_policer_bucket - calc))) {
+ /*
+ * We may want to pace depending on if we are going
+ * into the reserve or not.
+ */
+ uint32_t newlen;
+
+ if (calc > rack->r_ctl.current_policer_bucket) {
+ /*
+ * This will eat into the reserve if we
+			 * This will eat into the reserve; if we
+			 * don't have room at all, the check some
+			 * lines below will catch it.
+ newlen = rack->r_ctl.policer_max_seg;
+ rack->rc_policer_should_pace = 1;
+ } else {
+ /*
+ * We have all of the reserve plus something in the bucket
+ * that we can give out.
+ */
+ newlen = rack->r_ctl.current_policer_bucket - calc;
+ if (newlen < rack->r_ctl.policer_max_seg) {
+ /*
+				 * Dip into the reserve to get a full policer_max_seg:
+				 * we set the len to that and eat into
+				 * the reserve. If we go over, the code
+				 * below will make us wait.
+ */
+ newlen = rack->r_ctl.policer_max_seg;
+ rack->rc_policer_should_pace = 1;
+ }
+ }
+ if (newlen > rack->r_ctl.current_policer_bucket) {
+ /* We have to wait some */
+ *needs = newlen - rack->r_ctl.current_policer_bucket;
+ return (0);
+ }
+ if (rack_verbose_logging > 0)
+ policer_detection_log(rack, len, segsiz, newlen, 0, 9);
+ len = newlen;
+ } /* else we have all len available above the reserve */
+ if (rack_verbose_logging > 0)
+ policer_detection_log(rack, len, segsiz, calc, 0, 10);
+ return (len);
+}
+
+static uint32_t
+rack_policed_sending(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, uint32_t segsiz, int call_line)
+{
+ /*
+	 * Given a send of len, and a token bucket currently holding
+	 * current_policer_bucket bytes, are we close enough to the end of the
+	 * bucket that we need to pace? If so, calculate a time and return it.
+	 * Otherwise subtract the tokens from the bucket.
+ */
+ uint64_t calc;
+
+ if ((rack->r_ctl.policer_bw == 0) ||
+ (rack->r_ctl.policer_bucket_size < segsiz)) {
+ /*
+ * We should have an estimate here...
+ */
+ return (0);
+ }
+ calc = (uint64_t)rack_policer_bucket_reserve * (uint64_t)rack->r_ctl.policer_bucket_size;
+ calc /= 100;
+ if ((rack->r_ctl.current_policer_bucket < len) ||
+ (rack->rc_policer_should_pace == 1) ||
+ ((rack->r_ctl.current_policer_bucket - len) <= (uint32_t)calc)) {
+ /* we need to pace */
+ uint64_t lentim, res;
+ uint32_t slot;
+
+ lentim = (uint64_t)len * (uint64_t)HPTS_USEC_IN_SEC;
+ res = lentim / rack->r_ctl.policer_bw;
+ slot = (uint32_t)res;
+ if (rack->r_ctl.current_policer_bucket > len)
+ rack->r_ctl.current_policer_bucket -= len;
+ else
+ rack->r_ctl.current_policer_bucket = 0;
+ policer_detection_log(rack, len, slot, (uint32_t)rack_policer_bucket_reserve, call_line, 5);
+ rack->rc_policer_should_pace = 0;
+ return(slot);
+ }
+ /* Just take tokens out of the bucket and let rack do whatever it would have */
+ policer_detection_log(rack, len, 0, (uint32_t)rack_policer_bucket_reserve, call_line, 6);
+ if (len < rack->r_ctl.current_policer_bucket) {
+ rack->r_ctl.current_policer_bucket -= len;
+ } else {
+ rack->r_ctl.current_policer_bucket = 0;
+ }
+ return (0);
+}
+
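/*
 * A standalone sketch of the policer token bucket used above, assuming
 * a bucket measured in bytes, a policer rate in bytes/second and a
 * reserve expressed as a percentage of the bucket size.  Only the
 * arithmetic mirrors the patch; the struct and helper are illustrative.
 */
#include <stdint.h>

struct policer_bucket {
	uint64_t bw;		/* estimated policer rate, bytes/sec */
	uint32_t size;		/* bucket size in bytes */
	uint32_t tokens;	/* bytes currently available */
	uint32_t reserve_pct;	/* share of the bucket held in reserve */
};

/* Returns a pacing delay in microseconds, or 0 if no pacing is needed. */
static uint32_t
policed_send(struct policer_bucket *pb, uint32_t len)
{
	uint64_t reserve = ((uint64_t)pb->reserve_pct * pb->size) / 100;
	uint32_t slot = 0;

	if (pb->bw == 0)
		return (0);	/* no estimate yet, nothing to enforce */
	if ((pb->tokens < len) ||
	    ((uint64_t)(pb->tokens - len) <= reserve))
		slot = (uint32_t)(((uint64_t)len * 1000000ULL) / pb->bw);
	pb->tokens = (pb->tokens > len) ? (pb->tokens - len) : 0;
	return (slot);
}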
+
+static int32_t
+rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz, int line)
+{
+ uint64_t srtt;
+ int32_t slot = 0;
int32_t minslot = 0;
int can_start_hw_pacing = 1;
int err;
@@ -17674,6 +18978,25 @@
pace_one = 1;
else
pace_one = 0;
+ if (rack->rc_policer_detected == 1) {
+ /*
+ * A policer has been detected and we
+ * have all of our data (policer-bw and
+ * policer bucket size) calculated. Call
+ * into the function to find out if we are
+ * overriding the time.
+ */
+ slot = rack_policed_sending(rack, tp, len, segsiz, line);
+ if (slot) {
+ uint64_t logbw;
+
+ logbw = rack->r_ctl.current_policer_bucket;
+ logbw <<= 32;
+ logbw |= rack->r_ctl.policer_bucket_size;
+ rack_log_pacing_delay_calc(rack, len, slot, rack->r_ctl.policer_bw, logbw, 0, 89, __LINE__, NULL, 0);
+ return(slot);
+ }
+ }
if (rack->rc_always_pace == 0) {
/*
* We use the most optimistic possible cwnd/srtt for
@@ -18214,6 +19537,16 @@
rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[0];
tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack);
rack->r_ctl.rc_gp_cumack_ts = 0;
+ if ((rack->r_ctl.cleared_app_ack == 1) &&
+ (SEQ_GEQ(rack->r_ctl.cleared_app_ack, tp->gput_seq))) {
+ /*
+ * We just cleared an application limited period
+ * so the next seq out needs to skip the first
+ * ack.
+ */
+ rack->app_limited_needs_set = 1;
+ rack->r_ctl.cleared_app_ack = 0;
+ }
rack_log_pacing_delay_calc(rack,
tp->gput_seq,
tp->gput_ack,
@@ -19132,7 +20465,6 @@
rack->r_late = 0;
rack->r_ctl.rc_agg_early = 0;
}
-
rack_log_output(tp, &to, len, rsm->r_start, flags, error, rack_to_usec_ts(tv),
rsm, RACK_SENT_FP, rsm->m, rsm->soff, rsm->r_hw_tls, segsiz);
if (doing_tlp) {
@@ -19189,17 +20521,8 @@
tcp_rl_log_enobuf(rack->r_ctl.crte);
}
counter_u64_add(rack_saw_enobuf, 1);
- } else
- slot = rack_get_pacing_delay(rack, tp, len, NULL, segsiz);
- if ((slot == 0) ||
- (rack->rc_always_pace == 0) ||
- (rack->r_rr_config == 1)) {
- /*
- * We have no pacing set or we
- * are using old-style rack or
- * we are overridden to use the old 1ms pacing.
- */
- slot = rack->r_ctl.rc_min_to;
+ } else {
+ slot = rack_get_pacing_delay(rack, tp, len, NULL, segsiz, __LINE__);
}
rack_start_hpts_timer(rack, tp, cts, slot, len, 0);
#ifdef TCP_ACCOUNTING
@@ -19261,7 +20584,7 @@
(so->so_snd.sb_hiwat / 8 * 7) &&
sbused(&so->so_snd) < V_tcp_autosndbuf_max &&
sendwin >= (sbused(&so->so_snd) -
- (tp->snd_nxt - tp->snd_una))) {
+ (tp->snd_max - tp->snd_una))) {
if (rack_autosndbuf_inc)
scaleup = (rack_autosndbuf_inc * so->so_snd.sb_hiwat) / 100;
else
@@ -19313,7 +20636,7 @@
uint32_t s_soff;
uint32_t if_hw_tsomaxsegcount = 0, startseq;
uint32_t if_hw_tsomaxsegsize;
- uint16_t add_flag = RACK_SENT_FP;
+ uint32_t add_flag = RACK_SENT_FP;
#ifdef INET6
struct ip6_hdr *ip6 = NULL;
@@ -19680,6 +21003,22 @@
rack->r_ctl.lt_timemark = tcp_tv_to_lusectick(tv);
rack->r_ctl.lt_seq = tp->snd_una;
rack->lt_bw_up = 1;
+ } else if ((error == 0) &&
+ (((tp->snd_max + len) - rack->r_ctl.lt_seq) > 0x7fffffff)) {
+ /*
+ * Need to record what we have since we are
+ * approaching seq wrap.
+ */
+ struct timeval tv;
+ uint64_t tmark;
+
+ rack->r_ctl.lt_bw_bytes += (tp->snd_una - rack->r_ctl.lt_seq);
+ rack->r_ctl.lt_seq = tp->snd_una;
+ tmark = tcp_get_u64_usecs(&tv);
+ if (tmark > rack->r_ctl.lt_timemark) {
+ rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark);
+ rack->r_ctl.lt_timemark = tmark;
+ }
}
rack_log_output(tp, &to, len, tp->snd_max, flags, error, rack_to_usec_ts(tv),
NULL, add_flag, s_mb, s_soff, rack->r_ctl.fsb.hw_tls, segsiz);
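/*
 * A sketch of the long-term bandwidth bookkeeping performed above when
 * the unsampled range nears a sequence wrap: acked bytes and elapsed
 * time are folded into running totals, so the eventual estimate is
 * presumably just bytes * 1000000 / time_usec.  Field names below are
 * illustrative.
 */
#include <stdint.h>

struct lt_bw_accum {
	uint64_t bytes;		/* accumulated bytes */
	uint64_t time_usec;	/* accumulated measurement time */
	uint32_t seq;		/* left edge of the unaccounted range */
	uint64_t timemark;	/* start of the unaccounted interval */
};

static void
lt_bw_fold(struct lt_bw_accum *lt, uint32_t snd_una, uint64_t now_usec)
{
	lt->bytes += (uint32_t)(snd_una - lt->seq);	/* serial arithmetic */
	lt->seq = snd_una;
	if (now_usec > lt->timemark) {
		lt->time_usec += now_usec - lt->timemark;
		lt->timemark = now_usec;
	}
}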
@@ -19699,13 +21038,7 @@
tp->snd_max += len;
tp->snd_nxt = tp->snd_max;
if (rack->rc_new_rnd_needed) {
- /*
- * Update the rnd to start ticking not
- * that from a time perspective all of
- * the preceding idle time is "in the round"
- */
- rack->rc_new_rnd_needed = 0;
- rack->r_ctl.roundends = tp->snd_max;
+ rack_new_round_starts(tp, rack, tp->snd_max);
}
{
int idx;
@@ -19746,7 +21079,7 @@
}
tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
counter_u64_add(rack_fto_send, 1);
- slot = rack_get_pacing_delay(rack, tp, tot_len, NULL, segsiz);
+ slot = rack_get_pacing_delay(rack, tp, tot_len, NULL, segsiz, __LINE__);
rack_start_hpts_timer(rack, tp, cts, slot, tot_len, 0);
#ifdef TCP_ACCOUNTING
crtsc = get_cyclecount();
@@ -19856,7 +21189,7 @@
goto restart;
}
/* Now has it been long enough ? */
- thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(rack->rc_tp, rack), cts);
+ thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(rack->rc_tp, rack), cts, __LINE__, 1);
if ((cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])) > thresh) {
rack_log_collapse(rack, rsm->r_start,
(cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])),
@@ -19870,6 +21203,25 @@
return (NULL);
}
+static void
+rack_credit_back_policer_idle_time(struct tcp_rack *rack, uint64_t idle_t, int line)
+{
+ /*
+ * We were idle some time (idle_t) and so our policer bucket
+ * needs to grow. It can go no higher than policer_bucket_size.
+ */
+ uint64_t len;
+
+ len = idle_t * rack->r_ctl.policer_bw;
+ len /= HPTS_USEC_IN_SEC;
+ rack->r_ctl.current_policer_bucket += (uint32_t)len;
+ if (rack->r_ctl.policer_bucket_size < rack->r_ctl.current_policer_bucket) {
+ rack->r_ctl.current_policer_bucket = rack->r_ctl.policer_bucket_size;
+ }
+ if (rack_verbose_logging > 0)
+ policer_detection_log(rack, (uint32_t)len, line, (uint32_t)idle_t, 0, 7);
+}
+
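/*
 * A tiny sketch of the idle credit above: tokens accrue at the
 * estimated policer rate for the idle period, but never beyond the
 * bucket size.  Parameter names are illustrative.
 */
#include <stdint.h>

static uint32_t
credit_idle_tokens(uint32_t tokens, uint32_t bucket_size,
    uint64_t policer_bw, uint64_t idle_usec)
{
	uint64_t newtok = tokens + (idle_usec * policer_bw) / 1000000ULL;

	if (newtok > bucket_size)
		newtok = bucket_size;
	return ((uint32_t)newtok);
}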
static inline void
rack_validate_sizes(struct tcp_rack *rack, int32_t *len, int32_t segsiz, uint32_t pace_max_seg)
{
@@ -19931,7 +21283,7 @@
unsigned ipsec_optlen = 0;
#endif
- int32_t idle, sendalot;
+ int32_t idle, sendalot, tot_idle;
int32_t sub_from_prr = 0;
volatile int32_t sack_rxmit;
struct rack_sendmap *rsm = NULL;
@@ -19940,7 +21292,7 @@
int32_t slot = 0;
int32_t sup_rack = 0;
uint32_t cts, ms_cts, delayed, early;
- uint16_t add_flag = RACK_SENT_SP;
+ uint32_t add_flag = RACK_SENT_SP;
/* The doing_tlp flag will be set by the actual rack_timeout_tlp() */
uint8_t doing_tlp = 0;
uint32_t cwnd_to_use, pace_max_seg;
@@ -20101,12 +21453,16 @@
early = rack->r_ctl.rc_last_output_to - cts;
} else
early = 0;
- if (delayed) {
+ if (delayed && (rack->rc_always_pace == 1)) {
rack->r_ctl.rc_agg_delayed += delayed;
rack->r_late = 1;
- } else if (early) {
+ } else if (early && (rack->rc_always_pace == 1)) {
rack->r_ctl.rc_agg_early += early;
rack->r_early = 1;
+ } else if (rack->rc_always_pace == 0) {
+ /* Non-paced we are not late */
+ rack->r_ctl.rc_agg_delayed = rack->r_ctl.rc_agg_early = 0;
+ rack->r_early = rack->r_late = 0;
}
/* Now that early/late accounting is done turn off the flag */
rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
@@ -20168,9 +21524,9 @@
}
if ((tp->snd_una == tp->snd_max) &&
rack->r_ctl.rc_went_idle_time &&
- TSTMP_GT(cts, rack->r_ctl.rc_went_idle_time)) {
- idle = cts - rack->r_ctl.rc_went_idle_time;
- if (idle > rack_min_probertt_hold) {
+ (cts > rack->r_ctl.rc_went_idle_time)) {
+ tot_idle = idle = (cts - rack->r_ctl.rc_went_idle_time);
+ if (idle > (uint64_t)rack_min_probertt_hold) {
/* Count as a probe rtt */
if (rack->in_probe_rtt == 0) {
rack->r_ctl.rc_lower_rtt_us_cts = cts;
@@ -20183,17 +21539,75 @@
}
idle = 0;
}
+ if(rack->policer_detect_on) {
+ /*
+		 * If we are doing policer detection we at a minimum
+		 * record the time, but if possible add back to
+ * the bucket based on the idle time.
+ */
+ uint64_t idle_t, u64_cts;
+
+ segsiz = min(ctf_fixed_maxseg(tp),
+ rack->r_ctl.rc_pace_min_segs);
+ u64_cts = tcp_tv_to_lusectick(&tv);
+ if ((rack->rc_policer_detected == 1) &&
+ (rack->r_ctl.policer_bucket_size > segsiz) &&
+ (rack->r_ctl.policer_bw > 0) &&
+ (u64_cts > rack->r_ctl.last_sendtime)) {
+ /* We are being policed add back the time */
+ idle_t = u64_cts - rack->r_ctl.last_sendtime;
+ rack_credit_back_policer_idle_time(rack, idle_t, __LINE__);
+ }
+ rack->r_ctl.last_sendtime = u64_cts;
+ }
if (rack_use_fsb &&
(rack->r_ctl.fsb.tcp_ip_hdr) &&
(rack->r_fsb_inited == 0) &&
(rack->r_state != TCPS_CLOSED))
rack_init_fsb_block(tp, rack, tcp_outflags[tp->t_state]);
+ if (rack->rc_sendvars_notset == 1) {
+ rack->r_ctl.idle_snd_una = tp->snd_una;
+ rack->rc_sendvars_notset = 0;
+ /*
+		 * Make sure any TCP timers (keep-alive) are not running.
+ */
+ tcp_timer_stop(tp);
+ }
+ if ((rack->rack_no_prr == 1) &&
+ (rack->rc_always_pace == 0)) {
+ /*
+		 * Sanity check before sending: if we have
+		 * pacing disabled and prr is turned off, that
+		 * is a configuration error. Correct this by turning
+ * prr back on. A user *must* set some form of
+ * pacing in order to turn PRR off. We do this
+ * in the output path so that we can avoid socket
+ * option ordering issues that would occur if we
+ * tried to do it while setting rack_no_prr on.
+ */
+ rack->rack_no_prr = 0;
+ }
+ if ((rack->pcm_enabled == 1) &&
+ (rack->pcm_needed == 0) &&
+ (tot_idle > 0)) {
+ /*
+		 * We have been idle some microseconds. We need
+ * to factor this in to see if a PCM is needed.
+ */
+ uint32_t rtts_idle, rnds;
+
+ if (tp->t_srtt)
+ rtts_idle = tot_idle / tp->t_srtt;
+ else
+ rtts_idle = 0;
+ rnds = rack->r_ctl.current_round - rack->r_ctl.last_pcm_round;
+ rack->r_ctl.pcm_idle_rounds += rtts_idle;
+ if ((rnds + rack->r_ctl.pcm_idle_rounds) >= rack_pcm_every_n_rounds) {
+ rack->pcm_needed = 1;
+ rack_log_pcm(rack, 8, rack->r_ctl.last_pcm_round, rtts_idle, rack->r_ctl.current_round );
+ }
+ }
again:
- /*
- * If we've recently taken a timeout, snd_max will be greater than
- * snd_nxt. There may be SACK information that allows us to avoid
- * resending already delivered data. Adjust snd_nxt accordingly.
- */
sendalot = 0;
cts = tcp_get_usecs(&tv);
ms_cts = tcp_tv_to_mssectick(&tv);
@@ -20205,6 +21619,44 @@
pace_max_seg = rack->rc_user_set_max_segs * segsiz;
else
pace_max_seg = rack->r_ctl.rc_pace_max_segs;
+ if (TCPS_HAVEESTABLISHED(tp->t_state) &&
+ (rack->r_ctl.pcm_max_seg == 0)) {
+ /*
+		 * We set this on our first send so we know that ctf_fixed_maxseg
+		 * has been fully set. If we did it in rack_init() we would most likely
+		 * see 512 bytes and so end up at 5120, which is not desirable.
+ */
+ rack->r_ctl.pcm_max_seg = rc_init_window(rack);
+ if (rack->r_ctl.pcm_max_seg < (ctf_fixed_maxseg(tp) * 10)) {
+ /*
+ * Assure our initial PCM probe is at least 10 MSS.
+ */
+ rack->r_ctl.pcm_max_seg = ctf_fixed_maxseg(tp) * 10;
+ }
+ }
+ if ((rack->r_ctl.pcm_max_seg != 0) && (rack->pcm_needed == 1)) {
+ uint32_t rw_avail, cwa;
+
+ if (tp->snd_wnd > ctf_outstanding(tp))
+ rw_avail = tp->snd_wnd - ctf_outstanding(tp);
+ else
+ rw_avail = 0;
+ if (tp->snd_cwnd > ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked))
+ cwa = tp->snd_cwnd -ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ else
+ cwa = 0;
+ if ((cwa >= rack->r_ctl.pcm_max_seg) &&
+ (rw_avail > rack->r_ctl.pcm_max_seg)) {
+ /* Raise up the max seg for this trip through */
+ pace_max_seg = rack->r_ctl.pcm_max_seg;
+ /* Disable any fast output */
+ rack->r_fast_output = 0;
+ }
+ if (rack_verbose_logging) {
+ rack_log_pcm(rack, 4,
+ cwa, rack->r_ctl.pcm_max_seg, rw_avail);
+ }
+ }
sb_offset = tp->snd_max - tp->snd_una;
cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd;
flags = tcp_outflags[tp->t_state];
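/*
 * A sketch of the PCM burst sizing above: the probe is the initial
 * window but never smaller than ten segments, and it only replaces this
 * send's pace_max_seg when both the congestion window and the receive
 * window have that much headroom.  Helper names are illustrative.
 */
#include <stdint.h>

static uint32_t
pcm_probe_size(uint32_t init_window, uint32_t maxseg)
{
	uint32_t sz = init_window;

	if (sz < (maxseg * 10))
		sz = maxseg * 10;
	return (sz);
}

static int
pcm_can_burst(uint32_t pcm_max_seg, uint32_t cwnd_avail, uint32_t rwnd_avail)
{
	return ((cwnd_avail >= pcm_max_seg) && (rwnd_avail > pcm_max_seg));
}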
@@ -20431,10 +21883,19 @@
((rsm->r_flags & RACK_HAS_FIN) == 0)) {
int ret;
+ if ((rack->rc_policer_detected == 1) &&
+ (rack->r_ctl.policer_bucket_size > segsiz) &&
+ (rack->r_ctl.policer_bw > 0)) {
+ /* Check to see if there is room */
+ if (rack->r_ctl.current_policer_bucket < len) {
+ goto skip_fast_output;
+ }
+ }
ret = rack_fast_rsm_output(tp, rack, rsm, ts_val, cts, ms_cts, &tv, len, doing_tlp);
if (ret == 0)
return (0);
}
+skip_fast_output:
so = inp->inp_socket;
sb = &so->so_snd;
if (do_a_prefetch == 0) {
@@ -20487,28 +21948,19 @@
prefetch_rsm = 1;
}
SOCKBUF_LOCK(sb);
- /*
- * If snd_nxt == snd_max and we have transmitted a FIN, the
- * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a
- * negative length. This can also occur when TCP opens up its
- * congestion window while receiving additional duplicate acks after
- * fast-retransmit because TCP will reset snd_nxt to snd_max after
- * the fast-retransmit.
- *
- * In the normal retransmit-FIN-only case, however, snd_nxt will be
- * set to snd_una, the sb_offset will be 0, and the length may wind
- * up 0.
- *
- * If sack_rxmit is true we are retransmitting from the scoreboard
- * in which case len is already set.
- */
if ((sack_rxmit == 0) &&
(TCPS_HAVEESTABLISHED(tp->t_state) || IS_FASTOPEN(tp->t_flags))) {
+ /*
+ * We are not retransmitting (sack_rxmit is 0) so we
+ * are sending new data. This is always based on snd_max.
+ * Now in theory snd_max may be equal to snd_una, if so
+ * then nothing is outstanding and the offset would be 0.
+ */
uint32_t avail;
avail = sbavail(sb);
- if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail)
- sb_offset = tp->snd_nxt - tp->snd_una;
+ if (SEQ_GT(tp->snd_max, tp->snd_una) && avail)
+ sb_offset = tp->snd_max - tp->snd_una;
else
sb_offset = 0;
if ((IN_FASTRECOVERY(tp->t_flags) == 0) || rack->rack_no_prr) {
@@ -20632,13 +22084,53 @@
kern_prefetch(so, &prefetch_so_done);
prefetch_so_done = 1;
}
+ orig_len = len;
+ if ((rack->rc_policer_detected == 1) &&
+ (rack->r_ctl.policer_bucket_size > segsiz) &&
+ (rack->r_ctl.policer_bw > 0) &&
+ (len > 0)) {
+ /*
+ * Ok we believe we have a policer watching
+		 * what we send; can we send len? If not, can
+ * we tune it down to a smaller value?
+ */
+ uint32_t plen, buck_needs;
+
+ plen = rack_policer_check_send(rack, len, segsiz, &buck_needs);
+ if (plen == 0) {
+ /*
+ * We are not allowed to send. How long
+			 * do we need to pace for, i.e. how long
+ * before len is available to send?
+ */
+ uint64_t lentime;
+
+ lentime = buck_needs;
+ lentime *= HPTS_USEC_IN_SEC;
+ lentime /= rack->r_ctl.policer_bw;
+ slot = (uint32_t)lentime;
+ tot_len_this_send = 0;
+ SOCKBUF_UNLOCK(sb);
+ if (rack_verbose_logging > 0)
+ policer_detection_log(rack, len, slot, buck_needs, 0, 12);
+ rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
+ rack_log_type_just_return(rack, cts, 0, slot, hpts_calling, 0, cwnd_to_use);
+ goto just_return_clean;
+ }
+ if (plen < len) {
+ sendalot = 0;
+ len = plen;
+ }
+ }
/*
* Lop off SYN bit if it has already been sent. However, if this is
* SYN-SENT state and if segment contains data and if we don't know
* that foreign host supports TAO, suppress sending segment.
*/
- if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) &&
- ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) {
+ if ((flags & TH_SYN) &&
+ SEQ_GT(tp->snd_max, tp->snd_una) &&
+ ((sack_rxmit == 0) &&
+ (tp->t_rxtshift == 0))) {
/*
* When sending additional segments following a TFO SYN|ACK,
* do not include the SYN bit.
@@ -20678,7 +22170,6 @@
}
/* Without fast-open there should never be data sent on a SYN */
if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) {
- tp->snd_nxt = tp->iss;
len = 0;
}
if ((len > segsiz) && (tcp_dsack_block_exists(tp))) {
@@ -20686,22 +22177,10 @@
add_flag |= RACK_SENT_W_DSACK;
len = segsiz;
}
- orig_len = len;
if (len <= 0) {
/*
- * If FIN has been sent but not acked, but we haven't been
- * called to retransmit, len will be < 0. Otherwise, window
- * shrank after we sent into it. If window shrank to 0,
- * cancel pending retransmit, pull snd_nxt back to (closed)
- * window, and set the persist timer if it isn't already
- * going. If the window didn't close completely, just wait
- * for an ACK.
- *
- * We also do a general check here to ensure that we will
- * set the persist timer when we have data to send, but a
- * 0-byte window. This makes sure the persist timer is set
- * even if the packet hits one of the "goto send" lines
- * below.
+ * We have nothing to send, or the window shrank, or
+		 * is closed; do we need to go into persists?
*/
len = 0;
if ((tp->snd_wnd == 0) &&
@@ -20859,10 +22338,6 @@
if (sack_rxmit) {
if ((rsm->r_flags & RACK_HAS_FIN) == 0)
flags &= ~TH_FIN;
- } else {
- if (SEQ_LT(tp->snd_nxt + len, tp->snd_una +
- sbused(sb)))
- flags &= ~TH_FIN;
}
}
recwin = lmin(lmax(sbspace(&so->so_rcv), 0),
@@ -20903,10 +22378,6 @@
pass = 4;
goto send;
}
- if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */
- pass = 5;
- goto send;
- }
if (sack_rxmit) {
pass = 6;
goto send;
@@ -21014,7 +22485,7 @@
* yet done so, then we need to send.
*/
if ((flags & TH_FIN) &&
- (tp->snd_nxt == tp->snd_una)) {
+ (tp->snd_max == tp->snd_una)) {
pass = 11;
goto send;
}
@@ -21027,15 +22498,32 @@
{
int app_limited = CTF_JR_SENT_DATA;
+ if ((IS_FASTOPEN(tp->t_flags) == 0) &&
+ (flags & TH_FIN) &&
+ (len == 0) &&
+ (sbused(sb) == (tp->snd_max - tp->snd_una)) &&
+ ((tp->snd_max - tp->snd_una) <= segsiz)) {
+ /*
+ * Ok less than or right at a MSS is
+ * outstanding. The original FreeBSD stack would
+ * have sent a FIN, which can speed things up for
+ * a transactional application doing a MSG_WAITALL.
+ * To speed things up since we do *not* send a FIN
+ * if data is outstanding, we send a "challenge ack".
+ * The idea behind that is instead of having to have
+ * the peer wait for the delayed-ack timer to run off
+ * we send an ack that makes the peer send us an ack.
+ */
+ rack_send_ack_challange(rack);
+ }
if (tot_len_this_send > 0) {
- /* Make sure snd_nxt is up to max */
rack->r_ctl.fsb.recwin = recwin;
- slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz);
+ slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz, __LINE__);
if ((error == 0) &&
+ (rack->rc_policer_detected == 0) &&
rack_use_rfo &&
((flags & (TH_SYN|TH_FIN)) == 0) &&
(ipoptlen == 0) &&
- (tp->snd_nxt == tp->snd_max) &&
(tp->rcv_numsacks == 0) &&
rack->r_fsb_inited &&
TCPS_HAVEESTABLISHED(tp->t_state) &&
@@ -21052,11 +22540,10 @@
segsiz, pace_max_seg, hw_tls, flags);
} else
rack->r_fast_output = 0;
-
-
rack_log_fsb(rack, tp, so, flags,
ipoptlen, orig_len, len, 0,
1, optlen, __LINE__, 1);
+ /* Assure when we leave that snd_nxt will point to top */
if (SEQ_GT(tp->snd_max, tp->snd_nxt))
tp->snd_nxt = tp->snd_max;
} else {
@@ -21218,6 +22705,7 @@
rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack);
rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use);
}
+just_return_clean:
#ifdef NETFLIX_SHARED_CWND
if ((sbavail(sb) == 0) &&
rack->r_ctl.rc_scw) {
@@ -21284,13 +22772,39 @@
* is acked first.
*/
flags &= ~TH_FIN;
+ if ((sbused(sb) == (tp->snd_max - tp->snd_una)) &&
+ ((tp->snd_max - tp->snd_una) <= segsiz)) {
+ /*
+ * Ok less than or right at a MSS is
+ * outstanding. The original FreeBSD stack would
+ * have sent a FIN, which can speed things up for
+ * a transactional application doing a MSG_WAITALL.
+ * To speed things up since we do *not* send a FIN
+ * if data is outstanding, we send a "challenge ack".
+ * The idea behind that is instead of having to have
+ * the peer wait for the delayed-ack timer to run off
+ * we send an ack that makes the peer send us an ack.
+ */
+ rack_send_ack_challange(rack);
+ }
}
/* Enforce stack imposed max seg size if we have one */
- if (rack->r_ctl.rc_pace_max_segs &&
- (len > rack->r_ctl.rc_pace_max_segs)) {
+ if (pace_max_seg &&
+ (len > pace_max_seg)) {
mark = 1;
- len = rack->r_ctl.rc_pace_max_segs;
+ len = pace_max_seg;
+ }
+ if ((rsm == NULL) &&
+ (rack->pcm_in_progress == 0) &&
+ (rack->r_ctl.pcm_max_seg > 0) &&
+ (len >= rack->r_ctl.pcm_max_seg)) {
+ /* It is large enough for a measurement */
+ add_flag |= RACK_IS_PCM;
+ rack_log_pcm(rack, 5, len, rack->r_ctl.pcm_max_seg, add_flag);
+ } else if (rack_verbose_logging) {
+ rack_log_pcm(rack, 6, len, rack->r_ctl.pcm_max_seg, add_flag);
}
+
SOCKBUF_LOCK_ASSERT(sb);
if (len > 0) {
if (len >= segsiz)
@@ -21313,6 +22827,24 @@
#endif
hdrlen = sizeof(struct tcpiphdr);
+ /*
+ * Ok what seq are we sending from. If we have
+ * no rsm to use, then we look at various bits,
+ * if we are putting out a SYN it will be ISS.
+ * If we are retransmitting a FIN it will
+ * be snd_max-1 else its snd_max.
+ */
+ if (rsm == NULL) {
+ if (flags & TH_SYN)
+ rack_seq = tp->iss;
+ else if ((flags & TH_FIN) &&
+ (tp->t_flags & TF_SENTFIN))
+ rack_seq = tp->snd_max - 1;
+ else
+ rack_seq = tp->snd_max;
+ } else {
+ rack_seq = rsm->r_start;
+ }
/*
* Compute options for segment. We only have to care about SYN and
* established connection segments. Options for SYN-ACK segments
@@ -21322,7 +22854,6 @@
if ((tp->t_flags & TF_NOOPT) == 0) {
/* Maximum segment size. */
if (flags & TH_SYN) {
- tp->snd_nxt = tp->iss;
to.to_mss = tcp_mssopt(&inp->inp_inc);
if (tp->t_port)
to.to_mss -= V_tcp_udp_tunneling_overhead;
@@ -21369,14 +22900,47 @@
/* Timestamps. */
if ((tp->t_flags & TF_RCVD_TSTMP) ||
((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
- to.to_tsval = ms_cts + tp->ts_offset;
+ uint32_t ts_to_use;
+
+ if ((rack->r_rcvpath_rtt_up == 1) &&
+ (ms_cts == rack->r_ctl.last_rcv_tstmp_for_rtt)) {
+ /*
+ * When we are doing a rcv_rtt probe all
+ * other timestamps use the next msec. This
+ * is safe since our previous ack is in the
+ * air and we will just have a few more
+ * on the next ms. This assures that only
+ * the one ack has the ms_cts that was on
+ * our ack-probe.
+ */
+ ts_to_use = ms_cts + 1;
+ } else {
+ ts_to_use = ms_cts;
+ }
+ to.to_tsval = ts_to_use + tp->ts_offset;
to.to_tsecr = tp->ts_recent;
to.to_flags |= TOF_TS;
+ if ((len == 0) &&
+ (TCPS_HAVEESTABLISHED(tp->t_state)) &&
+ ((ms_cts - rack->r_ctl.last_rcv_tstmp_for_rtt) > RCV_PATH_RTT_MS) &&
+ (tp->snd_una == tp->snd_max) &&
+ (flags & TH_ACK) &&
+ (sbavail(sb) == 0) &&
+ (rack->r_ctl.current_round != 0) &&
+ ((flags & (TH_SYN|TH_FIN)) == 0) &&
+ (rack->r_rcvpath_rtt_up == 0)) {
+ rack->r_ctl.last_rcv_tstmp_for_rtt = ms_cts;
+ rack->r_ctl.last_time_of_arm_rcv = cts;
+ rack->r_rcvpath_rtt_up = 1;
+ /* Subtract 1 from seq to force a response */
+ rack_seq--;
+ }
}
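
For context on the probe armed above (an illustration, not code from this patch): the ack that arms the probe is the only segment carrying that exact millisecond timestamp, and its sequence number is pulled back by one so the peer answers immediately. When the answering ack returns, matching its echoed timestamp (with tp->ts_offset removed) against last_rcv_tstmp_for_rtt identifies the probe, and the time elapsed since last_time_of_arm_rcv is the receive-path RTT. A hedged sketch of that consumption side, using only field names from the hunk above:

static inline uint32_t
rcv_path_rtt_sample(struct tcp_rack *rack, uint32_t echoed_ts, uint32_t now)
{
	/* echoed_ts: the peer's TSecr with tp->ts_offset already removed;
	 * now: the same microsecond clock (cts) used when arming above. */
	if ((rack->r_rcvpath_rtt_up == 1) &&
	    (echoed_ts == rack->r_ctl.last_rcv_tstmp_for_rtt))
		return (now - rack->r_ctl.last_time_of_arm_rcv);
	return (0);	/* not the probe's ack */
}
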
/* Set receive buffer autosizing timestamp. */
if (tp->rfbuf_ts == 0 &&
- (so->so_rcv.sb_flags & SB_AUTOSIZE))
- tp->rfbuf_ts = tcp_ts_getticks();
+ (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
+ tp->rfbuf_ts = ms_cts;
+ }
/* Selective ACK's. */
if (tp->t_flags & TF_SACK_PERMIT) {
if (flags & TH_SYN)
@@ -21544,7 +23108,24 @@
(sbused(sb))) {
/*
* We have outstanding data, don't send a fin by itself!.
+ *
+ * Check to see if we need to send a challenge ack.
*/
+ if ((sbused(sb) == (tp->snd_max - tp->snd_una)) &&
+ ((tp->snd_max - tp->snd_una) <= segsiz)) {
+ /*
+ * Ok less than or right at a MSS is
+ * outstanding. The original FreeBSD stack would
+ * have sent a FIN, which can speed things up for
+ * a transactional application doing a MSG_WAITALL.
+ * To speed things up since we do *not* send a FIN
+ * if data is outstanding, we send a "challenge ack".
+ * The idea behind that is instead of having to have
+ * the peer wait for the delayed-ack timer to run off
+ * we send an ack that makes the peer send us an ack.
+ */
+ rack_send_ack_challange(rack);
+ }
goto just_return;
}
/*
@@ -21557,10 +23138,8 @@
uint32_t max_val;
uint32_t moff;
- if (rack->r_ctl.rc_pace_max_segs)
- max_val = rack->r_ctl.rc_pace_max_segs;
- else if (rack->rc_user_set_max_segs)
- max_val = rack->rc_user_set_max_segs * segsiz;
+ if (pace_max_seg)
+ max_val = pace_max_seg;
else
max_val = len;
/*
@@ -21596,16 +23175,28 @@
if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) {
m_copydata(mb, moff, (int)len,
mtod(m, caddr_t)+hdrlen);
- if (SEQ_LT(tp->snd_nxt, tp->snd_max))
+ /*
+ * If we are not retransmitting advance the
+ * sndptr to help remember the next place in
+ * the sb.
+ */
+ if (rsm == NULL)
sbsndptr_adv(sb, mb, len);
m->m_len += len;
} else {
struct sockbuf *msb;
- if (SEQ_LT(tp->snd_nxt, tp->snd_max))
- msb = NULL;
- else
+ /*
+ * If we are not retransmitting pass in msb so
+ * the socket buffer can be advanced. Otherwise
+ * set it to NULL if it's a retransmission since
+ * we don't want to change the sb remembered
+ * location.
+ */
+ if (rsm == NULL)
msb = sb;
+ else
+ msb = NULL;
m->m_next = tcp_m_copym(
mb, moff, &len,
if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb,
@@ -21631,7 +23222,7 @@
goto out;
}
}
- if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
+ if (sack_rxmit) {
if (rsm && (rsm->r_flags & RACK_TLP)) {
/*
* TLP should not count in retran count, but
@@ -21750,14 +23341,6 @@
#endif
}
}
- /*
- * Fill in fields, remembering maximum advertised window for use in
- * delaying messages about window sizes. If resending a FIN, be sure
- * not to use a new sequence number.
- */
- if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
- tp->snd_nxt == tp->snd_max)
- tp->snd_nxt--;
/*
* If we are starting a connection, send ECN setup SYN packet. If we
* are on a retransmit, we may resend those bits a number of times
@@ -21787,29 +23370,7 @@
#endif
}
}
- /*
- * If we are doing retransmissions, then snd_nxt will not reflect
- * the first unsent octet. For ACK only packets, we do not want the
- * sequence number of the retransmitted packet, we want the sequence
- * number of the next unsent octet. So, if there is no data (and no
- * SYN or FIN), use snd_max instead of snd_nxt when filling in
- * ti_seq. But if we are in persist state, snd_max might reflect
- * one byte beyond the right edge of the window, so use snd_nxt in
- * that case, since we know we aren't doing a retransmission.
- * (retransmit and persist are mutually exclusive...)
- */
- if (sack_rxmit == 0) {
- if (len || (flags & (TH_SYN | TH_FIN))) {
- th->th_seq = htonl(tp->snd_nxt);
- rack_seq = tp->snd_nxt;
- } else {
- th->th_seq = htonl(tp->snd_max);
- rack_seq = tp->snd_max;
- }
- } else {
- th->th_seq = htonl(rsm->r_start);
- rack_seq = rsm->r_start;
- }
+ th->th_seq = htonl(rack_seq);
th->th_ack = htonl(tp->rcv_nxt);
tcp_set_flags(th, flags);
/*
@@ -22170,6 +23731,13 @@
rack_to_usec_ts(&tv),
rsm, add_flag, s_mb, s_moff, hw_tls, segsiz);
if (error == 0) {
+ if (add_flag & RACK_IS_PCM) {
+ /* We just launched a PCM */
+ /* rrs here log */
+ rack->pcm_in_progress = 1;
+ rack->pcm_needed = 0;
+ rack_log_pcm(rack, 7, len, rack->r_ctl.pcm_max_seg, add_flag);
+ }
if (rsm == NULL) {
if (rack->lt_bw_up == 0) {
rack->r_ctl.lt_timemark = tcp_tv_to_lusectick(&tv);
@@ -22184,9 +23752,11 @@
rack->r_ctl.lt_bw_bytes += (tp->snd_una - rack->r_ctl.lt_seq);
rack->r_ctl.lt_seq = tp->snd_una;
- tmark = tcp_tv_to_lusectick(&tv);
- rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark);
- rack->r_ctl.lt_timemark = tmark;
+ tmark = tcp_get_u64_usecs(&tv);
+ if (tmark > rack->r_ctl.lt_timemark) {
+ rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark);
+ rack->r_ctl.lt_timemark = tmark;
+ }
}
}
rack->forced_ack = 0; /* If we send something zap the FA flag */
@@ -22256,15 +23826,17 @@
(len > 0) &&
(tp->snd_una == tp->snd_max))
rack->r_ctl.rc_tlp_rxt_last_time = cts;
+
{
- tcp_seq startseq = tp->snd_nxt;
+ /*
+ * This block is not associated with the above error == 0 test.
+ * It is used to advance snd_max if we have a new transmit.
+ */
+ tcp_seq startseq = tp->snd_max;
+
- /* Track our lost count */
if (rsm && (doing_tlp == 0))
rack->r_ctl.rc_loss_count += rsm->r_end - rsm->r_start;
- /*
- * Advance snd_nxt over sequence space of this segment.
- */
if (error)
/* We don't log or do anything with errors */
goto nomore;
@@ -22287,53 +23859,53 @@
rack->rc_tlp_in_progress = 1;
rack->r_ctl.rc_tlp_cnt_out++;
}
- if (flags & (TH_SYN | TH_FIN)) {
- if (flags & TH_SYN)
- tp->snd_nxt++;
- if (flags & TH_FIN) {
- tp->snd_nxt++;
- tp->t_flags |= TF_SENTFIN;
- }
- }
- /* In the ENOBUFS case we do *not* update snd_max */
+ /*
+ * If we are retransmitting we are done, snd_max
+ * does not get updated.
+ */
if (sack_rxmit)
goto nomore;
-
- tp->snd_nxt += len;
- if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
- if (tp->snd_una == tp->snd_max) {
- /*
- * Update the time we just added data since
- * none was outstanding.
- */
- rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
- tp->t_acktime = ticks;
- }
- tp->snd_max = tp->snd_nxt;
- if (rack->rc_new_rnd_needed) {
- /*
- * Update the rnd to start ticking not
- * that from a time perspective all of
- * the preceding idle time is "in the round"
- */
- rack->rc_new_rnd_needed = 0;
- rack->r_ctl.roundends = tp->snd_max;
- }
+ if ((tp->snd_una == tp->snd_max) && (len > 0)) {
/*
- * Time this transmission if not a retransmission and
- * not currently timing anything.
- * This is only relevant in case of switching back to
- * the base stack.
+ * Update the time we just added data since
+ * nothing was outstanding.
*/
- if (tp->t_rtttime == 0) {
- tp->t_rtttime = ticks;
- tp->t_rtseq = startseq;
- KMOD_TCPSTAT_INC(tcps_segstimed);
+ rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
+ tp->t_acktime = ticks;
+ }
+ /*
+ * Now for special SYN/FIN handling.
+ */
+ if (flags & (TH_SYN | TH_FIN)) {
+ if ((flags & TH_SYN) &&
+ ((tp->t_flags & TF_SENTSYN) == 0)) {
+ tp->snd_max++;
+ tp->t_flags |= TF_SENTSYN;
}
- if (len &&
- ((tp->t_flags & TF_GPUTINPROG) == 0))
- rack_start_gp_measurement(tp, rack, startseq, sb_offset);
+ if ((flags & TH_FIN) &&
+ ((tp->t_flags & TF_SENTFIN) == 0)) {
+ tp->snd_max++;
+ tp->t_flags |= TF_SENTFIN;
+ }
+ }
+ tp->snd_max += len;
+ if (rack->rc_new_rnd_needed) {
+ rack_new_round_starts(tp, rack, tp->snd_max);
+ }
+ /*
+ * Time this transmission if not a retransmission and
+ * not currently timing anything.
+ * This is only relevant in case of switching back to
+ * the base stack.
+ */
+ if (tp->t_rtttime == 0) {
+ tp->t_rtttime = ticks;
+ tp->t_rtseq = startseq;
+ KMOD_TCPSTAT_INC(tcps_segstimed);
}
+ if (len &&
+ ((tp->t_flags & TF_GPUTINPROG) == 0))
+ rack_start_gp_measurement(tp, rack, startseq, sb_offset);
/*
* If we are doing FO we need to update the mbuf position and subtract
* this happens when the peer sends us duplicate information and
@@ -22356,6 +23928,47 @@
rack->r_ctl.fsb.o_t_len = M_TRAILINGROOM(rack->r_ctl.fsb.m);
}
}
+ if (rack_pcm_blast == 0) {
+ if ((orig_len > len) &&
+ (add_flag & RACK_IS_PCM) &&
+ (len < pace_max_seg) &&
+ ((pace_max_seg - len) > segsiz)) {
+ /*
+ * We are doing a PCM measurement and we did
+ * not get enough data in the TSO to meet the
+ * burst requirement.
+ */
+ uint32_t n_len;
+
+ n_len = (orig_len - len);
+ orig_len -= len;
+ pace_max_seg -= len;
+ len = n_len;
+ sb_offset = tp->snd_max - tp->snd_una;
+ /* Re-lock for the next spin */
+ SOCKBUF_LOCK(sb);
+ goto send;
+ }
+ } else {
+ if ((orig_len > len) &&
+ (add_flag & RACK_IS_PCM) &&
+ ((orig_len - len) > segsiz)) {
+ /*
+ * We are doing a PCM measurement and we did
+ * not get enough data in the TSO to meet the
+ * burst requirement.
+ */
+ uint32_t n_len;
+
+ n_len = (orig_len - len);
+ orig_len -= len;
+ len = n_len;
+ sb_offset = tp->snd_max - tp->snd_una;
+ /* Re-lock for the next spin */
+ SOCKBUF_LOCK(sb);
+ goto send;
+ }
+ }
}
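
A worked example of the rack_pcm_blast == 0 branch above (illustrative numbers, not from this patch): with segsiz = 1448 and a PCM burst of pace_max_seg = orig_len = 17376 bytes, suppose the first pass only got len = 10136 bytes out. The leftover 17376 - 10136 = 7240 exceeds segsiz, so the code loops back to send with len = 7240 (pace_max_seg and orig_len both reduced by the 10136 already sent), keeping the measurement burst contiguous instead of ending it short.
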
nomore:
if (error) {
@@ -22488,14 +24101,10 @@
enobufs:
if (sendalot) {
/* Do we need to turn off sendalot? */
- if (rack->r_ctl.rc_pace_max_segs &&
- (tot_len_this_send >= rack->r_ctl.rc_pace_max_segs)) {
+ if (pace_max_seg &&
+ (tot_len_this_send >= pace_max_seg)) {
/* We hit our max. */
sendalot = 0;
- } else if ((rack->rc_user_set_max_segs) &&
- (tot_len_this_send >= (rack->rc_user_set_max_segs * segsiz))) {
- /* We hit the user defined max */
- sendalot = 0;
}
}
if ((error == 0) && (flags & TH_FIN))
@@ -22515,22 +24124,7 @@
* hit the else if with slot preset. Other
* errors return.
*/
- slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz);
- }
- if (rsm &&
- (rsm->r_flags & RACK_HAS_SYN) == 0 &&
- rack->use_rack_rr) {
- /* Its a retransmit and we use the rack cheat? */
- if ((slot == 0) ||
- (rack->rc_always_pace == 0) ||
- (rack->r_rr_config == 1)) {
- /*
- * We have no pacing set or we
- * are using old-style rack or
- * we are overridden to use the old 1ms pacing.
- */
- slot = rack->r_ctl.rc_min_to;
- }
+ slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz, __LINE__);
}
/* We have sent clear the flag */
rack->r_ent_rec_ns = 0;
@@ -22568,9 +24162,9 @@
rack_use_rfo &&
((flags & (TH_SYN|TH_FIN)) == 0) &&
(rsm == NULL) &&
- (tp->snd_nxt == tp->snd_max) &&
(ipoptlen == 0) &&
(tp->rcv_numsacks == 0) &&
+ (rack->rc_policer_detected == 0) &&
rack->r_fsb_inited &&
TCPS_HAVEESTABLISHED(tp->t_state) &&
((IN_RECOVERY(tp->t_flags)) == 0) &&
@@ -22599,7 +24193,6 @@
(rsm == NULL) &&
(ipoptlen == 0) &&
(tp->rcv_numsacks == 0) &&
- (tp->snd_nxt == tp->snd_max) &&
(rack->r_must_retran == 0) &&
rack->r_fsb_inited &&
TCPS_HAVEESTABLISHED(tp->t_state) &&
@@ -22625,8 +24218,8 @@
}
goto again;
}
- /* Assure when we leave that snd_nxt will point to top */
skip_all_send:
+ /* Assure when we leave that snd_nxt will point to top */
if (SEQ_GT(tp->snd_max, tp->snd_nxt))
tp->snd_nxt = tp->snd_max;
rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0);
@@ -22705,14 +24298,26 @@
static int
rack_set_dgp(struct tcp_rack *rack)
{
- /* pace_always=1 */
- if (rack->rc_always_pace == 0) {
- if (tcp_can_enable_pacing() == 0)
- return (EBUSY);
+ if (rack->dgp_on == 1)
+ return(0);
+ if ((rack->use_fixed_rate == 1) &&
+ (rack->rc_always_pace == 1)) {
+ /*
+ * We are already pacing another
+ * way.
+ */
+ return (EBUSY);
+ }
+ if (rack->rc_always_pace == 1) {
+ rack_remove_pacing(rack);
}
+ if (tcp_incr_dgp_pacing_cnt() == 0)
+ return (ENOSPC);
+ rack->r_ctl.pacing_method |= RACK_DGP_PACING;
rack->rc_fillcw_apply_discount = 0;
rack->dgp_on = 1;
rack->rc_always_pace = 1;
+ rack->rc_pace_dnd = 1;
rack->use_fixed_rate = 0;
if (rack->gp_ready)
rack_set_cc_pacing(rack);
@@ -22737,14 +24342,7 @@
/* npush=2 */
rack->r_ctl.rc_no_push_at_mrtt = 2;
/* fillcw=1 */
- if (rack->r_cwnd_was_clamped == 0) {
- rack->rc_pace_to_cwnd = 1;
- } else {
- rack->rc_pace_to_cwnd = 0;
- /* Reset all multipliers to 100.0 so just the measured bw */
- rack->r_ctl.rack_per_of_gp_ss = 100;
- rack->r_ctl.rack_per_of_gp_ca = 100;
- }
+ rack->rc_pace_to_cwnd = 1;
rack->rc_pace_fill_if_rttin_range = 0;
rack->rtt_limit_mul = 0;
/* noprr=1 */
@@ -22753,12 +24351,9 @@
rack->r_limit_scw = 1;
/* gp_inc_rec */
rack->r_ctl.rack_per_of_gp_rec = 90;
- rack_client_buffer_level_set(rack);
return (0);
}
-
-
static int
rack_set_profile(struct tcp_rack *rack, int prof)
{
@@ -22768,72 +24363,37 @@
* Profile 1 is "standard" DGP. It ignores
* client buffer level.
*/
- rack->r_ctl.rc_dgp_bl_agg = DGP_LEVEL0;
err = rack_set_dgp(rack);
if (err)
return (err);
- } else if (prof == 2) {
- /*
- * Profile 2 is DGP. Less aggressive with
- * respect to client buffer level.
- */
- rack->r_ctl.rc_dgp_bl_agg = DGP_LEVEL1;
+ } else if (prof == 6) {
err = rack_set_dgp(rack);
if (err)
return (err);
- } else if (prof == 3) {
/*
- * Profile 3 is DGP. Even Less aggressive with
- * respect to client buffer level.
- */
- rack->r_ctl.rc_dgp_bl_agg = DGP_LEVEL2;
- err = rack_set_dgp(rack);
- if (err)
- return (err);
- } else if (prof == 4) {
- /*
- * Profile 4 is DGP with the most responsiveness
- * to client buffer level.
- */
- rack->r_ctl.rc_dgp_bl_agg = DGP_LEVEL3;
- err = rack_set_dgp(rack);
- if (err)
- return (err);
- } else if (prof == 5) {
- err = rack_set_dgp(rack);
- if (err)
- return (err);
- /*
- * By turning DGP off we change the rate
- * picked to be only the one the cwnd and rtt
- * get us.
- */
- rack->dgp_on = 0;
- } else if (prof == 6) {
- err = rack_set_dgp(rack);
- if (err)
- return (err);
- /*
- * Profile 6 tweaks DGP so that it will apply to
- * fill-cw the same settings that profile5 does
- * to replace DGP. It gets then the max(dgp-rate, fillcw(discounted).
+ * Profile 6 tweaks DGP so that it will apply to
+ * fill-cw the same settings that profile 5 does
+ * to replace DGP. It then gets max(dgp-rate, fillcw(discounted)).
*/
rack->rc_fillcw_apply_discount = 1;
} else if (prof == 0) {
/* This changes things back to the default settings */
- rack->dgp_on = 0;
- rack->rc_hybrid_mode = 0;
+ if (rack->rc_always_pace == 1) {
+ rack_remove_pacing(rack);
+ } else {
+ /* Make sure any stray flags are off */
+ rack->dgp_on = 0;
+ rack->rc_hybrid_mode = 0;
+ rack->use_fixed_rate = 0;
+ }
err = 0;
if (rack_fill_cw_state)
rack->rc_pace_to_cwnd = 1;
else
rack->rc_pace_to_cwnd = 0;
- if (rack->rc_always_pace) {
- tcp_decrement_paced_conn();
- rack_undo_cc_pacing(rack);
- rack->rc_always_pace = 0;
- }
+
if (rack_pace_every_seg && tcp_can_enable_pacing()) {
+ rack->r_ctl.pacing_method |= RACK_REG_PACING;
rack->rc_always_pace = 1;
if (rack->rack_hibeta)
rack_set_cc_pacing(rack);
@@ -22883,7 +24443,6 @@
}
rack->r_rr_config = 0;
rack->r_ctl.rc_no_push_at_mrtt = 0;
- rack->rc_pace_to_cwnd = 0;
rack->rc_pace_fill_if_rttin_range = 0;
rack->rtt_limit_mul = 0;
@@ -22911,7 +24470,7 @@
struct deferred_opt_list *dol;
dol = malloc(sizeof(struct deferred_opt_list),
- M_TCPFSB, M_NOWAIT|M_ZERO);
+ M_TCPDO, M_NOWAIT|M_ZERO);
if (dol == NULL) {
/*
* No space yikes -- fail out..
@@ -22935,19 +24494,6 @@
microuptime(&tv);
- /*
- * If BB logging is not on we need to look at the DTL flag.
- * If its on already then those reasons override the DTL input.
- * We do this with any request, you can turn DTL on, but it does
- * not turn off at least from hybrid pacing requests.
- */
- if (tcp_bblogging_on(rack->rc_tp) == 0) {
- if (hybrid->hybrid_flags & TCP_HYBRID_PACING_DTL) {
- /* Turn on BB point logging */
- tcp_set_bblog_state(rack->rc_tp, TCP_LOG_VIA_BBPOINTS,
- TCP_BBPOINT_REQ_LEVEL_LOGGING);
- }
- }
/* Make sure no fixed rate is on */
rack->use_fixed_rate = 0;
rack->r_ctl.rc_fixed_pacing_rate_rec = 0;
@@ -22962,6 +24508,8 @@
rack_log_hybrid(rack, seq, NULL, HYBRID_LOG_NO_ROOM, __LINE__, 0);
return (ENOSPC);
}
+ /* mask our internal flags */
+ hybrid->hybrid_flags &= TCP_HYBRID_PACING_USER_MASK;
/* The seq will be snd_una + everything in the buffer */
seq = sft->start_seq;
if ((hybrid->hybrid_flags & TCP_HYBRID_PACING_ENABLE) == 0) {
@@ -22986,6 +24534,26 @@
return (err);
}
}
+ /*
+ * Now we must switch to hybrid mode as well which also
+ * means moving to regular pacing.
+ */
+ if (rack->rc_hybrid_mode == 0) {
+ /* First time */
+ if (tcp_can_enable_pacing()) {
+ rack->r_ctl.pacing_method |= RACK_REG_PACING;
+ rack->rc_hybrid_mode = 1;
+ } else {
+ return (ENOSPC);
+ }
+ if (rack->r_ctl.pacing_method & RACK_DGP_PACING) {
+ /*
+ * This should be true.
+ */
+ tcp_dec_dgp_pacing_cnt();
+ rack->r_ctl.pacing_method &= ~RACK_DGP_PACING;
+ }
+ }
/* Now set in our flags */
sft->hybrid_flags = hybrid->hybrid_flags | TCP_HYBRID_PACING_WASSET;
if (hybrid->hybrid_flags & TCP_HYBRID_PACING_CSPR)
@@ -22996,7 +24564,6 @@
sft->hint_maxseg = hybrid->hint_maxseg;
else
sft->hint_maxseg = 0;
- rack->rc_hybrid_mode = 1;
rack->rc_tp->tcp_hybrid_start++;
rack_log_hybrid(rack, seq, sft, HYBRID_LOG_RULES_SET, __LINE__,0);
return (0);
@@ -23005,6 +24572,36 @@
#endif
}
+static int
+rack_stack_information(struct tcpcb *tp, struct stack_specific_info *si)
+{
+ /*
+ * Gather rack specific information.
+ */
+ struct tcp_rack *rack;
+
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+ /* Log a SSI info point recording what was there */
+ policer_detection_log(rack, rack->rc_highly_buffered, 0, 0, 0, 20);
+ if (rack->policer_detect_on) {
+ si->policer_detection_enabled = 1;
+ if (rack->rc_policer_detected) {
+ si->policer_detected = 1;
+ si->policer_bucket_size = rack->r_ctl.policer_bucket_size;
+ si->policer_last_bw = rack->r_ctl.policer_bw;
+ } else {
+ si->policer_detected = 0;
+ si->policer_bucket_size = 0;
+ si->policer_last_bw = 0;
+ }
+ si->current_round = rack->r_ctl.current_round;
+ si->highly_buffered = rack->rc_highly_buffered;
+ }
+ si->bytes_transmitted = tp->t_sndbytes;
+ si->bytes_retransmitted = tp->t_snd_rxt_bytes;
+ return (0);
+}
+
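
As an illustration of how the new hook might be consumed (a sketch under assumptions, not part of this patch; only the struct and field names used by rack_stack_information() above are taken from it):

static void
example_log_stack_info(struct tcpcb *tp)
{
	struct stack_specific_info si;

	memset(&si, 0, sizeof(si));
	if ((tp->t_fb->tfb_stack_info != NULL) &&
	    (tp->t_fb->tfb_stack_info(tp, &si) == 0) &&
	    si.policer_detection_enabled && si.policer_detected)
		printf("policer: bw %ju bucket %u round %u\n",
		    (uintmax_t)si.policer_last_bw,
		    si.policer_bucket_size, si.current_round);
}
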
static int
rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name,
uint32_t optval, uint64_t loptval, struct tcp_hybrid_req *hybrid)
@@ -23077,34 +24674,7 @@
}
break;
case TCP_RACK_PACING_BETA:
- RACK_OPTS_INC(tcp_rack_beta);
- if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) {
- /* This only works for newreno. */
- error = EINVAL;
- break;
- }
- if (rack->rc_pacing_cc_set) {
- /*
- * Set them into the real CC module
- * whats in the rack pcb is the old values
- * to be used on restoral/
- */
- sopt.sopt_dir = SOPT_SET;
- opt.name = CC_NEWRENO_BETA;
- opt.val = optval;
- if (CC_ALGO(tp)->ctl_output != NULL)
- error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt);
- else {
- error = ENOENT;
- break;
- }
- } else {
- /*
- * Not pacing yet so set it into our local
- * rack pcb storage.
- */
- rack->r_ctl.rc_saved_beta.beta = optval;
- }
+ error = EINVAL;
break;
case TCP_RACK_TIMER_SLOP:
RACK_OPTS_INC(tcp_rack_timer_slop);
@@ -23188,8 +24758,29 @@
else
rack->r_up_only = 0;
break;
+ case TCP_FILLCW_RATE_CAP: /* URL:fillcw_cap */
+ RACK_OPTS_INC(tcp_fillcw_rate_cap);
+ rack->r_ctl.fillcw_cap = loptval;
+ break;
case TCP_PACING_RATE_CAP:
RACK_OPTS_INC(tcp_pacing_rate_cap);
+ if ((rack->dgp_on == 1) &&
+ (rack->r_ctl.pacing_method & RACK_DGP_PACING)) {
+ /*
+ * If we are doing DGP we need to switch
+ * to using the pacing limit.
+ */
+ if (tcp_can_enable_pacing() == 0) {
+ error = ENOSPC;
+ break;
+ }
+ /*
+ * Now change up the flags and counts to be correct.
+ */
+ rack->r_ctl.pacing_method |= RACK_REG_PACING;
+ tcp_dec_dgp_pacing_cnt();
+ rack->r_ctl.pacing_method &= ~RACK_DGP_PACING;
+ }
rack->r_ctl.bw_rate_cap = loptval;
break;
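
The same hand-off pattern (claim a regular pacing slot, then give back the DGP one) is open-coded here and again under TCP_RACK_PACE_MAX_SEG below. Purely as an illustration (not a helper this patch adds), it amounts to:

static int
example_dgp_to_reg_pacing(struct tcp_rack *rack)
{
	if (tcp_can_enable_pacing() == 0)
		return (ENOSPC);
	rack->r_ctl.pacing_method |= RACK_REG_PACING;
	tcp_dec_dgp_pacing_cnt();
	rack->r_ctl.pacing_method &= ~RACK_DGP_PACING;
	return (0);
}
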
case TCP_HYBRID_PACING:
@@ -23197,8 +24788,18 @@
error = EINVAL;
break;
}
+ if (rack->r_ctl.side_chan_dis_mask & HYBRID_DIS_MASK) {
+ error = EPERM;
+ break;
+ }
error = process_hybrid_pacing(rack, hybrid);
break;
+ case TCP_SIDECHAN_DIS: /* URL:scodm */
+ if (optval)
+ rack->r_ctl.side_chan_dis_mask = optval;
+ else
+ rack->r_ctl.side_chan_dis_mask = 0;
+ break;
case TCP_RACK_PROFILE:
RACK_OPTS_INC(tcp_profile);
error = rack_set_profile(rack, optval);
@@ -23224,15 +24825,37 @@
rack->r_limit_scw = 0;
break;
case TCP_RACK_DGP_IN_REC:
- RACK_OPTS_INC(tcp_dgp_in_rec);
- if (optval)
- rack->r_ctl.full_dgp_in_rec = 1;
- else
- rack->r_ctl.full_dgp_in_rec = 0;
+ error = EINVAL;
+ break;
+ case TCP_POLICER_DETECT: /* URL:pol_det */
+ RACK_OPTS_INC(tcp_pol_detect);
+ rack_translate_policer_detect(rack, optval);
break;
- case TCP_RXT_CLAMP:
- RACK_OPTS_INC(tcp_rxt_clamp);
- rack_translate_clamp_value(rack, optval);
+ case TCP_POLICER_MSS:
+ RACK_OPTS_INC(tcp_pol_mss);
+ rack->r_ctl.policer_del_mss = (uint8_t)optval;
+ if (optval & 0x00000100) {
+ /*
+ * Value is set up like so:
+ * VVVV VVVV VVVV VVVV VVVV VVAI MMMM MMMM
+ * Where MMMM MMMM is the MSS setting and
+ * I (the 9th bit) is the indicator that
+ * says it is being set (if it is 0 then the
+ * upper bits 11 - 32 have no meaning).
+ * This allows setting it off with
+ * 0x000001MM.
+ *
+ * The 10th bit is used to turn on the
+ * alternate median (not the expanded one).
+ *
+ */
+ rack->r_ctl.pol_bw_comp = (optval >> 10);
+ }
+ if (optval & 0x00000200) {
+ rack->r_ctl.policer_alt_median = 1;
+ } else {
+ rack->r_ctl.policer_alt_median = 0;
+ }
break;
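
A worked example of the TCP_POLICER_MSS encoding above (illustrative only, not part of this patch):

/* MSS multiple of 4, I bit set, alternate median on, V (bw comp) = 5 */
uint32_t optval = (5 << 10) | 0x200 | 0x100 | 4;	/* == 0x1704 */
/* The case above then stores policer_del_mss = 4, pol_bw_comp = 5 and
 * policer_alt_median = 1; 0x00000104 would set only the MSS multiple. */
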
case TCP_RACK_PACE_TO_FILL:
RACK_OPTS_INC(tcp_fillcw);
@@ -23240,8 +24863,6 @@
rack->rc_pace_to_cwnd = 0;
else {
rack->rc_pace_to_cwnd = 1;
- if (optval > 1)
- rack->r_fill_less_agg = 1;
}
if ((optval >= rack_gp_rtt_maxmul) &&
rack_gp_rtt_maxmul &&
@@ -23299,6 +24920,12 @@
else
error = EINVAL;
break;
+ case RACK_CSPR_IS_FCC: /* URL:csprisfcc */
+ if (optval > 0)
+ rack->cspr_is_fcc = 1;
+ else
+ rack->cspr_is_fcc = 0;
+ break;
case TCP_TIMELY_DYN_ADJ:
RACK_OPTS_INC(tcp_timely_dyn);
if (optval == 0)
@@ -23341,11 +24968,16 @@
* method using a pacing rate.
*/
RACK_OPTS_INC(tcp_rack_pace_always);
+ if (rack->r_ctl.side_chan_dis_mask & CCSP_DIS_MASK) {
+ error = EPERM;
+ break;
+ }
if (optval > 0) {
if (rack->rc_always_pace) {
error = EALREADY;
break;
} else if (tcp_can_enable_pacing()) {
+ rack->r_ctl.pacing_method |= RACK_REG_PACING;
rack->rc_always_pace = 1;
if (rack->rack_hibeta)
rack_set_cc_pacing(rack);
@@ -23355,10 +24987,8 @@
break;
}
} else {
- if (rack->rc_always_pace) {
- tcp_decrement_paced_conn();
- rack->rc_always_pace = 0;
- rack_undo_cc_pacing(rack);
+ if (rack->rc_always_pace == 1) {
+ rack_remove_pacing(rack);
}
}
if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack)
@@ -23375,58 +25005,11 @@
val *= 1000;
val /= 8;
rack->r_ctl.init_rate = val;
- if (rack->rc_init_win != rack_default_init_window) {
- uint32_t win, snt;
-
- /*
- * Options don't always get applied
- * in the order you think. So in order
- * to assure we update a cwnd we need
- * to check and see if we are still
- * where we should raise the cwnd.
- */
- win = rc_init_window(rack);
- if (SEQ_GT(tp->snd_max, tp->iss))
- snt = tp->snd_max - tp->iss;
- else
- snt = 0;
- if ((snt < win) &&
- (tp->snd_cwnd < win))
- tp->snd_cwnd = win;
- }
if (rack->rc_always_pace)
rack_update_seg(rack);
break;
case TCP_BBR_IWINTSO:
- RACK_OPTS_INC(tcp_initial_win);
- if (optval && (optval <= 0xff)) {
- uint32_t win, snt;
-
- rack->rc_init_win = optval;
- win = rc_init_window(rack);
- if (SEQ_GT(tp->snd_max, tp->iss))
- snt = tp->snd_max - tp->iss;
- else
- snt = 0;
- if ((snt < win) &&
- (tp->t_srtt |
- rack->r_ctl.init_rate)) {
- /*
- * We are not past the initial window
- * and we have some bases for pacing,
- * so we need to possibly adjust up
- * the cwnd. Note even if we don't set
- * the cwnd, its still ok to raise the rc_init_win
- * which can be used coming out of idle when we
- * would have a rate.
- */
- if (tp->snd_cwnd < win)
- tp->snd_cwnd = win;
- }
- if (rack->rc_always_pace)
- rack_update_seg(rack);
- } else
- error = EINVAL;
+ error = EINVAL;
break;
case TCP_RACK_FORCE_MSEG:
RACK_OPTS_INC(tcp_rack_force_max_seg);
@@ -23443,6 +25026,24 @@
case TCP_RACK_PACE_MAX_SEG:
/* Max segments size in a pace in bytes */
RACK_OPTS_INC(tcp_rack_max_seg);
+ if ((rack->dgp_on == 1) &&
+ (rack->r_ctl.pacing_method & RACK_DGP_PACING)) {
+ /*
+ * If we set a max-seg and are doing DGP then
+ * we now fall under the pacing limits not the
+ * DGP ones.
+ */
+ if (tcp_can_enable_pacing() == 0) {
+ error = ENOSPC;
+ break;
+ }
+ /*
+ * Now change up the flags and counts to be correct.
+ */
+ rack->r_ctl.pacing_method |= RACK_REG_PACING;
+ tcp_dec_dgp_pacing_cnt();
+ rack->r_ctl.pacing_method &= ~RACK_DGP_PACING;
+ }
if (optval <= MAX_USER_SET_SEG)
rack->rc_user_set_max_segs = optval;
else
@@ -23452,6 +25053,18 @@
case TCP_RACK_PACE_RATE_REC:
/* Set the fixed pacing rate in Bytes per second ca */
RACK_OPTS_INC(tcp_rack_pace_rate_rec);
+ if (rack->r_ctl.side_chan_dis_mask & CCSP_DIS_MASK) {
+ error = EPERM;
+ break;
+ }
+ if (rack->dgp_on) {
+ /*
+ * We are already pacing another
+ * way.
+ */
+ error = EBUSY;
+ break;
+ }
rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0)
rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
@@ -23470,6 +25083,18 @@
case TCP_RACK_PACE_RATE_SS:
/* Set the fixed pacing rate in Bytes per second ca */
RACK_OPTS_INC(tcp_rack_pace_rate_ss);
+ if (rack->r_ctl.side_chan_dis_mask & CCSP_DIS_MASK) {
+ error = EPERM;
+ break;
+ }
+ if (rack->dgp_on) {
+ /*
+ * We are already pacing another
+ * way.
+ */
+ error = EBUSY;
+ break;
+ }
rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0)
rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
@@ -23488,6 +25113,18 @@
case TCP_RACK_PACE_RATE_CA:
/* Set the fixed pacing rate in Bytes per second ca */
RACK_OPTS_INC(tcp_rack_pace_rate_ca);
+ if (rack->r_ctl.side_chan_dis_mask & CCSP_DIS_MASK) {
+ error = EPERM;
+ break;
+ }
+ if (rack->dgp_on) {
+ /*
+ * We are already pacing another
+ * way.
+ */
+ error = EBUSY;
+ break;
+ }
rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0)
rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
@@ -23571,6 +25208,41 @@
rack->r_rack_hw_rate_caps = 0;
}
break;
+ case TCP_DGP_UPPER_BOUNDS:
+ {
+ uint8_t val;
+ val = optval & 0x0000ff;
+ rack->r_ctl.rack_per_upper_bound_ca = val;
+ val = (optval >> 16) & 0x0000ff;
+ rack->r_ctl.rack_per_upper_bound_ss = val;
+ break;
+ }
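
A worked example of the packing above (illustrative, not from this patch): the CA bound rides in the low byte and the SS bound in bits 16-23, matching the getsockopt side later in this file.

uint32_t optval = (80 << 16) | 70;
/* -> rack_per_upper_bound_ss = 80, rack_per_upper_bound_ca = 70 */
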
+ case TCP_SS_EEXIT: /* URL:eexit */
+ if (optval > 0) {
+ rack->r_ctl.gp_rnd_thresh = optval & 0x0ff;
+ if (optval & 0x10000) {
+ rack->r_ctl.gate_to_fs = 1;
+ } else {
+ rack->r_ctl.gate_to_fs = 0;
+ }
+ if (optval & 0x20000) {
+ rack->r_ctl.use_gp_not_last = 1;
+ } else {
+ rack->r_ctl.use_gp_not_last = 0;
+ }
+ if (optval & 0xfffc0000) {
+ uint32_t v;
+
+ v = (optval >> 18) & 0x00003fff;
+ if (v >= 1000)
+ rack->r_ctl.gp_gain_req = v;
+ }
+ } else {
+ /* We do not do ss early exit at all */
+ rack->rc_initial_ss_comp = 1;
+ rack->r_ctl.gp_rnd_thresh = 0;
+ }
+ break;
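
A worked example of the TCP_SS_EEXIT encoding above (illustrative, not from this patch):

/* 5-round threshold, gate-to-fs on, gain requirement 1200
 * (values below 1000 in bits 18 and up are ignored by the code above) */
uint32_t optval = (1200 << 18) | 0x10000 | 5;
/* -> gp_rnd_thresh = 5, gate_to_fs = 1, use_gp_not_last = 0,
 *    gp_gain_req = 1200; optval = 0 disables early exit entirely */
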
case TCP_RACK_SPLIT_LIMIT:
RACK_OPTS_INC(tcp_split_limit);
rack->r_ctl.rc_split_limit = optval;
@@ -23681,6 +25353,50 @@
else
rack->r_ctl.rc_rate_sample_method = optval;
break;
+ case TCP_HONOR_HPTS_MIN:
+ RACK_OPTS_INC(tcp_honor_hpts);
+ if (optval) {
+ rack->r_use_hpts_min = 1;
+ /*
+ * Must be between 2 and 80% to be a reduction, else
+ * we keep the default (10%).
+ */
+ if ((optval > 1) && (optval <= 80)) {
+ rack->r_ctl.max_reduction = optval;
+ }
+ } else
+ rack->r_use_hpts_min = 0;
+ break;
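
Illustrative values for the option above (not from this patch): optval = 30 enables honoring the hpts min with a 30% max reduction, optval = 1 enables it but keeps the default 10% reduction, and optval = 0 turns it off.
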
+ case TCP_REC_IS_DYN: /* URL:dynrec */
+ RACK_OPTS_INC(tcp_dyn_rec);
+ if (optval)
+ rack->rc_gp_no_rec_chg = 1;
+ else
+ rack->rc_gp_no_rec_chg = 0;
+ break;
+ case TCP_NO_TIMELY:
+ RACK_OPTS_INC(tcp_notimely);
+ if (optval) {
+ rack->rc_skip_timely = 1;
+ rack->r_ctl.rack_per_of_gp_rec = 90;
+ rack->r_ctl.rack_per_of_gp_ca = 100;
+ rack->r_ctl.rack_per_of_gp_ss = 250;
+ } else {
+ rack->rc_skip_timely = 0;
+ }
+ break;
+ case TCP_GP_USE_LTBW:
+ if (optval == 0) {
+ rack->use_lesser_lt_bw = 0;
+ rack->dis_lt_bw = 1;
+ } else if (optval == 1) {
+ rack->use_lesser_lt_bw = 1;
+ rack->dis_lt_bw = 0;
+ } else if (optval == 2) {
+ rack->use_lesser_lt_bw = 0;
+ rack->dis_lt_bw = 0;
+ }
+ break;
case TCP_DATA_AFTER_CLOSE:
RACK_OPTS_INC(tcp_data_after_close);
if (optval)
@@ -23695,6 +25411,431 @@
return (error);
}
+static void
+rack_inherit(struct tcpcb *tp, struct inpcb *parent)
+{
+ /*
+ * A new connection has been created (tp) and
+ * the parent is the inpcb given. We want to
+ * apply a read-lock to the parent (we are already
+ * holding a write lock on the tp) and copy anything
+ * out of the rack specific data as long as its tfb is
+ * the same as ours i.e. we are the same stack. Otherwise
+ * we just return.
+ */
+ struct tcpcb *par;
+ struct tcp_rack *dest, *src;
+ int cnt = 0;
+
+ par = intotcpcb(parent);
+ if (par->t_fb != tp->t_fb) {
+ /* Not the same stack */
+ tcp_log_socket_option(tp, 0, 0, 1);
+ return;
+ }
+ /* Ok if we reach here lets setup the two rack pointers */
+ dest = (struct tcp_rack *)tp->t_fb_ptr;
+ src = (struct tcp_rack *)par->t_fb_ptr;
+ if ((src == NULL) || (dest == NULL)) {
+ /* Huh? */
+ tcp_log_socket_option(tp, 0, 0, 2);
+ return;
+ }
+ /* Now copy out anything we wish to inherit i.e. things in socket-options */
+ /* TCP_RACK_PROFILE we can't know, but we can set DGP if it's on */
+ if ((src->dgp_on) && (dest->dgp_on == 0)) {
+ /* Profile 1 had to be set via sock opt */
+ rack_set_dgp(dest);
+ cnt++;
+ }
+ /* TCP_RACK_SET_RXT_OPTIONS */
+ if (dest->full_size_rxt != src->full_size_rxt) {
+ dest->full_size_rxt = src->full_size_rxt;
+ cnt++;
+ }
+ if (dest->shape_rxt_to_pacing_min != src->shape_rxt_to_pacing_min) {
+ dest->shape_rxt_to_pacing_min = src->shape_rxt_to_pacing_min;
+ cnt++;
+ }
+ /* TCP_RACK_DSACK_OPT */
+ if (dest->rc_rack_tmr_std_based != src->rc_rack_tmr_std_based) {
+ dest->rc_rack_tmr_std_based = src->rc_rack_tmr_std_based;
+ cnt++;
+ }
+ if (dest->rc_rack_use_dsack != src->rc_rack_use_dsack) {
+ dest->rc_rack_use_dsack = src->rc_rack_use_dsack;
+ cnt++;
+ }
+ /* TCP_RACK_PACING_DIVISOR */
+ if (dest->r_ctl.pace_len_divisor != src->r_ctl.pace_len_divisor) {
+ dest->r_ctl.pace_len_divisor = src->r_ctl.pace_len_divisor;
+ cnt++;
+ }
+ /* TCP_RACK_HI_BETA */
+ if (src->rack_hibeta != dest->rack_hibeta) {
+ cnt++;
+ if (src->rack_hibeta) {
+ dest->r_ctl.rc_saved_beta.beta = src->r_ctl.rc_saved_beta.beta;
+ dest->rack_hibeta = 1;
+ } else {
+ dest->rack_hibeta = 0;
+ }
+ }
+ /* TCP_RACK_TIMER_SLOP */
+ if (dest->r_ctl.timer_slop != src->r_ctl.timer_slop) {
+ dest->r_ctl.timer_slop = src->r_ctl.timer_slop;
+ cnt++;
+ }
+ /* TCP_RACK_PACING_BETA_ECN */
+ if (dest->r_ctl.rc_saved_beta.beta_ecn != src->r_ctl.rc_saved_beta.beta_ecn) {
+ dest->r_ctl.rc_saved_beta.beta_ecn = src->r_ctl.rc_saved_beta.beta_ecn;
+ cnt++;
+ }
+ if (dest->r_ctl.rc_saved_beta.newreno_flags != src->r_ctl.rc_saved_beta.newreno_flags) {
+ dest->r_ctl.rc_saved_beta.newreno_flags = src->r_ctl.rc_saved_beta.newreno_flags;
+ cnt++;
+ }
+ /* We do not do TCP_DEFER_OPTIONS */
+ /* TCP_RACK_MEASURE_CNT */
+ if (dest->r_ctl.req_measurements != src->r_ctl.req_measurements) {
+ dest->r_ctl.req_measurements = src->r_ctl.req_measurements;
+ cnt++;
+ }
+ /* TCP_HDWR_UP_ONLY */
+ if (dest->r_up_only != src->r_up_only) {
+ dest->r_up_only = src->r_up_only;
+ cnt++;
+ }
+ /* TCP_FILLCW_RATE_CAP */
+ if (dest->r_ctl.fillcw_cap != src->r_ctl.fillcw_cap) {
+ dest->r_ctl.fillcw_cap = src->r_ctl.fillcw_cap;
+ cnt++;
+ }
+ /* TCP_PACING_RATE_CAP */
+ if (dest->r_ctl.bw_rate_cap != src->r_ctl.bw_rate_cap) {
+ dest->r_ctl.bw_rate_cap = src->r_ctl.bw_rate_cap;
+ cnt++;
+ }
+ /* A listener can't set TCP_HYBRID_PACING */
+ /* TCP_SIDECHAN_DIS */
+ if (dest->r_ctl.side_chan_dis_mask != src->r_ctl.side_chan_dis_mask) {
+ dest->r_ctl.side_chan_dis_mask = src->r_ctl.side_chan_dis_mask;
+ cnt++;
+ }
+ /* TCP_SHARED_CWND_TIME_LIMIT */
+ if (dest->r_limit_scw != src->r_limit_scw) {
+ dest->r_limit_scw = src->r_limit_scw;
+ cnt++;
+ }
+ /* TCP_POLICER_DETECT */
+ if (dest->r_ctl.policer_rxt_threshold != src->r_ctl.policer_rxt_threshold) {
+ dest->r_ctl.policer_rxt_threshold = src->r_ctl.policer_rxt_threshold;
+ cnt++;
+ }
+ if (dest->r_ctl.policer_avg_threshold != src->r_ctl.policer_avg_threshold) {
+ dest->r_ctl.policer_avg_threshold = src->r_ctl.policer_avg_threshold;
+ cnt++;
+ }
+ if (dest->r_ctl.policer_med_threshold != src->r_ctl.policer_med_threshold) {
+ dest->r_ctl.policer_med_threshold = src->r_ctl.policer_med_threshold;
+ cnt++;
+ }
+ if (dest->policer_detect_on != src->policer_detect_on) {
+ dest->policer_detect_on = src->policer_detect_on;
+ cnt++;
+ }
+
+ if (dest->r_ctl.saved_policer_val != src->r_ctl.saved_policer_val) {
+ dest->r_ctl.saved_policer_val = src->r_ctl.saved_policer_val;
+ cnt++;
+ }
+ /* TCP_POLICER_MSS */
+ if (dest->r_ctl.policer_del_mss != src->r_ctl.policer_del_mss) {
+ dest->r_ctl.policer_del_mss = src->r_ctl.policer_del_mss;
+ cnt++;
+ }
+
+ if (dest->r_ctl.pol_bw_comp != src->r_ctl.pol_bw_comp) {
+ dest->r_ctl.pol_bw_comp = src->r_ctl.pol_bw_comp;
+ cnt++;
+ }
+
+ if (dest->r_ctl.policer_alt_median != src->r_ctl.policer_alt_median) {
+ dest->r_ctl.policer_alt_median = src->r_ctl.policer_alt_median;
+ cnt++;
+ }
+ /* TCP_RACK_PACE_TO_FILL */
+ if (dest->rc_pace_to_cwnd != src->rc_pace_to_cwnd) {
+ dest->rc_pace_to_cwnd = src->rc_pace_to_cwnd;
+ cnt++;
+ }
+ if (dest->rc_pace_fill_if_rttin_range != src->rc_pace_fill_if_rttin_range) {
+ dest->rc_pace_fill_if_rttin_range = src->rc_pace_fill_if_rttin_range;
+ cnt++;
+ }
+ if (dest->rtt_limit_mul != src->rtt_limit_mul) {
+ dest->rtt_limit_mul = src->rtt_limit_mul;
+ cnt++;
+ }
+ /* TCP_RACK_NO_PUSH_AT_MAX */
+ if (dest->r_ctl.rc_no_push_at_mrtt != src->r_ctl.rc_no_push_at_mrtt) {
+ dest->r_ctl.rc_no_push_at_mrtt = src->r_ctl.rc_no_push_at_mrtt;
+ cnt++;
+ }
+ /* TCP_SHARED_CWND_ENABLE */
+ if (dest->rack_enable_scwnd != src->rack_enable_scwnd) {
+ dest->rack_enable_scwnd = src->rack_enable_scwnd;
+ cnt++;
+ }
+ /* TCP_USE_CMP_ACKS */
+ if (dest->r_use_cmp_ack != src->r_use_cmp_ack) {
+ dest->r_use_cmp_ack = src->r_use_cmp_ack;
+ cnt++;
+ }
+
+ if (dest->r_mbuf_queue != src->r_mbuf_queue) {
+ dest->r_mbuf_queue = src->r_mbuf_queue;
+ cnt++;
+ }
+ /* TCP_RACK_MBUF_QUEUE */
+ if (dest->r_mbuf_queue != src->r_mbuf_queue) {
+ dest->r_mbuf_queue = src->r_mbuf_queue;
+ cnt++;
+ }
+ if (dest->r_mbuf_queue || dest->rc_always_pace || dest->r_use_cmp_ack) {
+ tp->t_flags2 |= TF2_SUPPORTS_MBUFQ;
+ } else {
+ tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ;
+ }
+ if (dest->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) {
+ tp->t_flags2 |= TF2_MBUF_ACKCMP;
+ }
+ /* TCP_RACK_NONRXT_CFG_RATE */
+ if (dest->rack_rec_nonrxt_use_cr != src->rack_rec_nonrxt_use_cr) {
+ dest->rack_rec_nonrxt_use_cr = src->rack_rec_nonrxt_use_cr;
+ cnt++;
+ }
+ /* TCP_NO_PRR */
+ if (dest->rack_no_prr != src->rack_no_prr) {
+ dest->rack_no_prr = src->rack_no_prr;
+ cnt++;
+ }
+ if (dest->no_prr_addback != src->no_prr_addback) {
+ dest->no_prr_addback = src->no_prr_addback;
+ cnt++;
+ }
+ /* RACK_CSPR_IS_FCC */
+ if (dest->cspr_is_fcc != src->cspr_is_fcc) {
+ dest->cspr_is_fcc = src->cspr_is_fcc;
+ cnt++;
+ }
+ /* TCP_TIMELY_DYN_ADJ */
+ if (dest->rc_gp_dyn_mul != src->rc_gp_dyn_mul) {
+ dest->rc_gp_dyn_mul = src->rc_gp_dyn_mul;
+ cnt++;
+ }
+ if (dest->r_ctl.rack_per_of_gp_ca != src->r_ctl.rack_per_of_gp_ca) {
+ dest->r_ctl.rack_per_of_gp_ca = src->r_ctl.rack_per_of_gp_ca;
+ cnt++;
+ }
+ /* TCP_RACK_DO_DETECTION */
+ if (dest->do_detection != src->do_detection) {
+ dest->do_detection = src->do_detection;
+ cnt++;
+ }
+ /* TCP_RACK_TLP_USE */
+ if (dest->rack_tlp_threshold_use != src->rack_tlp_threshold_use) {
+ dest->rack_tlp_threshold_use = src->rack_tlp_threshold_use;
+ cnt++;
+ }
+ /* we don't allow inheritance of TCP_RACK_PACE_ALWAYS */
+ /* TCP_BBR_RACK_INIT_RATE */
+ if (dest->r_ctl.init_rate != src->r_ctl.init_rate) {
+ dest->r_ctl.init_rate = src->r_ctl.init_rate;
+ cnt++;
+ }
+ /* TCP_RACK_FORCE_MSEG */
+ if (dest->rc_force_max_seg != src->rc_force_max_seg) {
+ dest->rc_force_max_seg = src->rc_force_max_seg;
+ cnt++;
+ }
+ /* TCP_RACK_PACE_MIN_SEG */
+ if (dest->r_ctl.rc_user_set_min_segs != src->r_ctl.rc_user_set_min_segs) {
+ dest->r_ctl.rc_user_set_min_segs = src->r_ctl.rc_user_set_min_segs;
+ cnt++;
+ }
+ /* we don't allow TCP_RACK_PACE_MAX_SEG */
+ /* TCP_RACK_PACE_RATE_REC, TCP_RACK_PACE_RATE_SS, TCP_RACK_PACE_RATE_CA */
+ if (dest->r_ctl.rc_fixed_pacing_rate_ca != src->r_ctl.rc_fixed_pacing_rate_ca) {
+ dest->r_ctl.rc_fixed_pacing_rate_ca = src->r_ctl.rc_fixed_pacing_rate_ca;
+ cnt++;
+ }
+ if (dest->r_ctl.rc_fixed_pacing_rate_ss != src->r_ctl.rc_fixed_pacing_rate_ss) {
+ dest->r_ctl.rc_fixed_pacing_rate_ss = src->r_ctl.rc_fixed_pacing_rate_ss;
+ cnt++;
+ }
+ if (dest->r_ctl.rc_fixed_pacing_rate_rec != src->r_ctl.rc_fixed_pacing_rate_rec) {
+ dest->r_ctl.rc_fixed_pacing_rate_rec = src->r_ctl.rc_fixed_pacing_rate_rec;
+ cnt++;
+ }
+ /* TCP_RACK_GP_INCREASE_REC, TCP_RACK_GP_INCREASE_CA, TCP_RACK_GP_INCREASE_SS */
+ if (dest->r_ctl.rack_per_of_gp_rec != src->r_ctl.rack_per_of_gp_rec) {
+ dest->r_ctl.rack_per_of_gp_rec = src->r_ctl.rack_per_of_gp_rec;
+ cnt++;
+ }
+ if (dest->r_ctl.rack_per_of_gp_ca != src->r_ctl.rack_per_of_gp_ca) {
+ dest->r_ctl.rack_per_of_gp_ca = src->r_ctl.rack_per_of_gp_ca;
+ cnt++;
+ }
+
+ if (dest->r_ctl.rack_per_of_gp_ss != src->r_ctl.rack_per_of_gp_ss) {
+ dest->r_ctl.rack_per_of_gp_ss = src->r_ctl.rack_per_of_gp_ss;
+ cnt++;
+ }
+ /* TCP_RACK_RR_CONF */
+ if (dest->r_rr_config != src->r_rr_config) {
+ dest->r_rr_config = src->r_rr_config;
+ cnt++;
+ }
+ /* TCP_PACING_DND */
+ if (dest->rc_pace_dnd != src->rc_pace_dnd) {
+ dest->rc_pace_dnd = src->rc_pace_dnd;
+ cnt++;
+ }
+ /* TCP_HDWR_RATE_CAP */
+ if (dest->r_rack_hw_rate_caps != src->r_rack_hw_rate_caps) {
+ dest->r_rack_hw_rate_caps = src->r_rack_hw_rate_caps;
+ cnt++;
+ }
+ /* TCP_DGP_UPPER_BOUNDS */
+ if (dest->r_ctl.rack_per_upper_bound_ca != src->r_ctl.rack_per_upper_bound_ca) {
+ dest->r_ctl.rack_per_upper_bound_ca = src->r_ctl.rack_per_upper_bound_ca;
+ cnt++;
+ }
+ if (dest->r_ctl.rack_per_upper_bound_ss != src->r_ctl.rack_per_upper_bound_ss) {
+ dest->r_ctl.rack_per_upper_bound_ss = src->r_ctl.rack_per_upper_bound_ss;
+ cnt++;
+ }
+ /* TCP_SS_EEXIT */
+ if (dest->r_ctl.gp_rnd_thresh != src->r_ctl.gp_rnd_thresh) {
+ dest->r_ctl.gp_rnd_thresh = src->r_ctl.gp_rnd_thresh;
+ cnt++;
+ }
+ if (dest->r_ctl.gate_to_fs != src->r_ctl.gate_to_fs) {
+ dest->r_ctl.gate_to_fs = src->r_ctl.gate_to_fs;
+ cnt++;
+ }
+ if (dest->r_ctl.use_gp_not_last != src->r_ctl.use_gp_not_last) {
+ dest->r_ctl.use_gp_not_last = src->r_ctl.use_gp_not_last;
+ cnt++;
+ }
+ if (dest->r_ctl.gp_gain_req != src->r_ctl.gp_gain_req) {
+ dest->r_ctl.gp_gain_req = src->r_ctl.gp_gain_req;
+ cnt++;
+ }
+ /* TCP_BBR_HDWR_PACE */
+ if (dest->rack_hdw_pace_ena != src->rack_hdw_pace_ena) {
+ dest->rack_hdw_pace_ena = src->rack_hdw_pace_ena;
+ cnt++;
+ }
+ if (dest->rack_attempt_hdwr_pace != src->rack_attempt_hdwr_pace) {
+ dest->rack_attempt_hdwr_pace = src->rack_attempt_hdwr_pace;
+ cnt++;
+ }
+ /* TCP_RACK_PRR_SENDALOT */
+ if (dest->r_ctl.rc_prr_sendalot != src->r_ctl.rc_prr_sendalot) {
+ dest->r_ctl.rc_prr_sendalot = src->r_ctl.rc_prr_sendalot;
+ cnt++;
+ }
+ /* TCP_RACK_MIN_TO */
+ if (dest->r_ctl.rc_min_to != src->r_ctl.rc_min_to) {
+ dest->r_ctl.rc_min_to = src->r_ctl.rc_min_to;
+ cnt++;
+ }
+ /* TCP_RACK_EARLY_SEG */
+ if (dest->r_ctl.rc_early_recovery_segs != src->r_ctl.rc_early_recovery_segs) {
+ dest->r_ctl.rc_early_recovery_segs = src->r_ctl.rc_early_recovery_segs;
+ cnt++;
+ }
+ /* TCP_RACK_ENABLE_HYSTART */
+ if (par->t_ccv.flags != tp->t_ccv.flags) {
+ cnt++;
+ if (par->t_ccv.flags & CCF_HYSTART_ALLOWED) {
+ tp->t_ccv.flags |= CCF_HYSTART_ALLOWED;
+ if (rack_do_hystart > RACK_HYSTART_ON)
+ tp->t_ccv.flags |= CCF_HYSTART_CAN_SH_CWND;
+ if (rack_do_hystart > RACK_HYSTART_ON_W_SC)
+ tp->t_ccv.flags |= CCF_HYSTART_CONS_SSTH;
+ } else {
+ tp->t_ccv.flags &= ~(CCF_HYSTART_ALLOWED|CCF_HYSTART_CAN_SH_CWND|CCF_HYSTART_CONS_SSTH);
+ }
+ }
+ /* TCP_RACK_REORD_THRESH */
+ if (dest->r_ctl.rc_reorder_shift != src->r_ctl.rc_reorder_shift) {
+ dest->r_ctl.rc_reorder_shift = src->r_ctl.rc_reorder_shift;
+ cnt++;
+ }
+ /* TCP_RACK_REORD_FADE */
+ if (dest->r_ctl.rc_reorder_fade != src->r_ctl.rc_reorder_fade) {
+ dest->r_ctl.rc_reorder_fade = src->r_ctl.rc_reorder_fade;
+ cnt++;
+ }
+ /* TCP_RACK_TLP_THRESH */
+ if (dest->r_ctl.rc_tlp_threshold != src->r_ctl.rc_tlp_threshold) {
+ dest->r_ctl.rc_tlp_threshold = src->r_ctl.rc_tlp_threshold;
+ cnt++;
+ }
+ /* TCP_BBR_USE_RACK_RR */
+ if (dest->use_rack_rr != src->use_rack_rr) {
+ dest->use_rack_rr = src->use_rack_rr;
+ cnt++;
+ }
+ /* TCP_RACK_PKT_DELAY */
+ if (dest->r_ctl.rc_pkt_delay != src->r_ctl.rc_pkt_delay) {
+ dest->r_ctl.rc_pkt_delay = src->r_ctl.rc_pkt_delay;
+ cnt++;
+ }
+ /* TCP_DELACK will get copied via the main code if applicable */
+ /* TCP_BBR_RACK_RTT_USE */
+ if (dest->r_ctl.rc_rate_sample_method != src->r_ctl.rc_rate_sample_method) {
+ dest->r_ctl.rc_rate_sample_method = src->r_ctl.rc_rate_sample_method;
+ cnt++;
+ }
+ /* TCP_HONOR_HPTS_MIN */
+ if (dest->r_use_hpts_min != src->r_use_hpts_min) {
+ dest->r_use_hpts_min = src->r_use_hpts_min;
+ cnt++;
+ }
+ if (dest->r_ctl.max_reduction != src->r_ctl.max_reduction) {
+ dest->r_ctl.max_reduction = src->r_ctl.max_reduction;
+ cnt++;
+ }
+ /* TCP_REC_IS_DYN */
+ if (dest->rc_gp_no_rec_chg != src->rc_gp_no_rec_chg) {
+ dest->rc_gp_no_rec_chg = src->rc_gp_no_rec_chg;
+ cnt++;
+ }
+ if (dest->rc_skip_timely != src->rc_skip_timely) {
+ dest->rc_skip_timely = src->rc_skip_timely;
+ cnt++;
+ }
+ /* TCP_DATA_AFTER_CLOSE */
+ if (dest->rc_allow_data_af_clo != src->rc_allow_data_af_clo) {
+ dest->rc_allow_data_af_clo = src->rc_allow_data_af_clo;
+ cnt++;
+ }
+ /* TCP_GP_USE_LTBW */
+ if (src->use_lesser_lt_bw != dest->use_lesser_lt_bw) {
+ dest->use_lesser_lt_bw = src->use_lesser_lt_bw;
+ cnt++;
+ }
+ if (dest->dis_lt_bw != src->dis_lt_bw) {
+ dest->dis_lt_bw = src->dis_lt_bw;
+ cnt++;
+ }
+ tcp_log_socket_option(tp, 0, cnt, 0);
+}
+
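
As an illustration of how the new hook is meant to be driven (a sketch under assumptions, not part of this patch; only the tfb_inherit member and rack_inherit()'s signature are taken from it), a stack-agnostic caller would invoke the hook once the child tcpcb exists, passing the listener's inpcb as the parent:

static void
example_inherit_from_listener(struct tcpcb *child, struct inpcb *listener)
{
	if (child->t_fb->tfb_inherit != NULL)
		child->t_fb->tfb_inherit(child, listener);
}
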
static void
rack_apply_deferred_options(struct tcp_rack *rack)
@@ -23778,7 +25919,10 @@
.tfb_switch_failed = rack_switch_failed,
.tfb_early_wake_check = rack_wake_check,
.tfb_compute_pipe = rack_compute_pipe,
+ .tfb_stack_info = rack_stack_information,
+ .tfb_inherit = rack_inherit,
.tfb_flags = TCP_FUNC_OUTPUT_CANDROP,
+
};
/*
@@ -23846,7 +25990,6 @@
/* Already read in and sanity checked in sosetopt(). */
if (inp->inp_socket) {
rack->client_bufferlvl = inp->inp_socket->so_peerprio;
- rack_client_buffer_level_set(rack);
}
break;
}
@@ -23859,7 +26002,6 @@
/* Pacing related ones */
case TCP_RACK_PACE_ALWAYS: /* URL:pace_always */
case TCP_BBR_RACK_INIT_RATE: /* URL:irate */
- case TCP_BBR_IWINTSO: /* URL:tso_iwin */
case TCP_RACK_PACE_MIN_SEG: /* URL:pace_min_seg */
case TCP_RACK_PACE_MAX_SEG: /* URL:pace_max_seg */
case TCP_RACK_FORCE_MSEG: /* URL:force_max_seg */
@@ -23874,12 +26016,12 @@
case TCP_HDWR_RATE_CAP: /* URL:hdwrcap boolean */
case TCP_PACING_RATE_CAP: /* URL:cap -- used by side-channel */
case TCP_HDWR_UP_ONLY: /* URL:uponly -- hardware pacing boolean */
- case TCP_RACK_PACING_BETA: /* URL:pacing_beta */
+ case TCP_FILLCW_RATE_CAP: /* URL:fillcw_cap */
case TCP_RACK_PACING_BETA_ECN: /* URL:pacing_beta_ecn */
case TCP_RACK_PACE_TO_FILL: /* URL:fillcw */
- case TCP_RACK_DGP_IN_REC: /* URL:dgpinrec */
/* End pacing related */
- case TCP_RXT_CLAMP: /* URL:rxtclamp */
+ case TCP_POLICER_DETECT: /* URL:pol_det */
+ case TCP_POLICER_MSS: /* URL:pol_mss */
case TCP_DELACK: /* URL:delack (in base TCP i.e. tcp_hints along with cc etc ) */
case TCP_RACK_PRR_SENDALOT: /* URL:prr_sendalot */
case TCP_RACK_MIN_TO: /* URL:min_to */
@@ -23901,7 +26043,8 @@
case TCP_RACK_NO_PUSH_AT_MAX: /* URL:npush */
case TCP_SHARED_CWND_TIME_LIMIT: /* URL:lscwnd */
case TCP_RACK_PROFILE: /* URL:profile */
- case TCP_HYBRID_PACING: /* URL:hybrid */
+ case TCP_SIDECHAN_DIS: /* URL:scodm */
+ case TCP_HYBRID_PACING: /* URL:pacing=hybrid */
case TCP_USE_CMP_ACKS: /* URL:cmpack */
case TCP_RACK_ABC_VAL: /* URL:labc */
case TCP_REC_ABC_VAL: /* URL:reclabc */
@@ -23913,8 +26056,15 @@
case TCP_RACK_SET_RXT_OPTIONS: /* URL:rxtsz */
case TCP_RACK_HI_BETA: /* URL:hibeta */
case TCP_RACK_SPLIT_LIMIT: /* URL:split */
+ case TCP_SS_EEXIT: /* URL:eexit */
+ case TCP_DGP_UPPER_BOUNDS: /* URL:upper */
case TCP_RACK_PACING_DIVISOR: /* URL:divisor */
case TCP_PACING_DND: /* URL:dnd */
+ case TCP_NO_TIMELY: /* URL:notimely */
+ case RACK_CSPR_IS_FCC: /* URL:csprisfcc */
+ case TCP_HONOR_HPTS_MIN: /* URL:hptsmin */
+ case TCP_REC_IS_DYN: /* URL:dynrec */
+ case TCP_GP_USE_LTBW: /* URL:useltbw */
goto process_opt;
break;
default:
@@ -23922,14 +26072,14 @@
return (tcp_default_ctloutput(tp, sopt));
break;
}
-
default:
INP_WUNLOCK(inp);
return (0);
}
process_opt:
INP_WUNLOCK(inp);
- if (sopt->sopt_name == TCP_PACING_RATE_CAP) {
+ if ((sopt->sopt_name == TCP_PACING_RATE_CAP) ||
+ (sopt->sopt_name == TCP_FILLCW_RATE_CAP)) {
error = sooptcopyin(sopt, &loptval, sizeof(loptval), sizeof(loptval));
/*
* We truncate it down to 32 bits for the socket-option trace this
@@ -23953,11 +26103,10 @@
if (rack->defer_options && (rack->gp_ready == 0) &&
(sopt->sopt_name != TCP_DEFER_OPTIONS) &&
(sopt->sopt_name != TCP_HYBRID_PACING) &&
- (sopt->sopt_name != TCP_RACK_PACING_BETA) &&
(sopt->sopt_name != TCP_RACK_SET_RXT_OPTIONS) &&
(sopt->sopt_name != TCP_RACK_PACING_BETA_ECN) &&
(sopt->sopt_name != TCP_RACK_MEASURE_CNT)) {
- /* Options are beind deferred */
+ /* Options are being deferred */
if (rack_add_deferred_option(rack, sopt->sopt_name, loptval)) {
INP_WUNLOCK(inp);
return (0);
@@ -24016,6 +26165,7 @@
ti->tcpi_snd_zerowin = tp->t_sndzerowin;
ti->tcpi_total_tlp = tp->t_sndtlppack;
ti->tcpi_total_tlp_bytes = tp->t_sndtlpbyte;
+ ti->tcpi_rttmin = tp->t_rttlow;
#ifdef NETFLIX_STATS
memcpy(&ti->tcpi_rxsyninfo, &tp->t_rxsyninfo, sizeof(struct tcpsyninfo));
#endif
@@ -24062,21 +26212,6 @@
* when you exit recovery.
*/
case TCP_RACK_PACING_BETA:
- if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0)
- error = EINVAL;
- else if (rack->rc_pacing_cc_set == 0)
- optval = rack->r_ctl.rc_saved_beta.beta;
- else {
- /*
- * Reach out into the CC data and report back what
- * I have previously set. Yeah it looks hackish but
- * we don't want to report the saved values.
- */
- if (tp->t_ccv.cc_data)
- optval = ((struct newreno *)tp->t_ccv.cc_data)->beta;
- else
- error = EINVAL;
- }
break;
/*
* Beta_ecn is the congestion control value for NewReno that influences how
@@ -24112,7 +26247,7 @@
optval |= 2;
}
break;
- case TCP_RACK_ENABLE_HYSTART:
+ case TCP_RACK_ENABLE_HYSTART:
{
if (tp->t_ccv.flags & CCF_HYSTART_ALLOWED) {
optval = RACK_HYSTART_ON;
@@ -24126,13 +26261,16 @@
}
break;
case TCP_RACK_DGP_IN_REC:
- optval = rack->r_ctl.full_dgp_in_rec;
+ error = EINVAL;
break;
case TCP_RACK_HI_BETA:
optval = rack->rack_hibeta;
break;
- case TCP_RXT_CLAMP:
- optval = rack->r_ctl.saved_rxt_clamp_val;
+ case TCP_POLICER_MSS:
+ optval = rack->r_ctl.policer_del_mss;
+ break;
+ case TCP_POLICER_DETECT:
+ optval = rack->r_ctl.saved_policer_val;
break;
case TCP_DEFER_OPTIONS:
optval = rack->defer_options;
@@ -24149,6 +26287,9 @@
case TCP_HDWR_UP_ONLY:
optval= rack->r_up_only;
break;
+ case TCP_FILLCW_RATE_CAP:
+ loptval = rack->r_ctl.fillcw_cap;
+ break;
case TCP_PACING_RATE_CAP:
loptval = rack->r_ctl.bw_rate_cap;
break;
@@ -24156,6 +26297,9 @@
/* You cannot retrieve a profile, its write only */
error = EINVAL;
break;
+ case TCP_SIDECHAN_DIS:
+ optval = rack->r_ctl.side_chan_dis_mask;
+ break;
case TCP_HYBRID_PACING:
/* You cannot retrieve hybrid pacing information, its write only */
error = EINVAL;
@@ -24165,8 +26309,6 @@
break;
case TCP_RACK_PACE_TO_FILL:
optval = rack->rc_pace_to_cwnd;
- if (optval && rack->r_fill_less_agg)
- optval++;
break;
case TCP_RACK_NO_PUSH_AT_MAX:
optval = rack->r_ctl.rc_no_push_at_mrtt;
@@ -24185,6 +26327,18 @@
else
optval = 0;
break;
+ case TCP_GP_USE_LTBW:
+ if (rack->dis_lt_bw) {
+ /* It is not used */
+ optval = 0;
+ } else if (rack->use_lesser_lt_bw) {
+ /* we use min() */
+ optval = 1;
+ } else {
+ /* we use max() */
+ optval = 2;
+ }
+ break;
case TCP_RACK_DO_DETECTION:
optval = rack->do_detection;
break;
@@ -24192,11 +26346,14 @@
/* Now do we use the LRO mbuf-queue feature */
optval = rack->r_mbuf_queue;
break;
+ case RACK_CSPR_IS_FCC:
+ optval = rack->cspr_is_fcc;
+ break;
case TCP_TIMELY_DYN_ADJ:
optval = rack->rc_gp_dyn_mul;
break;
case TCP_BBR_IWINTSO:
- optval = rack->rc_init_win;
+ error = EINVAL;
break;
case TCP_RACK_TLP_REDUCE:
/* RACK TLP cwnd reduction (bool) */
@@ -24242,6 +26399,18 @@
/* RACK reorder threshold (shift amount) */
optval = rack->r_ctl.rc_reorder_shift;
break;
+ case TCP_SS_EEXIT:
+ if (rack->r_ctl.gp_rnd_thresh) {
+ uint32_t v;
+
+ v = rack->r_ctl.gp_gain_req;
+ v <<= 17;
+ optval = v | (rack->r_ctl.gp_rnd_thresh & 0xff);
+ if (rack->r_ctl.gate_to_fs == 1)
+ optval |= 0x10000;
+ } else
+ optval = 0;
+ break;
case TCP_RACK_REORD_FADE:
/* Does reordering fade after ms time */
optval = rack->r_ctl.rc_reorder_fade;
@@ -24282,6 +26451,11 @@
case TCP_RACK_PACE_RATE_REC:
optval = rack->r_ctl.rc_fixed_pacing_rate_rec;
break;
+ case TCP_DGP_UPPER_BOUNDS:
+ optval = rack->r_ctl.rack_per_upper_bound_ss;
+ optval <<= 16;
+ optval |= rack->r_ctl.rack_per_upper_bound_ca;
+ break;
case TCP_RACK_GP_INCREASE_SS:
optval = rack->r_ctl.rack_per_of_gp_ca;
break;
@@ -24303,6 +26477,18 @@
case TCP_SHARED_CWND_TIME_LIMIT:
optval = rack->r_limit_scw;
break;
+ case TCP_HONOR_HPTS_MIN:
+ if (rack->r_use_hpts_min)
+ optval = rack->r_ctl.max_reduction;
+ else
+ optval = 0;
+ break;
+ case TCP_REC_IS_DYN:
+ optval = rack->rc_gp_no_rec_chg;
+ break;
+ case TCP_NO_TIMELY:
+ optval = rack->rc_skip_timely;
+ break;
case TCP_RACK_TIMER_SLOP:
optval = rack->r_ctl.timer_slop;
break;
@@ -24312,7 +26498,8 @@
}
INP_WUNLOCK(inp);
if (error == 0) {
- if (TCP_PACING_RATE_CAP)
+ if ((sopt->sopt_name == TCP_PACING_RATE_CAP) ||
+ (sopt->sopt_name == TCP_FILLCW_RATE_CAP))
error = sooptcopyout(sopt, &loptval, sizeof loptval);
else
error = sooptcopyout(sopt, &optval, sizeof optval);
diff --git a/sys/netinet/tcp_stacks/rack_pcm.c b/sys/netinet/tcp_stacks/rack_pcm.c
new file mode 100644
diff --git a/sys/netinet/tcp_stacks/sack_filter.h b/sys/netinet/tcp_stacks/sack_filter.h
--- a/sys/netinet/tcp_stacks/sack_filter.h
+++ b/sys/netinet/tcp_stacks/sack_filter.h
@@ -51,5 +51,10 @@
int sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks,
tcp_seq th_ack);
void sack_filter_reject(struct sack_filter *sf, struct sackblk *in);
+static inline uint8_t sack_filter_blks_used(struct sack_filter *sf)
+{
+ return (sf->sf_used);
+}
+
#endif
#endif
diff --git a/sys/netinet/tcp_stacks/tailq_hash.h b/sys/netinet/tcp_stacks/tailq_hash.h
--- a/sys/netinet/tcp_stacks/tailq_hash.h
+++ b/sys/netinet/tcp_stacks/tailq_hash.h
@@ -13,10 +13,12 @@
#define MAX_ALLOWED_SEQ_RANGE (SEQ_BUCKET_SIZE * (MAX_HASH_ENTRIES-1))
struct tailq_hash {
- struct rack_head ht[MAX_HASH_ENTRIES];
uint32_t min;
uint32_t max;
uint32_t count;
+ struct rack_sendmap *rsm_min;
+ struct rack_sendmap *rsm_max;
+ struct rack_head ht[MAX_HASH_ENTRIES];
};
struct rack_sendmap *
@@ -53,6 +55,10 @@
int
tqhash_trim(struct tailq_hash *hs, uint32_t th_ack);
+void
+tqhash_update_end(struct tailq_hash *hs, struct rack_sendmap *rsm,
+ uint32_t th_ack);
+
#define TQHASH_FOREACH(var, head) \
for ((var) = tqhash_min((head)); \
diff --git a/sys/netinet/tcp_stacks/tailq_hash.c b/sys/netinet/tcp_stacks/tailq_hash.c
--- a/sys/netinet/tcp_stacks/tailq_hash.c
+++ b/sys/netinet/tcp_stacks/tailq_hash.c
@@ -65,7 +65,6 @@
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_hpts.h>
-#include <netinet/tcp_ratelimit.h>
#include <netinet/tcp_accounting.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
@@ -100,6 +99,7 @@
#include "sack_filter.h"
#include "tcp_rack.h"
#include "tailq_hash.h"
+#include "opt_global.h"
struct rack_sendmap *
@@ -107,7 +107,7 @@
{
struct rack_sendmap *rsm;
- rsm = tqhash_find(hs, hs->min);
+ rsm = hs->rsm_min;
return(rsm);
}
@@ -116,7 +116,7 @@
{
struct rack_sendmap *rsm;
- rsm = tqhash_find(hs, (hs->max - 1));
+ rsm = hs->rsm_max;
return (rsm);
}
@@ -224,13 +224,19 @@
void
tqhash_remove(struct tailq_hash *hs, struct rack_sendmap *rsm, int type)
{
- TAILQ_REMOVE(&hs->ht[rsm->bindex], rsm, next);
+
hs->count--;
if (hs->count == 0) {
hs->min = hs->max;
+ hs->rsm_max = hs->rsm_min = NULL;
} else if (type == REMOVE_TYPE_CUMACK) {
hs->min = rsm->r_end;
+ hs->rsm_min = tqhash_next(hs, rsm);
+ } else if (rsm == hs->rsm_max) {
+ hs->rsm_max = tqhash_prev(hs, rsm);
+ hs->max = hs->rsm_max->r_end;
}
+ TAILQ_REMOVE(&hs->ht[rsm->bindex], rsm, next);
}
int
@@ -240,6 +246,7 @@
int inserted = 0;
uint32_t ebucket;
+#ifdef INVARIANTS
if (hs->count > 0) {
if ((rsm->r_end - hs->min) > MAX_ALLOWED_SEQ_RANGE) {
return (-1);
@@ -249,6 +256,7 @@
return (-2);
}
}
+#endif
rsm->bindex = rsm->r_start / SEQ_BUCKET_SIZE;
rsm->bindex %= MAX_HASH_ENTRIES;
ebucket = rsm->r_end / SEQ_BUCKET_SIZE;
@@ -263,13 +271,17 @@
/* Special case */
hs->min = rsm->r_start;
hs->max = rsm->r_end;
+ hs->rsm_min = hs->rsm_max = rsm;
hs->count = 1;
} else {
hs->count++;
- if (SEQ_GT(rsm->r_end, hs->max))
+ if (SEQ_GEQ(rsm->r_end, hs->max)) {
hs->max = rsm->r_end;
- if (SEQ_LT(rsm->r_start, hs->min))
+ hs->rsm_max = rsm;
+ } if (SEQ_LEQ(rsm->r_start, hs->min)) {
hs->min = rsm->r_start;
+ hs->rsm_min = rsm;
+ }
}
/* Check the common case of inserting at the end */
l = TAILQ_LAST(&hs->ht[rsm->bindex], rack_head);
@@ -299,6 +311,7 @@
TAILQ_INIT(&hs->ht[i]);
}
hs->min = hs->max = 0;
+ hs->rsm_min = hs->rsm_max = NULL;
hs->count = 0;
}
@@ -339,3 +352,11 @@
return (0);
}
+void
+tqhash_update_end(struct tailq_hash *hs, struct rack_sendmap *rsm,
+ uint32_t th_ack)
+{
+ if (hs->max == rsm->r_end)
+ hs->max = th_ack;
+ rsm->r_end = th_ack;
+}
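
Editor's note (illustrative, not part of the patch): the changes above cache the
minimum and maximum sendmap entries in rsm_min/rsm_max so that tqhash_min() and
tqhash_max() become plain pointer loads, with tqhash_insert(), tqhash_remove()
and tqhash_update_end() keeping the cached pointers in step with hs->min and
hs->max. A hypothetical INVARIANTS-style check of that bookkeeping, assuming it
sits in tailq_hash.c next to the functions above (so the kernel headers are
already in scope), might look like:

static void
tqhash_verify_endpoints(struct tailq_hash *hs)
{
	struct rack_sendmap *rsm;

	if (hs->count == 0) {
		/* An empty hash must not point at stale entries. */
		KASSERT((hs->rsm_min == NULL) && (hs->rsm_max == NULL),
		    ("cached endpoints set on an empty tailq_hash"));
		return;
	}
	/* The cached minimum entry must still cover sequences at hs->min. */
	rsm = tqhash_min(hs);
	KASSERT((rsm != NULL) && SEQ_GEQ(rsm->r_end, hs->min),
	    ("cached min entry ends before hs->min"));
	/* The cached maximum entry must end exactly at hs->max. */
	rsm = tqhash_max(hs);
	KASSERT((rsm != NULL) && (rsm->r_end == hs->max),
	    ("cached max entry does not end at hs->max"));
}
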
diff --git a/sys/netinet/tcp_stacks/tcp_rack.h b/sys/netinet/tcp_stacks/tcp_rack.h
--- a/sys/netinet/tcp_stacks/tcp_rack.h
+++ b/sys/netinet/tcp_stacks/tcp_rack.h
@@ -48,6 +48,8 @@
#define RACK_MERGED 0x080000/* The RSM was merged */
#define RACK_PMTU_CHG 0x100000/* The path mtu changed on this guy */
#define RACK_STRADDLE 0x200000/* The seq straddles the bucket line */
+#define RACK_WAS_LOST 0x400000/* Is the rsm considered lost */
+#define RACK_IS_PCM 0x800000/* A PCM measurement is being taken */
#define RACK_NUM_OF_RETRANS 3
#define RACK_INITIAL_RTO 1000000 /* 1 second in microseconds */
@@ -63,6 +65,7 @@
uint32_t r_rtr_bytes; /* How many bytes have been retransmitted */
uint32_t r_flags : 24, /* Flags as defined above */
r_rtr_cnt : 8; /* Retran count, index this -1 to get time */
+ uint32_t r_act_rxt_cnt; /* The actual total count of transmits */
struct mbuf *m;
uint32_t soff;
uint32_t orig_m_len; /* The original mbuf len when we sent (can update) */
@@ -174,6 +177,8 @@
#define RACK_TO_FRM_PERSIST 5
#define RACK_TO_FRM_DELACK 6
+#define RCV_PATH_RTT_MS 10 /* How many ms between recv path RTT's */
+
struct rack_opts_stats {
uint64_t tcp_rack_tlp_reduce;
uint64_t tcp_rack_pace_always;
@@ -232,7 +237,7 @@
uint64_t tcp_rack_rtt_use;
uint64_t tcp_data_after_close;
uint64_t tcp_defer_opt;
- uint64_t tcp_rxt_clamp;
+ uint64_t tcp_pol_detect;
uint64_t tcp_rack_beta;
uint64_t tcp_rack_beta_ecn;
uint64_t tcp_rack_timer_slop;
@@ -242,6 +247,11 @@
uint64_t tcp_rack_pacing_divisor;
uint64_t tcp_rack_min_seg;
uint64_t tcp_dgp_in_rec;
+ uint64_t tcp_notimely;
+ uint64_t tcp_honor_hpts;
+ uint64_t tcp_dyn_rec;
+ uint64_t tcp_fillcw_rate_cap;
+ uint64_t tcp_pol_mss;
};
/* RTT shrink reasons */
@@ -263,6 +273,9 @@
#define TLP_USE_TWO_TWO 3 /* Use 2.2 behavior */
#define RACK_MIN_BW 8000 /* 64kbps in Bps */
+#define CCSP_DIS_MASK 0x0001
+#define HYBRID_DIS_MASK 0x0002
+
/* Rack quality indicators for GPUT measurements */
#define RACK_QUALITY_NONE 0 /* No quality stated */
#define RACK_QUALITY_HIGH 1 /* A normal measurement of a GP RTT */
@@ -319,6 +332,7 @@
*
*/
#define RACK_GP_HIST 4 /* How much goodput history do we maintain? */
+#define RETRAN_CNT_SIZE 16
#define RACK_NUM_FSB_DEBUG 16
#ifdef _KERNEL
@@ -342,6 +356,26 @@
struct tailq_hash;
+struct rack_pcm_info {
+ /* Base send time and s/e filled in by rack_log_output */
+ uint64_t send_time;
+ uint32_t sseq;
+ uint32_t eseq;
+ /* Ack's fill in the rest of the data */
+ uint16_t cnt;
+ /* Maximum acks present */
+ uint16_t cnt_alloc;
+};
+
+#define RACK_DEFAULT_PCM_ARRAY 16
+
+struct rack_pcm_stats {
+ uint32_t sseq;
+ uint32_t eseq;
+ uint64_t ack_time;
+};
+
+
struct rack_control {
/* Second cache line 0x40 from tcp_rack */
struct tailq_hash *tqh; /* Tree of all segments Lock(a) */
@@ -402,6 +436,7 @@
uint32_t rc_rcvtime; /* When we last received data */
uint32_t rc_num_split_allocs; /* num split map entries allocated */
uint32_t rc_split_limit; /* Limit from control var can be set by socket opt */
+ uint32_t rack_avg_rec_sends;
uint32_t rc_last_output_to;
uint32_t rc_went_idle_time;
@@ -452,19 +487,45 @@
struct tcp_sendfile_track *rc_last_sft;
uint32_t lt_seq; /* Seq at start of lt_bw gauge */
int32_t rc_rtt_diff; /* Timely style rtt diff of our gp_srtt */
- uint64_t last_sndbytes;
- uint64_t last_snd_rxt_bytes;
- uint64_t rxt_threshold;
uint64_t last_tmit_time_acked; /* Holds the last cumack point's last send time */
- uint32_t last_rnd_rxt_clamped;
- uint32_t num_of_clamps_applied;
- uint32_t clamp_options;
- uint32_t max_clamps;
+ /* Recovery stats */
+ uint64_t time_entered_recovery;
+ uint64_t bytes_acked_in_recovery;
+ /* Policer Detection */
+ uint64_t last_policer_sndbytes;
+ uint64_t last_policer_snd_rxt_bytes;
+ uint64_t policer_bw;
+ uint64_t last_sendtime;
+
+ uint64_t last_gpest;
+ uint64_t last_tm_mark; /* Last tm mark used */
+ uint64_t fillcw_cap; /* B/W cap on fill cw */
+ struct rack_pcm_info pcm_i;
+ struct rack_pcm_stats *pcm_s;
+ uint32_t gp_gain_req; /* Percent off gp gain req */
+ uint32_t last_rnd_of_gp_rise;
+ uint32_t gp_rnd_thresh;
+ uint32_t ss_hi_fs;
+ uint32_t gate_to_fs;
+ uint32_t policer_max_seg;
+ uint32_t pol_bw_comp;
+ uint16_t policer_rxt_threshold;
+ uint8_t policer_avg_threshold;
+ uint8_t policer_med_threshold;
+ uint32_t pcm_max_seg;
+ uint32_t last_pcm_round;
+ uint32_t pcm_idle_rounds;
+ uint32_t current_policer_bucket;
+ uint32_t policer_bucket_size;
+ uint32_t idle_snd_una;
+ uint32_t ack_for_idle;
+ uint32_t last_amount_before_rec;
uint32_t rc_gp_srtt; /* Current GP srtt */
uint32_t rc_prev_gp_srtt; /* Previous RTT */
uint32_t rc_entry_gp_rtt; /* Entry to PRTT gp-rtt */
uint32_t rc_loss_at_start; /* At measurement window where was our lost value */
+ uint32_t rc_considered_lost; /* Count in recovery of non-retransmitted bytes considered lost */
uint32_t dsack_round_end; /* In a round of seeing a DSACK */
uint32_t current_round; /* Starting at zero */
@@ -491,6 +552,8 @@
uint32_t rc_snd_max_at_rto; /* For non-sack when the RTO occurred what was snd-max */
uint32_t rc_out_at_rto;
int32_t rc_scw_index;
+ uint32_t max_reduction;
+ uint32_t side_chan_dis_mask; /* Bit mask of socket opt's disabled */
uint32_t rc_tlp_threshold; /* Socket option value Lock(a) */
uint32_t rc_last_timeout_snduna;
uint32_t last_tlp_acked_start;
@@ -503,7 +566,11 @@
uint32_t ack_during_sd;
uint32_t input_pkt;
uint32_t saved_input_pkt;
- uint32_t saved_rxt_clamp_val; /* The encoded value we used to setup clamping */
+ uint32_t saved_policer_val; /* The encoded value we used to setup policer detection */
+ uint32_t cleared_app_ack_seq;
+ uint32_t last_rcv_tstmp_for_rtt;
+ uint32_t last_time_of_arm_rcv;
+ uint32_t rto_ssthresh;
struct newreno rc_saved_beta; /*
* For newreno cc:
* rc_saved_cc are the values we have had
@@ -516,10 +583,13 @@
* we also set the flag (if ecn_beta is set) to make
* new_reno do less of a backoff for ecn (think abe).
*/
+ uint16_t rc_cnt_of_retran[RETRAN_CNT_SIZE];
uint16_t rc_early_recovery_segs; /* Socket option value Lock(a) */
uint16_t rc_reorder_shift; /* Socket option value Lock(a) */
+ uint8_t policer_del_mss; /* How many mss during recovery for policer detection */
uint8_t rack_per_upper_bound_ss;
uint8_t rack_per_upper_bound_ca;
+ uint8_t cleared_app_ack;
uint8_t dsack_persist;
uint8_t rc_no_push_at_mrtt; /* No push when we exceed max rtt */
uint8_t num_measurements; /* Number of measurements (up to 0xff, we freeze at 0xff) */
@@ -528,17 +598,19 @@
uint8_t rc_tlp_cwnd_reduce; /* Socket option value Lock(a) */
uint8_t rc_prr_sendalot;/* Socket option value Lock(a) */
uint8_t rc_rate_sample_method;
- uint8_t rc_dgp_bl_agg; /* Buffer Level aggression during DGP */
+ uint8_t policer_alt_median; /* Alternate median for policer detection */
uint8_t full_dgp_in_rec; /* Flag to say if we do full DGP in recovery */
uint8_t client_suggested_maxseg; /* Not sure what to do with this yet */
- uint8_t pacing_discount_amm; /*
- * This is a multipler to the base discount that
- * can be used to increase the discount.
- */
+ uint8_t use_gp_not_last;
+ uint8_t pacing_method; /* If pace_always, what type of pacing */
uint8_t already_had_a_excess;
};
#endif
+#define RACK_PACING_NONE 0x00
+#define RACK_DGP_PACING 0x01
+#define RACK_REG_PACING 0x02
+
/* DGP with no buffer level mitigations */
#define DGP_LEVEL0 0
@@ -578,6 +650,10 @@
#define HYBRID_LOG_EXTEND 14 /* We extended the end */
#define HYBRID_LOG_SENT_LOST 15 /* A closing sent/lost report */
+#define LOST_ZERO 1 /* Zero it out */
+#define LOST_ADD 2 /* Add to it */
+#define LOST_SUB 3 /* Sub from it */
+
#define RACK_TIMELY_CNT_BOOST 5 /* At 5th increase boost */
#define RACK_MINRTT_FILTER_TIM 10 /* Seconds */
@@ -590,6 +666,7 @@
*/
#define MAX_USER_SET_SEG 0x3f /* The max we can set is 63 which is probably too many */
+#define RACK_FREE_CNT_MAX 0x2f /* Max our counter can do */
#ifdef _KERNEL
@@ -601,8 +678,9 @@
int32_t, int32_t, uint32_t, int, int, uint8_t); /* Lock(a) */
struct tcpcb *rc_tp; /* The tcpcb Lock(a) */
struct inpcb *rc_inp; /* The inpcb Lock(a) */
- uint8_t rc_free_cnt; /* Number of free entries on the rc_free list
- * Lock(a) */
+ uint8_t rc_free_cnt : 6,
+ rc_skip_timely : 1,
+ pcm_enabled : 1; /* Is PCM enabled */
uint8_t client_bufferlvl : 3, /* Expected range [0,5]: 0=unset, 1=low/empty */
rack_deferred_inited : 1,
/* ******************************************************************** */
@@ -612,11 +690,11 @@
shape_rxt_to_pacing_min : 1,
/* ******************************************************************** */
rc_ack_required: 1,
- r_pacing_discount : 1;
+ r_use_hpts_min : 1;
uint8_t no_prr_addback : 1,
gp_ready : 1,
defer_options: 1,
- excess_rxt_on: 1, /* Are actions on for excess retransmissions? */
+ dis_lt_bw : 1,
rc_ack_can_sendout_data: 1, /*
* If set it will override pacing restrictions on not sending
* data when the pacing timer is running. I.e. you set this
@@ -659,7 +737,7 @@
r_rack_hw_rate_caps: 1,
r_up_only: 1,
r_via_fill_cw : 1,
- r_fill_less_agg : 1;
+ r_rcvpath_rtt_up : 1;
uint8_t rc_user_set_max_segs : 7, /* Socket option value Lock(a) */
rc_fillcw_apply_discount;
@@ -673,7 +751,7 @@
rc_highly_buffered: 1, /* The path is highly buffered */
rc_dragged_bottom: 1,
rc_pace_dnd : 1, /* The pace do not disturb bit */
- rc_avali2 : 1,
+ rc_initial_ss_comp : 1,
rc_gp_filled : 1,
rc_hw_nobuf : 1;
uint8_t r_state : 4, /* Current rack state Lock(a) */
@@ -696,8 +774,8 @@
uint8_t app_limited_needs_set : 1,
use_fixed_rate : 1,
rc_has_collapsed : 1,
- r_cwnd_was_clamped : 1,
- r_clamped_gets_lower : 1,
+ use_lesser_lt_bw : 1,
+ cspr_is_fcc : 1,
rack_hdrw_pacing : 1, /* We are doing Hardware pacing */
rack_hdw_pace_ena : 1, /* Is hardware pacing enabled? */
rack_attempt_hdwr_pace : 1; /* Did we attempt hdwr pacing (if allowed) */
@@ -722,7 +800,14 @@
r_persist_lt_bw_off : 1,
r_collapse_point_valid : 1,
dgp_on : 1;
- uint16_t rc_init_win : 8,
+ uint16_t rto_from_rec: 1,
+ avail_bit: 1,
+ pcm_in_progress: 1,
+ pcm_needed: 1,
+ policer_detect_on: 1, /* Are we detecting policers? */
+ rc_policer_detected : 1, /* We are being policed */
+ rc_policer_should_pace : 1, /* The sizing algo thinks we should pace */
+ rc_sendvars_notset : 1, /* Inside rack_init send variables (snd_max/una etc) were not set */
rc_gp_rtt_set : 1,
rc_gp_dyn_mul : 1,
rc_gp_saw_rec : 1,
@@ -735,5 +820,9 @@
struct rack_control r_ctl;
} __aligned(CACHE_LINE_SIZE);
+
+void rack_update_pcm_ack(struct tcp_rack *rack, int was_cumack,
+ uint32_t ss, uint32_t es);
+
#endif
#endif
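
Editor's note (illustrative, not part of the patch): rack_pcm_info carries the
send-side bounds of a PCM measurement while pcm_s points at an array of
rack_pcm_stats samples, with cnt tracking how many entries are filled and
cnt_alloc how many were allocated (RACK_DEFAULT_PCM_ARRAY by default). The
stack's real ack-side entry point is rack_update_pcm_ack(), declared above but
implemented elsewhere in the stack; the sketch below only illustrates how the
cnt/cnt_alloc pair is meant to bound the array, assuming it lives in a stack
source file with the usual kernel and tcp_rack.h includes. M_PCMSTATS and
record_pcm_sample() are made-up names.

static MALLOC_DEFINE(M_PCMSTATS, "pcmstats", "example PCM sample array");

static void
record_pcm_sample(struct rack_pcm_info *pi, struct rack_pcm_stats **arr,
    uint32_t sseq, uint32_t eseq, uint64_t ack_time)
{
	struct rack_pcm_stats *s;

	if (*arr == NULL) {
		/* First sample for this measurement: allocate the array. */
		*arr = malloc(sizeof(struct rack_pcm_stats) *
		    RACK_DEFAULT_PCM_ARRAY, M_PCMSTATS, M_NOWAIT | M_ZERO);
		if (*arr == NULL)
			return;
		pi->cnt_alloc = RACK_DEFAULT_PCM_ARRAY;
		pi->cnt = 0;
	}
	if (pi->cnt >= pi->cnt_alloc)
		return;		/* Array full; drop the sample. */
	s = &(*arr)[pi->cnt++];
	s->sseq = sseq;
	s->eseq = eseq;
	s->ack_time = ack_time;
}
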
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -287,18 +287,29 @@
static volatile uint32_t number_of_tcp_connections_pacing = 0;
static uint32_t shadow_num_connections = 0;
static counter_u64_t tcp_pacing_failures;
+static counter_u64_t tcp_dgp_failures;
+static uint32_t shadow_tcp_pacing_dgp = 0;
+static volatile uint32_t number_of_dgp_connections = 0;
static int tcp_pacing_limit = 10000;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pacing_limit, CTLFLAG_RW,
&tcp_pacing_limit, 1000,
"If the TCP stack does pacing, is there a limit (-1 = no, 0 = no pacing N = number of connections)");
+static int tcp_dgp_limit = -1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, dgp_limit, CTLFLAG_RW,
+ &tcp_dgp_limit, -1,
+ "If the TCP stack does DGP, is there a limit (-1 = no, 0 = no dgp N = number of connections)");
+
SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pacing_count, CTLFLAG_RD,
&shadow_num_connections, 0, "Number of TCP connections being paced");
SYSCTL_COUNTER_U64(_net_inet_tcp, OID_AUTO, pacing_failures, CTLFLAG_RD,
&tcp_pacing_failures, "Number of times we failed to enable pacing to avoid exceeding the limit");
+SYSCTL_COUNTER_U64(_net_inet_tcp, OID_AUTO, dgp_failures, CTLFLAG_RD,
+ &tcp_dgp_failures, "Number of times we failed to enable dgp to avoid exceeding the limit");
+
static int tcp_log_debug = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW,
&tcp_log_debug, 0, "Log errors caused by incoming TCP segments");
@@ -1571,6 +1582,7 @@
tcp_uncomp_total = counter_u64_alloc(M_WAITOK);
tcp_bad_csums = counter_u64_alloc(M_WAITOK);
tcp_pacing_failures = counter_u64_alloc(M_WAITOK);
+ tcp_dgp_failures = counter_u64_alloc(M_WAITOK);
#ifdef TCPPCAP
tcp_pcap_init();
#endif
@@ -4022,6 +4034,43 @@
}
}
+int
+tcp_incr_dgp_pacing_cnt(void)
+{
+ if ((tcp_dgp_limit == -1) ||
+ (tcp_dgp_limit > number_of_dgp_connections)) {
+ atomic_fetchadd_int(&number_of_dgp_connections, 1);
+ shadow_tcp_pacing_dgp = number_of_dgp_connections;
+ return (1);
+ } else {
+ counter_u64_add(tcp_dgp_failures, 1);
+ return (0);
+ }
+}
+
+static uint8_t tcp_dgp_warning = 0;
+
+void
+tcp_dec_dgp_pacing_cnt(void)
+{
+ uint32_t ret;
+
+ ret = atomic_fetchadd_int(&number_of_dgp_connections, -1);
+ shadow_tcp_pacing_dgp = number_of_dgp_connections;
+ KASSERT(ret != 0, ("number_of_dgp_connections -1 would cause wrap?"));
+ if (ret == 0) {
+ if (tcp_dgp_limit != -1) {
+ printf("Warning all DGP is now disabled, count decrements invalidly!\n");
+ tcp_dgp_limit = 0;
+ tcp_dgp_warning = 1;
+ } else if (tcp_dgp_warning == 0) {
+ printf("Warning DGP pacing is invalid, invalid decrement\n");
+ tcp_dgp_warning = 1;
+ }
+ }
+
+}
+
static uint8_t tcp_pacing_warning = 0;
void
@@ -4541,7 +4590,7 @@
if (tp->t_tcpreq_req) {
for(i = 0, allocated = 0; i < MAX_TCP_TRK_REQ; i++) {
fil = &tp->t_tcpreq_info[i];
- if (fil->flags != TCP_TRK_TRACK_FLG_USED)
+ if ((fil->flags & TCP_TRK_TRACK_FLG_USED) == 0)
continue;
if ((fil->timestamp == req->timestamp) &&
(fil->start == req->start) &&
@@ -4573,6 +4622,7 @@
allocated = 1;
fil->flags = TCP_TRK_TRACK_FLG_USED;
fil->timestamp = req->timestamp;
+ fil->playout_ms = req->playout_ms;
fil->localtime = ts;
fil->start = req->start;
if (req->flags & TCP_LOG_HTTPD_RANGE_END) {
@@ -4589,7 +4639,10 @@
fil->sbcc_at_s = tptosocket(tp)->so_snd.sb_ccc;
fil->start_seq = tp->snd_una +
tptosocket(tp)->so_snd.sb_ccc;
- fil->end_seq = (fil->start_seq + ((uint32_t)(fil->end - fil->start)));
+ if (req->flags & TCP_LOG_HTTPD_RANGE_END)
+ fil->end_seq = (fil->start_seq + ((uint32_t)(fil->end - fil->start)));
+ else
+ fil->end_seq = 0;
if (tptosocket(tp)->so_snd.sb_tls_info) {
/*
* This session is doing TLS. Take a swag guess
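
Editor's note (illustrative, not part of the patch): tcp_incr_dgp_pacing_cnt()
and tcp_dec_dgp_pacing_cnt() let a stack count DGP users against the new
net.inet.tcp.dgp_limit sysctl, mirroring the existing pacing counter. A
hypothetical caller inside a stack (function names made up; dgp_on is the
per-connection flag from tcp_rack.h above) would bracket enabling and disabling
DGP like this:

static int
example_enable_dgp(struct tcp_rack *rack)
{
	if (tcp_incr_dgp_pacing_cnt() == 0) {
		/* Over tcp_dgp_limit; stay on regular pacing instead. */
		return (ENOBUFS);
	}
	rack->dgp_on = 1;
	return (0);
}

static void
example_disable_dgp(struct tcp_rack *rack)
{
	if (rack->dgp_on) {
		rack->dgp_on = 0;
		tcp_dec_dgp_pacing_cnt();
	}
}
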
diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c
--- a/sys/netinet/tcp_syncache.c
+++ b/sys/netinet/tcp_syncache.c
@@ -1032,7 +1032,10 @@
if (!solisten_enqueue(so, SS_ISCONNECTED))
tp->t_flags |= TF_SONOTCONN;
-
+ /* Can we inherit anything from the listener? */
+ if (tp->t_fb->tfb_inherit != NULL) {
+ (*tp->t_fb->tfb_inherit)(tp, sotoinpcb(lso));
+ }
return (so);
allocfail:
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -179,6 +179,12 @@
goto out;
}
tp->t_state = TCPS_CLOSED;
+ /* Can we inherit anything from the listener? */
+ if ((so->so_listen != NULL) &&
+ (so->so_listen->so_pcb != NULL) &&
+ (tp->t_fb->tfb_inherit != NULL)) {
+ (*tp->t_fb->tfb_inherit)(tp, sotoinpcb(so->so_listen));
+ }
tcp_bblog_pru(tp, PRU_ATTACH, error);
INP_WUNLOCK(inp);
TCPSTATES_INC(TCPS_CLOSED);
@@ -1601,6 +1607,7 @@
ti->tcpi_rcv_numsacks = tp->rcv_numsacks;
ti->tcpi_rcv_adv = tp->rcv_adv;
ti->tcpi_dupacks = tp->t_dupacks;
+ ti->tcpi_rttmin = tp->t_rttlow;
#ifdef TCP_OFFLOAD
if (tp->t_flags & TF_TOE) {
ti->tcpi_options |= TCPI_OPT_TOE;
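
Editor's note (illustrative, not part of the patch): the tcpi_rttmin assignment
above exports tp->t_rttlow through the existing TCP_INFO socket option. A
minimal user-space reader, assuming the patched netinet/tcp.h (the value is
printed raw, in whatever units the stack keeps t_rttlow in; print_min_rtt() is
a made-up name):

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>

static void
print_min_rtt(int fd)
{
	struct tcp_info ti;
	socklen_t len = sizeof(ti);

	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0)
		printf("minimum observed RTT: %u\n", ti.tcpi_rttmin);
}
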
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -138,7 +138,8 @@
#define TCP_TRK_TRACK_FLG_OPEN 0x02 /* End is not valid (open range request) */
#define TCP_TRK_TRACK_FLG_SEQV 0x04 /* We had a sendfile that touched it */
#define TCP_TRK_TRACK_FLG_COMP 0x08 /* Sendfile as placed the last bits (range req only) */
-#define TCP_TRK_TRACK_FLG_FSND 0x10 /* First send has been done into the seq space */
+#define TCP_TRK_TRACK_FLG_FSND 0x10 /* First send has been done into the seq space */
+#define TCP_TRK_TRACK_FLG_LSND 0x20 /* We were able to set the Last Sent */
#define MAX_TCP_TRK_REQ 5 /* Max we will have at once */
struct tcp_sendfile_track {
@@ -151,11 +152,14 @@
uint64_t cspr; /* Client suggested pace rate */
uint64_t sent_at_fs; /* What was t_sndbytes as we begun sending */
uint64_t rxt_at_fs; /* What was t_snd_rxt_bytes as we begun sending */
+ uint64_t sent_at_ls; /* Sent value at the last send */
+ uint64_t rxt_at_ls; /* Retransmit value at the last send */
tcp_seq start_seq; /* First TCP Seq assigned */
tcp_seq end_seq; /* If range req last seq */
uint32_t flags; /* Type of request open etc */
uint32_t sbcc_at_s; /* When we allocate what is the sb_cc */
uint32_t hint_maxseg; /* Client hinted maxseg */
+ uint32_t playout_ms; /* Client playout ms */
uint32_t hybrid_flags; /* Hybrid flags on this request */
};
@@ -623,6 +627,8 @@
void (*tfb_switch_failed)(struct tcpcb *);
bool (*tfb_early_wake_check)(struct tcpcb *);
int (*tfb_compute_pipe)(struct tcpcb *tp);
+ int (*tfb_stack_info)(struct tcpcb *tp, struct stack_specific_info *);
+ void (*tfb_inherit)(struct tcpcb *tp, struct inpcb *h_inp);
volatile uint32_t tfb_refcnt;
uint32_t tfb_flags;
uint8_t tfb_id;
@@ -788,7 +794,7 @@
#define TF_TSO 0x01000000 /* TSO enabled on this connection */
#define TF_TOE 0x02000000 /* this connection is offloaded */
#define TF_CLOSED 0x04000000 /* close(2) called on socket */
-#define TF_UNUSED1 0x08000000 /* unused */
+#define TF_SENTSYN 0x08000000 /* At least one syn has been sent */
#define TF_LRD 0x10000000 /* Lost Retransmission Detection */
#define TF_CONGRECOVERY 0x20000000 /* congestion recovery mode */
#define TF_WASCRECOVERY 0x40000000 /* was in congestion recovery */
@@ -1501,6 +1507,8 @@
int tcp_stats_sample_rollthedice(struct tcpcb *tp, void *seed_bytes,
size_t seed_len);
int tcp_can_enable_pacing(void);
+int tcp_incr_dgp_pacing_cnt(void);
+void tcp_dec_dgp_pacing_cnt(void);
void tcp_decrement_paced_conn(void);
void tcp_change_time_units(struct tcpcb *, int);
void tcp_handle_orphaned_packets(struct tcpcb *);
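
Editor's note (illustrative, not part of the patch): tfb_inherit gives a stack a
hook, invoked from the tcp_syncache.c and tcp_usrreq.c call sites earlier in
this diff, to copy listener settings onto a newly attached connection. The
handler below is purely hypothetical: example_inherit is a made-up name, the
t_maxseg copy is only a placeholder for whatever per-listener state a real
stack would carry over, and the stack would point .tfb_inherit at it in its
tcp_function_block.

static void
example_inherit(struct tcpcb *tp, struct inpcb *h_inp)
{
	struct tcpcb *ltp;

	/* h_inp is the listener's inpcb, as passed by the callers above. */
	ltp = intotcpcb(h_inp);
	/* Copy whatever per-listener defaults the stack supports. */
	tp->t_maxseg = ltp->t_maxseg;	/* placeholder only */
}
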
