D43986.diff

diff --git a/sys/modules/tcp/rack/Makefile b/sys/modules/tcp/rack/Makefile
--- a/sys/modules/tcp/rack/Makefile
+++ b/sys/modules/tcp/rack/Makefile
@@ -5,7 +5,7 @@
STACKNAME= rack
KMOD= tcp_${STACKNAME}
-SRCS= rack.c sack_filter.c rack_bbr_common.c tailq_hash.c
+SRCS= rack.c sack_filter.c rack_bbr_common.c tailq_hash.c rack_pcm.c
SRCS+= opt_inet.h opt_inet6.h opt_ipsec.h
SRCS+= opt_kern_tls.h
diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h
--- a/sys/netinet/tcp.h
+++ b/sys/netinet/tcp.h
@@ -334,9 +334,22 @@
#define TCP_RACK_PACING_DIVISOR 1146 /* Pacing divisor given to rate-limit code for burst sizing */
#define TCP_RACK_PACE_MIN_SEG 1147 /* Pacing min seg size rack will use */
#define TCP_RACK_DGP_IN_REC 1148 /* Do we use full DGP in recovery? */
-#define TCP_RXT_CLAMP 1149 /* Do we apply a threshold to rack so if excess rxt clamp cwnd? */
+#define TCP_POLICER_DETECT 1149 /* Do we apply thresholds to rack to detect and compensate for policers? */
+#define TCP_RXT_CLAMP TCP_POLICER_DETECT
#define TCP_HYBRID_PACING 1150 /* Hybrid pacing enablement */
#define TCP_PACING_DND 1151 /* When pacing with rr_config=3 can sacks disturb us */
+#define TCP_SS_EEXIT 1152 /* Do we do early exit from slowstart if no b/w growth */
+#define TCP_DGP_UPPER_BOUNDS 1153 /* SS and CA upper bound in percentage */
+#define TCP_NO_TIMELY 1154 /* Disable/enable Timely */
+#define TCP_HONOR_HPTS_MIN 1155 /* Do we honor the hpts min timeout */
+#define TCP_REC_IS_DYN 1156 /* Do we allow timely to change recovery multiplier? */
+#define TCP_SIDECHAN_DIS 1157 /* Disable/enable the side-channel */
+#define TCP_FILLCW_RATE_CAP 1158 /* Set a cap for DGP's fillcw */
+#define TCP_POLICER_MSS 1159 /* Policer MSS requirement */
+#define TCP_STACK_SPEC_INFO 1160 /* Get stack specific information (if present) */
+#define RACK_CSPR_IS_FCC 1161
+#define TCP_GP_USE_LTBW 1162 /* how we use lt_bw 0=not, 1=min, 2=max */
+
/* Start of reserved space for third-party user-settable options. */
#define TCP_VENDOR SO_VENDOR
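
The options added above are ordinary IPPROTO_TCP socket options consumed by the rack stack. As a rough, hedged sketch only (the exact values each option accepts are defined by the stack; the 0/1/2 meaning of TCP_GP_USE_LTBW is taken from its comment above), enabling policer detection and selecting how lt_bw is blended could look like this:

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

/*
 * Illustrative sketch, not part of the patch: assumes `fd` is a connected
 * TCP socket already attached to the rack stack (e.g. via TCP_FUNCTION_BLK),
 * and that a non-zero optval enables TCP_POLICER_DETECT.
 */
static int
enable_policer_detect(int fd)
{
        int on = 1;
        int use_ltbw = 2;       /* 0 = not used, 1 = min, 2 = max, per the comment above */

        if (setsockopt(fd, IPPROTO_TCP, TCP_POLICER_DETECT, &on, sizeof(on)) == -1)
                return (-1);
        return (setsockopt(fd, IPPROTO_TCP, TCP_GP_USE_LTBW, &use_ltbw, sizeof(use_ltbw)));
}
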
@@ -447,6 +460,7 @@
u_int32_t tcpi_rcv_adv; /* Peer advertised window */
u_int32_t tcpi_dupacks; /* Consecutive dup ACKs recvd */
+ u_int32_t tcpi_rttmin; /* Min observed RTT */
/* Padding to grow without breaking ABI. */
u_int32_t __tcpi_pad[14]; /* Padding. */
};
@@ -463,6 +477,20 @@
#define TCP_FUNCTION_NAME_LEN_MAX 32
+struct stack_specific_info {
+ char stack_name[TCP_FUNCTION_NAME_LEN_MAX];
+ uint64_t policer_last_bw; /* Only valid if detection enabled and policer detected */
+ uint64_t bytes_transmitted;
+ uint64_t bytes_retransmitted;
+ uint32_t policer_detection_enabled: 1,
+ policer_detected : 1, /* transport thinks a policer is on path */
+ highly_buffered : 1, /* transport considers the path highly buffered */
+ spare : 29;
+ uint32_t policer_bucket_size; /* Only valid if detection enabled and policer detected */
+ uint32_t current_round;
+ uint32_t _rack_i_pad[18];
+};
+
struct tcp_function_set {
char function_set_name[TCP_FUNCTION_NAME_LEN_MAX];
uint32_t pcbcnt;
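
A hedged sketch of consuming the new struct from user space, assuming (as the TCP_STACK_SPEC_INFO comment earlier implies) that the option is read with getsockopt() and that stacks without support simply fail the call:

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Illustrative only; `fd` is assumed to be a TCP socket running a stack
 * that fills in struct stack_specific_info. */
static void
report_policer_state(int fd)
{
        struct stack_specific_info info;
        socklen_t len = sizeof(info);

        memset(&info, 0, sizeof(info));
        if (getsockopt(fd, IPPROTO_TCP, TCP_STACK_SPEC_INFO, &info, &len) != 0)
                return;         /* stack does not provide the info */
        if (info.policer_detection_enabled && info.policer_detected)
                printf("%s: policer ~%ju B/s, bucket %u bytes, round %u\n",
                    info.stack_name, (uintmax_t)info.policer_last_bw,
                    info.policer_bucket_size, info.current_round);
}
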
@@ -488,6 +516,7 @@
uint64_t start;
uint64_t end;
uint32_t flags;
+ uint32_t playout_ms;
};
union tcp_log_userdata {
@@ -518,9 +547,12 @@
#define TCP_HYBRID_PACING_H_MS 0x0008 /* A client hint for maxseg is present */
#define TCP_HYBRID_PACING_ENABLE 0x0010 /* We are enabling hybrid pacing else disable */
#define TCP_HYBRID_PACING_S_MSS 0x0020 /* Clent wants us to set the mss overriding gp est in CU */
-#define TCP_HYBRID_PACING_SETMSS 0x1000 /* Internal flag that tellsus we set the mss on this entry */
+#define TCP_HAS_PLAYOUT_MS 0x0040 /* The client included the chunk playout milliseconds: deprecate */
+/* the below are internal only flags */
+#define TCP_HYBRID_PACING_USER_MASK 0x0FFF /* Non-internal flags mask */
+#define TCP_HYBRID_PACING_SETMSS 0x1000 /* Internal flag that tells us we set the mss on this entry */
#define TCP_HYBRID_PACING_WASSET 0x2000 /* We init to this to know if a hybrid command was issued */
-
+#define TCP_HYBRID_PACING_SENDTIME 0x4000 /* Duplicate tm to last, use sendtime for catch up mode */
struct tcp_hybrid_req {
struct tcp_snd_req req;
diff --git a/sys/netinet/tcp_log_buf.h b/sys/netinet/tcp_log_buf.h
--- a/sys/netinet/tcp_log_buf.h
+++ b/sys/netinet/tcp_log_buf.h
@@ -267,7 +267,9 @@
TCP_RACK_TP_TRIGGERED, /* A rack tracepoint is triggered 68 */
TCP_HYBRID_PACING_LOG, /* Hybrid pacing log 69 */
TCP_LOG_PRU, /* TCP protocol user request 70 */
- TCP_LOG_END /* End (keep at end) 71 */
+ TCP_POLICER_DET, /* TCP Policer detection 71 */
+ TCP_PCM_MEASURE, /* TCP Path Capacity Measurement 72 */
+ TCP_LOG_END /* End (keep at end) 73 */
};
enum tcp_log_states {
@@ -371,10 +373,11 @@
#define TCP_TP_COLLAPSED_RXT 0x00000004 /* When we actually retransmit a collapsed window rsm */
#define TCP_TP_REQ_LOG_FAIL 0x00000005 /* We tried to allocate a Request log but had no space */
#define TCP_TP_RESET_RCV 0x00000006 /* Triggers when we receive a RST */
-#define TCP_TP_EXCESS_RXT 0x00000007 /* When we get excess RXT's clamping the cwnd */
+#define TCP_TP_POLICER_DET 0x00000007 /* When we detect a policer */
+#define TCP_TP_EXCESS_RXT TCP_TP_POLICER_DET /* alias */
#define TCP_TP_SAD_TRIGGERED 0x00000008 /* Sack Attack Detection triggers */
-
#define TCP_TP_SAD_SUSPECT 0x0000000a /* A sack has supicious information in it */
+#define TCP_TP_PACED_BOTTOM 0x0000000b /* We have paced at the bottom */
#ifdef _KERNEL
diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c
--- a/sys/netinet/tcp_stacks/bbr.c
+++ b/sys/netinet/tcp_stacks/bbr.c
@@ -11529,7 +11529,9 @@
bbr_set_pktepoch(bbr, cts, __LINE__);
bbr_check_bbr_for_state(bbr, cts, __LINE__, (bbr->r_ctl.rc_lost - lost));
if (nxt_pkt == 0) {
- if (bbr->r_wanted_output != 0) {
+ if ((bbr->r_wanted_output != 0) ||
+ (tp->t_flags & TF_ACKNOW)) {
+
bbr->rc_output_starts_timer = 0;
did_out = 1;
if (tcp_output(tp) < 0)
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -142,9 +142,12 @@
#define V_newreno_beta VNET(newreno_beta)
#define V_newreno_beta_ecn VNET(newreno_beta_ecn)
+#define M_TCPFSB __CONCAT(M_TCPFSB, STACKNAME)
+#define M_TCPDO __CONCAT(M_TCPDO, STACKNAME)
-MALLOC_DEFINE(M_TCPFSB, "tcp_fsb", "TCP fast send block");
-MALLOC_DEFINE(M_TCPDO, "tcp_do", "TCP deferred options");
+MALLOC_DEFINE(M_TCPFSB, "tcp_fsb_" __XSTRING(STACKNAME), "TCP fast send block");
+MALLOC_DEFINE(M_TCPDO, "tcp_do_" __XSTRING(STACKNAME), "TCP deferred options");
+MALLOC_DEFINE(M_TCPPCM, "tcp_pcm_" __XSTRING(STACKNAME), "TCP PCM measurement information");
struct sysctl_ctx_list rack_sysctl_ctx;
struct sysctl_oid *rack_sysctl_root;
@@ -190,12 +193,24 @@
static int32_t rack_reorder_thresh = 2;
static int32_t rack_reorder_fade = 60000000; /* 0 - never fade, def 60,000,000
* - 60 seconds */
-static uint32_t rack_clamp_ss_upper = 110;
-static uint32_t rack_clamp_ca_upper = 105;
-static uint32_t rack_rxt_min_rnds = 10; /* Min rounds if drastic rxt clamp is in place */
-static uint32_t rack_unclamp_round_thresh = 100; /* number of perfect rounds before we unclamp */
-static uint32_t rack_unclamp_rxt_thresh = 5; /* .5% and under */
-static uint64_t rack_rxt_clamp_thresh = 0; /* Do we do the rxt clamp thing */
+static uint16_t rack_policer_rxt_thresh = 0; /* 499 = 49.9%, 0 is off */
+static uint8_t rack_policer_avg_thresh = 0; /* 3.2 */
+static uint8_t rack_policer_med_thresh = 0; /* 1 - 16 */
+static uint16_t rack_policer_bucket_reserve = 20; /* How much % is reserved in the bucket */
+static uint64_t rack_pol_min_bw = 125000; /* 1mbps in Bytes per sec */
+static uint32_t rack_policer_data_thresh = 64000; /* 64,000 bytes must be sent before we engage */
+static uint32_t rack_policing_do_bw_comp = 1;
+static uint32_t rack_pcm_every_n_rounds = 100;
+static uint32_t rack_pcm_blast = 0;
+static uint32_t rack_pcm_is_enabled = 1;
+static uint8_t rack_req_del_mss = 18; /* How many segments need to be sent in a recovery episode to do policer_detection */
+static uint8_t rack_ssthresh_rest_rto_rec = 0; /* Do we restore ssthresh when we have rec -> rto -> rec */
+
+static uint32_t rack_gp_gain_req = 1200; /* Amount, percent wise, required to gain to record a round as "gaining" */
+static uint32_t rack_rnd_cnt_req = 0x10005; /* Default number of rounds if we are below rack_gp_gain_req where we exit ss */
+
+
+static int32_t rack_rxt_scoreboard_clear_thresh = 2;
static int32_t rack_dnd_default = 0; /* For rr_conf = 3, what is the default for dnd */
static int32_t rack_rxt_controls = 0;
static int32_t rack_fill_cw_state = 0;
@@ -217,9 +232,8 @@
static int32_t rack_apply_rtt_with_reduced_conf = 0;
static int32_t rack_hibeta_setting = 0;
static int32_t rack_default_pacing_divisor = 250;
-static int32_t rack_uses_full_dgp_in_rec = 1;
static uint16_t rack_pacing_min_seg = 0;
-
+static int32_t rack_timely_off = 0;
static uint32_t sad_seg_size_per = 800; /* 80.0 % */
static int32_t rack_pkt_delay = 1000;
@@ -235,7 +249,7 @@
static int32_t rack_max_abc_post_recovery = 2;
static int32_t rack_client_low_buf = 0;
static int32_t rack_dsack_std_based = 0x3; /* bit field bit 1 sets rc_rack_tmr_std_based and bit 2 sets rc_rack_use_dsack */
-static int32_t rack_bw_multipler = 2; /* Limit on fill cw's jump up to be this x gp_est */
+static int32_t rack_bw_multipler = 0; /* Limit on fill cw's jump up to be this x gp_est */
#ifdef TCP_ACCOUNTING
static int32_t rack_tcp_accounting = 0;
#endif
@@ -247,8 +261,9 @@
static int32_t rack_non_rxt_use_cr = 0; /* does a non-rxt in recovery use the configured rate (ss/ca)? */
static int32_t rack_persist_min = 250000; /* 250usec */
static int32_t rack_persist_max = 2000000; /* 2 Second in usec's */
+static int32_t rack_honors_hpts_min_to = 1; /* Do we honor the hpts minimum time out for pacing timers */
+static uint32_t rack_max_reduce = 10; /* Percent we can reduce slot by */
static int32_t rack_sack_not_required = 1; /* set to one to allow non-sack to use rack */
-static int32_t rack_default_init_window = 0; /* Use system default */
static int32_t rack_limit_time_with_srtt = 0;
static int32_t rack_autosndbuf_inc = 20; /* In percentage form */
static int32_t rack_enobuf_hw_boost_mult = 0; /* How many times the hw rate we boost slot using time_between */
@@ -282,7 +297,6 @@
static int32_t rack_def_profile = 0;
static int32_t rack_lower_cwnd_at_tlp = 0;
-static int32_t rack_limited_retran = 0;
static int32_t rack_always_send_oldest = 0;
static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;
@@ -356,6 +370,7 @@
static int32_t rack_down_raise_thresh = 100;
static int32_t rack_req_segs = 1;
static uint64_t rack_bw_rate_cap = 0;
+static uint64_t rack_fillcw_bw_cap = 3750000; /* Cap fillcw at 30Mbps */
/* Rack specific counters */
@@ -377,6 +392,7 @@
counter_u64_t rack_tlp_retran_bytes;
counter_u64_t rack_to_tot;
counter_u64_t rack_hot_alloc;
+counter_u64_t tcp_policer_detected;
counter_u64_t rack_to_alloc;
counter_u64_t rack_to_alloc_hard;
counter_u64_t rack_to_alloc_emerg;
@@ -440,7 +456,7 @@
static int
rack_process_ack(struct mbuf *m, struct tcphdr *th,
struct socket *so, struct tcpcb *tp, struct tcpopt *to,
- uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val);
+ uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val, int32_t orig_tlen);
static int
rack_process_data(struct mbuf *m, struct tcphdr *th,
struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
@@ -454,6 +470,8 @@
static struct rack_sendmap *
rack_check_recovery_mode(struct tcpcb *tp,
uint32_t tsused);
+static uint32_t
+rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack);
static void
rack_cong_signal(struct tcpcb *tp,
uint32_t type, uint32_t ack, int );
@@ -504,13 +522,14 @@
static void
rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
uint32_t seq_out, uint16_t th_flags, int32_t err, uint64_t ts,
- struct rack_sendmap *hintrsm, uint16_t add_flags, struct mbuf *s_mb, uint32_t s_moff, int hw_tls, int segsiz);
+ struct rack_sendmap *hintrsm, uint32_t add_flags, struct mbuf *s_mb, uint32_t s_moff, int hw_tls, int segsiz);
static uint64_t rack_get_gp_est(struct tcp_rack *rack);
+
static void
rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
- struct rack_sendmap *rsm);
+ struct rack_sendmap *rsm, uint32_t cts);
static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm);
static int32_t rack_output(struct tcpcb *tp);
@@ -526,10 +545,10 @@
static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
static uint32_t
rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
- struct rack_sendmap *rsm, uint64_t ts, int32_t * lenp, uint16_t add_flag, int segsiz);
+ struct rack_sendmap *rsm, uint64_t ts, int32_t * lenp, uint32_t add_flag, int segsiz);
static void
rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
- struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag, int segsiz);
+ struct rack_sendmap *rsm, uint64_t ts, uint32_t add_flag, int segsiz);
static int
rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack);
@@ -538,6 +557,10 @@
rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
+
+static void
+rack_peg_rxt(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t segsiz);
+
static int
rack_do_closing(struct mbuf *m, struct tcphdr *th,
struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
@@ -720,6 +743,22 @@
rack_swap_beta_values(rack, 4);
}
+static void
+rack_remove_pacing(struct tcp_rack *rack)
+{
+ if (rack->rc_pacing_cc_set)
+ rack_undo_cc_pacing(rack);
+ if (rack->r_ctl.pacing_method & RACK_REG_PACING)
+ tcp_decrement_paced_conn();
+ if (rack->r_ctl.pacing_method & RACK_DGP_PACING)
+ tcp_dec_dgp_pacing_cnt();
+ rack->rc_always_pace = 0;
+ rack->r_ctl.pacing_method = RACK_PACING_NONE;
+ rack->dgp_on = 0;
+ rack->rc_hybrid_mode = 0;
+ rack->use_fixed_rate = 0;
+}
+
static void
rack_log_gpset(struct tcp_rack *rack, uint32_t seq_end, uint32_t ack_end_t,
uint32_t send_end_t, int line, uint8_t mode, struct rack_sendmap *rsm)
@@ -742,6 +781,8 @@
log.u_bbr.pkts_out = line;
log.u_bbr.cwnd_gain = rack->app_limited_needs_set;
log.u_bbr.pkt_epoch = rack->r_ctl.rc_app_limited_cnt;
+ log.u_bbr.epoch = rack->r_ctl.current_round;
+ log.u_bbr.lt_epoch = rack->r_ctl.rc_considered_lost;
if (rsm != NULL) {
log.u_bbr.applimited = rsm->r_start;
log.u_bbr.delivered = rsm->r_end;
@@ -857,6 +898,7 @@
struct sysctl_oid *rack_measure;
struct sysctl_oid *rack_probertt;
struct sysctl_oid *rack_hw_pacing;
+ struct sysctl_oid *rack_policing;
rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
@@ -994,11 +1036,36 @@
"pacing",
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Pacing related Controls");
+ SYSCTL_ADD_U32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_pacing),
+ OID_AUTO, "pcm_enabled", CTLFLAG_RW,
+ &rack_pcm_is_enabled, 1,
+ "Do we by default do PCM measurements?");
+ SYSCTL_ADD_U32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_pacing),
+ OID_AUTO, "pcm_rnds", CTLFLAG_RW,
+ &rack_pcm_every_n_rounds, 100,
+ "How many rounds before we need to do a PCM measurement");
+ SYSCTL_ADD_U32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_pacing),
+ OID_AUTO, "pcm_blast", CTLFLAG_RW,
+ &rack_pcm_blast, 0,
+ "Blast out the full cwnd/rwnd when doing a PCM measurement");
+ SYSCTL_ADD_U32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_pacing),
+ OID_AUTO, "rnd_gp_gain", CTLFLAG_RW,
+ &rack_gp_gain_req, 1200,
+ "How much do we have to increase the GP to record the round 1200 = 120.0");
+ SYSCTL_ADD_U32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_pacing),
+ OID_AUTO, "dgp_out_of_ss_at", CTLFLAG_RW,
+ &rack_rnd_cnt_req, 0x10005,
+ "How many rounds less than rnd_gp_gain will drop us out of SS");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_pacing),
- OID_AUTO, "fulldgpinrec", CTLFLAG_RW,
- &rack_uses_full_dgp_in_rec, 1,
- "Do we use all DGP features in recovery (fillcw, timely et.al.)?");
+ OID_AUTO, "no_timely", CTLFLAG_RW,
+ &rack_timely_off, 0,
+ "Do we not use timely in DGP?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_pacing),
OID_AUTO, "fullbufdisc", CTLFLAG_RW,
@@ -1017,13 +1084,13 @@
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_pacing),
OID_AUTO, "divisor", CTLFLAG_RW,
- &rack_default_pacing_divisor, 4,
+ &rack_default_pacing_divisor, 250,
"What is the default divisor given to the rl code?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_pacing),
OID_AUTO, "fillcw_max_mult", CTLFLAG_RW,
- &rack_bw_multipler, 2,
- "What is the multiplier of the current gp_est that fillcw can increase the b/w too?");
+ &rack_bw_multipler, 0,
+ "What is the limit multiplier of the current gp_est that fillcw can increase the b/w too, 200 == 200% (0 = off)?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_pacing),
OID_AUTO, "max_pace_over", CTLFLAG_RW,
@@ -1039,11 +1106,6 @@
OID_AUTO, "limit_wsrtt", CTLFLAG_RW,
&rack_limit_time_with_srtt, 0,
"Do we limit pacing time based on srtt");
- SYSCTL_ADD_S32(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_pacing),
- OID_AUTO, "init_win", CTLFLAG_RW,
- &rack_default_init_window, 0,
- "Do we have a rack initial window 0 = system default");
SYSCTL_ADD_U16(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_pacing),
OID_AUTO, "gp_per_ss", CTLFLAG_RW,
@@ -1079,6 +1141,11 @@
OID_AUTO, "rate_cap", CTLFLAG_RW,
&rack_bw_rate_cap, 0,
"If set we apply this value to the absolute rate cap used by pacing");
+ SYSCTL_ADD_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_pacing),
+ OID_AUTO, "fillcw_cap", CTLFLAG_RW,
+ &rack_fillcw_bw_cap, 3750000,
+ "Do we have an absolute cap on the amount of b/w fillcw can specify (0 = no)?");
SYSCTL_ADD_U8(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
OID_AUTO, "req_measure_cnt", CTLFLAG_RW,
@@ -1317,11 +1384,6 @@
OID_AUTO, "send_oldest", CTLFLAG_RW,
&rack_always_send_oldest, 0,
"Should we always send the oldest TLP and RACK-TLP");
- SYSCTL_ADD_S32(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_tlp),
- OID_AUTO, "rack_tlimit", CTLFLAG_RW,
- &rack_limited_retran, 0,
- "How many times can a rack timeout drive out sends");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_tlp),
OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW,
@@ -1355,6 +1417,26 @@
"timers",
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Timer related controls");
+ SYSCTL_ADD_U8(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_timers),
+ OID_AUTO, "reset_ssth_rec_rto", CTLFLAG_RW,
+ &rack_ssthresh_rest_rto_rec, 0,
+ "When doing recovery -> rto -> recovery do we reset SSthresh?");
+ SYSCTL_ADD_U32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_timers),
+ OID_AUTO, "scoreboard_thresh", CTLFLAG_RW,
+ &rack_rxt_scoreboard_clear_thresh, 2,
+ "How many RTO's are allowed before we clear the scoreboard");
+ SYSCTL_ADD_U32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_timers),
+ OID_AUTO, "honor_hpts_min", CTLFLAG_RW,
+ &rack_honors_hpts_min_to, 1,
+ "Do rack pacing timers honor hpts min timeout");
+ SYSCTL_ADD_U32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_timers),
+ OID_AUTO, "hpts_max_reduce", CTLFLAG_RW,
+ &rack_max_reduce, 10,
+ "Max percentage we will reduce slot by for pacing when we are behind");
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_timers),
OID_AUTO, "persmin", CTLFLAG_RW,
@@ -1434,11 +1516,6 @@
"features",
CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"Feature controls");
- SYSCTL_ADD_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_features),
- OID_AUTO, "rxt_clamp_thresh", CTLFLAG_RW,
- &rack_rxt_clamp_thresh, 0,
- "Bit encoded clamping setup bits CCCC CCCCC UUUU UULF PPPP PPPP PPPP PPPP");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_features),
OID_AUTO, "hybrid_set_maxseg", CTLFLAG_RW,
@@ -1474,6 +1551,53 @@
OID_AUTO, "hystartplusplus", CTLFLAG_RW,
&rack_do_hystart, 0,
"Should RACK enable HyStart++ on connections?");
+ /* Policer detection */
+ rack_policing = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO,
+ "policing",
+ CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+ "policer detection");
+ SYSCTL_ADD_U16(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_policing),
+ OID_AUTO, "rxt_thresh", CTLFLAG_RW,
+ &rack_policer_rxt_thresh, 0,
+ "Percentage of retransmits we need to be a possible policer (499 = 49.9 percent)");
+ SYSCTL_ADD_U8(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_policing),
+ OID_AUTO, "avg_thresh", CTLFLAG_RW,
+ &rack_policer_avg_thresh, 0,
+ "What threshold of average retransmits needed to recover a lost packet (1 - 169 aka 21 = 2.1)?");
+ SYSCTL_ADD_U8(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_policing),
+ OID_AUTO, "med_thresh", CTLFLAG_RW,
+ &rack_policer_med_thresh, 0,
+ "What threshold of Median retransmits needed to recover a lost packet (1 - 16)?");
+ SYSCTL_ADD_U32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_policing),
+ OID_AUTO, "data_thresh", CTLFLAG_RW,
+ &rack_policer_data_thresh, 64000,
+ "How many bytes must have gotten through before we can start doing policer detection?");
+ SYSCTL_ADD_U32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_policing),
+ OID_AUTO, "bwcomp", CTLFLAG_RW,
+ &rack_policing_do_bw_comp, 1,
+ "Do we raise up low b/w so that at least pace_max_seg can be sent in the srtt?");
+ SYSCTL_ADD_U8(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_policing),
+ OID_AUTO, "recmss", CTLFLAG_RW,
+ &rack_req_del_mss, 18,
+ "How many MSS must be delivered during recovery to engage policer detection?");
+ SYSCTL_ADD_U16(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_policing),
+ OID_AUTO, "res_div", CTLFLAG_RW,
+ &rack_policer_bucket_reserve, 20,
+ "What percentage is reserved in the policer bucket?");
+ SYSCTL_ADD_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_policing),
+ OID_AUTO, "min_comp_bw", CTLFLAG_RW,
+ &rack_pol_min_bw, 125000,
+ "Do we have a min b/w for b/w compensation (0 = no)?");
/* Misc rack controls */
rack_misc = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
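
The policer-detection knobs registered above surface as ordinary sysctl OIDs under rack's sysctl root. A hedged user-space sketch of raising the thresholds; the net.inet.tcp.rack prefix is an assumption based on the in-tree module's stack name, so substitute whatever name the stack actually registers:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdint.h>

/* Illustrative only; OID names mirror the SYSCTL_ADD_* calls above. */
static int
tune_policer_detection(void)
{
        uint16_t rxt_thresh = 499;      /* 499 = 49.9% retransmits */
        uint8_t avg_thresh = 32;        /* 32 = 3.2 average rxts per recovered loss */
        uint8_t med_thresh = 2;         /* median rxts per recovered loss (1 - 16) */

        if (sysctlbyname("net.inet.tcp.rack.policing.rxt_thresh", NULL, NULL,
            &rxt_thresh, sizeof(rxt_thresh)) == -1)
                return (-1);
        if (sysctlbyname("net.inet.tcp.rack.policing.avg_thresh", NULL, NULL,
            &avg_thresh, sizeof(avg_thresh)) == -1)
                return (-1);
        return (sysctlbyname("net.inet.tcp.rack.policing.med_thresh", NULL, NULL,
            &med_thresh, sizeof(med_thresh)));
}
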
@@ -1578,31 +1702,8 @@
OID_AUTO, "autoscale", CTLFLAG_RW,
&rack_autosndbuf_inc, 20,
"What percentage should rack scale up its snd buffer by?");
- SYSCTL_ADD_U32(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_misc),
- OID_AUTO, "rnds_for_rxt_clamp", CTLFLAG_RW,
- &rack_rxt_min_rnds, 10,
- "Number of rounds needed between RTT clamps due to high loss rates");
- SYSCTL_ADD_U32(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_misc),
- OID_AUTO, "rnds_for_unclamp", CTLFLAG_RW,
- &rack_unclamp_round_thresh, 100,
- "Number of rounds needed with no loss to unclamp");
- SYSCTL_ADD_U32(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_misc),
- OID_AUTO, "rxt_threshs_for_unclamp", CTLFLAG_RW,
- &rack_unclamp_rxt_thresh, 5,
- "Percentage of retransmits we need to be under to unclamp (5 = .5 percent)\n");
- SYSCTL_ADD_U32(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_misc),
- OID_AUTO, "clamp_ss_upper", CTLFLAG_RW,
- &rack_clamp_ss_upper, 110,
- "Clamp percentage ceiling in SS?");
- SYSCTL_ADD_U32(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_misc),
- OID_AUTO, "clamp_ca_upper", CTLFLAG_RW,
- &rack_clamp_ca_upper, 110,
- "Clamp percentage ceiling in CA?");
+
+
/* Sack Attacker detection stuff */
SYSCTL_ADD_U32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_attack),
@@ -1779,6 +1880,13 @@
OID_AUTO, "alloc_hot", CTLFLAG_RD,
&rack_hot_alloc,
"Total allocations from the top of our list");
+ tcp_policer_detected = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "policer_detected", CTLFLAG_RD,
+ &tcp_policer_detected,
+ "Total policer_detections");
+
rack_to_alloc = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_counters),
@@ -1957,17 +2065,8 @@
static uint32_t
rc_init_window(struct tcp_rack *rack)
{
- uint32_t win;
+ return (tcp_compute_initwnd(tcp_maxseg(rack->rc_tp)));
- if (rack->rc_init_win == 0) {
- /*
- * Nothing set by the user, use the system stack
- * default.
- */
- return (tcp_compute_initwnd(tcp_maxseg(rack->rc_tp)));
- }
- win = ctf_fixed_maxseg(rack->rc_tp) * rack->rc_init_win;
- return (win);
}
static uint64_t
@@ -2071,6 +2170,7 @@
off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_tcpreq_info[0]);
log.u_bbr.bbr_substate = (uint8_t)(off / sizeof(struct tcp_sendfile_track));
#endif
+ log.u_bbr.inhpts = 1;
log.u_bbr.flex4 = (uint32_t)(rack->rc_tp->t_sndbytes - cur->sent_at_fs);
log.u_bbr.flex5 = (uint32_t)(rack->rc_tp->t_snd_rxt_bytes - cur->rxt_at_fs);
log.u_bbr.flex7 = (uint16_t)cur->hybrid_flags;
@@ -2116,9 +2216,24 @@
memset(&log, 0, sizeof(log));
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
- log.u_bbr.cur_del_rate = rack->rc_tp->t_sndbytes;
log.u_bbr.delRate = cur->sent_at_fs;
- log.u_bbr.rttProp = rack->rc_tp->t_snd_rxt_bytes;
+
+ if ((cur->flags & TCP_TRK_TRACK_FLG_LSND) == 0) {
+ /*
+ * We did not get a new Rules Applied to set so
+ * no overlapping send occured, this means the
+ * current byte counts are correct.
+ */
+ log.u_bbr.cur_del_rate = rack->rc_tp->t_sndbytes;
+ log.u_bbr.rttProp = rack->rc_tp->t_snd_rxt_bytes;
+ } else {
+ /*
+ * Overlapping send case, we switched to a new
+ * send and did a rules applied.
+ */
+ log.u_bbr.cur_del_rate = cur->sent_at_ls;
+ log.u_bbr.rttProp = cur->rxt_at_ls;
+ }
log.u_bbr.bw_inuse = cur->rxt_at_fs;
log.u_bbr.cwnd_gain = line;
off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_tcpreq_info[0]);
@@ -2138,6 +2253,7 @@
log.u_bbr.lt_epoch = (uint32_t)((cur->timestamp >> 32) & 0x00000000ffffffff);
/* now set all the flags in */
log.u_bbr.pkts_out = cur->hybrid_flags;
+ log.u_bbr.lost = cur->playout_ms;
log.u_bbr.flex6 = cur->flags;
/*
* Last send time = <flex5 | pkt_epoch> note we do not distinguish cases
@@ -2146,6 +2262,20 @@
*/
log.u_bbr.pkt_epoch = (uint32_t)(rack->r_ctl.last_tmit_time_acked & 0x00000000ffffffff);
log.u_bbr.flex5 = (uint32_t)((rack->r_ctl.last_tmit_time_acked >> 32) & 0x00000000ffffffff);
+ /*
+ * Compose bbr_state to be a bit wise 0000ADHF
+ * where A is the always_pace flag
+ * where D is the dgp_on flag
+ * where H is the hybrid_mode on flag
+ * where F is the use_fixed_rate flag.
+ */
+ log.u_bbr.bbr_state = rack->rc_always_pace;
+ log.u_bbr.bbr_state <<= 1;
+ log.u_bbr.bbr_state |= rack->dgp_on;
+ log.u_bbr.bbr_state <<= 1;
+ log.u_bbr.bbr_state |= rack->rc_hybrid_mode;
+ log.u_bbr.bbr_state <<= 1;
+ log.u_bbr.bbr_state |= rack->use_fixed_rate;
log.u_bbr.flex8 = HYBRID_LOG_SENT_LOST;
tcp_log_event(rack->rc_tp, NULL,
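
The 0000ADHF packing described in the comment above is just four flag bits shifted into bbr_state. A small decoding sketch for readers of the resulting BB logs (hypothetical helper, not part of the patch); the shift order matches the encoding above, so always_pace lands in bit 3 and use_fixed_rate in bit 0:

#include <stdint.h>

struct pacing_flags {
        uint8_t always_pace;
        uint8_t dgp_on;
        uint8_t hybrid_mode;
        uint8_t use_fixed_rate;
};

/* Unpack the 0000ADHF layout documented above. */
static struct pacing_flags
decode_bbr_state(uint8_t bbr_state)
{
        struct pacing_flags f;

        f.always_pace = (bbr_state >> 3) & 1;   /* A */
        f.dgp_on = (bbr_state >> 2) & 1;        /* D */
        f.hybrid_mode = (bbr_state >> 1) & 1;   /* H */
        f.use_fixed_rate = bbr_state & 1;       /* F */
        return (f);
}
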
@@ -2299,6 +2429,7 @@
#ifdef TCP_REQUEST_TRK
if (rack->rc_hybrid_mode &&
rack->rc_catch_up &&
+ (rack->r_ctl.rc_last_sft != NULL) &&
(rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_S_MSS) &&
(rack_hybrid_allow_set_maxseg == 1) &&
((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_SETMSS) == 0)) {
@@ -2338,7 +2469,10 @@
*/
uint64_t srtt;
- lt_bw = rack_get_lt_bw(rack);
+ if (rack->dis_lt_bw == 1)
+ lt_bw = 0;
+ else
+ lt_bw = rack_get_lt_bw(rack);
if (lt_bw) {
/*
* No goodput bw but a long-term b/w does exist
@@ -2374,19 +2508,22 @@
/* Still doing initial average must calculate */
bw = rack->r_ctl.gp_bw / max(rack->r_ctl.num_measurements, 1);
}
+ if (rack->dis_lt_bw) {
+ /* We are not using lt-bw */
+ ret_bw = bw;
+ goto compensate;
+ }
lt_bw = rack_get_lt_bw(rack);
if (lt_bw == 0) {
/* If we don't have one then equate it to the gp_bw */
lt_bw = rack->r_ctl.gp_bw;
}
- if ((rack->r_cwnd_was_clamped == 1) && (rack->r_clamped_gets_lower > 0)){
- /* if clamped take the lowest */
+ if (rack->use_lesser_lt_bw) {
if (lt_bw < bw)
ret_bw = lt_bw;
else
ret_bw = bw;
} else {
- /* If not set for clamped to get lowest, take the highest */
if (lt_bw > bw)
ret_bw = lt_bw;
else
@@ -2487,6 +2624,8 @@
log.u_bbr.flex7 = rack->r_ctl.dsack_persist;
log.u_bbr.flex8 = mod;
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.epoch = rack->r_ctl.current_round;
+ log.u_bbr.lt_epoch = rack->r_ctl.rc_considered_lost;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -2535,6 +2674,8 @@
else
log.u_bbr.cur_del_rate = 0;
log.u_bbr.rttProp = rack->r_ctl.last_hw_bw_req;
+ log.u_bbr.epoch = rack->r_ctl.current_round;
+ log.u_bbr.lt_epoch = rack->r_ctl.rc_considered_lost;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -2552,28 +2693,9 @@
uint64_t bw_est, high_rate;
uint64_t gain;
- if ((rack->r_pacing_discount == 0) ||
- (rack_full_buffer_discount == 0)) {
- /*
- * No buffer level based discount from client buffer
- * level is enabled or the feature is disabled.
- */
- gain = (uint64_t)rack_get_output_gain(rack, rsm);
- bw_est = bw * gain;
- bw_est /= (uint64_t)100;
- } else {
- /*
- * We have a discount in place apply it with
- * just a 100% gain (we get no boost if the buffer
- * is full).
- */
- uint64_t discount;
-
- discount = bw * (uint64_t)(rack_full_buffer_discount * rack->r_ctl.pacing_discount_amm);
- discount /= 100;
- /* What %% of the b/w do we discount */
- bw_est = bw - discount;
- }
+ gain = (uint64_t)rack_get_output_gain(rack, rsm);
+ bw_est = bw * gain;
+ bw_est /= (uint64_t)100;
/* Never fall below the minimum (def 64kbps) */
if (bw_est < RACK_MIN_BW)
bw_est = RACK_MIN_BW;
@@ -2659,6 +2781,8 @@
log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
log.u_bbr.pacing_gain = rack->r_must_retran;
+ log.u_bbr.epoch = rack->r_ctl.current_round;
+ log.u_bbr.lt_epoch = rack->r_ctl.rc_considered_lost;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -2698,6 +2822,10 @@
log.u_bbr.lt_epoch = rack->rc_tp->t_rxtshift;
log.u_bbr.lost = rack_rto_min;
log.u_bbr.epoch = rack->r_ctl.roundends;
+ log.u_bbr.bw_inuse = rack->r_ctl.current_round;
+ log.u_bbr.bw_inuse <<= 32;
+ log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
+ log.u_bbr.applimited = rack->rc_tp->t_flags2;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -2731,6 +2859,9 @@
log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
log.u_bbr.pacing_gain = rack->r_must_retran;
+ log.u_bbr.bw_inuse = rack->r_ctl.current_round;
+ log.u_bbr.bw_inuse <<= 32;
+ log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -2780,6 +2911,9 @@
log.u_bbr.lost = 0;
else
log.u_bbr.lost = rack->r_ctl.rc_prr_sndcnt;
+ log.u_bbr.bw_inuse = rack->r_ctl.current_round;
+ log.u_bbr.bw_inuse <<= 32;
+ log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -2927,6 +3061,9 @@
log.u_bbr.flex4 = where;
log.u_bbr.flex7 = 2;
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.bw_inuse = rack->r_ctl.current_round;
+ log.u_bbr.bw_inuse <<= 32;
+ log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -2939,7 +3076,7 @@
static void
rack_log_rtt_sendmap(struct tcp_rack *rack, uint32_t idx, uint64_t tsv, uint32_t tsecho)
{
- if (tcp_bblogging_on(rack->rc_tp)) {
+ if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
union tcp_log_stackspecific log;
struct timeval tv;
@@ -2951,6 +3088,9 @@
log.u_bbr.flex7 = 3;
log.u_bbr.rttProp = tsv;
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.bw_inuse = rack->r_ctl.current_round;
+ log.u_bbr.bw_inuse <<= 32;
+ log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -2979,6 +3119,9 @@
log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
log.u_bbr.pacing_gain = rack->r_must_retran;
+ log.u_bbr.bw_inuse = rack->r_ctl.current_round;
+ log.u_bbr.bw_inuse <<= 32;
+ log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
TCP_LOG_EVENTP(tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -3051,6 +3194,13 @@
log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
log.u_bbr.pacing_gain = rack->r_must_retran;
+ log.u_bbr.bw_inuse = rack->r_ctl.current_round;
+ log.u_bbr.bw_inuse <<= 32;
+ log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
+ log.u_bbr.epoch = rack->rc_inp->inp_socket->so_snd.sb_hiwat;
+ log.u_bbr.lt_epoch = rack->rc_inp->inp_socket->so_rcv.sb_hiwat;
+ log.u_bbr.lost = rack->rc_tp->t_srtt;
+ log.u_bbr.pkt_epoch = rack->rc_tp->rfbuf_cnt;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -3112,6 +3262,9 @@
log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
log.u_bbr.pacing_gain = rack->r_must_retran;
log.u_bbr.cwnd_gain = rack->rc_has_collapsed;
+ log.u_bbr.bw_inuse = rack->r_ctl.current_round;
+ log.u_bbr.bw_inuse <<= 32;
+ log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -3146,6 +3299,9 @@
log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
log.u_bbr.pacing_gain = rack->r_must_retran;
+ log.u_bbr.bw_inuse = rack->r_ctl.current_round;
+ log.u_bbr.bw_inuse <<= 32;
+ log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -3314,6 +3470,7 @@
counter_u64_free(rack_saw_enobuf_hw);
counter_u64_free(rack_saw_enetunreach);
counter_u64_free(rack_hot_alloc);
+ counter_u64_free(tcp_policer_detected);
counter_u64_free(rack_to_alloc);
counter_u64_free(rack_to_alloc_hard);
counter_u64_free(rack_to_alloc_emerg);
@@ -3475,6 +3632,8 @@
rack->r_ctl.rc_num_split_allocs--;
}
if (rsm == rack->r_ctl.rc_first_appl) {
+ rack->r_ctl.cleared_app_ack_seq = rsm->r_start + (rsm->r_end - rsm->r_start);
+ rack->r_ctl.cleared_app_ack = 1;
if (rack->r_ctl.rc_app_limited_cnt == 0)
rack->r_ctl.rc_first_appl = NULL;
else
@@ -3490,7 +3649,7 @@
rack->r_ctl.rc_sacklast = NULL;
memset(rsm, 0, sizeof(struct rack_sendmap));
/* Make sure we are not going to overrun our count limit of 0xff */
- if ((rack->rc_free_cnt + 1) > 0xff) {
+ if ((rack->rc_free_cnt + 1) > RACK_FREE_CNT_MAX) {
rack_free_trim(rack);
}
TAILQ_INSERT_HEAD(&rack->r_ctl.rc_free, rsm, r_tnext);
@@ -3806,6 +3965,8 @@
logged = 0;
+ if (rack->rc_skip_timely)
+ return;
if (override) {
/*
* override is passed when we are
@@ -3976,6 +4137,8 @@
uint64_t logvar, logvar2, logvar3;
uint32_t logged, new_per, ss_red, ca_red, rec_red, alt, val;
+ if (rack->rc_skip_timely)
+ return;
if (rack->rc_gp_incr) {
/* Turn off increment counting */
rack->rc_gp_incr = 0;
@@ -4177,6 +4340,7 @@
*/
uint32_t segsiz;
+ rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
if (rack->rc_gp_dyn_mul == 0)
return;
@@ -4203,7 +4367,6 @@
rack->r_ctl.rc_pace_min_segs);
rack->in_probe_rtt = 1;
rack->measure_saw_probe_rtt = 1;
- rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
rack->r_ctl.rc_time_probertt_starts = 0;
rack->r_ctl.rc_entry_gp_rtt = rack->r_ctl.rc_gp_srtt;
if (rack_probertt_use_min_rtt_entry)
@@ -4387,6 +4550,7 @@
rack_check_probe_rtt(struct tcp_rack *rack, uint32_t us_cts)
{
/* Check in on probe-rtt */
+
if (rack->rc_gp_filled == 0) {
/* We do not do p-rtt unless we have gp measurements */
return;
@@ -4431,7 +4595,10 @@
if (calc) {
/* Maybe */
calc *= rack_per_of_gp_probertt_reduce;
- rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt - calc;
+ if (calc > rack_per_of_gp_probertt)
+ rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh;
+ else
+ rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt - calc;
/* Limit it too */
if (rack->r_ctl.rack_per_of_gp_probertt < rack_per_of_gp_lowthresh)
rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh;
@@ -4472,7 +4639,9 @@
rack_exit_probertt(rack, us_cts);
}
- } else if ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt) {
+ } else if ((rack->rc_skip_timely == 0) &&
+ (TSTMP_GT(us_cts, rack->r_ctl.rc_lower_rtt_us_cts)) &&
+ ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt)) {
/* Go into probertt, its been too long since we went lower */
rack_enter_probertt(rack, us_cts);
}
@@ -4831,6 +5000,32 @@
}
}
+static void
+rack_log_gp_calc(struct tcp_rack *rack, uint32_t add_part, uint32_t sub_part, uint32_t srtt, uint64_t meas_bw, uint64_t utim, uint8_t meth, uint32_t line)
+{
+ if (tcp_bblogging_on(rack->rc_tp)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.flex1 = add_part;
+ log.u_bbr.flex2 = sub_part;
+ log.u_bbr.flex3 = rack_wma_divisor;
+ log.u_bbr.flex4 = srtt;
+ log.u_bbr.flex7 = (uint16_t)line;
+ log.u_bbr.flex8 = meth;
+ log.u_bbr.delRate = rack->r_ctl.gp_bw;
+ log.u_bbr.cur_del_rate = meas_bw;
+ log.u_bbr.rttProp = utim;
+ TCP_LOG_EVENTP(rack->rc_tp, NULL,
+ &rack->rc_inp->inp_socket->so_rcv,
+ &rack->rc_inp->inp_socket->so_snd,
+ BBR_LOG_THRESH_CALC, 0,
+ 0, &log, false, &rack->r_ctl.act_rcv_time);
+ }
+}
+
static void
rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
tcp_seq th_ack, int line, uint8_t quality)
@@ -5046,6 +5241,8 @@
* other hand if we get a measurement over 1ms with a
* 10ms rtt we only want to take a much smaller portion.
*/
+ uint8_t meth;
+
if (rack->r_ctl.num_measurements < 0xff) {
rack->r_ctl.num_measurements++;
}
@@ -5086,6 +5283,7 @@
*/
addpart = bytes_ps * utim;
addpart /= (srtt * 8);
+ meth = 1;
} else {
/*
* Don't allow a single measurement
@@ -5098,7 +5296,9 @@
*/
subpart = rack->r_ctl.gp_bw / 2;
addpart = bytes_ps / 2;
+ meth = 2;
}
+ rack_log_gp_calc(rack, addpart, subpart, srtt, bytes_ps, utim, meth, __LINE__);
resid_bw = rack->r_ctl.gp_bw - subpart;
rack->r_ctl.gp_bw = resid_bw + addpart;
did_add = 1;
@@ -5116,6 +5316,7 @@
subpart /= (srtt * rack_wma_divisor);
addpart = bytes_ps * utim;
addpart /= (srtt * rack_wma_divisor);
+ meth = 3;
} else {
/*
* The scaled measurement was long
@@ -5124,6 +5325,7 @@
*/
subpart = rack->r_ctl.gp_bw / rack_wma_divisor;
addpart = bytes_ps / rack_wma_divisor;
+ meth = 4;
}
if ((rack->measure_saw_probe_rtt == 0) ||
(bytes_ps > rack->r_ctl.gp_bw)) {
@@ -5133,12 +5335,83 @@
* add in.
*/
did_add = 1;
+ rack_log_gp_calc(rack, addpart, subpart, srtt, bytes_ps, utim, meth, __LINE__);
resid_bw = rack->r_ctl.gp_bw - subpart;
rack->r_ctl.gp_bw = resid_bw + addpart;
}
}
rack_set_pace_segments(tp, rack, __LINE__, NULL);
}
+ /*
+ * We only watch the growth of the GP during the initial startup
+ * or first-slowstart that ensues. If we ever needed to watch
+ * growth of gp outside of that period all we need to do is
+ * remove the first clause of this if (rc_initial_ss_comp).
+ */
+ if ((rack->rc_initial_ss_comp == 0) &&
+ (rack->r_ctl.num_measurements >= RACK_REQ_AVG)) {
+ uint64_t gp_est;
+
+ gp_est = bytes_ps;
+ if (tcp_bblogging_on(rack->rc_tp)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.flex1 = rack->r_ctl.current_round;
+ log.u_bbr.flex2 = rack->r_ctl.last_rnd_of_gp_rise;
+ log.u_bbr.delRate = gp_est;
+ log.u_bbr.cur_del_rate = rack->r_ctl.last_gpest;
+ log.u_bbr.flex8 = 41;
+ (void)tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
+ 0, &log, false, NULL, __func__, __LINE__,&tv);
+ }
+ if ((rack->r_ctl.num_measurements == RACK_REQ_AVG) ||
+ (rack->r_ctl.last_gpest == 0)) {
+ /*
+ * The round we get our measurement averaging going
+ * is the base round so it always is the source point
+ * for when we had our first increment. From there on
+ * we only record the round that had a rise.
+ */
+ rack->r_ctl.last_rnd_of_gp_rise = rack->r_ctl.current_round;
+ rack->r_ctl.last_gpest = rack->r_ctl.gp_bw;
+ } else if (gp_est >= rack->r_ctl.last_gpest) {
+ /*
+ * Test to see if it's gone up enough
+ * to set the round count up to now. Note
+ * that on the seeding of the 4th measurement we
+ */
+ gp_est *= 1000;
+ gp_est /= rack->r_ctl.last_gpest;
+ if ((uint32_t)gp_est > rack->r_ctl.gp_gain_req) {
+ /*
+ * We went up enough to record the round.
+ */
+ if (tcp_bblogging_on(rack->rc_tp)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.flex1 = rack->r_ctl.current_round;
+ log.u_bbr.flex2 = (uint32_t)gp_est;
+ log.u_bbr.flex3 = rack->r_ctl.gp_gain_req;
+ log.u_bbr.delRate = gp_est;
+ log.u_bbr.cur_del_rate = rack->r_ctl.last_gpest;
+ log.u_bbr.flex8 = 42;
+ (void)tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
+ 0, &log, false, NULL, __func__, __LINE__,&tv);
+ }
+ rack->r_ctl.last_rnd_of_gp_rise = rack->r_ctl.current_round;
+ if (rack->r_ctl.use_gp_not_last == 1)
+ rack->r_ctl.last_gpest = rack->r_ctl.gp_bw;
+ else
+ rack->r_ctl.last_gpest = bytes_ps;
+ }
+ }
+ }
if ((rack->gp_ready == 0) &&
(rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) {
/* We have enough measurements now */
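
The gain test above scales the new estimate by 1000 before dividing by the last recorded one, so rack_gp_gain_req = 1200 means a round only counts as "gaining" when goodput rose by at least 20%. A hypothetical standalone helper mirroring that arithmetic:

#include <stdint.h>

/*
 * Illustrative helper, not the stack's code: returns non-zero when the new
 * goodput estimate is at least gain_req/1000 times the last recorded one.
 * Example: 100 Mbps -> 125 Mbps gives a ratio of 1250, which clears the
 * default gain_req of 1200.
 */
static int
gp_round_gained(uint64_t gp_est, uint64_t last_gpest, uint32_t gain_req)
{
        uint64_t ratio;

        if (last_gpest == 0)
                return (0);     /* baseline not seeded yet; the code above seeds it */
        ratio = (gp_est * 1000) / last_gpest;
        return ((uint32_t)ratio > gain_req);
}
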
@@ -5152,10 +5425,15 @@
rack_log_pacing_delay_calc(rack, subpart, addpart, bytes_ps, stim,
rack_get_bw(rack), 22, did_add, NULL, quality);
/* We do not update any multipliers if we are in or have seen a probe-rtt */
- if ((rack->measure_saw_probe_rtt == 0) && rack->rc_gp_rtt_set)
- rack_update_multiplier(rack, timely_says, bytes_ps,
- rack->r_ctl.rc_gp_srtt,
- rack->r_ctl.rc_rtt_diff);
+
+ if ((rack->measure_saw_probe_rtt == 0) &&
+ rack->rc_gp_rtt_set) {
+ if (rack->rc_skip_timely == 0) {
+ rack_update_multiplier(rack, timely_says, bytes_ps,
+ rack->r_ctl.rc_gp_srtt,
+ rack->r_ctl.rc_rtt_diff);
+ }
+ }
rack_log_pacing_delay_calc(rack, bytes, tim, bytes_ps, stim,
rack_get_bw(rack), 3, line, NULL, quality);
rack_log_pacing_delay_calc(rack,
@@ -5179,7 +5457,6 @@
rack->rc_gp_saw_ca = 0;
rack->rc_gp_saw_ss = 0;
rack->rc_dragged_bottom = 0;
-
if (quality == RACK_QUALITY_HIGH) {
/*
* Gput in the stats world is in kbps where bytes_ps is
@@ -5326,7 +5603,7 @@
*/
static void
rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, uint32_t th_ack, uint16_t nsegs,
- uint16_t type, int32_t recovery)
+ uint16_t type, int32_t post_recovery)
{
uint32_t prior_cwnd, acked;
struct tcp_log_buffer *lgb = NULL;
@@ -5335,7 +5612,7 @@
INP_WLOCK_ASSERT(tptoinpcb(tp));
tp->t_ccv.nsegs = nsegs;
acked = tp->t_ccv.bytes_this_ack = (th_ack - tp->snd_una);
- if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) {
+ if ((post_recovery) && (rack->r_ctl.rc_early_recovery_segs)) {
uint32_t max;
max = rack->r_ctl.rc_early_recovery_segs * ctf_fixed_maxseg(tp);
@@ -5348,17 +5625,21 @@
((int32_t)rack->r_ctl.cwnd_to_use) - tp->snd_wnd);
#endif
if ((th_ack == tp->snd_max) && rack->lt_bw_up) {
- /* We will ack all, time
- * to end any lt_bw_up we
- * have running until something
- * new is sent.
+ /*
+ * We will ack all the data, time to end any
+ * lt_bw_up we have running until something
+ * new is sent. Note we need to use the actual
+ * ack_rcv_time which with pacing may be different.
*/
- struct timeval tv;
+ uint64_t tmark;
rack->r_ctl.lt_bw_bytes += (tp->snd_max - rack->r_ctl.lt_seq);
rack->r_ctl.lt_seq = tp->snd_max;
- (void)tcp_get_usecs(&tv);
- rack->r_ctl.lt_bw_time += (tcp_tv_to_lusectick(&tv) - rack->r_ctl.lt_timemark);
+ tmark = tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time);
+ if (tmark >= rack->r_ctl.lt_timemark) {
+ rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark);
+ }
+ rack->r_ctl.lt_timemark = tmark;
rack->lt_bw_up = 0;
}
quality = RACK_QUALITY_NONE;
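
The lt_bw accounting above only accumulates a byte count and a microsecond interval; turning that pair into bytes per second is the same integer arithmetic the policer-detection code later in this patch uses for del_bw (bytes * 1,000,000 / usecs). A minimal sketch with illustrative names:

#include <stdint.h>

/*
 * Minimal sketch of the bytes/usec -> bytes/sec conversion used by the
 * lt_bw accounting above (and by the recovery delivery-rate computation
 * later in this patch); names are illustrative, not the stack's.
 * Example: 1,500,000 bytes over 250,000 usec -> 6,000,000 B/s (48 Mbps).
 */
static uint64_t
bytes_per_sec(uint64_t bytes, uint64_t usecs)
{
        if (usecs == 0)
                return (0);
        return ((bytes * (uint64_t)1000000) / usecs);
}
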
@@ -5385,7 +5666,7 @@
tp->t_bytes_acked = 0;
}
prior_cwnd = tp->snd_cwnd;
- if ((recovery == 0) || (rack_max_abc_post_recovery == 0) || rack->r_use_labc_for_rec ||
+ if ((post_recovery == 0) || (rack_max_abc_post_recovery == 0) || rack->r_use_labc_for_rec ||
(rack_client_low_buf && rack->client_bufferlvl &&
(rack->client_bufferlvl < rack_client_low_buf)))
labc_to_use = rack->rc_labc;
@@ -5446,6 +5727,14 @@
if (rack->r_ctl.rc_rack_largest_cwnd < rack->r_ctl.cwnd_to_use) {
rack->r_ctl.rc_rack_largest_cwnd = rack->r_ctl.cwnd_to_use;
}
+ if ((rack->rc_initial_ss_comp == 0) &&
+ (tp->snd_cwnd >= tp->snd_ssthresh)) {
+ /*
+ * The cwnd has grown beyond ssthresh we have
+ * entered ca and completed our first Slowstart.
+ */
+ rack->rc_initial_ss_comp = 1;
+ }
}
static void
@@ -5467,180 +5756,64 @@
rack->r_wanted_output = 1;
}
-static inline void
-rack_set_most_aggr(struct tcp_rack *rack)
-{
- rack->r_fill_less_agg = 0;
- /* Once the cwnd as been clamped we don't do fill_cw */
- if (rack->r_cwnd_was_clamped == 0)
- rack->rc_pace_to_cwnd = 1;
- rack->r_pacing_discount = 0;
-}
-
-static inline void
-rack_limit_fillcw(struct tcp_rack *rack)
-{
- rack->r_fill_less_agg = 1;
- /* Once the cwnd as been clamped we don't do fill_cw */
- if (rack->r_cwnd_was_clamped == 0)
- rack->rc_pace_to_cwnd = 1;
- rack->r_pacing_discount = 0;
-}
-
-static inline void
-rack_disable_fillcw(struct tcp_rack *rack)
+static inline uint64_t
+rack_get_rxt_per(uint64_t snds, uint64_t rxts)
{
- rack->r_fill_less_agg = 1;
- rack->rc_pace_to_cwnd = 0;
- rack->r_pacing_discount = 0;
-}
+ uint64_t rxt_per;
-static void
-rack_client_buffer_level_set(struct tcp_rack *rack)
-{
- /*
- * Only if DGP is on do we do anything that
- * changes stack behavior. If DGP is off all
- * we will do is issue a BB log (if BB logging is
- * on) and return.
- */
- if (rack->dgp_on == 0) {
- rack_log_pacing_delay_calc(rack, 0, rack->client_bufferlvl,
- 0, 0, 0, 30, __LINE__, NULL, 0);
- return;
- }
- if (IN_RECOVERY(rack->rc_tp->t_flags) && rack->r_ctl.full_dgp_in_rec) {
- goto set_most_agg;
- }
- /*
- * We are in DGP so what setting should we
- * apply based on where the client is?
- */
- switch(rack->r_ctl.rc_dgp_bl_agg) {
- default:
- case DGP_LEVEL0:
-set_most_agg:
- rack_set_most_aggr(rack);
- break;
- case DGP_LEVEL1:
- if (rack->client_bufferlvl == 4)
- rack_limit_fillcw(rack);
- else if (rack->client_bufferlvl == 5)
- rack_disable_fillcw(rack);
- else
- rack_set_most_aggr(rack);
- break;
- case DGP_LEVEL2:
- if (rack->client_bufferlvl == 3)
- rack_limit_fillcw(rack);
- else if (rack->client_bufferlvl == 4)
- rack_disable_fillcw(rack);
- else if (rack->client_bufferlvl == 5) {
- rack_disable_fillcw(rack);
- rack->r_pacing_discount = 1;
- rack->r_ctl.pacing_discount_amm = 1;
- } else
- rack_set_most_aggr(rack);
- break;
- case DGP_LEVEL3:
- if (rack->client_bufferlvl == 2)
- rack_limit_fillcw(rack);
- else if (rack->client_bufferlvl == 3)
- rack_disable_fillcw(rack);
- else if (rack->client_bufferlvl == 4) {
- rack_disable_fillcw(rack);
- rack->r_pacing_discount = 1;
- rack->r_ctl.pacing_discount_amm = 1;
- } else if (rack->client_bufferlvl == 5) {
- rack_disable_fillcw(rack);
- rack->r_pacing_discount = 1;
- rack->r_ctl.pacing_discount_amm = 2;
- } else
- rack_set_most_aggr(rack);
- break;
+ if (snds > 0) {
+ rxt_per = rxts * 1000;
+ rxt_per /= snds;
+ } else {
+ /* This is an unlikely path */
+ if (rxts) {
+ /* Its the max it was all re-transmits */
+ rxt_per = 0xffffffffffffffff;
+ } else {
+ rxt_per = 0;
+ }
}
- rack_log_pacing_delay_calc(rack, rack->r_ctl.rc_dgp_bl_agg, rack->client_bufferlvl, 0,
- 0, 0, 30, __LINE__, NULL, 0);
+ return (rxt_per);
}
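
rack_get_rxt_per() reports retransmitted bytes per mille of bytes sent, the same scale used by rack_policer_rxt_thresh (499 = 49.9%). A worked illustration, written as if called from within rack.c and not part of the patch:

/*
 * Illustrative fragment only: 1,000,000 bytes sent with 520,000 bytes
 * retransmitted yields 520 per-mille, i.e. 52.0%, which would cross a
 * rack_policer_rxt_thresh of 499 (49.9%).
 */
uint64_t rxt_per = rack_get_rxt_per(1000000, 520000);   /* == 520 */
int suspect = (rack_policer_rxt_thresh != 0 &&
    rxt_per >= rack_policer_rxt_thresh);                /* 520 >= 499 */
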
static void
-do_rack_check_for_unclamp(struct tcpcb *tp, struct tcp_rack *rack)
+policer_detection_log(struct tcp_rack *rack, uint32_t flex1, uint32_t flex2, uint32_t flex3, uint32_t flex4, uint8_t flex8)
{
- /*
- * Can we unclamp. We unclamp if more than
- * N rounds have transpired with no loss.
- */
- uint64_t snds, rxts, rxt_per;
- uint32_t rnds;
-
- rnds = rack->r_ctl.current_round - rack->r_ctl.last_rnd_rxt_clamped;
- if ((rack_unclamp_round_thresh > 0) &&
- (rnds >= rack_unclamp_round_thresh)) {
- snds = tp->t_sndbytes - rack->r_ctl.last_sndbytes;
- KASSERT ((snds > 0), ("rack:%p tp:%p snds:%ju is 0", rack, tp,
- (uintmax_t)snds));
- rxts = tp->t_snd_rxt_bytes - rack->r_ctl.last_snd_rxt_bytes;
- rxt_per = rxts * 1000;
- rxt_per /= snds;
- if ((uint32_t)rxt_per <= rack_unclamp_rxt_thresh) {
- /* Unclamp */
- if (tcp_bblogging_on(rack->rc_tp)) {
- union tcp_log_stackspecific log;
- struct timeval tv;
-
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
- log.u_bbr.timeStamp = tcp_get_usecs(&tv);
- log.u_bbr.flex3 = rnds;
- log.u_bbr.flex4 = rack_unclamp_round_thresh;
- log.u_bbr.flex5 = (uint32_t)rxt_per;
- log.u_bbr.flex8 = 6;
- log.u_bbr.pkt_epoch = rack->r_ctl.rc_pace_max_segs;
- log.u_bbr.bbr_state = rack->rc_pace_to_cwnd;
- log.u_bbr.delivered = rack->r_ctl.num_of_clamps_applied;
- log.u_bbr.applimited = rack->r_ctl.max_clamps;
- log.u_bbr.epoch = rack->r_ctl.clamp_options;
- log.u_bbr.cur_del_rate = rxts;
- log.u_bbr.bw_inuse = rack_get_lt_bw(rack);
- log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
- log.u_bbr.lt_epoch = (uint32_t)((rack->r_ctl.gp_bw >> 32) & 0x00000000ffffffff);
- log.u_bbr.pkts_out = (uint32_t)(rack->r_ctl.gp_bw & 0x00000000ffffffff);
- tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
- 0, &log, false, NULL, NULL, 0, &tv);
- }
- rack->r_ctl.num_of_clamps_applied = 0;
- rack->r_cwnd_was_clamped = 0;
- rack->excess_rxt_on = 1;
- if (rack->r_ctl.clamp_options) {
- /*
- * We only allow fillcw to be toggled
- * if you are setting a max seg too.
- */
- if (rack->r_ctl.clamp_options & 0x1) {
- if ((rack->rc_pace_to_cwnd == 0) && (rack->dgp_on == 0)) {
- /* turn on fill cw for non-dgp*/
- rack->rc_pace_to_cwnd = 0;
- } else if ((rack->dgp_on == 1) && (rack->rc_pace_to_cwnd == 1)) {
- /* For DGP we want it off */
- rack->rc_pace_to_cwnd = 1;
- }
- }
- }
- if (rack->dgp_on) {
- /* Reset all multipliers to 100.0 so just the measured bw */
- /* Crash any per boosts down to 100% */
- rack->r_ctl.rack_per_of_gp_rec = 100;
- rack->r_ctl.rack_per_of_gp_ss = 100;
- rack->r_ctl.rack_per_of_gp_ca = 100;
- /* Set in an upper bound for ss/ca % increase */
- rack->r_ctl.rack_per_upper_bound_ss = (uint8_t)rack_per_upper_bound_ss;
- rack->r_ctl.rack_per_upper_bound_ca = (uint8_t)rack_per_upper_bound_ca;
- }
- }
+ if (tcp_bblogging_on(rack->rc_tp)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.flex1 = flex1;
+ log.u_bbr.flex2 = flex2;
+ log.u_bbr.flex3 = flex3;
+ log.u_bbr.flex4 = flex4;
+ log.u_bbr.flex5 = rack->r_ctl.current_policer_bucket;
+ log.u_bbr.flex6 = rack->r_ctl.policer_bucket_size;
+ log.u_bbr.flex7 = 0;
+ log.u_bbr.flex8 = flex8;
+ log.u_bbr.bw_inuse = rack->r_ctl.policer_bw;
+ log.u_bbr.applimited = rack->r_ctl.current_round;
+ log.u_bbr.epoch = rack->r_ctl.policer_max_seg;
+ log.u_bbr.delivered = (uint32_t)rack->r_ctl.bytes_acked_in_recovery;
+ log.u_bbr.cur_del_rate = rack->rc_tp->t_sndbytes;
+ log.u_bbr.delRate = rack->rc_tp->t_snd_rxt_bytes;
+ log.u_bbr.rttProp = rack->r_ctl.gp_bw;
+ log.u_bbr.bbr_state = rack->rc_policer_detected;
+ log.u_bbr.bbr_substate = 0;
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ log.u_bbr.use_lt_bw = rack->policer_detect_on;
+ log.u_bbr.lt_epoch = 0;
+ log.u_bbr.pkts_out = 0;
+ tcp_log_event(rack->rc_tp, NULL, NULL, NULL, TCP_POLICER_DET, 0,
+ 0, &log, false, NULL, NULL, 0, &tv);
}
+
}
static void
-do_rack_excess_rxt(struct tcpcb *tp, struct tcp_rack *rack)
+policer_detection(struct tcpcb *tp, struct tcp_rack *rack, int post_recovery)
{
/*
* Rack excess rxt accounting is turned on. If we
@@ -5648,166 +5821,395 @@
* rounds, then back off the cwnd and ssthresh
* to fit into the long-term b/w.
*/
- uint64_t snds, rxts, rxt_per, lt_bw, bdp;
- uint32_t rnds, new_cwnd, new_ssthresh, rtt, shared_cwnd_was_enabled = 0;
- /* Is it shut off by 0 rounds? */
- if (rack_rxt_min_rnds == 0)
- return;
- if ((rack->r_ctl.max_clamps > 0) &&
- (rack->r_ctl.num_of_clamps_applied >= rack->r_ctl.max_clamps)) {
- /*
- * The idea, if max_clamps is set, is that if clamping it
- * N times did not work again, then there is no sense
- * clamping it again. The link is just a lossy link and
- * our clamps are doing no good. Turn it off so we don't come
- * back here again.
- */
- rack->excess_rxt_on = 0;
- rack->r_cwnd_was_clamped = 0;
- rack->r_ctl.num_of_clamps_applied = 0;
- return;
- }
- snds = tp->t_sndbytes - rack->r_ctl.last_sndbytes;
- rxts = tp->t_snd_rxt_bytes - rack->r_ctl.last_snd_rxt_bytes;
- rnds = rack->r_ctl.current_round - rack->r_ctl.last_rnd_rxt_clamped;
- /* Has enough rounds progressed for us to re-measure? */
- if ((rnds >= rack_rxt_min_rnds) &&
- (rack->r_ctl.rxt_threshold > 0)){
- rxt_per = rxts * 1000;
- rxt_per /= snds;
- if (rxt_per >= rack->r_ctl.rxt_threshold) {
- /*
- * Action required:
- * We are above our excess retransmit level, lets
- * cut down the cwnd and ssthresh to match the long-term
- * b/w we are getting.
- */
- /* First disable scwnd if enabled */
-#ifdef NETFLIX_SHARED_CWND
- rack->rack_enable_scwnd = 0;
- if (rack->r_ctl.rc_scw) {
- uint32_t limit;
+ uint32_t pkts, mid, med, alt_med, avg, segsiz, tot_retran_pkt_count = 0;
+ uint32_t cnt_of_mape_rxt = 0;
+ uint64_t snds, rxts, rxt_per, tim, del, del_bw;
+ int i;
+ struct timeval tv;
- shared_cwnd_was_enabled = 1;
- if (rack->r_limit_scw)
- limit = max(1, rack->r_ctl.rc_lowest_us_rtt);
- else
- limit = 0;
- tcp_shared_cwnd_free_full(tp, rack->r_ctl.rc_scw,
- rack->r_ctl.rc_scw_index,
- limit);
- rack->r_ctl.rc_scw = NULL;
- }
-#endif
- /* Calculate what the cwnd and ssthresh should be */
- tcp_trace_point(rack->rc_tp, TCP_TP_EXCESS_RXT);
- lt_bw = rack_get_lt_bw(rack);
- if (lt_bw == 0) {
- /*
- * No lt_bw, lets chop things to one MSS
- * and the ssthresh to the iwnd.
- */
-reset_to_iw:
- new_cwnd = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
- new_ssthresh = tcp_compute_initwnd(tcp_maxseg(tp));
- } else {
- rtt = rack->rc_rack_rtt;
- if (rtt == 0) {
- /* If we have no rack_rtt drop to the IW situation */
- goto reset_to_iw;
- }
- bdp = lt_bw * (uint64_t)rtt;
- bdp /= HPTS_USEC_IN_SEC;
- new_cwnd = (uint32_t)bdp;
- new_ssthresh = new_cwnd - 1;
- if (new_cwnd < ctf_fixed_maxseg(tp)) {
- /* Rock bottom, goto IW settings */
- goto reset_to_iw;
- }
- }
- rack->r_cwnd_was_clamped = 1;
- rack->r_ctl.num_of_clamps_applied++;
- /* Reset the counter fromn now */
- tp->t_bytes_acked = 0;
+ /*
+ * First, are there enough packets delivered during recovery to make
+ * a determination of b/w?
+ */
+ segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
+ if ((rack->rc_policer_detected == 0) &&
+ (rack->r_ctl.policer_del_mss > 0) &&
+ ((uint32_t)rack->r_ctl.policer_del_mss > ((rack->r_ctl.bytes_acked_in_recovery + segsiz - 1)/segsiz))) {
+ /*
+ * Not enough data sent in recovery for initial detection. Once
+ * we have detected a policer we allow less than the threshold (policer_del_mss)
+ * amount of data in a recovery to let us fall through and double check
+ * our policer settings and possibly expand or collapse the bucket size and
+ * the policer b/w.
+ *
+ * Once you are declared to be policed, this block of code cannot be
+ * reached; instead blocks further down will re-check the policer detection
+ * triggers and possibly reset the measurements if somehow we have let the
+ * policer bucket size grow too large.
+ */
+ if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
+ policer_detection_log(rack, rack->r_ctl.policer_del_mss,
+ ((rack->r_ctl.bytes_acked_in_recovery + segsiz - 1)/segsiz),
+ rack->r_ctl.bytes_acked_in_recovery, segsiz, 18);
+ }
+ return;
+ }
+ tcp_get_usecs(&tv);
+ tim = tcp_tv_to_lusectick(&tv) - rack->r_ctl.time_entered_recovery;
+ del = rack->r_ctl.bytes_acked_in_recovery;
+ if (tim > 0)
+ del_bw = (del * (uint64_t)1000000) / tim;
+ else
+ del_bw = 0;
+ /* B/W compensation? */
+
+ if (rack->r_ctl.pol_bw_comp && ((rack->r_ctl.policer_bw > 0) ||
+ (del_bw > 0))) {
+ /*
+ * Sanity check now that the data is in. How long does it
+ * take for us to pace out two of our policer_max_seg's?
+ *
+ * If it is longer than the RTT then we are set
+ * too slow, maybe because of not enough data
+ * sent during recovery.
+ */
+ uint64_t lentime, res, srtt, max_delbw, alt_bw;
+
+ srtt = (uint64_t)rack_grab_rtt(tp, rack);
+ if ((tp->t_srtt > 0) && (srtt > tp->t_srtt))
+ srtt = tp->t_srtt;
+ lentime = rack->r_ctl.policer_max_seg * (uint64_t)HPTS_USEC_IN_SEC * 2;
+ if (del_bw > rack->r_ctl.policer_bw) {
+ max_delbw = del_bw;
+ } else {
+ max_delbw = rack->r_ctl.policer_bw;
+ }
+ res = lentime / max_delbw;
+ if ((srtt > 0) && (res > srtt)) {
/*
- * Now what about options?
- * We look at the bottom 8 bits:
- * F = fill cw bit (toggle it if set)
- * S = Segment bits
- * M = set max segment bit
+ * At this rate we cannot get two policer_max_seg's
+ * out before the ack arrives back.
*
- * SSSS SSMF
+ * Let's at least get it raised up so that
+ * we can be a bit faster than that if possible.
*/
- if (rack->r_ctl.clamp_options) {
- if (rack->r_ctl.clamp_options & 0x1) {
- if ((rack->rc_pace_to_cwnd == 0) && (rack->dgp_on == 0)) {
- /* turn on fill cw for non-dgp*/
- rack->rc_pace_to_cwnd = 1;
- } else if ((rack->dgp_on == 1) && (rack->rc_pace_to_cwnd == 1)) {
- /* For DGP we want it off */
- rack->rc_pace_to_cwnd = 0;
- }
+ lentime = (rack->r_ctl.policer_max_seg * 2);
+ tim = srtt;
+ alt_bw = (lentime * (uint64_t)HPTS_USEC_IN_SEC) / tim;
+ if (alt_bw > max_delbw) {
+ uint64_t cap_alt_bw;
+
+ cap_alt_bw = (max_delbw + (max_delbw * rack->r_ctl.pol_bw_comp));
+ if ((rack_pol_min_bw > 0) && (cap_alt_bw < rack_pol_min_bw)) {
+ /* We place a min on the cap which defaults to 1Mbps */
+ cap_alt_bw = rack_pol_min_bw;
+ }
+ if (alt_bw <= cap_alt_bw) {
+ /* It should be */
+ del_bw = alt_bw;
+ policer_detection_log(rack,
+ (uint32_t)tim,
+ rack->r_ctl.policer_max_seg,
+ 0,
+ 0,
+ 16);
+ } else {
+ /*
+ * This is an odd case where the RTT is likely very
+ * low and yet the flow is still being policed. We don't want
+ * to get more than (rack_policing_do_bw_comp+1) x del-rate,
+ * where del-rate is what we got in recovery for either the
+ * first Policer Detection (PD) or this PD we are on now.
+ */
+ del_bw = cap_alt_bw;
+ policer_detection_log(rack,
+ (uint32_t)tim,
+ rack->r_ctl.policer_max_seg,
+ (uint32_t)max_delbw,
+ (rack->r_ctl.pol_bw_comp + 1),
+ 16);
}
}
- if (rack->dgp_on) {
- /* Reset all multipliers to 100.0 so just the measured bw */
- /* Crash any per boosts down to 100% */
- rack->r_ctl.rack_per_of_gp_rec = 100;
- rack->r_ctl.rack_per_of_gp_ss = 100;
- rack->r_ctl.rack_per_of_gp_ca = 100;
- /* Set in an upper bound for ss/ca % increase */
- rack->r_ctl.rack_per_upper_bound_ss = (uint8_t)rack_clamp_ss_upper;
- rack->r_ctl.rack_per_upper_bound_ca = (uint8_t)rack_clamp_ca_upper;
- /* Now move to the lt_bw */
- rack->r_ctl.gp_bw = lt_bw;
- rack->rc_gp_filled = 1;
- rack->r_ctl.num_measurements = RACK_REQ_AVG;
- }
- if (tcp_bblogging_on(rack->rc_tp)) {
- union tcp_log_stackspecific log;
- struct timeval tv;
-
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
- log.u_bbr.timeStamp = tcp_get_usecs(&tv);
- log.u_bbr.flex1 = new_cwnd;
- log.u_bbr.flex2 = new_ssthresh;
- log.u_bbr.flex3 = rnds;
- log.u_bbr.flex4 = rack_rxt_min_rnds;
- log.u_bbr.flex5 = rtt;
- log.u_bbr.flex6 = shared_cwnd_was_enabled;
- log.u_bbr.flex8 = 5;
- log.u_bbr.pkt_epoch = rack->r_ctl.rc_pace_max_segs;
- log.u_bbr.bbr_state = rack->rc_pace_to_cwnd;
- log.u_bbr.delivered = rack->r_ctl.num_of_clamps_applied;
- log.u_bbr.applimited = rack->r_ctl.max_clamps;
- log.u_bbr.epoch = rack->r_ctl.clamp_options;
- log.u_bbr.cur_del_rate = rxts;
- log.u_bbr.delRate = snds;
- log.u_bbr.rttProp = rack->r_ctl.rxt_threshold;
- log.u_bbr.bw_inuse = lt_bw;
- log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
- log.u_bbr.lt_epoch = (uint32_t)((rack->r_ctl.gp_bw >> 32) & 0x00000000ffffffff);
- log.u_bbr.pkts_out = (uint32_t)(rack->r_ctl.gp_bw & 0x00000000ffffffff);
- tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
- 0, &log, false, NULL, NULL, 0, &tv);
- }
- /* Update our point where we did it */
- if (rack->r_ctl.already_had_a_excess == 0) {
- rack->r_ctl.already_had_a_excess = 1;
- counter_u64_add(rack_rxt_clamps_cwnd_uniq, 1);
+ }
+ }
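
The compensation block above boils down to: if emitting two policer_max_seg bursts at the measured rate would take longer than an SRTT, raise the working rate, but never above (1 + pol_bw_comp) times the measured delivery rate, subject to a minimum floor. A self-contained sketch under those assumptions (illustrative names, not the stack's API):

```c
#include <stdint.h>

#define USECS_PER_SEC	1000000ULL

/*
 * Sketch of the b/w compensation check: rates are bytes/sec,
 * srtt is in microseconds, comp_multiplier mirrors pol_bw_comp.
 */
static uint64_t
compensate_policer_bw(uint64_t del_bw, uint64_t policer_bw, uint64_t srtt_usecs,
    uint64_t policer_max_seg, uint64_t comp_multiplier, uint64_t min_bw_floor)
{
	uint64_t max_delbw, lentime, alt_bw, cap;

	max_delbw = (del_bw > policer_bw) ? del_bw : policer_bw;
	if (max_delbw == 0 || srtt_usecs == 0)
		return (del_bw);
	lentime = policer_max_seg * 2;
	/* Time (usecs) needed to emit two max-seg bursts at max_delbw */
	if ((lentime * USECS_PER_SEC) / max_delbw <= srtt_usecs)
		return (max_delbw);		/* already fast enough */
	/* Rate required to get both bursts out within one SRTT */
	alt_bw = (lentime * USECS_PER_SEC) / srtt_usecs;
	/* Cap at (1 + comp_multiplier) x the measured rate, with a floor */
	cap = max_delbw + (max_delbw * comp_multiplier);
	if (cap < min_bw_floor)
		cap = min_bw_floor;
	return ((alt_bw <= cap) ? alt_bw : cap);
}
```
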
+ snds = tp->t_sndbytes - rack->r_ctl.last_policer_sndbytes;
+ rxts = tp->t_snd_rxt_bytes - rack->r_ctl.last_policer_snd_rxt_bytes;
+ rxt_per = rack_get_rxt_per(snds, rxts);
+ /* Figure up the average and median */
+ for(i = 0; i < RETRAN_CNT_SIZE; i++) {
+ if (rack->r_ctl.rc_cnt_of_retran[i] > 0) {
+ tot_retran_pkt_count += (i + 1) * rack->r_ctl.rc_cnt_of_retran[i];
+ cnt_of_mape_rxt += rack->r_ctl.rc_cnt_of_retran[i];
+ }
+ }
+ if (cnt_of_mape_rxt)
+ avg = (tot_retran_pkt_count * 10)/cnt_of_mape_rxt;
+ else
+ avg = 0;
+ alt_med = med = 0;
+ mid = tot_retran_pkt_count/2;
+ for(i = 0; i < RETRAN_CNT_SIZE; i++) {
+ pkts = (i + 1) * rack->r_ctl.rc_cnt_of_retran[i];
+ if (mid > pkts) {
+ mid -= pkts;
+ continue;
+ }
+ med = (i + 1);
+ break;
+ }
+ mid = cnt_of_mape_rxt / 2;
+ for(i = 0; i < RETRAN_CNT_SIZE; i++) {
+ if (mid > rack->r_ctl.rc_cnt_of_retran[i]) {
+ mid -= rack->r_ctl.rc_cnt_of_retran[i];
+ continue;
+ }
+ alt_med = (i + 1);
+ break;
+ }
+ if (rack->r_ctl.policer_alt_median) {
+ /* Swap the medians */
+ uint32_t swap;
+
+ swap = med;
+ med = alt_med;
+ alt_med = swap;
+ }
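
The two loops above compute a weighted average (scaled by 10 to keep one decimal place), a median weighted by retransmit depth, and a plain median over the per-packet retransmit counts held in rc_cnt_of_retran. A self-contained sketch of the same math, with illustrative names:

```c
#include <stdint.h>

#define RETRAN_CNT_SIZE 16

/*
 * cnt[i] holds how many packets were retransmitted (i + 1) times.
 * avg_x10 is the average retransmit depth scaled by 10, med the
 * depth-weighted median, alt_med the plain median.
 */
static void
retran_avg_and_medians(const uint32_t cnt[RETRAN_CNT_SIZE],
    uint32_t *avg_x10, uint32_t *med, uint32_t *alt_med)
{
	uint64_t tot_pkts = 0, cnt_of_rxt = 0, mid, pkts;
	int i;

	for (i = 0; i < RETRAN_CNT_SIZE; i++) {
		tot_pkts += (uint64_t)(i + 1) * cnt[i];
		cnt_of_rxt += cnt[i];
	}
	*avg_x10 = cnt_of_rxt ? (uint32_t)((tot_pkts * 10) / cnt_of_rxt) : 0;
	*med = *alt_med = 0;
	/* Weighted median: walk buckets weighted by depth * packets */
	mid = tot_pkts / 2;
	for (i = 0; i < RETRAN_CNT_SIZE; i++) {
		pkts = (uint64_t)(i + 1) * cnt[i];
		if (mid > pkts) {
			mid -= pkts;
			continue;
		}
		*med = i + 1;
		break;
	}
	/* Plain median: walk buckets weighted by packet counts only */
	mid = cnt_of_rxt / 2;
	for (i = 0; i < RETRAN_CNT_SIZE; i++) {
		if (mid > cnt[i]) {
			mid -= cnt[i];
			continue;
		}
		*alt_med = i + 1;
		break;
	}
}
```
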
+ if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.flex1 = avg;
+ log.u_bbr.flex2 = med;
+ log.u_bbr.flex3 = (uint32_t)rxt_per;
+ log.u_bbr.flex4 = rack->r_ctl.policer_avg_threshold;
+ log.u_bbr.flex5 = rack->r_ctl.policer_med_threshold;
+ log.u_bbr.flex6 = rack->r_ctl.policer_rxt_threshold;
+ log.u_bbr.flex7 = rack->r_ctl.policer_alt_median;
+ log.u_bbr.flex8 = 1;
+ log.u_bbr.delivered = rack->r_ctl.policer_bucket_size;
+ log.u_bbr.applimited = rack->r_ctl.current_round;
+ log.u_bbr.epoch = rack->r_ctl.policer_max_seg;
+ log.u_bbr.bw_inuse = del_bw;
+ log.u_bbr.cur_del_rate = rxts;
+ log.u_bbr.delRate = snds;
+ log.u_bbr.rttProp = rack->r_ctl.gp_bw;
+ log.u_bbr.bbr_state = rack->rc_policer_detected;
+ log.u_bbr.bbr_substate = 0;
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ log.u_bbr.use_lt_bw = rack->policer_detect_on;
+ log.u_bbr.lt_epoch = (uint32_t)tim;
+ log.u_bbr.pkts_out = rack->r_ctl.bytes_acked_in_recovery;
+ tcp_log_event(tp, NULL, NULL, NULL, TCP_POLICER_DET, 0,
+ 0, &log, false, NULL, NULL, 0, &tv);
+ }
+ if (med == RETRAN_CNT_SIZE) {
+ /*
+ * If the median is the maximum, then what we
+ * likely have here is a network breakage. Either that
+ * or we are so unlucky that all of our traffic is being
+ * dropped and having to be retransmitted the maximum number
+ * of times, which is just not how a policer works.
+ *
+ * If it truly is a policer, eventually we will come
+ * through and it won't be the maximum.
+ */
+ return;
+ }
+ /* Have enough rounds progressed for us to re-measure? */
+ if ((rxt_per >= (uint64_t)rack->r_ctl.policer_rxt_threshold) &&
+ (avg >= rack->r_ctl.policer_avg_threshold) &&
+ (med >= rack->r_ctl.policer_med_threshold)) {
+ /*
+ * We hit all thresholds that indicate we are
+ * being policed. We may be doing this from a rack timeout,
+ * which then means the rest of recovery will hopefully go
+ * smoother as we pace. At the end of recovery we will
+ * fall back in here and reset the values using the
+ * results of the entire recovery episode (we could also
+ * hit this as we exit recovery, which means only
+ * one time in here).
+ *
+ * This is done explicitly so that if we hit the thresholds
+ * again in a second recovery we overwrite the values. We do
+ * that because over time, as we pace, the policer_bucket_size may
+ * continue to grow. This then provides more and more times when
+ * we are not pacing to the policer rate. This lets us compensate
+ * for when we hit a false positive and those flows continue to
+ * increase. However, if it is a real policer we will then get over its
+ * limit, over time, again and thus end up back here hitting the
+ * thresholds again.
+ *
+ * The alternative would be that, whenever we pace due to
+ * policing in rack_policed_sending, we add the amount paced to the
+ * idle_snd_una value (which decreases the amount in last_amount_before_rec
+ * since that is always [th_ack - idle_snd_una]). This would then prevent
+ * the policer_bucket_size from growing in additional recovery episodes,
+ * which would then mean false positives would be pretty much stuck
+ * after things got back to normal (assuming that what caused the
+ * false positive was a small network outage).
+ */
+ tcp_trace_point(rack->rc_tp, TCP_TP_POLICER_DET);
+ if (rack->rc_policer_detected == 0) {
+ /*
+ * Increment the stat that tells us we identified
+ * a policer only once. Note that if we ever allow
+ * the flag to be cleared (reverted) then we need
+ * to adjust this to not do multi-counting.
+ */
+ counter_u64_add(tcp_policer_detected, 1);
+ }
+ rack->r_ctl.last_policer_sndbytes = tp->t_sndbytes;
+ rack->r_ctl.last_policer_snd_rxt_bytes = tp->t_snd_rxt_bytes;
+ rack->r_ctl.policer_bw = del_bw;
+ rack->r_ctl.policer_max_seg = tcp_get_pacing_burst_size_w_divisor(rack->rc_tp,
+ rack->r_ctl.policer_bw,
+ min(ctf_fixed_maxseg(rack->rc_tp),
+ rack->r_ctl.rc_pace_min_segs),
+ 0, NULL,
+ NULL, rack->r_ctl.pace_len_divisor);
+ /* Now what about the policer bucket size */
+ rack->r_ctl.policer_bucket_size = rack->r_ctl.last_amount_before_rec;
+ if (rack->r_ctl.policer_bucket_size < rack->r_ctl.policer_max_seg) {
+ /* We must be able to send our max-seg or else chaos ensues */
+ rack->r_ctl.policer_bucket_size = rack->r_ctl.policer_max_seg * 2;
+ }
+ if (rack->rc_policer_detected == 0)
+ rack->r_ctl.current_policer_bucket = 0;
+ if (tcp_bblogging_on(rack->rc_tp)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.flex1 = avg;
+ log.u_bbr.flex2 = med;
+ log.u_bbr.flex3 = rxt_per;
+ log.u_bbr.flex4 = rack->r_ctl.policer_avg_threshold;
+ log.u_bbr.flex5 = rack->r_ctl.policer_med_threshold;
+ log.u_bbr.flex6 = rack->r_ctl.policer_rxt_threshold;
+ log.u_bbr.flex7 = rack->r_ctl.policer_alt_median;
+ log.u_bbr.flex8 = 2;
+ log.u_bbr.applimited = rack->r_ctl.current_round;
+ log.u_bbr.bw_inuse = del_bw;
+ log.u_bbr.delivered = rack->r_ctl.policer_bucket_size;
+ log.u_bbr.cur_del_rate = rxts;
+ log.u_bbr.delRate = snds;
+ log.u_bbr.rttProp = rack->r_ctl.gp_bw;
+ log.u_bbr.bbr_state = rack->rc_policer_detected;
+ log.u_bbr.bbr_substate = 0;
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ log.u_bbr.use_lt_bw = rack->policer_detect_on;
+ log.u_bbr.epoch = rack->r_ctl.policer_max_seg;
+ log.u_bbr.lt_epoch = (uint32_t)tim;
+ log.u_bbr.pkts_out = rack->r_ctl.bytes_acked_in_recovery;
+ tcp_log_event(tp, NULL, NULL, NULL, TCP_POLICER_DET, 0,
+ 0, &log, false, NULL, NULL, 0, &tv);
+ /*
+ * Put out an added log, 19, for the sole purpose
+ * of getting the txt/rxt so that we can benchmark
+ * in read-bbrlog the ongoing rxt rate after our
+ * policer invocation in the HYSTART announcements.
+ */
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.timeStamp = tcp_tv_to_usectick(&tv);
+ log.u_bbr.flex1 = alt_med;
+ log.u_bbr.flex8 = 19;
+ log.u_bbr.cur_del_rate = tp->t_sndbytes;
+ log.u_bbr.delRate = tp->t_snd_rxt_bytes;
+ tcp_log_event(tp, NULL, NULL, NULL, TCP_POLICER_DET, 0,
+ 0, &log, false, NULL, NULL, 0, &tv);
+ }
+ /* Turn off any fast output, that's ended */
+ rack->r_fast_output = 0;
+ /* Mark the time for credits */
+ rack->r_ctl.last_sendtime = tcp_get_u64_usecs(NULL);
+ if (rack->r_rr_config < 2) {
+ /*
+ * We need to be stricter on the RR config so
+ * the pacing has priority.
+ */
+ rack->r_rr_config = 2;
+ }
+ policer_detection_log(rack,
+ rack->r_ctl.idle_snd_una,
+ rack->r_ctl.ack_for_idle,
+ 0,
+ (uint32_t)tim,
+ 14);
+ rack->rc_policer_detected = 1;
+ } else if ((rack->rc_policer_detected == 1) &&
+ (post_recovery == 1)) {
+ /*
+ * If we are exiting recovery and have already detected
+ * a policer, we may need to update the values.
+ *
+ * First: Update the idle -> recovery sent value.
+ */
+ uint32_t srtt;
+
+ if (rack->r_ctl.last_amount_before_rec > rack->r_ctl.policer_bucket_size) {
+ rack->r_ctl.policer_bucket_size = rack->r_ctl.last_amount_before_rec;
+ }
+ srtt = (uint64_t)rack_grab_rtt(tp, rack);
+ if ((tp->t_srtt > 0) && (srtt > tp->t_srtt))
+ srtt = tp->t_srtt;
+ if ((srtt != 0) &&
+ (tim < (uint64_t)srtt)) {
+ /*
+ * Not long enough.
+ */
+ if (rack_verbose_logging)
+ policer_detection_log(rack,
+ (uint32_t)tim,
+ 0,
+ 0,
+ 0,
+ 15);
+ return;
+ }
+ /*
+ * Finally, update the b/w if it has grown.
+ */
+ if (del_bw > rack->r_ctl.policer_bw) {
+ rack->r_ctl.policer_bw = del_bw;
+ rack->r_ctl.policer_max_seg = tcp_get_pacing_burst_size_w_divisor(rack->rc_tp,
+ rack->r_ctl.policer_bw,
+ min(ctf_fixed_maxseg(rack->rc_tp),
+ rack->r_ctl.rc_pace_min_segs),
+ 0, NULL,
+ NULL, rack->r_ctl.pace_len_divisor);
+ if (rack->r_ctl.policer_bucket_size < rack->r_ctl.policer_max_seg) {
+ /* We must be able to send our max-seg or else chaos ensues */
+ rack->r_ctl.policer_bucket_size = rack->r_ctl.policer_max_seg * 2;
}
- counter_u64_add(rack_rxt_clamps_cwnd, 1);
- rack->r_ctl.last_sndbytes = tp->t_sndbytes;
- rack->r_ctl.last_snd_rxt_bytes = tp->t_snd_rxt_bytes;
- rack->r_ctl.last_rnd_rxt_clamped = rack->r_ctl.current_round;
- if (new_cwnd < tp->snd_cwnd)
- tp->snd_cwnd = new_cwnd;
- if (new_ssthresh < tp->snd_ssthresh)
- tp->snd_ssthresh = new_ssthresh;
}
+ policer_detection_log(rack,
+ rack->r_ctl.idle_snd_una,
+ rack->r_ctl.ack_for_idle,
+ 0,
+ (uint32_t)tim,
+ 3);
+ }
+}
+
+static void
+rack_exit_recovery(struct tcpcb *tp, struct tcp_rack *rack, int how)
+{
+ /* now check with the policer if on */
+ if (rack->policer_detect_on == 1) {
+ policer_detection(tp, rack, 1);
}
+ /*
+ * Now exit recovery; note we must set the idle point after the policer_detection
+ * call to get the amount acked prior to recovery correct.
+ */
+ rack->r_ctl.idle_snd_una = tp->snd_una;
+ EXIT_RECOVERY(tp->t_flags);
}
static void
@@ -5882,9 +6284,12 @@
}
rack_log_dsack_event(rack, 1, __LINE__, 0, 0);
}
- EXIT_RECOVERY(tp->t_flags);
- if (rack->r_ctl.full_dgp_in_rec)
- rack_client_buffer_level_set(rack);
+ if (rack->rto_from_rec == 1) {
+ rack->rto_from_rec = 0;
+ if (rack->r_ctl.rto_ssthresh > tp->snd_ssthresh)
+ tp->snd_ssthresh = rack->r_ctl.rto_ssthresh;
+ }
+ rack_exit_recovery(tp, rack, 1);
}
static void
@@ -5909,12 +6314,69 @@
tp->t_flags &= ~TF_WASFRECOVERY;
tp->t_flags &= ~TF_WASCRECOVERY;
if (!IN_FASTRECOVERY(tp->t_flags)) {
- if (rack->dgp_on && rack->r_cwnd_was_clamped) {
- /* Reset the gains so that on exit we will be softer longer */
- rack->r_ctl.rack_per_of_gp_rec = 100;
- rack->r_ctl.rack_per_of_gp_ss = 98;
- rack->r_ctl.rack_per_of_gp_ca = 98;
+ struct rack_sendmap *rsm;
+ struct timeval tv;
+ uint32_t segsiz;
+
+ /* Check if this is the end of the initial Start-up i.e. initial slow-start */
+ if (rack->rc_initial_ss_comp == 0) {
+ /* Yep it is the end of the initial slowstart */
+ rack->rc_initial_ss_comp = 1;
+ }
+ microuptime(&tv);
+ rack->r_ctl.time_entered_recovery = tcp_tv_to_lusectick(&tv);
+ if (SEQ_GEQ(ack, tp->snd_una)) {
+ /*
+ * The ack is above snd_una. Let's see
+ * if we can establish a positive distance from
+ * our idle mark.
+ */
+ rack->r_ctl.ack_for_idle = ack;
+ if (SEQ_GT(ack, rack->r_ctl.idle_snd_una)) {
+ rack->r_ctl.last_amount_before_rec = ack - rack->r_ctl.idle_snd_una;
+ } else {
+ /* No data thru yet */
+ rack->r_ctl.last_amount_before_rec = 0;
+ }
+ } else if (SEQ_GT(tp->snd_una, rack->r_ctl.idle_snd_una)) {
+ /*
+ * The ack is out of order and behind snd_una. It may
+ * have contained SACK information which we processed, else
+ * we would have rejected it.
+ */
+ rack->r_ctl.ack_for_idle = tp->snd_una;
+ rack->r_ctl.last_amount_before_rec = tp->snd_una - rack->r_ctl.idle_snd_una;
+ } else {
+ rack->r_ctl.ack_for_idle = ack;
+ rack->r_ctl.last_amount_before_rec = 0;
+ }
+ if (rack->rc_policer_detected) {
+ /*
+ * If we are being policed and we have a loss, it
+ * means our bucket is now empty. This can happen
+ * when some other flow on the same host sends traffic
+ * that this connection is not aware of.
+ */
+ rack->r_ctl.current_policer_bucket = 0;
+ if (rack_verbose_logging)
+ policer_detection_log(rack, rack->r_ctl.last_amount_before_rec, 0, 0, 0, 4);
+ if (rack->r_ctl.last_amount_before_rec > rack->r_ctl.policer_bucket_size) {
+ rack->r_ctl.policer_bucket_size = rack->r_ctl.last_amount_before_rec;
+ }
+ }
+ memset(rack->r_ctl.rc_cnt_of_retran, 0, sizeof(rack->r_ctl.rc_cnt_of_retran));
+ segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
+ TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
+ /*
+ * Go through the outstanding segments and re-peg
+ * any that should have been left in the
+ * retransmit list (on a double recovery).
+ */
+ if (rsm->r_act_rxt_cnt > 0) {
+ rack_peg_rxt(rack, rsm, segsiz);
+ }
}
+ rack->r_ctl.bytes_acked_in_recovery = 0;
rack->r_ctl.rc_prr_delivered = 0;
rack->r_ctl.rc_prr_out = 0;
rack->r_fast_output = 0;
@@ -5947,15 +6409,19 @@
tp->t_dupacks = 0;
tp->t_bytes_acked = 0;
rack->r_fast_output = 0;
- EXIT_RECOVERY(tp->t_flags);
- if (tp->t_rxtshift == 1) {
+ if (IN_RECOVERY(tp->t_flags))
+ rack_exit_recovery(tp, rack, 2);
+ rack->r_ctl.bytes_acked_in_recovery = 0;
+ rack->r_ctl.time_entered_recovery = 0;
+ orig_cwnd = tp->snd_cwnd;
+ rack_log_to_prr(rack, 16, orig_cwnd, line);
+ if (CC_ALGO(tp)->cong_signal == NULL) {
+ /* TSNH */
tp->snd_ssthresh = max(2,
min(tp->snd_wnd, rack->r_ctl.cwnd_to_use) / 2 /
ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp);
+ tp->snd_cwnd = ctf_fixed_maxseg(tp);
}
- orig_cwnd = tp->snd_cwnd;
- tp->snd_cwnd = ctf_fixed_maxseg(tp);
- rack_log_to_prr(rack, 16, orig_cwnd, line);
if (tp->t_flags2 & TF2_ECN_PERMIT)
tp->t_flags2 |= TF2_ECN_SND_CWR;
break;
@@ -5984,8 +6450,6 @@
}
if ((in_rec_at_entry == 0) && IN_RECOVERY(tp->t_flags)) {
rack_log_to_prr(rack, 15, cwnd_enter, line);
- if (rack->r_ctl.full_dgp_in_rec)
- rack_client_buffer_level_set(rack);
rack->r_ctl.dsack_byte_cnt = 0;
rack->r_ctl.retran_during_recovery = 0;
rack->r_ctl.rc_cwnd_at_erec = cwnd_enter;
@@ -6078,7 +6542,7 @@
}
static uint32_t
-rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts)
+rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts, int line, int log_allowed)
{
int32_t lro;
uint32_t thresh;
@@ -6149,7 +6613,8 @@
* have seen reordering <and> we have a DSACK count.
*/
thresh += rack->r_ctl.num_dsack * (srtt >> 2);
- rack_log_dsack_event(rack, 4, __LINE__, srtt, thresh);
+ if (log_allowed)
+ rack_log_dsack_event(rack, 4, line, srtt, thresh);
}
/* SRTT * 2 is the ceiling */
if (thresh > (srtt * 2)) {
@@ -6159,7 +6624,8 @@
if (thresh > rack_rto_max) {
thresh = rack_rto_max;
}
- rack_log_dsack_event(rack, 6, __LINE__, srtt, thresh);
+ if (log_allowed)
+ rack_log_dsack_event(rack, 6, line, srtt, thresh);
return (thresh);
}
@@ -6294,7 +6760,7 @@
}
idx = rsm->r_rtr_cnt - 1;
srtt = rack_grab_rtt(tp, rack);
- thresh = rack_calc_thresh_rack(rack, srtt, tsused);
+ thresh = rack_calc_thresh_rack(rack, srtt, tsused, __LINE__, 1);
if (TSTMP_LT(tsused, ((uint32_t)rsm->r_tim_lastsent[idx]))) {
return (NULL);
}
@@ -6456,7 +6922,7 @@
goto activate_tlp;
}
srtt = rack_grab_rtt(tp, rack);
- thresh = rack_calc_thresh_rack(rack, srtt, cts);
+ thresh = rack_calc_thresh_rack(rack, srtt, cts, __LINE__, 1);
idx = rsm->r_rtr_cnt - 1;
exp = ((uint32_t)rsm->r_tim_lastsent[idx]) + thresh;
if (SEQ_GEQ(exp, cts)) {
@@ -6563,8 +7029,6 @@
static void
rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, tcp_seq snd_una)
{
- struct timeval tv;
-
if (rack->rc_in_persist == 0) {
if (tp->t_flags & TF_GPUTINPROG) {
/*
@@ -6580,21 +7044,23 @@
rack->rack_scwnd_is_idle = 1;
}
#endif
- rack->r_ctl.rc_went_idle_time = tcp_get_usecs(&tv);
+ rack->r_ctl.rc_went_idle_time = cts;
+ if (rack->r_ctl.rc_went_idle_time == 0)
+ rack->r_ctl.rc_went_idle_time = 1;
if (rack->lt_bw_up) {
/* Suspend our LT BW measurement */
uint64_t tmark;
rack->r_ctl.lt_bw_bytes += (snd_una - rack->r_ctl.lt_seq);
rack->r_ctl.lt_seq = snd_una;
- tmark = tcp_tv_to_lusectick(&tv);
- rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark);
+ tmark = tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time);
+ if (tmark >= rack->r_ctl.lt_timemark) {
+ rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark);
+ }
rack->r_ctl.lt_timemark = tmark;
rack->lt_bw_up = 0;
rack->r_persist_lt_bw_off = 1;
}
- if (rack->r_ctl.rc_went_idle_time == 0)
- rack->r_ctl.rc_went_idle_time = 1;
rack_timer_cancel(tp, rack, cts, __LINE__);
rack->r_ctl.persist_lost_ends = 0;
rack->probe_not_answered = 0;
@@ -6609,9 +7075,6 @@
static void
rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
{
- struct timeval tv;
- uint32_t t_time;
-
if (tcp_in_hpts(rack->rc_tp)) {
tcp_hpts_remove(rack->rc_tp);
rack->r_ctl.rc_hpts_flags = 0;
@@ -6622,7 +7085,6 @@
rack->rack_scwnd_is_idle = 0;
}
#endif
- t_time = tcp_get_usecs(&tv);
if (rack->rc_gp_dyn_mul &&
(rack->use_fixed_rate == 0) &&
(rack->rc_always_pace)) {
@@ -6632,7 +7094,7 @@
*/
uint32_t time_idle, idle_min;
- time_idle = t_time - rack->r_ctl.rc_went_idle_time;
+ time_idle = cts - rack->r_ctl.rc_went_idle_time;
idle_min = rack_min_probertt_hold;
if (rack_probertt_gpsrtt_cnt_div) {
uint64_t extra;
@@ -6658,10 +7120,11 @@
}
if (rack->r_persist_lt_bw_off) {
/* Continue where we left off */
- rack->r_ctl.lt_timemark = tcp_tv_to_lusectick(&tv);
+ rack->r_ctl.lt_timemark = tcp_get_u64_usecs(NULL);
rack->lt_bw_up = 1;
rack->r_persist_lt_bw_off = 0;
}
+ rack->r_ctl.idle_snd_una = tp->snd_una;
rack->rc_in_persist = 0;
rack->r_ctl.rc_went_idle_time = 0;
tp->t_rxtshift = 0;
@@ -6734,7 +7197,7 @@
}
static void
-rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
+rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
int32_t slot, uint32_t tot_len_this_send, int sup_rack)
{
struct hpts_diag diag;
@@ -6778,7 +7241,8 @@
rack->r_early = 0;
rack->r_ctl.rc_agg_early = 0;
}
- if (rack->r_late) {
+ if ((rack->r_late) &&
+ ((rack->r_use_hpts_min == 0) || (rack->dgp_on == 0))) {
/*
* This is harder, we can
* compensate some but it
@@ -6812,6 +7276,32 @@
if (rack->r_ctl.rc_agg_delayed == 0)
rack->r_late = 0;
}
+ } else if (rack->r_late) {
+ /* r_use_hpts_min is on and so is DGP */
+ uint32_t max_red;
+
+ max_red = (slot * rack->r_ctl.max_reduction) / 100;
+ if (max_red >= rack->r_ctl.rc_agg_delayed) {
+ slot -= rack->r_ctl.rc_agg_delayed;
+ rack->r_ctl.rc_agg_delayed = 0;
+ } else {
+ slot -= max_red;
+ rack->r_ctl.rc_agg_delayed -= max_red;
+ }
+ }
+ if ((rack->r_use_hpts_min == 1) &&
+ (slot > 0) &&
+ (rack->dgp_on == 1)) {
+ /*
+ * We are enforcing a min pacing timer
+ * based on our hpts min timeout.
+ */
+ uint32_t min;
+
+ min = get_hpts_min_sleep_time();
+ if (min > slot) {
+ slot = min;
+ }
}
hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack);
#ifdef TCP_SAD_DETECTION
@@ -7041,6 +7531,34 @@
rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv, __LINE__);
}
+static void
+rack_mark_lost(struct tcpcb *tp,
+ struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t cts)
+{
+ struct rack_sendmap *nrsm;
+ uint32_t thresh, exp;
+
+ thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(tp, rack), cts, __LINE__, 0);
+ nrsm = rsm;
+ TAILQ_FOREACH_FROM(nrsm, &rack->r_ctl.rc_tmap, r_tnext) {
+ if ((nrsm->r_flags & RACK_SACK_PASSED) == 0) {
+ /* Got up to all that were marked sack-passed */
+ break;
+ }
+ if ((nrsm->r_flags & RACK_WAS_LOST) == 0) {
+ exp = ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]) + thresh;
+ if (TSTMP_LT(exp, cts) || (exp == cts)) {
+ /* We now consider it lost */
+ nrsm->r_flags |= RACK_WAS_LOST;
+ rack->r_ctl.rc_considered_lost += nrsm->r_end - nrsm->r_start;
+ } else {
+ /* Past here it won't be lost so stop */
+ break;
+ }
+ }
+ }
+}
+
/*
* RACK Timer, here we simply do logging and house keeping.
* the normal rack_output() function will call the
@@ -7067,6 +7585,8 @@
rsm = rack_check_recovery_mode(tp, cts);
rack_log_to_event(rack, RACK_TO_FRM_RACK, rsm);
if (rsm) {
+ /* We need to mark any segments that are now declared lost */
+ rack_mark_lost(tp, rack, rsm, cts);
rack->r_ctl.rc_resend = rsm;
rack->r_timer_override = 1;
if (rack->use_rack_rr) {
@@ -7088,6 +7608,16 @@
0, 0, 0);
return (1);
}
+ if ((rack->policer_detect_on == 1) &&
+ (rack->rc_policer_detected == 0)) {
+ /*
+ * We do this early, if we have not yet
+ * detected a policer, to attempt to detect
+ * it more quickly. Normally we want to do this
+ * as recovery exits (and we will again).
+ */
+ policer_detection(tp, rack, 0);
+ }
return (0);
}
@@ -7189,13 +7719,14 @@
nrsm->r_start = start;
nrsm->r_end = rsm->r_end;
nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
+ nrsm->r_act_rxt_cnt = rsm->r_act_rxt_cnt;
nrsm->r_flags = rsm->r_flags;
nrsm->r_dupack = rsm->r_dupack;
nrsm->r_no_rtt_allowed = rsm->r_no_rtt_allowed;
nrsm->r_rtr_bytes = 0;
nrsm->r_fas = rsm->r_fas;
nrsm->r_bas = rsm->r_bas;
- rsm->r_end = nrsm->r_start;
+ tqhash_update_end(rack->r_ctl.tqh, rsm, nrsm->r_start);
nrsm->r_just_ret = rsm->r_just_ret;
for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
@@ -7242,7 +7773,7 @@
*/
rack_log_map_chg(rack->rc_tp, rack, NULL,
l_rsm, r_rsm, MAP_MERGE, r_rsm->r_end, __LINE__);
- l_rsm->r_end = r_rsm->r_end;
+ tqhash_update_end(rack->r_ctl.tqh, l_rsm, r_rsm->r_end);
if (l_rsm->r_dupack < r_rsm->r_dupack)
l_rsm->r_dupack = r_rsm->r_dupack;
if (r_rsm->r_rtr_bytes)
@@ -7344,6 +7875,7 @@
*/
rack_log_to_event(rack, RACK_TO_FRM_TLP, NULL);
rack->r_ctl.retran_during_recovery = 0;
+ rack->r_might_revert = 0;
rack->r_ctl.dsack_byte_cnt = 0;
counter_u64_add(rack_tlp_tot, 1);
if (rack->r_state && (rack->r_state != tp->t_state))
@@ -7517,6 +8049,32 @@
return (0);
}
+static inline int
+rack_send_ack_challange(struct tcp_rack *rack)
+{
+ struct tcptemp *t_template;
+
+ t_template = tcpip_maketemplate(rack->rc_inp);
+ if (t_template) {
+ if (rack->forced_ack == 0) {
+ rack->forced_ack = 1;
+ rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL);
+ } else {
+ rack->probe_not_answered = 1;
+ }
+ tcp_respond(rack->rc_tp, t_template->tt_ipgen,
+ &t_template->tt_t, (struct mbuf *)NULL,
+ rack->rc_tp->rcv_nxt, rack->rc_tp->snd_una - 1, 0);
+ free(t_template, M_TEMP);
+ /* This does send an ack so kill any D-ack timer */
+ if (rack->rc_tp->t_flags & TF_DELACK)
+ rack->rc_tp->t_flags &= ~TF_DELACK;
+ return(1);
+ } else
+ return (0);
+
+}
+
/*
* Persists timer, here we simply send the
* same thing as a keepalive will.
@@ -7528,7 +8086,6 @@
static int
rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
{
- struct tcptemp *t_template;
int32_t retval = 1;
if (rack->rc_in_persist == 0)
@@ -7575,26 +8132,14 @@
retval = -ETIMEDOUT; /* tcp_drop() */
goto out;
}
- t_template = tcpip_maketemplate(rack->rc_inp);
- if (t_template) {
+ if (rack_send_ack_challange(rack)) {
/* only set it if we were answered */
- if (rack->forced_ack == 0) {
- rack->forced_ack = 1;
- rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL);
- } else {
- rack->probe_not_answered = 1;
+ if (rack->probe_not_answered) {
counter_u64_add(rack_persists_loss, 1);
rack->r_ctl.persist_lost_ends++;
}
counter_u64_add(rack_persists_sends, 1);
counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1);
- tcp_respond(tp, t_template->tt_ipgen,
- &t_template->tt_t, (struct mbuf *)NULL,
- tp->rcv_nxt, tp->snd_una - 1, 0);
- /* This sends an ack */
- if (tp->t_flags & TF_DELACK)
- tp->t_flags &= ~TF_DELACK;
- free(t_template, M_TEMP);
}
if (tp->t_rxtshift < V_tcp_retries)
tp->t_rxtshift++;
@@ -7614,7 +8159,6 @@
static int
rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
{
- struct tcptemp *t_template;
struct inpcb *inp = tptoinpcb(tp);
rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP;
@@ -7641,19 +8185,7 @@
* respond.
*/
KMOD_TCPSTAT_INC(tcps_keepprobe);
- t_template = tcpip_maketemplate(inp);
- if (t_template) {
- if (rack->forced_ack == 0) {
- rack->forced_ack = 1;
- rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL);
- } else {
- rack->probe_not_answered = 1;
- }
- tcp_respond(tp, t_template->tt_ipgen,
- &t_template->tt_t, (struct mbuf *)NULL,
- tp->rcv_nxt, tp->snd_una - 1, 0);
- free(t_template, M_TEMP);
- }
+ rack_send_ack_challange(rack);
}
rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
return (1);
@@ -7680,8 +8212,26 @@
rack = (struct tcp_rack *)tp->t_fb_ptr;
rack_timer_cancel(tp, rack, tcp_get_usecs(NULL), __LINE__);
rack_log_to_event(rack, RACK_TO_FRM_TMR, NULL);
+ rack->r_timer_override = 1;
+ rack->r_ctl.rc_snd_max_at_rto = tp->snd_max;
+ rack->r_ctl.rc_last_timeout_snduna = tp->snd_una;
+ rack->r_late = 0;
+ rack->r_early = 0;
+ rack->r_ctl.rc_agg_delayed = 0;
+ rack->r_ctl.rc_agg_early = 0;
if (rack->r_state && (rack->r_state != tp->t_state))
rack_set_state(tp, rack);
+ if (tp->t_rxtshift <= rack_rxt_scoreboard_clear_thresh) {
+ /*
+ * We do not clear the scoreboard until we have had
+ * more than rack_rxt_scoreboard_clear_thresh time-outs.
+ */
+ rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
+ if (rack->r_ctl.rc_resend != NULL)
+ rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT;
+
+ return;
+ }
/*
* Ideally we would like to be able to
* mark SACK-PASS on anything not acked here.
@@ -7714,27 +8264,26 @@
trsm = rsm;
if (rsm->r_flags & RACK_ACKED)
rsm->r_flags |= RACK_WAS_ACKED;
- rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS | RACK_RWND_COLLAPSED);
+ rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS | RACK_RWND_COLLAPSED | RACK_WAS_LOST);
rsm->r_flags |= RACK_MUST_RXT;
}
+ /* zero the lost since it's all gone */
+ rack->r_ctl.rc_considered_lost = 0;
/* Clear the count (we just un-acked them) */
- rack->r_ctl.rc_last_timeout_snduna = tp->snd_una;
rack->r_ctl.rc_sacked = 0;
rack->r_ctl.rc_sacklast = NULL;
- rack->r_ctl.rc_agg_delayed = 0;
- rack->r_early = 0;
- rack->r_ctl.rc_agg_early = 0;
- rack->r_late = 0;
/* Clear the tlp rtx mark */
rack->r_ctl.rc_resend = tqhash_min(rack->r_ctl.tqh);
if (rack->r_ctl.rc_resend != NULL)
rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT;
rack->r_ctl.rc_prr_sndcnt = 0;
rack_log_to_prr(rack, 6, 0, __LINE__);
- rack->r_timer_override = 1;
+ rack->r_ctl.rc_resend = tqhash_min(rack->r_ctl.tqh);
+ if (rack->r_ctl.rc_resend != NULL)
+ rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT;
if ((((tp->t_flags & TF_SACK_PERMIT) == 0)
#ifdef TCP_SAD_DETECTION
- || (rack->sack_attack_disable != 0)
+ || (rack->sack_attack_disable != 0)
#endif
) && ((tp->t_flags & TF_SENTFIN) == 0)) {
/*
@@ -7744,9 +8293,8 @@
*/
rack->r_must_retran = 1;
rack->r_ctl.rc_out_at_rto = ctf_flight_size(rack->rc_tp,
- rack->r_ctl.rc_sacked);
+ rack->r_ctl.rc_sacked);
}
- rack->r_ctl.rc_snd_max_at_rto = tp->snd_max;
}
static void
@@ -7829,6 +8377,17 @@
rack->r_ctl.retran_during_recovery = 0;
rack->rc_ack_required = 1;
rack->r_ctl.dsack_byte_cnt = 0;
+ if (IN_RECOVERY(tp->t_flags) &&
+ (rack->rto_from_rec == 0)) {
+ /*
+ * Mark that we had an RTO while in recovery
+ * and save the ssthresh so that if we go back
+ * into recovery we will have a chance
+ * to slow-start back to that level.
+ */
+ rack->rto_from_rec = 1;
+ rack->r_ctl.rto_ssthresh = tp->snd_ssthresh;
+ }
if (IN_FASTRECOVERY(tp->t_flags))
tp->t_flags |= TF_WASFRECOVERY;
else
@@ -7877,7 +8436,6 @@
* retransmit interval. Back off to a longer retransmit interval
* and retransmit one segment.
*/
- rack_remxt_tmr(tp);
if ((rack->r_ctl.rc_resend == NULL) ||
((rack->r_ctl.rc_resend->r_flags & RACK_RWND_COLLAPSED) == 0)) {
/*
@@ -7888,6 +8446,7 @@
*/
tp->t_rxtshift++;
}
+ rack_remxt_tmr(tp);
if (tp->t_rxtshift > V_tcp_retries) {
tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
drop_it:
@@ -8240,23 +8799,124 @@
}
}
+/*
+ * We maintain an array of 16 (RETRAN_CNT_SIZE) entries. This
+ * array is zeroed at the start of recovery. Each time a segment
+ * is retransmitted, we translate that into a number of packets
+ * (based on segsiz) and, based on how many times it has been retransmitted,
+ * increment by that number of packets the counter that represents
+ * retransmitted N times. Index 0 is retransmitted 1 time, index 1
+ * is retransmitted 2 times, etc.
+ *
+ * So for example, when we send a 4344 byte transmission with a 1448
+ * byte segsize, and it is the third time we have retransmitted this
+ * segment, we would add to rc_cnt_of_retran[2] the value of
+ * 3. That represents 3 MSS retransmitted 3 times (the index is
+ * the number of times retransmitted minus 1).
+ */
+static void
+rack_peg_rxt(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t segsiz)
+{
+ int idx;
+ uint32_t peg;
+
+ peg = ((rsm->r_end - rsm->r_start) + segsiz) - 1;
+ peg /= segsiz;
+ idx = rsm->r_act_rxt_cnt - 1;
+ if (idx >= RETRAN_CNT_SIZE)
+ idx = RETRAN_CNT_SIZE - 1;
+ /* Max of a uint16_t retransmits in a bucket */
+ if ((rack->r_ctl.rc_cnt_of_retran[idx] + peg) < 0xffff)
+ rack->r_ctl.rc_cnt_of_retran[idx] += peg;
+ else
+ rack->r_ctl.rc_cnt_of_retran[idx] = 0xffff;
+}
+
+/*
+ * We maintain an array of 16 (RETRAN_CNT_SIZE) entries. This
+ * array is zeroed at the start of recovery. Each time a segment
+ * is retransmitted, we translate that into a number of packets
+ * (based on segsiz) and, based on how many times it has been retransmitted,
+ * increment by that number of packets the counter that represents
+ * retransmitted N times. Index 0 is retransmitted 1 time, index 1
+ * is retransmitted 2 times, etc.
+ *
+ * rack_unpeg_rxt() is used when we go to retransmit a segment
+ * again. Basically, if the segment had previously been retransmitted
+ * say 3 times (as our previous example illustrated in the comment
+ * above rack_peg_rxt()), then prior to calling that and incrementing
+ * r_act_rxt_cnt we would have called rack_unpeg_rxt(), which would
+ * subtract back the previous add from its last rxt (in this
+ * example r_act_rxt_cnt would have been 2 for 2 retransmissions). So
+ * we would have subtracted 3 from rc_cnt_of_retran[1] to remove
+ * those 3 segments. You will see this in rack_update_rsm()
+ * below where we do:
+ * if (rsm->r_act_rxt_cnt > 0) {
+ * rack_unpeg_rxt(rack, rsm, segsiz);
+ * }
+ * rsm->r_act_rxt_cnt++;
+ * rack_peg_rxt(rack, rsm, segsiz);
+ *
+ * This effectively moves the count from rc_cnt_of_retran[1] to
+ * rc_cnt_of_retran[2].
+ */
+static void
+rack_unpeg_rxt(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t segsiz)
+{
+ int idx;
+ uint32_t peg;
+
+ idx = rsm->r_act_rxt_cnt - 1;
+ if (idx >= RETRAN_CNT_SIZE)
+ idx = RETRAN_CNT_SIZE - 1;
+ peg = ((rsm->r_end - rsm->r_start) + segsiz) - 1;
+ peg /= segsiz;
+ if (peg < rack->r_ctl.rc_cnt_of_retran[idx])
+ rack->r_ctl.rc_cnt_of_retran[idx] -= peg;
+ else {
+ /* TSNH */
+ rack->r_ctl.rc_cnt_of_retran[idx] = 0;
+ }
+}
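
To make the bucket move described above concrete, here is a self-contained toy (illustrative names only, not the stack's code) showing how a group of packets at a given retransmit depth leaves one bucket and lands in the next when it is retransmitted once more:

```c
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define RETRAN_CNT_SIZE 16

/*
 * Move `pkts` packets that were at retransmit depth `old_depth`
 * (stored in cnt[old_depth - 1]) into the bucket for the next depth,
 * clamping at the last bucket as the real peg/unpeg pair does.
 */
static void
move_one_bucket_deeper(uint32_t cnt[RETRAN_CNT_SIZE], int old_depth, uint32_t pkts)
{
	int from = old_depth - 1;
	int to = old_depth;

	if (from >= RETRAN_CNT_SIZE)
		from = RETRAN_CNT_SIZE - 1;
	if (to >= RETRAN_CNT_SIZE)
		to = RETRAN_CNT_SIZE - 1;
	/* "unpeg": remove from the bucket of the previous depth */
	if (cnt[from] >= pkts)
		cnt[from] -= pkts;
	else
		cnt[from] = 0;
	/* "peg": add at the new depth */
	cnt[to] += pkts;
}

int
main(void)
{
	uint32_t cnt[RETRAN_CNT_SIZE] = {0};

	cnt[1] = 3;				/* 3 packets retransmitted twice */
	move_one_bucket_deeper(cnt, 2, 3);	/* retransmit them a third time */
	/* Expect bucket[1] = 0 and bucket[2] = 3 */
	printf("bucket[1]=%" PRIu32 " bucket[2]=%" PRIu32 "\n", cnt[1], cnt[2]);
	return (0);
}
```
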
+
static void
rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
- struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag, int segsiz)
+ struct rack_sendmap *rsm, uint64_t ts, uint32_t add_flag, int segsiz)
{
int32_t idx;
rsm->r_rtr_cnt++;
- rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
- rsm->r_dupack = 0;
if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) {
rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS;
rsm->r_flags |= RACK_OVERMAX;
}
+ if (rsm->r_act_rxt_cnt > 0) {
+ /* Drop the count back for this, it's retransmitting again */
+ rack_unpeg_rxt(rack, rsm, segsiz);
+ }
+ rsm->r_act_rxt_cnt++;
+ /* Peg the count/index */
+ rack_peg_rxt(rack, rsm, segsiz);
+ rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
+ rsm->r_dupack = 0;
if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & RACK_TLP) == 0)) {
rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start);
rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start);
}
+ if (rsm->r_flags & RACK_WAS_LOST) {
+ /*
+ * We retransmitted it, putting it back in flight;
+ * remove the lost designation and reduce the
+ * bytes considered lost.
+ */
+ rsm->r_flags &= ~RACK_WAS_LOST;
+ KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)),
+ ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack));
+ if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start))
+ rack->r_ctl.rc_considered_lost -= rsm->r_end - rsm->r_start;
+ else
+ rack->r_ctl.rc_considered_lost = 0;
+ }
idx = rsm->r_rtr_cnt - 1;
rsm->r_tim_lastsent[idx] = ts;
/*
@@ -8304,7 +8964,7 @@
static uint32_t
rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
- struct rack_sendmap *rsm, uint64_t ts, int32_t *lenp, uint16_t add_flag, int segsiz)
+ struct rack_sendmap *rsm, uint64_t ts, int32_t *lenp, uint32_t add_flag, int segsiz)
{
/*
* We (re-)transmitted starting at rsm->r_start for some length
@@ -8381,7 +9041,7 @@
static void
rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
uint32_t seq_out, uint16_t th_flags, int32_t err, uint64_t cts,
- struct rack_sendmap *hintrsm, uint16_t add_flag, struct mbuf *s_mb,
+ struct rack_sendmap *hintrsm, uint32_t add_flag, struct mbuf *s_mb,
uint32_t s_moff, int hw_tls, int segsiz)
{
struct tcp_rack *rack;
@@ -8440,13 +9100,6 @@
len++;
if (th_flags & TH_FIN)
len++;
- if (SEQ_LT(snd_max, tp->snd_nxt)) {
- /*
- * The add/update as not been done for the FIN/SYN
- * yet.
- */
- snd_max = tp->snd_nxt;
- }
}
if (SEQ_LEQ((seq_out + len), snd_una)) {
/* Are sending an old segment to induce an ack (keep-alive)? */
@@ -8492,6 +9145,7 @@
rsm->r_hw_tls = 1;
rsm->r_tim_lastsent[0] = cts;
rsm->r_rtr_cnt = 1;
+ rsm->r_act_rxt_cnt = 0;
rsm->r_rtr_bytes = 0;
if (th_flags & TH_SYN) {
/* The data space is one beyond snd_una */
@@ -8515,6 +9169,10 @@
rsm->r_fas = (ctf_flight_size(rack->rc_tp,
rack->r_ctl.rc_sacked) +
(rsm->r_end - rsm->r_start));
+ if ((rack->rc_initial_ss_comp == 0) &&
+ (rack->r_ctl.ss_hi_fs < rsm->r_fas)) {
+ rack->r_ctl.ss_hi_fs = rsm->r_fas;
+ }
/* rsm->m will be NULL if RACK_HAS_SYN or RACK_HAS_FIN is set */
if (rsm->m) {
if (rsm->m->m_len <= rsm->soff) {
@@ -8558,6 +9216,13 @@
#endif
TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
rsm->r_in_tmap = 1;
+ if (rsm->r_flags & RACK_IS_PCM) {
+ rack->r_ctl.pcm_i.send_time = cts;
+ rack->r_ctl.pcm_i.eseq = rsm->r_end;
+ /* First time through we set the start too */
+ if (rack->pcm_in_progress == 0)
+ rack->r_ctl.pcm_i.sseq = rsm->r_start;
+ }
/*
* Special case detection, is there just a single
* packet outstanding when we are not in recovery?
@@ -8886,6 +9551,7 @@
}
stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_PATHRTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt));
#endif
+ rack->r_ctl.last_rcv_tstmp_for_rtt = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time);
/*
* the retransmit should happen at rtt + 4 * rttvar. Because of the
* way we do the smoothing, srtt and rttvar will each average +1/2
@@ -8939,6 +9605,7 @@
val = rack_probertt_lower_within * rack_time_between_probertt;
val /= 100;
if ((rack->in_probe_rtt == 0) &&
+ (rack->rc_skip_timely == 0) &&
((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val))) {
rack_enter_probertt(rack, us_cts);
}
@@ -9051,7 +9718,7 @@
(!IN_FASTRECOVERY(tp->t_flags))) {
/* Segment was a TLP and our retrans matched */
if (rack->r_ctl.rc_tlp_cwnd_reduce) {
- rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__);
+ rack_cong_signal(tp, CC_NDUPACK, th_ack, __LINE__);
}
}
if ((rack->r_ctl.rc_rack_tmit_time == 0) ||
@@ -9198,10 +9865,14 @@
*/
static void
rack_log_sack_passed(struct tcpcb *tp,
- struct tcp_rack *rack, struct rack_sendmap *rsm)
+ struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t cts)
{
struct rack_sendmap *nrsm;
+ uint32_t thresh;
+ /* Get our rxt threshold for lost consideration */
+ thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(tp, rack), cts, __LINE__, 0);
+ /* Now start looking at rsm's */
nrsm = rsm;
TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap,
rack_head, r_tnext) {
@@ -9224,6 +9895,17 @@
*/
continue;
}
+ /* Check lost state */
+ if ((nrsm->r_flags & RACK_WAS_LOST) == 0) {
+ uint32_t exp;
+
+ exp = ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]) + thresh;
+ if (TSTMP_LT(exp, cts) || (exp == cts)) {
+ /* We consider it lost */
+ nrsm->r_flags |= RACK_WAS_LOST;
+ rack->r_ctl.rc_considered_lost += nrsm->r_end - nrsm->r_start;
+ }
+ }
if (nrsm->r_flags & RACK_SACK_PASSED) {
/*
* We found one that is already marked
@@ -9407,8 +10089,6 @@
return (1);
}
-
-
static uint32_t
rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack,
struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts,
@@ -9625,16 +10305,11 @@
(rsm->bindex == next->bindex) &&
((rsm->r_flags & RACK_STRADDLE) == 0) &&
((next->r_flags & RACK_STRADDLE) == 0) &&
+ ((rsm->r_flags & RACK_IS_PCM) == 0) &&
+ ((next->r_flags & RACK_IS_PCM) == 0) &&
(rsm->r_flags & RACK_IN_GP_WIN) &&
(next->r_flags & RACK_IN_GP_WIN))
can_use_hookery = 1;
- else if (next &&
- (rsm->bindex == next->bindex) &&
- ((rsm->r_flags & RACK_STRADDLE) == 0) &&
- ((next->r_flags & RACK_STRADDLE) == 0) &&
- ((rsm->r_flags & RACK_IN_GP_WIN) == 0) &&
- ((next->r_flags & RACK_IN_GP_WIN) == 0))
- can_use_hookery = 1;
else
can_use_hookery = 0;
if (next && can_use_hookery &&
@@ -9661,7 +10336,7 @@
nrsm = &stack_map;
memcpy(nrsm, rsm, sizeof(struct rack_sendmap));
/* Now adjust our tree blocks */
- rsm->r_end = start;
+ tqhash_update_end(rack->r_ctl.tqh, rsm, start);
next->r_start = start;
rsm->r_flags |= RACK_SHUFFLED;
next->r_flags |= RACK_SHUFFLED;
@@ -9712,6 +10387,17 @@
if ((nrsm->r_end - nrsm->r_start) >= segsiz)
rack->r_ctl.ack_count += ((nrsm->r_end - nrsm->r_start) / segsiz);
rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start);
+ if (rsm->r_flags & RACK_WAS_LOST) {
+ int my_chg;
+
+ my_chg = (nrsm->r_end - nrsm->r_start);
+ KASSERT((rack->r_ctl.rc_considered_lost >= my_chg),
+ ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack));
+ if (my_chg <= rack->r_ctl.rc_considered_lost)
+ rack->r_ctl.rc_considered_lost -= my_chg;
+ else
+ rack->r_ctl.rc_considered_lost = 0;
+ }
if (nrsm->r_flags & RACK_SACK_PASSED) {
rack->r_ctl.rc_reorder_ts = cts;
if (rack->r_ctl.rc_reorder_ts == 0)
@@ -9734,7 +10420,7 @@
* one walk backwards from there.
*/
if (nrsm && nrsm->r_in_tmap)
- rack_log_sack_passed(tp, rack, nrsm);
+ rack_log_sack_passed(tp, rack, nrsm, cts);
}
/* Now are we done? */
if (SEQ_LT(end, next->r_end) ||
@@ -9875,9 +10561,21 @@
/* You get a count for acking a whole segment or more */
if ((rsm->r_end - rsm->r_start) >= segsiz)
rack->r_ctl.ack_count += ((rsm->r_end - rsm->r_start) / segsiz);
+ if (rsm->r_flags & RACK_WAS_LOST) {
+ int my_chg;
+
+ my_chg = (rsm->r_end - rsm->r_start);
+ rsm->r_flags &= ~RACK_WAS_LOST;
+ KASSERT((rack->r_ctl.rc_considered_lost >= my_chg),
+ ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack));
+ if (my_chg <= rack->r_ctl.rc_considered_lost)
+ rack->r_ctl.rc_considered_lost -= my_chg;
+ else
+ rack->r_ctl.rc_considered_lost = 0;
+ }
rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
if (rsm->r_in_tmap) /* should be true */
- rack_log_sack_passed(tp, rack, rsm);
+ rack_log_sack_passed(tp, rack, rsm, cts);
/* Is Reordering occuring? */
if (rsm->r_flags & RACK_SACK_PASSED) {
rsm->r_flags &= ~RACK_SACK_PASSED;
@@ -9889,6 +10587,7 @@
rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END);
rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
rsm->r_flags |= RACK_ACKED;
+ rack_update_pcm_ack(rack, 0, rsm->r_start, rsm->r_end);
if (rsm->r_in_tmap) {
TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
rsm->r_in_tmap = 0;
@@ -9968,19 +10667,13 @@
(rsm->bindex == prev->bindex) &&
((rsm->r_flags & RACK_STRADDLE) == 0) &&
((prev->r_flags & RACK_STRADDLE) == 0) &&
+ ((rsm->r_flags & RACK_IS_PCM) == 0) &&
+ ((prev->r_flags & RACK_IS_PCM) == 0) &&
(rsm->r_flags & RACK_IN_GP_WIN) &&
(prev->r_flags & RACK_IN_GP_WIN))
can_use_hookery = 1;
- else if (prev &&
- (rsm->bindex == prev->bindex) &&
- ((rsm->r_flags & RACK_STRADDLE) == 0) &&
- ((prev->r_flags & RACK_STRADDLE) == 0) &&
- ((rsm->r_flags & RACK_IN_GP_WIN) == 0) &&
- ((prev->r_flags & RACK_IN_GP_WIN) == 0))
- can_use_hookery = 1;
else
can_use_hookery = 0;
-
if (prev && can_use_hookery &&
(prev->r_flags & RACK_ACKED)) {
/**
@@ -10003,7 +10696,7 @@
noextra++;
nrsm = &stack_map;
memcpy(nrsm, rsm, sizeof(struct rack_sendmap));
- prev->r_end = end;
+ tqhash_update_end(rack->r_ctl.tqh, prev, end);
rsm->r_start = end;
rsm->r_flags |= RACK_SHUFFLED;
prev->r_flags |= RACK_SHUFFLED;
@@ -10064,6 +10757,17 @@
rack->r_ctl.ack_count += ((nrsm->r_end - nrsm->r_start) / segsiz);
rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start);
+ if (rsm->r_flags & RACK_WAS_LOST) {
+ int my_chg;
+
+ my_chg = (nrsm->r_end - nrsm->r_start);
+ KASSERT((rack->r_ctl.rc_considered_lost >= my_chg),
+ ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack));
+ if (my_chg <= rack->r_ctl.rc_considered_lost)
+ rack->r_ctl.rc_considered_lost -= my_chg;
+ else
+ rack->r_ctl.rc_considered_lost = 0;
+ }
if (nrsm->r_flags & RACK_SACK_PASSED) {
rack->r_ctl.rc_reorder_ts = cts;
if (rack->r_ctl.rc_reorder_ts == 0)
@@ -10160,10 +10864,22 @@
/* You get a count for acking a whole segment or more */
if ((rsm->r_end - rsm->r_start) >= segsiz)
rack->r_ctl.ack_count += ((rsm->r_end - rsm->r_start) / segsiz);
-
+ if (rsm->r_flags & RACK_WAS_LOST) {
+ int my_chg;
+
+ my_chg = (rsm->r_end - rsm->r_start);
+ rsm->r_flags &= ~RACK_WAS_LOST;
+ KASSERT((rack->r_ctl.rc_considered_lost >= my_chg),
+ ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack));
+ if (my_chg <= rack->r_ctl.rc_considered_lost)
+ rack->r_ctl.rc_considered_lost -= my_chg;
+ else
+ rack->r_ctl.rc_considered_lost = 0;
+ }
rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
+
if (rsm->r_in_tmap) /* should be true */
- rack_log_sack_passed(tp, rack, rsm);
+ rack_log_sack_passed(tp, rack, rsm, cts);
/* Is Reordering occuring? */
if (rsm->r_flags & RACK_SACK_PASSED) {
rsm->r_flags &= ~RACK_SACK_PASSED;
@@ -10175,6 +10891,7 @@
rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END);
rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
rsm->r_flags |= RACK_ACKED;
+ rack_update_pcm_ack(rack, 0, rsm->r_start, rsm->r_end);
rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M5, end, __LINE__);
if (rsm->r_in_tmap) {
TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
@@ -10214,8 +10931,12 @@
break;
if (rsm->r_flags & RACK_STRADDLE)
break;
+ if (rsm->r_flags & RACK_IS_PCM)
+ break;
if (next->r_flags & RACK_STRADDLE)
break;
+ if (next->r_flags & RACK_IS_PCM)
+ break;
if (next->r_flags & RACK_ACKED) {
/* yep this and next can be merged */
rsm = rack_merge_rsm(rack, rsm, next);
@@ -10242,8 +10963,12 @@
break;
if (rsm->r_flags & RACK_STRADDLE)
break;
+ if (rsm->r_flags & RACK_IS_PCM)
+ break;
if (prev->r_flags & RACK_STRADDLE)
break;
+ if (prev->r_flags & RACK_IS_PCM)
+ break;
if (prev->r_flags & RACK_ACKED) {
/* yep the previous and this can be merged */
rsm = rack_merge_rsm(rack, prev, rsm);
@@ -10264,6 +10989,9 @@
/* Pass back the moved. */
*moved_two = moved;
*no_extra = noextra;
+ if (IN_RECOVERY(tp->t_flags)) {
+ rack->r_ctl.bytes_acked_in_recovery += changed;
+ }
return (changed);
}
@@ -10464,6 +11192,17 @@
* RTT's.
*/
+ if (sack_filter_blks_used(&rack->r_ctl.rack_sf)) {
+ /*
+ * If we have some sack blocks in the filter
+ * let's prune them out by calling sfb with no blocks.
+ */
+ sack_filter_blks(&rack->r_ctl.rack_sf, NULL, 0, th_ack);
+ }
+ if (SEQ_GT(th_ack, tp->snd_una)) {
+ /* Clear any app ack remembered settings */
+ rack->r_ctl.cleared_app_ack = 0;
+ }
rack->r_wanted_output = 1;
if (SEQ_GT(th_ack, tp->snd_una))
rack->r_ctl.last_cumack_advance = acktime;
@@ -10533,10 +11272,10 @@
return;
}
#ifdef INVARIANTS
- panic("No rack map tp:%p for state:%d ack:%u rack:%p snd_una:%u snd_max:%u snd_nxt:%u\n",
+ panic("No rack map tp:%p for state:%d ack:%u rack:%p snd_una:%u snd_max:%u\n",
tp,
tp->t_state, th_ack, rack,
- tp->snd_una, tp->snd_max, tp->snd_nxt);
+ tp->snd_una, tp->snd_max);
#endif
return;
}
@@ -10599,6 +11338,20 @@
uint32_t left;
uint8_t newly_acked;
+ if (rsm->r_flags & RACK_WAS_LOST) {
+ /*
+ * This can happen when we marked it as lost
+ * and yet before retransmitting we get an ack
+ * which can happen due to reordering.
+ */
+ rsm->r_flags &= ~RACK_WAS_LOST;
+ KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)),
+ ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack));
+ if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start))
+ rack->r_ctl.rc_considered_lost -= rsm->r_end - rsm->r_start;
+ else
+ rack->r_ctl.rc_considered_lost = 0;
+ }
rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_FREE, rsm->r_end, __LINE__);
rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
rsm->r_rtr_bytes = 0;
@@ -10613,6 +11366,10 @@
rsm->r_in_tmap = 0;
}
newly_acked = 1;
+ if (((rsm->r_flags & RACK_ACKED) == 0) &&
+ (IN_RECOVERY(tp->t_flags))) {
+ rack->r_ctl.bytes_acked_in_recovery += (rsm->r_end - rsm->r_start);
+ }
if (rsm->r_flags & RACK_ACKED) {
/*
* It was acked on the scoreboard -- remove
@@ -10639,6 +11396,9 @@
*/
rack->r_might_revert = 1;
}
+ rack_update_pcm_ack(rack, 1, rsm->r_start, rsm->r_end);
+ } else {
+ rack_update_pcm_ack(rack, 1, rsm->r_start, rsm->r_end);
}
if ((rsm->r_flags & RACK_TO_REXT) &&
(tp->t_flags & TF_RCVD_TSTMP) &&
@@ -10691,6 +11451,27 @@
* total for the part being cum-acked.
*/
rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start);
+ } else {
+ if (((rsm->r_flags & RACK_ACKED) == 0) &&
+ (IN_RECOVERY(tp->t_flags))) {
+ rack->r_ctl.bytes_acked_in_recovery += (th_ack - rsm->r_start);
+ }
+ rack_update_pcm_ack(rack, 1, rsm->r_start, th_ack);
+ }
+ /* And what about the lost flag? */
+ if (rsm->r_flags & RACK_WAS_LOST) {
+ /*
+ * This can happen when we marked it as lost
+ * and yet before retransmitting we get an ack
+ * which can happen due to reordering. In this
+ * case it's only a partial ack of the send.
+ */
+ KASSERT((rack->r_ctl.rc_considered_lost >= (th_ack - rsm->r_start)),
+ ("rsm:%p rack:%p rc_considered_lost goes negative th_ack:%u", rsm, rack, th_ack));
+ if (rack->r_ctl.rc_considered_lost >= (th_ack - rsm->r_start))
+ rack->r_ctl.rc_considered_lost -= th_ack - rsm->r_start;
+ else
+ rack->r_ctl.rc_considered_lost = 0;
}
/*
* Clear the dup ack count for
@@ -10807,7 +11588,26 @@
tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at_erec;
tp->snd_recover = tp->snd_una;
rack_log_to_prr(rack, 14, orig_cwnd, __LINE__);
- EXIT_RECOVERY(tp->t_flags);
+ if (IN_RECOVERY(tp->t_flags)) {
+ rack_exit_recovery(tp, rack, 3);
+ if ((rack->rto_from_rec == 1) && (rack_ssthresh_rest_rto_rec != 0) ){
+ /*
+ * We were in recovery, had an RTO
+ * and then re-entered recovery (more SACKs arrived)
+ * and we have properly recorded the old ssthresh from
+ * the first recovery. We want to be able to slow-start
+ * back to this level. The ssthresh from the timeout
+ * and then back into recovery will most likely end up
+ * being min(cwnd=1mss, 2mss), which basically means
+ * we get no slow-start after our RTO.
+ */
+ rack->rto_from_rec = 0;
+ if (rack->r_ctl.rto_ssthresh > tp->snd_ssthresh)
+ tp->snd_ssthresh = rack->r_ctl.rto_ssthresh;
+ }
+ }
+ rack->r_ctl.bytes_acked_in_recovery = 0;
+ rack->r_ctl.time_entered_recovery = 0;
}
rack->r_might_revert = 0;
}
@@ -11062,7 +11862,8 @@
static uint32_t
do_rack_compute_pipe(struct tcpcb *tp, struct tcp_rack *rack, uint32_t snd_una)
{
- return (((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt);
+ return (((tp->snd_max - snd_una) -
+ (rack->r_ctl.rc_sacked + rack->r_ctl.rc_considered_lost)) + rack->r_ctl.rc_holes_rxt);
}
static int32_t
@@ -11505,7 +12306,7 @@
((rsm->r_flags & RACK_MUST_RXT) == 0)) {
/* Enter recovery */
entered_recovery = 1;
- rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__);
+ rack_cong_signal(tp, CC_NDUPACK, th_ack, __LINE__);
/*
* When we enter recovery we need to assure we send
* one packet.
@@ -11547,7 +12348,7 @@
}
static void
-rack_strike_dupack(struct tcp_rack *rack)
+rack_strike_dupack(struct tcp_rack *rack, tcp_seq th_ack)
{
struct rack_sendmap *rsm;
@@ -11581,7 +12382,7 @@
if (rack->r_ctl.rc_resend != NULL) {
if (!IN_FASTRECOVERY(rack->rc_tp->t_flags)) {
rack_cong_signal(rack->rc_tp, CC_NDUPACK,
- rack->rc_tp->snd_una, __LINE__);
+ th_ack, __LINE__);
}
rack->r_wanted_output = 1;
rack->r_timer_override = 1;
@@ -11598,6 +12399,25 @@
struct tcp_rack *rack,
struct socket *so)
{
+ /*
+ * So what is dragging bottom?
+ *
+ * Dragging bottom means you were under pacing and had a
+ * delay in processing inbound acks waiting on our pacing
+ * timer to expire. While you were waiting, all of the acknowledgments
+ * for the packets you sent arrived. This means we are pacing
+ * way underneath the bottleneck to the point where our Goodput
+ * measurements stop working, since they require more than one
+ * ack (usually at least 8 packets worth with multiple acks so we can
+ * gauge the inter-ack times). If that occurs we have a real problem
+ * since we are stuck in a hole that we can't get out of without
+ * something speeding us up.
+ *
+ * We also check to see if we are whittling down to just one segment
+ * outstanding. If this occurs and we have room to send in our cwnd/rwnd
+ * then we are adding the delayed ack interval into our measurements and
+ * we need to speed up slightly.
+ */
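
The idea in that comment can be sketched as a small predicate; this is illustrative only (the real checks live in the function body below and are not reproduced by this hunk), assuming flight size, segment size, cwnd and rwnd as inputs:

```c
#include <stdbool.h>
#include <stdint.h>

/*
 * Sketch: roughly one segment (or less) is outstanding while cwnd and
 * rwnd would allow more, so the delayed-ack interval is polluting the
 * goodput measurement and pacing should speed up slightly.
 */
static bool
likely_dragging_bottom(uint32_t flight_bytes, uint32_t segsiz,
    uint32_t cwnd, uint32_t rwnd)
{
	uint32_t room = (cwnd < rwnd) ? cwnd : rwnd;

	return (flight_bytes <= segsiz && room > flight_bytes + segsiz);
}
```
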
uint32_t segsiz, minseg;
segsiz = ctf_fixed_maxseg(tp);
@@ -11614,10 +12434,13 @@
*/
uint64_t lt_bw;
+ tcp_trace_point(rack->rc_tp, TCP_TP_PACED_BOTTOM);
lt_bw = rack_get_lt_bw(rack);
rack->rc_dragged_bottom = 1;
rack_validate_multipliers_at_or_above100(rack);
if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_VALID) &&
+ (rack->dis_lt_bw == 0) &&
+ (rack->use_lesser_lt_bw == 0) &&
(lt_bw > 0)) {
/*
* Lets use the long-term b/w we have
@@ -11729,7 +12552,7 @@
log.u_bbr.delivered = (uint32_t)((cur->end >> 32) & 0x00000000ffffffff) ;
log.u_bbr.epoch = (uint32_t)(cur->deadline & 0x00000000ffffffff);
log.u_bbr.lt_epoch = (uint32_t)((cur->deadline >> 32) & 0x00000000ffffffff) ;
- log.u_bbr.bbr_state = 1;
+ log.u_bbr.inhpts = 1;
#ifdef TCP_REQUEST_TRK
off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_tcpreq_info[0]);
log.u_bbr.use_lt_bw = (uint8_t)(off / sizeof(struct tcp_sendfile_track));
@@ -11745,6 +12568,20 @@
log.u_bbr.flex7 |= rack->rc_hybrid_mode;
log.u_bbr.flex7 <<= 1;
log.u_bbr.flex7 |= rack->dgp_on;
+ /*
+ * Compose bbr_state to be bitwise 0000ADHF
+ * where A is the always_pace flag
+ * where D is the dgp_on flag
+ * where H is the hybrid_mode on flag
+ * where F is the use_fixed_rate flag.
+ */
+ log.u_bbr.bbr_state = rack->rc_always_pace;
+ log.u_bbr.bbr_state <<= 1;
+ log.u_bbr.bbr_state |= rack->dgp_on;
+ log.u_bbr.bbr_state <<= 1;
+ log.u_bbr.bbr_state |= rack->rc_hybrid_mode;
+ log.u_bbr.bbr_state <<= 1;
+ log.u_bbr.bbr_state |= rack->use_fixed_rate;
log.u_bbr.flex8 = mod;
log.u_bbr.delRate = rack->r_ctl.bw_rate_cap;
log.u_bbr.bbr_substate = rack->r_ctl.client_suggested_maxseg;
@@ -11763,12 +12600,13 @@
#ifdef TCP_REQUEST_TRK
static void
-rack_set_dgp_hybrid_mode(struct tcp_rack *rack, tcp_seq seq, uint32_t len)
+rack_set_dgp_hybrid_mode(struct tcp_rack *rack, tcp_seq seq, uint32_t len, uint64_t cts)
{
- struct tcp_sendfile_track *rc_cur;
+ struct tcp_sendfile_track *rc_cur, *orig_ent;
struct tcpcb *tp;
int err = 0;
+ orig_ent = rack->r_ctl.rc_last_sft;
rc_cur = tcp_req_find_req_for_seq(rack->rc_tp, seq);
if (rc_cur == NULL) {
/* If not in the beginning what about the end piece */
@@ -11781,11 +12619,17 @@
/* If we find no parameters we are in straight DGP mode */
if(rc_cur == NULL) {
/* None found for this seq, just DGP for now */
- rack->r_ctl.client_suggested_maxseg = 0;
- rack->rc_catch_up = 0;
- rack->r_ctl.bw_rate_cap = 0;
- if (rack->rc_hybrid_mode)
+ if (rack->rc_hybrid_mode) {
+ rack->r_ctl.client_suggested_maxseg = 0;
+ rack->rc_catch_up = 0;
+ if (rack->cspr_is_fcc == 0)
+ rack->r_ctl.bw_rate_cap = 0;
+ else
+ rack->r_ctl.fillcw_cap = rack_fillcw_bw_cap;
+ }
+ if (rack->rc_hybrid_mode) {
rack_log_hybrid(rack, (seq + len - 1), NULL, HYBRID_LOG_NO_RANGE, __LINE__, err);
+ }
if (rack->r_ctl.rc_last_sft) {
rack->r_ctl.rc_last_sft = NULL;
}
@@ -11793,6 +12637,20 @@
}
if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_WASSET) == 0) {
/* This entry was never setup for hybrid pacing on/off etc */
+ if (rack->rc_hybrid_mode) {
+ rack->r_ctl.client_suggested_maxseg = 0;
+ rack->rc_catch_up = 0;
+ rack->r_ctl.bw_rate_cap = 0;
+ }
+ if (rack->r_ctl.rc_last_sft) {
+ rack->r_ctl.rc_last_sft = NULL;
+ }
+ if ((rc_cur->flags & TCP_TRK_TRACK_FLG_FSND) == 0) {
+ rc_cur->flags |= TCP_TRK_TRACK_FLG_FSND;
+ rc_cur->first_send = cts;
+ rc_cur->sent_at_fs = rack->rc_tp->t_sndbytes;
+ rc_cur->rxt_at_fs = rack->rc_tp->t_snd_rxt_bytes;
+ }
return;
}
/*
@@ -11812,18 +12670,40 @@
}
if (rack->rc_hybrid_mode == 0) {
rack->r_ctl.rc_last_sft = rc_cur;
+ if (orig_ent) {
+ orig_ent->sent_at_ls = rack->rc_tp->t_sndbytes;
+ orig_ent->rxt_at_ls = rack->rc_tp->t_snd_rxt_bytes;
+ orig_ent->flags |= TCP_TRK_TRACK_FLG_LSND;
+ }
rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_RULES_APP, __LINE__, 0);
return;
}
if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_CSPR) && rc_cur->cspr){
/* Compensate for all the header overhead's */
- rack->r_ctl.bw_rate_cap = rack_compensate_for_linerate(rack, rc_cur->cspr);
- } else
- rack->r_ctl.bw_rate_cap = 0;
+ if (rack->cspr_is_fcc == 0)
+ rack->r_ctl.bw_rate_cap = rack_compensate_for_linerate(rack, rc_cur->cspr);
+ else
+ rack->r_ctl.fillcw_cap = rack_compensate_for_linerate(rack, rc_cur->cspr);
+ } else {
+ if (rack->rc_hybrid_mode) {
+ if (rack->cspr_is_fcc == 0)
+ rack->r_ctl.bw_rate_cap = 0;
+ else
+ rack->r_ctl.fillcw_cap = rack_fillcw_bw_cap;
+ }
+ }
if (rc_cur->hybrid_flags & TCP_HYBRID_PACING_H_MS)
rack->r_ctl.client_suggested_maxseg = rc_cur->hint_maxseg;
else
rack->r_ctl.client_suggested_maxseg = 0;
+ if (rc_cur->timestamp == rack->r_ctl.last_tm_mark) {
+ /*
+		 * It is the same timestamp as the previous one;
+		 * add the hybrid flag that indicates we use
+		 * send time, not arrival time, for catch-up mode.
+ */
+ rc_cur->hybrid_flags |= TCP_HYBRID_PACING_SENDTIME;
+ }
if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_CU) &&
(rc_cur->cspr > 0)) {
uint64_t len;
@@ -11833,7 +12713,20 @@
* Calculate the deadline time, first set the
* time to when the request arrived.
*/
- rc_cur->deadline = rc_cur->localtime;
+ if (rc_cur->hybrid_flags & TCP_HYBRID_PACING_SENDTIME) {
+ /*
+			 * For cases where it's a duplicate tm (we received more
+			 * than one request for the same tm) we want to use now, the
+			 * point where we are just sending the first bit of the request.
+ */
+ rc_cur->deadline = cts;
+ } else {
+ /*
+ * Here we have a different tm from the last request
+ * so we want to use arrival time as our base.
+ */
+ rc_cur->deadline = rc_cur->localtime;
+ }
/*
* Next calculate the length and compensate for
* TLS if need be.
@@ -11867,9 +12760,15 @@
*/
rack_set_pace_segments(tp, rack, __LINE__, NULL);
}
+ if (orig_ent) {
+ orig_ent->sent_at_ls = rack->rc_tp->t_sndbytes;
+ orig_ent->rxt_at_ls = rack->rc_tp->t_snd_rxt_bytes;
+ orig_ent->flags |= TCP_TRK_TRACK_FLG_LSND;
+ }
rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_RULES_APP, __LINE__, 0);
/* Remember it for next time and for CU mode */
rack->r_ctl.rc_last_sft = rc_cur;
+ rack->r_ctl.last_tm_mark = rc_cur->timestamp;
}
#endif
@@ -11884,7 +12783,7 @@
(ent->flags == TCP_TRK_TRACK_FLG_EMPTY) ||
(SEQ_GEQ(seq, ent->end_seq))) {
/* Time to update the track. */
- rack_set_dgp_hybrid_mode(rack, seq, len);
+ rack_set_dgp_hybrid_mode(rack, seq, len, cts);
ent = rack->r_ctl.rc_last_sft;
}
/* Out of all */
@@ -12116,8 +13015,17 @@
* if so be sure to NULL the pointer so we know we are no longer
* set to anything.
*/
- if (ent == rack->r_ctl.rc_last_sft)
+ if (ent == rack->r_ctl.rc_last_sft) {
rack->r_ctl.rc_last_sft = NULL;
+ if (rack->rc_hybrid_mode) {
+ rack->rc_catch_up = 0;
+ if (rack->cspr_is_fcc == 0)
+ rack->r_ctl.bw_rate_cap = 0;
+ else
+ rack->r_ctl.fillcw_cap = rack_fillcw_bw_cap;
+ rack->r_ctl.client_suggested_maxseg = 0;
+ }
+ }
/* Generate the log that the tcp_netflix call would have */
tcp_req_log_req_info(rack->rc_tp, ent,
i, TCP_TRK_REQ_LOG_FREED, 0, 0);
@@ -12139,7 +13047,7 @@
rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
struct tcpcb *tp, struct tcpopt *to,
uint32_t tiwin, int32_t tlen,
- int32_t * ofia, int32_t thflags, int32_t *ret_val)
+ int32_t * ofia, int32_t thflags, int32_t *ret_val, int32_t orig_tlen)
{
int32_t ourfinisacked = 0;
int32_t nsegs, acked_amount;
@@ -12147,7 +13055,8 @@
struct mbuf *mfree;
struct tcp_rack *rack;
int32_t under_pacing = 0;
- int32_t recovery = 0;
+ int32_t post_recovery = 0;
+ uint32_t p_cwnd;
INP_WLOCK_ASSERT(tptoinpcb(tp));
@@ -12176,8 +13085,9 @@
if ((th->th_ack == tp->snd_una) &&
(tiwin == tp->snd_wnd) &&
+ (orig_tlen == 0) &&
((to->to_flags & TOF_SACK) == 0)) {
- rack_strike_dupack(rack);
+ rack_strike_dupack(rack, th->th_ack);
dup_ack_struck = 1;
}
rack_log_ack(tp, to, th, ((in_rec == 0) && IN_FASTRECOVERY(tp->t_flags)),
@@ -12185,6 +13095,7 @@
if ((rack->sack_attack_disable > 0) &&
(th->th_ack == tp->snd_una) &&
(tiwin == tp->snd_wnd) &&
+ (orig_tlen == 0) &&
(dsack_seen == 0) &&
(sacks_seen > 0)) {
/*
@@ -12197,7 +13108,7 @@
* were we are ignoring sacks from this guy due to
* it being a suspected attacker.
*/
- rack_strike_dupack(rack);
+ rack_strike_dupack(rack, th->th_ack);
}
}
@@ -12306,15 +13217,37 @@
tcp_rack_partialack(tp);
} else {
rack_post_recovery(tp, th->th_ack);
- recovery = 1;
+ post_recovery = 1;
+ /*
+			 * Grab the segsiz, multiply by 2 and add the snd_cwnd;
+			 * that is the max cwnd the CC should leave us with if
+			 * we are exiting recovery and doing a late add.
+ */
+ p_cwnd = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
+ p_cwnd <<= 1;
+ p_cwnd += tp->snd_cwnd;
}
+ } else if ((rack->rto_from_rec == 1) &&
+ SEQ_GEQ(th->th_ack, tp->snd_recover)) {
+ /*
+ * We were in recovery, hit a rxt timeout
+ * and never re-entered recovery. The timeout(s)
+ * made up all the lost data. In such a case
+ * we need to clear the rto_from_rec flag.
+ */
+ rack->rto_from_rec = 0;
}
/*
* Let the congestion control algorithm update congestion control
* related information. This typically means increasing the
* congestion window.
*/
- rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, recovery);
+ rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, post_recovery);
+ if (post_recovery &&
+ (tp->snd_cwnd > p_cwnd)) {
+ /* Must be non-newreno (cubic) getting too ahead of itself */
+ tp->snd_cwnd = p_cwnd;
+ }
SOCKBUF_LOCK(&so->so_snd);
acked_amount = min(acked, (int)sbavail(&so->so_snd));
tp->snd_wnd -= acked_amount;
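/*
 * A small sketch of the post-recovery clamp computed above: snd_cwnd is
 * sampled before the CC sees the exit-recovery ACK and is then allowed
 * to grow by at most two (minimum) segments.  The helper name and types
 * are illustrative; only the arithmetic follows the patch.
 */
#include <stdint.h>

static uint32_t
post_recovery_cwnd_cap(uint32_t cwnd_before_ack, uint32_t maxseg, uint32_t pace_min_segs)
{
	uint32_t seg = (maxseg < pace_min_segs) ? maxseg : pace_min_segs;

	/* e.g. cwnd 100000 and a 1448 byte segment give a cap of 102896 */
	return (cwnd_before_ack + (seg << 1));
}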
@@ -12338,13 +13271,6 @@
rack_log_wakeup(tp,rack, &so->so_snd, acked, 2);
/* NB: sowwakeup_locked() does an implicit unlock. */
sowwakeup_locked(so);
- /* now check the rxt clamps */
- if ((recovery == 1) &&
- (rack->excess_rxt_on) &&
- (rack->r_cwnd_was_clamped == 0)) {
- do_rack_excess_rxt(tp, rack);
- } else if (rack->r_cwnd_was_clamped)
- do_rack_check_for_unclamp(tp, rack);
m_freem(mfree);
if (SEQ_GT(tp->snd_una, tp->snd_recover))
tp->snd_recover = tp->snd_una;
@@ -12363,11 +13289,12 @@
if (tp->snd_una == tp->snd_max) {
/* Nothing left outstanding */
tp->t_flags &= ~TF_PREVVALID;
+ rack->r_ctl.idle_snd_una = tp->snd_una;
rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
- rack->r_ctl.retran_during_recovery = 0;
- rack->r_ctl.dsack_byte_cnt = 0;
if (rack->r_ctl.rc_went_idle_time == 0)
rack->r_ctl.rc_went_idle_time = 1;
+ rack->r_ctl.retran_during_recovery = 0;
+ rack->r_ctl.dsack_byte_cnt = 0;
rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
if (sbavail(&tptosocket(tp)->so_snd) == 0)
tp->t_acktime = 0;
@@ -12562,7 +13489,6 @@
}
}
-
/*
* Return value of 1, the TCB is unlocked and most
* likely gone, return value of 0, the TCP is still
@@ -12713,12 +13639,20 @@
SOCKBUF_LOCK(&so->so_rcv);
if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
m_freem(m);
- } else
+ } else {
+ int32_t newsize;
+
+ if (tlen > 0) {
+ newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
+ if (newsize)
+ if (!sbreserve_locked(so, SO_RCV, newsize, NULL))
+ so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
+ }
#ifdef NETFLIX_SB_LIMITS
appended =
#endif
sbappendstream_locked(&so->so_rcv, m, 0);
-
+ }
rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1);
/* NB: sorwakeup_locked() does an implicit unlock. */
sorwakeup_locked(so);
@@ -12877,9 +13811,6 @@
if (__predict_false(th->th_seq != tp->rcv_nxt)) {
return (0);
}
- if (__predict_false(tp->snd_nxt != tp->snd_max)) {
- return (0);
- }
if (tiwin && tiwin != tp->snd_wnd) {
return (0);
}
@@ -13005,10 +13936,6 @@
/* Above what we have sent? */
return (0);
}
- if (__predict_false(tp->snd_nxt != tp->snd_max)) {
- /* We are retransmitting */
- return (0);
- }
if (__predict_false(tiwin == 0)) {
/* zero window */
return (0);
@@ -13176,6 +14103,7 @@
rack->r_ctl.retran_during_recovery = 0;
rack->rc_suspicious = 0;
rack->r_ctl.dsack_byte_cnt = 0;
+ rack->r_ctl.idle_snd_una = tp->snd_una;
rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
if (rack->r_ctl.rc_went_idle_time == 0)
rack->r_ctl.rc_went_idle_time = 1;
@@ -13203,6 +14131,7 @@
uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
{
int32_t ret_val = 0;
+ int32_t orig_tlen = tlen;
int32_t todrop;
int32_t ourfinisacked = 0;
struct tcp_rack *rack;
@@ -13267,8 +14196,9 @@
*/
if (IS_FASTOPEN(tp->t_flags) &&
(tp->snd_una != tp->snd_max)) {
- tp->snd_nxt = th->th_ack;
- tfo_partial = 1;
+ /* Was it a partial ack? */
+ if (SEQ_LT(th->th_ack, tp->snd_max))
+ tfo_partial = 1;
}
/*
* If there's data, delay ACK; if there's also a FIN ACKNOW
@@ -13299,6 +14229,24 @@
* and there is no send_map.
*/
tp->snd_una++;
+ if (tfo_partial && (SEQ_GT(tp->snd_max, tp->snd_una))) {
+ /*
+ * We sent a SYN with data, and thus have a
+ * sendmap entry with a SYN set. Lets find it
+ * and take off the send bit and the byte and
+ * set it up to be what we send (send it next).
+ */
+ struct rack_sendmap *rsm;
+
+ rsm = tqhash_min(rack->r_ctl.tqh);
+ if (rsm) {
+ if (rsm->r_flags & RACK_HAS_SYN) {
+ rsm->r_flags &= ~RACK_HAS_SYN;
+ rsm->r_start++;
+ }
+ rack->r_ctl.rc_resend = rsm;
+ }
+ }
}
/*
* Received <SYN,ACK> in SYN_SENT[*] state. Transitions:
@@ -13361,7 +14309,7 @@
tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2);
tcp_rack_xmit_timer_commit(rack, tp);
}
- if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val))
+ if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen))
return (ret_val);
/* We may have changed to FIN_WAIT_1 above */
if (tp->t_state == TCPS_FIN_WAIT_1) {
@@ -13407,6 +14355,7 @@
uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
{
struct tcp_rack *rack;
+ int32_t orig_tlen = tlen;
int32_t ret_val = 0;
int32_t ourfinisacked = 0;
@@ -13579,7 +14528,7 @@
tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2);
tcp_rack_xmit_timer_commit(rack, tp);
}
- if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
+ if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) {
return (ret_val);
}
if (tp->t_state == TCPS_FIN_WAIT_1) {
@@ -13624,6 +14573,7 @@
uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
{
int32_t ret_val = 0;
+ int32_t orig_tlen = tlen;
struct tcp_rack *rack;
/*
@@ -13730,7 +14680,7 @@
/*
* Ack processing.
*/
- if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
+ if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val, orig_tlen)) {
return (ret_val);
}
if (sbavail(&so->so_snd)) {
@@ -13756,6 +14706,7 @@
uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
{
int32_t ret_val = 0;
+ int32_t orig_tlen = tlen;
struct tcp_rack *rack;
rack = (struct tcp_rack *)tp->t_fb_ptr;
@@ -13830,7 +14781,7 @@
/*
* Ack processing.
*/
- if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
+ if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val, orig_tlen)) {
return (ret_val);
}
if (sbavail(&so->so_snd)) {
@@ -13884,6 +14835,7 @@
uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
{
int32_t ret_val = 0;
+ int32_t orig_tlen = tlen;
int32_t ourfinisacked = 0;
struct tcp_rack *rack;
@@ -13966,7 +14918,7 @@
/*
* Ack processing.
*/
- if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
+ if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) {
return (ret_val);
}
if (ourfinisacked) {
@@ -14011,6 +14963,7 @@
uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
{
int32_t ret_val = 0;
+ int32_t orig_tlen = tlen;
int32_t ourfinisacked = 0;
struct tcp_rack *rack;
@@ -14093,7 +15046,7 @@
/*
* Ack processing.
*/
- if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
+ if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) {
return (ret_val);
}
if (ourfinisacked) {
@@ -14124,6 +15077,7 @@
uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
{
int32_t ret_val = 0;
+ int32_t orig_tlen;
int32_t ourfinisacked = 0;
struct tcp_rack *rack;
@@ -14152,6 +15106,7 @@
if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
return (ret_val);
}
+ orig_tlen = tlen;
if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
&rack->r_ctl.challenge_ack_ts,
&rack->r_ctl.challenge_ack_cnt)) {
@@ -14206,7 +15161,7 @@
/*
* case TCPS_LAST_ACK: Ack processing.
*/
- if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
+ if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) {
return (ret_val);
}
if (ourfinisacked) {
@@ -14237,6 +15192,7 @@
uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
{
int32_t ret_val = 0;
+ int32_t orig_tlen = tlen;
int32_t ourfinisacked = 0;
struct tcp_rack *rack;
@@ -14320,7 +15276,7 @@
/*
* Ack processing.
*/
- if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
+ if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) {
return (ret_val);
}
if (sbavail(&so->so_snd)) {
@@ -14919,65 +15875,43 @@
}
static void
-rack_translate_clamp_value(struct tcp_rack *rack, uint32_t optval)
+rack_translate_policer_detect(struct tcp_rack *rack, uint32_t optval)
{
/*
- * P = percent bits
- * F = fill cw bit -- Toggle fillcw if this bit is set.
- * S = Segment bits
- * M = set max segment bit
- * U = Unclamined
- * C = If set to non-zero override the max number of clamps.
- * L = Bit to indicate if clamped gets lower.
+	 * P = Percent of retransmits, in tenths of a percent (499 = 49.9%)
+	 * A = Average number, in tenths of a percent (1 = .1% -> 169 = 16.9%)
+	 * M = Median number of retransmits, 1 - 16
+	 * MMMM MMMM AAAA AAAA PPPP PPPP PPPP PPPP
*
- * CCCC CCCCC UUUU UULF PPPP PPPP PPPP PPPP
- *
- * The lowest 3 nibbles is the perentage .1 - 6553.5%
- * where 10.1 = 101, max 6553.5
- * The upper 16 bits holds some options.
- * The F bit will turn on fill-cw on if you are
- * not pacing, it will turn it off if dgp is on.
- * The L bit will change it so when clamped we get
- * the min(gp, lt-bw) for dgp.
*/
- uint16_t per;
+ uint16_t per, upp;
- rack->r_ctl.saved_rxt_clamp_val = optval;
per = optval & 0x0000ffff;
- rack->r_ctl.rxt_threshold = (uint64_t)(per & 0xffff);
- if (optval > 0) {
- uint16_t clamp_opt;
-
- rack->excess_rxt_on = 1;
- clamp_opt = ((optval & 0xffff0000) >> 16);
- rack->r_ctl.clamp_options = clamp_opt & 0x00ff;
- if (clamp_opt & 0xff00) {
- /* A max clamps is also present */
- rack->r_ctl.max_clamps = (clamp_opt >> 8);
- } else {
- /* No specified clamps means no limit */
- rack->r_ctl.max_clamps = 0;
- }
- if (rack->r_ctl.clamp_options & 0x0002) {
- rack->r_clamped_gets_lower = 1;
- } else {
- rack->r_clamped_gets_lower = 0;
- }
+ rack->r_ctl.policer_rxt_threshold = (uint32_t)(per & 0xffff);
+ upp = ((optval & 0xffff0000) >> 16);
+ rack->r_ctl.policer_avg_threshold = (0x00ff & upp);
+ rack->r_ctl.policer_med_threshold = ((upp >> 8) & 0x00ff);
+ if ((rack->r_ctl.policer_rxt_threshold > 0) &&
+ (rack->r_ctl.policer_avg_threshold > 0) &&
+ (rack->r_ctl.policer_med_threshold > 0)) {
+ rack->policer_detect_on = 1;
} else {
- /* Turn it off back to default */
- rack->excess_rxt_on = 0;
- rack->r_clamped_gets_lower = 0;
+ rack->policer_detect_on = 0;
}
-
+ rack->r_ctl.saved_policer_val = optval;
+ policer_detection_log(rack, optval,
+ rack->r_ctl.policer_avg_threshold,
+ rack->r_ctl.policer_med_threshold,
+ rack->r_ctl.policer_rxt_threshold, 11);
}
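/*
 * A sketch of how a TCP_POLICER_DETECT optval might be packed from the
 * three thresholds described above (MMMM MMMM AAAA AAAA PPPP PPPP PPPP
 * PPPP); only the bit layout is taken from the patch, the helper is
 * illustrative.
 */
#include <stdint.h>

static uint32_t
pack_policer_detect(uint8_t med, uint8_t avg_tenths, uint16_t rxt_tenths)
{
	return (((uint32_t)med << 24) |
	    ((uint32_t)avg_tenths << 16) |
	    (uint32_t)rxt_tenths);
}

/*
 * Example: a median of 2 retransmits, a 4.5% average (45) and a 49.9%
 * retransmit threshold (499) pack to 0x022d01f3.
 */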
-
static int32_t
rack_init(struct tcpcb *tp, void **ptr)
{
struct inpcb *inp = tptoinpcb(tp);
struct tcp_rack *rack = NULL;
uint32_t iwin, snt, us_cts;
+ size_t sz;
int err, no_query;
tcp_hpts_init(tp);
@@ -15036,16 +15970,22 @@
rack->rc_new_rnd_needed = 1;
rack->r_ctl.rc_split_limit = V_tcp_map_split_limit;
/* We want abe like behavior as well */
+
rack->r_ctl.rc_saved_beta.newreno_flags |= CC_NEWRENO_BETA_ECN_ENABLED;
rack->r_ctl.rc_reorder_fade = rack_reorder_fade;
rack->rc_allow_data_af_clo = rack_ignore_data_after_close;
rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh;
- if (rack_rxt_clamp_thresh) {
- rack_translate_clamp_value(rack, rack_rxt_clamp_thresh);
- rack->excess_rxt_on = 1;
+ rack->r_ctl.policer_del_mss = rack_req_del_mss;
+ if ((rack_policer_rxt_thresh > 0) &&
+ (rack_policer_avg_thresh > 0) &&
+ (rack_policer_med_thresh > 0)) {
+ rack->r_ctl.policer_rxt_threshold = rack_policer_rxt_thresh;
+ rack->r_ctl.policer_avg_threshold = rack_policer_avg_thresh;
+ rack->r_ctl.policer_med_threshold = rack_policer_med_thresh;
+ rack->policer_detect_on = 1;
+ } else {
+ rack->policer_detect_on = 0;
}
- if (rack_uses_full_dgp_in_rec)
- rack->r_ctl.full_dgp_in_rec = 1;
if (rack_fill_cw_state)
rack->rc_pace_to_cwnd = 1;
if (rack_pacing_min_seg)
@@ -15063,6 +16003,15 @@
if (rack_tcp_accounting) {
tp->t_flags2 |= TF2_TCP_ACCOUNTING;
}
+#endif
+ rack->r_ctl.pcm_i.cnt_alloc = RACK_DEFAULT_PCM_ARRAY;
+ sz = (sizeof(struct rack_pcm_stats) * rack->r_ctl.pcm_i.cnt_alloc);
+ rack->r_ctl.pcm_s = malloc(sz,M_TCPPCM, M_NOWAIT);
+ if (rack->r_ctl.pcm_s == NULL) {
+ rack->r_ctl.pcm_i.cnt_alloc = 0;
+ }
+#ifdef NETFLIX_STATS
+ rack->r_ctl.side_chan_dis_mask = tcp_sidechannel_disable_mask;
#endif
rack->r_ctl.rack_per_upper_bound_ss = (uint8_t)rack_per_upper_bound_ss;
rack->r_ctl.rack_per_upper_bound_ca = (uint8_t)rack_per_upper_bound_ca;
@@ -15070,6 +16019,7 @@
rack->rack_enable_scwnd = 1;
rack->r_ctl.pace_len_divisor = rack_default_pacing_divisor;
rack->rc_user_set_max_segs = rack_hptsi_segments;
+ rack->r_ctl.max_reduction = rack_max_reduce;
rack->rc_force_max_seg = 0;
TAILQ_INIT(&rack->r_ctl.opt_list);
rack->r_ctl.rc_saved_beta.beta = V_newreno_beta_ecn;
@@ -15084,12 +16034,22 @@
} else {
rack->r_ctl.saved_hibeta = 50;
}
+ /*
+	 * We initialize to all ones so we never match 0,
+	 * just in case the client sends in 0; a real timestamp
+	 * in ms will hopefully never be all 1's :-)
+ */
+ rack->r_ctl.last_tm_mark = 0xffffffffffffffff;
rack->r_ctl.rc_reorder_shift = rack_reorder_thresh;
rack->r_ctl.rc_pkt_delay = rack_pkt_delay;
+ rack->r_ctl.pol_bw_comp = rack_policing_do_bw_comp;
rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp;
rack->r_ctl.rc_lowest_us_rtt = 0xffffffff;
rack->r_ctl.rc_highest_us_rtt = 0;
rack->r_ctl.bw_rate_cap = rack_bw_rate_cap;
+ rack->pcm_enabled = rack_pcm_is_enabled;
+ if (rack_fillcw_bw_cap)
+ rack->r_ctl.fillcw_cap = rack_fillcw_bw_cap;
rack->r_ctl.timer_slop = TICKS_2_USEC(tcp_rexmit_slop);
if (rack_use_cmp_acks)
rack->r_use_cmp_ack = 1;
@@ -15098,6 +16058,7 @@
if (rack_gp_no_rec_chg)
rack->rc_gp_no_rec_chg = 1;
if (rack_pace_every_seg && tcp_can_enable_pacing()) {
+ rack->r_ctl.pacing_method |= RACK_REG_PACING;
rack->rc_always_pace = 1;
if (rack->rack_hibeta)
rack_set_cc_pacing(rack);
@@ -15114,13 +16075,31 @@
rack->r_limit_scw = 0;
rack_init_retransmit_value(rack, rack_rxt_controls);
rack->rc_labc = V_tcp_abc_l_var;
+ if (rack_honors_hpts_min_to)
+ rack->r_use_hpts_min = 1;
+ if (tp->snd_una != 0) {
+ rack->r_ctl.idle_snd_una = tp->snd_una;
+ rack->rc_sendvars_notset = 0;
+ /*
+ * Make sure any TCP timers are not running.
+ */
+ tcp_timer_stop(tp);
+ } else {
+ /*
+ * Server side, we are called from the
+ * syn-cache. This means none of the
+ * snd_una/max are set yet so we have
+ * to defer this until the first send.
+ */
+ rack->rc_sendvars_notset = 1;
+ }
+
rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method;
rack->rack_tlp_threshold_use = rack_tlp_threshold_use;
rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr;
rack->r_ctl.rc_min_to = rack_min_to;
microuptime(&rack->r_ctl.act_rcv_time);
rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time;
- rack->rc_init_win = rack_default_init_window;
rack->r_ctl.rack_per_of_gp_ss = rack_per_of_gp_ss;
if (rack_hw_up_only)
rack->r_up_only = 1;
@@ -15132,15 +16111,34 @@
} else
rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca;
rack->r_ctl.rack_per_of_gp_rec = rack_per_of_gp_rec;
+ if (rack_timely_off) {
+ rack->rc_skip_timely = 1;
+ }
+ if (rack->rc_skip_timely) {
+ rack->r_ctl.rack_per_of_gp_rec = 90;
+ rack->r_ctl.rack_per_of_gp_ca = 100;
+ rack->r_ctl.rack_per_of_gp_ss = 250;
+ }
rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt;
rack->r_ctl.rc_tlp_rxt_last_time = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time);
+ rack->r_ctl.last_rcv_tstmp_for_rtt = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time);
+
setup_time_filter_small(&rack->r_ctl.rc_gp_min_rtt, FILTER_TYPE_MIN,
rack_probertt_filter_life);
us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
rack->r_ctl.rc_time_of_last_probertt = us_cts;
- rack->r_ctl.challenge_ack_ts = tcp_ts_getticks();
+ rack->r_ctl.rc_went_idle_time = us_cts;
+ rack->r_ctl.challenge_ack_ts = tcp_ts_getticks() - (tcp_ack_war_time_window + 1);
rack->r_ctl.rc_time_probertt_starts = 0;
+
+ rack->r_ctl.gp_rnd_thresh = rack_rnd_cnt_req & 0xff;
+ if (rack_rnd_cnt_req & 0x10000)
+ rack->r_ctl.gate_to_fs = 1;
+ rack->r_ctl.gp_gain_req = rack_gp_gain_req;
+ if ((rack_rnd_cnt_req & 0x100) > 0) {
+
+ }
if (rack_dsack_std_based & 0x1) {
/* Basically this means all rack timers are at least (srtt + 1/4 srtt) */
rack->rc_rack_tmr_std_based = 1;
@@ -15449,10 +16447,8 @@
rack->r_ctl.fsb.tcp_ip_hdr = NULL;
rack->r_ctl.fsb.th = NULL;
}
- if (rack->rc_always_pace) {
- tcp_decrement_paced_conn();
- rack_undo_cc_pacing(rack);
- rack->rc_always_pace = 0;
+ if (rack->rc_always_pace == 1) {
+ rack_remove_pacing(rack);
}
/* Clean up any options if they were not applied */
while (!TAILQ_EMPTY(&rack->r_ctl.opt_list)) {
@@ -15492,6 +16488,12 @@
uma_zfree(rack_zone, rsm);
rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
}
+ if (rack->r_ctl.pcm_s != NULL) {
+ free(rack->r_ctl.pcm_s, M_TCPPCM);
+ rack->r_ctl.pcm_s = NULL;
+ rack->r_ctl.pcm_i.cnt_alloc = 0;
+ rack->r_ctl.pcm_i.cnt = 0;
+ }
if ((rack->r_ctl.rc_num_maps_alloced > 0) &&
(tcp_bblogging_on(tp))) {
union tcp_log_stackspecific log;
@@ -15593,6 +16595,16 @@
int tmr_up;
tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
+ if (tcp_in_hpts(rack->rc_tp) == 0) {
+ /*
+ * Ok we probably need some timer up, but no
+ * matter what the mask we are not in hpts. We
+ * may have received an old ack and thus did nothing.
+ */
+ rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
+ rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0);
+ return;
+ }
if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT))
return;
rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
@@ -15916,6 +16928,134 @@
}
}
+static void
+rack_new_round_starts(struct tcpcb *tp, struct tcp_rack *rack, uint32_t high_seq)
+{
+ /*
+	 * The next send has occurred; mark the end of the round
+ * as when that data gets acknowledged. We can
+ * also do common things we might need to do when
+ * a round begins.
+ */
+ rack->r_ctl.roundends = tp->snd_max;
+ rack->rc_new_rnd_needed = 0;
+ rack_log_hystart_event(rack, tp->snd_max, 4);
+}
+
+
+static void
+rack_log_pcm(struct tcp_rack *rack, uint8_t mod, uint32_t flex1, uint32_t flex2,
+ uint32_t flex3)
+{
+ if (tcp_bblogging_on(rack->rc_tp)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ (void)tcp_get_usecs(&tv);
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.timeStamp = tcp_tv_to_usectick(&tv);
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ log.u_bbr.flex8 = mod;
+ log.u_bbr.flex1 = flex1;
+ log.u_bbr.flex2 = flex2;
+ log.u_bbr.flex3 = flex3;
+ log.u_bbr.flex4 = rack_pcm_every_n_rounds;
+ log.u_bbr.flex5 = rack->r_ctl.pcm_idle_rounds;
+ log.u_bbr.bbr_substate = rack->pcm_needed;
+ log.u_bbr.bbr_substate <<= 1;
+ log.u_bbr.bbr_substate |= rack->pcm_in_progress;
+ log.u_bbr.bbr_substate <<= 1;
+ log.u_bbr.bbr_substate |= rack->pcm_enabled; /* bits are NIE for Needed, Inprogress, Enabled */
+ (void)tcp_log_event(rack->rc_tp, NULL, NULL, NULL, TCP_PCM_MEASURE, ERRNO_UNK,
+ 0, &log, false, NULL, NULL, 0, &tv);
+ }
+}
+
+static void
+rack_new_round_setup(struct tcpcb *tp, struct tcp_rack *rack, uint32_t high_seq)
+{
+ /*
+ * The round (current_round) has ended. We now
+	 * set up for the next round by incrementing the
+	 * round number and doing any round-specific
+	 * things.
+ */
+ rack_log_hystart_event(rack, high_seq, 21);
+ rack->r_ctl.current_round++;
+ /* New round (current_round) begins at next send */
+ rack->rc_new_rnd_needed = 1;
+ if ((rack->pcm_enabled == 1) &&
+ (rack->pcm_needed == 0) &&
+ (rack->pcm_in_progress == 0)) {
+ /*
+ * If we have enabled PCM, then we need to
+		 * check if the round has advanced to the state
+ * where one is required.
+ */
+ int rnds;
+
+ rnds = rack->r_ctl.current_round - rack->r_ctl.last_pcm_round;
+ if ((rnds + rack->r_ctl.pcm_idle_rounds) >= rack_pcm_every_n_rounds) {
+ rack->pcm_needed = 1;
+ rack_log_pcm(rack, 3, rack->r_ctl.last_pcm_round, rack_pcm_every_n_rounds, rack->r_ctl.current_round );
+ } else if (rack_verbose_logging) {
+ rack_log_pcm(rack, 3, rack->r_ctl.last_pcm_round, rack_pcm_every_n_rounds, rack->r_ctl.current_round );
+ }
+ }
+ if (tp->t_ccv.flags & CCF_HYSTART_ALLOWED) {
+ /* We have hystart enabled send the round info in */
+ if (CC_ALGO(tp)->newround != NULL) {
+ CC_ALGO(tp)->newround(&tp->t_ccv, rack->r_ctl.current_round);
+ }
+ }
+ /*
+	 * For DGP, an initial startup check. We want to validate
+	 * that we are not just pushing on in slow-start and
+	 * not gaining, i.e. filling buffers without getting any
+	 * boost in b/w during the initial slow-start.
+ */
+ if (rack->dgp_on &&
+ (rack->rc_initial_ss_comp == 0) &&
+ (tp->snd_cwnd < tp->snd_ssthresh) &&
+ (rack->r_ctl.num_measurements >= RACK_REQ_AVG) &&
+ (rack->r_ctl.gp_rnd_thresh > 0) &&
+ ((rack->r_ctl.current_round - rack->r_ctl.last_rnd_of_gp_rise) >= rack->r_ctl.gp_rnd_thresh)) {
+
+ /*
+		 * We are in the initial SS and we have had rack_rnd_cnt_req rounds (def: 5) where
+		 * we have not gained the required amount in the gp_est (120.0% aka 1200). Let's
+ * exit SS.
+ *
+ * Pick up the flight size now as we enter slowstart (not the
+ * cwnd which may be inflated).
+ */
+ rack->rc_initial_ss_comp = 1;
+
+ if (tcp_bblogging_on(rack->rc_tp)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.flex1 = rack->r_ctl.current_round;
+ log.u_bbr.flex2 = rack->r_ctl.last_rnd_of_gp_rise;
+ log.u_bbr.flex3 = rack->r_ctl.gp_rnd_thresh;
+ log.u_bbr.flex5 = rack->r_ctl.gate_to_fs;
+ log.u_bbr.flex5 = rack->r_ctl.ss_hi_fs;
+ log.u_bbr.flex8 = 40;
+ (void)tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
+ 0, &log, false, NULL, __func__, __LINE__,&tv);
+ }
+ if ((rack->r_ctl.gate_to_fs == 1) &&
+ (tp->snd_cwnd > rack->r_ctl.ss_hi_fs)) {
+ tp->snd_cwnd = rack->r_ctl.ss_hi_fs;
+ }
+ tp->snd_ssthresh = tp->snd_cwnd - 1;
+ /* Turn off any fast output running */
+ rack->r_fast_output = 0;
+ }
+}
+
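/*
 * A minimal sketch of the round check above that arms a path capacity
 * measurement (PCM): once the rounds since the last measurement, plus
 * any idle-round credit, reach the configured period, a PCM is flagged
 * as needed.  The helper is illustrative; only the comparison mirrors
 * the patch.
 */
#include <stdint.h>

static int
pcm_is_due(uint32_t current_round, uint32_t last_pcm_round,
    uint32_t idle_rounds, uint32_t every_n_rounds)
{
	uint32_t rnds = current_round - last_pcm_round;

	return ((rnds + idle_rounds) >= every_n_rounds);
}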
static int
rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mbuf *m, int nxt_pkt, struct timeval *tv)
{
@@ -15949,7 +17089,7 @@
#endif
int nsegs = 0;
int under_pacing = 0;
- int recovery = 0;
+ int post_recovery = 0;
#ifdef TCP_ACCOUNTING
sched_pin();
#endif
@@ -16122,7 +17262,7 @@
}
} else if (ae->ack_val_set == ACK_DUPACK) {
/* Case D */
- rack_strike_dupack(rack);
+ rack_strike_dupack(rack, ae->ack);
} else if (ae->ack_val_set == ACK_RWND) {
/* Case C */
if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) {
@@ -16172,8 +17312,6 @@
}
#endif
high_seq = ae->ack;
- if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp))
- rack_log_hystart_event(rack, high_seq, 8);
/* Setup our act_rcv_time */
if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) {
ts.tv_sec = ae->timestamp / 1000000000;
@@ -16239,13 +17377,11 @@
if (SEQ_GEQ(high_seq, rack->r_ctl.roundends) &&
(rack->rc_new_rnd_needed == 0) &&
(nxt_pkt == 0)) {
- rack_log_hystart_event(rack, high_seq, 21);
- rack->r_ctl.current_round++;
- /* Force the next send to setup the next round */
- rack->rc_new_rnd_needed = 1;
- if (CC_ALGO(tp)->newround != NULL) {
- CC_ALGO(tp)->newround(&tp->t_ccv, rack->r_ctl.current_round);
- }
+ /*
+ * We have crossed into a new round with
+ * this th_ack value.
+ */
+ rack_new_round_setup(tp, rack, high_seq);
}
/*
* Clear the probe not answered flag
@@ -16306,8 +17442,17 @@
tcp_rack_partialack(tp);
} else {
rack_post_recovery(tp, high_seq);
- recovery = 1;
+ post_recovery = 1;
}
+ } else if ((rack->rto_from_rec == 1) &&
+ SEQ_GEQ(high_seq, tp->snd_recover)) {
+ /*
+ * We were in recovery, hit a rxt timeout
+ * and never re-entered recovery. The timeout(s)
+ * made up all the lost data. In such a case
+ * we need to clear the rto_from_rec flag.
+ */
+ rack->rto_from_rec = 0;
}
/* Handle the rack-log-ack part (sendmap) */
if ((sbused(&so->so_snd) == 0) &&
@@ -16340,9 +17485,24 @@
KMOD_TCPSTAT_ADD(tcps_rcvackpack, 1);
KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
if (acked_amount > 0) {
+ uint32_t p_cwnd;
struct mbuf *mfree;
- rack_ack_received(tp, rack, high_seq, nsegs, CC_ACK, recovery);
+ if (post_recovery) {
+ /*
+			 * Grab the segsiz, multiply by 2 and add the snd_cwnd;
+			 * that is the max cwnd the CC should leave us with if
+			 * we are exiting recovery and doing a late add.
+ */
+ p_cwnd = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
+ p_cwnd <<= 1;
+ p_cwnd += tp->snd_cwnd;
+ }
+ rack_ack_received(tp, rack, high_seq, nsegs, CC_ACK, post_recovery);
+ if (post_recovery && (tp->snd_cwnd > p_cwnd)) {
+ /* Must be non-newreno (cubic) getting too ahead of itself */
+ tp->snd_cwnd = p_cwnd;
+ }
SOCKBUF_LOCK(&so->so_snd);
mfree = sbcut_locked(&so->so_snd, acked_amount);
tp->snd_una = high_seq;
@@ -16351,12 +17511,6 @@
/* Wake up the socket if we have room to write more */
rack_log_wakeup(tp,rack, &so->so_snd, acked, 2);
sowwakeup_locked(so);
- if ((recovery == 1) &&
- (rack->excess_rxt_on) &&
- (rack->r_cwnd_was_clamped == 0)) {
- do_rack_excess_rxt(tp, rack);
- } else if (rack->r_cwnd_was_clamped)
- do_rack_check_for_unclamp(tp, rack);
m_freem(mfree);
}
/* update progress */
@@ -16587,7 +17741,9 @@
}
rack_handle_might_revert(tp, rack);
ctf_calc_rwin(so, tp);
- if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) {
+ if ((rack->r_wanted_output != 0) ||
+ (rack->r_fast_output != 0) ||
+ (tp->t_flags & TF_ACKNOW )) {
send_out_a_rst:
if (tcp_output(tp) < 0) {
#ifdef TCP_ACCOUNTING
@@ -16630,7 +17786,7 @@
* us_cts - is the time that LRO or hardware actually got the packet in microseconds.
*/
uint32_t cts, us_cts, ms_cts;
- uint32_t tiwin, high_seq;
+ uint32_t tiwin;
struct timespec ts;
struct tcpopt to;
struct tcp_rack *rack;
@@ -16818,7 +17974,6 @@
tp->t_flags &= ~TF_GPUTINPROG;
}
}
- high_seq = th->th_ack;
if (tcp_bblogging_on(rack->rc_tp)) {
union tcp_log_stackspecific log;
struct timeval ltv;
@@ -16938,7 +18093,6 @@
m_freem(m);
goto done_with_input;
}
-
/*
* Segment received on connection. Reset idle time and keep-alive
* timer. XXX: This should be done after segment validation to
@@ -16975,7 +18129,28 @@
if (TSTMP_GT(to.to_tsecr, ms_cts))
to.to_tsecr = 0;
}
-
+ if ((rack->r_rcvpath_rtt_up == 1) &&
+ (to.to_flags & TOF_TS) &&
+ (TSTMP_GEQ(to.to_tsecr, rack->r_ctl.last_rcv_tstmp_for_rtt))) {
+ uint32_t rtt = 0;
+
+ /*
+ * We are receiving only and thus not sending
+ * data to do an RTT. We set a flag when we first
+ * sent this TS to the peer. We now have it back
+ * and have an RTT to share. We log it as a conf
+		 * and have an RTT to share. We log it with a
+		 * confidence of 4 since we are not so sure about
+		 * it; we may have lost an ack.
+ if (TSTMP_GT(cts, rack->r_ctl.last_time_of_arm_rcv))
+ rtt = (cts - rack->r_ctl.last_time_of_arm_rcv);
+ rack->r_rcvpath_rtt_up = 0;
+ /* Submit and commit the timer */
+ if (rtt > 0) {
+ tcp_rack_xmit_timer(rack, rtt, 0, rtt, 4, NULL, 1);
+ tcp_rack_xmit_timer_commit(rack, tp);
+ }
+ }
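/*
 * A sketch of the receive-side RTT sample taken above, assuming
 * wrap-safe timestamp comparisons: once a previously armed timestamp is
 * echoed back, the RTT is simply now minus the time we armed it.  The
 * helper is illustrative.
 */
#include <stdint.h>

static uint32_t
rcvpath_rtt_sample(uint32_t now, uint32_t armed_at,
    uint32_t tsecr, uint32_t armed_tsval)
{
	if ((int32_t)(tsecr - armed_tsval) < 0)
		return (0);	/* echo is older than the armed timestamp */
	if ((int32_t)(now - armed_at) <= 0)
		return (0);	/* clock did not advance, no usable sample */
	return (now - armed_at);
}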
/*
* If its the first time in we need to take care of options and
* verify we can do SACK for rack!
@@ -17069,7 +18244,7 @@
(rack->use_fixed_rate == 0) &&
(rack->rc_always_pace)) {
/* Check in on probertt */
- rack_check_probe_rtt(rack, us_cts);
+ rack_check_probe_rtt(rack, cts);
}
rack_clear_rate_sample(rack);
if ((rack->forced_ack) &&
@@ -17113,7 +18288,7 @@
* If we are going for target, lets recheck before
* we output.
*/
- rack_check_probe_rtt(rack, us_cts);
+ rack_check_probe_rtt(rack, cts);
}
if (rack->set_pacing_done_a_iw == 0) {
/* How much has been acked? */
@@ -17144,7 +18319,10 @@
}
#endif
if ((nxt_pkt == 0) && (no_output == 0)) {
- if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) {
+ if ((rack->r_wanted_output != 0) ||
+ (tp->t_flags & TF_ACKNOW) ||
+ (rack->r_fast_output != 0)) {
+
do_output_now:
if (tcp_output(tp) < 0) {
#ifdef TCP_ACCOUNTING
@@ -17156,6 +18334,8 @@
}
rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
rack_free_trim(rack);
+ } else if ((nxt_pkt == 0) && (tp->t_flags & TF_ACKNOW)) {
+ goto do_output_now;
} else if ((no_output == 1) &&
(nxt_pkt == 0) &&
(tcp_in_hpts(rack->rc_tp) == 0)) {
@@ -17170,9 +18350,6 @@
/* Clear the flag, it may have been cleared by output but we may not have */
if ((nxt_pkt == 0) && (tp->t_flags2 & TF2_HPTS_CALLS))
tp->t_flags2 &= ~TF2_HPTS_CALLS;
- /* Update any rounds needed */
- if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp))
- rack_log_hystart_event(rack, high_seq, 8);
/*
* The draft (v3) calls for us to use SEQ_GEQ, but that
* causes issues when we are just going app limited. Lets
@@ -17186,13 +18363,11 @@
if (SEQ_GEQ(tp->snd_una, rack->r_ctl.roundends) &&
(rack->rc_new_rnd_needed == 0) &&
(nxt_pkt == 0)) {
- rack_log_hystart_event(rack, tp->snd_una, 21);
- rack->r_ctl.current_round++;
- /* Force the next send to setup the next round */
- rack->rc_new_rnd_needed = 1;
- if (CC_ALGO(tp)->newround != NULL) {
- CC_ALGO(tp)->newround(&tp->t_ccv, rack->r_ctl.current_round);
- }
+ /*
+ * We have crossed into a new round with
+			 * the new snd_una.
+ */
+ rack_new_round_setup(tp, rack, tp->snd_una);
}
if ((nxt_pkt == 0) &&
((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) &&
@@ -17242,6 +18417,7 @@
if (did_out)
rack->r_wanted_output = 0;
}
+
#ifdef TCP_ACCOUNTING
sched_unpin();
#endif
@@ -17325,7 +18501,7 @@
srtt = rack_grab_rtt(tp, rack);
idx = rsm->r_rtr_cnt - 1;
ts_low = (uint32_t)rsm->r_tim_lastsent[idx];
- thresh = rack_calc_thresh_rack(rack, srtt, tsused);
+ thresh = rack_calc_thresh_rack(rack, srtt, tsused, __LINE__, 1);
if ((tsused == ts_low) ||
(TSTMP_LT(tsused, ts_low))) {
/* No time since sending */
@@ -17354,7 +18530,7 @@
}
static void
-rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot,
+rack_log_pacing_delay_calc (struct tcp_rack *rack, uint32_t len, uint32_t slot,
uint64_t bw_est, uint64_t bw, uint64_t len_time, int method,
int line, struct rack_sendmap *rsm, uint8_t quality)
{
@@ -17370,6 +18546,7 @@
if ((method != 2) &&
(method != 3) &&
(method != 7) &&
+ (method != 89) &&
(method != 14) &&
(method != 20)) {
return;
@@ -17429,12 +18606,8 @@
log.u_bbr.bbr_substate = quality;
log.u_bbr.bbr_state = rack->dgp_on;
log.u_bbr.bbr_state <<= 1;
- log.u_bbr.bbr_state |= rack->r_fill_less_agg;
- log.u_bbr.bbr_state <<= 1;
log.u_bbr.bbr_state |= rack->rc_pace_to_cwnd;
log.u_bbr.bbr_state <<= 2;
- log.u_bbr.bbr_state |= rack->r_pacing_discount;
- log.u_bbr.flex7 = ((rack->r_ctl.pacing_discount_amm << 1) | log.u_bbr.flex7);
TCP_LOG_EVENTP(rack->rc_tp, NULL,
&rack->rc_inp->inp_socket->so_rcv,
&rack->rc_inp->inp_socket->so_snd,
@@ -17537,7 +18710,6 @@
{
uint64_t lentim, fill_bw;
- /* Lets first see if we are full, if so continue with normal rate */
rack->r_via_fill_cw = 0;
if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use)
return (slot);
@@ -17551,6 +18723,8 @@
/* The rtt is huge, N * smallest, lets not fill */
return (slot);
}
+ if (rack->r_ctl.fillcw_cap && *rate_wanted >= rack->r_ctl.fillcw_cap)
+ return (slot);
/*
* first lets calculate the b/w based on the last us-rtt
* and the the smallest send window.
@@ -17570,26 +18744,47 @@
/* Now lets make it into a b/w */
fill_bw *= (uint64_t)HPTS_USEC_IN_SEC;
fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt;
+ /* Adjust to any cap */
+ if (rack->r_ctl.fillcw_cap && fill_bw >= rack->r_ctl.fillcw_cap)
+ fill_bw = rack->r_ctl.fillcw_cap;
+
at_lt_bw:
- if (rack->r_fill_less_agg) {
+ if (rack_bw_multipler > 0) {
/*
- * We want the average of the rate_wanted
- * and our fill-cw calculated bw. We also want
- * to cap any increase to be no more than
- * X times the lt_bw (where X is the rack_bw_multipler).
+		 * We want to limit fill-cw to some multiplier
+		 * of max(lt_bw, gp_est). The default is 0 (off),
+		 * so a sysctl has enabled it.
*/
- uint64_t lt_bw, rate;
+ uint64_t lt_bw, gp, rate;
+ gp = rack_get_gp_est(rack);
lt_bw = rack_get_lt_bw(rack);
- if (lt_bw > *rate_wanted)
+ if (lt_bw > gp)
rate = lt_bw;
else
- rate = *rate_wanted;
- fill_bw += rate;
- fill_bw /= 2;
- if (rack_bw_multipler && (fill_bw > (rate * rack_bw_multipler))) {
- fill_bw = rate * rack_bw_multipler;
- }
+ rate = gp;
+ rate *= rack_bw_multipler;
+ rate /= 100;
+ if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.flex1 = rack_bw_multipler;
+ log.u_bbr.flex2 = len;
+ log.u_bbr.cur_del_rate = gp;
+ log.u_bbr.delRate = lt_bw;
+ log.u_bbr.bw_inuse = rate;
+ log.u_bbr.rttProp = fill_bw;
+ log.u_bbr.flex8 = 44;
+ tcp_log_event(rack->rc_tp, NULL, NULL, NULL,
+ BBR_LOG_CWND, 0,
+ 0, &log, false, NULL,
+ __func__, __LINE__, &tv);
+ }
+ if (fill_bw > rate)
+ fill_bw = rate;
}
/* We are below the min b/w */
if (non_paced)
@@ -17638,9 +18833,8 @@
}
}
if (rack->r_ctl.bw_rate_cap && (fill_bw > rack->r_ctl.bw_rate_cap)) {
- if (rack->rc_hybrid_mode)
- rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
- fill_bw, 0, 0, HYBRID_LOG_RATE_CAP, 2, NULL, __LINE__);
+ rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
+ fill_bw, 0, 0, HYBRID_LOG_RATE_CAP, 2, NULL, __LINE__);
fill_bw = rack->r_ctl.bw_rate_cap;
}
/*
@@ -17659,11 +18853,121 @@
return (slot);
}
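/*
 * A sketch of how the fill-cw rate above ends up bounded, assuming a
 * window in bytes, an RTT in microseconds and rates in bytes/second:
 * the raw rate is window over RTT, optionally clipped to a percentage
 * of max(lt_bw, gp_est) and to any explicit fill-cw cap.  Names are
 * illustrative; only the formulas follow the patch.
 */
#include <stdint.h>

static uint64_t
bounded_fill_cw_rate(uint64_t window_bytes, uint64_t rtt_usec,
    uint64_t lt_bw, uint64_t gp_est, uint64_t multiplier_pct,
    uint64_t fillcw_cap)
{
	uint64_t fill_bw;

	if (rtt_usec == 0)
		rtt_usec = 1;
	fill_bw = (window_bytes * 1000000ULL) / rtt_usec;
	if (multiplier_pct > 0) {
		uint64_t base = (lt_bw > gp_est) ? lt_bw : gp_est;
		uint64_t limit = (base * multiplier_pct) / 100;

		if (fill_bw > limit)
			fill_bw = limit;
	}
	if (fillcw_cap && fill_bw > fillcw_cap)
		fill_bw = fillcw_cap;
	return (fill_bw);
}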
-static int32_t
-rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz)
+static uint32_t
+rack_policer_check_send(struct tcp_rack *rack, uint32_t len, uint32_t segsiz, uint32_t *needs)
{
- uint64_t srtt;
- int32_t slot = 0;
+ uint64_t calc;
+
+ rack->rc_policer_should_pace = 0;
+ calc = rack_policer_bucket_reserve * rack->r_ctl.policer_bucket_size;
+ calc /= 100;
+ /*
+	 * Now let's look at whether the reserve already exceeds what is in the
+	 * bucket <or> we want more than is available above the reserve.
+ */
+ if (rack_verbose_logging > 0)
+ policer_detection_log(rack, len, segsiz, calc, rack->r_ctl.current_policer_bucket, 8);
+ if ((calc > rack->r_ctl.current_policer_bucket) ||
+ (len >= (rack->r_ctl.current_policer_bucket - calc))) {
+ /*
+ * We may want to pace depending on if we are going
+ * into the reserve or not.
+ */
+ uint32_t newlen;
+
+ if (calc > rack->r_ctl.current_policer_bucket) {
+ /*
+ * This will eat into the reserve if we
+			 * This will eat into the reserve; if we
+			 * don't have room at all, the check some
+			 * lines below will catch it.
+ newlen = rack->r_ctl.policer_max_seg;
+ rack->rc_policer_should_pace = 1;
+ } else {
+ /*
+ * We have all of the reserve plus something in the bucket
+ * that we can give out.
+ */
+ newlen = rack->r_ctl.current_policer_bucket - calc;
+ if (newlen < rack->r_ctl.policer_max_seg) {
+ /*
+				 * Dip into the reserve to get a full policer_max_seg:
+				 * we set the len to that and eat into
+				 * the reserve. If we go over, the code
+				 * below will make us wait.
+ */
+ newlen = rack->r_ctl.policer_max_seg;
+ rack->rc_policer_should_pace = 1;
+ }
+ }
+ if (newlen > rack->r_ctl.current_policer_bucket) {
+ /* We have to wait some */
+ *needs = newlen - rack->r_ctl.current_policer_bucket;
+ return (0);
+ }
+ if (rack_verbose_logging > 0)
+ policer_detection_log(rack, len, segsiz, newlen, 0, 9);
+ len = newlen;
+ } /* else we have all len available above the reserve */
+ if (rack_verbose_logging > 0)
+ policer_detection_log(rack, len, segsiz, calc, 0, 10);
+ return (len);
+}
+
+static uint32_t
+rack_policed_sending(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, uint32_t segsiz, int call_line)
+{
+ /*
+	 * Given a send of len, and a token bucket currently holding
+	 * current_policer_bucket bytes, are we close enough to the end of the
+	 * bucket that we need to pace? If so, calculate a time and return it.
+	 * Otherwise subtract the tokens from the bucket.
+ */
+ uint64_t calc;
+
+ if ((rack->r_ctl.policer_bw == 0) ||
+ (rack->r_ctl.policer_bucket_size < segsiz)) {
+ /*
+ * We should have an estimate here...
+ */
+ return (0);
+ }
+ calc = (uint64_t)rack_policer_bucket_reserve * (uint64_t)rack->r_ctl.policer_bucket_size;
+ calc /= 100;
+ if ((rack->r_ctl.current_policer_bucket < len) ||
+ (rack->rc_policer_should_pace == 1) ||
+ ((rack->r_ctl.current_policer_bucket - len) <= (uint32_t)calc)) {
+ /* we need to pace */
+ uint64_t lentim, res;
+ uint32_t slot;
+
+ lentim = (uint64_t)len * (uint64_t)HPTS_USEC_IN_SEC;
+ res = lentim / rack->r_ctl.policer_bw;
+ slot = (uint32_t)res;
+ if (rack->r_ctl.current_policer_bucket > len)
+ rack->r_ctl.current_policer_bucket -= len;
+ else
+ rack->r_ctl.current_policer_bucket = 0;
+ policer_detection_log(rack, len, slot, (uint32_t)rack_policer_bucket_reserve, call_line, 5);
+ rack->rc_policer_should_pace = 0;
+ return(slot);
+ }
+ /* Just take tokens out of the bucket and let rack do whatever it would have */
+ policer_detection_log(rack, len, 0, (uint32_t)rack_policer_bucket_reserve, call_line, 6);
+ if (len < rack->r_ctl.current_policer_bucket) {
+ rack->r_ctl.current_policer_bucket -= len;
+ } else {
+ rack->r_ctl.current_policer_bucket = 0;
+ }
+ return (0);
+}
+
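/*
 * A standalone sketch of the policer token bucket used above, assuming
 * a bucket measured in bytes, a policer rate in bytes/second and a
 * reserve expressed as a percentage of the bucket size.  Only the
 * arithmetic mirrors the patch; the struct and helper are illustrative.
 */
#include <stdint.h>

struct policer_bucket {
	uint64_t bw;		/* estimated policer rate, bytes/sec */
	uint32_t size;		/* bucket size in bytes */
	uint32_t tokens;	/* bytes currently available */
	uint32_t reserve_pct;	/* share of the bucket held in reserve */
};

/* Returns a pacing delay in microseconds, or 0 if no pacing is needed. */
static uint32_t
policed_send(struct policer_bucket *pb, uint32_t len)
{
	uint64_t reserve = ((uint64_t)pb->reserve_pct * pb->size) / 100;
	uint32_t slot = 0;

	if (pb->bw == 0)
		return (0);	/* no estimate yet, nothing to enforce */
	if ((pb->tokens < len) ||
	    ((uint64_t)(pb->tokens - len) <= reserve))
		slot = (uint32_t)(((uint64_t)len * 1000000ULL) / pb->bw);
	pb->tokens = (pb->tokens > len) ? (pb->tokens - len) : 0;
	return (slot);
}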
+
+static int32_t
+rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz, int line)
+{
+ uint64_t srtt;
+ int32_t slot = 0;
int32_t minslot = 0;
int can_start_hw_pacing = 1;
int err;
@@ -17674,6 +18978,25 @@
pace_one = 1;
else
pace_one = 0;
+ if (rack->rc_policer_detected == 1) {
+ /*
+ * A policer has been detected and we
+ * have all of our data (policer-bw and
+ * policer bucket size) calculated. Call
+ * into the function to find out if we are
+ * overriding the time.
+ */
+ slot = rack_policed_sending(rack, tp, len, segsiz, line);
+ if (slot) {
+ uint64_t logbw;
+
+ logbw = rack->r_ctl.current_policer_bucket;
+ logbw <<= 32;
+ logbw |= rack->r_ctl.policer_bucket_size;
+ rack_log_pacing_delay_calc(rack, len, slot, rack->r_ctl.policer_bw, logbw, 0, 89, __LINE__, NULL, 0);
+ return(slot);
+ }
+ }
if (rack->rc_always_pace == 0) {
/*
* We use the most optimistic possible cwnd/srtt for
@@ -18214,6 +19537,16 @@
rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[0];
tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack);
rack->r_ctl.rc_gp_cumack_ts = 0;
+ if ((rack->r_ctl.cleared_app_ack == 1) &&
+ (SEQ_GEQ(rack->r_ctl.cleared_app_ack, tp->gput_seq))) {
+ /*
+ * We just cleared an application limited period
+ * so the next seq out needs to skip the first
+ * ack.
+ */
+ rack->app_limited_needs_set = 1;
+ rack->r_ctl.cleared_app_ack = 0;
+ }
rack_log_pacing_delay_calc(rack,
tp->gput_seq,
tp->gput_ack,
@@ -19132,7 +20465,6 @@
rack->r_late = 0;
rack->r_ctl.rc_agg_early = 0;
}
-
rack_log_output(tp, &to, len, rsm->r_start, flags, error, rack_to_usec_ts(tv),
rsm, RACK_SENT_FP, rsm->m, rsm->soff, rsm->r_hw_tls, segsiz);
if (doing_tlp) {
@@ -19189,17 +20521,8 @@
tcp_rl_log_enobuf(rack->r_ctl.crte);
}
counter_u64_add(rack_saw_enobuf, 1);
- } else
- slot = rack_get_pacing_delay(rack, tp, len, NULL, segsiz);
- if ((slot == 0) ||
- (rack->rc_always_pace == 0) ||
- (rack->r_rr_config == 1)) {
- /*
- * We have no pacing set or we
- * are using old-style rack or
- * we are overridden to use the old 1ms pacing.
- */
- slot = rack->r_ctl.rc_min_to;
+ } else {
+ slot = rack_get_pacing_delay(rack, tp, len, NULL, segsiz, __LINE__);
}
rack_start_hpts_timer(rack, tp, cts, slot, len, 0);
#ifdef TCP_ACCOUNTING
@@ -19261,7 +20584,7 @@
(so->so_snd.sb_hiwat / 8 * 7) &&
sbused(&so->so_snd) < V_tcp_autosndbuf_max &&
sendwin >= (sbused(&so->so_snd) -
- (tp->snd_nxt - tp->snd_una))) {
+ (tp->snd_max - tp->snd_una))) {
if (rack_autosndbuf_inc)
scaleup = (rack_autosndbuf_inc * so->so_snd.sb_hiwat) / 100;
else
@@ -19313,7 +20636,7 @@
uint32_t s_soff;
uint32_t if_hw_tsomaxsegcount = 0, startseq;
uint32_t if_hw_tsomaxsegsize;
- uint16_t add_flag = RACK_SENT_FP;
+ uint32_t add_flag = RACK_SENT_FP;
#ifdef INET6
struct ip6_hdr *ip6 = NULL;
@@ -19680,6 +21003,22 @@
rack->r_ctl.lt_timemark = tcp_tv_to_lusectick(tv);
rack->r_ctl.lt_seq = tp->snd_una;
rack->lt_bw_up = 1;
+ } else if ((error == 0) &&
+ (((tp->snd_max + len) - rack->r_ctl.lt_seq) > 0x7fffffff)) {
+ /*
+ * Need to record what we have since we are
+ * approaching seq wrap.
+ */
+ struct timeval tv;
+ uint64_t tmark;
+
+ rack->r_ctl.lt_bw_bytes += (tp->snd_una - rack->r_ctl.lt_seq);
+ rack->r_ctl.lt_seq = tp->snd_una;
+ tmark = tcp_get_u64_usecs(&tv);
+ if (tmark > rack->r_ctl.lt_timemark) {
+ rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark);
+ rack->r_ctl.lt_timemark = tmark;
+ }
}
rack_log_output(tp, &to, len, tp->snd_max, flags, error, rack_to_usec_ts(tv),
NULL, add_flag, s_mb, s_soff, rack->r_ctl.fsb.hw_tls, segsiz);
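/*
 * A sketch of the long-term bandwidth bookkeeping performed above when
 * the unsampled range nears a sequence wrap: acked bytes and elapsed
 * time are folded into running totals, so the eventual estimate is
 * presumably just bytes * 1000000 / time_usec.  Field names below are
 * illustrative.
 */
#include <stdint.h>

struct lt_bw_accum {
	uint64_t bytes;		/* accumulated bytes */
	uint64_t time_usec;	/* accumulated measurement time */
	uint32_t seq;		/* left edge of the unaccounted range */
	uint64_t timemark;	/* start of the unaccounted interval */
};

static void
lt_bw_fold(struct lt_bw_accum *lt, uint32_t snd_una, uint64_t now_usec)
{
	lt->bytes += (uint32_t)(snd_una - lt->seq);	/* serial arithmetic */
	lt->seq = snd_una;
	if (now_usec > lt->timemark) {
		lt->time_usec += now_usec - lt->timemark;
		lt->timemark = now_usec;
	}
}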
@@ -19699,13 +21038,7 @@
tp->snd_max += len;
tp->snd_nxt = tp->snd_max;
if (rack->rc_new_rnd_needed) {
- /*
- * Update the rnd to start ticking not
- * that from a time perspective all of
- * the preceding idle time is "in the round"
- */
- rack->rc_new_rnd_needed = 0;
- rack->r_ctl.roundends = tp->snd_max;
+ rack_new_round_starts(tp, rack, tp->snd_max);
}
{
int idx;
@@ -19746,7 +21079,7 @@
}
tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
counter_u64_add(rack_fto_send, 1);
- slot = rack_get_pacing_delay(rack, tp, tot_len, NULL, segsiz);
+ slot = rack_get_pacing_delay(rack, tp, tot_len, NULL, segsiz, __LINE__);
rack_start_hpts_timer(rack, tp, cts, slot, tot_len, 0);
#ifdef TCP_ACCOUNTING
crtsc = get_cyclecount();
@@ -19856,7 +21189,7 @@
goto restart;
}
/* Now has it been long enough ? */
- thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(rack->rc_tp, rack), cts);
+ thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(rack->rc_tp, rack), cts, __LINE__, 1);
if ((cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])) > thresh) {
rack_log_collapse(rack, rsm->r_start,
(cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])),
@@ -19870,6 +21203,25 @@
return (NULL);
}
+static void
+rack_credit_back_policer_idle_time(struct tcp_rack *rack, uint64_t idle_t, int line)
+{
+ /*
+ * We were idle some time (idle_t) and so our policer bucket
+ * needs to grow. It can go no higher than policer_bucket_size.
+ */
+ uint64_t len;
+
+ len = idle_t * rack->r_ctl.policer_bw;
+ len /= HPTS_USEC_IN_SEC;
+ rack->r_ctl.current_policer_bucket += (uint32_t)len;
+ if (rack->r_ctl.policer_bucket_size < rack->r_ctl.current_policer_bucket) {
+ rack->r_ctl.current_policer_bucket = rack->r_ctl.policer_bucket_size;
+ }
+ if (rack_verbose_logging > 0)
+ policer_detection_log(rack, (uint32_t)len, line, (uint32_t)idle_t, 0, 7);
+}
+
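/*
 * A tiny sketch of the idle credit above: tokens accrue at the
 * estimated policer rate for the idle period, but never beyond the
 * bucket size.  Parameter names are illustrative.
 */
#include <stdint.h>

static uint32_t
credit_idle_tokens(uint32_t tokens, uint32_t bucket_size,
    uint64_t policer_bw, uint64_t idle_usec)
{
	uint64_t newtok = tokens + (idle_usec * policer_bw) / 1000000ULL;

	if (newtok > bucket_size)
		newtok = bucket_size;
	return ((uint32_t)newtok);
}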
static inline void
rack_validate_sizes(struct tcp_rack *rack, int32_t *len, int32_t segsiz, uint32_t pace_max_seg)
{
@@ -19931,7 +21283,7 @@
unsigned ipsec_optlen = 0;
#endif
- int32_t idle, sendalot;
+ int32_t idle, sendalot, tot_idle;
int32_t sub_from_prr = 0;
volatile int32_t sack_rxmit;
struct rack_sendmap *rsm = NULL;
@@ -19940,7 +21292,7 @@
int32_t slot = 0;
int32_t sup_rack = 0;
uint32_t cts, ms_cts, delayed, early;
- uint16_t add_flag = RACK_SENT_SP;
+ uint32_t add_flag = RACK_SENT_SP;
/* The doing_tlp flag will be set by the actual rack_timeout_tlp() */
uint8_t doing_tlp = 0;
uint32_t cwnd_to_use, pace_max_seg;
@@ -20101,12 +21453,16 @@
early = rack->r_ctl.rc_last_output_to - cts;
} else
early = 0;
- if (delayed) {
+ if (delayed && (rack->rc_always_pace == 1)) {
rack->r_ctl.rc_agg_delayed += delayed;
rack->r_late = 1;
- } else if (early) {
+ } else if (early && (rack->rc_always_pace == 1)) {
rack->r_ctl.rc_agg_early += early;
rack->r_early = 1;
+ } else if (rack->rc_always_pace == 0) {
+ /* Non-paced we are not late */
+ rack->r_ctl.rc_agg_delayed = rack->r_ctl.rc_agg_early = 0;
+ rack->r_early = rack->r_late = 0;
}
/* Now that early/late accounting is done turn off the flag */
rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
@@ -20168,9 +21524,9 @@
}
if ((tp->snd_una == tp->snd_max) &&
rack->r_ctl.rc_went_idle_time &&
- TSTMP_GT(cts, rack->r_ctl.rc_went_idle_time)) {
- idle = cts - rack->r_ctl.rc_went_idle_time;
- if (idle > rack_min_probertt_hold) {
+ (cts > rack->r_ctl.rc_went_idle_time)) {
+ tot_idle = idle = (cts - rack->r_ctl.rc_went_idle_time);
+ if (idle > (uint64_t)rack_min_probertt_hold) {
/* Count as a probe rtt */
if (rack->in_probe_rtt == 0) {
rack->r_ctl.rc_lower_rtt_us_cts = cts;
@@ -20183,17 +21539,75 @@
}
idle = 0;
}
+ if(rack->policer_detect_on) {
+ /*
+		 * If we are doing policer detection we at a minimum
+		 * record the time, but if possible add back to
+ * the bucket based on the idle time.
+ */
+ uint64_t idle_t, u64_cts;
+
+ segsiz = min(ctf_fixed_maxseg(tp),
+ rack->r_ctl.rc_pace_min_segs);
+ u64_cts = tcp_tv_to_lusectick(&tv);
+ if ((rack->rc_policer_detected == 1) &&
+ (rack->r_ctl.policer_bucket_size > segsiz) &&
+ (rack->r_ctl.policer_bw > 0) &&
+ (u64_cts > rack->r_ctl.last_sendtime)) {
+ /* We are being policed add back the time */
+ idle_t = u64_cts - rack->r_ctl.last_sendtime;
+ rack_credit_back_policer_idle_time(rack, idle_t, __LINE__);
+ }
+ rack->r_ctl.last_sendtime = u64_cts;
+ }
if (rack_use_fsb &&
(rack->r_ctl.fsb.tcp_ip_hdr) &&
(rack->r_fsb_inited == 0) &&
(rack->r_state != TCPS_CLOSED))
rack_init_fsb_block(tp, rack, tcp_outflags[tp->t_state]);
+ if (rack->rc_sendvars_notset == 1) {
+ rack->r_ctl.idle_snd_una = tp->snd_una;
+ rack->rc_sendvars_notset = 0;
+ /*
+		 * Make sure any TCP timers (keep-alive) are not running.
+ */
+ tcp_timer_stop(tp);
+ }
+ if ((rack->rack_no_prr == 1) &&
+ (rack->rc_always_pace == 0)) {
+ /*
+		 * Sanity check before sending: if we have
+		 * pacing disabled and prr is turned off, that
+		 * is a configuration error. Correct this by turning
+ * prr back on. A user *must* set some form of
+ * pacing in order to turn PRR off. We do this
+ * in the output path so that we can avoid socket
+ * option ordering issues that would occur if we
+ * tried to do it while setting rack_no_prr on.
+ */
+ rack->rack_no_prr = 0;
+ }
+ if ((rack->pcm_enabled == 1) &&
+ (rack->pcm_needed == 0) &&
+ (tot_idle > 0)) {
+ /*
+		 * We have been idle some microseconds. We need
+ * to factor this in to see if a PCM is needed.
+ */
+ uint32_t rtts_idle, rnds;
+
+ if (tp->t_srtt)
+ rtts_idle = tot_idle / tp->t_srtt;
+ else
+ rtts_idle = 0;
+ rnds = rack->r_ctl.current_round - rack->r_ctl.last_pcm_round;
+ rack->r_ctl.pcm_idle_rounds += rtts_idle;
+ if ((rnds + rack->r_ctl.pcm_idle_rounds) >= rack_pcm_every_n_rounds) {
+ rack->pcm_needed = 1;
+ rack_log_pcm(rack, 8, rack->r_ctl.last_pcm_round, rtts_idle, rack->r_ctl.current_round );
+ }
+ }
again:
- /*
- * If we've recently taken a timeout, snd_max will be greater than
- * snd_nxt. There may be SACK information that allows us to avoid
- * resending already delivered data. Adjust snd_nxt accordingly.
- */
sendalot = 0;
cts = tcp_get_usecs(&tv);
ms_cts = tcp_tv_to_mssectick(&tv);
@@ -20205,6 +21619,44 @@
pace_max_seg = rack->rc_user_set_max_segs * segsiz;
else
pace_max_seg = rack->r_ctl.rc_pace_max_segs;
+ if (TCPS_HAVEESTABLISHED(tp->t_state) &&
+ (rack->r_ctl.pcm_max_seg == 0)) {
+ /*
+		 * We set this on our first send so we know that ctf_fixed_maxseg
+		 * has been fully set. If we did it in rack_init() we would most likely
+		 * see 512 bytes and so end up at 5120, which is not desirable.
+ */
+ rack->r_ctl.pcm_max_seg = rc_init_window(rack);
+ if (rack->r_ctl.pcm_max_seg < (ctf_fixed_maxseg(tp) * 10)) {
+ /*
+ * Assure our initial PCM probe is at least 10 MSS.
+ */
+ rack->r_ctl.pcm_max_seg = ctf_fixed_maxseg(tp) * 10;
+ }
+ }
+ if ((rack->r_ctl.pcm_max_seg != 0) && (rack->pcm_needed == 1)) {
+ uint32_t rw_avail, cwa;
+
+ if (tp->snd_wnd > ctf_outstanding(tp))
+ rw_avail = tp->snd_wnd - ctf_outstanding(tp);
+ else
+ rw_avail = 0;
+ if (tp->snd_cwnd > ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked))
+ cwa = tp->snd_cwnd -ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ else
+ cwa = 0;
+ if ((cwa >= rack->r_ctl.pcm_max_seg) &&
+ (rw_avail > rack->r_ctl.pcm_max_seg)) {
+ /* Raise up the max seg for this trip through */
+ pace_max_seg = rack->r_ctl.pcm_max_seg;
+ /* Disable any fast output */
+ rack->r_fast_output = 0;
+ }
+ if (rack_verbose_logging) {
+ rack_log_pcm(rack, 4,
+ cwa, rack->r_ctl.pcm_max_seg, rw_avail);
+ }
+ }
sb_offset = tp->snd_max - tp->snd_una;
cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd;
flags = tcp_outflags[tp->t_state];
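/*
 * A sketch of the PCM burst sizing above: the probe is the initial
 * window but never smaller than ten segments, and it only replaces this
 * send's pace_max_seg when both the congestion window and the receive
 * window have that much headroom.  Helper names are illustrative.
 */
#include <stdint.h>

static uint32_t
pcm_probe_size(uint32_t init_window, uint32_t maxseg)
{
	uint32_t sz = init_window;

	if (sz < (maxseg * 10))
		sz = maxseg * 10;
	return (sz);
}

static int
pcm_can_burst(uint32_t pcm_max_seg, uint32_t cwnd_avail, uint32_t rwnd_avail)
{
	return ((cwnd_avail >= pcm_max_seg) && (rwnd_avail > pcm_max_seg));
}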
@@ -20431,10 +21883,19 @@
((rsm->r_flags & RACK_HAS_FIN) == 0)) {
int ret;
+ if ((rack->rc_policer_detected == 1) &&
+ (rack->r_ctl.policer_bucket_size > segsiz) &&
+ (rack->r_ctl.policer_bw > 0)) {
+ /* Check to see if there is room */
+ if (rack->r_ctl.current_policer_bucket < len) {
+ goto skip_fast_output;
+ }
+ }
ret = rack_fast_rsm_output(tp, rack, rsm, ts_val, cts, ms_cts, &tv, len, doing_tlp);
if (ret == 0)
return (0);
}
+skip_fast_output:
so = inp->inp_socket;
sb = &so->so_snd;
if (do_a_prefetch == 0) {
@@ -20487,28 +21948,19 @@
prefetch_rsm = 1;
}
SOCKBUF_LOCK(sb);
- /*
- * If snd_nxt == snd_max and we have transmitted a FIN, the
- * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a
- * negative length. This can also occur when TCP opens up its
- * congestion window while receiving additional duplicate acks after
- * fast-retransmit because TCP will reset snd_nxt to snd_max after
- * the fast-retransmit.
- *
- * In the normal retransmit-FIN-only case, however, snd_nxt will be
- * set to snd_una, the sb_offset will be 0, and the length may wind
- * up 0.
- *
- * If sack_rxmit is true we are retransmitting from the scoreboard
- * in which case len is already set.
- */
if ((sack_rxmit == 0) &&
(TCPS_HAVEESTABLISHED(tp->t_state) || IS_FASTOPEN(tp->t_flags))) {
+ /*
+ * We are not retransmitting (sack_rxmit is 0) so we
+ * are sending new data. This is always based on snd_max.
+ * Now in theory snd_max may be equal to snd_una, if so
+ * then nothing is outstanding and the offset would be 0.
+ */
uint32_t avail;
avail = sbavail(sb);
- if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail)
- sb_offset = tp->snd_nxt - tp->snd_una;
+ if (SEQ_GT(tp->snd_max, tp->snd_una) && avail)
+ sb_offset = tp->snd_max - tp->snd_una;
else
sb_offset = 0;
if ((IN_FASTRECOVERY(tp->t_flags) == 0) || rack->rack_no_prr) {
@@ -20632,13 +22084,53 @@
kern_prefetch(so, &prefetch_so_done);
prefetch_so_done = 1;
}
+ orig_len = len;
+ if ((rack->rc_policer_detected == 1) &&
+ (rack->r_ctl.policer_bucket_size > segsiz) &&
+ (rack->r_ctl.policer_bw > 0) &&
+ (len > 0)) {
+ /*
+ * Ok we believe we have a policer watching
+		 * what we send; can we send len? If not, can
+ * we tune it down to a smaller value?
+ */
+ uint32_t plen, buck_needs;
+
+ plen = rack_policer_check_send(rack, len, segsiz, &buck_needs);
+ if (plen == 0) {
+ /*
+ * We are not allowed to send. How long
+			 * do we need to pace for, i.e. how long
+ * before len is available to send?
+ */
+ uint64_t lentime;
+
+ lentime = buck_needs;
+ lentime *= HPTS_USEC_IN_SEC;
+ lentime /= rack->r_ctl.policer_bw;
+ slot = (uint32_t)lentime;
+ tot_len_this_send = 0;
+ SOCKBUF_UNLOCK(sb);
+ if (rack_verbose_logging > 0)
+ policer_detection_log(rack, len, slot, buck_needs, 0, 12);
+ rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
+ rack_log_type_just_return(rack, cts, 0, slot, hpts_calling, 0, cwnd_to_use);
+ goto just_return_clean;
+ }
+ if (plen < len) {
+ sendalot = 0;
+ len = plen;
+ }
+ }
/*
* Lop off SYN bit if it has already been sent. However, if this is
* SYN-SENT state and if segment contains data and if we don't know
* that foreign host supports TAO, suppress sending segment.
*/
- if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) &&
- ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) {
+ if ((flags & TH_SYN) &&
+ SEQ_GT(tp->snd_max, tp->snd_una) &&
+ ((sack_rxmit == 0) &&
+ (tp->t_rxtshift == 0))) {
/*
* When sending additional segments following a TFO SYN|ACK,
* do not include the SYN bit.
@@ -20678,7 +22170,6 @@
}
/* Without fast-open there should never be data sent on a SYN */
if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) {
- tp->snd_nxt = tp->iss;
len = 0;
}
if ((len > segsiz) && (tcp_dsack_block_exists(tp))) {
@@ -20686,22 +22177,10 @@
add_flag |= RACK_SENT_W_DSACK;
len = segsiz;
}
- orig_len = len;
if (len <= 0) {
/*
- * If FIN has been sent but not acked, but we haven't been
- * called to retransmit, len will be < 0. Otherwise, window
- * shrank after we sent into it. If window shrank to 0,
- * cancel pending retransmit, pull snd_nxt back to (closed)
- * window, and set the persist timer if it isn't already
- * going. If the window didn't close completely, just wait
- * for an ACK.
- *
- * We also do a general check here to ensure that we will
- * set the persist timer when we have data to send, but a
- * 0-byte window. This makes sure the persist timer is set
- * even if the packet hits one of the "goto send" lines
- * below.
+ * We have nothing to send, or the window shrank, or
+		 * is closed; do we need to go into persists?
*/
len = 0;
if ((tp->snd_wnd == 0) &&
@@ -20859,10 +22338,6 @@
if (sack_rxmit) {
if ((rsm->r_flags & RACK_HAS_FIN) == 0)
flags &= ~TH_FIN;
- } else {
- if (SEQ_LT(tp->snd_nxt + len, tp->snd_una +
- sbused(sb)))
- flags &= ~TH_FIN;
}
}
recwin = lmin(lmax(sbspace(&so->so_rcv), 0),
@@ -20903,10 +22378,6 @@
pass = 4;
goto send;
}
- if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */
- pass = 5;
- goto send;
- }
if (sack_rxmit) {
pass = 6;
goto send;
@@ -21014,7 +22485,7 @@
* yet done so, then we need to send.
*/
if ((flags & TH_FIN) &&
- (tp->snd_nxt == tp->snd_una)) {
+ (tp->snd_max == tp->snd_una)) {
pass = 11;
goto send;
}
@@ -21027,15 +22498,32 @@
{
int app_limited = CTF_JR_SENT_DATA;
+ if ((IS_FASTOPEN(tp->t_flags) == 0) &&
+ (flags & TH_FIN) &&
+ (len == 0) &&
+ (sbused(sb) == (tp->snd_max - tp->snd_una)) &&
+ ((tp->snd_max - tp->snd_una) <= segsiz)) {
+ /*
+ * Ok less than or right at a MSS is
+ * outstanding. The original FreeBSD stack would
+ * have sent a FIN, which can speed things up for
+ * a transactional application doing a MSG_WAITALL.
+ * To speed things up since we do *not* send a FIN
+ * if data is outstanding, we send a "challenge ack".
+ * The idea behind that is instead of having to have
+ * the peer wait for the delayed-ack timer to run off
+ * we send an ack that makes the peer send us an ack.
+ */
+ rack_send_ack_challange(rack);
+ }
if (tot_len_this_send > 0) {
- /* Make sure snd_nxt is up to max */
rack->r_ctl.fsb.recwin = recwin;
- slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz);
+ slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz, __LINE__);
if ((error == 0) &&
+ (rack->rc_policer_detected == 0) &&
rack_use_rfo &&
((flags & (TH_SYN|TH_FIN)) == 0) &&
(ipoptlen == 0) &&
- (tp->snd_nxt == tp->snd_max) &&
(tp->rcv_numsacks == 0) &&
rack->r_fsb_inited &&
TCPS_HAVEESTABLISHED(tp->t_state) &&
@@ -21052,11 +22540,10 @@
segsiz, pace_max_seg, hw_tls, flags);
} else
rack->r_fast_output = 0;
-
-
rack_log_fsb(rack, tp, so, flags,
ipoptlen, orig_len, len, 0,
1, optlen, __LINE__, 1);
+ /* Assure when we leave that snd_nxt will point to top */
if (SEQ_GT(tp->snd_max, tp->snd_nxt))
tp->snd_nxt = tp->snd_max;
} else {
@@ -21218,6 +22705,7 @@
rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack);
rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use);
}
+just_return_clean:
#ifdef NETFLIX_SHARED_CWND
if ((sbavail(sb) == 0) &&
rack->r_ctl.rc_scw) {
@@ -21284,13 +22772,39 @@
* is acked first.
*/
flags &= ~TH_FIN;
+ if ((sbused(sb) == (tp->snd_max - tp->snd_una)) &&
+ ((tp->snd_max - tp->snd_una) <= segsiz)) {
+ /*
+ * Ok less than or right at a MSS is
+ * outstanding. The original FreeBSD stack would
+ * have sent a FIN, which can speed things up for
+ * a transactional application doing a MSG_WAITALL.
+ * To speed things up since we do *not* send a FIN
+ * if data is outstanding, we send a "challenge ack".
+ * The idea behind that is instead of having to have
+ * the peer wait for the delayed-ack timer to run off
+ * we send an ack that makes the peer send us an ack.
+ */
+ rack_send_ack_challange(rack);
+ }
}
/* Enforce stack imposed max seg size if we have one */
- if (rack->r_ctl.rc_pace_max_segs &&
- (len > rack->r_ctl.rc_pace_max_segs)) {
+ if (pace_max_seg &&
+ (len > pace_max_seg)) {
mark = 1;
- len = rack->r_ctl.rc_pace_max_segs;
+ len = pace_max_seg;
+ }
+ if ((rsm == NULL) &&
+ (rack->pcm_in_progress == 0) &&
+ (rack->r_ctl.pcm_max_seg > 0) &&
+ (len >= rack->r_ctl.pcm_max_seg)) {
+ /* It is large enough for a measurement */
+ add_flag |= RACK_IS_PCM;
+ rack_log_pcm(rack, 5, len, rack->r_ctl.pcm_max_seg, add_flag);
+ } else if (rack_verbose_logging) {
+ rack_log_pcm(rack, 6, len, rack->r_ctl.pcm_max_seg, add_flag);
}
+
SOCKBUF_LOCK_ASSERT(sb);
if (len > 0) {
if (len >= segsiz)
@@ -21313,6 +22827,24 @@
#endif
hdrlen = sizeof(struct tcpiphdr);
+ /*
+ * Ok what seq are we sending from. If we have
+ * no rsm to use, then we look at various bits,
+ * if we are putting out a SYN it will be ISS.
+ * If we are retransmitting a FIN it will
+ * be snd_max-1 else its snd_max.
+ */
+ if (rsm == NULL) {
+ if (flags & TH_SYN)
+ rack_seq = tp->iss;
+ else if ((flags & TH_FIN) &&
+ (tp->t_flags & TF_SENTFIN))
+ rack_seq = tp->snd_max - 1;
+ else
+ rack_seq = tp->snd_max;
+ } else {
+ rack_seq = rsm->r_start;
+ }
/*
* Compute options for segment. We only have to care about SYN and
* established connection segments. Options for SYN-ACK segments
@@ -21322,7 +22854,6 @@
if ((tp->t_flags & TF_NOOPT) == 0) {
/* Maximum segment size. */
if (flags & TH_SYN) {
- tp->snd_nxt = tp->iss;
to.to_mss = tcp_mssopt(&inp->inp_inc);
if (tp->t_port)
to.to_mss -= V_tcp_udp_tunneling_overhead;
@@ -21369,14 +22900,47 @@
/* Timestamps. */
if ((tp->t_flags & TF_RCVD_TSTMP) ||
((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
- to.to_tsval = ms_cts + tp->ts_offset;
+ uint32_t ts_to_use;
+
+ if ((rack->r_rcvpath_rtt_up == 1) &&
+ (ms_cts == rack->r_ctl.last_rcv_tstmp_for_rtt)) {
+ /*
+ * When we are doing a rcv_rtt probe all
+ * other timestamps use the next msec. This
+ * is safe since our previous ack is in the
+ * air and we will just have a few more
+ * on the next ms. This assures that only
+ * the one ack has the ms_cts that was on
+ * our ack-probe.
+ */
+ ts_to_use = ms_cts + 1;
+ } else {
+ ts_to_use = ms_cts;
+ }
+ to.to_tsval = ts_to_use + tp->ts_offset;
to.to_tsecr = tp->ts_recent;
to.to_flags |= TOF_TS;
+ if ((len == 0) &&
+ (TCPS_HAVEESTABLISHED(tp->t_state)) &&
+ ((ms_cts - rack->r_ctl.last_rcv_tstmp_for_rtt) > RCV_PATH_RTT_MS) &&
+ (tp->snd_una == tp->snd_max) &&
+ (flags & TH_ACK) &&
+ (sbavail(sb) == 0) &&
+ (rack->r_ctl.current_round != 0) &&
+ ((flags & (TH_SYN|TH_FIN)) == 0) &&
+ (rack->r_rcvpath_rtt_up == 0)) {
+ rack->r_ctl.last_rcv_tstmp_for_rtt = ms_cts;
+ rack->r_ctl.last_time_of_arm_rcv = cts;
+ rack->r_rcvpath_rtt_up = 1;
+ /* Subtract 1 from seq to force a response */
+ rack_seq--;
+ }
}
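
For context on the probe armed above (an illustration, not code from this patch): the ack that arms the probe is the only segment carrying that exact millisecond timestamp, and its sequence number is pulled back by one so the peer answers immediately. When the answering ack returns, matching its echoed timestamp (with tp->ts_offset removed) against last_rcv_tstmp_for_rtt identifies the probe, and the time elapsed since last_time_of_arm_rcv is the receive-path RTT. A hedged sketch of that consumption side, using only field names from the hunk above:

static inline uint32_t
rcv_path_rtt_sample(struct tcp_rack *rack, uint32_t echoed_ts, uint32_t now)
{
	/* echoed_ts: the peer's TSecr with tp->ts_offset already removed;
	 * now: the same microsecond clock (cts) used when arming above. */
	if ((rack->r_rcvpath_rtt_up == 1) &&
	    (echoed_ts == rack->r_ctl.last_rcv_tstmp_for_rtt))
		return (now - rack->r_ctl.last_time_of_arm_rcv);
	return (0);	/* not the probe's ack */
}
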
/* Set receive buffer autosizing timestamp. */
if (tp->rfbuf_ts == 0 &&
- (so->so_rcv.sb_flags & SB_AUTOSIZE))
- tp->rfbuf_ts = tcp_ts_getticks();
+ (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
+ tp->rfbuf_ts = ms_cts;
+ }
/* Selective ACK's. */
if (tp->t_flags & TF_SACK_PERMIT) {
if (flags & TH_SYN)
@@ -21544,7 +23108,24 @@
(sbused(sb))) {
/*
* We have outstanding data, don't send a fin by itself!.
+ *
+ * Check to see if we need to send a challenge ack.
*/
+ if ((sbused(sb) == (tp->snd_max - tp->snd_una)) &&
+ ((tp->snd_max - tp->snd_una) <= segsiz)) {
+ /*
+ * Ok less than or right at a MSS is
+ * outstanding. The original FreeBSD stack would
+ * have sent a FIN, which can speed things up for
+ * a transactional application doing a MSG_WAITALL.
+ * To speed things up since we do *not* send a FIN
+ * if data is outstanding, we send a "challenge ack".
+ * The idea behind that is instead of having to have
+ * the peer wait for the delayed-ack timer to run off
+ * we send an ack that makes the peer send us an ack.
+ */
+ rack_send_ack_challange(rack);
+ }
goto just_return;
}
/*
@@ -21557,10 +23138,8 @@
uint32_t max_val;
uint32_t moff;
- if (rack->r_ctl.rc_pace_max_segs)
- max_val = rack->r_ctl.rc_pace_max_segs;
- else if (rack->rc_user_set_max_segs)
- max_val = rack->rc_user_set_max_segs * segsiz;
+ if (pace_max_seg)
+ max_val = pace_max_seg;
else
max_val = len;
/*
@@ -21596,16 +23175,28 @@
if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) {
m_copydata(mb, moff, (int)len,
mtod(m, caddr_t)+hdrlen);
- if (SEQ_LT(tp->snd_nxt, tp->snd_max))
+ /*
+ * If we are not retransmitting advance the
+ * sndptr to help remember the next place in
+ * the sb.
+ */
+ if (rsm == NULL)
sbsndptr_adv(sb, mb, len);
m->m_len += len;
} else {
struct sockbuf *msb;
- if (SEQ_LT(tp->snd_nxt, tp->snd_max))
- msb = NULL;
- else
+ /*
+ * If we are not retransmitting pass in msb so
+ * the socket buffer can be advanced. Otherwise
+ * set it to NULL if it's a retransmission since
+ * we don't want to change the sb remembered
+ * location.
+ */
+ if (rsm == NULL)
msb = sb;
+ else
+ msb = NULL;
m->m_next = tcp_m_copym(
mb, moff, &len,
if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb,
@@ -21631,7 +23222,7 @@
goto out;
}
}
- if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
+ if (sack_rxmit) {
if (rsm && (rsm->r_flags & RACK_TLP)) {
/*
* TLP should not count in retran count, but
@@ -21750,14 +23341,6 @@
#endif
}
}
- /*
- * Fill in fields, remembering maximum advertised window for use in
- * delaying messages about window sizes. If resending a FIN, be sure
- * not to use a new sequence number.
- */
- if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
- tp->snd_nxt == tp->snd_max)
- tp->snd_nxt--;
/*
* If we are starting a connection, send ECN setup SYN packet. If we
* are on a retransmit, we may resend those bits a number of times
@@ -21787,29 +23370,7 @@
#endif
}
}
- /*
- * If we are doing retransmissions, then snd_nxt will not reflect
- * the first unsent octet. For ACK only packets, we do not want the
- * sequence number of the retransmitted packet, we want the sequence
- * number of the next unsent octet. So, if there is no data (and no
- * SYN or FIN), use snd_max instead of snd_nxt when filling in
- * ti_seq. But if we are in persist state, snd_max might reflect
- * one byte beyond the right edge of the window, so use snd_nxt in
- * that case, since we know we aren't doing a retransmission.
- * (retransmit and persist are mutually exclusive...)
- */
- if (sack_rxmit == 0) {
- if (len || (flags & (TH_SYN | TH_FIN))) {
- th->th_seq = htonl(tp->snd_nxt);
- rack_seq = tp->snd_nxt;
- } else {
- th->th_seq = htonl(tp->snd_max);
- rack_seq = tp->snd_max;
- }
- } else {
- th->th_seq = htonl(rsm->r_start);
- rack_seq = rsm->r_start;
- }
+ th->th_seq = htonl(rack_seq);
th->th_ack = htonl(tp->rcv_nxt);
tcp_set_flags(th, flags);
/*
@@ -22170,6 +23731,13 @@
rack_to_usec_ts(&tv),
rsm, add_flag, s_mb, s_moff, hw_tls, segsiz);
if (error == 0) {
+ if (add_flag & RACK_IS_PCM) {
+ /* We just launched a PCM */
+ /* rrs here log */
+ rack->pcm_in_progress = 1;
+ rack->pcm_needed = 0;
+ rack_log_pcm(rack, 7, len, rack->r_ctl.pcm_max_seg, add_flag);
+ }
if (rsm == NULL) {
if (rack->lt_bw_up == 0) {
rack->r_ctl.lt_timemark = tcp_tv_to_lusectick(&tv);
@@ -22184,9 +23752,11 @@
rack->r_ctl.lt_bw_bytes += (tp->snd_una - rack->r_ctl.lt_seq);
rack->r_ctl.lt_seq = tp->snd_una;
- tmark = tcp_tv_to_lusectick(&tv);
- rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark);
- rack->r_ctl.lt_timemark = tmark;
+ tmark = tcp_get_u64_usecs(&tv);
+ if (tmark > rack->r_ctl.lt_timemark) {
+ rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark);
+ rack->r_ctl.lt_timemark = tmark;
+ }
}
}
rack->forced_ack = 0; /* If we send something zap the FA flag */
@@ -22256,15 +23826,17 @@
(len > 0) &&
(tp->snd_una == tp->snd_max))
rack->r_ctl.rc_tlp_rxt_last_time = cts;
+
{
- tcp_seq startseq = tp->snd_nxt;
+ /*
+ * This block is not associated with the above error == 0 test.
+ * It is used to advance snd_max if we have a new transmit.
+ */
+ tcp_seq startseq = tp->snd_max;
+
- /* Track our lost count */
if (rsm && (doing_tlp == 0))
rack->r_ctl.rc_loss_count += rsm->r_end - rsm->r_start;
- /*
- * Advance snd_nxt over sequence space of this segment.
- */
if (error)
/* We don't log or do anything with errors */
goto nomore;
@@ -22287,53 +23859,53 @@
rack->rc_tlp_in_progress = 1;
rack->r_ctl.rc_tlp_cnt_out++;
}
- if (flags & (TH_SYN | TH_FIN)) {
- if (flags & TH_SYN)
- tp->snd_nxt++;
- if (flags & TH_FIN) {
- tp->snd_nxt++;
- tp->t_flags |= TF_SENTFIN;
- }
- }
- /* In the ENOBUFS case we do *not* update snd_max */
+ /*
+ * If we are retransmitting we are done, snd_max
+ * does not get updated.
+ */
if (sack_rxmit)
goto nomore;
-
- tp->snd_nxt += len;
- if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
- if (tp->snd_una == tp->snd_max) {
- /*
- * Update the time we just added data since
- * none was outstanding.
- */
- rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
- tp->t_acktime = ticks;
- }
- tp->snd_max = tp->snd_nxt;
- if (rack->rc_new_rnd_needed) {
- /*
- * Update the rnd to start ticking not
- * that from a time perspective all of
- * the preceding idle time is "in the round"
- */
- rack->rc_new_rnd_needed = 0;
- rack->r_ctl.roundends = tp->snd_max;
- }
+ if ((tp->snd_una == tp->snd_max) && (len > 0)) {
/*
- * Time this transmission if not a retransmission and
- * not currently timing anything.
- * This is only relevant in case of switching back to
- * the base stack.
+ * Update the time we just added data since
+ * nothing was outstanding.
*/
- if (tp->t_rtttime == 0) {
- tp->t_rtttime = ticks;
- tp->t_rtseq = startseq;
- KMOD_TCPSTAT_INC(tcps_segstimed);
+ rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
+ tp->t_acktime = ticks;
+ }
+ /*
+ * Now for special SYN/FIN handling.
+ */
+ if (flags & (TH_SYN | TH_FIN)) {
+ if ((flags & TH_SYN) &&
+ ((tp->t_flags & TF_SENTSYN) == 0)) {
+ tp->snd_max++;
+ tp->t_flags |= TF_SENTSYN;
}
- if (len &&
- ((tp->t_flags & TF_GPUTINPROG) == 0))
- rack_start_gp_measurement(tp, rack, startseq, sb_offset);
+ if ((flags & TH_FIN) &&
+ ((tp->t_flags & TF_SENTFIN) == 0)) {
+ tp->snd_max++;
+ tp->t_flags |= TF_SENTFIN;
+ }
+ }
+ tp->snd_max += len;
+ if (rack->rc_new_rnd_needed) {
+ rack_new_round_starts(tp, rack, tp->snd_max);
+ }
+ /*
+ * Time this transmission if not a retransmission and
+ * not currently timing anything.
+ * This is only relevant in case of switching back to
+ * the base stack.
+ */
+ if (tp->t_rtttime == 0) {
+ tp->t_rtttime = ticks;
+ tp->t_rtseq = startseq;
+ KMOD_TCPSTAT_INC(tcps_segstimed);
}
+ if (len &&
+ ((tp->t_flags & TF_GPUTINPROG) == 0))
+ rack_start_gp_measurement(tp, rack, startseq, sb_offset);
/*
* If we are doing FO we need to update the mbuf position and subtract
* this happens when the peer sends us duplicate information and
@@ -22356,6 +23928,47 @@
rack->r_ctl.fsb.o_t_len = M_TRAILINGROOM(rack->r_ctl.fsb.m);
}
}
+ if (rack_pcm_blast == 0) {
+ if ((orig_len > len) &&
+ (add_flag & RACK_IS_PCM) &&
+ (len < pace_max_seg) &&
+ ((pace_max_seg - len) > segsiz)) {
+ /*
+ * We are doing a PCM measurement and we did
+ * not get enough data in the TSO to meet the
+ * burst requirement.
+ */
+ uint32_t n_len;
+
+ n_len = (orig_len - len);
+ orig_len -= len;
+ pace_max_seg -= len;
+ len = n_len;
+ sb_offset = tp->snd_max - tp->snd_una;
+ /* Re-lock for the next spin */
+ SOCKBUF_LOCK(sb);
+ goto send;
+ }
+ } else {
+ if ((orig_len > len) &&
+ (add_flag & RACK_IS_PCM) &&
+ ((orig_len - len) > segsiz)) {
+ /*
+ * We are doing a PCM measurement and we did
+ * not get enough data in the TSO to meet the
+ * burst requirement.
+ */
+ uint32_t n_len;
+
+ n_len = (orig_len - len);
+ orig_len -= len;
+ len = n_len;
+ sb_offset = tp->snd_max - tp->snd_una;
+ /* Re-lock for the next spin */
+ SOCKBUF_LOCK(sb);
+ goto send;
+ }
+ }
}
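
A worked example of the rack_pcm_blast == 0 branch above (illustrative numbers, not from this patch): with segsiz = 1448 and a PCM burst of pace_max_seg = orig_len = 17376 bytes, suppose the first pass only got len = 10136 bytes out. The leftover 17376 - 10136 = 7240 exceeds segsiz, so the code loops back to send with len = 7240 (pace_max_seg and orig_len both reduced by the 10136 already sent), keeping the measurement burst contiguous instead of ending it short.
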
nomore:
if (error) {
@@ -22488,14 +24101,10 @@
enobufs:
if (sendalot) {
/* Do we need to turn off sendalot? */
- if (rack->r_ctl.rc_pace_max_segs &&
- (tot_len_this_send >= rack->r_ctl.rc_pace_max_segs)) {
+ if (pace_max_seg &&
+ (tot_len_this_send >= pace_max_seg)) {
/* We hit our max. */
sendalot = 0;
- } else if ((rack->rc_user_set_max_segs) &&
- (tot_len_this_send >= (rack->rc_user_set_max_segs * segsiz))) {
- /* We hit the user defined max */
- sendalot = 0;
}
}
if ((error == 0) && (flags & TH_FIN))
@@ -22515,22 +24124,7 @@
* hit the else if with slot preset. Other
* errors return.
*/
- slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz);
- }
- if (rsm &&
- (rsm->r_flags & RACK_HAS_SYN) == 0 &&
- rack->use_rack_rr) {
- /* Its a retransmit and we use the rack cheat? */
- if ((slot == 0) ||
- (rack->rc_always_pace == 0) ||
- (rack->r_rr_config == 1)) {
- /*
- * We have no pacing set or we
- * are using old-style rack or
- * we are overridden to use the old 1ms pacing.
- */
- slot = rack->r_ctl.rc_min_to;
- }
+ slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz, __LINE__);
}
/* We have sent clear the flag */
rack->r_ent_rec_ns = 0;
@@ -22568,9 +24162,9 @@
rack_use_rfo &&
((flags & (TH_SYN|TH_FIN)) == 0) &&
(rsm == NULL) &&
- (tp->snd_nxt == tp->snd_max) &&
(ipoptlen == 0) &&
(tp->rcv_numsacks == 0) &&
+ (rack->rc_policer_detected == 0) &&
rack->r_fsb_inited &&
TCPS_HAVEESTABLISHED(tp->t_state) &&
((IN_RECOVERY(tp->t_flags)) == 0) &&
@@ -22599,7 +24193,6 @@
(rsm == NULL) &&
(ipoptlen == 0) &&
(tp->rcv_numsacks == 0) &&
- (tp->snd_nxt == tp->snd_max) &&
(rack->r_must_retran == 0) &&
rack->r_fsb_inited &&
TCPS_HAVEESTABLISHED(tp->t_state) &&
@@ -22625,8 +24218,8 @@
}
goto again;
}
- /* Assure when we leave that snd_nxt will point to top */
skip_all_send:
+ /* Assure when we leave that snd_nxt will point to top */
if (SEQ_GT(tp->snd_max, tp->snd_nxt))
tp->snd_nxt = tp->snd_max;
rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0);
@@ -22705,14 +24298,26 @@
static int
rack_set_dgp(struct tcp_rack *rack)
{
- /* pace_always=1 */
- if (rack->rc_always_pace == 0) {
- if (tcp_can_enable_pacing() == 0)
- return (EBUSY);
+ if (rack->dgp_on == 1)
+ return(0);
+ if ((rack->use_fixed_rate == 1) &&
+ (rack->rc_always_pace == 1)) {
+ /*
+ * We are already pacing another
+ * way.
+ */
+ return (EBUSY);
+ }
+ if (rack->rc_always_pace == 1) {
+ rack_remove_pacing(rack);
}
+ if (tcp_incr_dgp_pacing_cnt() == 0)
+ return (ENOSPC);
+ rack->r_ctl.pacing_method |= RACK_DGP_PACING;
rack->rc_fillcw_apply_discount = 0;
rack->dgp_on = 1;
rack->rc_always_pace = 1;
+ rack->rc_pace_dnd = 1;
rack->use_fixed_rate = 0;
if (rack->gp_ready)
rack_set_cc_pacing(rack);
@@ -22737,14 +24342,7 @@
/* npush=2 */
rack->r_ctl.rc_no_push_at_mrtt = 2;
/* fillcw=1 */
- if (rack->r_cwnd_was_clamped == 0) {
- rack->rc_pace_to_cwnd = 1;
- } else {
- rack->rc_pace_to_cwnd = 0;
- /* Reset all multipliers to 100.0 so just the measured bw */
- rack->r_ctl.rack_per_of_gp_ss = 100;
- rack->r_ctl.rack_per_of_gp_ca = 100;
- }
+ rack->rc_pace_to_cwnd = 1;
rack->rc_pace_fill_if_rttin_range = 0;
rack->rtt_limit_mul = 0;
/* noprr=1 */
@@ -22753,12 +24351,9 @@
rack->r_limit_scw = 1;
/* gp_inc_rec */
rack->r_ctl.rack_per_of_gp_rec = 90;
- rack_client_buffer_level_set(rack);
return (0);
}
-
-
static int
rack_set_profile(struct tcp_rack *rack, int prof)
{
@@ -22768,72 +24363,37 @@
* Profile 1 is "standard" DGP. It ignores
* client buffer level.
*/
- rack->r_ctl.rc_dgp_bl_agg = DGP_LEVEL0;
err = rack_set_dgp(rack);
if (err)
return (err);
- } else if (prof == 2) {
- /*
- * Profile 2 is DGP. Less aggressive with
- * respect to client buffer level.
- */
- rack->r_ctl.rc_dgp_bl_agg = DGP_LEVEL1;
+ } else if (prof == 6) {
err = rack_set_dgp(rack);
if (err)
return (err);
- } else if (prof == 3) {
/*
- * Profile 3 is DGP. Even Less aggressive with
- * respect to client buffer level.
- */
- rack->r_ctl.rc_dgp_bl_agg = DGP_LEVEL2;
- err = rack_set_dgp(rack);
- if (err)
- return (err);
- } else if (prof == 4) {
- /*
- * Profile 4 is DGP with the most responsiveness
- * to client buffer level.
- */
- rack->r_ctl.rc_dgp_bl_agg = DGP_LEVEL3;
- err = rack_set_dgp(rack);
- if (err)
- return (err);
- } else if (prof == 5) {
- err = rack_set_dgp(rack);
- if (err)
- return (err);
- /*
- * By turning DGP off we change the rate
- * picked to be only the one the cwnd and rtt
- * get us.
- */
- rack->dgp_on = 0;
- } else if (prof == 6) {
- err = rack_set_dgp(rack);
- if (err)
- return (err);
- /*
- * Profile 6 tweaks DGP so that it will apply to
- * fill-cw the same settings that profile5 does
- * to replace DGP. It gets then the max(dgp-rate, fillcw(discounted).
+ * Profile 6 tweaks DGP so that it will apply to
+ * fill-cw the same settings that profile 5 does
+ * to replace DGP. It then gets max(dgp-rate, fillcw(discounted)).
*/
rack->rc_fillcw_apply_discount = 1;
} else if (prof == 0) {
/* This changes things back to the default settings */
- rack->dgp_on = 0;
- rack->rc_hybrid_mode = 0;
+ if (rack->rc_always_pace == 1) {
+ rack_remove_pacing(rack);
+ } else {
+ /* Make sure any stray flags are off */
+ rack->dgp_on = 0;
+ rack->rc_hybrid_mode = 0;
+ rack->use_fixed_rate = 0;
+ }
err = 0;
if (rack_fill_cw_state)
rack->rc_pace_to_cwnd = 1;
else
rack->rc_pace_to_cwnd = 0;
- if (rack->rc_always_pace) {
- tcp_decrement_paced_conn();
- rack_undo_cc_pacing(rack);
- rack->rc_always_pace = 0;
- }
+
if (rack_pace_every_seg && tcp_can_enable_pacing()) {
+ rack->r_ctl.pacing_method |= RACK_REG_PACING;
rack->rc_always_pace = 1;
if (rack->rack_hibeta)
rack_set_cc_pacing(rack);
@@ -22883,7 +24443,6 @@
}
rack->r_rr_config = 0;
rack->r_ctl.rc_no_push_at_mrtt = 0;
- rack->rc_pace_to_cwnd = 0;
rack->rc_pace_fill_if_rttin_range = 0;
rack->rtt_limit_mul = 0;
@@ -22911,7 +24470,7 @@
struct deferred_opt_list *dol;
dol = malloc(sizeof(struct deferred_opt_list),
- M_TCPFSB, M_NOWAIT|M_ZERO);
+ M_TCPDO, M_NOWAIT|M_ZERO);
if (dol == NULL) {
/*
* No space yikes -- fail out..
@@ -22935,19 +24494,6 @@
microuptime(&tv);
- /*
- * If BB logging is not on we need to look at the DTL flag.
- * If its on already then those reasons override the DTL input.
- * We do this with any request, you can turn DTL on, but it does
- * not turn off at least from hybrid pacing requests.
- */
- if (tcp_bblogging_on(rack->rc_tp) == 0) {
- if (hybrid->hybrid_flags & TCP_HYBRID_PACING_DTL) {
- /* Turn on BB point logging */
- tcp_set_bblog_state(rack->rc_tp, TCP_LOG_VIA_BBPOINTS,
- TCP_BBPOINT_REQ_LEVEL_LOGGING);
- }
- }
/* Make sure no fixed rate is on */
rack->use_fixed_rate = 0;
rack->r_ctl.rc_fixed_pacing_rate_rec = 0;
@@ -22962,6 +24508,8 @@
rack_log_hybrid(rack, seq, NULL, HYBRID_LOG_NO_ROOM, __LINE__, 0);
return (ENOSPC);
}
+ /* mask our internal flags */
+ hybrid->hybrid_flags &= TCP_HYBRID_PACING_USER_MASK;
/* The seq will be snd_una + everything in the buffer */
seq = sft->start_seq;
if ((hybrid->hybrid_flags & TCP_HYBRID_PACING_ENABLE) == 0) {
@@ -22986,6 +24534,26 @@
return (err);
}
}
+ /*
+ * Now we must switch to hybrid mode as well which also
+ * means moving to regular pacing.
+ */
+ if (rack->rc_hybrid_mode == 0) {
+ /* First time */
+ if (tcp_can_enable_pacing()) {
+ rack->r_ctl.pacing_method |= RACK_REG_PACING;
+ rack->rc_hybrid_mode = 1;
+ } else {
+ return (ENOSPC);
+ }
+ if (rack->r_ctl.pacing_method & RACK_DGP_PACING) {
+ /*
+ * This should be true.
+ */
+ tcp_dec_dgp_pacing_cnt();
+ rack->r_ctl.pacing_method &= ~RACK_DGP_PACING;
+ }
+ }
/* Now set in our flags */
sft->hybrid_flags = hybrid->hybrid_flags | TCP_HYBRID_PACING_WASSET;
if (hybrid->hybrid_flags & TCP_HYBRID_PACING_CSPR)
@@ -22996,7 +24564,6 @@
sft->hint_maxseg = hybrid->hint_maxseg;
else
sft->hint_maxseg = 0;
- rack->rc_hybrid_mode = 1;
rack->rc_tp->tcp_hybrid_start++;
rack_log_hybrid(rack, seq, sft, HYBRID_LOG_RULES_SET, __LINE__,0);
return (0);
@@ -23005,6 +24572,36 @@
#endif
}
+static int
+rack_stack_information(struct tcpcb *tp, struct stack_specific_info *si)
+{
+ /*
+ * Gather rack specific information.
+ */
+ struct tcp_rack *rack;
+
+ rack = (struct tcp_rack *)tp->t_fb_ptr;
+ /* Log a SSI info point recording what was there */
+ policer_detection_log(rack, rack->rc_highly_buffered, 0, 0, 0, 20);
+ if (rack->policer_detect_on) {
+ si->policer_detection_enabled = 1;
+ if (rack->rc_policer_detected) {
+ si->policer_detected = 1;
+ si->policer_bucket_size = rack->r_ctl.policer_bucket_size;
+ si->policer_last_bw = rack->r_ctl.policer_bw;
+ } else {
+ si->policer_detected = 0;
+ si->policer_bucket_size = 0;
+ si->policer_last_bw = 0;
+ }
+ si->current_round = rack->r_ctl.current_round;
+ si->highly_buffered = rack->rc_highly_buffered;
+ }
+ si->bytes_transmitted = tp->t_sndbytes;
+ si->bytes_retransmitted = tp->t_snd_rxt_bytes;
+ return (0);
+}
+
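
As an illustration of how the new hook might be consumed (a sketch under assumptions, not part of this patch; only the struct and field names used by rack_stack_information() above are taken from it):

static void
example_log_stack_info(struct tcpcb *tp)
{
	struct stack_specific_info si;

	memset(&si, 0, sizeof(si));
	if ((tp->t_fb->tfb_stack_info != NULL) &&
	    (tp->t_fb->tfb_stack_info(tp, &si) == 0) &&
	    si.policer_detection_enabled && si.policer_detected)
		printf("policer: bw %ju bucket %u round %u\n",
		    (uintmax_t)si.policer_last_bw,
		    si.policer_bucket_size, si.current_round);
}
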
static int
rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name,
uint32_t optval, uint64_t loptval, struct tcp_hybrid_req *hybrid)
@@ -23077,34 +24674,7 @@
}
break;
case TCP_RACK_PACING_BETA:
- RACK_OPTS_INC(tcp_rack_beta);
- if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) {
- /* This only works for newreno. */
- error = EINVAL;
- break;
- }
- if (rack->rc_pacing_cc_set) {
- /*
- * Set them into the real CC module
- * whats in the rack pcb is the old values
- * to be used on restoral/
- */
- sopt.sopt_dir = SOPT_SET;
- opt.name = CC_NEWRENO_BETA;
- opt.val = optval;
- if (CC_ALGO(tp)->ctl_output != NULL)
- error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt);
- else {
- error = ENOENT;
- break;
- }
- } else {
- /*
- * Not pacing yet so set it into our local
- * rack pcb storage.
- */
- rack->r_ctl.rc_saved_beta.beta = optval;
- }
+ error = EINVAL;
break;
case TCP_RACK_TIMER_SLOP:
RACK_OPTS_INC(tcp_rack_timer_slop);
@@ -23188,8 +24758,29 @@
else
rack->r_up_only = 0;
break;
+ case TCP_FILLCW_RATE_CAP: /* URL:fillcw_cap */
+ RACK_OPTS_INC(tcp_fillcw_rate_cap);
+ rack->r_ctl.fillcw_cap = loptval;
+ break;
case TCP_PACING_RATE_CAP:
RACK_OPTS_INC(tcp_pacing_rate_cap);
+ if ((rack->dgp_on == 1) &&
+ (rack->r_ctl.pacing_method & RACK_DGP_PACING)) {
+ /*
+ * If we are doing DGP we need to switch
+ * to using the pacing limit.
+ */
+ if (tcp_can_enable_pacing() == 0) {
+ error = ENOSPC;
+ break;
+ }
+ /*
+ * Now change up the flags and counts to be correct.
+ */
+ rack->r_ctl.pacing_method |= RACK_REG_PACING;
+ tcp_dec_dgp_pacing_cnt();
+ rack->r_ctl.pacing_method &= ~RACK_DGP_PACING;
+ }
rack->r_ctl.bw_rate_cap = loptval;
break;
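
The same hand-off pattern (claim a regular pacing slot, then give back the DGP one) is open-coded here and again under TCP_RACK_PACE_MAX_SEG below. Purely as an illustration (not a helper this patch adds), it amounts to:

static int
example_dgp_to_reg_pacing(struct tcp_rack *rack)
{
	if (tcp_can_enable_pacing() == 0)
		return (ENOSPC);
	rack->r_ctl.pacing_method |= RACK_REG_PACING;
	tcp_dec_dgp_pacing_cnt();
	rack->r_ctl.pacing_method &= ~RACK_DGP_PACING;
	return (0);
}
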
case TCP_HYBRID_PACING:
@@ -23197,8 +24788,18 @@
error = EINVAL;
break;
}
+ if (rack->r_ctl.side_chan_dis_mask & HYBRID_DIS_MASK) {
+ error = EPERM;
+ break;
+ }
error = process_hybrid_pacing(rack, hybrid);
break;
+ case TCP_SIDECHAN_DIS: /* URL:scodm */
+ if (optval)
+ rack->r_ctl.side_chan_dis_mask = optval;
+ else
+ rack->r_ctl.side_chan_dis_mask = 0;
+ break;
case TCP_RACK_PROFILE:
RACK_OPTS_INC(tcp_profile);
error = rack_set_profile(rack, optval);
@@ -23224,15 +24825,37 @@
rack->r_limit_scw = 0;
break;
case TCP_RACK_DGP_IN_REC:
- RACK_OPTS_INC(tcp_dgp_in_rec);
- if (optval)
- rack->r_ctl.full_dgp_in_rec = 1;
- else
- rack->r_ctl.full_dgp_in_rec = 0;
+ error = EINVAL;
+ break;
+ case TCP_POLICER_DETECT: /* URL:pol_det */
+ RACK_OPTS_INC(tcp_pol_detect);
+ rack_translate_policer_detect(rack, optval);
break;
- case TCP_RXT_CLAMP:
- RACK_OPTS_INC(tcp_rxt_clamp);
- rack_translate_clamp_value(rack, optval);
+ case TCP_POLICER_MSS:
+ RACK_OPTS_INC(tcp_pol_mss);
+ rack->r_ctl.policer_del_mss = (uint8_t)optval;
+ if (optval & 0x00000100) {
+ /*
+ * Value is set up like so:
+ * VVVV VVVV VVVV VVVV VVVV VVAI MMMM MMMM
+ * Where MMMM MMMM is the MSS setting and
+ * I (the 9th bit) is the indicator that
+ * says it is being set (if it is 0 then the
+ * upper bits 11 - 32 have no meaning).
+ * This allows setting it off with
+ * 0x000001MM.
+ *
+ * The 10th bit is used to turn on the
+ * alternate median (not the expanded one).
+ *
+ */
+ rack->r_ctl.pol_bw_comp = (optval >> 10);
+ }
+ if (optval & 0x00000200) {
+ rack->r_ctl.policer_alt_median = 1;
+ } else {
+ rack->r_ctl.policer_alt_median = 0;
+ }
break;
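
A worked example of the TCP_POLICER_MSS encoding above (illustrative only, not part of this patch):

/* MSS multiple of 4, I bit set, alternate median on, V (bw comp) = 5 */
uint32_t optval = (5 << 10) | 0x200 | 0x100 | 4;	/* == 0x1704 */
/* The case above then stores policer_del_mss = 4, pol_bw_comp = 5 and
 * policer_alt_median = 1; 0x00000104 would set only the MSS multiple. */
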
case TCP_RACK_PACE_TO_FILL:
RACK_OPTS_INC(tcp_fillcw);
@@ -23240,8 +24863,6 @@
rack->rc_pace_to_cwnd = 0;
else {
rack->rc_pace_to_cwnd = 1;
- if (optval > 1)
- rack->r_fill_less_agg = 1;
}
if ((optval >= rack_gp_rtt_maxmul) &&
rack_gp_rtt_maxmul &&
@@ -23299,6 +24920,12 @@
else
error = EINVAL;
break;
+ case RACK_CSPR_IS_FCC: /* URL:csprisfcc */
+ if (optval > 0)
+ rack->cspr_is_fcc = 1;
+ else
+ rack->cspr_is_fcc = 0;
+ break;
case TCP_TIMELY_DYN_ADJ:
RACK_OPTS_INC(tcp_timely_dyn);
if (optval == 0)
@@ -23341,11 +24968,16 @@
* method using a pacing rate.
*/
RACK_OPTS_INC(tcp_rack_pace_always);
+ if (rack->r_ctl.side_chan_dis_mask & CCSP_DIS_MASK) {
+ error = EPERM;
+ break;
+ }
if (optval > 0) {
if (rack->rc_always_pace) {
error = EALREADY;
break;
} else if (tcp_can_enable_pacing()) {
+ rack->r_ctl.pacing_method |= RACK_REG_PACING;
rack->rc_always_pace = 1;
if (rack->rack_hibeta)
rack_set_cc_pacing(rack);
@@ -23355,10 +24987,8 @@
break;
}
} else {
- if (rack->rc_always_pace) {
- tcp_decrement_paced_conn();
- rack->rc_always_pace = 0;
- rack_undo_cc_pacing(rack);
+ if (rack->rc_always_pace == 1) {
+ rack_remove_pacing(rack);
}
}
if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack)
@@ -23375,58 +25005,11 @@
val *= 1000;
val /= 8;
rack->r_ctl.init_rate = val;
- if (rack->rc_init_win != rack_default_init_window) {
- uint32_t win, snt;
-
- /*
- * Options don't always get applied
- * in the order you think. So in order
- * to assure we update a cwnd we need
- * to check and see if we are still
- * where we should raise the cwnd.
- */
- win = rc_init_window(rack);
- if (SEQ_GT(tp->snd_max, tp->iss))
- snt = tp->snd_max - tp->iss;
- else
- snt = 0;
- if ((snt < win) &&
- (tp->snd_cwnd < win))
- tp->snd_cwnd = win;
- }
if (rack->rc_always_pace)
rack_update_seg(rack);
break;
case TCP_BBR_IWINTSO:
- RACK_OPTS_INC(tcp_initial_win);
- if (optval && (optval <= 0xff)) {
- uint32_t win, snt;
-
- rack->rc_init_win = optval;
- win = rc_init_window(rack);
- if (SEQ_GT(tp->snd_max, tp->iss))
- snt = tp->snd_max - tp->iss;
- else
- snt = 0;
- if ((snt < win) &&
- (tp->t_srtt |
- rack->r_ctl.init_rate)) {
- /*
- * We are not past the initial window
- * and we have some bases for pacing,
- * so we need to possibly adjust up
- * the cwnd. Note even if we don't set
- * the cwnd, its still ok to raise the rc_init_win
- * which can be used coming out of idle when we
- * would have a rate.
- */
- if (tp->snd_cwnd < win)
- tp->snd_cwnd = win;
- }
- if (rack->rc_always_pace)
- rack_update_seg(rack);
- } else
- error = EINVAL;
+ error = EINVAL;
break;
case TCP_RACK_FORCE_MSEG:
RACK_OPTS_INC(tcp_rack_force_max_seg);
@@ -23443,6 +25026,24 @@
case TCP_RACK_PACE_MAX_SEG:
/* Max segments size in a pace in bytes */
RACK_OPTS_INC(tcp_rack_max_seg);
+ if ((rack->dgp_on == 1) &&
+ (rack->r_ctl.pacing_method & RACK_DGP_PACING)) {
+ /*
+ * If we set a max-seg and are doing DGP then
+ * we now fall under the pacing limits not the
+ * DGP ones.
+ */
+ if (tcp_can_enable_pacing() == 0) {
+ error = ENOSPC;
+ break;
+ }
+ /*
+ * Now change up the flags and counts to be correct.
+ */
+ rack->r_ctl.pacing_method |= RACK_REG_PACING;
+ tcp_dec_dgp_pacing_cnt();
+ rack->r_ctl.pacing_method &= ~RACK_DGP_PACING;
+ }
if (optval <= MAX_USER_SET_SEG)
rack->rc_user_set_max_segs = optval;
else
@@ -23452,6 +25053,18 @@
case TCP_RACK_PACE_RATE_REC:
/* Set the fixed pacing rate in Bytes per second ca */
RACK_OPTS_INC(tcp_rack_pace_rate_rec);
+ if (rack->r_ctl.side_chan_dis_mask & CCSP_DIS_MASK) {
+ error = EPERM;
+ break;
+ }
+ if (rack->dgp_on) {
+ /*
+ * We are already pacing another
+ * way.
+ */
+ error = EBUSY;
+ break;
+ }
rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0)
rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
@@ -23470,6 +25083,18 @@
case TCP_RACK_PACE_RATE_SS:
/* Set the fixed pacing rate in Bytes per second ca */
RACK_OPTS_INC(tcp_rack_pace_rate_ss);
+ if (rack->r_ctl.side_chan_dis_mask & CCSP_DIS_MASK) {
+ error = EPERM;
+ break;
+ }
+ if (rack->dgp_on) {
+ /*
+ * We are already pacing another
+ * way.
+ */
+ error = EBUSY;
+ break;
+ }
rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0)
rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
@@ -23488,6 +25113,18 @@
case TCP_RACK_PACE_RATE_CA:
/* Set the fixed pacing rate in Bytes per second ca */
RACK_OPTS_INC(tcp_rack_pace_rate_ca);
+ if (rack->r_ctl.side_chan_dis_mask & CCSP_DIS_MASK) {
+ error = EPERM;
+ break;
+ }
+ if (rack->dgp_on) {
+ /*
+ * We are already pacing another
+ * way.
+ */
+ error = EBUSY;
+ break;
+ }
rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0)
rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
@@ -23571,6 +25208,41 @@
rack->r_rack_hw_rate_caps = 0;
}
break;
+ case TCP_DGP_UPPER_BOUNDS:
+ {
+ uint8_t val;
+ val = optval & 0x0000ff;
+ rack->r_ctl.rack_per_upper_bound_ca = val;
+ val = (optval >> 16) & 0x0000ff;
+ rack->r_ctl.rack_per_upper_bound_ss = val;
+ break;
+ }
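
A worked example of the packing above (illustrative, not from this patch): the CA bound rides in the low byte and the SS bound in bits 16-23, matching the getsockopt side later in this file.

uint32_t optval = (80 << 16) | 70;
/* -> rack_per_upper_bound_ss = 80, rack_per_upper_bound_ca = 70 */
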
+ case TCP_SS_EEXIT: /* URL:eexit */
+ if (optval > 0) {
+ rack->r_ctl.gp_rnd_thresh = optval & 0x0ff;
+ if (optval & 0x10000) {
+ rack->r_ctl.gate_to_fs = 1;
+ } else {
+ rack->r_ctl.gate_to_fs = 0;
+ }
+ if (optval & 0x20000) {
+ rack->r_ctl.use_gp_not_last = 1;
+ } else {
+ rack->r_ctl.use_gp_not_last = 0;
+ }
+ if (optval & 0xfffc0000) {
+ uint32_t v;
+
+ v = (optval >> 18) & 0x00003fff;
+ if (v >= 1000)
+ rack->r_ctl.gp_gain_req = v;
+ }
+ } else {
+ /* We do not do ss early exit at all */
+ rack->rc_initial_ss_comp = 1;
+ rack->r_ctl.gp_rnd_thresh = 0;
+ }
+ break;
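
A worked example of the TCP_SS_EEXIT encoding above (illustrative, not from this patch):

/* 5-round threshold, gate-to-fs on, gain requirement 1200
 * (values below 1000 in bits 18 and up are ignored by the code above) */
uint32_t optval = (1200 << 18) | 0x10000 | 5;
/* -> gp_rnd_thresh = 5, gate_to_fs = 1, use_gp_not_last = 0,
 *    gp_gain_req = 1200; optval = 0 disables early exit entirely */
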
case TCP_RACK_SPLIT_LIMIT:
RACK_OPTS_INC(tcp_split_limit);
rack->r_ctl.rc_split_limit = optval;
@@ -23681,6 +25353,50 @@
else
rack->r_ctl.rc_rate_sample_method = optval;
break;
+ case TCP_HONOR_HPTS_MIN:
+ RACK_OPTS_INC(tcp_honor_hpts);
+ if (optval) {
+ rack->r_use_hpts_min = 1;
+ /*
+ * Must be between 2 and 80% to be a reduction, else
+ * we keep the default (10%).
+ */
+ if ((optval > 1) && (optval <= 80)) {
+ rack->r_ctl.max_reduction = optval;
+ }
+ } else
+ rack->r_use_hpts_min = 0;
+ break;
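
Illustrative values for the option above (not from this patch): optval = 30 enables honoring the hpts min with a 30% max reduction, optval = 1 enables it but keeps the default 10% reduction, and optval = 0 turns it off.
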
+ case TCP_REC_IS_DYN: /* URL:dynrec */
+ RACK_OPTS_INC(tcp_dyn_rec);
+ if (optval)
+ rack->rc_gp_no_rec_chg = 1;
+ else
+ rack->rc_gp_no_rec_chg = 0;
+ break;
+ case TCP_NO_TIMELY:
+ RACK_OPTS_INC(tcp_notimely);
+ if (optval) {
+ rack->rc_skip_timely = 1;
+ rack->r_ctl.rack_per_of_gp_rec = 90;
+ rack->r_ctl.rack_per_of_gp_ca = 100;
+ rack->r_ctl.rack_per_of_gp_ss = 250;
+ } else {
+ rack->rc_skip_timely = 0;
+ }
+ break;
+ case TCP_GP_USE_LTBW:
+ if (optval == 0) {
+ rack->use_lesser_lt_bw = 0;
+ rack->dis_lt_bw = 1;
+ } else if (optval == 1) {
+ rack->use_lesser_lt_bw = 1;
+ rack->dis_lt_bw = 0;
+ } else if (optval == 2) {
+ rack->use_lesser_lt_bw = 0;
+ rack->dis_lt_bw = 0;
+ }
+ break;
case TCP_DATA_AFTER_CLOSE:
RACK_OPTS_INC(tcp_data_after_close);
if (optval)
@@ -23695,6 +25411,431 @@
return (error);
}
+static void
+rack_inherit(struct tcpcb *tp, struct inpcb *parent)
+{
+ /*
+ * A new connection has been created (tp) and
+ * the parent is the inpcb given. We want to
+ * apply a read-lock to the parent (we are already
+ * holding a write lock on the tp) and copy anything
+ * out of the rack specific data as long as its tfb is
+ * the same as ours i.e. we are the same stack. Otherwise
+ * we just return.
+ */
+ struct tcpcb *par;
+ struct tcp_rack *dest, *src;
+ int cnt = 0;
+
+ par = intotcpcb(parent);
+ if (par->t_fb != tp->t_fb) {
+ /* Not the same stack */
+ tcp_log_socket_option(tp, 0, 0, 1);
+ return;
+ }
+ /* Ok if we reach here lets setup the two rack pointers */
+ dest = (struct tcp_rack *)tp->t_fb_ptr;
+ src = (struct tcp_rack *)par->t_fb_ptr;
+ if ((src == NULL) || (dest == NULL)) {
+ /* Huh? */
+ tcp_log_socket_option(tp, 0, 0, 2);
+ return;
+ }
+ /* Now copy out anything we wish to inherit i.e. things in socket-options */
+ /* TCP_RACK_PROFILE we can't know, but we can set DGP if it's on */
+ if ((src->dgp_on) && (dest->dgp_on == 0)) {
+ /* Profile 1 had to be set via sock opt */
+ rack_set_dgp(dest);
+ cnt++;
+ }
+ /* TCP_RACK_SET_RXT_OPTIONS */
+ if (dest->full_size_rxt != src->full_size_rxt) {
+ dest->full_size_rxt = src->full_size_rxt;
+ cnt++;
+ }
+ if (dest->shape_rxt_to_pacing_min != src->shape_rxt_to_pacing_min) {
+ dest->shape_rxt_to_pacing_min = src->shape_rxt_to_pacing_min;
+ cnt++;
+ }
+ /* TCP_RACK_DSACK_OPT */
+ if (dest->rc_rack_tmr_std_based != src->rc_rack_tmr_std_based) {
+ dest->rc_rack_tmr_std_based = src->rc_rack_tmr_std_based;
+ cnt++;
+ }
+ if (dest->rc_rack_use_dsack != src->rc_rack_use_dsack) {
+ dest->rc_rack_use_dsack = src->rc_rack_use_dsack;
+ cnt++;
+ }
+ /* TCP_RACK_PACING_DIVISOR */
+ if (dest->r_ctl.pace_len_divisor != src->r_ctl.pace_len_divisor) {
+ dest->r_ctl.pace_len_divisor = src->r_ctl.pace_len_divisor;
+ cnt++;
+ }
+ /* TCP_RACK_HI_BETA */
+ if (src->rack_hibeta != dest->rack_hibeta) {
+ cnt++;
+ if (src->rack_hibeta) {
+ dest->r_ctl.rc_saved_beta.beta = src->r_ctl.rc_saved_beta.beta;
+ dest->rack_hibeta = 1;
+ } else {
+ dest->rack_hibeta = 0;
+ }
+ }
+ /* TCP_RACK_TIMER_SLOP */
+ if (dest->r_ctl.timer_slop != src->r_ctl.timer_slop) {
+ dest->r_ctl.timer_slop = src->r_ctl.timer_slop;
+ cnt++;
+ }
+ /* TCP_RACK_PACING_BETA_ECN */
+ if (dest->r_ctl.rc_saved_beta.beta_ecn != src->r_ctl.rc_saved_beta.beta_ecn) {
+ dest->r_ctl.rc_saved_beta.beta_ecn = src->r_ctl.rc_saved_beta.beta_ecn;
+ cnt++;
+ }
+ if (dest->r_ctl.rc_saved_beta.newreno_flags != src->r_ctl.rc_saved_beta.newreno_flags) {
+ dest->r_ctl.rc_saved_beta.newreno_flags = src->r_ctl.rc_saved_beta.newreno_flags;
+ cnt++;
+ }
+ /* We do not do TCP_DEFER_OPTIONS */
+ /* TCP_RACK_MEASURE_CNT */
+ if (dest->r_ctl.req_measurements != src->r_ctl.req_measurements) {
+ dest->r_ctl.req_measurements = src->r_ctl.req_measurements;
+ cnt++;
+ }
+ /* TCP_HDWR_UP_ONLY */
+ if (dest->r_up_only != src->r_up_only) {
+ dest->r_up_only = src->r_up_only;
+ cnt++;
+ }
+ /* TCP_FILLCW_RATE_CAP */
+ if (dest->r_ctl.fillcw_cap != src->r_ctl.fillcw_cap) {
+ dest->r_ctl.fillcw_cap = src->r_ctl.fillcw_cap;
+ cnt++;
+ }
+ /* TCP_PACING_RATE_CAP */
+ if (dest->r_ctl.bw_rate_cap != src->r_ctl.bw_rate_cap) {
+ dest->r_ctl.bw_rate_cap = src->r_ctl.bw_rate_cap;
+ cnt++;
+ }
+ /* A listener can't set TCP_HYBRID_PACING */
+ /* TCP_SIDECHAN_DIS */
+ if (dest->r_ctl.side_chan_dis_mask != src->r_ctl.side_chan_dis_mask) {
+ dest->r_ctl.side_chan_dis_mask = src->r_ctl.side_chan_dis_mask;
+ cnt++;
+ }
+ /* TCP_SHARED_CWND_TIME_LIMIT */
+ if (dest->r_limit_scw != src->r_limit_scw) {
+ dest->r_limit_scw = src->r_limit_scw;
+ cnt++;
+ }
+ /* TCP_POLICER_DETECT */
+ if (dest->r_ctl.policer_rxt_threshold != src->r_ctl.policer_rxt_threshold) {
+ dest->r_ctl.policer_rxt_threshold = src->r_ctl.policer_rxt_threshold;
+ cnt++;
+ }
+ if (dest->r_ctl.policer_avg_threshold != src->r_ctl.policer_avg_threshold) {
+ dest->r_ctl.policer_avg_threshold = src->r_ctl.policer_avg_threshold;
+ cnt++;
+ }
+ if (dest->r_ctl.policer_med_threshold != src->r_ctl.policer_med_threshold) {
+ dest->r_ctl.policer_med_threshold = src->r_ctl.policer_med_threshold;
+ cnt++;
+ }
+ if (dest->policer_detect_on != src->policer_detect_on) {
+ dest->policer_detect_on = src->policer_detect_on;
+ cnt++;
+ }
+
+ if (dest->r_ctl.saved_policer_val != src->r_ctl.saved_policer_val) {
+ dest->r_ctl.saved_policer_val = src->r_ctl.saved_policer_val;
+ cnt++;
+ }
+ /* TCP_POLICER_MSS */
+ if (dest->r_ctl.policer_del_mss != src->r_ctl.policer_del_mss) {
+ dest->r_ctl.policer_del_mss = src->r_ctl.policer_del_mss;
+ cnt++;
+ }
+
+ if (dest->r_ctl.pol_bw_comp != src->r_ctl.pol_bw_comp) {
+ dest->r_ctl.pol_bw_comp = src->r_ctl.pol_bw_comp;
+ cnt++;
+ }
+
+ if (dest->r_ctl.policer_alt_median != src->r_ctl.policer_alt_median) {
+ dest->r_ctl.policer_alt_median = src->r_ctl.policer_alt_median;
+ cnt++;
+ }
+ /* TCP_RACK_PACE_TO_FILL */
+ if (dest->rc_pace_to_cwnd != src->rc_pace_to_cwnd) {
+ dest->rc_pace_to_cwnd = src->rc_pace_to_cwnd;
+ cnt++;
+ }
+ if (dest->rc_pace_fill_if_rttin_range != src->rc_pace_fill_if_rttin_range) {
+ dest->rc_pace_fill_if_rttin_range = src->rc_pace_fill_if_rttin_range;
+ cnt++;
+ }
+ if (dest->rtt_limit_mul != src->rtt_limit_mul) {
+ dest->rtt_limit_mul = src->rtt_limit_mul;
+ cnt++;
+ }
+ /* TCP_RACK_NO_PUSH_AT_MAX */
+ if (dest->r_ctl.rc_no_push_at_mrtt != src->r_ctl.rc_no_push_at_mrtt) {
+ dest->r_ctl.rc_no_push_at_mrtt = src->r_ctl.rc_no_push_at_mrtt;
+ cnt++;
+ }
+ /* TCP_SHARED_CWND_ENABLE */
+ if (dest->rack_enable_scwnd != src->rack_enable_scwnd) {
+ dest->rack_enable_scwnd = src->rack_enable_scwnd;
+ cnt++;
+ }
+ /* TCP_USE_CMP_ACKS */
+ if (dest->r_use_cmp_ack != src->r_use_cmp_ack) {
+ dest->r_use_cmp_ack = src->r_use_cmp_ack;
+ cnt++;
+ }
+
+ if (dest->r_mbuf_queue != src->r_mbuf_queue) {
+ dest->r_mbuf_queue = src->r_mbuf_queue;
+ cnt++;
+ }
+ /* TCP_RACK_MBUF_QUEUE */
+ if (dest->r_mbuf_queue != src->r_mbuf_queue) {
+ dest->r_mbuf_queue = src->r_mbuf_queue;
+ cnt++;
+ }
+ if (dest->r_mbuf_queue || dest->rc_always_pace || dest->r_use_cmp_ack) {
+ tp->t_flags2 |= TF2_SUPPORTS_MBUFQ;
+ } else {
+ tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ;
+ }
+ if (dest->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) {
+ tp->t_flags2 |= TF2_MBUF_ACKCMP;
+ }
+ /* TCP_RACK_NONRXT_CFG_RATE */
+ if (dest->rack_rec_nonrxt_use_cr != src->rack_rec_nonrxt_use_cr) {
+ dest->rack_rec_nonrxt_use_cr = src->rack_rec_nonrxt_use_cr;
+ cnt++;
+ }
+ /* TCP_NO_PRR */
+ if (dest->rack_no_prr != src->rack_no_prr) {
+ dest->rack_no_prr = src->rack_no_prr;
+ cnt++;
+ }
+ if (dest->no_prr_addback != src->no_prr_addback) {
+ dest->no_prr_addback = src->no_prr_addback;
+ cnt++;
+ }
+ /* RACK_CSPR_IS_FCC */
+ if (dest->cspr_is_fcc != src->cspr_is_fcc) {
+ dest->cspr_is_fcc = src->cspr_is_fcc;
+ cnt++;
+ }
+ /* TCP_TIMELY_DYN_ADJ */
+ if (dest->rc_gp_dyn_mul != src->rc_gp_dyn_mul) {
+ dest->rc_gp_dyn_mul = src->rc_gp_dyn_mul;
+ cnt++;
+ }
+ if (dest->r_ctl.rack_per_of_gp_ca != src->r_ctl.rack_per_of_gp_ca) {
+ dest->r_ctl.rack_per_of_gp_ca = src->r_ctl.rack_per_of_gp_ca;
+ cnt++;
+ }
+ /* TCP_RACK_DO_DETECTION */
+ if (dest->do_detection != src->do_detection) {
+ dest->do_detection = src->do_detection;
+ cnt++;
+ }
+ /* TCP_RACK_TLP_USE */
+ if (dest->rack_tlp_threshold_use != src->rack_tlp_threshold_use) {
+ dest->rack_tlp_threshold_use = src->rack_tlp_threshold_use;
+ cnt++;
+ }
+ /* we don't allow inheritance of TCP_RACK_PACE_ALWAYS */
+ /* TCP_BBR_RACK_INIT_RATE */
+ if (dest->r_ctl.init_rate != src->r_ctl.init_rate) {
+ dest->r_ctl.init_rate = src->r_ctl.init_rate;
+ cnt++;
+ }
+ /* TCP_RACK_FORCE_MSEG */
+ if (dest->rc_force_max_seg != src->rc_force_max_seg) {
+ dest->rc_force_max_seg = src->rc_force_max_seg;
+ cnt++;
+ }
+ /* TCP_RACK_PACE_MIN_SEG */
+ if (dest->r_ctl.rc_user_set_min_segs != src->r_ctl.rc_user_set_min_segs) {
+ dest->r_ctl.rc_user_set_min_segs = src->r_ctl.rc_user_set_min_segs;
+ cnt++;
+ }
+ /* we don't allow TCP_RACK_PACE_MAX_SEG */
+ /* TCP_RACK_PACE_RATE_REC, TCP_RACK_PACE_RATE_SS, TCP_RACK_PACE_RATE_CA */
+ if (dest->r_ctl.rc_fixed_pacing_rate_ca != src->r_ctl.rc_fixed_pacing_rate_ca) {
+ dest->r_ctl.rc_fixed_pacing_rate_ca = src->r_ctl.rc_fixed_pacing_rate_ca;
+ cnt++;
+ }
+ if (dest->r_ctl.rc_fixed_pacing_rate_ss != src->r_ctl.rc_fixed_pacing_rate_ss) {
+ dest->r_ctl.rc_fixed_pacing_rate_ss = src->r_ctl.rc_fixed_pacing_rate_ss;
+ cnt++;
+ }
+ if (dest->r_ctl.rc_fixed_pacing_rate_rec != src->r_ctl.rc_fixed_pacing_rate_rec) {
+ dest->r_ctl.rc_fixed_pacing_rate_rec = src->r_ctl.rc_fixed_pacing_rate_rec;
+ cnt++;
+ }
+ /* TCP_RACK_GP_INCREASE_REC, TCP_RACK_GP_INCREASE_CA, TCP_RACK_GP_INCREASE_SS */
+ if (dest->r_ctl.rack_per_of_gp_rec != src->r_ctl.rack_per_of_gp_rec) {
+ dest->r_ctl.rack_per_of_gp_rec = src->r_ctl.rack_per_of_gp_rec;
+ cnt++;
+ }
+ if (dest->r_ctl.rack_per_of_gp_ca != src->r_ctl.rack_per_of_gp_ca) {
+ dest->r_ctl.rack_per_of_gp_ca = src->r_ctl.rack_per_of_gp_ca;
+ cnt++;
+ }
+
+ if (dest->r_ctl.rack_per_of_gp_ss != src->r_ctl.rack_per_of_gp_ss) {
+ dest->r_ctl.rack_per_of_gp_ss = src->r_ctl.rack_per_of_gp_ss;
+ cnt++;
+ }
+ /* TCP_RACK_RR_CONF */
+ if (dest->r_rr_config != src->r_rr_config) {
+ dest->r_rr_config = src->r_rr_config;
+ cnt++;
+ }
+ /* TCP_PACING_DND */
+ if (dest->rc_pace_dnd != src->rc_pace_dnd) {
+ dest->rc_pace_dnd = src->rc_pace_dnd;
+ cnt++;
+ }
+ /* TCP_HDWR_RATE_CAP */
+ if (dest->r_rack_hw_rate_caps != src->r_rack_hw_rate_caps) {
+ dest->r_rack_hw_rate_caps = src->r_rack_hw_rate_caps;
+ cnt++;
+ }
+ /* TCP_DGP_UPPER_BOUNDS */
+ if (dest->r_ctl.rack_per_upper_bound_ca != src->r_ctl.rack_per_upper_bound_ca) {
+ dest->r_ctl.rack_per_upper_bound_ca = src->r_ctl.rack_per_upper_bound_ca;
+ cnt++;
+ }
+ if (dest->r_ctl.rack_per_upper_bound_ss != src->r_ctl.rack_per_upper_bound_ss) {
+ dest->r_ctl.rack_per_upper_bound_ss = src->r_ctl.rack_per_upper_bound_ss;
+ cnt++;
+ }
+ /* TCP_SS_EEXIT */
+ if (dest->r_ctl.gp_rnd_thresh != src->r_ctl.gp_rnd_thresh) {
+ dest->r_ctl.gp_rnd_thresh = src->r_ctl.gp_rnd_thresh;
+ cnt++;
+ }
+ if (dest->r_ctl.gate_to_fs != src->r_ctl.gate_to_fs) {
+ dest->r_ctl.gate_to_fs = src->r_ctl.gate_to_fs;
+ cnt++;
+ }
+ if (dest->r_ctl.use_gp_not_last != src->r_ctl.use_gp_not_last) {
+ dest->r_ctl.use_gp_not_last = src->r_ctl.use_gp_not_last;
+ cnt++;
+ }
+ if (dest->r_ctl.gp_gain_req != src->r_ctl.gp_gain_req) {
+ dest->r_ctl.gp_gain_req = src->r_ctl.gp_gain_req;
+ cnt++;
+ }
+ /* TCP_BBR_HDWR_PACE */
+ if (dest->rack_hdw_pace_ena != src->rack_hdw_pace_ena) {
+ dest->rack_hdw_pace_ena = src->rack_hdw_pace_ena;
+ cnt++;
+ }
+ if (dest->rack_attempt_hdwr_pace != src->rack_attempt_hdwr_pace) {
+ dest->rack_attempt_hdwr_pace = src->rack_attempt_hdwr_pace;
+ cnt++;
+ }
+ /* TCP_RACK_PRR_SENDALOT */
+ if (dest->r_ctl.rc_prr_sendalot != src->r_ctl.rc_prr_sendalot) {
+ dest->r_ctl.rc_prr_sendalot = src->r_ctl.rc_prr_sendalot;
+ cnt++;
+ }
+ /* TCP_RACK_MIN_TO */
+ if (dest->r_ctl.rc_min_to != src->r_ctl.rc_min_to) {
+ dest->r_ctl.rc_min_to = src->r_ctl.rc_min_to;
+ cnt++;
+ }
+ /* TCP_RACK_EARLY_SEG */
+ if (dest->r_ctl.rc_early_recovery_segs != src->r_ctl.rc_early_recovery_segs) {
+ dest->r_ctl.rc_early_recovery_segs = src->r_ctl.rc_early_recovery_segs;
+ cnt++;
+ }
+ /* TCP_RACK_ENABLE_HYSTART */
+ if (par->t_ccv.flags != tp->t_ccv.flags) {
+ cnt++;
+ if (par->t_ccv.flags & CCF_HYSTART_ALLOWED) {
+ tp->t_ccv.flags |= CCF_HYSTART_ALLOWED;
+ if (rack_do_hystart > RACK_HYSTART_ON)
+ tp->t_ccv.flags |= CCF_HYSTART_CAN_SH_CWND;
+ if (rack_do_hystart > RACK_HYSTART_ON_W_SC)
+ tp->t_ccv.flags |= CCF_HYSTART_CONS_SSTH;
+ } else {
+ tp->t_ccv.flags &= ~(CCF_HYSTART_ALLOWED|CCF_HYSTART_CAN_SH_CWND|CCF_HYSTART_CONS_SSTH);
+ }
+ }
+ /* TCP_RACK_REORD_THRESH */
+ if (dest->r_ctl.rc_reorder_shift != src->r_ctl.rc_reorder_shift) {
+ dest->r_ctl.rc_reorder_shift = src->r_ctl.rc_reorder_shift;
+ cnt++;
+ }
+ /* TCP_RACK_REORD_FADE */
+ if (dest->r_ctl.rc_reorder_fade != src->r_ctl.rc_reorder_fade) {
+ dest->r_ctl.rc_reorder_fade = src->r_ctl.rc_reorder_fade;
+ cnt++;
+ }
+ /* TCP_RACK_TLP_THRESH */
+ if (dest->r_ctl.rc_tlp_threshold != src->r_ctl.rc_tlp_threshold) {
+ dest->r_ctl.rc_tlp_threshold = src->r_ctl.rc_tlp_threshold;
+ cnt++;
+ }
+ /* TCP_BBR_USE_RACK_RR */
+ if (dest->use_rack_rr != src->use_rack_rr) {
+ dest->use_rack_rr = src->use_rack_rr;
+ cnt++;
+ }
+ /* TCP_RACK_PKT_DELAY */
+ if (dest->r_ctl.rc_pkt_delay != src->r_ctl.rc_pkt_delay) {
+ dest->r_ctl.rc_pkt_delay = src->r_ctl.rc_pkt_delay;
+ cnt++;
+ }
+ /* TCP_DELACK will get copied via the main code if applicable */
+ /* TCP_BBR_RACK_RTT_USE */
+ if (dest->r_ctl.rc_rate_sample_method != src->r_ctl.rc_rate_sample_method) {
+ dest->r_ctl.rc_rate_sample_method = src->r_ctl.rc_rate_sample_method;
+ cnt++;
+ }
+ /* TCP_HONOR_HPTS_MIN */
+ if (dest->r_use_hpts_min != src->r_use_hpts_min) {
+ dest->r_use_hpts_min = src->r_use_hpts_min;
+ cnt++;
+ }
+ if (dest->r_ctl.max_reduction != src->r_ctl.max_reduction) {
+ dest->r_ctl.max_reduction = src->r_ctl.max_reduction;
+ cnt++;
+ }
+ /* TCP_REC_IS_DYN */
+ if (dest->rc_gp_no_rec_chg != src->rc_gp_no_rec_chg) {
+ dest->rc_gp_no_rec_chg = src->rc_gp_no_rec_chg;
+ cnt++;
+ }
+ if (dest->rc_skip_timely != src->rc_skip_timely) {
+ dest->rc_skip_timely = src->rc_skip_timely;
+ cnt++;
+ }
+ /* TCP_DATA_AFTER_CLOSE */
+ if (dest->rc_allow_data_af_clo != src->rc_allow_data_af_clo) {
+ dest->rc_allow_data_af_clo = src->rc_allow_data_af_clo;
+ cnt++;
+ }
+ /* TCP_GP_USE_LTBW */
+ if (src->use_lesser_lt_bw != dest->use_lesser_lt_bw) {
+ dest->use_lesser_lt_bw = src->use_lesser_lt_bw;
+ cnt++;
+ }
+ if (dest->dis_lt_bw != src->dis_lt_bw) {
+ dest->dis_lt_bw = src->dis_lt_bw;
+ cnt++;
+ }
+ tcp_log_socket_option(tp, 0, cnt, 0);
+}
+
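
As an illustration of how the new hook is meant to be driven (a sketch under assumptions, not part of this patch; only the tfb_inherit member and rack_inherit()'s signature are taken from it), a stack-agnostic caller would invoke the hook once the child tcpcb exists, passing the listener's inpcb as the parent:

static void
example_inherit_from_listener(struct tcpcb *child, struct inpcb *listener)
{
	if (child->t_fb->tfb_inherit != NULL)
		child->t_fb->tfb_inherit(child, listener);
}
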
static void
rack_apply_deferred_options(struct tcp_rack *rack)
@@ -23778,7 +25919,10 @@
.tfb_switch_failed = rack_switch_failed,
.tfb_early_wake_check = rack_wake_check,
.tfb_compute_pipe = rack_compute_pipe,
+ .tfb_stack_info = rack_stack_information,
+ .tfb_inherit = rack_inherit,
.tfb_flags = TCP_FUNC_OUTPUT_CANDROP,
+
};
/*
@@ -23846,7 +25990,6 @@
/* Already read in and sanity checked in sosetopt(). */
if (inp->inp_socket) {
rack->client_bufferlvl = inp->inp_socket->so_peerprio;
- rack_client_buffer_level_set(rack);
}
break;
}
@@ -23859,7 +26002,6 @@
/* Pacing related ones */
case TCP_RACK_PACE_ALWAYS: /* URL:pace_always */
case TCP_BBR_RACK_INIT_RATE: /* URL:irate */
- case TCP_BBR_IWINTSO: /* URL:tso_iwin */
case TCP_RACK_PACE_MIN_SEG: /* URL:pace_min_seg */
case TCP_RACK_PACE_MAX_SEG: /* URL:pace_max_seg */
case TCP_RACK_FORCE_MSEG: /* URL:force_max_seg */
@@ -23874,12 +26016,12 @@
case TCP_HDWR_RATE_CAP: /* URL:hdwrcap boolean */
case TCP_PACING_RATE_CAP: /* URL:cap -- used by side-channel */
case TCP_HDWR_UP_ONLY: /* URL:uponly -- hardware pacing boolean */
- case TCP_RACK_PACING_BETA: /* URL:pacing_beta */
+ case TCP_FILLCW_RATE_CAP: /* URL:fillcw_cap */
case TCP_RACK_PACING_BETA_ECN: /* URL:pacing_beta_ecn */
case TCP_RACK_PACE_TO_FILL: /* URL:fillcw */
- case TCP_RACK_DGP_IN_REC: /* URL:dgpinrec */
/* End pacing related */
- case TCP_RXT_CLAMP: /* URL:rxtclamp */
+ case TCP_POLICER_DETECT: /* URL:pol_det */
+ case TCP_POLICER_MSS: /* URL:pol_mss */
case TCP_DELACK: /* URL:delack (in base TCP i.e. tcp_hints along with cc etc ) */
case TCP_RACK_PRR_SENDALOT: /* URL:prr_sendalot */
case TCP_RACK_MIN_TO: /* URL:min_to */
@@ -23901,7 +26043,8 @@
case TCP_RACK_NO_PUSH_AT_MAX: /* URL:npush */
case TCP_SHARED_CWND_TIME_LIMIT: /* URL:lscwnd */
case TCP_RACK_PROFILE: /* URL:profile */
- case TCP_HYBRID_PACING: /* URL:hybrid */
+ case TCP_SIDECHAN_DIS: /* URL:scodm */
+ case TCP_HYBRID_PACING: /* URL:pacing=hybrid */
case TCP_USE_CMP_ACKS: /* URL:cmpack */
case TCP_RACK_ABC_VAL: /* URL:labc */
case TCP_REC_ABC_VAL: /* URL:reclabc */
@@ -23913,8 +26056,15 @@
case TCP_RACK_SET_RXT_OPTIONS: /* URL:rxtsz */
case TCP_RACK_HI_BETA: /* URL:hibeta */
case TCP_RACK_SPLIT_LIMIT: /* URL:split */
+ case TCP_SS_EEXIT: /* URL:eexit */
+ case TCP_DGP_UPPER_BOUNDS: /* URL:upper */
case TCP_RACK_PACING_DIVISOR: /* URL:divisor */
case TCP_PACING_DND: /* URL:dnd */
+ case TCP_NO_TIMELY: /* URL:notimely */
+ case RACK_CSPR_IS_FCC: /* URL:csprisfcc */
+ case TCP_HONOR_HPTS_MIN: /* URL:hptsmin */
+ case TCP_REC_IS_DYN: /* URL:dynrec */
+ case TCP_GP_USE_LTBW: /* URL:useltbw */
goto process_opt;
break;
default:
@@ -23922,14 +26072,14 @@
return (tcp_default_ctloutput(tp, sopt));
break;
}
-
default:
INP_WUNLOCK(inp);
return (0);
}
process_opt:
INP_WUNLOCK(inp);
- if (sopt->sopt_name == TCP_PACING_RATE_CAP) {
+ if ((sopt->sopt_name == TCP_PACING_RATE_CAP) ||
+ (sopt->sopt_name == TCP_FILLCW_RATE_CAP)) {
error = sooptcopyin(sopt, &loptval, sizeof(loptval), sizeof(loptval));
/*
* We truncate it down to 32 bits for the socket-option trace this
@@ -23953,11 +26103,10 @@
if (rack->defer_options && (rack->gp_ready == 0) &&
(sopt->sopt_name != TCP_DEFER_OPTIONS) &&
(sopt->sopt_name != TCP_HYBRID_PACING) &&
- (sopt->sopt_name != TCP_RACK_PACING_BETA) &&
(sopt->sopt_name != TCP_RACK_SET_RXT_OPTIONS) &&
(sopt->sopt_name != TCP_RACK_PACING_BETA_ECN) &&
(sopt->sopt_name != TCP_RACK_MEASURE_CNT)) {
- /* Options are beind deferred */
+ /* Options are being deferred */
if (rack_add_deferred_option(rack, sopt->sopt_name, loptval)) {
INP_WUNLOCK(inp);
return (0);
@@ -24016,6 +26165,7 @@
ti->tcpi_snd_zerowin = tp->t_sndzerowin;
ti->tcpi_total_tlp = tp->t_sndtlppack;
ti->tcpi_total_tlp_bytes = tp->t_sndtlpbyte;
+ ti->tcpi_rttmin = tp->t_rttlow;
#ifdef NETFLIX_STATS
memcpy(&ti->tcpi_rxsyninfo, &tp->t_rxsyninfo, sizeof(struct tcpsyninfo));
#endif
@@ -24062,21 +26212,6 @@
* when you exit recovery.
*/
case TCP_RACK_PACING_BETA:
- if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0)
- error = EINVAL;
- else if (rack->rc_pacing_cc_set == 0)
- optval = rack->r_ctl.rc_saved_beta.beta;
- else {
- /*
- * Reach out into the CC data and report back what
- * I have previously set. Yeah it looks hackish but
- * we don't want to report the saved values.
- */
- if (tp->t_ccv.cc_data)
- optval = ((struct newreno *)tp->t_ccv.cc_data)->beta;
- else
- error = EINVAL;
- }
break;
/*
* Beta_ecn is the congestion control value for NewReno that influences how
@@ -24112,7 +26247,7 @@
optval |= 2;
}
break;
- case TCP_RACK_ENABLE_HYSTART:
+ case TCP_RACK_ENABLE_HYSTART:
{
if (tp->t_ccv.flags & CCF_HYSTART_ALLOWED) {
optval = RACK_HYSTART_ON;
@@ -24126,13 +26261,16 @@
}
break;
case TCP_RACK_DGP_IN_REC:
- optval = rack->r_ctl.full_dgp_in_rec;
+ error = EINVAL;
break;
case TCP_RACK_HI_BETA:
optval = rack->rack_hibeta;
break;
- case TCP_RXT_CLAMP:
- optval = rack->r_ctl.saved_rxt_clamp_val;
+ case TCP_POLICER_MSS:
+ optval = rack->r_ctl.policer_del_mss;
+ break;
+ case TCP_POLICER_DETECT:
+ optval = rack->r_ctl.saved_policer_val;
break;
case TCP_DEFER_OPTIONS:
optval = rack->defer_options;
@@ -24149,6 +26287,9 @@
case TCP_HDWR_UP_ONLY:
optval= rack->r_up_only;
break;
+ case TCP_FILLCW_RATE_CAP:
+ loptval = rack->r_ctl.fillcw_cap;
+ break;
case TCP_PACING_RATE_CAP:
loptval = rack->r_ctl.bw_rate_cap;
break;
@@ -24156,6 +26297,9 @@
/* You cannot retrieve a profile, its write only */
error = EINVAL;
break;
+ case TCP_SIDECHAN_DIS:
+ optval = rack->r_ctl.side_chan_dis_mask;
+ break;
case TCP_HYBRID_PACING:
/* You cannot retrieve hybrid pacing information, its write only */
error = EINVAL;
@@ -24165,8 +26309,6 @@
break;
case TCP_RACK_PACE_TO_FILL:
optval = rack->rc_pace_to_cwnd;
- if (optval && rack->r_fill_less_agg)
- optval++;
break;
case TCP_RACK_NO_PUSH_AT_MAX:
optval = rack->r_ctl.rc_no_push_at_mrtt;
@@ -24185,6 +26327,18 @@
else
optval = 0;
break;
+ case TCP_GP_USE_LTBW:
+ if (rack->dis_lt_bw) {
+ /* It is not used */
+ optval = 0;
+ } else if (rack->use_lesser_lt_bw) {
+ /* we use min() */
+ optval = 1;
+ } else {
+ /* we use max() */
+ optval = 2;
+ }
+ break;
case TCP_RACK_DO_DETECTION:
optval = rack->do_detection;
break;
@@ -24192,11 +26346,14 @@
/* Now do we use the LRO mbuf-queue feature */
optval = rack->r_mbuf_queue;
break;
+ case RACK_CSPR_IS_FCC:
+ optval = rack->cspr_is_fcc;
+ break;
case TCP_TIMELY_DYN_ADJ:
optval = rack->rc_gp_dyn_mul;
break;
case TCP_BBR_IWINTSO:
- optval = rack->rc_init_win;
+ error = EINVAL;
break;
case TCP_RACK_TLP_REDUCE:
/* RACK TLP cwnd reduction (bool) */
@@ -24242,6 +26399,18 @@
/* RACK reorder threshold (shift amount) */
optval = rack->r_ctl.rc_reorder_shift;
break;
+ case TCP_SS_EEXIT:
+ if (rack->r_ctl.gp_rnd_thresh) {
+ uint32_t v;
+
+ v = rack->r_ctl.gp_gain_req;
+ v <<= 17;
+ optval = v | (rack->r_ctl.gp_rnd_thresh & 0xff);
+ if (rack->r_ctl.gate_to_fs == 1)
+ optval |= 0x10000;
+ } else
+ optval = 0;
+ break;
case TCP_RACK_REORD_FADE:
/* Does reordering fade after ms time */
optval = rack->r_ctl.rc_reorder_fade;
@@ -24282,6 +26451,11 @@
case TCP_RACK_PACE_RATE_REC:
optval = rack->r_ctl.rc_fixed_pacing_rate_rec;
break;
+ case TCP_DGP_UPPER_BOUNDS:
+ optval = rack->r_ctl.rack_per_upper_bound_ss;
+ optval <<= 16;
+ optval |= rack->r_ctl.rack_per_upper_bound_ca;
+ break;
case TCP_RACK_GP_INCREASE_SS:
optval = rack->r_ctl.rack_per_of_gp_ca;
break;
@@ -24303,6 +26477,18 @@
case TCP_SHARED_CWND_TIME_LIMIT:
optval = rack->r_limit_scw;
break;
+ case TCP_HONOR_HPTS_MIN:
+ if (rack->r_use_hpts_min)
+ optval = rack->r_ctl.max_reduction;
+ else
+ optval = 0;
+ break;
+ case TCP_REC_IS_DYN:
+ optval = rack->rc_gp_no_rec_chg;
+ break;
+ case TCP_NO_TIMELY:
+ optval = rack->rc_skip_timely;
+ break;
case TCP_RACK_TIMER_SLOP:
optval = rack->r_ctl.timer_slop;
break;
@@ -24312,7 +26498,8 @@
}
INP_WUNLOCK(inp);
if (error == 0) {
- if (TCP_PACING_RATE_CAP)
+ if ((sopt->sopt_name == TCP_PACING_RATE_CAP) ||
+ (sopt->sopt_name == TCP_FILLCW_RATE_CAP))
error = sooptcopyout(sopt, &loptval, sizeof loptval);
else
error = sooptcopyout(sopt, &optval, sizeof optval);
diff --git a/sys/netinet/tcp_stacks/rack_pcm.c b/sys/netinet/tcp_stacks/rack_pcm.c
new file mode 100644
diff --git a/sys/netinet/tcp_stacks/sack_filter.h b/sys/netinet/tcp_stacks/sack_filter.h
--- a/sys/netinet/tcp_stacks/sack_filter.h
+++ b/sys/netinet/tcp_stacks/sack_filter.h
@@ -51,5 +51,10 @@
int sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks,
tcp_seq th_ack);
void sack_filter_reject(struct sack_filter *sf, struct sackblk *in);
+static inline uint8_t sack_filter_blks_used(struct sack_filter *sf)
+{
+ return (sf->sf_used);
+}
+
#endif
#endif
diff --git a/sys/netinet/tcp_stacks/tailq_hash.h b/sys/netinet/tcp_stacks/tailq_hash.h
--- a/sys/netinet/tcp_stacks/tailq_hash.h
+++ b/sys/netinet/tcp_stacks/tailq_hash.h
@@ -13,10 +13,12 @@
#define MAX_ALLOWED_SEQ_RANGE (SEQ_BUCKET_SIZE * (MAX_HASH_ENTRIES-1))
struct tailq_hash {
- struct rack_head ht[MAX_HASH_ENTRIES];
uint32_t min;
uint32_t max;
uint32_t count;
+ struct rack_sendmap *rsm_min;
+ struct rack_sendmap *rsm_max;
+ struct rack_head ht[MAX_HASH_ENTRIES];
};
struct rack_sendmap *
@@ -53,6 +55,10 @@
int
tqhash_trim(struct tailq_hash *hs, uint32_t th_ack);
+void
+tqhash_update_end(struct tailq_hash *hs, struct rack_sendmap *rsm,
+ uint32_t th_ack);
+
#define TQHASH_FOREACH(var, head) \
for ((var) = tqhash_min((head)); \
diff --git a/sys/netinet/tcp_stacks/tailq_hash.c b/sys/netinet/tcp_stacks/tailq_hash.c
--- a/sys/netinet/tcp_stacks/tailq_hash.c
+++ b/sys/netinet/tcp_stacks/tailq_hash.c
@@ -65,7 +65,6 @@
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_hpts.h>
-#include <netinet/tcp_ratelimit.h>
#include <netinet/tcp_accounting.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
@@ -100,6 +99,7 @@
#include "sack_filter.h"
#include "tcp_rack.h"
#include "tailq_hash.h"
+#include "opt_global.h"
struct rack_sendmap *
@@ -107,7 +107,7 @@
{
struct rack_sendmap *rsm;
- rsm = tqhash_find(hs, hs->min);
+ rsm = hs->rsm_min;
return(rsm);
}
@@ -116,7 +116,7 @@
{
struct rack_sendmap *rsm;
- rsm = tqhash_find(hs, (hs->max - 1));
+ rsm = hs->rsm_max;
return (rsm);
}
@@ -224,13 +224,19 @@
void
tqhash_remove(struct tailq_hash *hs, struct rack_sendmap *rsm, int type)
{
- TAILQ_REMOVE(&hs->ht[rsm->bindex], rsm, next);
+
hs->count--;
if (hs->count == 0) {
hs->min = hs->max;
+ hs->rsm_max = hs->rsm_min = NULL;
} else if (type == REMOVE_TYPE_CUMACK) {
hs->min = rsm->r_end;
+ hs->rsm_min = tqhash_next(hs, rsm);
+ } else if (rsm == hs->rsm_max) {
+ hs->rsm_max = tqhash_prev(hs, rsm);
+ hs->max = hs->rsm_max->r_end;
}
+ TAILQ_REMOVE(&hs->ht[rsm->bindex], rsm, next);
}
int
@@ -240,6 +246,7 @@
int inserted = 0;
uint32_t ebucket;
+#ifdef INVARIANTS
if (hs->count > 0) {
if ((rsm->r_end - hs->min) > MAX_ALLOWED_SEQ_RANGE) {
return (-1);
@@ -249,6 +256,7 @@
return (-2);
}
}
+#endif
rsm->bindex = rsm->r_start / SEQ_BUCKET_SIZE;
rsm->bindex %= MAX_HASH_ENTRIES;
ebucket = rsm->r_end / SEQ_BUCKET_SIZE;
@@ -263,13 +271,17 @@
/* Special case */
hs->min = rsm->r_start;
hs->max = rsm->r_end;
+ hs->rsm_min = hs->rsm_max = rsm;
hs->count = 1;
} else {
hs->count++;
- if (SEQ_GT(rsm->r_end, hs->max))
+ if (SEQ_GEQ(rsm->r_end, hs->max)) {
hs->max = rsm->r_end;
- if (SEQ_LT(rsm->r_start, hs->min))
+ hs->rsm_max = rsm;
+ } if (SEQ_LEQ(rsm->r_start, hs->min)) {
hs->min = rsm->r_start;
+ hs->rsm_min = rsm;
+ }
}
/* Check the common case of inserting at the end */
l = TAILQ_LAST(&hs->ht[rsm->bindex], rack_head);
@@ -299,6 +311,7 @@
TAILQ_INIT(&hs->ht[i]);
}
hs->min = hs->max = 0;
+ hs->rsm_min = hs->rsm_max = NULL;
hs->count = 0;
}
@@ -339,3 +352,11 @@
return (0);
}
+void
+tqhash_update_end(struct tailq_hash *hs, struct rack_sendmap *rsm,
+ uint32_t th_ack)
+{
+ if (hs->max == rsm->r_end)
+ hs->max = th_ack;
+ rsm->r_end = th_ack;
+}
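
Editor's note (illustrative, not part of the patch): the changes above cache the
minimum and maximum sendmap entries in rsm_min/rsm_max so that tqhash_min() and
tqhash_max() become plain pointer loads, with tqhash_insert(), tqhash_remove()
and tqhash_update_end() keeping the cached pointers in step with hs->min and
hs->max. A hypothetical INVARIANTS-style check of that bookkeeping, assuming it
sits in tailq_hash.c next to the functions above (so the kernel headers are
already in scope), might look like:

static void
tqhash_verify_endpoints(struct tailq_hash *hs)
{
	struct rack_sendmap *rsm;

	if (hs->count == 0) {
		/* An empty hash must not point at stale entries. */
		KASSERT((hs->rsm_min == NULL) && (hs->rsm_max == NULL),
		    ("cached endpoints set on an empty tailq_hash"));
		return;
	}
	/* The cached minimum entry must still cover sequences at hs->min. */
	rsm = tqhash_min(hs);
	KASSERT((rsm != NULL) && SEQ_GEQ(rsm->r_end, hs->min),
	    ("cached min entry ends before hs->min"));
	/* The cached maximum entry must end exactly at hs->max. */
	rsm = tqhash_max(hs);
	KASSERT((rsm != NULL) && (rsm->r_end == hs->max),
	    ("cached max entry does not end at hs->max"));
}
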
diff --git a/sys/netinet/tcp_stacks/tcp_rack.h b/sys/netinet/tcp_stacks/tcp_rack.h
--- a/sys/netinet/tcp_stacks/tcp_rack.h
+++ b/sys/netinet/tcp_stacks/tcp_rack.h
@@ -48,6 +48,8 @@
#define RACK_MERGED 0x080000/* The RSM was merged */
#define RACK_PMTU_CHG 0x100000/* The path mtu changed on this guy */
#define RACK_STRADDLE 0x200000/* The seq straddles the bucket line */
+#define RACK_WAS_LOST 0x400000/* Is the rsm considered lost */
+#define RACK_IS_PCM 0x800000/* A PCM measurement is being taken */
#define RACK_NUM_OF_RETRANS 3
#define RACK_INITIAL_RTO 1000000 /* 1 second in microseconds */
@@ -63,6 +65,7 @@
uint32_t r_rtr_bytes; /* How many bytes have been retransmitted */
uint32_t r_flags : 24, /* Flags as defined above */
r_rtr_cnt : 8; /* Retran count, index this -1 to get time */
+ uint32_t r_act_rxt_cnt; /* The actual total count of transmits */
struct mbuf *m;
uint32_t soff;
uint32_t orig_m_len; /* The original mbuf len when we sent (can update) */
@@ -174,6 +177,8 @@
#define RACK_TO_FRM_PERSIST 5
#define RACK_TO_FRM_DELACK 6
+#define RCV_PATH_RTT_MS 10 /* How many ms between recv path RTT's */
+
struct rack_opts_stats {
uint64_t tcp_rack_tlp_reduce;
uint64_t tcp_rack_pace_always;
@@ -232,7 +237,7 @@
uint64_t tcp_rack_rtt_use;
uint64_t tcp_data_after_close;
uint64_t tcp_defer_opt;
- uint64_t tcp_rxt_clamp;
+ uint64_t tcp_pol_detect;
uint64_t tcp_rack_beta;
uint64_t tcp_rack_beta_ecn;
uint64_t tcp_rack_timer_slop;
@@ -242,6 +247,11 @@
uint64_t tcp_rack_pacing_divisor;
uint64_t tcp_rack_min_seg;
uint64_t tcp_dgp_in_rec;
+ uint64_t tcp_notimely;
+ uint64_t tcp_honor_hpts;
+ uint64_t tcp_dyn_rec;
+ uint64_t tcp_fillcw_rate_cap;
+ uint64_t tcp_pol_mss;
};
/* RTT shrink reasons */
@@ -263,6 +273,9 @@
#define TLP_USE_TWO_TWO 3 /* Use 2.2 behavior */
#define RACK_MIN_BW 8000 /* 64kbps in Bps */
+#define CCSP_DIS_MASK 0x0001
+#define HYBRID_DIS_MASK 0x0002
+
/* Rack quality indicators for GPUT measurements */
#define RACK_QUALITY_NONE 0 /* No quality stated */
#define RACK_QUALITY_HIGH 1 /* A normal measurement of a GP RTT */
@@ -319,6 +332,7 @@
*
*/
#define RACK_GP_HIST 4 /* How much goodput history do we maintain? */
+#define RETRAN_CNT_SIZE 16
#define RACK_NUM_FSB_DEBUG 16
#ifdef _KERNEL
@@ -342,6 +356,26 @@
struct tailq_hash;
+struct rack_pcm_info {
+ /* Base send time and s/e filled in by rack_log_output */
+ uint64_t send_time;
+ uint32_t sseq;
+ uint32_t eseq;
+ /* Ack's fill in the rest of the data */
+ uint16_t cnt;
+ /* Maximum acks present */
+ uint16_t cnt_alloc;
+};
+
+#define RACK_DEFAULT_PCM_ARRAY 16
+
+struct rack_pcm_stats {
+ uint32_t sseq;
+ uint32_t eseq;
+ uint64_t ack_time;
+};
+
+
struct rack_control {
/* Second cache line 0x40 from tcp_rack */
struct tailq_hash *tqh; /* Tree of all segments Lock(a) */
@@ -402,6 +436,7 @@
uint32_t rc_rcvtime; /* When we last received data */
uint32_t rc_num_split_allocs; /* num split map entries allocated */
uint32_t rc_split_limit; /* Limit from control var can be set by socket opt */
+ uint32_t rack_avg_rec_sends;
uint32_t rc_last_output_to;
uint32_t rc_went_idle_time;
@@ -452,19 +487,45 @@
struct tcp_sendfile_track *rc_last_sft;
uint32_t lt_seq; /* Seq at start of lt_bw gauge */
int32_t rc_rtt_diff; /* Timely style rtt diff of our gp_srtt */
- uint64_t last_sndbytes;
- uint64_t last_snd_rxt_bytes;
- uint64_t rxt_threshold;
uint64_t last_tmit_time_acked; /* Holds the last cumack point's last send time */
- uint32_t last_rnd_rxt_clamped;
- uint32_t num_of_clamps_applied;
- uint32_t clamp_options;
- uint32_t max_clamps;
+ /* Recovery stats */
+ uint64_t time_entered_recovery;
+ uint64_t bytes_acked_in_recovery;
+ /* Policer Detection */
+ uint64_t last_policer_sndbytes;
+ uint64_t last_policer_snd_rxt_bytes;
+ uint64_t policer_bw;
+ uint64_t last_sendtime;
+
+ uint64_t last_gpest;
+ uint64_t last_tm_mark; /* Last tm mark used */
+ uint64_t fillcw_cap; /* B/W cap on fill cw */
+ struct rack_pcm_info pcm_i;
+ struct rack_pcm_stats *pcm_s;
+ uint32_t gp_gain_req; /* Percent off gp gain req */
+ uint32_t last_rnd_of_gp_rise;
+ uint32_t gp_rnd_thresh;
+ uint32_t ss_hi_fs;
+ uint32_t gate_to_fs;
+ uint32_t policer_max_seg;
+ uint32_t pol_bw_comp;
+ uint16_t policer_rxt_threshold;
+ uint8_t policer_avg_threshold;
+ uint8_t policer_med_threshold;
+ uint32_t pcm_max_seg;
+ uint32_t last_pcm_round;
+ uint32_t pcm_idle_rounds;
+ uint32_t current_policer_bucket;
+ uint32_t policer_bucket_size;
+ uint32_t idle_snd_una;
+ uint32_t ack_for_idle;
+ uint32_t last_amount_before_rec;
uint32_t rc_gp_srtt; /* Current GP srtt */
uint32_t rc_prev_gp_srtt; /* Previous RTT */
uint32_t rc_entry_gp_rtt; /* Entry to PRTT gp-rtt */
uint32_t rc_loss_at_start; /* At measurement window where was our lost value */
+ uint32_t rc_considered_lost; /* Count in recovery of non-retransmitted bytes considered lost */
uint32_t dsack_round_end; /* In a round of seeing a DSACK */
uint32_t current_round; /* Starting at zero */
@@ -491,6 +552,8 @@
uint32_t rc_snd_max_at_rto; /* For non-sack when the RTO occurred what was snd-max */
uint32_t rc_out_at_rto;
int32_t rc_scw_index;
+ uint32_t max_reduction;
+ uint32_t side_chan_dis_mask; /* Bit mask of socket opt's disabled */
uint32_t rc_tlp_threshold; /* Socket option value Lock(a) */
uint32_t rc_last_timeout_snduna;
uint32_t last_tlp_acked_start;
@@ -503,7 +566,11 @@
uint32_t ack_during_sd;
uint32_t input_pkt;
uint32_t saved_input_pkt;
- uint32_t saved_rxt_clamp_val; /* The encoded value we used to setup clamping */
+ uint32_t saved_policer_val; /* The encoded value we used to setup policer detection */
+ uint32_t cleared_app_ack_seq;
+ uint32_t last_rcv_tstmp_for_rtt;
+ uint32_t last_time_of_arm_rcv;
+ uint32_t rto_ssthresh;
struct newreno rc_saved_beta; /*
* For newreno cc:
* rc_saved_cc are the values we have had
@@ -516,10 +583,13 @@
* we also set the flag (if ecn_beta is set) to make
* new_reno do less of a backoff for ecn (think abe).
*/
+ uint16_t rc_cnt_of_retran[RETRAN_CNT_SIZE];
uint16_t rc_early_recovery_segs; /* Socket option value Lock(a) */
uint16_t rc_reorder_shift; /* Socket option value Lock(a) */
+ uint8_t policer_del_mss; /* How many mss during recovery for policer detection */
uint8_t rack_per_upper_bound_ss;
uint8_t rack_per_upper_bound_ca;
+ uint8_t cleared_app_ack;
uint8_t dsack_persist;
uint8_t rc_no_push_at_mrtt; /* No push when we exceed max rtt */
uint8_t num_measurements; /* Number of measurements (up to 0xff, we freeze at 0xff) */
@@ -528,17 +598,19 @@
uint8_t rc_tlp_cwnd_reduce; /* Socket option value Lock(a) */
uint8_t rc_prr_sendalot;/* Socket option value Lock(a) */
uint8_t rc_rate_sample_method;
- uint8_t rc_dgp_bl_agg; /* Buffer Level aggression during DGP */
+ uint8_t policer_alt_median; /* Alternate median for policer detection */
uint8_t full_dgp_in_rec; /* Flag to say if we do full DGP in recovery */
uint8_t client_suggested_maxseg; /* Not sure what to do with this yet */
- uint8_t pacing_discount_amm; /*
- * This is a multipler to the base discount that
- * can be used to increase the discount.
- */
+ uint8_t use_gp_not_last;
+ uint8_t pacing_method; /* If pace_always, what type of pacing */
uint8_t already_had_a_excess;
};
#endif
+#define RACK_PACING_NONE 0x00
+#define RACK_DGP_PACING 0x01
+#define RACK_REG_PACING 0x02
+
/* DGP with no buffer level mitigations */
#define DGP_LEVEL0 0
@@ -578,6 +650,10 @@
#define HYBRID_LOG_EXTEND 14 /* We extended the end */
#define HYBRID_LOG_SENT_LOST 15 /* A closing sent/lost report */
+#define LOST_ZERO 1 /* Zero it out */
+#define LOST_ADD 2 /* Add to it */
+#define LOST_SUB 3 /* Sub from it */
+
#define RACK_TIMELY_CNT_BOOST 5 /* At 5th increase boost */
#define RACK_MINRTT_FILTER_TIM 10 /* Seconds */
@@ -590,6 +666,7 @@
*/
#define MAX_USER_SET_SEG 0x3f /* The max we can set is 63 which is probably too many */
+#define RACK_FREE_CNT_MAX 0x2f /* Max our counter can do */
#ifdef _KERNEL
@@ -601,8 +678,9 @@
int32_t, int32_t, uint32_t, int, int, uint8_t); /* Lock(a) */
struct tcpcb *rc_tp; /* The tcpcb Lock(a) */
struct inpcb *rc_inp; /* The inpcb Lock(a) */
- uint8_t rc_free_cnt; /* Number of free entries on the rc_free list
- * Lock(a) */
+ uint8_t rc_free_cnt : 6,
+ rc_skip_timely : 1,
+ pcm_enabled : 1; /* Is PCM enabled */
uint8_t client_bufferlvl : 3, /* Expected range [0,5]: 0=unset, 1=low/empty */
rack_deferred_inited : 1,
/* ******************************************************************** */
@@ -612,11 +690,11 @@
shape_rxt_to_pacing_min : 1,
/* ******************************************************************** */
rc_ack_required: 1,
- r_pacing_discount : 1;
+ r_use_hpts_min : 1;
uint8_t no_prr_addback : 1,
gp_ready : 1,
defer_options: 1,
- excess_rxt_on: 1, /* Are actions on for excess retransmissions? */
+ dis_lt_bw : 1,
rc_ack_can_sendout_data: 1, /*
* If set it will override pacing restrictions on not sending
* data when the pacing timer is running. I.e. you set this
@@ -659,7 +737,7 @@
r_rack_hw_rate_caps: 1,
r_up_only: 1,
r_via_fill_cw : 1,
- r_fill_less_agg : 1;
+ r_rcvpath_rtt_up : 1;
uint8_t rc_user_set_max_segs : 7, /* Socket option value Lock(a) */
rc_fillcw_apply_discount;
@@ -673,7 +751,7 @@
rc_highly_buffered: 1, /* The path is highly buffered */
rc_dragged_bottom: 1,
rc_pace_dnd : 1, /* The pace do not disturb bit */
- rc_avali2 : 1,
+ rc_initial_ss_comp : 1,
rc_gp_filled : 1,
rc_hw_nobuf : 1;
uint8_t r_state : 4, /* Current rack state Lock(a) */
@@ -696,8 +774,8 @@
uint8_t app_limited_needs_set : 1,
use_fixed_rate : 1,
rc_has_collapsed : 1,
- r_cwnd_was_clamped : 1,
- r_clamped_gets_lower : 1,
+ use_lesser_lt_bw : 1,
+ cspr_is_fcc : 1,
rack_hdrw_pacing : 1, /* We are doing Hardware pacing */
rack_hdw_pace_ena : 1, /* Is hardware pacing enabled? */
rack_attempt_hdwr_pace : 1; /* Did we attempt hdwr pacing (if allowed) */
@@ -722,7 +800,14 @@
r_persist_lt_bw_off : 1,
r_collapse_point_valid : 1,
dgp_on : 1;
- uint16_t rc_init_win : 8,
+ uint16_t rto_from_rec: 1,
+ avail_bit: 1,
+ pcm_in_progress: 1,
+ pcm_needed: 1,
+ policer_detect_on: 1, /* Are we detecting policers? */
+ rc_policer_detected : 1, /* We are being policed */
+ rc_policer_should_pace : 1, /* The sizing algo thinks we should pace */
+ rc_sendvars_notset : 1, /* Inside rack_init send variables (snd_max/una etc) were not set */
rc_gp_rtt_set : 1,
rc_gp_dyn_mul : 1,
rc_gp_saw_rec : 1,
@@ -735,5 +820,9 @@
struct rack_control r_ctl;
} __aligned(CACHE_LINE_SIZE);
+
+void rack_update_pcm_ack(struct tcp_rack *rack, int was_cumack,
+ uint32_t ss, uint32_t es);
+
#endif
#endif
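
Editor's note (illustrative, not part of the patch): rack_pcm_info carries the
send-side bounds of a PCM measurement while pcm_s points at an array of
rack_pcm_stats samples, with cnt tracking how many entries are filled and
cnt_alloc how many were allocated (RACK_DEFAULT_PCM_ARRAY by default). The
stack's real ack-side entry point is rack_update_pcm_ack(), declared above but
implemented elsewhere in the stack; the sketch below only illustrates how the
cnt/cnt_alloc pair is meant to bound the array, assuming it lives in a stack
source file with the usual kernel and tcp_rack.h includes. M_PCMSTATS and
record_pcm_sample() are made-up names.

static MALLOC_DEFINE(M_PCMSTATS, "pcmstats", "example PCM sample array");

static void
record_pcm_sample(struct rack_pcm_info *pi, struct rack_pcm_stats **arr,
    uint32_t sseq, uint32_t eseq, uint64_t ack_time)
{
	struct rack_pcm_stats *s;

	if (*arr == NULL) {
		/* First sample for this measurement: allocate the array. */
		*arr = malloc(sizeof(struct rack_pcm_stats) *
		    RACK_DEFAULT_PCM_ARRAY, M_PCMSTATS, M_NOWAIT | M_ZERO);
		if (*arr == NULL)
			return;
		pi->cnt_alloc = RACK_DEFAULT_PCM_ARRAY;
		pi->cnt = 0;
	}
	if (pi->cnt >= pi->cnt_alloc)
		return;		/* Array full; drop the sample. */
	s = &(*arr)[pi->cnt++];
	s->sseq = sseq;
	s->eseq = eseq;
	s->ack_time = ack_time;
}
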
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -287,18 +287,29 @@
static volatile uint32_t number_of_tcp_connections_pacing = 0;
static uint32_t shadow_num_connections = 0;
static counter_u64_t tcp_pacing_failures;
+static counter_u64_t tcp_dgp_failures;
+static uint32_t shadow_tcp_pacing_dgp = 0;
+static volatile uint32_t number_of_dgp_connections = 0;
static int tcp_pacing_limit = 10000;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pacing_limit, CTLFLAG_RW,
&tcp_pacing_limit, 1000,
"If the TCP stack does pacing, is there a limit (-1 = no, 0 = no pacing N = number of connections)");
+static int tcp_dgp_limit = -1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, dgp_limit, CTLFLAG_RW,
+ &tcp_dgp_limit, -1,
+ "If the TCP stack does DGP, is there a limit (-1 = no, 0 = no dgp N = number of connections)");
+
SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pacing_count, CTLFLAG_RD,
&shadow_num_connections, 0, "Number of TCP connections being paced");
SYSCTL_COUNTER_U64(_net_inet_tcp, OID_AUTO, pacing_failures, CTLFLAG_RD,
&tcp_pacing_failures, "Number of times we failed to enable pacing to avoid exceeding the limit");
+SYSCTL_COUNTER_U64(_net_inet_tcp, OID_AUTO, dgp_failures, CTLFLAG_RD,
+ &tcp_dgp_failures, "Number of times we failed to enable dgp to avoid exceeding the limit");
+
static int tcp_log_debug = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW,
&tcp_log_debug, 0, "Log errors caused by incoming TCP segments");
@@ -1571,6 +1582,7 @@
tcp_uncomp_total = counter_u64_alloc(M_WAITOK);
tcp_bad_csums = counter_u64_alloc(M_WAITOK);
tcp_pacing_failures = counter_u64_alloc(M_WAITOK);
+ tcp_dgp_failures = counter_u64_alloc(M_WAITOK);
#ifdef TCPPCAP
tcp_pcap_init();
#endif
@@ -4022,6 +4034,43 @@
}
}
+int
+tcp_incr_dgp_pacing_cnt(void)
+{
+ if ((tcp_dgp_limit == -1) ||
+ (tcp_dgp_limit > number_of_dgp_connections)) {
+ atomic_fetchadd_int(&number_of_dgp_connections, 1);
+ shadow_tcp_pacing_dgp = number_of_dgp_connections;
+ return (1);
+ } else {
+ counter_u64_add(tcp_dgp_failures, 1);
+ return (0);
+ }
+}
+
+static uint8_t tcp_dgp_warning = 0;
+
+void
+tcp_dec_dgp_pacing_cnt(void)
+{
+ uint32_t ret;
+
+ ret = atomic_fetchadd_int(&number_of_dgp_connections, -1);
+ shadow_tcp_pacing_dgp = number_of_dgp_connections;
+ KASSERT(ret != 0, ("number_of_dgp_connections -1 would cause wrap?"));
+ if (ret == 0) {
+ if (tcp_dgp_limit != -1) {
+ printf("Warning all DGP is now disabled, count decrements invalidly!\n");
+ tcp_dgp_limit = 0;
+ tcp_dgp_warning = 1;
+ } else if (tcp_dgp_warning == 0) {
+ printf("Warning DGP pacing is invalid, invalid decrement\n");
+ tcp_dgp_warning = 1;
+ }
+ }
+
+}
+
static uint8_t tcp_pacing_warning = 0;
void
@@ -4541,7 +4590,7 @@
if (tp->t_tcpreq_req) {
for(i = 0, allocated = 0; i < MAX_TCP_TRK_REQ; i++) {
fil = &tp->t_tcpreq_info[i];
- if (fil->flags != TCP_TRK_TRACK_FLG_USED)
+ if ((fil->flags & TCP_TRK_TRACK_FLG_USED) == 0)
continue;
if ((fil->timestamp == req->timestamp) &&
(fil->start == req->start) &&
@@ -4573,6 +4622,7 @@
allocated = 1;
fil->flags = TCP_TRK_TRACK_FLG_USED;
fil->timestamp = req->timestamp;
+ fil->playout_ms = req->playout_ms;
fil->localtime = ts;
fil->start = req->start;
if (req->flags & TCP_LOG_HTTPD_RANGE_END) {
@@ -4589,7 +4639,10 @@
fil->sbcc_at_s = tptosocket(tp)->so_snd.sb_ccc;
fil->start_seq = tp->snd_una +
tptosocket(tp)->so_snd.sb_ccc;
- fil->end_seq = (fil->start_seq + ((uint32_t)(fil->end - fil->start)));
+ if (req->flags & TCP_LOG_HTTPD_RANGE_END)
+ fil->end_seq = (fil->start_seq + ((uint32_t)(fil->end - fil->start)));
+ else
+ fil->end_seq = 0;
if (tptosocket(tp)->so_snd.sb_tls_info) {
/*
* This session is doing TLS. Take a swag guess
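
Editor's note (illustrative, not part of the patch): tcp_incr_dgp_pacing_cnt()
and tcp_dec_dgp_pacing_cnt() let a stack count DGP users against the new
net.inet.tcp.dgp_limit sysctl, mirroring the existing pacing counter. A
hypothetical caller inside a stack (function names made up; dgp_on is the
per-connection flag from tcp_rack.h above) would bracket enabling and disabling
DGP like this:

static int
example_enable_dgp(struct tcp_rack *rack)
{
	if (tcp_incr_dgp_pacing_cnt() == 0) {
		/* Over tcp_dgp_limit; stay on regular pacing instead. */
		return (ENOBUFS);
	}
	rack->dgp_on = 1;
	return (0);
}

static void
example_disable_dgp(struct tcp_rack *rack)
{
	if (rack->dgp_on) {
		rack->dgp_on = 0;
		tcp_dec_dgp_pacing_cnt();
	}
}
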
diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c
--- a/sys/netinet/tcp_syncache.c
+++ b/sys/netinet/tcp_syncache.c
@@ -1032,7 +1032,10 @@
if (!solisten_enqueue(so, SS_ISCONNECTED))
tp->t_flags |= TF_SONOTCONN;
-
+ /* Can we inherit anything from the listener? */
+ if (tp->t_fb->tfb_inherit != NULL) {
+ (*tp->t_fb->tfb_inherit)(tp, sotoinpcb(lso));
+ }
return (so);
allocfail:
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -179,6 +179,12 @@
goto out;
}
tp->t_state = TCPS_CLOSED;
+ /* Can we inherit anything from the listener? */
+ if ((so->so_listen != NULL) &&
+ (so->so_listen->so_pcb != NULL) &&
+ (tp->t_fb->tfb_inherit != NULL)) {
+ (*tp->t_fb->tfb_inherit)(tp, sotoinpcb(so->so_listen));
+ }
tcp_bblog_pru(tp, PRU_ATTACH, error);
INP_WUNLOCK(inp);
TCPSTATES_INC(TCPS_CLOSED);
@@ -1601,6 +1607,7 @@
ti->tcpi_rcv_numsacks = tp->rcv_numsacks;
ti->tcpi_rcv_adv = tp->rcv_adv;
ti->tcpi_dupacks = tp->t_dupacks;
+ ti->tcpi_rttmin = tp->t_rttlow;
#ifdef TCP_OFFLOAD
if (tp->t_flags & TF_TOE) {
ti->tcpi_options |= TCPI_OPT_TOE;
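
Editor's note (illustrative, not part of the patch): the tcpi_rttmin assignment
above exports tp->t_rttlow through the existing TCP_INFO socket option. A
minimal user-space reader, assuming the patched netinet/tcp.h (the value is
printed raw, in whatever units the stack keeps t_rttlow in; print_min_rtt() is
a made-up name):

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>

static void
print_min_rtt(int fd)
{
	struct tcp_info ti;
	socklen_t len = sizeof(ti);

	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0)
		printf("minimum observed RTT: %u\n", ti.tcpi_rttmin);
}
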
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -138,7 +138,8 @@
#define TCP_TRK_TRACK_FLG_OPEN 0x02 /* End is not valid (open range request) */
#define TCP_TRK_TRACK_FLG_SEQV 0x04 /* We had a sendfile that touched it */
#define TCP_TRK_TRACK_FLG_COMP 0x08 /* Sendfile as placed the last bits (range req only) */
-#define TCP_TRK_TRACK_FLG_FSND 0x10 /* First send has been done into the seq space */
+#define TCP_TRK_TRACK_FLG_FSND 0x10 /* First send has been done into the seq space */
+#define TCP_TRK_TRACK_FLG_LSND 0x20 /* We were able to set the Last Sent */
#define MAX_TCP_TRK_REQ 5 /* Max we will have at once */
struct tcp_sendfile_track {
@@ -151,11 +152,14 @@
uint64_t cspr; /* Client suggested pace rate */
uint64_t sent_at_fs; /* What was t_sndbytes as we begun sending */
uint64_t rxt_at_fs; /* What was t_snd_rxt_bytes as we begun sending */
+ uint64_t sent_at_ls; /* Sent value at the last send */
+ uint64_t rxt_at_ls; /* Retransmit value at the last send */
tcp_seq start_seq; /* First TCP Seq assigned */
tcp_seq end_seq; /* If range req last seq */
uint32_t flags; /* Type of request open etc */
uint32_t sbcc_at_s; /* When we allocate what is the sb_cc */
uint32_t hint_maxseg; /* Client hinted maxseg */
+ uint32_t playout_ms; /* Client playout ms */
uint32_t hybrid_flags; /* Hybrid flags on this request */
};
@@ -623,6 +627,8 @@
void (*tfb_switch_failed)(struct tcpcb *);
bool (*tfb_early_wake_check)(struct tcpcb *);
int (*tfb_compute_pipe)(struct tcpcb *tp);
+ int (*tfb_stack_info)(struct tcpcb *tp, struct stack_specific_info *);
+ void (*tfb_inherit)(struct tcpcb *tp, struct inpcb *h_inp);
volatile uint32_t tfb_refcnt;
uint32_t tfb_flags;
uint8_t tfb_id;
@@ -788,7 +794,7 @@
#define TF_TSO 0x01000000 /* TSO enabled on this connection */
#define TF_TOE 0x02000000 /* this connection is offloaded */
#define TF_CLOSED 0x04000000 /* close(2) called on socket */
-#define TF_UNUSED1 0x08000000 /* unused */
+#define TF_SENTSYN 0x08000000 /* At least one syn has been sent */
#define TF_LRD 0x10000000 /* Lost Retransmission Detection */
#define TF_CONGRECOVERY 0x20000000 /* congestion recovery mode */
#define TF_WASCRECOVERY 0x40000000 /* was in congestion recovery */
@@ -1501,6 +1507,8 @@
int tcp_stats_sample_rollthedice(struct tcpcb *tp, void *seed_bytes,
size_t seed_len);
int tcp_can_enable_pacing(void);
+int tcp_incr_dgp_pacing_cnt(void);
+void tcp_dec_dgp_pacing_cnt(void);
void tcp_decrement_paced_conn(void);
void tcp_change_time_units(struct tcpcb *, int);
void tcp_handle_orphaned_packets(struct tcpcb *);
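
Editor's note (illustrative, not part of the patch): tfb_inherit gives a stack a
hook, invoked from the tcp_syncache.c and tcp_usrreq.c call sites earlier in
this diff, to copy listener settings onto a newly attached connection. The
handler below is purely hypothetical: example_inherit is a made-up name, the
t_maxseg copy is only a placeholder for whatever per-listener state a real
stack would carry over, and the stack would point .tfb_inherit at it in its
tcp_function_block.

static void
example_inherit(struct tcpcb *tp, struct inpcb *h_inp)
{
	struct tcpcb *ltp;

	/* h_inp is the listener's inpcb, as passed by the callers above. */
	ltp = intotcpcb(h_inp);
	/* Copy whatever per-listener defaults the stack supports. */
	tp->t_maxseg = ltp->t_maxseg;	/* placeholder only */
}
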
