Page MenuHomeFreeBSD

D45410.diff
No OneTemporary

D45410.diff

diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h
--- a/sys/netinet/tcp.h
+++ b/sys/netinet/tcp.h
@@ -337,8 +337,7 @@
#define TCP_RACK_PACING_DIVISOR 1146 /* Pacing divisor given to rate-limit code for burst sizing */
#define TCP_RACK_PACE_MIN_SEG 1147 /* Pacing min seg size rack will use */
#define TCP_RACK_DGP_IN_REC 1148 /* Do we use full DGP in recovery? */
-#define TCP_POLICER_DETECT 1149 /* Do we apply a thresholds to rack to detect and compensate for policers? */
-#define TCP_RXT_CLAMP TCP_POLICER_DETECT
+/* #define TCP_POLICER_DETECT 1149 not used */
#define TCP_HYBRID_PACING 1150 /* Hybrid pacing enablement */
#define TCP_PACING_DND 1151 /* When pacing with rr_config=3 can sacks disturb us */
#define TCP_SS_EEXIT 1152 /* Do we do early exit from slowtart if no b/w growth */
@@ -348,7 +347,7 @@
#define TCP_REC_IS_DYN 1156 /* Do we allow timely to change recovery multiplier? */
#define TCP_SIDECHAN_DIS 1157 /* Disable/enable the side-channel */
#define TCP_FILLCW_RATE_CAP 1158 /* Set a cap for DGP's fillcw */
-#define TCP_POLICER_MSS 1159 /* Policer MSS requirement */
+/* #define TCP_POLICER_MSS 1159 not used */
#define TCP_STACK_SPEC_INFO 1160 /* Get stack specific information (if present) */
#define RACK_CSPR_IS_FCC 1161
#define TCP_GP_USE_LTBW 1162 /* how we use lt_bw 0=not, 1=min, 2=max */
diff --git a/sys/netinet/tcp_log_buf.h b/sys/netinet/tcp_log_buf.h
--- a/sys/netinet/tcp_log_buf.h
+++ b/sys/netinet/tcp_log_buf.h
@@ -201,14 +201,14 @@
TCP_LOG_OUT, /* Transmit (without other event) 2 */
TCP_LOG_RTO, /* Retransmit timeout 3 */
TCP_LOG_SB_WAKE, /* Awaken socket buffer 4 */
- TCP_LOG_BAD_RETRAN, /* Detected bad retransmission 5 */
+	TCP_UNUSED_5,	/* old TCP_LOG_BAD_RETRAN (bad retransmission), not used 5 */
TCP_LOG_PRR, /* Doing PRR 6 */
- TCP_LOG_REORDER, /* Detected reorder 7 */
+	TCP_UNUSED_7,	/* old TCP_LOG_REORDER (detected reorder), not used 7 */
TCP_LOG_HPTS, /* Hpts sending a packet 8 */
BBR_LOG_BBRUPD, /* We updated BBR info 9 */
BBR_LOG_BBRSND, /* We did a slot calculation and sending is done 10 */
BBR_LOG_ACKCLEAR, /* A ack clears all outstanding 11 */
- BBR_LOG_INQUEUE, /* The tcb had a packet input to it 12 */
+	TCP_UNUSED_12,	/* old BBR_LOG_INQUEUE (tcb had a packet input), not used 12 */
BBR_LOG_TIMERSTAR, /* Start a timer 13 */
BBR_LOG_TIMERCANC, /* Cancel a timer 14 */
BBR_LOG_ENTREC, /* Entered recovery 15 */
@@ -245,7 +245,7 @@
BBR_LOG_REDUCE, /* old bbr log reduce for 4.1 and earlier 46*/
TCP_LOG_RTT, /* A rtt (in useconds) is being sampled and applied to the srtt algo 47 */
BBR_LOG_SETTINGS_CHG, /* Settings changed for loss response 48 */
- BBR_LOG_SRTT_GAIN_EVENT, /* SRTT gaining -- now not used 49 */
+ TCP_UNUSED_49, /* SRTT gaining -- now not used 49 */
TCP_LOG_REASS, /* Reassembly buffer logging 50 */
TCP_HDWR_PACE_SIZE, /* TCP pacing size set (rl and rack uses this) 51 */
BBR_LOG_HDWR_PACE, /* TCP Hardware pacing log 52 */
@@ -253,9 +253,9 @@
TCP_LOG_CONNEND, /* End of connection 54 */
TCP_LOG_LRO, /* LRO entry 55 */
TCP_SACK_FILTER_RES, /* Results of SACK Filter 56 */
- TCP_SAD_DETECT, /* Sack Attack Detection 57 */
+	TCP_UNUSED_57,	/* old TCP_SAD_DETECT (Sack Attack Detection), not used 57 */
TCP_TIMELY_WORK, /* Logs regarding Timely CC tweaks 58 */
- TCP_LOG_USER_EVENT, /* User space event data 59 */
+	TCP_UNUSED_59,	/* old TCP_LOG_USER_EVENT (user space event data), not used 59 */
TCP_LOG_SENDFILE, /* sendfile() logging for TCP connections 60 */
TCP_LOG_REQ_T, /* logging of request tracking 61 */
TCP_LOG_ACCOUNTING, /* Log of TCP Accounting data 62 */
@@ -267,7 +267,7 @@
TCP_RACK_TP_TRIGGERED, /* A rack tracepoint is triggered 68 */
TCP_HYBRID_PACING_LOG, /* Hybrid pacing log 69 */
TCP_LOG_PRU, /* TCP protocol user request 70 */
- TCP_POLICER_DET, /* TCP Policer detectionn 71 */
+	TCP_UNUSED_71,	/* old TCP Policer detection, not used 71 */
TCP_PCM_MEASURE, /* TCP Path Capacity Measurement 72 */
TCP_LOG_END /* End (keep at end) 73 */
};
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -193,17 +193,9 @@
static int32_t rack_reorder_thresh = 2;
static int32_t rack_reorder_fade = 60000000; /* 0 - never fade, def 60,000,000
* - 60 seconds */
-static uint16_t rack_policer_rxt_thresh= 0; /* 499 = 49.9%, 0 is off */
-static uint8_t rack_policer_avg_thresh = 0; /* 3.2 */
-static uint8_t rack_policer_med_thresh = 0; /* 1 - 16 */
-static uint16_t rack_policer_bucket_reserve = 20; /* How much % is reserved in the bucket */
-static uint64_t rack_pol_min_bw = 125000; /* 1mbps in Bytes per sec */
-static uint32_t rack_policer_data_thresh = 64000; /* 64,000 bytes must be sent before we engage */
-static uint32_t rack_policing_do_bw_comp = 1;
static uint32_t rack_pcm_every_n_rounds = 100;
static uint32_t rack_pcm_blast = 0;
static uint32_t rack_pcm_is_enabled = 1;
-static uint8_t rack_req_del_mss = 18; /* How many segments need to be sent in a recovery episode to do policer_detection */
static uint8_t rack_ssthresh_rest_rto_rec = 0; /* Do we restore ssthresh when we have rec -> rto -> rec */
static uint32_t rack_gp_gain_req = 1200; /* Amount percent wise required to gain to record a round has "gaining" */
@@ -392,7 +384,6 @@
counter_u64_t rack_tlp_retran_bytes;
counter_u64_t rack_to_tot;
counter_u64_t rack_hot_alloc;
-counter_u64_t tcp_policer_detected;
counter_u64_t rack_to_alloc;
counter_u64_t rack_to_alloc_hard;
counter_u64_t rack_to_alloc_emerg;
@@ -558,9 +549,6 @@
struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
-static void
-rack_peg_rxt(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t segsiz);
-
static int
rack_do_closing(struct mbuf *m, struct tcphdr *th,
struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
@@ -898,7 +886,6 @@
struct sysctl_oid *rack_measure;
struct sysctl_oid *rack_probertt;
struct sysctl_oid *rack_hw_pacing;
- struct sysctl_oid *rack_policing;
rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
@@ -1551,53 +1538,6 @@
OID_AUTO, "hystartplusplus", CTLFLAG_RW,
&rack_do_hystart, 0,
"Should RACK enable HyStart++ on connections?");
- /* Policer detection */
- rack_policing = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
- OID_AUTO,
- "policing",
- CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
- "policer detection");
- SYSCTL_ADD_U16(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_policing),
- OID_AUTO, "rxt_thresh", CTLFLAG_RW,
- &rack_policer_rxt_thresh, 0,
- "Percentage of retransmits we need to be a possible policer (499 = 49.9 percent)");
- SYSCTL_ADD_U8(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_policing),
- OID_AUTO, "avg_thresh", CTLFLAG_RW,
- &rack_policer_avg_thresh, 0,
- "What threshold of average retransmits needed to recover a lost packet (1 - 169 aka 21 = 2.1)?");
- SYSCTL_ADD_U8(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_policing),
- OID_AUTO, "med_thresh", CTLFLAG_RW,
- &rack_policer_med_thresh, 0,
- "What threshold of Median retransmits needed to recover a lost packet (1 - 16)?");
- SYSCTL_ADD_U32(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_policing),
- OID_AUTO, "data_thresh", CTLFLAG_RW,
- &rack_policer_data_thresh, 64000,
- "How many bytes must have gotten through before we can start doing policer detection?");
- SYSCTL_ADD_U32(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_policing),
- OID_AUTO, "bwcomp", CTLFLAG_RW,
- &rack_policing_do_bw_comp, 1,
- "Do we raise up low b/w so that at least pace_max_seg can be sent in the srtt?");
- SYSCTL_ADD_U8(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_policing),
- OID_AUTO, "recmss", CTLFLAG_RW,
- &rack_req_del_mss, 18,
- "How many MSS must be delivered during recovery to engage policer detection?");
- SYSCTL_ADD_U16(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_policing),
- OID_AUTO, "res_div", CTLFLAG_RW,
- &rack_policer_bucket_reserve, 20,
- "What percentage is reserved in the policer bucket?");
- SYSCTL_ADD_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_policing),
- OID_AUTO, "min_comp_bw", CTLFLAG_RW,
- &rack_pol_min_bw, 125000,
- "Do we have a min b/w for b/w compensation (0 = no)?");
/* Misc rack controls */
rack_misc = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
@@ -1880,13 +1820,6 @@
OID_AUTO, "alloc_hot", CTLFLAG_RD,
&rack_hot_alloc,
"Total allocations from the top of our list");
- tcp_policer_detected = counter_u64_alloc(M_WAITOK);
- SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_counters),
- OID_AUTO, "policer_detected", CTLFLAG_RD,
- &tcp_policer_detected,
- "Total policer_detections");
-
rack_to_alloc = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_counters),
@@ -3429,7 +3362,6 @@
counter_u64_free(rack_saw_enobuf_hw);
counter_u64_free(rack_saw_enetunreach);
counter_u64_free(rack_hot_alloc);
- counter_u64_free(tcp_policer_detected);
counter_u64_free(rack_to_alloc);
counter_u64_free(rack_to_alloc_hard);
counter_u64_free(rack_to_alloc_emerg);
@@ -5702,459 +5634,12 @@
rack->r_wanted_output = 1;
}
-static inline uint64_t
-rack_get_rxt_per(uint64_t snds, uint64_t rxts)
-{
- uint64_t rxt_per;
-
- if (snds > 0) {
- rxt_per = rxts * 1000;
- rxt_per /= snds;
- } else {
- /* This is an unlikely path */
- if (rxts) {
- /* Its the max it was all re-transmits */
- rxt_per = 0xffffffffffffffff;
- } else {
- rxt_per = 0;
- }
- }
- return (rxt_per);
-}
-
-static void
-policer_detection_log(struct tcp_rack *rack, uint32_t flex1, uint32_t flex2, uint32_t flex3, uint32_t flex4, uint8_t flex8)
-{
- if (tcp_bblogging_on(rack->rc_tp)) {
- union tcp_log_stackspecific log;
- struct timeval tv;
-
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
- log.u_bbr.timeStamp = tcp_get_usecs(&tv);
- log.u_bbr.flex1 = flex1;
- log.u_bbr.flex2 = flex2;
- log.u_bbr.flex3 = flex3;
- log.u_bbr.flex4 = flex4;
- log.u_bbr.flex5 = rack->r_ctl.current_policer_bucket;
- log.u_bbr.flex6 = rack->r_ctl.policer_bucket_size;
- log.u_bbr.flex7 = 0;
- log.u_bbr.flex8 = flex8;
- log.u_bbr.bw_inuse = rack->r_ctl.policer_bw;
- log.u_bbr.applimited = rack->r_ctl.current_round;
- log.u_bbr.epoch = rack->r_ctl.policer_max_seg;
- log.u_bbr.delivered = (uint32_t)rack->r_ctl.bytes_acked_in_recovery;
- log.u_bbr.cur_del_rate = rack->rc_tp->t_sndbytes;
- log.u_bbr.delRate = rack->rc_tp->t_snd_rxt_bytes;
- log.u_bbr.rttProp = rack->r_ctl.gp_bw;
- log.u_bbr.bbr_state = rack->rc_policer_detected;
- log.u_bbr.bbr_substate = 0;
- log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
- log.u_bbr.use_lt_bw = rack->policer_detect_on;
- log.u_bbr.lt_epoch = 0;
- log.u_bbr.pkts_out = 0;
- tcp_log_event(rack->rc_tp, NULL, NULL, NULL, TCP_POLICER_DET, 0,
- 0, &log, false, NULL, NULL, 0, &tv);
- }
-
-}
-
-static void
-policer_detection(struct tcpcb *tp, struct tcp_rack *rack, int post_recovery)
-{
- /*
- * Rack excess rxt accounting is turned on. If we
- * are above a threshold of rxt's in at least N
- * rounds, then back off the cwnd and ssthresh
- * to fit into the long-term b/w.
- */
-
- uint32_t pkts, mid, med, alt_med, avg, segsiz, tot_retran_pkt_count = 0;
- uint32_t cnt_of_mape_rxt = 0;
- uint64_t snds, rxts, rxt_per, tim, del, del_bw;
- int i;
- struct timeval tv;
-
-
- /*
- * First is there enough packets delivered during recovery to make
- * a determiniation of b/w?
- */
- segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
- if ((rack->rc_policer_detected == 0) &&
- (rack->r_ctl.policer_del_mss > 0) &&
- ((uint32_t)rack->r_ctl.policer_del_mss > ((rack->r_ctl.bytes_acked_in_recovery + segsiz - 1)/segsiz))) {
- /*
- * Not enough data sent in recovery for initial detection. Once
- * we have deteced a policer we allow less than the threshold (polcer_del_mss)
- * amount of data in a recovery to let us fall through and double check
- * our policer settings and possibly expand or collapse the bucket size and
- * the polcier b/w.
- *
- * Once you are declared to be policed. this block of code cannot be
- * reached, instead blocks further down will re-check the policer detection
- * triggers and possibly reset the measurements if somehow we have let the
- * policer bucket size grow too large.
- */
- if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
- policer_detection_log(rack, rack->r_ctl.policer_del_mss,
- ((rack->r_ctl.bytes_acked_in_recovery + segsiz - 1)/segsiz),
- rack->r_ctl.bytes_acked_in_recovery, segsiz, 18);
- }
- return;
- }
- tcp_get_usecs(&tv);
- tim = tcp_tv_to_lusectick(&tv) - rack->r_ctl.time_entered_recovery;
- del = rack->r_ctl.bytes_acked_in_recovery;
- if (tim > 0)
- del_bw = (del * (uint64_t)1000000) / tim;
- else
- del_bw = 0;
- /* B/W compensation? */
-
- if (rack->r_ctl.pol_bw_comp && ((rack->r_ctl.policer_bw > 0) ||
- (del_bw > 0))) {
- /*
- * Sanity check now that the data is in. How long does it
- * take for us to pace out two of our policer_max_seg's?
- *
- * If it is longer than the RTT then we are set
- * too slow, maybe because of not enough data
- * sent during recovery.
- */
- uint64_t lentime, res, srtt, max_delbw, alt_bw;
-
- srtt = (uint64_t)rack_grab_rtt(tp, rack);
- if ((tp->t_srtt > 0) && (srtt > tp->t_srtt))
- srtt = tp->t_srtt;
- lentime = rack->r_ctl.policer_max_seg * (uint64_t)HPTS_USEC_IN_SEC * 2;
- if (del_bw > rack->r_ctl.policer_bw) {
- max_delbw = del_bw;
- } else {
- max_delbw = rack->r_ctl.policer_bw;
- }
- res = lentime / max_delbw;
- if ((srtt > 0) && (res > srtt)) {
- /*
- * At this rate we can not get two policer_maxsegs
- * out before the ack arrives back.
- *
- * Lets at least get it raised up so that
- * we can be a bit faster than that if possible.
- */
- lentime = (rack->r_ctl.policer_max_seg * 2);
- tim = srtt;
- alt_bw = (lentime * (uint64_t)HPTS_USEC_IN_SEC) / tim;
- if (alt_bw > max_delbw) {
- uint64_t cap_alt_bw;
-
- cap_alt_bw = (max_delbw + (max_delbw * rack->r_ctl.pol_bw_comp));
- if ((rack_pol_min_bw > 0) && (cap_alt_bw < rack_pol_min_bw)) {
- /* We place a min on the cap which defaults to 1Mbps */
- cap_alt_bw = rack_pol_min_bw;
- }
- if (alt_bw <= cap_alt_bw) {
- /* It should be */
- del_bw = alt_bw;
- policer_detection_log(rack,
- (uint32_t)tim,
- rack->r_ctl.policer_max_seg,
- 0,
- 0,
- 16);
- } else {
- /*
- * This is an odd case where likely the RTT is very very
- * low. And yet it is still being policed. We don't want
- * to get more than (rack_policing_do_bw_comp+1) x del-rate
- * where del-rate is what we got in recovery for either the
- * first Policer Detection(PD) or this PD we are on now.
- */
- del_bw = cap_alt_bw;
- policer_detection_log(rack,
- (uint32_t)tim,
- rack->r_ctl.policer_max_seg,
- (uint32_t)max_delbw,
- (rack->r_ctl.pol_bw_comp + 1),
- 16);
- }
- }
- }
- }
- snds = tp->t_sndbytes - rack->r_ctl.last_policer_sndbytes;
- rxts = tp->t_snd_rxt_bytes - rack->r_ctl.last_policer_snd_rxt_bytes;
- rxt_per = rack_get_rxt_per(snds, rxts);
- /* Figure up the average and median */
- for(i = 0; i < RETRAN_CNT_SIZE; i++) {
- if (rack->r_ctl.rc_cnt_of_retran[i] > 0) {
- tot_retran_pkt_count += (i + 1) * rack->r_ctl.rc_cnt_of_retran[i];
- cnt_of_mape_rxt += rack->r_ctl.rc_cnt_of_retran[i];
- }
- }
- if (cnt_of_mape_rxt)
- avg = (tot_retran_pkt_count * 10)/cnt_of_mape_rxt;
- else
- avg = 0;
- alt_med = med = 0;
- mid = tot_retran_pkt_count/2;
- for(i = 0; i < RETRAN_CNT_SIZE; i++) {
- pkts = (i + 1) * rack->r_ctl.rc_cnt_of_retran[i];
- if (mid > pkts) {
- mid -= pkts;
- continue;
- }
- med = (i + 1);
- break;
- }
- mid = cnt_of_mape_rxt / 2;
- for(i = 0; i < RETRAN_CNT_SIZE; i++) {
- if (mid > rack->r_ctl.rc_cnt_of_retran[i]) {
- mid -= rack->r_ctl.rc_cnt_of_retran[i];
- continue;
- }
- alt_med = (i + 1);
- break;
- }
- if (rack->r_ctl.policer_alt_median) {
- /* Swap the medians */
- uint32_t swap;
-
- swap = med;
- med = alt_med;
- alt_med = swap;
- }
- if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
- union tcp_log_stackspecific log;
- struct timeval tv;
-
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
- log.u_bbr.timeStamp = tcp_get_usecs(&tv);
- log.u_bbr.flex1 = avg;
- log.u_bbr.flex2 = med;
- log.u_bbr.flex3 = (uint32_t)rxt_per;
- log.u_bbr.flex4 = rack->r_ctl.policer_avg_threshold;
- log.u_bbr.flex5 = rack->r_ctl.policer_med_threshold;
- log.u_bbr.flex6 = rack->r_ctl.policer_rxt_threshold;
- log.u_bbr.flex7 = rack->r_ctl.policer_alt_median;
- log.u_bbr.flex8 = 1;
- log.u_bbr.delivered = rack->r_ctl.policer_bucket_size;
- log.u_bbr.applimited = rack->r_ctl.current_round;
- log.u_bbr.epoch = rack->r_ctl.policer_max_seg;
- log.u_bbr.bw_inuse = del_bw;
- log.u_bbr.cur_del_rate = rxts;
- log.u_bbr.delRate = snds;
- log.u_bbr.rttProp = rack->r_ctl.gp_bw;
- log.u_bbr.bbr_state = rack->rc_policer_detected;
- log.u_bbr.bbr_substate = 0;
- log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
- log.u_bbr.use_lt_bw = rack->policer_detect_on;
- log.u_bbr.lt_epoch = (uint32_t)tim;
- log.u_bbr.pkts_out = rack->r_ctl.bytes_acked_in_recovery;
- tcp_log_event(tp, NULL, NULL, NULL, TCP_POLICER_DET, 0,
- 0, &log, false, NULL, NULL, 0, &tv);
- }
- if (med == RETRAN_CNT_SIZE) {
- /*
- * If the median is the maximum, then what we
- * likely have here is a network breakage. Either that
- * or we are so unlucky that all of our traffic is being
- * dropped and having to be retransmitted the maximum times
- * and this just is not how a policer works.
- *
- * If it is truely a policer eventually we will come
- * through and it won't be the maximum.
- */
- return;
- }
- /* Has enough rounds progressed for us to re-measure? */
- if ((rxt_per >= (uint64_t)rack->r_ctl.policer_rxt_threshold) &&
- (avg >= rack->r_ctl.policer_avg_threshold) &&
- (med >= rack->r_ctl.policer_med_threshold)) {
- /*
- * We hit all thresholds that indicate we are
- * being policed. Now we may be doing this from a rack timeout
- * which then means the rest of recovery will hopefully go
- * smoother as we pace. At the end of recovery we will
- * fall back in here and reset the values using the
- * results of the entire recovery episode (we could also
- * hit this as we exit recovery as well which means only
- * one time in here).
- *
- * This is done explicitly that if we hit the thresholds
- * again in a second recovery we overwrite the values. We do
- * that because over time, as we pace the policer_bucket_size may
- * continue to grow. This then provides more and more times when
- * we are not pacing to the policer rate. This lets us compensate
- * for when we hit a false positive and those flows continue to
- * increase. However if its a real policer we will then get over its
- * limit, over time, again and thus end up back here hitting the
- * thresholds again.
- *
- * The alternative to this is to instead whenever we pace due to
- * policing in rack_policed_sending we could add the amount len paced to the
- * idle_snd_una value (which decreases the amount in last_amount_before_rec
- * since that is always [th_ack - idle_snd_una]). This would then prevent
- * the polcier_bucket_size from growing in additional recovery episodes
- * Which would then mean false postives would be pretty much stuck
- * after things got back to normal (assuming that what caused the
- * false positive was a small network outage).
- *
- */
- tcp_trace_point(rack->rc_tp, TCP_TP_POLICER_DET);
- if (rack->rc_policer_detected == 0) {
- /*
- * Increment the stat that tells us we identified
- * a policer only once. Note that if we ever allow
- * the flag to be cleared (reverted) then we need
- * to adjust this to not do multi-counting.
- */
- counter_u64_add(tcp_policer_detected, 1);
- }
- rack->r_ctl.last_policer_sndbytes = tp->t_sndbytes;
- rack->r_ctl.last_policer_snd_rxt_bytes = tp->t_snd_rxt_bytes;
- rack->r_ctl.policer_bw = del_bw;
- rack->r_ctl.policer_max_seg = tcp_get_pacing_burst_size_w_divisor(rack->rc_tp,
- rack->r_ctl.policer_bw,
- min(ctf_fixed_maxseg(rack->rc_tp),
- rack->r_ctl.rc_pace_min_segs),
- 0, NULL,
- NULL, rack->r_ctl.pace_len_divisor);
- /* Now what about the policer bucket size */
- rack->r_ctl.policer_bucket_size = rack->r_ctl.last_amount_before_rec;
- if (rack->r_ctl.policer_bucket_size < rack->r_ctl.policer_max_seg) {
- /* We must be able to send our max-seg or else chaos ensues */
- rack->r_ctl.policer_bucket_size = rack->r_ctl.policer_max_seg * 2;
- }
- if (rack->rc_policer_detected == 0)
- rack->r_ctl.current_policer_bucket = 0;
- if (tcp_bblogging_on(rack->rc_tp)) {
- union tcp_log_stackspecific log;
- struct timeval tv;
-
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
- log.u_bbr.timeStamp = tcp_get_usecs(&tv);
- log.u_bbr.flex1 = avg;
- log.u_bbr.flex2 = med;
- log.u_bbr.flex3 = rxt_per;
- log.u_bbr.flex4 = rack->r_ctl.policer_avg_threshold;
- log.u_bbr.flex5 = rack->r_ctl.policer_med_threshold;
- log.u_bbr.flex6 = rack->r_ctl.policer_rxt_threshold;
- log.u_bbr.flex7 = rack->r_ctl.policer_alt_median;
- log.u_bbr.flex8 = 2;
- log.u_bbr.applimited = rack->r_ctl.current_round;
- log.u_bbr.bw_inuse = del_bw;
- log.u_bbr.delivered = rack->r_ctl.policer_bucket_size;
- log.u_bbr.cur_del_rate = rxts;
- log.u_bbr.delRate = snds;
- log.u_bbr.rttProp = rack->r_ctl.gp_bw;
- log.u_bbr.bbr_state = rack->rc_policer_detected;
- log.u_bbr.bbr_substate = 0;
- log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
- log.u_bbr.use_lt_bw = rack->policer_detect_on;
- log.u_bbr.epoch = rack->r_ctl.policer_max_seg;
- log.u_bbr.lt_epoch = (uint32_t)tim;
- log.u_bbr.pkts_out = rack->r_ctl.bytes_acked_in_recovery;
- tcp_log_event(tp, NULL, NULL, NULL, TCP_POLICER_DET, 0,
- 0, &log, false, NULL, NULL, 0, &tv);
- /*
- * Put out an added log, 19, for the sole purpose
- * of getting the txt/rxt so that we can benchmark
- * in read-bbrlog the ongoing rxt rate after our
- * policer invocation in the HYSTART announcments.
- */
- memset(&log.u_bbr, 0, sizeof(log.u_bbr));
- log.u_bbr.timeStamp = tcp_tv_to_usectick(&tv);
- log.u_bbr.flex1 = alt_med;
- log.u_bbr.flex8 = 19;
- log.u_bbr.cur_del_rate = tp->t_sndbytes;
- log.u_bbr.delRate = tp->t_snd_rxt_bytes;
- tcp_log_event(tp, NULL, NULL, NULL, TCP_POLICER_DET, 0,
- 0, &log, false, NULL, NULL, 0, &tv);
- }
- /* Turn off any fast output, thats ended */
- rack->r_fast_output = 0;
- /* Mark the time for credits */
- rack->r_ctl.last_sendtime = tcp_get_u64_usecs(NULL);
- if (rack->r_rr_config < 2) {
- /*
- * We need to be stricter on the RR config so
- * the pacing has priority.
- */
- rack->r_rr_config = 2;
- }
- policer_detection_log(rack,
- rack->r_ctl.idle_snd_una,
- rack->r_ctl.ack_for_idle,
- 0,
- (uint32_t)tim,
- 14);
- rack->rc_policer_detected = 1;
- } else if ((rack->rc_policer_detected == 1) &&
- (post_recovery == 1)) {
- /*
- * If we are exiting recovery and have already detected
- * we need to possibly update the values.
- *
- * First: Update the idle -> recovery sent value.
- */
- uint32_t srtt;
-
- if (rack->r_ctl.last_amount_before_rec > rack->r_ctl.policer_bucket_size) {
- rack->r_ctl.policer_bucket_size = rack->r_ctl.last_amount_before_rec;
- }
- srtt = (uint64_t)rack_grab_rtt(tp, rack);
- if ((tp->t_srtt > 0) && (srtt > tp->t_srtt))
- srtt = tp->t_srtt;
- if ((srtt != 0) &&
- (tim < (uint64_t)srtt)) {
- /*
- * Not long enough.
- */
- if (rack_verbose_logging)
- policer_detection_log(rack,
- (uint32_t)tim,
- 0,
- 0,
- 0,
- 15);
- return;
- }
- /*
- * Finally update the b/w if its grown.
- */
- if (del_bw > rack->r_ctl.policer_bw) {
- rack->r_ctl.policer_bw = del_bw;
- rack->r_ctl.policer_max_seg = tcp_get_pacing_burst_size_w_divisor(rack->rc_tp,
- rack->r_ctl.policer_bw,
- min(ctf_fixed_maxseg(rack->rc_tp),
- rack->r_ctl.rc_pace_min_segs),
- 0, NULL,
- NULL, rack->r_ctl.pace_len_divisor);
- if (rack->r_ctl.policer_bucket_size < rack->r_ctl.policer_max_seg) {
- /* We must be able to send our max-seg or else chaos ensues */
- rack->r_ctl.policer_bucket_size = rack->r_ctl.policer_max_seg * 2;
- }
- }
- policer_detection_log(rack,
- rack->r_ctl.idle_snd_una,
- rack->r_ctl.ack_for_idle,
- 0,
- (uint32_t)tim,
- 3);
- }
-}
-
static void
rack_exit_recovery(struct tcpcb *tp, struct tcp_rack *rack, int how)
{
- /* now check with the policer if on */
- if (rack->policer_detect_on == 1) {
- policer_detection(tp, rack, 1);
- }
/*
- * Now exit recovery, note we must do the idle set after the policer_detection
- * to get the amount acked prior to recovery correct.
+ * Now exit recovery.
*/
- rack->r_ctl.idle_snd_una = tp->snd_una;
EXIT_RECOVERY(tp->t_flags);
}
@@ -6260,69 +5745,11 @@
tp->t_flags &= ~TF_WASFRECOVERY;
tp->t_flags &= ~TF_WASCRECOVERY;
if (!IN_FASTRECOVERY(tp->t_flags)) {
- struct rack_sendmap *rsm;
- struct timeval tv;
- uint32_t segsiz;
-
/* Check if this is the end of the initial Start-up i.e. initial slow-start */
if (rack->rc_initial_ss_comp == 0) {
/* Yep it is the end of the initial slowstart */
rack->rc_initial_ss_comp = 1;
}
- microuptime(&tv);
- rack->r_ctl.time_entered_recovery = tcp_tv_to_lusectick(&tv);
- if (SEQ_GEQ(ack, tp->snd_una)) {
- /*
- * The ack is above snd_una. Lets see
- * if we can establish a postive distance from
- * our idle mark.
- */
- rack->r_ctl.ack_for_idle = ack;
- if (SEQ_GT(ack, rack->r_ctl.idle_snd_una)) {
- rack->r_ctl.last_amount_before_rec = ack - rack->r_ctl.idle_snd_una;
- } else {
- /* No data thru yet */
- rack->r_ctl.last_amount_before_rec = 0;
- }
- } else if (SEQ_GT(tp->snd_una, rack->r_ctl.idle_snd_una)) {
- /*
- * The ack is out of order and behind the snd_una. It may
- * have contained SACK information which we processed else
- * we would have rejected it.
- */
- rack->r_ctl.ack_for_idle = tp->snd_una;
- rack->r_ctl.last_amount_before_rec = tp->snd_una - rack->r_ctl.idle_snd_una;
- } else {
- rack->r_ctl.ack_for_idle = ack;
- rack->r_ctl.last_amount_before_rec = 0;
- }
- if (rack->rc_policer_detected) {
- /*
- * If we are being policed and we have a loss, it
- * means our bucket is now empty. This can happen
- * where some other flow on the same host sends
- * that this connection is not aware of.
- */
- rack->r_ctl.current_policer_bucket = 0;
- if (rack_verbose_logging)
- policer_detection_log(rack, rack->r_ctl.last_amount_before_rec, 0, 0, 0, 4);
- if (rack->r_ctl.last_amount_before_rec > rack->r_ctl.policer_bucket_size) {
- rack->r_ctl.policer_bucket_size = rack->r_ctl.last_amount_before_rec;
- }
- }
- memset(rack->r_ctl.rc_cnt_of_retran, 0, sizeof(rack->r_ctl.rc_cnt_of_retran));
- segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
- TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
- /*
- * Go through the outstanding and re-peg
- * any that should have been left in the
- * retransmit list (on a double recovery).
- */
- if (rsm->r_act_rxt_cnt > 0) {
- rack_peg_rxt(rack, rsm, segsiz);
- }
- }
- rack->r_ctl.bytes_acked_in_recovery = 0;
rack->r_ctl.rc_prr_delivered = 0;
rack->r_ctl.rc_prr_out = 0;
rack->r_fast_output = 0;
@@ -6357,8 +5784,6 @@
rack->r_fast_output = 0;
if (IN_RECOVERY(tp->t_flags))
rack_exit_recovery(tp, rack, 2);
- rack->r_ctl.bytes_acked_in_recovery = 0;
- rack->r_ctl.time_entered_recovery = 0;
orig_cwnd = tp->snd_cwnd;
rack_log_to_prr(rack, 16, orig_cwnd, line);
if (CC_ALGO(tp)->cong_signal == NULL) {
@@ -7059,7 +6484,6 @@
rack->lt_bw_up = 1;
rack->r_persist_lt_bw_off = 0;
}
- rack->r_ctl.idle_snd_una = tp->snd_una;
rack->rc_in_persist = 0;
rack->r_ctl.rc_went_idle_time = 0;
tp->t_rxtshift = 0;
@@ -7520,16 +6944,6 @@
0, 0, 0);
return (1);
}
- if ((rack->policer_detect_on == 1) &&
- (rack->rc_policer_detected == 0)) {
- /*
- * We do this early if we have not
- * deteceted to attempt to detect
- * quicker. Normally we want to do this
- * as recovery exits (and we will again).
- */
- policer_detection(tp, rack, 0);
- }
return (0);
}
@@ -8718,86 +8132,6 @@
}
}
-/*
- * We maintain an array fo 16 (RETRAN_CNT_SIZE) entries. This
- * array is zeroed at the start of recovery. Each time a segment
- * is retransmitted, we translate that into a number of packets
- * (based on segsiz) and based on how many times its been retransmitted
- * increment by the number of packets the counter that represents
- * retansmitted N times. Index 0 is retransmitted 1 time, index 1
- * is retransmitted 2 times etc.
- *
- * So for example when we send a 4344 byte transmission with a 1448
- * byte segsize, and its the third time we have retransmitted this
- * segment, we would add to the rc_cnt_of_retran[2] the value of
- * 3. That represents 3 MSS were retransmitted 3 times (index is
- * the number of times retranmitted minus 1).
- */
-static void
-rack_peg_rxt(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t segsiz)
-{
- int idx;
- uint32_t peg;
-
- peg = ((rsm->r_end - rsm->r_start) + segsiz) - 1;
- peg /= segsiz;
- idx = rsm->r_act_rxt_cnt - 1;
- if (idx >= RETRAN_CNT_SIZE)
- idx = RETRAN_CNT_SIZE - 1;
- /* Max of a uint16_t retransmits in a bucket */
- if ((rack->r_ctl.rc_cnt_of_retran[idx] + peg) < 0xffff)
- rack->r_ctl.rc_cnt_of_retran[idx] += peg;
- else
- rack->r_ctl.rc_cnt_of_retran[idx] = 0xffff;
-}
-
-/*
- * We maintain an array fo 16 (RETRAN_CNT_SIZE) entries. This
- * array is zeroed at the start of recovery. Each time a segment
- * is retransmitted, we translate that into a number of packets
- * (based on segsiz) and based on how many times its been retransmitted
- * increment by the number of packets the counter that represents
- * retansmitted N times. Index 0 is retransmitted 1 time, index 1
- * is retransmitted 2 times etc.
- *
- * The rack_unpeg_rxt is used when we go to retransmit a segment
- * again. Basically if the segment had previously been retransmitted
- * say 3 times (as our previous example illustrated in the comment
- * above rack_peg_rxt() prior to calling that and incrementing
- * r_ack_rxt_cnt we would have called rack_unpeg_rxt() that would
- * subtract back the previous add from its last rxt (in this
- * example r_act_cnt would have been 2 for 2 retransmissions. So
- * we would have subtracted 3 from rc_cnt_of_reetran[1] to remove
- * those 3 segments. You will see this in the rack_update_rsm()
- * below where we do:
- * if (rsm->r_act_rxt_cnt > 0) {
- * rack_unpeg_rxt(rack, rsm, segsiz);
- * }
- * rsm->r_act_rxt_cnt++;
- * rack_peg_rxt(rack, rsm, segsiz);
- *
- * This effectively moves the count from rc_cnt_of_retran[1] to
- * rc_cnt_of_retran[2].
- */
-static void
-rack_unpeg_rxt(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t segsiz)
-{
- int idx;
- uint32_t peg;
-
- idx = rsm->r_act_rxt_cnt - 1;
- if (idx >= RETRAN_CNT_SIZE)
- idx = RETRAN_CNT_SIZE - 1;
- peg = ((rsm->r_end - rsm->r_start) + segsiz) - 1;
- peg /= segsiz;
- if (peg < rack->r_ctl.rc_cnt_of_retran[idx])
- rack->r_ctl.rc_cnt_of_retran[idx] -= peg;
- else {
- /* TSNH */
- rack->r_ctl.rc_cnt_of_retran[idx] = 0;
- }
-}
-
static void
rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
struct rack_sendmap *rsm, uint64_t ts, uint32_t add_flag, int segsiz)
@@ -8809,13 +8143,8 @@
rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS;
rsm->r_flags |= RACK_OVERMAX;
}
- if (rsm->r_act_rxt_cnt > 0) {
- /* Drop the count back for this, its retransmitting again */
- rack_unpeg_rxt(rack, rsm, segsiz);
- }
rsm->r_act_rxt_cnt++;
/* Peg the count/index */
- rack_peg_rxt(rack, rsm, segsiz);
rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
rsm->r_dupack = 0;
if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & RACK_TLP) == 0)) {
@@ -10768,9 +10097,6 @@
/* Save off the next one for quick reference. */
nrsm = tqhash_find(rack->r_ctl.tqh, end);
*prsm = rack->r_ctl.rc_sacklast = nrsm;
- if (IN_RECOVERY(tp->t_flags)) {
- rack->r_ctl.bytes_acked_in_recovery += changed;
- }
return (changed);
}
@@ -11085,10 +10411,6 @@
rsm->r_in_tmap = 0;
}
newly_acked = 1;
- if (((rsm->r_flags & RACK_ACKED) == 0) &&
- (IN_RECOVERY(tp->t_flags))) {
- rack->r_ctl.bytes_acked_in_recovery += (rsm->r_end - rsm->r_start);
- }
if (rsm->r_flags & RACK_ACKED) {
/*
* It was acked on the scoreboard -- remove
@@ -11171,10 +10493,6 @@
*/
rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start);
} else {
- if (((rsm->r_flags & RACK_ACKED) == 0) &&
- (IN_RECOVERY(tp->t_flags))) {
- rack->r_ctl.bytes_acked_in_recovery += (th_ack - rsm->r_start);
- }
rack_update_pcm_ack(rack, 1, rsm->r_start, th_ack);
}
/* And what about the lost flag? */
@@ -11325,8 +10643,6 @@
tp->snd_ssthresh = rack->r_ctl.rto_ssthresh;
}
}
- rack->r_ctl.bytes_acked_in_recovery = 0;
- rack->r_ctl.time_entered_recovery = 0;
}
rack->r_might_revert = 0;
}
@@ -12717,8 +12033,6 @@
if (tp->snd_una == tp->snd_max) {
/* Nothing left outstanding */
tp->t_flags &= ~TF_PREVVALID;
- rack->r_ctl.idle_snd_una = tp->snd_una;
- rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
if (rack->r_ctl.rc_went_idle_time == 0)
rack->r_ctl.rc_went_idle_time = 1;
rack->r_ctl.retran_during_recovery = 0;
@@ -13531,7 +12845,6 @@
rack->r_ctl.retran_during_recovery = 0;
rack->rc_suspicious = 0;
rack->r_ctl.dsack_byte_cnt = 0;
- rack->r_ctl.idle_snd_una = tp->snd_una;
rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
if (rack->r_ctl.rc_went_idle_time == 0)
rack->r_ctl.rc_went_idle_time = 1;
@@ -15250,36 +14563,6 @@
return (0);
}
-static void
-rack_translate_policer_detect(struct tcp_rack *rack, uint32_t optval)
-{
- /*
- * P = Percent of retransmits 499 = 49.9%
- * A = Average number 1 (.1%) -> 169 (16.9%)
- * M = Median number of retrans 1 - 16
- * MMMM MMMM AAAA AAAA PPPP PPPP PPPP PPPP
- *
- */
- uint16_t per, upp;
-
- per = optval & 0x0000ffff;
- rack->r_ctl.policer_rxt_threshold = (uint32_t)(per & 0xffff);
- upp = ((optval & 0xffff0000) >> 16);
- rack->r_ctl.policer_avg_threshold = (0x00ff & upp);
- rack->r_ctl.policer_med_threshold = ((upp >> 8) & 0x00ff);
- if ((rack->r_ctl.policer_rxt_threshold > 0) &&
- (rack->r_ctl.policer_avg_threshold > 0) &&
- (rack->r_ctl.policer_med_threshold > 0)) {
- rack->policer_detect_on = 1;
- } else {
- rack->policer_detect_on = 0;
- }
- rack->r_ctl.saved_policer_val = optval;
- policer_detection_log(rack, optval,
- rack->r_ctl.policer_avg_threshold,
- rack->r_ctl.policer_med_threshold,
- rack->r_ctl.policer_rxt_threshold, 11);
-}
static int32_t
rack_init(struct tcpcb *tp, void **ptr)
@@ -15351,17 +14634,6 @@
rack->r_ctl.rc_reorder_fade = rack_reorder_fade;
rack->rc_allow_data_af_clo = rack_ignore_data_after_close;
rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh;
- rack->r_ctl.policer_del_mss = rack_req_del_mss;
- if ((rack_policer_rxt_thresh > 0) &&
- (rack_policer_avg_thresh > 0) &&
- (rack_policer_med_thresh > 0)) {
- rack->r_ctl.policer_rxt_threshold = rack_policer_rxt_thresh;
- rack->r_ctl.policer_avg_threshold = rack_policer_avg_thresh;
- rack->r_ctl.policer_med_threshold = rack_policer_med_thresh;
- rack->policer_detect_on = 1;
- } else {
- rack->policer_detect_on = 0;
- }
if (rack_fill_cw_state)
rack->rc_pace_to_cwnd = 1;
if (rack_pacing_min_seg)
@@ -15418,7 +14690,6 @@
rack->r_ctl.last_tm_mark = 0xffffffffffffffff;
rack->r_ctl.rc_reorder_shift = rack_reorder_thresh;
rack->r_ctl.rc_pkt_delay = rack_pkt_delay;
- rack->r_ctl.pol_bw_comp = rack_policing_do_bw_comp;
rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp;
rack->r_ctl.rc_lowest_us_rtt = 0xffffffff;
rack->r_ctl.rc_highest_us_rtt = 0;
@@ -15454,7 +14725,6 @@
if (rack_honors_hpts_min_to)
rack->r_use_hpts_min = 1;
if (tp->snd_una != 0) {
- rack->r_ctl.idle_snd_una = tp->snd_una;
rack->rc_sendvars_notset = 0;
/*
* Make sure any TCP timers are not running.
@@ -18186,116 +17456,6 @@
return (slot);
}
-static uint32_t
-rack_policer_check_send(struct tcp_rack *rack, uint32_t len, uint32_t segsiz, uint32_t *needs)
-{
- uint64_t calc;
-
- rack->rc_policer_should_pace = 0;
- calc = rack_policer_bucket_reserve * rack->r_ctl.policer_bucket_size;
- calc /= 100;
- /*
- * Now lets look at if we want more than is in the bucket <or>
- * we want more than is reserved in the bucket.
- */
- if (rack_verbose_logging > 0)
- policer_detection_log(rack, len, segsiz, calc, rack->r_ctl.current_policer_bucket, 8);
- if ((calc > rack->r_ctl.current_policer_bucket) ||
- (len >= (rack->r_ctl.current_policer_bucket - calc))) {
- /*
- * We may want to pace depending on if we are going
- * into the reserve or not.
- */
- uint32_t newlen;
-
- if (calc > rack->r_ctl.current_policer_bucket) {
- /*
- * This will eat into the reserve if we
- * don't have room at all some lines
- * below will catch it.
- */
- newlen = rack->r_ctl.policer_max_seg;
- rack->rc_policer_should_pace = 1;
- } else {
- /*
- * We have all of the reserve plus something in the bucket
- * that we can give out.
- */
- newlen = rack->r_ctl.current_policer_bucket - calc;
- if (newlen < rack->r_ctl.policer_max_seg) {
- /*
- * Into the reserve to get a full policer_max_seg
- * so we set the len to that and eat into
- * the reserve. If we go over the code
- * below will make us wait.
- */
- newlen = rack->r_ctl.policer_max_seg;
- rack->rc_policer_should_pace = 1;
- }
- }
- if (newlen > rack->r_ctl.current_policer_bucket) {
- /* We have to wait some */
- *needs = newlen - rack->r_ctl.current_policer_bucket;
- return (0);
- }
- if (rack_verbose_logging > 0)
- policer_detection_log(rack, len, segsiz, newlen, 0, 9);
- len = newlen;
- } /* else we have all len available above the reserve */
- if (rack_verbose_logging > 0)
- policer_detection_log(rack, len, segsiz, calc, 0, 10);
- return (len);
-}
-
-static uint32_t
-rack_policed_sending(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, uint32_t segsiz, int call_line)
-{
- /*
- * Given a send of len, and a token bucket set at current_policer_bucket_size
- * are we close enough to the end of the bucket that we need to pace? If so
- * calculate out a time and return it. Otherwise subtract the tokens from
- * the bucket.
- */
- uint64_t calc;
-
- if ((rack->r_ctl.policer_bw == 0) ||
- (rack->r_ctl.policer_bucket_size < segsiz)) {
- /*
- * We should have an estimate here...
- */
- return (0);
- }
- calc = (uint64_t)rack_policer_bucket_reserve * (uint64_t)rack->r_ctl.policer_bucket_size;
- calc /= 100;
- if ((rack->r_ctl.current_policer_bucket < len) ||
- (rack->rc_policer_should_pace == 1) ||
- ((rack->r_ctl.current_policer_bucket - len) <= (uint32_t)calc)) {
- /* we need to pace */
- uint64_t lentim, res;
- uint32_t slot;
-
- lentim = (uint64_t)len * (uint64_t)HPTS_USEC_IN_SEC;
- res = lentim / rack->r_ctl.policer_bw;
- slot = (uint32_t)res;
- if (rack->r_ctl.current_policer_bucket > len)
- rack->r_ctl.current_policer_bucket -= len;
- else
- rack->r_ctl.current_policer_bucket = 0;
- policer_detection_log(rack, len, slot, (uint32_t)rack_policer_bucket_reserve, call_line, 5);
- rack->rc_policer_should_pace = 0;
- return(slot);
- }
- /* Just take tokens out of the bucket and let rack do whatever it would have */
- policer_detection_log(rack, len, 0, (uint32_t)rack_policer_bucket_reserve, call_line, 6);
- if (len < rack->r_ctl.current_policer_bucket) {
- rack->r_ctl.current_policer_bucket -= len;
- } else {
- rack->r_ctl.current_policer_bucket = 0;
- }
- return (0);
-}
-
-
static int32_t
rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz, int line)
{
@@ -18311,25 +17471,6 @@
pace_one = 1;
else
pace_one = 0;
- if (rack->rc_policer_detected == 1) {
- /*
- * A policer has been detected and we
- * have all of our data (policer-bw and
- * policer bucket size) calculated. Call
- * into the function to find out if we are
- * overriding the time.
- */
- slot = rack_policed_sending(rack, tp, len, segsiz, line);
- if (slot) {
- uint64_t logbw;
-
- logbw = rack->r_ctl.current_policer_bucket;
- logbw <<= 32;
- logbw |= rack->r_ctl.policer_bucket_size;
- rack_log_pacing_delay_calc(rack, len, slot, rack->r_ctl.policer_bw, logbw, 0, 89, __LINE__, NULL, 0);
- return(slot);
- }
- }
if (rack->rc_always_pace == 0) {
/*
* We use the most optimistic possible cwnd/srtt for
@@ -20536,25 +19677,6 @@
return (NULL);
}
-static void
-rack_credit_back_policer_idle_time(struct tcp_rack *rack, uint64_t idle_t, int line)
-{
- /*
- * We were idle some time (idle_t) and so our policer bucket
- * needs to grow. It can go no higher than policer_bucket_size.
- */
- uint64_t len;
-
- len = idle_t * rack->r_ctl.policer_bw;
- len /= HPTS_USEC_IN_SEC;
- rack->r_ctl.current_policer_bucket += (uint32_t)len;
- if (rack->r_ctl.policer_bucket_size < rack->r_ctl.current_policer_bucket) {
- rack->r_ctl.current_policer_bucket = rack->r_ctl.policer_bucket_size;
- }
- if (rack_verbose_logging > 0)
- policer_detection_log(rack, (uint32_t)len, line, (uint32_t)idle_t, 0, 7);
-}
-
static inline void
rack_validate_sizes(struct tcp_rack *rack, int32_t *len, int32_t segsiz, uint32_t pace_max_seg)
{
@@ -20872,34 +19994,12 @@
}
}
}
- if(rack->policer_detect_on) {
- /*
- * If we are doing policer detetion we at a minium
- * record the time but if possible add back to
- * the bucket based on the idle time.
- */
- uint64_t idle_t, u64_cts;
-
- segsiz = min(ctf_fixed_maxseg(tp),
- rack->r_ctl.rc_pace_min_segs);
- u64_cts = tcp_tv_to_lusectick(&tv);
- if ((rack->rc_policer_detected == 1) &&
- (rack->r_ctl.policer_bucket_size > segsiz) &&
- (rack->r_ctl.policer_bw > 0) &&
- (u64_cts > rack->r_ctl.last_sendtime)) {
- /* We are being policed add back the time */
- idle_t = u64_cts - rack->r_ctl.last_sendtime;
- rack_credit_back_policer_idle_time(rack, idle_t, __LINE__);
- }
- rack->r_ctl.last_sendtime = u64_cts;
- }
if (rack_use_fsb &&
(rack->r_ctl.fsb.tcp_ip_hdr) &&
(rack->r_fsb_inited == 0) &&
(rack->r_state != TCPS_CLOSED))
rack_init_fsb_block(tp, rack, tcp_outflags[tp->t_state]);
if (rack->rc_sendvars_notset == 1) {
- rack->r_ctl.idle_snd_una = tp->snd_una;
rack->rc_sendvars_notset = 0;
/*
* Make sure any TCP timers (keep-alive) is not running.
@@ -21215,19 +20315,10 @@
((rsm->r_flags & RACK_HAS_FIN) == 0)) {
int ret;
- if ((rack->rc_policer_detected == 1) &&
- (rack->r_ctl.policer_bucket_size > segsiz) &&
- (rack->r_ctl.policer_bw > 0)) {
- /* Check to see if there is room */
- if (rack->r_ctl.current_policer_bucket < len) {
- goto skip_fast_output;
- }
- }
ret = rack_fast_rsm_output(tp, rack, rsm, ts_val, cts, ms_cts, &tv, len, doing_tlp);
if (ret == 0)
return (0);
}
-skip_fast_output:
so = inp->inp_socket;
sb = &so->so_snd;
if (do_a_prefetch == 0) {
@@ -21418,43 +20509,6 @@
prefetch_so_done = 1;
}
orig_len = len;
- if ((rack->rc_policer_detected == 1) &&
- (rack->r_ctl.policer_bucket_size > segsiz) &&
- (rack->r_ctl.policer_bw > 0) &&
- (len > 0)) {
- /*
- * Ok we believe we have a policer watching
- * what we send, can we send len? If not can
- * we tune it down to a smaller value?
- */
- uint32_t plen, buck_needs;
-
- plen = rack_policer_check_send(rack, len, segsiz, &buck_needs);
- if (plen == 0) {
- /*
- * We are not allowed to send. How long
- * do we need to pace for i.e. how long
- * before len is available to send?
- */
- uint64_t lentime;
-
- lentime = buck_needs;
- lentime *= HPTS_USEC_IN_SEC;
- lentime /= rack->r_ctl.policer_bw;
- slot = (uint32_t)lentime;
- tot_len_this_send = 0;
- SOCKBUF_UNLOCK(sb);
- if (rack_verbose_logging > 0)
- policer_detection_log(rack, len, slot, buck_needs, 0, 12);
- rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
- rack_log_type_just_return(rack, cts, 0, slot, hpts_calling, 0, cwnd_to_use);
- goto just_return_clean;
- }
- if (plen < len) {
- sendalot = 0;
- len = plen;
- }
- }
/*
* Lop off SYN bit if it has already been sent. However, if this is
* SYN-SENT state and if segment contains data and if we don't know
@@ -21853,7 +20907,6 @@
rack->r_ctl.fsb.recwin = recwin;
slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz, __LINE__);
if ((error == 0) &&
- (rack->rc_policer_detected == 0) &&
rack_use_rfo &&
((flags & (TH_SYN|TH_FIN)) == 0) &&
(ipoptlen == 0) &&
@@ -22038,7 +21091,6 @@
rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack);
rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use);
}
-just_return_clean:
#ifdef NETFLIX_SHARED_CWND
if ((sbavail(sb) == 0) &&
rack->r_ctl.rc_scw) {
@@ -23498,7 +22550,6 @@
(rsm == NULL) &&
(ipoptlen == 0) &&
(tp->rcv_numsacks == 0) &&
- (rack->rc_policer_detected == 0) &&
rack->r_fsb_inited &&
TCPS_HAVEESTABLISHED(tp->t_state) &&
((IN_RECOVERY(tp->t_flags)) == 0) &&
@@ -23909,28 +22960,7 @@
static int
rack_stack_information(struct tcpcb *tp, struct stack_specific_info *si)
{
- /*
- * Gather rack specific information.
- */
- struct tcp_rack *rack;
-
- rack = (struct tcp_rack *)tp->t_fb_ptr;
/* We pulled a SSI info log out what was there */
- policer_detection_log(rack, rack->rc_highly_buffered, 0, 0, 0, 20);
- if (rack->policer_detect_on) {
- si->policer_detection_enabled = 1;
- if (rack->rc_policer_detected) {
- si->policer_detected = 1;
- si->policer_bucket_size = rack->r_ctl.policer_bucket_size;
- si->policer_last_bw = rack->r_ctl.policer_bw;
- } else {
- si->policer_detected = 0;
- si->policer_bucket_size = 0;
- si->policer_last_bw = 0;
- }
- si->current_round = rack->r_ctl.current_round;
- si->highly_buffered = rack->rc_highly_buffered;
- }
si->bytes_transmitted = tp->t_sndbytes;
si->bytes_retransmitted = tp->t_snd_rxt_bytes;
return (0);
@@ -24161,36 +23191,6 @@
case TCP_RACK_DGP_IN_REC:
error = EINVAL;
break;
- case TCP_POLICER_DETECT: /* URL:pol_det */
- RACK_OPTS_INC(tcp_pol_detect);
- rack_translate_policer_detect(rack, optval);
- break;
- case TCP_POLICER_MSS:
- RACK_OPTS_INC(tcp_pol_mss);
- rack->r_ctl.policer_del_mss = (uint8_t)optval;
- if (optval & 0x00000100) {
- /*
- * Value is setup like so:
- * VVVV VVVV VVVV VVVV VVVV VVAI MMMM MMMM
- * Where MMMM MMMM is MSS setting
- * I (9th bit) is the Postive value that
- * says it is being set (if its 0 then the
- * upper bits 11 - 32 have no meaning.
- * This allows setting it off with
- * 0x000001MM.
- *
- * The 10th bit is used to turn on the
- * alternate median (not the expanded one).
- *
- */
- rack->r_ctl.pol_bw_comp = (optval >> 10);
- }
- if (optval & 0x00000200) {
- rack->r_ctl.policer_alt_median = 1;
- } else {
- rack->r_ctl.policer_alt_median = 0;
- }
- break;
case TCP_RACK_PACE_TO_FILL:
RACK_OPTS_INC(tcp_fillcw);
if (optval == 0)
@@ -24857,43 +23857,6 @@
dest->r_limit_scw = src->r_limit_scw;
cnt++;
}
- /* TCP_POLICER_DETECT */
- if (dest->r_ctl.policer_rxt_threshold != src->r_ctl.policer_rxt_threshold) {
- dest->r_ctl.policer_rxt_threshold = src->r_ctl.policer_rxt_threshold;
- cnt++;
- }
- if (dest->r_ctl.policer_avg_threshold != src->r_ctl.policer_avg_threshold) {
- dest->r_ctl.policer_avg_threshold = src->r_ctl.policer_avg_threshold;
- cnt++;
- }
- if (dest->r_ctl.policer_med_threshold != src->r_ctl.policer_med_threshold) {
- dest->r_ctl.policer_med_threshold = src->r_ctl.policer_med_threshold;
- cnt++;
- }
- if (dest->policer_detect_on != src->policer_detect_on) {
- dest->policer_detect_on = src->policer_detect_on;
- cnt++;
- }
-
- if (dest->r_ctl.saved_policer_val != src->r_ctl.saved_policer_val) {
- dest->r_ctl.saved_policer_val = src->r_ctl.saved_policer_val;
- cnt++;
- }
- /* TCP_POLICER_MSS */
- if (dest->r_ctl.policer_del_mss != src->r_ctl.policer_del_mss) {
- dest->r_ctl.policer_del_mss = src->r_ctl.policer_del_mss;
- cnt++;
- }
-
- if (dest->r_ctl.pol_bw_comp != src->r_ctl.pol_bw_comp) {
- dest->r_ctl.pol_bw_comp = src->r_ctl.pol_bw_comp;
- cnt++;
- }
-
- if (dest->r_ctl.policer_alt_median != src->r_ctl.policer_alt_median) {
- dest->r_ctl.policer_alt_median = src->r_ctl.policer_alt_median;
- cnt++;
- }
/* TCP_RACK_PACE_TO_FILL */
if (dest->rc_pace_to_cwnd != src->rc_pace_to_cwnd) {
dest->rc_pace_to_cwnd = src->rc_pace_to_cwnd;
@@ -25345,8 +24308,6 @@
case TCP_RACK_PACING_BETA_ECN: /* URL:pacing_beta_ecn */
case TCP_RACK_PACE_TO_FILL: /* URL:fillcw */
/* End pacing related */
- case TCP_POLICER_DETECT: /* URL:pol_det */
- case TCP_POLICER_MSS: /* URL:pol_mss */
case TCP_DELACK: /* URL:delack (in base TCP i.e. tcp_hints along with cc etc ) */
case TCP_RACK_PRR_SENDALOT: /* URL:prr_sendalot */
case TCP_RACK_MIN_TO: /* URL:min_to */
@@ -25590,12 +24551,6 @@
case TCP_RACK_HI_BETA:
optval = rack->rack_hibeta;
break;
- case TCP_POLICER_MSS:
- optval = rack->r_ctl.policer_del_mss;
- break;
- case TCP_POLICER_DETECT:
- optval = rack->r_ctl.saved_policer_val;
- break;
case TCP_DEFER_OPTIONS:
optval = rack->defer_options;
break;
diff --git a/sys/netinet/tcp_stacks/tcp_rack.h b/sys/netinet/tcp_stacks/tcp_rack.h
--- a/sys/netinet/tcp_stacks/tcp_rack.h
+++ b/sys/netinet/tcp_stacks/tcp_rack.h
@@ -484,12 +484,6 @@
int32_t rc_rtt_diff; /* Timely style rtt diff of our gp_srtt */
uint64_t last_tmit_time_acked; /* Holds the last cumack point's last send time */
/* Recovery stats */
- uint64_t time_entered_recovery;
- uint64_t bytes_acked_in_recovery;
- /* Policer Detection */
- uint64_t last_policer_sndbytes;
- uint64_t last_policer_snd_rxt_bytes;
- uint64_t policer_bw;
uint64_t last_sendtime;
uint64_t last_gpest;
@@ -502,19 +496,9 @@
uint32_t gp_rnd_thresh;
uint32_t ss_hi_fs;
uint32_t gate_to_fs;
- uint32_t policer_max_seg;
- uint32_t pol_bw_comp;
- uint16_t policer_rxt_threshold;
- uint8_t policer_avg_threshold;
- uint8_t policer_med_threshold;
uint32_t pcm_max_seg;
uint32_t last_pcm_round;
uint32_t pcm_idle_rounds;
- uint32_t current_policer_bucket;
- uint32_t policer_bucket_size;
- uint32_t idle_snd_una;
- uint32_t ack_for_idle;
- uint32_t last_amount_before_rec;
uint32_t rc_gp_srtt; /* Current GP srtt */
uint32_t rc_prev_gp_srtt; /* Previous RTT */
@@ -558,7 +542,6 @@
uint32_t persist_lost_ends;
uint32_t input_pkt;
uint32_t saved_input_pkt;
- uint32_t saved_policer_val; /* The encoded value we used to setup policer detection */
uint32_t cleared_app_ack_seq;
uint32_t last_rcv_tstmp_for_rtt;
uint32_t last_time_of_arm_rcv;
@@ -578,7 +561,6 @@
uint16_t rc_cnt_of_retran[RETRAN_CNT_SIZE];
uint16_t rc_early_recovery_segs; /* Socket option value Lock(a) */
uint16_t rc_reorder_shift; /* Socket option value Lock(a) */
- uint8_t policer_del_mss; /* How many mss during recovery for policer detection */
uint8_t rack_per_upper_bound_ss;
uint8_t rack_per_upper_bound_ca;
uint8_t cleared_app_ack;
@@ -590,7 +572,6 @@
uint8_t rc_tlp_cwnd_reduce; /* Socket option value Lock(a) */
uint8_t rc_prr_sendalot;/* Socket option value Lock(a) */
uint8_t rc_rate_sample_method;
- uint8_t policer_alt_median; /* Alternate median for policer detection */
uint8_t full_dgp_in_rec; /* Flag to say if we do full DGP in recovery */
uint8_t client_suggested_maxseg; /* Not sure what to do with this yet */
uint8_t use_gp_not_last;
@@ -792,12 +773,9 @@
r_collapse_point_valid : 1,
dgp_on : 1;
uint16_t rto_from_rec: 1,
- avail_bit: 1,
+ avail_bit: 4,
pcm_in_progress: 1,
pcm_needed: 1,
- policer_detect_on: 1, /* Are we detecting policers? */
- rc_policer_detected : 1, /* We are beiing policed */
- rc_policer_should_pace : 1, /* The sizing algo thinks we should pace */
rc_sendvars_notset : 1, /* Inside rack_init send variables (snd_max/una etc) were not set */
rc_gp_rtt_set : 1,
rc_gp_dyn_mul : 1,

File Metadata

Mime Type
text/plain
Expires
Sat, Nov 16, 10:42 AM (21 h, 12 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
14657419
Default Alt Text
D45410.diff (52 KB)

Event Timeline