@@ -110,6 +110,9 @@ u8:2 syn_ect_rcv read_mostly read_w
u8:1 wait_third_ack read_write
u8:2 accecn_minlen write_mostly read_write
u8:2 est_ecnfield read_write
+u8:2 accecn_opt_demand read_mostly read_write
+u8:2 prev_ecnfield read_write
+u64 accecn_opt_tstamp read_write
u8:4 accecn_fail_mode
u32 lost read_mostly tcp_ack
u32 app_limited read_write read_mostly tcp_rate_check_app_limited,tcp_rate_skb_sent(tx);tcp_rate_gen(rx)
@@ -276,6 +276,7 @@ struct tcp_sock {
u32 mdev_us; /* medium deviation */
u32 rtt_seq; /* sequence number to update rttvar */
u64 tcp_wstamp_ns; /* departure time for next sent data packet */
+ u64 accecn_opt_tstamp; /* Last AccECN option sent timestamp */
struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */
struct sk_buff *highest_sack; /* skb just after the highest
* skb with SACKed bit set
@@ -297,7 +298,8 @@ struct tcp_sock {
unused2:4;
u8 accecn_minlen:2,/* Minimum length of AccECN option sent */
est_ecnfield:2,/* ECN field for AccECN delivered estimates */
- unused3:4;
+ accecn_opt_demand:2,/* Demand AccECN option for n next ACKs */
+ prev_ecnfield:2; /* ECN bits from the previous segment */
__be32 pred_flags;
u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */
u64 tcp_mstamp; /* most recent packet received/sent */
@@ -149,6 +149,7 @@ struct netns_ipv4 {
u8 sysctl_tcp_ecn;
u8 sysctl_tcp_ecn_option;
+ u8 sysctl_tcp_ecn_option_beacon;
u8 sysctl_tcp_ecn_fallback;
u8 sysctl_ip_default_ttl;
@@ -91,6 +91,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
/* Maximal number of window scale according to RFC1323 */
#define TCP_MAX_WSCALE 14U
+/* Default number of AccECN option transmissions per RTT */
+#define TCP_ECN_OPTION_BEACON 3
+
/* urg_data states */
#define TCP_URG_VALID 0x0100
#define TCP_URG_NOTYET 0x0200
@@ -1088,6 +1091,7 @@ static inline void tcp_accecn_init_counters(struct tcp_sock *tp)
__tcp_accecn_init_bytes_counters(tp->received_ecn_bytes);
__tcp_accecn_init_bytes_counters(tp->delivered_ecn_bytes);
tp->accecn_minlen = 0;
+ tp->accecn_opt_demand = 0;
tp->est_ecnfield = 0;
}
@@ -740,6 +740,15 @@ static struct ctl_table ipv4_net_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_TWO,
},
+ {
+ .procname = "tcp_ecn_option_beacon",
+ .data = &init_net.ipv4.sysctl_tcp_ecn_option_beacon,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_THREE,
+ },
{
.procname = "tcp_ecn_fallback",
.data = &init_net.ipv4.sysctl_tcp_ecn_fallback,
@@ -3368,6 +3368,8 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->wait_third_ack = 0;
tp->accecn_fail_mode = 0;
tcp_accecn_init_counters(tp);
+ tp->prev_ecnfield = 0;
+ tp->accecn_opt_tstamp = 0;
if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release)
icsk->icsk_ca_ops->release(sk);
memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
@@ -5087,11 +5089,12 @@ static void __init tcp_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, accecn_opt_tstamp);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags);
- CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 89);
+ CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 97);
/* TXRX read-write hotpath cache lines */
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags);
@@ -487,6 +487,7 @@ static void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th,
default:
tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK;
+ tp->accecn_opt_demand = 2;
if (INET_ECN_is_ce(ip_dsfield) &&
tcp_accecn_validate_syn_feedback(sk, ace,
tp->syn_ect_snt)) {
@@ -507,6 +508,7 @@ static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th,
} else {
tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield &
INET_ECN_MASK;
+ tp->prev_ecnfield = tp->syn_ect_rcv;
tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
}
}
@@ -6302,6 +6304,7 @@ void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb,
u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK;
u8 is_ce = INET_ECN_is_ce(ecnfield);
struct tcp_sock *tp = tcp_sk(sk);
+ bool ecn_edge;
if (!INET_ECN_is_not_ect(ecnfield)) {
u32 pcount = is_ce * max_t(u16, 1, skb_shinfo(skb)->gso_segs);
@@ -6315,9 +6318,36 @@ void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb,
if (payload_len > 0) {
u8 minlen = tcp_ecnfield_to_accecn_optfield(ecnfield);
+ u32 oldbytes = tp->received_ecn_bytes[ecnfield - 1];
+
tp->received_ecn_bytes[ecnfield - 1] += payload_len;
tp->accecn_minlen = max_t(u8, tp->accecn_minlen,
minlen);
+
+ /* Demand AccECN option at least every 2^22 bytes to
+ * avoid overflowing the ECN byte counters.
+ */
+ if ((tp->received_ecn_bytes[ecnfield - 1] ^ oldbytes) &
+ ~((1 << 22) - 1)) {
+ u8 opt_demand = max_t(u8, 1,
+ tp->accecn_opt_demand);
+
+ tp->accecn_opt_demand = opt_demand;
+ }
+ }
+ }
+
+ ecn_edge = tp->prev_ecnfield != ecnfield;
+ if (ecn_edge || is_ce) {
+ tp->prev_ecnfield = ecnfield;
+ /* Demand Accurate ECN change-triggered ACKs. Two ACKs are
+ * demanded to indicate the ecnfield value unambiguously in
+ * the latter ACK.
+ */
+ if (tcp_ecn_mode_accecn(tp)) {
+ if (ecn_edge)
+ inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
+ tp->accecn_opt_demand = 2;
}
}
}
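
The XOR/mask test above fires exactly when received_ecn_bytes[] crosses a 2^22-byte boundary, i.e. when any bit above bit 21 differs between the old and new counter value; since the on-wire AccECN option carries only 24-bit byte counters, demanding an option at that point keeps the peer's view of the counters unambiguous. A minimal standalone sketch of the boundary test (illustrative only, not part of the patch; crossed_boundary() is a hypothetical helper, and the trick assumes per-increment steps below 2^22, which any TCP payload satisfies):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool crossed_boundary(uint32_t oldbytes, uint32_t newbytes)
{
	/* True iff a bit above bit 21 changed, i.e. this increment
	 * stepped over a multiple of 2^22 bytes.
	 */
	return ((newbytes ^ oldbytes) & ~((1u << 22) - 1)) != 0;
}

int main(void)
{
	printf("%d\n", crossed_boundary(0x3ffffe, 0x400001)); /* 1: crossed 4 MiB */
	printf("%d\n", crossed_boundary(0x400001, 0x7fffff)); /* 0: same window */
	return 0;
}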
@@ -6450,8 +6480,12 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
* RFC 5961 4.2 : Send a challenge ack
*/
if (th->syn) {
- if (tcp_ecn_mode_accecn(tp))
+ if (tcp_ecn_mode_accecn(tp)) {
+ u8 opt_demand = max_t(u8, 1, tp->accecn_opt_demand);
+
accecn_reflector = true;
+ tp->accecn_opt_demand = opt_demand;
+ }
if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack &&
TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq &&
TCP_SKB_CB(skb)->seq + 1 == tp->rcv_nxt &&
@@ -3451,6 +3451,7 @@ static int __net_init tcp_sk_init(struct net *net)
{
net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN;
net->ipv4.sysctl_tcp_ecn_option = TCP_ECN_OPTION_FULL;
+ net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ECN_OPTION_BEACON;
net->ipv4.sysctl_tcp_ecn_fallback = 1;
net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
@@ -501,6 +501,8 @@ static void tcp_ecn_openreq_child(struct sock *sk,
tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
tp->syn_ect_snt = treq->syn_ect_snt;
tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt);
+ tp->prev_ecnfield = treq->syn_ect_rcv;
+ tp->accecn_opt_demand = 1;
tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4);
} else {
tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ?
@@ -806,8 +806,12 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
*ptr++ = htonl(((e0b & 0xffffff) << 8) |
TCPOPT_NOP);
}
- if (tp)
+ if (tp) {
tp->accecn_minlen = 0;
+ tp->accecn_opt_tstamp = tp->tcp_mstamp;
+ if (tp->accecn_opt_demand)
+ tp->accecn_opt_demand--;
+ }
}
if (unlikely(OPTION_SACK_ADVERTISE & options)) {
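
Once the option has actually been emitted, the hunk above updates three pieces of per-socket TX state: accecn_minlen is cleared, the beacon timestamp is restarted, and one unit of outstanding option demand is consumed. A compact sketch of that bookkeeping using hypothetical names rather than the real tcp_sock fields:

#include <stdint.h>

/* Hypothetical mirror of the AccECN TX bookkeeping shown above */
struct accecn_tx_state {
	uint64_t opt_tstamp;	/* when an AccECN option was last sent */
	uint8_t  opt_demand;	/* ACKs still owed an AccECN option */
	uint8_t  minlen;	/* minimum option length currently required */
};

static void accecn_option_written(struct accecn_tx_state *st, uint64_t now_us)
{
	st->minlen = 0;			/* counters reported; start over */
	st->opt_tstamp = now_us;	/* restart the per-RTT beacon interval */
	if (st->opt_demand)
		st->opt_demand--;	/* one demanded option satisfied */
}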
@@ -984,6 +988,18 @@ static int tcp_options_fit_accecn(struct tcp_out_options *opts, int required,
return size;
}
+static bool tcp_accecn_option_beacon_check(const struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!sock_net(sk)->ipv4.sysctl_tcp_ecn_option_beacon)
+ return false;
+
+ return tcp_stamp_us_delta(tp->tcp_mstamp, tp->accecn_opt_tstamp) *
+ sock_net(sk)->ipv4.sysctl_tcp_ecn_option_beacon >=
+ (tp->srtt_us >> 3);
+}
+
/* Compute TCP options for SYN packets. This is not the final
* network wire format yet.
*/
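
tcp_accecn_option_beacon_check() above rewrites "elapsed >= SRTT / beacon" as a multiplication to avoid a division: tp->srtt_us stores 8 * SRTT in microseconds, so the option becomes eligible again once roughly SRTT / sysctl_tcp_ecn_option_beacon has passed, giving about three options per RTT with the default setting. A standalone sketch of the same test with hypothetical names:

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical userspace mirror of the beacon test above */
static bool beacon_allows_option(uint64_t now_us, uint64_t last_opt_us,
				 uint32_t srtt_shifted_us, uint8_t beacon)
{
	if (!beacon)
		return false;
	/* srtt_shifted_us mirrors tp->srtt_us, which holds 8 * SRTT */
	return (now_us - last_opt_us) * beacon >= (srtt_shifted_us >> 3);
}

/* Example: SRTT = 40 ms is stored as 320000; with beacon = 3 the next
 * AccECN option is permitted once ~13.4 ms have elapsed since the last one.
 */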
@@ -1236,12 +1252,16 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
}
if (tcp_ecn_mode_accecn(tp) &&
- sock_net(sk)->ipv4.sysctl_tcp_ecn_option) {
+ sock_net(sk)->ipv4.sysctl_tcp_ecn_option &&
+ (sock_net(sk)->ipv4.sysctl_tcp_ecn_option >= TCP_ECN_OPTION_FULL ||
+ tp->accecn_opt_demand ||
+ tcp_accecn_option_beacon_check(sk))) {
int saving = opts->num_sack_blocks > 0 ? 2 : 0;
int remaining = MAX_TCP_OPTION_SPACE - size;
opts->ecn_bytes = tp->received_ecn_bytes;
- size += tcp_options_fit_accecn(opts, tp->accecn_minlen,
+ size += tcp_options_fit_accecn(opts,
+ tp->accecn_minlen,
remaining,
saving);
}
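
Taken together, the reduced option modes no longer place the AccECN option on every ACK: it is sent only when explicitly demanded (counter-wrap protection, an ECN field change, or the SYN reflector path) or when the beacon interval has elapsed, whereas the full mode keeps sending it unconditionally. A boolean sketch of that decision (hypothetical helper; the mode is flattened to a flag rather than guessing at the exact sysctl values):

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical summary of the condition added above */
static bool want_accecn_option(bool accecn_mode, bool option_enabled,
			       bool full_option_mode, uint8_t opt_demand,
			       bool beacon_fires)
{
	if (!accecn_mode || !option_enabled)
		return false;
	return full_option_mode || opt_demand > 0 || beacon_fires;
}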
@@ -2959,6 +2979,11 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
sent_pkts = 0;
tcp_mstamp_refresh(tp);
+
+ /* The AccECN option beacon depends on the just-refreshed mstamp;
+ * if it fires, the option space and hence the MSS change.
+ */
+ if (tcp_ecn_mode_accecn(tp) && tcp_accecn_option_beacon_check(sk))
+ mss_now = tcp_current_mss(sk);
+
if (!push_one) {
/* Do MTU probing. */
result = tcp_mtu_probe(sk);
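
For context (not part of the patch), the MSS is recomputed here because the beacon decision changes how much option space the next segment needs, which in turn changes how much payload fits. A rough arithmetic sketch, assuming a full three-counter AccECN option (kind + length + three 24-bit counters = 11 bytes, NOP-padded to 12 as in tcp_options_write() above) next to the usual 12-byte timestamp option:

#include <stdio.h>

int main(void)
{
	int mtu = 1500;
	int ip_hdr = 20, tcp_hdr = 20;
	int tstamp_opt = 12;	/* TCPOLEN_TSTAMP_ALIGNED */
	int accecn_opt = 12;	/* 11-byte full AccECN option + 1 NOP pad */

	printf("mss without AccECN option: %d\n",
	       mtu - ip_hdr - tcp_hdr - tstamp_opt);			/* 1448 */
	printf("mss with AccECN option:    %d\n",
	       mtu - ip_hdr - tcp_hdr - tstamp_opt - accecn_opt);	/* 1436 */
	return 0;
}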