net/mlx4: tcp_drop replace of tcp_drop_new

Message ID: 20210821155327.251284-1-yan2228598786@gmail.com
State: New
Series: net/mlx4: tcp_drop replace of tcp_drop_new

Commit Message

Zhongya Yan Aug. 21, 2021, 3:53 p.m. UTC
We never know why a TCP packet is being dropped at the moment it is
dropped; the new tcp_drop_new() function addresses this. Through its
reason argument, tcp_drop_new() records why the packet was dropped
and from which caller it was dropped. The kernel should be a little
more open about data that is about to be destroyed as useless, and
users should be able to keep track of it.

Signed-off-by: jony-one <yan2228598786@gmail.com>
---
 include/trace/events/tcp.h | 51 ++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_input.c       | 29 ++++++++++++++--------
 2 files changed, 69 insertions(+), 11 deletions(-)

Comments

Yonghong Song Aug. 23, 2021, 12:21 a.m. UTC | #1
On 8/21/21 8:53 AM, jony-one wrote:
> We never know why a TCP packet is being dropped at the moment it is
> dropped; the new tcp_drop_new() function addresses this. Through its
> reason argument, tcp_drop_new() records why the packet was dropped
> and from which caller it was dropped. The kernel should be a little
> more open about data that is about to be destroyed as useless, and
> users should be able to keep track of it.

For the subject, the prefix "net/mlx4" is not accurate. This patch has
nothing to do with mlx4; just use "net". The rest of the subject is not
accurate either, maybe "introduce tracepoint tcp_drop". The tracepoint
name "tcp_drop" seems more accurate than "tcp_drop_new".

It is worthwhile to mention the context of why we want this tracepoint,
with bcc issue https://github.com/iovisor/bcc/issues/3533. Mainly two
reasons: (1) tcp_drop() is a tiny function which may easily get
inlined, so a tracepoint is more stable; and (2) tcp_drop() does not
provide enough information on why the packet is dropped.

> 
> Signed-off-by: jony-one <yan2228598786@gmail.com>

Please use your proper name. "jony-one" is not a good one.

> ---
>   include/trace/events/tcp.h | 51 ++++++++++++++++++++++++++++++++++++++
>   net/ipv4/tcp_input.c       | 29 ++++++++++++++--------
>   2 files changed, 69 insertions(+), 11 deletions(-)
> 
> diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
> index 521059d8d..5a0478440 100644
> --- a/include/trace/events/tcp.h
> +++ b/include/trace/events/tcp.h
> @@ -371,6 +371,57 @@ DEFINE_EVENT(tcp_event_skb, tcp_bad_csum,
>   	TP_ARGS(skb)
>   );
>   
> +/*
> + * tcp event with arguments sk, skb, reason
> + */
> +TRACE_EVENT(tcp_drop_new,

tcp_drop instead of tcp_drop_new?

> +
> +		TP_PROTO(struct sock *sk, struct sk_buff *skb, const char *reason),
> +
> +		TP_ARGS(sk, skb, reason),
> +
> +		TP_STRUCT__entry(
> +			__field(const void *, skbaddr)
> +			__field(const void *, skaddr)
> +			__string(reason, reason)

I understand we may want to print out a more elaborate reason, but for
the kernel-internal implementation an enum should be enough. See
tracepoint sched/sched_switch.
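
For illustration, a sketch (not part of the posted patch) of what an
enum-based reason could look like, loosely modeled on sched_switch's
use of __print_symbolic(); the enum, its values, and the reduced event
shape below are hypothetical:

enum tcp_drop_reason {
	TCP_DROP_OFO_QUEUE,	/* stale skb in tcp_ofo_queue() */
	TCP_DROP_OFO_MERGE,	/* duplicate ooo segment merged away */
	TCP_DROP_RMEM,		/* tcp_try_rmem_schedule() failed */
};

#define show_tcp_drop_reason(val)			\
	__print_symbolic(val,				\
		{ TCP_DROP_OFO_QUEUE,	"OFO_QUEUE" },	\
		{ TCP_DROP_OFO_MERGE,	"OFO_MERGE" },	\
		{ TCP_DROP_RMEM,	"RMEM" })

TRACE_EVENT(tcp_drop,

	TP_PROTO(struct sock *sk, struct sk_buff *skb, int reason),

	TP_ARGS(sk, skb, reason),

	TP_STRUCT__entry(
		__field(const void *, skbaddr)
		__field(const void *, skaddr)
		__field(int, reason)
	),

	TP_fast_assign(
		__entry->skbaddr = skb;
		__entry->skaddr = sk;
		__entry->reason = reason;
	),

	TP_printk("skbaddr=%p reason=%s",
		  __entry->skbaddr, show_tcp_drop_reason(__entry->reason))
);

An integer field also keeps the ring-buffer record fixed-size, which
the __string()/__assign_str() pair does not.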

> +			__field(int, state)
> +			__field(__u16, sport)
> +			__field(__u16, dport)
> +			__array(__u8, saddr, 4)
> +			__array(__u8, daddr, 4)
> +			__array(__u8, saddr_v6, 16)
> +			__array(__u8, daddr_v6, 16)
> +		),
> +
> +		TP_fast_assign(
> +			struct inet_sock *inet = inet_sk(sk);
> +			__be32 *p32;
> +
> +			__assign_str(reason, reason);
> +
> +			__entry->skbaddr = skb;
> +			__entry->skaddr = sk;
> +			__entry->state = sk->sk_state;
> +
> +			__entry->sport = ntohs(inet->inet_sport);
> +			__entry->dport = ntohs(inet->inet_dport);
> +
> +			p32 = (__be32 *) __entry->saddr;
> +			*p32 = inet->inet_saddr;
> +
> +			p32 = (__be32 *) __entry->daddr;
> +			*p32 = inet->inet_daddr;
> +
> +			TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
> +				sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
> +		),
> +
> +		TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c state=%s reason=%s",
> +				__entry->sport, __entry->dport, __entry->saddr, __entry->daddr,
> +				__entry->saddr_v6, __entry->daddr_v6,
> +				show_tcp_state_name(__entry->state), __get_str(reason))
> +);
> +
>   #endif /* _TRACE_TCP_H */
>   
>   /* This part must be outside protection */
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 149ceb5c9..988989e25 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -4649,6 +4649,13 @@ static void tcp_drop(struct sock *sk, struct sk_buff *skb)
>   	__kfree_skb(skb);
>   }
>   
> +static void tcp_drop_new(struct sock *sk, struct sk_buff *skb, const char *reason)

You can rename the function to __tcp_drop(); tcp_drop_new() is not a
good name.
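
A sketch of the suggested rename (illustrative only; this assumes the
tracepoint is also renamed to tcp_drop per the comment above, with the
reason kept as a string as posted):

static void __tcp_drop(struct sock *sk, struct sk_buff *skb,
		       const char *reason)
{
	trace_tcp_drop(sk, skb, reason);
	sk_drops_add(sk, skb);
	__kfree_skb(skb);
}

Call sites would then read __tcp_drop(sk, skb, __func__).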

> +{
> +	trace_tcp_drop_new(sk, skb, reason);
> +	sk_drops_add(sk, skb);
> +	__kfree_skb(skb);
> +}
> +
>   /* This one checks to see if we can put data from the
>    * out_of_order queue into the receive_queue.
>    */
> @@ -4676,7 +4683,7 @@ static void tcp_ofo_queue(struct sock *sk)
>   		rb_erase(&skb->rbnode, &tp->out_of_order_queue);
>   
>   		if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
> -			tcp_drop(sk, skb);
> +			tcp_drop_new(sk, skb, __func__);

use enumeration?

>   			continue;
>   		}
>   
> @@ -4732,7 +4739,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
>   	if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
>   		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
>   		sk->sk_data_ready(sk);
> -		tcp_drop(sk, skb);
> +		tcp_drop_new(sk, skb, __func__);
>   		return;
>   	}
>   
> @@ -4795,7 +4802,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
>   				/* All the bits are present. Drop. */
>   				NET_INC_STATS(sock_net(sk),
>   					      LINUX_MIB_TCPOFOMERGE);
> -				tcp_drop(sk, skb);
> +				tcp_drop_new(sk, skb, __func__);
>   				skb = NULL;
>   				tcp_dsack_set(sk, seq, end_seq);
>   				goto add_sack;
> @@ -4814,7 +4821,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
>   						 TCP_SKB_CB(skb1)->end_seq);
>   				NET_INC_STATS(sock_net(sk),
>   					      LINUX_MIB_TCPOFOMERGE);
> -				tcp_drop(sk, skb1);
> +				tcp_drop_new(sk, skb1, __func__);

This one and the above one are in the same function so they will have
the same reason. It would be good if we can differentiate them.
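
One hypothetical way to differentiate them while keeping string
reasons (a sketch, not from the thread) is a distinct string per call
site instead of a bare __func__:

	/* All the bits are present: the new skb duplicates old data. */
	tcp_drop_new(sk, skb, "tcp_data_queue_ofo: dup segment");

	/* skb1 is fully covered by the newly arrived segment. */
	tcp_drop_new(sk, skb1, "tcp_data_queue_ofo: merged into new segment");

With an enum-based reason, as suggested above, the two sites would
simply pass different enum values.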

>   				goto merge_right;
>   			}
>   		} else if (tcp_ooo_try_coalesce(sk, skb1,
> @@ -4842,7 +4849,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
>   		tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
>   				 TCP_SKB_CB(skb1)->end_seq);
>   		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
> -		tcp_drop(sk, skb1);
> +		tcp_drop_new(sk, skb1, __func__);
>   	}
>   	/* If there is no skb after us, we are the last_skb ! */
>   	if (!skb1)
> @@ -5019,7 +5026,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
>   		tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
>   		inet_csk_schedule_ack(sk);
> drop:
> -		tcp_drop(sk, skb);
> +		tcp_drop_new(sk, skb, __func__);
>   		return;
>   	}
>   
> @@ -5276,7 +5283,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
>   		prev = rb_prev(node);
>   		rb_erase(node, &tp->out_of_order_queue);
>   		goal -= rb_to_skb(node)->truesize;
> -		tcp_drop(sk, rb_to_skb(node));
> +		tcp_drop_new(sk, rb_to_skb(node), __func__);
>   		if (!prev || goal <= 0) {
>   			sk_mem_reclaim(sk);
>   			if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
> @@ -5701,7 +5708,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
>   	return true;
>   
> discard:
> -	tcp_drop(sk, skb);
> +	tcp_drop_new(sk, skb, __func__);
>   	return false;
>   }
>   
> @@ -5905,7 +5912,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
>   	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
>   
> discard:
> -	tcp_drop(sk, skb);
> +	tcp_drop_new(sk, skb, __func__);
>   }
>   EXPORT_SYMBOL(tcp_rcv_established);
>   
> @@ -6196,7 +6203,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
>   						  TCP_DELACK_MAX, TCP_RTO_MAX);
>   
> discard:
> -			tcp_drop(sk, skb);
> +			tcp_drop_new(sk, skb, __func__);
>   			return 0;
>   		} else {
>   			tcp_send_ack(sk);
> @@ -6568,7 +6575,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
>   
>   	if (!queued) {
> discard:
> -		tcp_drop(sk, skb);
> +		tcp_drop_new(sk, skb, __func__);
>   	}
>   	return 0;
>   }
>
Cong Wang Aug. 23, 2021, 7:06 p.m. UTC | #2
On Sat, Aug 21, 2021 at 8:55 AM jony-one <yan2228598786@gmail.com> wrote:
> @@ -4676,7 +4683,7 @@ static void tcp_ofo_queue(struct sock *sk)
>                 rb_erase(&skb->rbnode, &tp->out_of_order_queue);
>
>                 if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
> -                       tcp_drop(sk, skb);
> +                       tcp_drop_new(sk, skb, __func__);

This is very similar to trace_kfree_skb():

void kfree_skb(struct sk_buff *skb)
{
        if (!skb_unref(skb))
                return;

        trace_kfree_skb(skb, __builtin_return_address(0));
        __kfree_skb(skb);
}

So... why not something like this?

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3f7bd7ae7d7a..cc840e4552c9 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4678,6 +4678,7 @@ static bool tcp_ooo_try_coalesce(struct sock *sk,
 static void tcp_drop(struct sock *sk, struct sk_buff *skb)
 {
        sk_drops_add(sk, skb);
+       trace_kfree_skb(skb, __builtin_return_address(0));
        __kfree_skb(skb);
 }

Thanks.
Cong Wang Aug. 31, 2021, 1:24 a.m. UTC | #3
On Tue, Aug 24, 2021 at 7:52 AM 小严 <yan2228598786@gmail.com> wrote:
>
> The main reason is to distinguish the reason for each TCP drop.
> __kfree_skb() only releases the skb, and it is used in many places
> besides TCP, so by itself it cannot tell us why a TCP packet was
> dropped.

You can filter them out by 'location', which is the second parameter
of the tracepoint. Please check out the filter syntax for trace events
here: https://www.kernel.org/doc/html/v4.18/trace/events.html
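
For reference, a minimal libbpf-style sketch (not from the thread) of
consuming the existing skb:kfree_skb tracepoint this way: count drops
per 'location', then symbolize and filter the addresses in user space.
The struct name assumes a BTF-generated vmlinux.h; treat this as an
illustration, not a tested tool.

// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

/* drop count keyed by call-site address ("location") */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 1024);
	__type(key, u64);
	__type(value, u64);
} drops SEC(".maps");

SEC("tracepoint/skb/kfree_skb")
int count_drops(struct trace_event_raw_kfree_skb *ctx)
{
	u64 loc = (u64)ctx->location;
	u64 one = 1, *cnt;

	cnt = bpf_map_lookup_elem(&drops, &loc);
	if (cnt)
		__sync_fetch_and_add(cnt, 1);
	else
		bpf_map_update_elem(&drops, &loc, &one, BPF_ANY);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";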

Thanks.

Patch

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 521059d8d..5a0478440 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -371,6 +371,57 @@  DEFINE_EVENT(tcp_event_skb, tcp_bad_csum,
 	TP_ARGS(skb)
 );
 
+/*
+ * tcp event with arguments sk, skb, reason
+ */
+TRACE_EVENT(tcp_drop_new,
+
+		TP_PROTO(struct sock *sk, struct sk_buff *skb, const char *reason),
+
+		TP_ARGS(sk, skb, reason),
+
+		TP_STRUCT__entry(
+			__field(const void *, skbaddr)
+			__field(const void *, skaddr)
+			__string(reason, reason)
+			__field(int, state)
+			__field(__u16, sport)
+			__field(__u16, dport)
+			__array(__u8, saddr, 4)
+			__array(__u8, daddr, 4)
+			__array(__u8, saddr_v6, 16)
+			__array(__u8, daddr_v6, 16)
+		),
+
+		TP_fast_assign(
+			struct inet_sock *inet = inet_sk(sk);
+			__be32 *p32;
+
+			__assign_str(reason, reason);
+
+			__entry->skbaddr = skb;
+			__entry->skaddr = sk;
+			__entry->state = sk->sk_state;
+
+			__entry->sport = ntohs(inet->inet_sport);
+			__entry->dport = ntohs(inet->inet_dport);
+
+			p32 = (__be32 *) __entry->saddr;
+			*p32 = inet->inet_saddr;
+
+			p32 = (__be32 *) __entry->daddr;
+			*p32 = inet->inet_daddr;
+
+			TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
+				sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
+		),
+
+		TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c state=%s reason=%s",
+				__entry->sport, __entry->dport, __entry->saddr, __entry->daddr,
+				__entry->saddr_v6, __entry->daddr_v6,
+				show_tcp_state_name(__entry->state), __get_str(reason))
+);
+
 #endif /* _TRACE_TCP_H */
 
 /* This part must be outside protection */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 149ceb5c9..988989e25 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4649,6 +4649,13 @@  static void tcp_drop(struct sock *sk, struct sk_buff *skb)
 	__kfree_skb(skb);
 }
 
+static void tcp_drop_new(struct sock *sk, struct sk_buff *skb, const char *reason)
+{
+	trace_tcp_drop_new(sk, skb, reason);
+	sk_drops_add(sk, skb);
+	__kfree_skb(skb);
+}
+
 /* This one checks to see if we can put data from the
  * out_of_order queue into the receive_queue.
  */
@@ -4676,7 +4683,7 @@  static void tcp_ofo_queue(struct sock *sk)
 		rb_erase(&skb->rbnode, &tp->out_of_order_queue);
 
 		if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
-			tcp_drop(sk, skb);
+			tcp_drop_new(sk, skb, __func__);
 			continue;
 		}
 
@@ -4732,7 +4739,7 @@  static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 	if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
 		sk->sk_data_ready(sk);
-		tcp_drop(sk, skb);
+		tcp_drop_new(sk, skb, __func__);
 		return;
 	}
 
@@ -4795,7 +4802,7 @@  static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 				/* All the bits are present. Drop. */
 				NET_INC_STATS(sock_net(sk),
 					      LINUX_MIB_TCPOFOMERGE);
-				tcp_drop(sk, skb);
+				tcp_drop_new(sk, skb, __func__);
 				skb = NULL;
 				tcp_dsack_set(sk, seq, end_seq);
 				goto add_sack;
@@ -4814,7 +4821,7 @@  static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 						 TCP_SKB_CB(skb1)->end_seq);
 				NET_INC_STATS(sock_net(sk),
 					      LINUX_MIB_TCPOFOMERGE);
-				tcp_drop(sk, skb1);
+				tcp_drop_new(sk, skb1, __func__);
 				goto merge_right;
 			}
 		} else if (tcp_ooo_try_coalesce(sk, skb1,
@@ -4842,7 +4849,7 @@  static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 		tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
 				 TCP_SKB_CB(skb1)->end_seq);
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
-		tcp_drop(sk, skb1);
+		tcp_drop_new(sk, skb1, __func__);
 	}
 	/* If there is no skb after us, we are the last_skb ! */
 	if (!skb1)
@@ -5019,7 +5026,7 @@  static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 		tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
 		inet_csk_schedule_ack(sk);
 drop:
-		tcp_drop(sk, skb);
+		tcp_drop_new(sk, skb, __func__);
 		return;
 	}
 
@@ -5276,7 +5283,7 @@  static bool tcp_prune_ofo_queue(struct sock *sk)
 		prev = rb_prev(node);
 		rb_erase(node, &tp->out_of_order_queue);
 		goal -= rb_to_skb(node)->truesize;
-		tcp_drop(sk, rb_to_skb(node));
+		tcp_drop_new(sk, rb_to_skb(node), __func__);
 		if (!prev || goal <= 0) {
 			sk_mem_reclaim(sk);
 			if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
@@ -5701,7 +5708,7 @@  static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
 	return true;
 
 discard:
-	tcp_drop(sk, skb);
+	tcp_drop_new(sk, skb, __func__);
 	return false;
 }
 
@@ -5905,7 +5912,7 @@  void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
 	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
 
 discard:
-	tcp_drop(sk, skb);
+	tcp_drop_new(sk, skb, __func__);
 }
 EXPORT_SYMBOL(tcp_rcv_established);
 
@@ -6196,7 +6203,7 @@  static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 						  TCP_DELACK_MAX, TCP_RTO_MAX);
 
 discard:
-			tcp_drop(sk, skb);
+			tcp_drop_new(sk, skb, __func__);
 			return 0;
 		} else {
 			tcp_send_ack(sk);
@@ -6568,7 +6575,7 @@  int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 
 	if (!queued) {
 discard:
-		tcp_drop(sk, skb);
+		tcp_drop_new(sk, skb, __func__);
 	}
 	return 0;
 }