diff mbox series

[RFC,v4] packet: experimental support for 64-bit timestamps

Message ID 20171129134633.1652169-1-arnd@arndb.de
State New
Headers show
Series [RFC,v4] packet: experimental support for 64-bit timestamps | expand

Commit Message

Arnd Bergmann Nov. 29, 2017, 1:45 p.m. UTC
As I noticed in my previous patch to remove the 'timespec' usage in
the packet socket, the timestamps in the packet socket are slightly
inefficient as they convert a nanosecond value into seconds/nanoseconds
or seconds/microseconds.

This adds two new socket options for the timestamp to resolve that:

PACKET_SKIPTIMESTAMP sets a flag to indicate whether to generate
timestamps at all. When this is set, all timestamps are hardcoded to
zero, which saves a few cycles for the conversion and the access of
the hardware clocksource. The idea was taken from pktgen, which has an
F_NO_TIMESTAMP option for the same purpose.

PACKET_TIMESTAMP_NS64 changes the interpretation of the timestamp fields:
instead of having 32 bits for seconds plus 32 bits for nanoseconds or
microseconds, we now always send down 64 bits worth of nanoseconds when
this flag is set.

Link: https://patchwork.kernel.org/patch/10077199/
Suggested-by: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>

---
I still have not done any runtime testing on this patch,
only implemented the suggestions from the previous versions.

While I don't think anyone is actively looking for this feature,
I don't think there are any reasons left against merging it either,
and it might come in handy for someone.

v4: address minor comments from Willem
v3: rework to use setsockopt
v2: use new tstamp flags instead of a new version
v1: original implementation using TPACKET_V4
---
 include/uapi/linux/if_packet.h |   2 +
 net/packet/af_packet.c         | 160 ++++++++++++++++++++++++++++-------------
 net/packet/internal.h          |   2 +
 3 files changed, 116 insertions(+), 48 deletions(-)

-- 
2.9.0
diff mbox series

Patch

diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h
index 67b61d91d89b..2eba54770e6b 100644
--- a/include/uapi/linux/if_packet.h
+++ b/include/uapi/linux/if_packet.h
@@ -57,6 +57,8 @@  struct sockaddr_ll {
 #define PACKET_QDISC_BYPASS		20
 #define PACKET_ROLLOVER_STATS		21
 #define PACKET_FANOUT_DATA		22
+#define PACKET_SKIPTIMESTAMP		23
+#define PACKET_TIMESTAMP_NS64		24
 
 #define PACKET_FANOUT_HASH		0
 #define PACKET_FANOUT_LB		1
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 7432c6699818..f55f330ab547 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -200,7 +200,7 @@  static void prb_retire_current_block(struct tpacket_kbdq_core *,
 		struct packet_sock *, unsigned int status);
 static int prb_queue_frozen(struct tpacket_kbdq_core *);
 static void prb_open_block(struct tpacket_kbdq_core *,
-		struct tpacket_block_desc *);
+		struct tpacket_block_desc *, struct packet_sock *);
 static void prb_retire_rx_blk_timer_expired(struct timer_list *);
 static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
 static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
@@ -439,52 +439,91 @@  static int __packet_get_status(struct packet_sock *po, void *frame)
 	}
 }
 
-static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec64 *ts,
-				   unsigned int flags)
+static __u32 tpacket_get_timestamp(struct sk_buff *skb, __u32 *hi, __u32 *lo)
 {
+	struct packet_sock *po = pkt_sk(skb->sk); /* NOTE(review): confirm skb->sk is the packet socket on every caller path, incl. tpacket_rcv */
 	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
+	ktime_t stamp;
+	u32 type;
+
+	if (po->tp_skiptstamp)
+		return 0; /* timestamps disabled: report "none"; hi and lo are not written */
 
 	if (shhwtstamps &&
-	    (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
-	    ktime_to_timespec64_cond(shhwtstamps->hwtstamp, ts))
-		return TP_STATUS_TS_RAW_HARDWARE;
+	    (po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE) &&
+	    shhwtstamps->hwtstamp) {
+		stamp = shhwtstamps->hwtstamp;
+		type = TP_STATUS_TS_RAW_HARDWARE;
+	} else if (skb->tstamp) { /* otherwise fall back to the skb's own stamp */
+		stamp = skb->tstamp;
+		type = TP_STATUS_TS_SOFTWARE;
+	} else {
+		return 0; /* no stamp on the skb; hi and lo are not written */
+	}
 
-	if (ktime_to_timespec64_cond(skb->tstamp, ts))
-		return TP_STATUS_TS_SOFTWARE;
+	if (po->tp_tstamp_ns64) { /* 64-bit nanoseconds split over the two 32-bit outputs */
+		__u64 ns = ktime_to_ns(stamp);
 
-	return 0;
+		*hi = upper_32_bits(ns);
+		*lo = lower_32_bits(ns);
+	} else {
+		struct timespec64 ts = ktime_to_timespec64(stamp);
+
+		*hi = ts.tv_sec; /* seconds, narrowed to 32 bits */
+		if (po->tp_version > TPACKET_V1)
+			*lo = ts.tv_nsec;
+		else
+			*lo = ts.tv_nsec / NSEC_PER_USEC; /* V1 carries microseconds */
+	}
+
+	return type;
+}
+
+static void packet_get_time(struct packet_sock *po, __u32 *hi, __u32 *lo)
+{
+	if (po->tp_skiptstamp) { /* timestamps disabled: report zero */
+		*hi = 0;
+		*lo = 0;
+	} else if (po->tp_tstamp_ns64) { /* current time as 64-bit ns, split hi/lo */
+		__u64 ns = ktime_get_real_ns();
+
+		*hi = upper_32_bits(ns);
+		*lo = lower_32_bits(ns); /* low half belongs in *lo, not *hi */
+	} else {
+		struct timespec64 ts;
+
+		ktime_get_real_ts64(&ts);
+		/* unsigned seconds overflow in y2106 here */
+		*hi = ts.tv_sec;
+		if (po->tp_version > TPACKET_V1)
+			*lo = ts.tv_nsec;
+		else
+			*lo = ts.tv_nsec / NSEC_PER_USEC; /* V1 carries microseconds */
+	}
 }
 
 static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
 				    struct sk_buff *skb)
 {
 	union tpacket_uhdr h;
-	struct timespec64 ts;
-	__u32 ts_status;
+	__u32 ts_status, hi, lo;
 
-	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
+	if (!(ts_status = tpacket_get_timestamp(skb, &hi, &lo)))
 		return 0;
 
 	h.raw = frame;
-	/*
-	 * versions 1 through 3 overflow the timestamps in y2106, since they
-	 * all store the seconds in a 32-bit unsigned integer.
-	 * If we create a version 4, that should have a 64-bit timestamp,
-	 * either 64-bit seconds + 32-bit nanoseconds, or just 64-bit
-	 * nanoseconds.
-	 */
 	switch (po->tp_version) {
 	case TPACKET_V1:
-		h.h1->tp_sec = ts.tv_sec;
-		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
+		h.h1->tp_sec = hi;
+		h.h1->tp_usec = lo;
 		break;
 	case TPACKET_V2:
-		h.h2->tp_sec = ts.tv_sec;
-		h.h2->tp_nsec = ts.tv_nsec;
+		h.h2->tp_sec = hi;
+		h.h2->tp_nsec = lo;
 		break;
 	case TPACKET_V3:
-		h.h3->tp_sec = ts.tv_sec;
-		h.h3->tp_nsec = ts.tv_nsec;
+		h.h3->tp_sec = hi;
+		h.h3->tp_nsec = lo;
 		break;
 	default:
 		WARN(1, "TPACKET version not supported.\n");
@@ -633,7 +672,7 @@  static void init_prb_bdqc(struct packet_sock *po,
 	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
 	prb_init_ft_ops(p1, req_u);
 	prb_setup_retire_blk_timer(po);
-	prb_open_block(p1, pbd);
+	prb_open_block(p1, pbd, po);
 }
 
 /*  Do NOT update the last_blk_num first.
@@ -730,7 +769,7 @@  static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
 				* opening a block thaws the queue,restarts timer
 				* Thawing/timer-refresh is a side effect.
 				*/
-				prb_open_block(pkc, pbd);
+				prb_open_block(pkc, pbd, po);
 				goto out;
 			}
 		}
@@ -812,10 +851,8 @@  static void prb_close_block(struct tpacket_kbdq_core *pkc1,
 		 * It shouldn't really happen as we don't close empty
 		 * blocks. See prb_retire_rx_blk_timer_expired().
 		 */
-		struct timespec64 ts;
-		ktime_get_real_ts64(&ts);
-		h1->ts_last_pkt.ts_sec = ts.tv_sec;
-		h1->ts_last_pkt.ts_nsec	= ts.tv_nsec;
+		packet_get_time(po, &h1->ts_last_pkt.ts_sec,
+				&h1->ts_last_pkt.ts_nsec);
 	}
 
 	smp_wmb();
@@ -841,9 +878,8 @@  static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
  *
  */
 static void prb_open_block(struct tpacket_kbdq_core *pkc1,
-	struct tpacket_block_desc *pbd1)
+	struct tpacket_block_desc *pbd1, struct packet_sock *po)
 {
-	struct timespec64 ts;
 	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
 
 	smp_rmb();
@@ -856,10 +892,8 @@  static void prb_open_block(struct tpacket_kbdq_core *pkc1,
 	BLOCK_NUM_PKTS(pbd1) = 0;
 	BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
 
-	ktime_get_real_ts64(&ts);
-
-	h1->ts_first_pkt.ts_sec = ts.tv_sec;
-	h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
+	packet_get_time(po, &h1->ts_first_pkt.ts_sec,
+			&h1->ts_first_pkt.ts_nsec);
 
 	pkc1->pkblk_start = (char *)pbd1;
 	pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
@@ -936,7 +970,7 @@  static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
 	 * open this block and return the offset where the first packet
 	 * needs to get stored.
 	 */
-	prb_open_block(pkc, pbd);
+	prb_open_block(pkc, pbd, po);
 	return (void *)pkc->nxt_offset;
 }
 
@@ -1068,7 +1102,7 @@  static void *__packet_lookup_frame_in_block(struct packet_sock *po,
 			 * opening a block also thaws the queue.
 			 * Thawing is a side effect.
 			 */
-			prb_open_block(pkc, pbd);
+			prb_open_block(pkc, pbd, po);
 		}
 	}
 
@@ -2191,8 +2225,8 @@  static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	unsigned long status = TP_STATUS_USER;
 	unsigned short macoff, netoff, hdrlen;
 	struct sk_buff *copy_skb = NULL;
-	struct timespec64 ts;
 	__u32 ts_status;
+	__u32 tstamp_hi, tstamp_lo;
 	bool is_drop_n_account = false;
 	bool do_vnet = false;
 
@@ -2318,8 +2352,8 @@  static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 
 	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
 
-	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
-		ktime_get_real_ts64(&ts);
+	if (!(ts_status = tpacket_get_timestamp(skb, &tstamp_hi, &tstamp_lo)))
+		packet_get_time(po, &tstamp_hi, &tstamp_lo);
 
 	status |= ts_status;
 
@@ -2329,8 +2363,8 @@  static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		h.h1->tp_snaplen = snaplen;
 		h.h1->tp_mac = macoff;
 		h.h1->tp_net = netoff;
-		h.h1->tp_sec = ts.tv_sec;
-		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
+		h.h1->tp_sec = tstamp_hi;
+		h.h1->tp_usec = tstamp_lo;
 		hdrlen = sizeof(*h.h1);
 		break;
 	case TPACKET_V2:
@@ -2338,8 +2372,8 @@  static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		h.h2->tp_snaplen = snaplen;
 		h.h2->tp_mac = macoff;
 		h.h2->tp_net = netoff;
-		h.h2->tp_sec = ts.tv_sec;
-		h.h2->tp_nsec = ts.tv_nsec;
+		h.h2->tp_sec = tstamp_hi;
+		h.h2->tp_nsec = tstamp_lo;
 		if (skb_vlan_tag_present(skb)) {
 			h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
 			h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
@@ -2360,8 +2394,8 @@  static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		h.h3->tp_snaplen = snaplen;
 		h.h3->tp_mac = macoff;
 		h.h3->tp_net = netoff;
-		h.h3->tp_sec  = ts.tv_sec;
-		h.h3->tp_nsec = ts.tv_nsec;
+		h.h3->tp_sec  = tstamp_hi;
+		h.h3->tp_nsec = tstamp_lo;
 		memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
 		hdrlen = sizeof(*h.h3);
 		break;
@@ -3792,6 +3826,30 @@  packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 		po->tp_tstamp = val;
 		return 0;
 	}
+	case PACKET_SKIPTIMESTAMP:
+	{
+		int val;
+
+		if (optlen != sizeof(val))
+			return -EINVAL;
+		if (copy_from_user(&val, optval, sizeof(val)))
+			return -EFAULT;
+
+		po->tp_skiptstamp = !!val; /* 1-bit field: any nonzero value means "on" */
+		return 0;
+	}
+	case PACKET_TIMESTAMP_NS64:
+	{
+		int val;
+
+		if (optlen != sizeof(val))
+			return -EINVAL;
+		if (copy_from_user(&val, optval, sizeof(val)))
+			return -EFAULT;
+
+		po->tp_tstamp_ns64 = !!val; /* 1-bit field: any nonzero value means "on" */
+		return 0;
+	}
 	case PACKET_FANOUT:
 	{
 		int val;
@@ -3921,6 +3979,12 @@  static int packet_getsockopt(struct socket *sock, int level, int optname,
 	case PACKET_TIMESTAMP:
 		val = po->tp_tstamp;
 		break;
+	case PACKET_SKIPTIMESTAMP:
+		val = po->tp_skiptstamp;
+		break;
+	case PACKET_TIMESTAMP_NS64:
+		val = po->tp_tstamp_ns64;
+		break;
 	case PACKET_FANOUT:
 		val = (po->fanout ?
 		       ((u32)po->fanout->id |
diff --git a/net/packet/internal.h b/net/packet/internal.h
index 562fbc155006..20b69512210f 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -128,6 +128,8 @@  struct packet_sock {
 	unsigned int		tp_reserve;
 	unsigned int		tp_loss:1;
 	unsigned int		tp_tx_has_off:1;
+	unsigned int		tp_skiptstamp:1;
+	unsigned int		tp_tstamp_ns64:1;
 	unsigned int		tp_tstamp;
 	struct net_device __rcu	*cached_dev;
 	int			(*xmit)(struct sk_buff *skb);